#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
#           This file was automatically generated from src/transformers/models/granite_speech_plus/modular_granite_speech_plus.py.
#               Do NOT edit this file manually as any edits will be overwritten by the generation of
#             the file from the modular. If any change should be done, please apply the change to the
#                          modular_granite_speech_plus.py file directly. Our CI enforces this.
#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
# Copyright 2026 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from huggingface_hub.dataclasses import strict

from ...configuration_utils import PreTrainedConfig
from ...utils import auto_docstring
from ..auto import CONFIG_MAPPING, AutoConfig


@auto_docstring(checkpoint="ibm-granite/granite-speech-4.1-2b-plus")
@strict
class GraniteSpeechPlusEncoderConfig(PreTrainedConfig):
    r"""
    feedforward_mult (`int`, *optional*, defaults to 4):
        Multiplier for the up/down projections in the encoder's feedforward layers;
        The projections will have intermediate dim of size `hidden_dim * feedforward_mult`.
    output_dim (`int`, *optional*, defaults to 42):
        Intermediate dimension of the feedforward projections in the conformer
        to be added to every other encoder block's output.
    context_size (`int`, *optional*, defaults to 200):
        Context size to be used in conformer attention.
    max_pos_emb (`int`, *optional*, defaults to 512):
        Max pos embeds to be used in attention (Shaw's relative positional encoding).
    conv_expansion_factor (`int`, *optional*, defaults to 2):
        Intermediate dimension to be used in conformer convolutions.
    cat_hidden_layers (`list[int]`, *optional*):
        Indices of encoder conformer layers whose outputs are concatenated with the final encoder
        output (along the feature dimension) before being passed to the projector. When set, the
        projector's ``encoder_hidden_size`` must equal
        ``encoder_config.hidden_dim * (len(cat_hidden_layers) + 1)``.

    Example:

    ```python
    >>> from transformers import GraniteSpeechPlusEncoderConfig, GraniteSpeechPlusCTCEncoder

    >>> # Initializing a GraniteSpeechPlusEncoderConfig
    >>> configuration = GraniteSpeechPlusEncoderConfig()

    >>> # Initializing a GraniteSpeechPlusCTCEncoder (with random weights)
    >>> model = GraniteSpeechPlusCTCEncoder(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "granite_speech_plus_encoder"

    # Size of each input feature frame fed to the encoder
    # (presumably audio features from the feature extractor — verify against the processor).
    input_dim: int = 160
    # Number of conformer blocks in the encoder.
    num_layers: int = 10
    # Feature dimension carried through every conformer block.
    hidden_dim: int = 1024
    # Feedforward intermediate size is `hidden_dim * feedforward_mult` (see docstring).
    feedforward_mult: int = 4
    # Attention heads per conformer block and the per-head dimension.
    num_heads: int = 8
    dim_head: int = 128
    # See docstring: intermediate feedforward dim added to every other block's output.
    output_dim: int = 42
    # Attention context window for the conformer attention (see docstring).
    context_size: int = 200
    # Maximum relative position embeddings (Shaw-style) used in attention.
    max_pos_emb: int = 512
    # Dropout probability; `int` is accepted so callers may pass e.g. `0` to disable it.
    dropout: float | int = 0.1
    # Kernel size and channel expansion factor of the conformer convolution module.
    conv_kernel_size: int = 15
    conv_expansion_factor: int = 2

    # Optional layer indices whose hidden states are concatenated with the final
    # output before projection; validated in `GraniteSpeechPlusConfig.__post_init__`.
    cat_hidden_layers: list[int] | None = None


@auto_docstring(checkpoint="ibm-granite/granite-speech-4.1-2b-plus")
@strict
class GraniteSpeechPlusConfig(PreTrainedConfig):
    r"""
    projector_config (`Union[AutoConfig, dict]`, *optional*, defaults to `Blip2QFormerConfig`):
        The config object or dictionary of the audio projector.
    has_lora_adapter (`bool`, *optional*, defaults to `True`):
        Indicates whether or not the model has a lora adapter that should only
        be activated when processing audio inputs.
    downsample_rate (`int`, *optional*, defaults to 5):
        Downsample rate for the audio feature extractor.
    window_size (`int`, *optional*, defaults to 15):
        Window size for the audio feature projector.

    Example:

    ```python
    >>> from transformers import GraniteSpeechPlusConfig, GraniteSpeechPlusForConditionalGeneration

    >>> # Initializing a GraniteSpeechPlusConfig
    >>> configuration = GraniteSpeechPlusConfig()

    >>> # Initializing a GraniteSpeechPlusForConditionalGeneration (with random weights)
    >>> model = GraniteSpeechPlusForConditionalGeneration(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "granite_speech_plus"
    # Expose `config.audio_token_id` as an alias of the stored `audio_token_index`.
    attribute_map = {
        "audio_token_id": "audio_token_index",
    }
    sub_configs = {
        "text_config": AutoConfig,
        "encoder_config": GraniteSpeechPlusEncoderConfig,
        "projector_config": AutoConfig,
    }

    # Sub-configs may arrive as plain dicts (e.g. from JSON); they are coerced to
    # config objects in `__post_init__`.
    text_config: dict | PreTrainedConfig | None = None
    encoder_config: dict | PreTrainedConfig | None = None
    projector_config: dict | PreTrainedConfig | None = None
    # Token id used as the audio placeholder in the text sequence.
    audio_token_index: int = 49155
    initializer_range: float = 0.02
    has_lora_adapter: bool = True
    downsample_rate: int = 5
    window_size: int = 15

    def __post_init__(self, **kwargs):
        """Materialize sub-configs from dicts/defaults, then validate cross-config constraints.

        Raises:
            ValueError: if any `encoder_config.cat_hidden_layers` index is out of
                range, or if the projector's `encoder_hidden_size` does not match
                the concatenated encoder feature width.
        """
        # Text backbone: default to a `granite` LM config when unspecified.
        if isinstance(self.text_config, dict):
            self.text_config["model_type"] = self.text_config.get("model_type", "granite")
            self.text_config = CONFIG_MAPPING[self.text_config["model_type"]](**self.text_config)
        elif self.text_config is None:
            self.text_config = CONFIG_MAPPING["granite"]()

        # Audio projector: default to a Blip-2 Q-Former config when unspecified.
        if isinstance(self.projector_config, dict):
            self.projector_config["model_type"] = self.projector_config.get("model_type", "blip_2_qformer")
            self.projector_config = CONFIG_MAPPING[self.projector_config["model_type"]](**self.projector_config)
        elif self.projector_config is None:
            self.projector_config = CONFIG_MAPPING["blip_2_qformer"]()

        # Audio encoder: accept a dict (or None for all-defaults) and coerce to the typed config.
        if not isinstance(self.encoder_config, GraniteSpeechPlusEncoderConfig):
            self.encoder_config = GraniteSpeechPlusEncoderConfig(**(self.encoder_config or {}))

        super().__post_init__(**kwargs)

        # Validate the concatenated-hidden-states setup in a single guarded pass
        # (previously two separate `is not None` checks over the same attribute).
        cat_hidden_layers = self.encoder_config.cat_hidden_layers
        if cat_hidden_layers is not None:
            for idx in cat_hidden_layers:
                if idx < 0 or idx >= self.encoder_config.num_layers:
                    raise ValueError(
                        f"cat_hidden_layers index {idx} is out of range [0, {self.encoder_config.num_layers})."
                    )
            # The final output is concatenated with len(cat_hidden_layers) intermediate
            # outputs, so the projector must expect `hidden_dim * (len + 1)` features.
            num_concat = len(cat_hidden_layers) + 1
            if self.projector_config.encoder_hidden_size != self.encoder_config.hidden_dim * num_concat:
                raise ValueError(
                    f"projector encoder_hidden_size {self.projector_config.encoder_hidden_size} "
                    f"must equal encoder hidden_dim * {num_concat} = "
                    f"{self.encoder_config.hidden_dim * num_concat}."
                )


# Public API of this module, consumed by `from ... import *` and the transformers
# auto-module export machinery.
__all__ = ["GraniteSpeechPlusConfig", "GraniteSpeechPlusEncoderConfig"]
