#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
#           This file was automatically generated from src/transformers/models/granite4_vision/modular_granite4_vision.py.
#               Do NOT edit this file manually as any edits will be overwritten by the generation of
#             the file from the modular. If any change should be done, please apply the change to the
#                          modular_granite4_vision.py file directly. One of our CI enforces this.
#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
# Copyright 2026 IBM and The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Literal

from huggingface_hub.dataclasses import strict

from ...configuration_utils import PreTrainedConfig
from ...modeling_rope_utils import RopeParameters
from ...utils import auto_docstring
from ..auto import CONFIG_MAPPING, AutoConfig


@auto_docstring(checkpoint="ibm-granite4_vision_text/granite4_vision_text-3.0-8b-base")
@strict
class Granite4VisionTextConfig(PreTrainedConfig):
    r"""
    ```python
    >>> from transformers import Granite4VisionTextModel, Granite4VisionTextConfig

    >>> # Initializing a Granite4VisionText granite4_vision_text-3b style configuration
    >>> configuration = Granite4VisionTextConfig()

    >>> # Initializing a model from the granite4_vision_text-7b style configuration
    >>> model = Granite4VisionTextModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
    """

    model_type = "granite4_vision_text"
    keys_to_ignore_at_inference = ["past_key_values"]
    # Default tensor parallel plan for base model `Granite4VisionTextModel`
    base_model_tp_plan = {
        "layers.*.self_attn.q_proj": "colwise",
        "layers.*.self_attn.k_proj": "colwise",
        "layers.*.self_attn.v_proj": "colwise",
        "layers.*.self_attn.o_proj": "rowwise",
        "layers.*.mlp.gate_proj": "colwise",
        "layers.*.mlp.up_proj": "colwise",
        "layers.*.mlp.down_proj": "rowwise",
    }
    # Default pipeline parallel plan: stage name -> ([input tensor names], [output tensor names]).
    base_model_pp_plan = {
        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
        "norm": (["hidden_states"], ["hidden_states"]),
    }

    # NOTE: the `@strict` decorator turns these annotated attributes into the
    # constructor arguments of this config, so declaration order matters.

    # Core transformer dimensions (Llama-style decoder parameters).
    vocab_size: int = 32000
    hidden_size: int = 4096
    intermediate_size: int = 11008
    num_hidden_layers: int = 32
    num_attention_heads: int = 32
    # None means "same as num_attention_heads" (defaulted in __post_init__ below).
    num_key_value_heads: int | None = None
    hidden_act: str = "silu"
    max_position_embeddings: int = 2048
    initializer_range: float = 0.02
    rms_norm_eps: float = 1e-6
    use_cache: bool = True
    # Special-token ids and embedding tying.
    pad_token_id: int | None = None
    bos_token_id: int | None = 1
    eos_token_id: int | list[int] | None = 2
    tie_word_embeddings: bool = False
    # RoPE settings; None lets the base class fall back to its defaults.
    rope_parameters: RopeParameters | dict | None = None
    attention_bias: bool = False
    attention_dropout: float | int = 0.0
    mlp_bias: bool = False
    # Granite-specific scalar multipliers applied to embeddings, logits,
    # residual connections, and attention scores respectively.
    embedding_multiplier: float | int = 1.0
    logits_scaling: float | int = 1.0
    residual_multiplier: float | int = 1.0
    attention_multiplier: float | int = 1.0
    # Key under which this config is nested inside the composite vision config.
    base_config_key = "text_config"

    def __post_init__(self, **kwargs):
        # Default to one KV head per attention head (no grouped-query sharing)
        # when the caller did not specify num_key_value_heads.
        if self.num_key_value_heads is None:
            self.num_key_value_heads = self.num_attention_heads

        super().__post_init__(**kwargs)


@auto_docstring(checkpoint="llava-hf/llava-v1.6-mistral-7b-hf")
@strict
class Granite4VisionConfig(PreTrainedConfig):
    r"""
    image_grid_pinpoints (`list`, *optional*):
        A list of possible resolutions to use for processing high resolution images. Each item in the list should be a
        tuple or list of the form `(height, width)`.
    downsample_rate (`str`, *optional*):
        Fractional downsample rate for the Window Q-Former projector, e.g. `"1/4"` or `"3/8"`.
        The numerator is the query window side, the denominator is the key window side.
    deepstack_layer_map (`list`, *optional*):
        List of `[vision_layer_idx, llm_layer_idx]` pairs. Features from each vision encoder layer
        are projected and injected at the corresponding LLM decoder layer during forward pass.
    spatial_vision_layer (`int`, *optional*, defaults to `-1`):
        Index of the vision encoder layer used for spatial sampling.
    spatial_target_layers (`list`, *optional*, defaults to `[12, 15, 18, 21]`):
        Target LLM layers for the 4 spatial offset groups.
    projector_dropout (`float`, *optional*, defaults to `0.1`):
        Dropout probability in the Window Q-Former projector.
    qformer_config (`dict` or `Blip2QFormerConfig`, *optional*):
        Configuration for the Window Q-Former projector. If `None`, defaults are derived from
        `vision_config.hidden_size`.
    """

    model_type = "granite4_vision"
    # Canonical attribute is `image_token_index`; `image_token_id` is an accepted alias.
    attribute_map = {"image_token_id": "image_token_index"}
    sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig, "qformer_config": AutoConfig}

    # NOTE: the `@strict` decorator turns these annotated attributes into the
    # constructor arguments of this config, so declaration order matters.
    # Sub-configs accept a plain dict (resolved in __post_init__), an already
    # built config object, or None for the documented defaults.
    vision_config: dict | PreTrainedConfig | None = None
    text_config: dict | PreTrainedConfig | None = None
    image_token_index: int = 32000
    vision_feature_select_strategy: Literal["default", "full"] = "default"
    vision_feature_layer: int | list[int] = -2
    tie_word_embeddings: bool = False
    image_grid_pinpoints: list | None = None
    image_seq_length: int = 576

    # Window Q-Former projector / deepstack-injection options (see docstring above).
    downsample_rate: str | None = None
    deepstack_layer_map: list | None = None
    spatial_vision_layer: int = -1
    spatial_target_layers: list | None = None
    projector_dropout: float = 0.1
    qformer_config: dict | PreTrainedConfig | None = None

    def __post_init__(self, **kwargs):
        # Default anyres grid pinpoints (LLaVA-NeXT style (height, width) resolutions).
        self.image_grid_pinpoints = (
            self.image_grid_pinpoints
            if self.image_grid_pinpoints is not None
            else [[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]]
        )

        # Normalize the layer map to (int, int) tuples so JSON-loaded values
        # (lists, possibly of floats/strings) compare and hash consistently.
        if self.deepstack_layer_map is not None:
            self.deepstack_layer_map = [(int(v), int(l)) for v, l in self.deepstack_layer_map]

        if self.spatial_target_layers is None:
            self.spatial_target_layers = [12, 15, 18, 21]

        # Resolve the vision sub-config: dict -> concrete config class via its
        # model_type (defaulting to CLIP's vision tower); None -> SigLIP defaults.
        if isinstance(self.vision_config, dict):
            self.vision_config["model_type"] = self.vision_config.get("model_type", "clip_vision_model")
            self.vision_config = CONFIG_MAPPING[self.vision_config["model_type"]](**self.vision_config)
        elif self.vision_config is None:
            self.vision_config = CONFIG_MAPPING["siglip_vision_model"]()

        # Resolve the text sub-config likewise; an unset model_type means the
        # Granite4 text config defined above, while None falls back to Llama defaults.
        if isinstance(self.text_config, dict):
            self.text_config["model_type"] = self.text_config.get("model_type", "granite4_vision_text")
            self.text_config = CONFIG_MAPPING[self.text_config["model_type"]](**self.text_config)
        elif self.text_config is None:
            self.text_config = CONFIG_MAPPING["llama"]()

        # Resolve the projector (Q-Former) sub-config. Note this is `if`, not
        # `elif`: the dict branch always produces a non-None config, so only a
        # missing qformer_config takes the derived-defaults path below.
        if isinstance(self.qformer_config, dict):
            model_type = self.qformer_config.get("model_type", "blip_2_qformer")
            self.qformer_config = CONFIG_MAPPING[model_type](**self.qformer_config)
        if self.qformer_config is None:
            # Derive a single-layer Q-Former sized to the vision tower's width;
            # one attention head per 64 hidden dims (assumes hidden_size is a
            # multiple of 64 — TODO confirm for non-standard vision towers).
            vision_hidden_size = self.vision_config.hidden_size
            self.qformer_config = CONFIG_MAPPING["blip_2_qformer"](
                num_hidden_layers=1,
                intermediate_size=3072,
                cross_attention_frequency=1,
                max_position_embeddings=2048,
                use_qformer_text_input=False,
                hidden_size=vision_hidden_size,
                num_attention_heads=vision_hidden_size // 64,
                encoder_hidden_size=vision_hidden_size,
            )
        super().__post_init__(**kwargs)


# Explicit public API of this module (consumed by `from ... import *` and the
# transformers lazy-import machinery).
__all__ = ["Granite4VisionConfig", "Granite4VisionTextConfig"]
