#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
#           This file was automatically generated from src/transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py.
#               Do NOT edit this file manually as any edits will be overwritten by the generation of
#             the file from the modular. If any change should be done, please apply the change to the
#                          modular_qwen3_omni_moe.py file directly. One of our CI checks enforces this.
#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
# Copyright 2025 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from huggingface_hub.dataclasses import strict

from ...configuration_utils import PreTrainedConfig
from ...modeling_rope_utils import RopeParameters
from ...utils import auto_docstring, logging


logger = logging.get_logger(__name__)


@auto_docstring(checkpoint="Qwen/Qwen2.5-Omni-7B")
@strict
class Qwen3OmniMoeAudioEncoderConfig(PreTrainedConfig):
    r"""
    max_source_positions (`int`, *optional*, defaults to 1500):
        Maximum sequence length for the inputs
    n_window (`int`, *optional*, defaults to 100):
        Number of windows
    output_dim (`int`, *optional*, defaults to 3584):
        Dimensionality of the output
    n_window_infer (`int`, *optional*, defaults to 400):
        Number of windows during inference
    conv_chunksize (`int`, *optional*, defaults to 500):
        Chunk size of each input to the convolutional layers
    downsample_hidden_size (`int`, *optional*, defaults to 480):
        Hidden size of the downsampling layer
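
    Example:

    ```python
    >>> from transformers import Qwen3OmniMoeAudioEncoderConfig

    >>> # Initializing a default Qwen3OmniMoeAudioEncoderConfig
    >>> configuration = Qwen3OmniMoeAudioEncoderConfig()

    >>> # Accessing a configuration value
    >>> configuration.n_window
    100
    ```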
    """

    model_type = "qwen3_omni_moe_audio_encoder"
    attribute_map = {"num_hidden_layers": "encoder_layers"}

    num_mel_bins: int = 128
    encoder_layers: int = 32
    encoder_attention_heads: int = 20
    encoder_ffn_dim: int = 5120
    d_model: int = 1280
    dropout: float | int = 0.0
    attention_dropout: float | int = 0.0
    activation_function: str = "gelu"
    activation_dropout: float | int = 0.0
    scale_embedding: bool = False
    initializer_range: float = 0.02
    max_source_positions: int = 1500

    n_window: int = 100
    output_dim: int = 3584

    n_window_infer: int = 400
    conv_chunksize: int = 500
    downsample_hidden_size: int = 480


@auto_docstring(checkpoint="Qwen/Qwen3-30B-A3B-Base")
@strict
class Qwen3OmniMoeVisionEncoderConfig(PreTrainedConfig):
    r"""
    out_hidden_size (`int`, *optional*, defaults to 3584):
        The output hidden size of the vision model.
    num_position_embeddings (`int`, *optional*, defaults to 2304):
        The maximum sequence length that this model might ever be used with
    deepstack_visual_indexes (`list[int]`, *optional*, defaults to `[8, 16, 24]`):
        Indices of the layers used for deepstack embeddings.
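
    Example:

    ```python
    >>> from transformers import Qwen3OmniMoeVisionEncoderConfig

    >>> # Initializing a default Qwen3OmniMoeVisionEncoderConfig
    >>> configuration = Qwen3OmniMoeVisionEncoderConfig()

    >>> # Accessing a configuration value
    >>> configuration.out_hidden_size
    3584
    ```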
    """

    model_type = "qwen3_omni_moe_vision_encoder"
    base_config_key = "vision_config"

    depth: int = 27
    hidden_size: int = 1152
    hidden_act: str = "gelu_pytorch_tanh"
    intermediate_size: int = 4304
    num_heads: int = 16
    in_channels: int = 3
    patch_size: int | list[int] | tuple[int, int] = 16
    spatial_merge_size: int = 2
    temporal_patch_size: int | list[int] | tuple[int, int] = 2
    out_hidden_size: int = 3584
    num_position_embeddings: int = 2304
    deepstack_visual_indexes: list[int] | tuple[int, ...] = (8, 16, 24)
    initializer_range: float = 0.02


@auto_docstring(checkpoint="Qwen/Qwen3-30B-A3B-Base")
@strict
class Qwen3OmniMoeTextConfig(PreTrainedConfig):
    r"""
    decoder_sparse_step (`int`, *optional*, defaults to 1):
        The frequency of the MoE layer.
    mlp_only_layers (`list[int]`, *optional*, defaults to `[]`):
        Indicates which layers use Qwen3OmniMoeTextMLP rather than Qwen3OmniMoeTextSparseMoeBlock.
        The list contains layer indices from 0 to num_layers-1 if we have num_layers layers.
        If `mlp_only_layers` is empty, `decoder_sparse_step` is used to determine the sparsity.

    Example:

    ```python
    >>> from transformers import Qwen3OmniMoeTextModel, Qwen3OmniMoeTextConfig

    >>> # Initializing a Qwen3OmniMoeText style configuration
    >>> configuration = Qwen3OmniMoeTextConfig()

    >>> # Initializing a model from the Qwen3-15B-A2B style configuration
    >>> model = Qwen3OmniMoeTextModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "qwen3_omni_moe_text"
    keys_to_ignore_at_inference = ["past_key_values"]
    default_theta = 1000000.0

    # Default tensor parallel plan for base model `Qwen3OmniMoeText`
    base_model_tp_plan = {
        "layers.*.self_attn.q_proj": "colwise",
        "layers.*.self_attn.k_proj": "colwise",
        "layers.*.self_attn.v_proj": "colwise",
        "layers.*.self_attn.o_proj": "rowwise",
        "layers.*.mlp.experts.gate_up_proj": "packed_colwise",
        "layers.*.mlp.experts.down_proj": "rowwise",
        "layers.*.mlp.gate_proj": "colwise",
        "layers.*.mlp.up_proj": "colwise",
        "layers.*.mlp.down_proj": "rowwise",
    }
    base_model_pp_plan = {
        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
        "norm": (["hidden_states"], ["hidden_states"]),
    }
    ignore_keys_at_rope_validation = {"mrope_section", "interleaved", "mrope_interleaved"}

    vocab_size: int = 3584
    hidden_size: int = 2048
    intermediate_size: int = 18944
    num_hidden_layers: int = 28
    num_attention_heads: int = 28
    num_key_value_heads: int = 4
    hidden_act: str = "silu"
    max_position_embeddings: int = 32768
    initializer_range: float = 0.02
    rms_norm_eps: float = 1e-6
    use_cache: bool = True
    rope_parameters: RopeParameters | dict | None = None
    attention_bias: bool = False
    sliding_window: int | None = None
    attention_dropout: float | int = 0.0
    decoder_sparse_step: int = 1
    moe_intermediate_size: int = 768
    num_experts_per_tok: int = 8
    num_experts: int = 128
    norm_topk_prob: bool = True
    output_router_logits: bool = False
    router_aux_loss_coef: float = 0.001
    mlp_only_layers: list[int] | None = None
    pad_token_id: int | None = None
    bos_token_id: int | None = None
    eos_token_id: int | list[int] | None = None

    def __post_init__(self, **kwargs):
        self.mlp_only_layers = [] if self.mlp_only_layers is None else self.mlp_only_layers

        super().__post_init__(**kwargs)


@auto_docstring(checkpoint="Qwen/Qwen3-30B-A3B-Base")
@strict
class Qwen3OmniMoeThinkerConfig(PreTrainedConfig):
    r"""
    position_id_per_seconds (`int`, *optional*, defaults to 25):
        The increment of position id per second.
    audio_start_token_id (`int`, *optional*, defaults to 151647):
        The audio start token id to encode the audio prompt.
    user_token_id (`int`, *optional*, defaults to 872):
        The user token id to encode the user token.

    Example:

    ```python
    >>> from transformers import Qwen3OmniMoeThinkerModel, Qwen3OmniMoeThinkerConfig

    >>> # Initializing a default Qwen3OmniMoeThinkerConfig
    >>> configuration = Qwen3OmniMoeThinkerConfig()

    >>> # Initializing a model (with random weights) from the default configuration
    >>> model = Qwen3OmniMoeThinkerModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "qwen3_omni_moe_thinker"
    # Override parent's attribute_map as we use audio_token_id directly, not audio_token_index
    attribute_map = {}
    sub_configs = {
        "audio_config": Qwen3OmniMoeAudioEncoderConfig,
        "vision_config": Qwen3OmniMoeVisionEncoderConfig,
        "text_config": Qwen3OmniMoeTextConfig,
    }

    audio_config: dict | PreTrainedConfig | None = None
    vision_config: dict | PreTrainedConfig | None = None
    text_config: dict | PreTrainedConfig | None = None
    position_id_per_seconds: int = 25
    audio_start_token_id: int = 151647
    user_token_id: int = 872
    initializer_range: float = 0.02
    tie_word_embeddings: bool = False

    audio_token_id: int = 151646
    image_token_id: int = 151655
    video_token_id: int = 151656

    def __post_init__(self, **kwargs):
        if isinstance(self.vision_config, dict):
            self.vision_config = Qwen3OmniMoeVisionEncoderConfig(**self.vision_config)
        elif self.vision_config is None:
            self.vision_config = Qwen3OmniMoeVisionEncoderConfig()

        if isinstance(self.audio_config, dict):
            self.audio_config = Qwen3OmniMoeAudioEncoderConfig(**self.audio_config)
        elif self.audio_config is None:
            self.audio_config = Qwen3OmniMoeAudioEncoderConfig()

        if isinstance(self.text_config, dict):
            self.text_config = Qwen3OmniMoeTextConfig(**self.text_config)
        elif self.text_config is None:
            self.text_config = Qwen3OmniMoeTextConfig()

        super().__post_init__(**kwargs)


@auto_docstring(checkpoint="Qwen/Qwen3OmniMoeTalkerCodePredictor-8B")
@strict
class Qwen3OmniMoeTalkerCodePredictorConfig(PreTrainedConfig):
    r"""
    num_code_groups (`int`, *optional*, defaults to 32):
        Number of codebook groups used in the predicted acoustic token sequence, corresponding to multi-codebook VQ representation.
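
    Example:

    ```python
    >>> from transformers import Qwen3OmniMoeTalkerCodePredictorConfig

    >>> # Initializing a default Qwen3OmniMoeTalkerCodePredictorConfig
    >>> configuration = Qwen3OmniMoeTalkerCodePredictorConfig()

    >>> # Accessing a configuration value
    >>> configuration.num_code_groups
    32
    ```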
    """

    model_type = "qwen3_omni_moe_talker_code_predictor"
    keys_to_ignore_at_inference = ["past_key_values"]

    # Default tensor parallel plan for base model `Qwen3OmniMoeTalkerCodePredictor`
    base_model_tp_plan = {
        "layers.*.self_attn.q_proj": "colwise",
        "layers.*.self_attn.k_proj": "colwise",
        "layers.*.self_attn.v_proj": "colwise",
        "layers.*.self_attn.q_norm": "replicated_with_grad_allreduce",
        "layers.*.self_attn.k_norm": "replicated_with_grad_allreduce",
        "layers.*.self_attn.o_proj": "rowwise",
        "layers.*.mlp.gate_proj": "colwise",
        "layers.*.mlp.up_proj": "colwise",
        "layers.*.mlp.down_proj": "rowwise",
    }
    base_model_pp_plan = {
        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
        "norm": (["hidden_states"], ["hidden_states"]),
    }

    vocab_size: int = 2048
    hidden_size: int = 1024
    intermediate_size: int = 3072
    num_hidden_layers: int = 5
    num_attention_heads: int = 16
    num_key_value_heads: int = 8
    head_dim: int = 128
    hidden_act: str = "silu"
    max_position_embeddings: int = 32768
    initializer_range: float = 0.02
    rms_norm_eps: float = 1e-6
    use_cache: bool = True
    tie_word_embeddings: bool = False
    rope_parameters: RopeParameters | dict | None = None
    attention_bias: bool = False
    sliding_window: int | None = None
    max_window_layers: int = 28
    layer_types: list[str] | None = None
    attention_dropout: float | int = 0.0
    pad_token_id: int | None = None
    bos_token_id: int | None = None
    eos_token_id: int | list[int] | None = None
    num_code_groups: int = 32

    def __post_init__(self, **kwargs):
        if self.num_key_value_heads is None:
            self.num_key_value_heads = self.num_attention_heads

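        # When no explicit layer_types are given, layers beyond `max_window_layers` use
        # sliding-window attention (if a sliding window is set); the rest use full attention.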
        if self.layer_types is None:
            self.layer_types = [
                "sliding_attention"
                if self.sliding_window is not None and i >= self.max_window_layers
                else "full_attention"
                for i in range(self.num_hidden_layers)
            ]
        super().__post_init__(**kwargs)


@auto_docstring(checkpoint="Qwen/Qwen3-30B-A3B-Base")
@strict
class Qwen3OmniMoeTalkerTextConfig(PreTrainedConfig):
    r"""
    decoder_sparse_step (`int`, *optional*, defaults to 1):
        The frequency of the MoE layer.
    mlp_only_layers (`list[int]`, *optional*, defaults to `[]`):
        Indicates which layers use Qwen3OmniMoeTalkerTextMLP rather than Qwen3OmniMoeTalkerTextSparseMoeBlock.
        The list contains layer indices from 0 to num_layers-1 if we have num_layers layers.
        If `mlp_only_layers` is empty, `decoder_sparse_step` is used to determine the sparsity.

    Example:

    ```python
    >>> from transformers import Qwen3OmniMoeTalkerTextModel, Qwen3OmniMoeTalkerTextConfig

    >>> # Initializing a Qwen3OmniMoeTalkerText style configuration
    >>> configuration = Qwen3OmniMoeTalkerTextConfig()

    >>> # Initializing a model from the Qwen3-15B-A2B style configuration
    >>> model = Qwen3OmniMoeTalkerTextModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
    """

    model_type = "qwen3_omni_moe_talker_text"
    keys_to_ignore_at_inference = ["past_key_values"]

    attribute_map = {
        "num_experts": "num_local_experts",
    }

    # Default tensor parallel plan for base model `Qwen3OmniMoeTalkerText`
    base_model_tp_plan = {
        "layers.*.self_attn.q_proj": "colwise",
        "layers.*.self_attn.k_proj": "colwise",
        "layers.*.self_attn.v_proj": "colwise",
        "layers.*.self_attn.q_norm": "replicated_with_grad_allreduce",
        "layers.*.self_attn.k_norm": "replicated_with_grad_allreduce",
        "layers.*.self_attn.o_proj": "rowwise",
        "layers.*.mlp.experts.gate_up_proj": "packed_colwise",
        "layers.*.mlp.experts.down_proj": "rowwise",
        "layers.*.mlp.experts": "moe_tp_experts",
        "layers.*.mlp.gate_proj": "colwise",
        "layers.*.mlp.up_proj": "colwise",
        "layers.*.mlp.down_proj": "rowwise",
    }
    base_model_ep_plan = {
        "layers.*.mlp.gate": "ep_router",
        "layers.*.mlp.experts.gate_up_proj": "grouped_gemm",
        "layers.*.mlp.experts.down_proj": "grouped_gemm",
        "layers.*.mlp.experts": "moe_tp_experts",
    }
    base_model_pp_plan = {
        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
        "norm": (["hidden_states"], ["hidden_states"]),
    }

    vocab_size: int = 3072
    hidden_size: int = 1024
    intermediate_size: int = 2048
    num_hidden_layers: int = 20
    num_attention_heads: int = 16
    num_key_value_heads: int = 2
    hidden_act: str = "silu"
    max_position_embeddings: int = 32768
    initializer_range: float = 0.02
    rms_norm_eps: float = 1e-6
    use_cache: bool = True
    tie_word_embeddings: bool = False
    rope_parameters: RopeParameters | dict | None = None
    attention_bias: bool = False
    sliding_window: int | None = None
    attention_dropout: float | int = 0.0
    decoder_sparse_step: int = 1
    moe_intermediate_size: int = 384
    num_experts_per_tok: int = 8
    num_experts: int = 128
    norm_topk_prob: bool = False
    output_router_logits: bool = False
    router_aux_loss_coef: float = 0.001
    mlp_only_layers: list[int] | None = None
    pad_token_id: int | None = None
    bos_token_id: int | None = None
    eos_token_id: int | list[int] | None = None

    def __post_init__(self, **kwargs):
        self.mlp_only_layers = [] if self.mlp_only_layers is None else self.mlp_only_layers
        super().__post_init__(**kwargs)


@auto_docstring(checkpoint="Qwen/Qwen3-30B-A3B-Base")
@strict
class Qwen3OmniMoeTalkerConfig(PreTrainedConfig):
    r"""
    code_predictor_config (`dict`, *optional*):
        A dictionary of configuration parameters used to initialize a [`Qwen3OmniMoeTalkerCodePredictorConfig`].
        If not provided, defaults will be used.
    num_code_groups (`int`, *optional*, defaults to 32):
        Number of codebook groups used in the predicted acoustic token sequence, corresponding to multi-codebook VQ representation.
    thinker_hidden_size (`int`, *optional*, defaults to 2048):
        Hidden dimension size of the thinker module used for intermediate reasoning or latent planning before audio generation.
    codec_eos_token_id (`int`, *optional*, defaults to 4198):
        Token ID representing the end-of-speech token in the codec-generated sequence.
    accept_hidden_layer (`int`, *optional*, defaults to 18):
        Index of the hidden layer whose output is used for accepting or refining generated tokens during think-and-speak process.
    codec_nothink_id (`int`, *optional*, defaults to 4203):
        Token ID indicating no thinking step is required during generation.
    codec_think_bos_id (`int`, *optional*, defaults to 4204):
        Token ID marking the beginning of a thinking sequence.
    codec_think_eos_id (`int`, *optional*, defaults to 4205):
        Token ID marking the end of a thinking sequence.
    codec_pad_id (`int`, *optional*, defaults to 4196):
        Padding token ID used in codec input sequences.
    codec_bos_id (`int`, *optional*, defaults to 4197):
        Beginning-of-speech token ID in codec sequences.
    position_id_per_seconds (`int`, *optional*, defaults to 25):
        Number of position IDs allocated per second of audio content, used for temporal alignment in generation.
    audio_start_token_id (`int`, *optional*, defaults to 151669):
        Token ID that indicates the start of an audio generation segment in the output.
    speaker_id (`dict`, *optional*):
        Speaker name to speaker id dict.

    Example:

    ```python
    >>> from transformers import Qwen3OmniMoeTalkerConfig, Qwen3OmniMoeTalker

    >>> # Initialize a Qwen3OmniMoeTalkerConfig with default sub-configurations
    >>> config = Qwen3OmniMoeTalkerConfig(
    ...     num_code_groups=32,
    ...     thinker_hidden_size=2048,
    ... )

    >>> # Initialize the full Qwen3-Omni Talker model
    >>> model = Qwen3OmniMoeTalker(config)

    >>> # Access the model configuration
    >>> config = model.config
    >>> print(config.text_config)  # Access text decoder configuration
    >>> print(config.code_predictor_config)  # Access code predictor configuration
    ```"""

    sub_configs = {
        "code_predictor_config": Qwen3OmniMoeTalkerCodePredictorConfig,
        "text_config": Qwen3OmniMoeTalkerTextConfig,
    }

    code_predictor_config: dict | PreTrainedConfig | None = None
    text_config: dict | PreTrainedConfig | None = None
    num_code_groups: int = 32
    thinker_hidden_size: int = 2048
    codec_eos_token_id: int = 4198
    accept_hidden_layer: int = 18
    codec_nothink_id: int = 4203
    codec_think_bos_id: int = 4204
    codec_think_eos_id: int = 4205
    codec_pad_id: int = 4196
    codec_bos_id: int = 4197
    audio_token_id: int = 151646
    image_token_id: int = 151655
    video_token_id: int = 151656
    vision_start_token_id: int = 151652
    position_id_per_seconds: int = 25
    audio_start_token_id: int = 151669
    speaker_id: dict | None = None
    initializer_range: float = 0.02
    tie_word_embeddings: bool = False

    def __post_init__(self, **kwargs):
        # Build sub-configs from dicts, falling back to defaults when not provided
        if self.code_predictor_config is None:
            self.code_predictor_config = Qwen3OmniMoeTalkerCodePredictorConfig()
            logger.info("code_predictor_config is None. Initializing code_predictor_config model with default values")
        elif isinstance(self.code_predictor_config, dict):
            self.code_predictor_config = Qwen3OmniMoeTalkerCodePredictorConfig(**self.code_predictor_config)

        if self.text_config is None:
            self.text_config = Qwen3OmniMoeTalkerTextConfig()
            logger.info("talker text_config is None. Initializing talker text model with default values")
        elif isinstance(self.text_config, dict):
            self.text_config = Qwen3OmniMoeTalkerTextConfig(**self.text_config)
        super().__post_init__(**kwargs)


@auto_docstring(checkpoint="Qwen/Qwen3-30B-A3B-Base")
@strict
class Qwen3OmniMoeCode2WavConfig(PreTrainedConfig):
    r"""
    num_quantizers (`int`, *optional*, defaults to 16):
        Number of residual vector quantizers used in the vocoder for fine-grained audio reconstruction.
    upsample_rates (`tuple[int]`, *optional*, defaults to `(8, 5, 4, 3)`):
        Rates at which features are upsampled in the final waveform synthesis stage.
    upsampling_ratios (`tuple[int]`, *optional*, defaults to `(2, 2)`):
        Ratios used in transposed convolutional layers to progressively upsample feature maps to the waveform.
    decoder_dim (`int`, *optional*, defaults to 1536):
        Final dimensionality of the decoder's output before waveform generation.

    Example:

    ```python
    >>> from transformers import Qwen3OmniMoeCode2WavConfig, Qwen3OmniMoeCode2WavModel

    >>> # Initializing a default Qwen3OmniMoeCode2WavConfig
    >>> config = Qwen3OmniMoeCode2WavConfig()

    >>> # Initializing the Code2Wav model with the configuration
    >>> model = Qwen3OmniMoeCode2WavModel(config)

    >>> # Accessing configuration
    >>> config = model.config
    ```"""

    codebook_size: int = 2048
    hidden_size: int = 1024
    max_position_embeddings: int = 8000
    rope_parameters: RopeParameters | dict | None = None
    num_attention_heads: int = 16
    num_key_value_heads: int = 16
    attention_bias: bool = False
    sliding_window: int = 72
    intermediate_size: int = 3072
    hidden_act: str = "silu"
    layer_scale_initial_scale: float = 0.01
    rms_norm_eps: float = 1e-5
    num_hidden_layers: int = 8
    num_quantizers: int = 16
    upsample_rates: list[int] | tuple[int, ...] = (8, 5, 4, 3)
    upsampling_ratios: list[int] | tuple[int, ...] = (2, 2)
    decoder_dim: int = 1536
    attention_dropout: float | int = 0.0
    initializer_range: float = 0.02

    @property
    def layer_types(self):
        """
        All layers in Code2Wav use sliding-window attention.
        """
        return ["sliding_attention"] * self.num_hidden_layers


@auto_docstring(checkpoint="Qwen/Qwen3-30B-A3B-Base")
@strict
class Qwen3OmniMoeConfig(PreTrainedConfig):
    r"""
    thinker_config (`dict`, *optional*):
        Configuration of the underlying thinker sub-model.
    talker_config (`dict`, *optional*):
        Configuration of the underlying talker sub-model.
    code2wav_config (`dict`, *optional*):
        Configuration of the underlying code2wav sub-model.
    enable_audio_output (`bool`, *optional*, defaults to `True`):
        Whether to enable audio output and load the talker and code2wav modules.
    im_start_token_id (`int`, *optional*, defaults to 151644):
        Token id marking the start of a chat message (`<|im_start|>`)
    im_end_token_id (`int`, *optional*, defaults to 151645):
        Token id marking the end of a chat message (`<|im_end|>`)
    tts_pad_token_id (`int`, *optional*, defaults to 151671):
        Token id for the padding in TTS
    tts_bos_token_id (`int`, *optional*, defaults to 151672):
        Token id for the start of sequence in TTS
    tts_eos_token_id (`int`, *optional*, defaults to 151673):
        Token id for the end of sequence in TTS
    system_token_id (`int`, *optional*, defaults to 8948):
        Token id for the system prompt
    user_token_id (`int`, *optional*, defaults to 872):
        Token id for the user prompt
    assistant_token_id (`int`, *optional*, defaults to 77091):
        Token id for the assistant prompt

    Example:

    ```python
    >>> from transformers import (
    ...     Qwen3OmniMoeThinkerConfig,
    ...     Qwen3OmniMoeTalkerConfig,
    ...     Qwen3OmniMoeCode2WavConfig,
    ...     Qwen3OmniMoeForConditionalGeneration,
    ...     Qwen3OmniMoeConfig,
    ... )

    >>> # Initializing a Qwen3OmniMoe style configuration
    >>> configuration = Qwen3OmniMoeConfig()

    >>> # Initializing a model from the configuration
    >>> model = Qwen3OmniMoeForConditionalGeneration(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "qwen3_omni_moe"
    sub_configs = {
        "thinker_config": Qwen3OmniMoeThinkerConfig,
        "talker_config": Qwen3OmniMoeTalkerConfig,
        "code2wav_config": Qwen3OmniMoeCode2WavConfig,
    }

    thinker_config: dict | PreTrainedConfig | None = None
    talker_config: dict | PreTrainedConfig | None = None
    code2wav_config: dict | PreTrainedConfig | None = None
    enable_audio_output: bool = True
    im_start_token_id: int = 151644
    im_end_token_id: int = 151645
    tts_pad_token_id: int = 151671
    tts_bos_token_id: int = 151672
    tts_eos_token_id: int = 151673
    system_token_id: int = 8948
    user_token_id: int = 872
    assistant_token_id: int = 77091
    initializer_range: float | None = None

    def __post_init__(self, **kwargs):
        if self.thinker_config is None:
            self.thinker_config = Qwen3OmniMoeThinkerConfig()
            logger.info("thinker_config is None. Initializing thinker model with default values")
        elif isinstance(self.thinker_config, dict):
            self.thinker_config = Qwen3OmniMoeThinkerConfig(**self.thinker_config)

        if self.talker_config is None:
            self.talker_config = Qwen3OmniMoeTalkerConfig()
            logger.info("talker_config is None. Initializing talker model with default values")
        elif isinstance(self.talker_config, dict):
            self.talker_config = Qwen3OmniMoeTalkerConfig(**self.talker_config)

        if self.code2wav_config is None:
            self.code2wav_config = Qwen3OmniMoeCode2WavConfig()
            logger.info("code2wav_config is None. Initializing code2wav_config model with default values")
        elif isinstance(self.code2wav_config, dict):
            self.code2wav_config = Qwen3OmniMoeCode2WavConfig(**self.code2wav_config)

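        # Inherit the thinker's initializer_range when none is set on the top-level config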
        if self.initializer_range is None:
            self.initializer_range = self.thinker_config.initializer_range

        super().__post_init__(**kwargs)

    def get_text_config(self, decoder=False) -> "PreTrainedConfig":
        """
        Returns the config that is meant to be used with text IO. On most models, it is the original config instance
        itself. On specific composite models, it is under a set of valid names.

        Args:
            decoder (`Optional[bool]`, *optional*, defaults to `False`):
                If set to `True`, then only search for decoder config names.
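
        Example:

        ```python
        >>> config = Qwen3OmniMoeConfig()
        >>> text_config = config.get_text_config()  # the thinker's text sub-config
        ```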
        """
        # Overridden for deeply nested config like Qwen2-Omni. We don't have any omni model
        # except for Qwen yet. This has to be generalized if more deeply nested configs are
        # added. NOTE: currently method used only by vLLM
        return self.thinker_config.get_text_config()


__all__ = [
    "Qwen3OmniMoeConfig",
    "Qwen3OmniMoeThinkerConfig",
    "Qwen3OmniMoeTalkerConfig",
    "Qwen3OmniMoeAudioEncoderConfig",
    "Qwen3OmniMoeTalkerCodePredictorConfig",
    "Qwen3OmniMoeTalkerTextConfig",
    "Qwen3OmniMoeTextConfig",
    "Qwen3OmniMoeVisionEncoderConfig",
]
