
    Z j<                     <   S SK JrJr  S SKJr  SSKJr  SSKJrJ	r	  SSK
Jr  \	R                  " \5      r\" SS9\ " S	 S
\5      5       5       r\" SS9\ " S S\5      5       5       r\" SS9\ " S S\5      5       5       r\" SS9\ " S S\5      5       5       r/ SQrg)    )AnyLiteral)strict   )PreTrainedConfig)auto_docstringlogging)intervalzgoogle/gemma-4-e2b-it)
checkpointc                   H  ^  \ rS rSr% SrSrSr\\S'   Sr	\\S'   Sr
\\S	'   S
r\\S'   Sr\\   \\\4   -  \S'   Sr\\S'   Sr\\S'   Sr\\S'   Sr\\S'   Sr\\S'   Sr\\S'   Sr\\S'   Sr\\S'   Sr\\S'   Sr\\S '   S!r\\S"'   \" S#S$S%9" S&S'9r\\S('   U 4S) jrS*r U =r!$ )+Gemma4AudioConfig   a  
subsampling_conv_channels (`list[int]`, defaults to `[128, 32]`):
    Channel sizes for the convolutional layers in the Sub-sample Convolution Projection.
residual_weight (`float`, defaults to `0.5`):
    Scaling applied to hidden_states prior to combining with the residual in the feedforward.
attention_chunk_size (`int`, defaults to `12`):
    The sub-sequence size for attention processing.
attention_context_left (`int`, defaults to `13`):
    The leftward context size for the attention chunk.
attention_context_right (`int`, defaults to `0`):
    The rightward context size for the attention chunk.
attention_logit_cap (`float`, defaults to `50.0`):
    Cap applied to attention weights.
attention_invalid_logits_value (`float`, defaults to `1e-9`):
    Value to use for invalid logits in attention.
use_clipped_linears (`bool`, defaults to `True`):
    If true, apply clipping to the Linear layers, drawing bounds from the model checkpoint.
gradient_clipping (`float`, defaults to `1e10`):
    Clipping value used to stabilize extremely large gradient values.
output_proj_dims (`int`, defaults to `1536`):
    Dimension of the final linear projection from `hidden_size` to the model's output.
gemma4_audioi   hidden_size   num_hidden_layers   num_attention_headssilu
hidden_act)       subsampling_conv_channels   conv_kernel_sizeg      ?residual_weightattention_chunk_size   attention_context_leftr   attention_context_rightg      I@attention_logit_capg    eattention_invalid_logits_valueTuse_clipped_linearsư>rms_norm_epsg    _Bgradient_clippingi   output_proj_dims        g      ?)minmax{Gz?)defaultinitializer_rangec                    > [        U R                  [        5      (       a  [        U R                  5      U l        [        TU ]  " S0 UD6  g )N )
isinstancer   tuplelistsuper__post_init__selfkwargs	__class__s     ڀ/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/gemma4/configuration_gemma4.pyr4   Gemma4AudioConfig.__post_init__N   s9    d44e<<-1$2P2P-QD*''    )r   )"__name__
__module____qualname____firstlineno____doc__
model_typer   int__annotations__r   r   r   strr   r2   r1   r   r   floatr   r   r    r!   r"   r#   boolr%   r&   r'   r
   r-   r4   __static_attributes____classcell__r8   s   @r9   r   r      s    .  JKs  J >GtCy5c?:F c OU  "#""$C$#$S$!%%,2"E2 $$L%#u# c 'CS9$GuG( (r;   r   c                     ^  \ rS rSr% SrSrS/rSSSSSSSSSSSS	S
.rSSSSSSS	S.rS/S/4SS/S/4S/S/4S.r	Sr
\\S'   Sr\\S'   Sr\\S'   Sr\\S'   Sr\\S'   Sr\\S'   Sr\\S '   S!r\\S"'   S#r\\S$'   S%r\\S&'   S'r\\S('   S)r\\S*'   S+r\S,-  \S-'   S.r\\\   -  S,-  \S/'   S0r\S,-  \S1'   S)r\\S2'   S,r \!S,-  \S3'   S4r"\\S5'   S6r#\\-  S,-  \S7'   S8r$\\S9'   S,r%\\   S,-  \S:'   S,r&\S,-  \S;'   S,r'\(S<   S,-  \S='   Sr)\\S>'   Sr*\\S?'   S,r+\S,-  \S@'   S8r,\\SA'   S4r-\\SB'   S+r.\\SC'   S4r/\\SD'   S4r0\\SE'   S,r1\S,-  \SF'   S,r2\S,-  \SG'   S,r3\S,-  \SH'   U 4SI jr4SJ r5SKr6U =r7$ )LGemma4TextConfigU   a  
use_bidirectional_attention (`str`, *optional*):
    Controls bidirectional attention behavior. When set to `"vision"`, vision tokens
    attend bidirectionally while text tokens use causal attention. When set to `"all"`,
    all tokens use bidirectional attention.
vocab_size_per_layer_input (`int`, defaults to 262144):
    Vocabulary size for the per-layer input embeddings (PLE). Used by models with
    per-layer residual streams where a smaller embedding is added at each decoder layer.
hidden_size_per_layer_input (`int`, defaults to 256):
    Per-layer hidden dimension for the PLE system. The actual embedding weight has shape
    `[vocab_size_per_layer_input, num_hidden_layers * hidden_size_per_layer_input]`
    because all layers are packed into a single table. See the [Gemma4](https://huggingface.co/docs/transformers/main/en/model_doc/gemma4#per-layer-embeddings-ple) docs
    for a description of the full PLE pipeline.
num_global_key_value_heads (`int`, *optional*):
    Number of key-value heads for global (full) attention layers. If `None`, defaults
    to `num_key_value_heads`.
global_head_dim (`int`, defaults to 512):
    Dimension of each attention head in global (full) attention layers.
attention_k_eq_v (`bool`, defaults to `False`):
    Whether keys and values share the same projection weights. When `True`, the key
    projection output is reused as the value projection.
num_kv_shared_layers (`int`, defaults to 0):
    Number of consecutive decoder layers that share the same key-value projections.
    A value of 0 means no sharing (each layer has independent KV projections).
enable_moe_block (`bool`, defaults to `False`):
    Whether to enable Mixture-of-Experts (MoE) blocks in the decoder layers. When
    `True`, eligible layers will use a sparse MoE feed-forward network.
use_double_wide_mlp (`bool`, defaults to `False`):
    Whether to use a double-width MLP with fused gate and up projections.
top_k_experts (`int`, *optional*):
    Number of experts activated per token in MoE layers. Only used when
    `enable_moe_block=True`.
moe_intermediate_size (`int`, *optional*):
    Intermediate (hidden) size of each expert's feed-forward network in MoE layers.
    Only used when `enable_moe_block=True`.
gemma4_textpast_key_valuescolwisereplicated_with_grad_allreducerowwisepacked_colwisemoe_tp_experts)zlayers.*.self_attn.q_projzlayers.*.self_attn.k_projzlayers.*.self_attn.v_projzlayers.*.self_attn.q_normzlayers.*.self_attn.k_normzlayers.*.self_attn.o_projlayers.*.mlp.gate_projlayers.*.mlp.up_projlayers.*.mlp.down_projlayers.*.experts.gate_up_projlayers.*.experts.down_projlayers.*.experts	ep_routergrouped_gemm)rT   rU   rV   zlayers.*.routerrW   rX   rY   	input_idsinputs_embedshidden_statesattention_mask)embed_tokenslayersnormi   
vocab_sizei 	  r   i $  intermediate_size   r   r   r      num_key_value_heads   head_dimgelu_pytorch_tanhhidden_activation   max_position_embeddingsr+   r-   r$   r%   T	use_cacher   Npad_token_id   eos_token_id   bos_token_idtie_word_embeddingsrope_parametersFattention_biasr(   attention_dropouti   sliding_windowlayer_typesfinal_logit_softcapping)allvisionuse_bidirectional_attentionvocab_size_per_layer_inputhidden_size_per_layer_inputnum_global_key_value_headsglobal_head_dimattention_k_eq_vnum_kv_shared_layersenable_moe_blockuse_double_wide_mlpnum_expertstop_k_expertsmoe_intermediate_sizec                   > U R                   S:X  a  U R                  S-  S-   U l        U R                  cE  Sn[        U R                  5       Vs/ s H  n[        US-   U-  5      (       a  SOSPM     snU l        U R                  (       a=  U R                  S   =nS:w  a(  [        R                  SU S	35        SU R                  S'   S
SS.SSSS.S.nU R                  c  XPl        [        TU ](  " S0 UD6  g s  snf )Nr{   rr   rp      sliding_attentionfull_attentionz/Last layer must use `full_attention`, but got `z*`. Forcing last layer to `full_attention`.r,   g     @	rope_type
rope_thetaproportionalg      ?g    .A)r   partial_rotary_factorr   )r   r   r/   )r}   rx   ry   ranger   rF   loggerwarningru   r3   r4   )r6   r7   sliding_window_patternilast_layer_typedefault_rope_paramsr8   s         r9   r4   Gemma4TextConfig.__post_init__   s   ++u4#'#6#6!#;q"@D#%&" t556 6A (,QU6L,L'M'M#Scc6 D
 D4D4DR4H!HM] ]NNA/ARR|} $4DR  09!Q,:UYituf
 '#6 ''% s   $C9c                     U$ )Nr/   )r6   r7   s     r9   convert_rope_params_to_dict,Gemma4TextConfig.convert_rope_params_to_dict   s    r;   )ry   ru   rx   )8r<   r=   r>   r?   r@   rA   keys_to_ignore_at_inferencebase_model_tp_planbase_model_ep_planbase_model_pp_planrc   rB   rC   r   rd   r   r   rg   ri   rk   rD   rm   r-   rE   r%   rn   rF   ro   rq   r2   rs   rt   ru   dictrv   rw   rx   ry   rz   r}   r   r~   r   r   r   r   r   r   r   r   r   r   r4   r   rG   rH   rI   s   @r9   rK   rK   U   s[   #J J#4"5%.%.%.%E%E%."+ )"+)9&/,  #, )"+&)7&4,	 &(9:#%568IJ!"_$56 JK!s!s    Hc0s0#*S*#u#L%It L#* +,L#S	/D(, L#*  $$#'OTD[' ND ,/sU{T)/NC$(KcT!(,0UT\0CG!9D!@G&--'**-1d
1OS"d" !#!"d" %%"Kt" $M3:$(,3:,(4 r;   rK   c            
       \  ^  \ rS rSr% SrSrSSSSSSSSSS.	rSrS	r\	\
S
'   Sr\	\
S'   Sr\	\
S'   Sr\	\
S'   Sr\	\
S'   Sr\	\
S'   Sr\\
S'   Sr\\
S'   Sr\	\
S'   Sr\S-  \
S'   Sr\S-  \
S'   Sr\S-  \
S'   S r\	\
S!'   Sr\	\
S"'   S#r\	\
S$'   Sr\\
S%'   Sr\\
S&'   S'r\\
S('   U 4S) jr S*r!U =r"$ )+Gemma4VisionConfig   a  
pooling_kernel_size (`int`, *optional*):
    Spatial pooling kernel size applied after patchification.
position_embedding_size (`int`, defaults to 10240):
    Maximum number of position embeddings for the vision encoder. Controls the size of
    the learned 2D position embedding table used by the patch embedder.
use_clipped_linears (`bool`, defaults to `False`):
    Whether to use weight-clipped linear layers. When enabled, linear layer weights are
    clamped to a fixed range during the forward pass to improve numerical stability.
standardize (`bool`, defaults to `False`):
    If true, applies a bias and scale to the soft tokens returned from the pooler.
gemma4_visionrO   rP   rQ   )	z!encoder.layers.*.self_attn.q_projz!encoder.layers.*.self_attn.k_projz!encoder.layers.*.self_attn.v_projz!encoder.layers.*.self_attn.q_normz!encoder.layers.*.self_attn.k_normz!encoder.layers.*.self_attn.o_projzencoder.layers.*.mlp.gate_projzencoder.layers.*.mlp.up_projzencoder.layers.*.mlp.down_proj      Y@i   r   i   rd      r   r   r   rg   @   ri   rj   rk   r$   r%   rl   rm   FNrv   r(   rw   ru   r   pooling_kernel_size
patch_sizei (  position_embedding_sizer#   standardizer+   r-   c                 T   > U R                   c
  SSS.U l         [        TU ]  " S0 UD6  g )Nr,   r   r   r/   )ru   r3   r4   r5   s     r9   r4    Gemma4VisionConfig.__post_init__  s,    '1:%#PD ''r;   )ru   )#r<   r=   r>   r?   r@   rA   r   default_thetar   rB   rC   rd   r   r   rg   ri   rk   rD   r%   rE   rm   rv   rF   rw   ru   r   r   r   r   r#   r   r-   r4   rG   rH   rI   s   @r9   r   r      s    !J-6-6-6-M-M-6*3(1*3
 MK!s!s!!!!Hc0s0L%#*S*"'ND4K'&)ut|)#'OTD['  J#,S, %%K#u#( (r;   r   c                   X  ^  \ rS rSr% SrSr\\\S.r	Sr
\\\\4   -  S-  \S'   Sr\\\\4   -  S-  \S'   Sr\\\\4   -  S-  \S'   S	r\S-  \S
'   Sr\S-  \S'   Sr\S-  \S'   Sr\S-  \S'   Sr\S-  \S'   Sr\S-  \S'   Sr\S-  \S'   Sr\S-  \S'   Sr\\S'   U 4S jrSrU =r$ )Gemma4Configi  a  
boi_token_id (`int`, *optional*, defaults to 255999):
    The begin-of-image token index to wrap the image prompt.
eoi_token_id (`int`, *optional*, defaults to 258882):
    The end-of-image token index to wrap the image prompt.
boa_token_id (`int`, *optional*, defaults to 256000):
    The begin-of-audio token index to wrap the audio prompt.
eoa_token_index (`int`, *optional*, defaults to 258883):
    The end-of-audio token index to wrap the audio prompt.

Example:

```python
>>> from transformers import (
>>>     Gemma4AudioConfig,
>>>     Gemma4Config,
>>>     Gemma4ForConditionalGeneration,
>>>     Gemma4TextConfig,
>>>     Gemma4VisionConfig,
>>> )

>>> # Initializing a Gemma 4 Audio config.
>>> audio_config = Gemma4AudioConfig()

>>> # Initializing a Gemma 4 Text config.
>>> text_config = Gemma4TextConfig()

>>> # Initializing a Gemma 4 vision config.
>>> vision_config = Gemma4VisionConfig()

>>> # Initializing a Gemma 4 config similar to google/gemma-4-e2b-it
>>> configuration = Gemma4Config(text_config, vision_config, audio_config)

>>> # Initializing a model from the google/gemma-4-e2b-it configuration
>>> model = Gemma4ForConditionalGeneration(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
```gemma4)text_configvision_configaudio_configNr   r   r   i boi_token_idiB eoi_token_idi@ image_token_idiD video_token_idi  boa_token_idiC eoa_token_indexiA audio_token_idr+   r-   Trt   c                 h  > U R                   c%  [        5       U l         [        R                  S5        O9[	        U R                   [
        5      (       a  [        S0 U R                   D6U l         U R                  c  [        R                  S5        [	        U R                  [
        5      (       a  [        S0 U R                  D6U l        U R                  c  [        R                  S5        [	        U R                  [
        5      (       a  [        S0 U R                  D6U l        [        TU ],  " S0 UD6  g )Nz4text_config is None. Using default Gemma4TextConfig.zHvision_config is None. Gemma4Model.vision_tower will not be initialized.zFaudio_config is None. Gemma4Model.audio_tower will not be initialized.r/   )r   rK   r   infor0   r   r   r   r   r   r3   r4   r5   s     r9   r4   Gemma4Config.__post_init__V  s    #/1DKKNO(($///C$2B2BCD%KKbcd(($//!3!Id6H6H!ID$KK`ad''.. 1 FD4E4E FD''r;   )r   r   r   ) r<   r=   r>   r?   r@   rA   rK   r   r   sub_configsr   r   rD   r   rC   r   r   r   rB   r   r   r   r   r   r   r-   rE   rt   rF   r4   rG   rH   rI   s   @r9   r   r     s    &P J'+)K =AK!DcN2T9@@DM%S#X6=D>BL#d38n4t;B&L#*&&L#*&!(NC$J(!(NC$J(&L#*&")OS4Z)!(NC$J(&*ut|* $$( (r;   r   )r   r   rK   r   N)typingr   r   huggingface_hub.dataclassesr   configuration_utilsr   utilsr   r	   utils.type_validatorsr
   
get_loggerr<   r   r   rK   r   r   __all__r/   r;   r9   <module>r      s      . 3 , - 
		H	% 235(( 5(  45(p 23E' E  4EP 233() 3(  43(l 23N(# N(  4N(b Zr;   