ó
    Z– jž<  ã                   ó<  • S SK JrJr  S SKJr  SSKJr  SSKJrJ	r	  SSK
Jr  \	R                  " \5      r\" SS9\ " S	 S
\5      5       5       r\" SS9\ " S S\5      5       5       r\" SS9\ " S S\5      5       5       r\" SS9\ " S S\5      5       5       r/ SQrg)é    )ÚAnyÚLiteral)Ústricté   )ÚPreTrainedConfig)Úauto_docstringÚlogging)Úintervalzgoogle/gemma-4-e2b-it)Ú
checkpointc                   óH  ^ • \ rS rSr% SrSrSr\\S'   Sr	\\S'   Sr
\\S	'   S
r\\S'   Sr\\   \\\4   -  \S'   Sr\\S'   Sr\\S'   Sr\\S'   Sr\\S'   Sr\\S'   Sr\\S'   Sr\\S'   Sr\\S'   Sr\\S'   Sr\\S '   S!r\\S"'   \" S#S$S%9" S&S'9r\\S('   U 4S) jrS*r U =r!$ )+ÚGemma4AudioConfigé   a²  
subsampling_conv_channels (`list[int]`, defaults to `[128, 32]`):
    Channel sizes for the convolutional layers in the Sub-sample Convolution Projection.
residual_weight (`float`, defaults to `0.5`):
    Scaling applied to hidden_states prior to combining with the residual in the feedforward.
attention_chunk_size (`int`, defaults to `12`):
    The sub-sequence size for attention processing.
attention_context_left (`int`, defaults to `13`):
    The leftward context size for the attention chunk.
attention_context_right (`int`, defaults to `0`):
    The rightward context size for the attention chunk.
attention_logit_cap (`float`, defaults to `50.0`):
    Cap applied to attention weights.
attention_invalid_logits_value (`float`, defaults to `1e-9`):
    Value to use for invalid logits in attention.
use_clipped_linears (`bool`, defaults to `True`):
    If true, apply clipping to the Linear layers, drawing bounds from the model checkpoint.
gradient_clipping (`float`, defaults to `1e10`):
    Clipping value used to stabilize extremely large gradient values.
output_proj_dims (`int`, defaults to `1536`):
    Dimension of the final linear projection from `hidden_size` to the model's output.
Úgemma4_audioi   Úhidden_sizeé   Únum_hidden_layersé   Únum_attention_headsÚsiluÚ
hidden_act)é€   é    Úsubsampling_conv_channelsé   Úconv_kernel_sizeg      à?Úresidual_weightÚattention_chunk_sizeé   Úattention_context_leftr   Úattention_context_rightg      I@Úattention_logit_capg    eÍÍÁÚattention_invalid_logits_valueTÚuse_clipped_linearsçíµ ÷Æ°>Úrms_norm_epsg    _ BÚgradient_clippingi   Úoutput_proj_dimsç        g      ð?)ÚminÚmaxç{®Gáz”?)ÚdefaultÚinitializer_rangec                 ó˜   >• [        U R                  [        5      (       a  [        U R                  5      U l        [        TU ]  " S0 UD6  g )N© )Ú
isinstancer   ÚtupleÚlistÚsuperÚ__post_init__©ÚselfÚkwargsÚ	__class__s     €Ú€/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/gemma4/configuration_gemma4.pyr4   ÚGemma4AudioConfig.__post_init__N   s9   ø€ äd×4Ñ4´e×<Ñ<Ü-1°$×2PÑ2PÓ-QˆDÔ*Ü‰ÒÑ' Ó'ó    )r   )"Ú__name__Ú
__module__Ú__qualname__Ú__firstlineno__Ú__doc__Ú
model_typer   ÚintÚ__annotations__r   r   r   Ústrr   r2   r1   r   r   Úfloatr   r   r    r!   r"   r#   Úboolr%   r&   r'   r
   r-   r4   Ú__static_attributes__Ú__classcell__©r8   s   @r9   r   r      sò   ø‡ ñð.  €Jà€KÓØÐsÓØ Ð˜Ó Ø€JÓð >GÐ˜t C™y¨5°°c°©?Ñ:ÓFð ÐcÓØ €OUÓ Ø "Ð˜#Ó"Ø"$Ð˜CÓ$Ø#$Ð˜SÓ$Ø!%Ð˜Ó%Ø,2Ð" EÓ2à $Ð˜Ó$Ø€L%ÓØ#ÐuÓ#Ø ÐcÓ Ù'¨C°SÒ9À$ÑGÐuÓG÷(ó (r;   r   c                   óØ  ^ • \ rS rSr% SrSrS/rSSSSSSSSSSSS	S
.rSSSSSSS	S.rS/S/4SS/S/4S/S/4S.r	Sr
\\S'   Sr\\S'   Sr\\S'   Sr\\S'   Sr\\S'   Sr\\S'   Sr\\S '   S!r\\S"'   S#r\\S$'   S%r\\S&'   S'r\\S('   S)r\\S*'   S+r\S,-  \S-'   S.r\\\   -  S,-  \S/'   S0r\S,-  \S1'   S)r\\S2'   S,r \!S,-  \S3'   S4r"\\S5'   S6r#\\-  S,-  \S7'   S8r$\\S9'   S,r%\\   S,-  \S:'   S,r&\S,-  \S;'   S,r'\(S<   S,-  \S='   Sr)\\S>'   Sr*\\S?'   S,r+\S,-  \S@'   S8r,\\SA'   S4r-\\SB'   S+r.\\SC'   S4r/\\SD'   S4r0\\SE'   S,r1\S,-  \SF'   S,r2\S,-  \SG'   S,r3\S,-  \SH'   U 4SI jr4SJ r5SKr6U =r7$ )LÚGemma4TextConfigéU   aÇ  
use_bidirectional_attention (`str`, *optional*):
    Controls bidirectional attention behavior. When set to `"vision"`, vision tokens
    attend bidirectionally while text tokens use causal attention. When set to `"all"`,
    all tokens use bidirectional attention.
vocab_size_per_layer_input (`int`, defaults to 262144):
    Vocabulary size for the per-layer input embeddings (PLE). Used by models with
    per-layer residual streams where a smaller embedding is added at each decoder layer.
hidden_size_per_layer_input (`int`, defaults to 256):
    Per-layer hidden dimension for the PLE system. The actual embedding weight has shape
    `[vocab_size_per_layer_input, num_hidden_layers * hidden_size_per_layer_input]`
    because all layers are packed into a single table. See the [Gemma4](https://huggingface.co/docs/transformers/main/en/model_doc/gemma4#per-layer-embeddings-ple) docs
    for a description of the full PLE pipeline.
num_global_key_value_heads (`int`, *optional*):
    Number of key-value heads for global (full) attention layers. If `None`, defaults
    to `num_key_value_heads`.
global_head_dim (`int`, defaults to 512):
    Dimension of each attention head in global (full) attention layers.
attention_k_eq_v (`bool`, defaults to `False`):
    Whether keys and values share the same projection weights. When `True`, the key
    projection output is reused as the value projection.
num_kv_shared_layers (`int`, defaults to 0):
    Number of consecutive decoder layers that share the same key-value projections.
    A value of 0 means no sharing (each layer has independent KV projections).
enable_moe_block (`bool`, defaults to `False`):
    Whether to enable Mixture-of-Experts (MoE) blocks in the decoder layers. When
    `True`, eligible layers will use a sparse MoE feed-forward network.
use_double_wide_mlp (`bool`, defaults to `False`):
    Whether to use a double-width MLP with fused gate and up projections.
top_k_experts (`int`, *optional*):
    Number of experts activated per token in MoE layers. Only used when
    `enable_moe_block=True`.
moe_intermediate_size (`int`, *optional*):
    Intermediate (hidden) size of each expert's feed-forward network in MoE layers.
    Only used when `enable_moe_block=True`.
Úgemma4_textÚpast_key_valuesÚcolwiseÚreplicated_with_grad_allreduceÚrowwiseÚpacked_colwiseÚmoe_tp_experts)zlayers.*.self_attn.q_projzlayers.*.self_attn.k_projzlayers.*.self_attn.v_projzlayers.*.self_attn.q_normzlayers.*.self_attn.k_normzlayers.*.self_attn.o_projúlayers.*.mlp.gate_projúlayers.*.mlp.up_projúlayers.*.mlp.down_projúlayers.*.experts.gate_up_projúlayers.*.experts.down_projúlayers.*.expertsÚ	ep_routerÚgrouped_gemm)rT   rU   rV   zlayers.*.routerrW   rX   rY   Ú	input_idsÚinputs_embedsÚhidden_statesÚattention_mask)Úembed_tokensÚlayersÚnormi   Ú
vocab_sizei 	  r   i $  Úintermediate_sizeé   r   r   r   é   Únum_key_value_headsé   Úhead_dimÚgelu_pytorch_tanhÚhidden_activationé   Úmax_position_embeddingsr+   r-   r$   r%   TÚ	use_cacher   NÚpad_token_idé   Úeos_token_idé   Úbos_token_idÚtie_word_embeddingsÚrope_parametersFÚattention_biasr(   Úattention_dropouti   Úsliding_windowÚlayer_typesÚfinal_logit_softcapping)ÚallÚvisionÚuse_bidirectional_attentionÚvocab_size_per_layer_inputÚhidden_size_per_layer_inputÚnum_global_key_value_headsÚglobal_head_dimÚattention_k_eq_vÚnum_kv_shared_layersÚenable_moe_blockÚuse_double_wide_mlpÚnum_expertsÚtop_k_expertsÚmoe_intermediate_sizec                 óü  >• U R                   S:X  a  U R                  S-  S-   U l        U R                  cE  Sn[        U R                  5       Vs/ s H  n[        US-   U-  5      (       a  SOSPM     snU l        U R                  (       a=  U R                  S   =nS:w  a(  [        R                  SU S	35        SU R                  S'   S
SS.SSSS.S.nU R                  c  XPl        [        TU ](  " S0 UD6  g s  snf )Nr{   rr   rp   é   Úsliding_attentionÚfull_attentionéÿÿÿÿz/Last layer must use `full_attention`, but got `z*`. Forcing last layer to `full_attention`.r,   g     ˆÃ@©Ú	rope_typeÚ
rope_thetaÚproportionalg      Ð?g    €„.A)r   Úpartial_rotary_factorr   )r‹   rŒ   r/   )r}   rx   ry   Úranger   rF   ÚloggerÚwarningru   r3   r4   )r6   r7   Úsliding_window_patternÚiÚlast_layer_typeÚdefault_rope_paramsr8   s         €r9   r4   ÚGemma4TextConfig.__post_init__À   s  ø€ Ø×+Ñ+¨uÓ4Ø#'×#6Ñ#6¸!Ñ#;¸qÑ"@ˆDÔà×ÑÑ#Ø%&Ð"ô ˜t×5Ñ5Ô6ó â6Aô (,¨Q°©UÐ6LÑ,L×'MÑ'MÑ#ÐScÒcÙ6ñ ˆDÔð
 ××°D×4DÑ4DÀRÑ4HÐ!H ÐM]Ó ]ÜN‰NØAÀ/ÐARÐR|Ð}ôð $4ˆD×Ñ˜RÑ ð 09ÈÑ!QØ,:ÐUYÐitÑuñf
Ðð ×ÑÑ'Ø#6Ô ä‰ÒÑ' Ó'ùò% s   Á$C9c                 ó   • U$ )Nr/   )r6   r7   s     r9   Úconvert_rope_params_to_dictÚ,Gemma4TextConfig.convert_rope_params_to_dictÚ   s   € àˆr;   )ry   ru   rx   )8r<   r=   r>   r?   r@   rA   Úkeys_to_ignore_at_inferenceÚbase_model_tp_planÚbase_model_ep_planÚbase_model_pp_planrc   rB   rC   r   rd   r   r   rg   ri   rk   rD   rm   r-   rE   r%   rn   rF   ro   rq   r2   rs   rt   ru   Údictrv   rw   rx   ry   rz   r}   r   r~   r   r€   r   r‚   rƒ   r„   r…   r†   r‡   rˆ   r4   rœ   rG   rH   rI   s   @r9   rK   rK   U   s[  ø‡ ñ#ðJ €JØ#4Ð"5Ðà%.Ø%.Ø%.Ø%EØ%EØ%.Ø"+Ø )Ø"+Ø)9Ø&/Ø,ñÐð  #,Ø )Ø"+Ø&Ø)7Ø&4Ø,ñ	Ðð &˜¨Ð(9Ð:Ø#Ð%5Ð6¸Ð8IÐJØ!Ð" _Ð$5Ð6ñÐð €JÓØ€KÓØ!ÐsÓ!ØÐsÓØ Ð˜Ó Ø Ð˜Ó Ø€HˆcÓØ0ÐsÓ0Ø#*Ð˜SÓ*Ø#ÐuÓ#Ø€L%ÓØ€IˆtÓØ €L#˜‘*Ó Ø+,€L#˜˜S™	‘/ DÑ(Ó,Ø €L#˜‘*Ó Ø $Ð˜Ó$Ø#'€OT˜D‘[Ó'Ø €NDÓ Ø,/Ðs˜U‘{ TÑ)Ó/Ø€NCÓØ$(€Kc‘˜TÑ!Ó(Ø,0Ð˜U T™\Ó0ØCGÐ ¨Ñ!9¸DÑ!@ÓGØ&-Ð Ó-Ø'*Ð Ó*Ø-1Ð  d¡
Ó1Ø€OSÓØ"ÐdÓ"Ø !Ð˜#Ó!Ø"ÐdÓ"Ø %Ð˜Ó%Ø"€Kt‘Ó"Ø $€M3˜‘:Ó$Ø(,Ð˜3 ™:Ó,õ(÷4ð r;   rK   c            
       ó\  ^ • \ rS rSr% SrSrSSSSSSSSSS.	rSrS	r\	\
S
'   Sr\	\
S'   Sr\	\
S'   Sr\	\
S'   Sr\	\
S'   Sr\	\
S'   Sr\\
S'   Sr\\
S'   Sr\	\
S'   Sr\S-  \
S'   Sr\S-  \
S'   Sr\S-  \
S'   S r\	\
S!'   Sr\	\
S"'   S#r\	\
S$'   Sr\\
S%'   Sr\\
S&'   S'r\\
S('   U 4S) jr S*r!U =r"$ )+ÚGemma4VisionConfigéß   a™  
pooling_kernel_size (`int`, *optional*):
    Spatial pooling kernel size applied after patchification.
position_embedding_size (`int`, defaults to 10240):
    Maximum number of position embeddings for the vision encoder. Controls the size of
    the learned 2D position embedding table used by the patch embedder.
use_clipped_linears (`bool`, defaults to `False`):
    Whether to use weight-clipped linear layers. When enabled, linear layer weights are
    clamped to a fixed range during the forward pass to improve numerical stability.
standardize (`bool`, defaults to `False`):
    If true, applies a bias and scale to the soft tokens returned from the pooler.
Úgemma4_visionrO   rP   rQ   )	z!encoder.layers.*.self_attn.q_projz!encoder.layers.*.self_attn.k_projz!encoder.layers.*.self_attn.v_projz!encoder.layers.*.self_attn.q_normz!encoder.layers.*.self_attn.k_normz!encoder.layers.*.self_attn.o_projzencoder.layers.*.mlp.gate_projzencoder.layers.*.mlp.up_projzencoder.layers.*.mlp.down_projç      Y@i   r   i   rd   é   r   r   r   rg   é@   ri   rj   rk   r$   r%   rl   rm   FNrv   r(   rw   ru   r   Úpooling_kernel_sizeÚ
patch_sizei (  Úposition_embedding_sizer#   Ústandardizer+   r-   c                 óT   >• U R                   c
  SSS.U l         [        TU ]  " S0 UD6  g )Nr,   r§   rŽ   r/   )ru   r3   r4   r5   s     €r9   r4   Ú Gemma4VisionConfig.__post_init__  s,   ø€ Ø×ÑÑ'Ø1:È%Ñ#PˆDÔ ä‰ÒÑ' Ó'r;   )ru   )#r<   r=   r>   r?   r@   rA   rŸ   Údefault_thetar   rB   rC   rd   r   r   rg   ri   rk   rD   r%   rE   rm   rv   rF   rw   ru   r¢   rª   r«   r¬   r#   r­   r-   r4   rG   rH   rI   s   @r9   r¤   r¤   ß   s  ø‡ ñð !€Jà-6Ø-6Ø-6Ø-MØ-MØ-6Ø*3Ø(1Ø*3ñ
Ðð €Mà€KÓØ!ÐsÓ!ØÐsÓØ!Ð˜Ó!Ø!Ð˜Ó!Ø€HˆcÓØ0ÐsÓ0Ø€L%ÓØ#*Ð˜SÓ*Ø"'€ND˜4‘KÓ'Ø&)Ðu˜t‘|Ó)Ø#'€OT˜D‘[Ó'Ø Ð˜Ó Ø€JÓØ#,Ð˜SÓ,Ø %Ð˜Ó%Ø€KÓØ#ÐuÓ#÷(ó (r;   r¤   c                   óX  ^ • \ rS rSr% SrSr\\\S.r	Sr
\\\\4   -  S-  \S'   Sr\\\\4   -  S-  \S'   Sr\\\\4   -  S-  \S'   S	r\S-  \S
'   Sr\S-  \S'   Sr\S-  \S'   Sr\S-  \S'   Sr\S-  \S'   Sr\S-  \S'   Sr\S-  \S'   Sr\S-  \S'   Sr\\S'   U 4S jrSrU =r$ )ÚGemma4Configi  aï  
boi_token_id (`int`, *optional*, defaults to 255999):
    The begin-of-image token index to wrap the image prompt.
eoi_token_id (`int`, *optional*, defaults to 258882):
    The end-of-image token index to wrap the image prompt.
boa_token_id (`int`, *optional*, defaults to 256000):
    The begin-of-audio token index to wrap the audio prompt.
eoa_token_index (`int`, *optional*, defaults to 258883):
    The end-of-audio token index to wrap the audio prompt.

Example:

```python
>>> from transformers import (
>>>     Gemma4AudioConfig,
>>>     Gemma4Config,
>>>     Gemma4ForConditionalGeneration,
>>>     Gemma4TextConfig,
>>>     Gemma4VisionConfig,
>>> )

>>> # Initializing a Gemma 4 Audio config.
>>> audio_config = Gemma4AudioConfig()

>>> # Initializing a Gemma 4 Text config.
>>> text_config = Gemma4TextConfig()

>>> # Initializing a Gemma 4 vision config.
>>> vision_config = Gemma4VisionConfig()

>>> # Initializing a Gemma 4 config similar to google/gemma-4-e2b-it
>>> configuration = Gemma4Config(text_config, vision_config, audio_config)

>>> # Initializing a model from the google/gemma-4-e2b-it configuration
>>> model = Gemma4ForConditionalGeneration(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
```Úgemma4)Útext_configÚvision_configÚaudio_configNr´   rµ   r¶   iÿç Úboi_token_idiBó Úeoi_token_idi@ó Úimage_token_idiDó Úvideo_token_idi è Úboa_token_idiCó Úeoa_token_indexiAó Úaudio_token_idr+   r-   Trt   c                 óh  >• U R                   c%  [        5       U l         [        R                  S5        O9[	        U R                   [
        5      (       a  [        S0 U R                   D6U l         U R                  c  [        R                  S5        [	        U R                  [
        5      (       a  [        S0 U R                  D6U l        U R                  c  [        R                  S5        [	        U R                  [
        5      (       a  [        S0 U R                  D6U l        [        TU ],  " S0 UD6  g )Nz4text_config is None. Using default Gemma4TextConfig.zHvision_config is None. Gemma4Model.vision_tower will not be initialized.zFaudio_config is None. Gemma4Model.audio_tower will not be initialized.r/   )r´   rK   r”   Úinfor0   r¢   rµ   r¤   r¶   r   r3   r4   r5   s     €r9   r4   ÚGemma4Config.__post_init__V  sä   ø€ Ø×ÑÑ#Ü/Ó1ˆDÔÜK‰KÐNÕOÜ˜×(Ñ(¬$×/Ñ/Ü/ÑC°$×2BÑ2BÑCˆDÔà×ÑÑ%ÜK‰KÐbÔcÜd×(Ñ(¬$×/Ñ/Ü!3Ñ!I°d×6HÑ6HÑ!IˆDÔà×ÑÑ$ÜK‰KÐ`ÔaÜd×'Ñ'¬×.Ñ.Ü 1Ñ F°D×4EÑ4EÑ FˆDÔä‰ÒÑ' Ó'r;   )r¶   r´   rµ   ) r<   r=   r>   r?   r@   rA   rK   r¤   r   Úsub_configsr´   r¢   rD   r   rC   rµ   r¶   r·   rB   r¸   r¹   rº   r»   r¼   r½   r-   rE   rt   rF   r4   rG   rH   rI   s   @r9   r²   r²     s   ø‡ ñ&ðP €Jà'Ø+Ø)ñ€Kð =A€KÐ! D¨¨c¨¡NÑ2°TÑ9Ó@Ø@D€MÐ%¨¨S°#¨X©Ñ6¸Ñ=ÓDØ>B€LÐ# d¨3°¨8¡nÑ4°tÑ;ÓBØ&€L#˜‘*Ó&Ø&€L#˜‘*Ó&Ø!(€NC˜$‘JÓ(Ø!(€NC˜$‘JÓ(Ø&€L#˜‘*Ó&Ø")€OS˜4‘ZÓ)Ø!(€NC˜$‘JÓ(Ø&*Ðu˜t‘|Ó*Ø $Ð˜Ó$÷(ó (r;   r²   )r   r²   rK   r¤   N)Útypingr   r   Úhuggingface_hub.dataclassesr   Úconfiguration_utilsr   Úutilsr   r	   Úutils.type_validatorsr
   Ú
get_loggerr<   r”   r   rK   r¤   r²   Ú__all__r/   r;   r9   Ú<module>rÉ      sâ   ð÷  å .å 3ß ,Ý -ð 
×	Ò	˜HÓ	%€ñ Ð2Ñ3Øô5(Ð(ó 5(ó ó 4ð5(ñp Ð2Ñ3ØôEÐ'ó Eó ó 4ðEñP Ð2Ñ3Øô3(Ð)ó 3(ó ó 4ð3(ñl Ð2Ñ3ØôN(Ð#ó N(ó ó 4ðN(òb Zr;   