
from collections.abc import Sequence
from typing import Any

from huggingface_hub.dataclasses import strict

from ...configuration_utils import PreTrainedConfig
from ...utils import auto_docstring, is_timm_available, logging, requires_backends


if is_timm_available():
    from timm.data import ImageNetInfo, infer_imagenet_subset


logger = logging.get_logger(__name__)


@strict(accept_kwargs=True)
@auto_docstring(checkpoint="google/gemma-3n-E4B")
class Gemma3nTextConfig(PreTrainedConfig):
    r"""
vocab_size_per_layer_input (`int`, *optional*, defaults to 262144):
    Vocabulary size of the per-layer text embeddings that augment the standard embeddings.
hidden_size_per_layer_input (`int`, *optional*, defaults to 256):
    Dimension of the hidden representations for per-layer embeddings.
altup_active_idx (`int`, *optional*, defaults to 0):
    The index of the prediction from which AltUp will compute additional predictions or correct the active prediction.
altup_coef_clip (`float`, *optional*, defaults to 120.0):
    The maximum amplitude of an AltUp prediction or correction coefficient weight.
altup_correct_scale (`bool`, *optional*, defaults to `True`):
    If True, apply the `AltUp.correct_output_scale` to the corrected prediction at `altup_active_idx`.
altup_num_inputs (`int`, *optional*, defaults to 4):
    The number of predictions that AltUp should make given the input sequence.
num_kv_shared_layers (`int`, *optional*, defaults to 15):
    The number of layers that share KV cache values. During the forward pass, the last `num_kv_shared_layers`
    layers in the model "share" the KV values in that each local and global layer in this range uses the KV
    cache values computed for the last local or global layer, respectively, before entering this range. The
    value should be a multiple of the attention pattern size (see `layer_types` parameter).
laurel_rank (`int`, *optional*, defaults to 64):
    The intermediate size for the linear projections in the Learned Augmented Residual Layer.
activation_sparsity_pattern (`Sequence[float]`, *optional*):
    The sparsity factor used to extract the top-k activations for a given layer. The provided Sequence must
    explicitly provide a sparsity value for each layer in the model. By default, the first 10 layers are
    sparse with a sparsity factor of 0.95 and the rest are dense.

```python
>>> from transformers import Gemma3nTextModel, Gemma3nTextConfig

>>> # Initializing a Gemma3nText gemma3n_text-E4B style configuration
>>> configuration = Gemma3nTextConfig()

>>> # Initializing a model from the gemma3n_text-E4B style configuration
>>> model = Gemma3nTextModel(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
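
>>> # Hypothetical override of two of the documented arguments above; the values are
>>> # illustrative only and do not correspond to an official checkpoint.
>>> custom_configuration = Gemma3nTextConfig(laurel_rank=128, num_kv_shared_layers=10)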
```
    """

    model_type = "gemma3n_text"
    keys_to_ignore_at_inference = ["past_key_values"]
    base_model_tp_plan = {
        "layers.*.self_attn.q_proj": "colwise",
        "layers.*.self_attn.k_proj": "colwise",
        "layers.*.self_attn.v_proj": "colwise",
        "layers.*.self_attn.q_norm": "replicated_with_grad_allreduce",
        "layers.*.self_attn.k_norm": "replicated_with_grad_allreduce",
        "layers.*.self_attn.v_norm": "replicated_with_grad_allreduce",
        "layers.*.self_attn.o_proj": "rowwise",
        "layers.*.mlp.gate_proj": "colwise",
        "layers.*.mlp.up_proj": "colwise",
        "layers.*.mlp.down_proj": "rowwise",
    }
    base_model_pp_plan = {
        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
        "norm": (["hidden_states"], ["hidden_states"]),
    }

    vocab_size: int = 262_400
    hidden_size: int = 2048
    intermediate_size: int | list[int] = 16_384
    num_hidden_layers: int = 35
    num_attention_heads: int = 8
    num_key_value_heads: int = 2
    head_dim: int = 256
    hidden_activation: str = "gelu_pytorch_tanh"
    max_position_embeddings: int = 32_768
    initializer_range: float = 0.02
    rms_norm_eps: float = 1e-06
    use_cache: bool = True
    pad_token_id: int | None = 0
    eos_token_id: int | list[int] | None = 1
    bos_token_id: int | None = 2
    tie_word_embeddings: bool = True
    rope_parameters: dict | None = None
    attention_bias: bool = False
    attention_dropout: float = 0.0
    sliding_window: int = 512
    layer_types: list[str] | None = None
    final_logit_softcapping: float = 30.0
    default_theta = {"full_attention": 1_000_000.0, "sliding_attention": 10_000.0}
    vocab_size_per_layer_input: int = 262_144
    hidden_size_per_layer_input: int = 256
    altup_active_idx: int = 0
    altup_coef_clip: float = 120.0
    altup_correct_scale: bool = True
    altup_num_inputs: int = 4
    num_kv_shared_layers: int = 15
    laurel_rank: int = 64
    activation_sparsity_pattern: float | list[float] | None = None

    def __post_init__(self, **kwargs):
        if isinstance(self.intermediate_size, Sequence) and (intsize_len := len(self.intermediate_size)) != self.num_hidden_layers:
            raise ValueError(
                "intermediate_size must have an explicit intermediate size for every layer or one for all layers. "
                f"Expected {self.num_hidden_layers} values but got {intsize_len}."
            )
        if not isinstance(self.intermediate_size, Sequence):
            self.intermediate_size = [self.intermediate_size] * self.num_hidden_layers

        if self.layer_types is None:
            # Every fifth layer uses full (global) attention, the rest use sliding (local) attention.
            self.layer_types = [
                "full_attention" if (i + 1) % 5 == 0 else "sliding_attention" for i in range(self.num_hidden_layers)
            ]

        if self.activation_sparsity_pattern is None:
            # By default the first 10 layers are sparse with a factor of 0.95 and the remaining layers are dense.
            num_sparse_layers = 10 if self.num_hidden_layers > 10 else 0
            self.activation_sparsity_pattern = [0.95] * num_sparse_layers + [0.0] * (
                self.num_hidden_layers - num_sparse_layers
            )

        if (len_asp := len(self.activation_sparsity_pattern)) != self.num_hidden_layers:
            raise ValueError(
                "activation_sparsity_pattern must have an explicit activation sparsity value for every layer."
                f"Expected {self.num_hidden_layers} values but got {len_asp}."
            )
        super().__post_init__(**kwargs)

    def validate_architecture(self):
        """Part of `@strict`-powered validation. Validates the architecture of the config."""
        if self.hidden_size % self.num_attention_heads != 0:
            raise ValueError(
                f"The hidden size ({self.hidden_size}) is not a multiple of the number of attention heads "
                f"({self.num_attention_heads})."
            )

    def convert_rope_params_to_dict(self, **kwargs):
        rope_scaling = kwargs.pop("rope_scaling", None)
        default_rope_params = {
            "full_attention": {"rope_type": "default"},
            "sliding_attention": {"rope_type": "default"},
        }
        self.rope_parameters = self.rope_parameters if self.rope_parameters is not None else default_rope_params
        if rope_scaling is not None:
            self.rope_parameters["full_attention"].update(rope_scaling)

        if self.rope_parameters.get("full_attention") is None:
            self.rope_parameters["full_attention"] = {"rope_type": "default"}
        self.rope_parameters["full_attention"].setdefault(
            "rope_theta", kwargs.get("rope_theta", self.default_theta["full_attention"])
        )
        if self.rope_parameters.get("sliding_attention") is None:
            self.rope_parameters["sliding_attention"] = {"rope_type": "default"}
        self.rope_parameters["sliding_attention"].setdefault(
            "rope_theta", kwargs.get("rope_local_base_freq", self.default_theta["sliding_attention"])
        )

        self.standardize_rope_params()
        return kwargs


@strict(accept_kwargs=True)
@auto_docstring
class Gemma3nAudioConfig(PreTrainedConfig):
    r"""
vocab_offset (`int`, *optional*, defaults to 262272):
    Offset between the tokenizer vocab index for the token ids embedded by `Gemma3nMultimodalEmbedder` and the
    0-indexed `Gemma3nMultimodalEmbedder.embedding` table.
input_feat_size (`int`, *optional*, defaults to 128):
    The number of channels in each mel-spectrogram frame.
gradient_clipping (`float`, *optional*, defaults to 10000000000.0):
    Clipping value used to stabilize extremely large gradient values.
conf_attention_chunk_size (`int`, *optional*, defaults to 12):
    The sub-sequence size for local attention processing inside the Conformer ("conf") section of the
    Universal Speech Model.
conf_attention_context_left (`int`, *optional*, defaults to 13):
    The left context size of the local attention inside the Conformer ("conf") section of the
    Universal Speech Model.
conf_attention_context_right (`int`, *optional*, defaults to 0):
    The right context size of the local attention inside the Conformer ("conf") section of the
    Universal Speech Model.
conf_attention_logit_cap (`float`, *optional*, defaults to 50.0):
    Logit cap applied during local attention inside the Conformer ("conf") section of the
    Universal Speech Model.
conf_num_attention_heads (`int`, *optional*, defaults to 8):
    The number of attention heads in local attention inside the Conformer ("conf") section of the
    Universal Speech Model.
conf_num_hidden_layers (`int`, *optional*, defaults to 12):
    The number of layers that use local attention inside the Conformer ("conf") section of the
    Universal Speech Model.
conf_conv_kernel_size (`int`, *optional*, defaults to 5):
    Convolution kernel size for the conformer block inside the Conformer ("conf") section of the
    Universal Speech Model.
conf_reduction_factor (`int`, *optional*, defaults to 4):
    Reduction factor used in the conformer block inside the Conformer ("conf") section of the
    Universal Speech Model.
conf_residual_weight (`float`, *optional*, defaults to 0.5):
    Residual connection weight inside the Conformer ("conf") section of the
    Universal Speech Model.
sscp_conv_channel_size (`tuple(int, int)`, *optional*, defaults to `(128, 32)`):
    The channel sizes for the first and second convolutional layers in the Sub-sample Convolution Projection
    ("sscp") section of the Universal Speech Model.
sscp_conv_group_norm_eps (`float`, *optional*, defaults to 0.001):
    Epsilon used in group normalization in the subsample convolution projection in the Sub-sample Convolution
    Projection ("sscp") section of the Universal Speech Model.
sscp_conv_kernel_size (`tuple(tuple(int, int), tuple(int, int))`, *optional*, defaults to `((3, 3), (3, 3))`):
    Kernel sizes of the two convolutional layers in the subsample convolution projection  in the Sub-sample
    Convolution Projection ("sscp") section of the Universal Speech Model. The kernel sizes are specified as a
    tuple of height and width for each layer, where the height corresponds to the time dimension and the width
    corresponds to the frequency dimension.
sscp_conv_stride_size (`tuple(tuple(int, int), tuple(int, int))`, *optional*, defaults to `((2, 2), (2, 2))`):
    Stride sizes of the two convolutional layers in the subsample convolution projection in the Sub-sample
    Convolution Projection ("sscp") section of the Universal Speech Model. The stride sizes are specified as a
    tuple of height and width for each layer, where the height corresponds to the time dimension and the width
    corresponds to the frequency dimension.

Example:

```python
>>> from transformers import Gemma3nAudioConfig, Gemma3nAudioEncoder

>>> # Initializing a Gemma3nAudioEncoder gemma3n_audio-E4B-style configuration
>>> configuration = Gemma3nAudioConfig()

>>> # Initializing a model from the gemma3n_audio-E4B style configuration
>>> model = Gemma3nAudioEncoder(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
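
>>> # Hypothetical smaller variant built from the documented arguments above
>>> # (illustrative values only, not an official checkpoint)
>>> small_configuration = Gemma3nAudioConfig(conf_num_hidden_layers=6, conf_num_attention_heads=4)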
```
    """

    model_type = "gemma3n_audio"

    vocab_offset: int = 262_272
    input_feat_size: int = 128
    hidden_size: int = 1536
    rms_norm_eps: float = 1e-06
    gradient_clipping: float = 10_000_000_000.0
    conf_attention_chunk_size: int = 12
    conf_attention_context_left: int = 13
    conf_attention_context_right: int = 0
    conf_attention_logit_cap: float = 50.0
    conf_num_attention_heads: int = 8
    conf_num_hidden_layers: int = 12
    conf_conv_kernel_size: int = 5
    conf_reduction_factor: int = 4
    conf_residual_weight: float = 0.5
    sscp_conv_channel_size: tuple[int, int] = (128, 32)
    sscp_conv_group_norm_eps: float = 0.001
    sscp_conv_kernel_size: tuple[tuple[int, int], tuple[int, int]] = ((3, 3), (3, 3))
    sscp_conv_stride_size: tuple[tuple[int, int], tuple[int, int]] = ((2, 2), (2, 2))


@strict(accept_kwargs=True)
@auto_docstring
class Gemma3nVisionConfig(PreTrainedConfig):
    r"""
architecture (`str`, *optional*, defaults to `"mobilenetv5_300m_enc"`):
    The timm architecture to load.
do_pooling (`bool`, *optional*, defaults to `True`):
    Whether to do pooling for the last_hidden_state in `TimmWrapperModel` or not.
model_args (`dict[str, Any]`, *optional*):
    Additional keyword arguments to pass to the `timm.create_model` function. e.g. `model_args={"depth": 3}`
    for `timm/vit_base_patch32_clip_448.laion2b_ft_in12k_in1k` to create a model with 3 blocks. Defaults to `None`.
vocab_offset (`int`, *optional*, defaults to 262144):
    Offset between the tokenizer vocab index for the token ids embedded by `Gemma3nMultimodalEmbedder` and the
    0-indexed `Gemma3nMultimodalEmbedder.embedding` table.

Example:
```python
>>> from transformers import Gemma3nVisionConfig, TimmWrapper

>>> # Initializing a TimmWrapper gemma3n_vision-E4B-style configuration
>>> configuration = Gemma3nVisionConfig()

>>> # Initializing a gemma3n_vision-E4B-style TimmWrapper from the configuration
>>> model = TimmWrapper(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
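
>>> # Forwarding extra timm arguments through `model_args` (illustrative; whether a
>>> # given key such as `drop_path_rate` is accepted depends on the timm architecture)
>>> configuration_with_args = Gemma3nVisionConfig(model_args={"drop_path_rate": 0.1})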
```
    """

    model_type = "gemma3n_vision"

    architecture: str = "mobilenetv5_300m_enc"
    initializer_range: float = 0.02
    do_pooling: bool = True
    model_args: dict[str, Any] | None = None
    hidden_size: int = 2048
    vocab_size: int = 128
    vocab_offset: int = 262_144
    rms_norm_eps: float = 1e-06

    @classmethod
    def from_dict(cls, config_dict: dict[str, Any], **kwargs):
        label_names = config_dict.get("label_names")
        is_custom_model = "num_labels" in kwargs or "num_labels" in config_dict

        # If no labels were provided in the config, fall back to the timm ImageNet labeller.
        if label_names is None and not is_custom_model:
            requires_backends(cls, ["timm"])
            imagenet_subset = infer_imagenet_subset(config_dict)
            if imagenet_subset:
                dataset_info = ImageNetInfo(imagenet_subset)
                synsets = dataset_info.label_names()
                label_descriptions = dataset_info.label_descriptions(as_dict=True)
                label_names = [label_descriptions[synset] for synset in synsets]

        if label_names is not None and not is_custom_model:
            kwargs["id2label"] = dict(enumerate(label_names))
            # Only build the reverse mapping when the label names are unique.
            if len(set(label_names)) == len(label_names):
                kwargs["label2id"] = {name: i for i, name in enumerate(label_names)}
            else:
                kwargs["label2id"] = None

        # timm stores `num_classes` both at the root of the config and inside `pretrained_cfg`;
        # keep only the native `num_labels` attribute to avoid duplicated attributes.
        num_labels_in_kwargs = kwargs.pop("num_labels", None)
        num_labels_in_dict = config_dict.pop("num_classes", None)
        kwargs["num_labels"] = num_labels_in_kwargs or num_labels_in_dict

        if "pretrained_cfg" in config_dict and "num_classes" in config_dict["pretrained_cfg"]:
            config_dict["pretrained_cfg"].pop("num_classes", None)

        return super().from_dict(config_dict, **kwargs)

    def to_dict(self) -> dict[str, Any]:
        output = super().to_dict()
        output.setdefault("num_classes", self.num_labels)
        output.setdefault("label_names", list(self.id2label.values()))
        output.pop("id2label", None)
        output.pop("label2id", None)
        return output


@strict(accept_kwargs=True)
@auto_docstring
class Gemma3nConfig(PreTrainedConfig):
    r"""
audio_soft_tokens_per_image (`int`, *optional*, defaults to 188):
    The number of soft tokens per audio clip.
vision_soft_tokens_per_image (`int`, *optional*, defaults to 256):
    The number of soft tokens per image.
boi_token_id (`int`, *optional*, defaults to 255999):
    The begin-of-image token index to wrap the image prompt.
eoi_token_id (`int`, *optional*, defaults to 262144):
    The end-of-image token index to wrap the image prompt.
boa_token_id (`int`, *optional*, defaults to 256000):
    The begin-of-audio token index to wrap the audio prompt.
eoa_token_id (`int`, *optional*, defaults to 262272):
    The end-of-audio token index to wrap the audio prompt.

Example:

```python
>>> from transformers import Gemma3nForConditionalGeneration, Gemma3nConfig, Gemma3nTextConfig

>>> # Initializing a MobileNet vision config, which is loaded from TIMM
>>> vision_config = Gemma3nVisionConfig()

>>> # Initializing a Gemma3n Audio config
>>> audio_config = Gemma3nAudioConfig()

>>> # Initializing a Gemma3n Text config
>>> text_config = Gemma3nTextConfig()

>>> # Initializing a Gemma3n gemma-3-4b style configuration
>>> configuration = Gemma3nConfig(text_config, vision_config, audio_config)

>>> # Initializing a model from the gemma-3-4b style configuration
>>> model = Gemma3nForConditionalGeneration(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
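
>>> # The sub-configs may also be passed as plain dicts (illustrative values)
>>> configuration = Gemma3nConfig(text_config={"num_hidden_layers": 30})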
```
    """

    model_type = "gemma3n"
    sub_configs = {
        "text_config": Gemma3nTextConfig,
        "vision_config": Gemma3nVisionConfig,
        "audio_config": Gemma3nAudioConfig,
    }

    text_config: Gemma3nTextConfig | dict[str, Any] | None = None
    vision_config: Gemma3nVisionConfig | dict[str, Any] | None = None
    audio_config: Gemma3nAudioConfig | dict[str, Any] | None = None
    audio_soft_tokens_per_image: int | None = 188
    vision_soft_tokens_per_image: int | None = 256
    boi_token_id: int | None = 255_999
    eoi_token_id: int | None = 262_144
    image_token_id: int | None = 262_145
    boa_token_id: int | None = 256_000
    eoa_token_id: int | None = 262_272
    audio_token_id: int | None = 262_273
    initializer_range: float | None = 0.02

    def __post_init__(self, **kwargs):
        if self.text_config is None:
            self.text_config = Gemma3nTextConfig()
            logger.info("text_config is None, using default Gemma3nTextConfig text config.")
        elif isinstance(self.text_config, dict):
            self.text_config = Gemma3nTextConfig(**self.text_config)

        if isinstance(self.vision_config, dict):
            self.vision_config = Gemma3nVisionConfig(**self.vision_config)
        elif self.vision_config is None:
            self.vision_config = Gemma3nVisionConfig()
            logger.info("vision_config is None, using default Gemma3nVisionConfig vision config.")

        if isinstance(self.audio_config, dict):
            self.audio_config = Gemma3nAudioConfig(**self.audio_config)
        elif self.audio_config is None:
            self.audio_config = Gemma3nAudioConfig()
            logger.info("audio_config is None. Using default Gemma3nAudioConfig.")

        super().__post_init__(**kwargs)


__all__ = ["Gemma3nAudioConfig", "Gemma3nConfig", "Gemma3nTextConfig", "Gemma3nVisionConfig"]