
    Z j                     l    S SK Jr  SSKJr  SSKJr  SSKJr  \" SS9\ " S S	\5      5       5       rS	/r	g
)    )strict   )PreTrainedConfig)RopeParameters)auto_docstringzgoogle/vaultgemma-1b)
checkpointc                     ^  \ rS rSr% SrSrS/rSSSSSSSS.rS/S	/4S
S/S
/4S
/S
/4S.rSr	\
\S'   Sr\
\S'   Sr\
\S'   Sr\
\S'   Sr\
\S'   Sr\
\S'   Sr\
\S'   Sr\\S'   Sr\
\S'   Sr\\S '   S!r\\S"'   S#r\\S$'   S%r\
S&-  \S''   S(r\
\\
   -  S&-  \S)'   S*r\
S&-  \S+'   S#r\\S,'   S&r\ \!-  S&-  \S-'   S.r"\\S/'   S0r#\
\-  S&-  \S1'   Sr$\
\S2'   S3r%\
S&-  \S4'   S&r&\\   S&-  \S5'   S6r'\S&-  \S7'   S8r(\S&-  \S9'   U 4S: jr)S; r*S<r+U =r,$ )=VaultGemmaConfig   a  
query_pre_attn_scalar (`float`, *optional*, defaults to 256):
    scaling factor used on the attention scores
final_logit_softcapping (`float`, *optional*, defaults to 30.0):
    scaling factor when applying tanh softcapping on the logits.
attn_logit_softcapping (`float`, *optional*, defaults to 50.0):
    scaling factor when applying tanh softcapping on the attention scores.

```python
>>> from transformers import VaultGemmaModel, VaultGemmaConfig
>>> # Initializing a VaultGemma vaultgemma-7b style configuration
>>> configuration = VaultGemmaConfig()
>>> # Initializing a model from the vaultgemma-7b style configuration
>>> model = VaultGemmaModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```
vaultgemmapast_key_valuescolwiserowwise)zlayers.*.self_attn.q_projzlayers.*.self_attn.k_projzlayers.*.self_attn.v_projzlayers.*.self_attn.o_projzlayers.*.mlp.gate_projzlayers.*.mlp.up_projzlayers.*.mlp.down_proj	input_idsinputs_embedshidden_statesattention_mask)embed_tokenslayersnormi  
vocab_sizei 	  hidden_sizei $  intermediate_size   num_hidden_layers   num_attention_heads   num_key_value_heads   head_dimgelu_pytorch_tanhhidden_activationi    max_position_embeddingsg{Gz?initializer_rangegư>rms_norm_epsT	use_cacher   Npad_token_id   eos_token_id   bos_token_idtie_word_embeddingsrope_parametersFattention_biasg        attention_dropoutquery_pre_attn_scalari   sliding_windowlayer_typesg      >@final_logit_softcappingg      I@attn_logit_softcappingc                    > U R                   cC  [        U R                  5       Vs/ s H  n[        US-   S-  5      (       a  SOSPM     snU l         [        TU ]  " S0 UD6  g s  snf )Nr)   r+   sliding_attentionfull_attention )r3   ranger   boolsuper__post_init__)selfkwargsi	__class__s      ڈ/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/vaultgemma/configuration_vaultgemma.pyr=   VaultGemmaConfig.__post_init__\   si    #X]^b^t^tXu XuSTtQUaK'8'8#>NNXu D 	''	 s   $A#c                     U R                   U R                  -  S:w  a&  [        SU R                    SU R                   S35      eg)zOPart of `@strict`-powered validation. Validates the architecture of the config.r   zThe hidden size (z6) is not a multiple of the number of attention heads (z).N)r   r   
ValueError)r>   s    rB   validate_architecture&VaultGemmaConfig.validate_architectured   sS    d666!;#D$4$4#5 622327  <    )r3   )-__name__
__module____qualname____firstlineno____doc__
model_typekeys_to_ignore_at_inferencebase_model_tp_planbase_model_pp_planr   int__annotations__r   r   r   r   r   r!   r#   strr$   r%   floatr&   r'   r;   r(   r*   listr,   r-   r.   r   dictr/   r0   r1   r2   r3   r4   r5   r=   rF   __static_attributes____classcell__)rA   s   @rB   r
   r
      s   $ J#4"5%.%.%.%."+ )"+ &(9:#%568IJ!"_$56 JK!s!s    Hc0s0#'S'#u#L%It L#* +,L#S	/D(, L#*  $$48O^d*T18 ND ,/sU{T)/!$3$!%NC$J%$(KcT!(,0UT\0+/EDL/( rH   r
   N)
huggingface_hub.dataclassesr   configuration_utilsr   modeling_rope_utilsr   utilsr   r
   __all__r9   rH   rB   <module>r_      sK   , / 3 1 # 12K' K  3K\ 
rH   