
    Z j                     p    S r SSKJr  SSKJr  SSKJr  SSKJr  \" SS9\ " S	 S
\5      5       5       r	S
/r
g)zMistral4 model configuration    )strict   )PreTrainedConfig)RopeParameters)auto_docstringz#mistralai/Mistral-Small-4-119B-2603)
checkpointc            
         ^  \ rS rSr% SrSrS/rSSSSSSSSSS	.	rS
/S/4SS/S/4S/S/4S.rSS0r	Sr
\\S'   Sr\\S'   Sr\\S'   Sr\\S'   Sr\\S'   Sr\\S'   Sr\S-  \S'   Sr\\S '   S!r\\S'   S"r\\S#'   S$r\\S%'   S&r\S-  \S''   S(r\\S)'   S!r\S-  \S*'   S(r\\S+'   Sr\S-  \S,'   Sr\S-  \S-'   S.r\S-  \S/'   S0r\S-  \S1'   S2r \!S-  \S3'   S4r"\#\S5'   S6r$\\S7'   S8r%\\S9'   S:r&\\S;'   S2r'\!\S<'   S=r(\S-  \S>'   Sr)\S-  \S?'   S@r*\\+\   -  S-  \SA'   Sr,\S-  \SB'   SCr-\!\SD'   Sr.\/\0-  S-  \SE'   S2r1\!S-  \SF'   SCr2\!\SG'   SHr3\\-  S-  \SI'   U 4SJ jr4SKr5U =r6$ )LMistral4Config   a  
n_group (`int`, *optional*, defaults to 1):
    Number of groups for routed experts.
first_k_dense_replace (`int`, *optional*, defaults to 0):
    Number of dense layers in shallow layers(embed->dense->dense->...->dense->moe->moe...->lm_head).
                                                    \--k dense layers--/
rope_interleave (`bool`, *optional*, defaults to `True`):
    Whether to interleave the rotary position embeddings.

Example:

```python
>>> from transformers import Mistral4Model, Mistral4Config

>>> # Initializing a Mistral4 style configuration
>>> configuration = Mistral4Config()

>>> # Accessing the model configuration
>>> configuration = model.config
```mistral4past_key_valuespacked_colwiserowwisemoe_tp_expertscolwise)	z!layers.*.mlp.experts.gate_up_projzlayers.*.mlp.experts.down_projzlayers.*.mlp.expertsz%layers.*.mlp.shared_experts.gate_projz#layers.*.mlp.shared_experts.up_projz%layers.*.mlp.shared_experts.down_projzlayers.*.mlp.gate_projzlayers.*.mlp.up_projzlayers.*.mlp.down_proj	input_idsinputs_embedshidden_statesattention_mask)embed_tokenslayersnormnum_local_expertsn_routed_expertsi   
vocab_sizei   hidden_sizei 0  intermediate_sizei   moe_intermediate_size$   num_hidden_layers    num_attention_headsNnum_key_value_heads   n_shared_experts         ?routed_scaling_factor   kv_lora_ranki   q_lora_rank@   qk_rope_head_dim
v_head_dimqk_nope_head_dimn_group
topk_group   num_experts_per_tokr   first_k_dense_replaceTnorm_topk_probsilu
hidden_acti   max_position_embeddingsg{Gz?initializer_rangegư>rms_norm_eps	use_cache   pad_token_idbos_token_id   eos_token_idpretraining_tpFtie_word_embeddingsrope_parametersrope_interleaveattention_biasg        attention_dropoutc                   > U R                   cA  SSSSU R                  SSSSSU R                  U R                  U R                  -   -  S.U l         U R                  c  U R
                  U l        U R                  U R                  -   U l        U R                  U R                  -   U l        U R                   R                  S	U R                  U R                  -  5        [        TU ](  " SS
SS10UD6  g )Nyarng     @g      `@i    g      @@r'   g?)type
rope_thetafactor original_max_position_embeddingsr8   	beta_fast	beta_slowmscale_all_dimmscalellama_4_scaling_betapartial_rotary_factorrR   ignore_keys_at_rope_validationrQ   r8    )rC   r8   r-   r/   r#   r"   qk_head_dimhead_dim
setdefaultsuper__post_init__)selfkwargs	__class__s     ڄ/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/mistral4/configuration_mistral4.pyrY   Mistral4Config.__post_init__h   s    '%48+/+G+G! "%(+)-)>)>$BWBWZ^ZoZoBo)p$D  ##+'+'?'?D$0043H3HH--0E0EE''(?AVAVY]YfYfAfg 	
,BD]+^	
bh	
    )rV   r#   rU   rC   )7__name__
__module____qualname____firstlineno____doc__
model_typekeys_to_ignore_at_inferencebase_model_tp_planbase_model_pp_planattribute_mapr   int__annotations__r   r   r   r    r"   r#   r%   r   r(   floatr*   r+   r-   r.   r/   r0   r1   r3   r4   r5   boolr7   strr8   r9   r:   r;   r=   r>   r@   listrA   rB   rC   r   dictrD   rE   rF   rY   __static_attributes____classcell__)r\   s   @r]   r
   r
      s@   * J#4"5-=*3 01:/81:"+ )"+
 &(9:#%568IJ!"_$56 	/M JK"s"!%3%s!!&(t(cc#&5&L#"Kt"c Jd
 cGS4ZJd
&'t'()3:)"&ND4K&J#*S*#u#L%It!L#*! L#* +,L#S	/D(,!"NC$J" %%48O^d*T18#'OTD[' ND ,/us{T)/
 
r_   r
   N)rd   huggingface_hub.dataclassesr   configuration_utilsr   modeling_rope_utilsr   utilsr   r
   __all__rT   r_   r]   <module>rx      sO    # . 3 1 # @Ag
% g
  Bg
T 
r_   