
    Z j*                     `   S SK Jr  S SKrS SKJs  Jr  S SKJr  SSKJr	  SSK
Jr  SSKJr  SSKJrJr  SS	KJrJr  SS
KJr  SSKJr  SSKJr  SSKJrJrJrJrJr  SSK J!r!J"r"J#r#J$r$J%r%J&r&  SSK'J(r(  SSK)J*r*  SSK+J,r,  \RZ                  " \.5      r/ " S S\#5      r0 " S S\$5      r1 " S S\*5      r2 " S S\Rf                  5      r4 " S S\5      r5 " S S\5      r6 " S  S!\5      r7 " S" S#\5      r8 " S$ S%\5      r9 " S& S'\"5      r: " S( S)\!5      r; " S* S+\\95      r< " S, S-\\95      r=/ S.Qr>g)/    )CallableN)nn   )initialization)Cache)FlashAttentionKwargs) GenericForSequenceClassificationGenericForTokenClassification)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)logging)is_flash_attention_requested   )DeepseekV3AttentionDeepseekV3DecoderLayerDeepseekV3MoEDeepseekV3NaiveMoeapply_rotary_pos_emb_interleave)LlamaForCausalLM
LlamaModelLlamaRMSNormLlamaRotaryEmbeddingapply_rotary_pos_embeager_attention_forward)get_llama_4_attn_scale)Qwen2MoeMLP   )Mistral4Configc                       \ rS rSrSrg)Mistral4RMSNorm3    N__name__
__module____qualname____firstlineno____static_attributes__r#       ~/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/mistral4/modular_mistral4.pyr!   r!   3       r*   r!   c                       \ rS rSrSrg)Mistral4RotaryEmbedding7   r#   Nr$   r#   r*   r+   r.   r.   7   r,   r*   r.   c                       \ rS rSrSrg)Mistral4MLP;   r#   Nr$   r#   r*   r+   r1   r1   ;   r,   r*   r1   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )Mistral4TopkRouter?   c                    > [         TU ]  5         Xl        UR                  U l        [        R
                  " [        R                  " U R                  UR                  45      5      U l	        g N)
super__init__confign_routed_expertsr   	Parametertorchemptyhidden_sizeweight)selfr:   	__class__s     r+   r9   Mistral4TopkRouter.__init__@   sK     & 7 7ll5;;0E0EvGYGY/Z#[\r*   c                     UR                  SU R                  R                  5      n[        R                  " XR
                  5      nU$ )N)viewr:   r?   Flinearr@   )rA   hidden_statesrouter_logitss      r+   forwardMistral4TopkRouter.forwardG   s6    %**2t{{/F/FG<r*   )r:   r;   r@   )r%   r&   r'   r(   r9   rK   r)   __classcell__rB   s   @r+   r4   r4   ?   s    ] r*   r4   c                       \ rS rSrSrg)Mistral4NaiveMoeM   r#   Nr$   r#   r*   r+   rP   rP   M   r,   r*   rP   c                   l    \ rS rSrS\R
                  S\\R
                  \R
                  4   4S jrSrg)Mistral4MoEQ   rJ   returnc                 8   UR                  S5      nUR                  SU R                  U R                  U R                  -  5      R	                  SSS9S   R                  SS9n[        R                  " X R                  SSS9S   n[        R                  " U5      nUR                  SUS5        UR                  S5      R                  SU R                  U R                  U R                  -  5      R                  SU R                  5      nUR                  UR                  5       ) S5      n[        R                  " X`R                  SSS9S   nUR!                  SU5      nU R"                  (       a  UR                  SS	S
9S-   n	X-  nXR$                  -  nXx4$ )NrE   r   dimr   F)krX   sortedr           T)rX   keepdimg#B;)softmaxrF   n_groupr;   topksumr=   
topk_group
zeros_likescatter_	unsqueezeexpandreshapemasked_fillbooltop_kgathernorm_topk_probrouted_scaling_factor)
rA   rJ   group_scores	group_idx
group_mask
score_maskscores_for_choicetopk_indicestopk_weightsdenominators
             r+   route_tokens_to_experts#Mistral4MoE.route_tokens_to_expertsR   sz   %--b1r4<<1F1F$,,1VW\\]^df\ghijnnsunv 	 JJ|BuUVWX	%%l3
Ay!,  $VBd&;&;t||&KLWR../ 	
 *55z7H6H#Nzz"3zzrRWXYZ[$++A|<&**r4*@5HK'L#&@&@@))r*   r#   N)	r%   r&   r'   r(   r=   Tensortupleru   r)   r#   r*   r+   rS   rS   Q   s.    *U\\ *eELLZ_ZfZfLfFg *r*   rS   c                   ,   \ rS rSrS\S\4S jr SS\R                  S\	\R                  \R                  4   S\R                  S-  S	\R                  S
\
S-  S\\   S\	\R                  \R                  S-  \	\R                     S-  4   4S jjrSrg)Mistral4Attentioni   r:   	layer_idxc                 l   [         R                  R                  U 5        Xl        X l        UR
                  UR                  -  U l        UR                  U l        UR
                  U l	        UR                  U l
        UR                  U l        UR                  U l        UR                  U l        UR                  U l        UR                  U l        SU l        U R                  c=  [         R"                  " UR$                  U R                  U R                  -  SS9U l        O[         R"                  " UR$                  UR                  UR(                  S9U l        [-        UR                  5      U l        [         R"                  " UR                  U R                  U R                  -  SS9U l        [         R"                  " UR$                  U R                  U R                  -   UR(                  S9U l        [-        U R                  5      U l        [         R"                  " U R                  U R                  U R                  U R                  -   -  SS9U l        [         R"                  " U R                  U R                  -  UR$                  UR(                  S9U l        U R                  S-  U l        g )NTF)biasg      )r   Moduler9   r:   r|   num_attention_headsnum_key_value_headsnum_key_value_groupsattention_dropout	num_headsq_lora_rankqk_rope_head_dimkv_lora_rank
v_head_dimqk_nope_head_dimqk_head_dim	is_causalLinearr?   q_projattention_biasq_a_projr!   q_a_layernormq_b_projkv_a_proj_with_mqakv_a_layernorm	kv_b_projo_projscalingrA   r:   r|   s      r+   r9   Mistral4Attention.__init__j   s   
		4 "$*$>$>&B\B\$\!!'!9!933!-- & 7 7"// ++ & 7 7!--#))F$6$6IYIY8Y`efDKIIf&8&8&:L:LSYShShiDM!01C1C!DDIIf&8&8$..4K[K[:[bghDM"$)) 5 55&&#

 .d.?.?@NNd33dooEF
 iiNNT__,&&
 ''D1r*   NrI   position_embeddingsattention_maskposition_idspast_key_valueskwargsrU   c                 ,   UR                   S S u  pxXxSU R                  4n	XxSU R                  U R                  -   4n
U R                  c  U R                  U5      nO/U R                  U R                  U R                  U5      5      5      nUR                  U	5      R                  SS5      n[        R                  " XR                  U R                  /SS9u  pU R                  U5      n[        R                  " XR                  U R                  /SS9u  nnU R!                  U R#                  U5      5      R                  U
5      R                  SS5      n[        R                  " XR                  U R                  /SS9u  nnUR                  USXR                  5      nUu  nnU R$                  R&                  (       a  [)        UUUU5      u  nnO[+        UUUU5      u  nnUR,                  " / UR                   S S QSP76 n[        R.                  " X4SS9n[        R.                  " UU4SS9nU[1        UU R$                  R2                  R5                  S5      U R$                  R2                  R5                  S5      5      R7                  UR8                  5      -  nUb   UR;                  UUU R<                  5      u  nn[?        U R$                  5      (       aJ  U R                  U R                  :w  a0  [@        RB                  " USU R                  U R                  -
  /5      n[D        RF                  " U R$                  RH                  [J        5      nU" U UUUU4U RL                  (       d  SOU RN                  U RP                  S	.UD6u  nn[?        U R$                  5      (       a5  U R                  U R                  :w  a  US S 2S S 2S S 2S U R                  24   nURS                  XxS5      RU                  5       nU RW                  U5      nUU4$ )
NrE   r   r   rW   llama_4_scaling_beta original_max_position_embeddingsr   r[   )dropoutr   ),shaper   r   r   r   r   r   r   r   rF   	transposer=   splitr   r   r   r   r   r:   rope_interleaver   r   re   catr   rope_parametersgettodtypeupdater|   r   rG   padr   get_interface_attn_implementationr   trainingr   r   rf   
contiguousr   )rA   rI   r   r   r   r   r   
batch_size
seq_lengthquery_shape	key_shapeq_statesq_passq_rotcompressed_kvk_passk_rotvalue_statescossinquery_states
key_statesattention_interfaceattn_outputattn_weightss                            r+   rK   Mistral4Attention.forward   s    "/!4!4Sb!9
!r43C3CDR1F1F1XY	#{{=1H}}T%7%7m8T%UVH==-771=H/D/DdF[F[.\bde//>M4E4EtG\G\3]cef 3 3F ;<AA)LVVWXZ[\${{64I4I4??3[acd

:q*6K6KL&S;;&&:5%cRLE5/uc3GLE54fll3B/44yy&b9YYB7
#&<KK''++,BCKK''++,NO'
 "\
 	! &'6'='=j,X\XfXf'g$J'449I9IT__9\5543C3Cdoo3U/VWL(?(M(MKK,,.E)
 %8	%
  $}}C$2H2HLL	%
 	%
!\ (449I9IT__9\%aA/@/@&@AK!))*"EPPRkk+.L((r*   )r   r:   r   r   r   r   r   r|   r   r   r   r   r   r   r   r   r   r   r   r   r   r7   )r%   r&   r'   r(   r   intr9   r=   rw   rx   r   r   r   rK   r)   r#   r*   r+   rz   rz   i   s    )2~ )2# )2b )-F)||F) #5<<#=>F) t+	F)
 llF) F) -.F) 
u||U\\D0%2E2LL	MF) F)r*   rz   c                   &    \ rS rSrS\S\4S jrSrg)Mistral4DecoderLayer   r:   r|   c                 l   [         R                  R                  U 5        UR                  U l        [	        XS9U l        X!R                  :  a  [        U5      U l        O[        U5      U l        [        UR                  UR                  S9U l        [        UR                  UR                  S9U l        g )N)r:   r|   )eps)r   r   r9   r?   rz   	self_attnfirst_k_dense_replacerS   mlpr1   r!   rms_norm_epsinput_layernormpost_attention_layernormr   s      r+   r9   Mistral4DecoderLayer.__init__   s    
		4 !--*&N444"6*DH"6*DH.v/A/AvGZGZ[(78J8JPVPcPc(d%r*   )r?   r   r   r   r   N)r%   r&   r'   r(   r   r   r9   r)   r#   r*   r+   r   r      s    e~ e# er*   r   c                      ^  \ rS rSr% \\S'   SrSrS/rS/r	Sr
SrSrSrSr\\S.r/ r/ r\R*                  " 5       U 4S j5       rS	rU =r$ )
Mistral4PreTrainedModel   r:   modelTr   r   )rI   
attentionsc                   > [         TU ]  U5        [        U[        5      (       a5  [        R
                  " UR                  SU R                  R                  S9  g [        U[        5      (       ai  [        R
                  " UR                  SU R                  R                  S9  [        R
                  " UR                  SU R                  R                  S9  g g )Nr[   )meanstd)r8   _init_weights
isinstancer4   initnormal_r@   r:   initializer_rangerP   gate_up_proj	down_proj)rA   modulerB   s     r+   r   %Mistral4PreTrainedModel._init_weights  s    f%f011LLSdkk6S6ST 011LL,,3DKK<Y<YZLL))9V9VW 2r*   r#   )r%   r&   r'   r(   r   __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraph_supports_attention_backendr   rz   _can_record_outputs_keep_in_fp32_modules_strict"_keys_to_ignore_on_load_unexpectedr=   no_gradr   r)   rM   rN   s   @r+   r   r      sz    &*#/0#4"5N!"&-' $& )+&
]]_X Xr*   r   c                       \ rS rSrSrg)Mistral4Modeli  r#   Nr$   r#   r*   r+   r   r     r,   r*   r   c                       \ rS rSrSrg)Mistral4ForCausalLMi  r#   Nr$   r#   r*   r+   r   r     r,   r*   r   c                       \ rS rSrSrg)!Mistral4ForSequenceClassificationi  r#   Nr$   r#   r*   r+   r   r     r,   r*   r   c                       \ rS rSrSrg)Mistral4ForTokenClassificationi  r#   Nr$   r#   r*   r+   r   r     r,   r*   r   )r   r   r   r   r   )?collections.abcr   r=   torch.nn.functionalr   
functionalrG    r   r   cache_utilsr   modeling_flash_attention_utilsr   modeling_layersr	   r
   modeling_utilsr   r   processing_utilsr   utilsr   utils.genericr    deepseek_v3.modeling_deepseek_v3r   r   r   r   r   llama.modeling_llamar   r   r   r   r   r   ministral3.modeling_ministral3r   qwen2_moe.modeling_qwen2_moer   configuration_mistral4r   
get_loggerr%   loggerr!   r.   r1   r   r4   rP   rS   rz   r   r   r   r   r   r   __all__r#   r*   r+   <module>r     s   %     &   B ^ F &  9   D 6 2 
		H	%	l 		2 		+ 	 	) 	*- *0r)+ r)je1 e Xo X:	J 		* 		(HJa 		%BD[ 	r*   