
    Z jD                     h   S r SSKJr  SSKrSSKJr  SSKJr  SSKJ	r	J
r
  SSKJr  SS	KJrJr  SS
KJr  SSKJrJr  SSKJrJr  SSKJr  SSKJrJrJrJrJr  SSK J!r!  SSK"J#r#J$r$  SSK%J&r&  SSK'J(r(J)r)J*r*J+r+J,r,  SSK-J.r.J/r/  SSK0J1r1  \Rd                  " \35      r4 " S S\*5      r5 " S S\&5      r6 " S S\/5      r7 " S S\Rp                  5      r9 " S S \.5      r: " S! S"\Rp                  5      r; " S# S$\(5      r< " S% S&\5      r= " S' S(\5      r>\ " S) S*\>5      5       r? " S+ S,\)\>\5      r@/ S-QrAg).zPyTorch AFMoE model.    )CallableN)nn   )initialization)CacheDynamicCache)GenerationMixin)create_causal_mask!create_sliding_window_causal_mask)GradientCheckpointingLayer)MoeCausalLMOutputWithPastMoeModelOutputWithPast)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tupleis_grouped_mm_availablelogging)merge_with_config_defaults)OutputRecordercapture_outputs   )GptOssRMSNorm)LlamaAttentionLlamaForCausalLMLlamaRotaryEmbeddingapply_rotary_pos_embeager_attention_forward)Qwen2MoeExpertsQwen2MoeMLP   )AfmoeConfigc                       \ rS rSrSrg)AfmoeRotaryEmbedding/    N__name__
__module____qualname____firstlineno____static_attributes__r(       x/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/afmoe/modular_afmoe.pyr&   r&   /       r/   r&   c                       \ rS rSrSrg)AfmoeRMSNorm3   r(   Nr)   r(   r/   r0   r3   r3   3   r1   r/   r3   c                       \ rS rSrSrg)AfmoeMLP7   r(   Nr)   r(   r/   r0   r6   r6   7   r1   r/   r6   c                   f   ^  \ rS rSrSrU 4S jrS\R                  S\R                  4S jrSr	U =r
$ )AfmoeTokenChoiceRouter;   z
Token-choice top-K router for MoE routing.

This router assigns each token to the top-K experts based on sigmoid scores, matching the released checkpoints.
c                    > [         TU ]  5         Xl        UR                  U l        UR
                  U l        UR                  U l        [        R                  " UR                  UR
                  SS9U l
        g NFbias)super__init__confignum_experts_per_toktop_knum_expertsroute_scaler   Linearhidden_sizegateselfrA   	__class__s     r0   r@   AfmoeTokenChoiceRouter.__init__B   s\    //
!--!--IIf00&2D2D5Q	r/   hidden_statesexpert_biasc                    UR                   u    p4UR                  SU5      nU R                  U5      R                  [        R
                  5      n[        R                  " U5      n[        R                  " Xb-   U R                  SS9u  p7UR                  SUS9nUR                  SSS9S-   n	X-  nXR                  -  nXXU4$ )Nr#   )kdim)rR   indexT)rR   keepdimg#B;)shapeviewrH   totorchfloat32sigmoidtopkrC   gathersumrE   )
rJ   rM   rN   _
hidden_dimrouter_logitsscoresselected_experts
top_scoresdenominators
             r0   forwardAfmoeTokenChoiceRouter.forwardJ   s    (..1%**2z:		-033EMMB}-#jj)=QRS]]q0@]A
 nnTn:UB-
"2"22
*:::r/   )rA   rH   rD   rE   rC   )r*   r+   r,   r-   __doc__r@   rX   Tensorre   r.   __classcell__rK   s   @r0   r9   r9   ;   s.    R;U\\ ; ; ;r/   r9   c                       \ rS rSrSrg)AfmoeExpertsY   r(   Nr)   r(   r/   r0   rl   rl   Y   r1   r/   rl   c                   2   ^  \ rS rSrSrU 4S jrS rSrU =r$ )AfmoeSparseMoeBlock]   z
Mixture of Experts (MoE) module for AFMoE.

This module implements a sparse MoE layer with both shared experts (always active) and
routed experts (activated based on token-choice routing).
c                 ,  > [         TU ]  5         Xl        [        U5      U l        [        XR                  UR                  -  5      U l        [        U5      U l
        [        R                  " [        R                  " UR                  5      SS9U l        g )NF)requires_grad)r?   r@   rA   r9   routerr6   moe_intermediate_sizenum_shared_expertsshared_expertsrl   expertsr   	ParameterrX   zerosrD   rN   rI   s     r0   r@   AfmoeSparseMoeBlock.__init__e   sl    ,V4&v/K/KfNgNg/gh#F+<<F4F4F(GW\]r/   c                    UR                   u  p#nUR                  SU5      nU R                  XR                  5      u  pgnU R	                  U5      R                  X#U5      n	U R                  XXU5      R                  X#U5      n
X-   $ )NrP   )rU   rV   rs   rN   rv   rw   )rJ   rM   
batch_sizeseq_lenr_   hidden_states_flatr`   rc   rb   shared_outputrouted_outputs              r0   re   AfmoeSparseMoeBlock.forwardm   s    *7*=*='
Z*//J? 7;kk-QaQa6b3#3 ++,>?DDZZde%7:V[[
 ,,r/   )rA   rN   rw   rs   rv   )	r*   r+   r,   r-   rg   r@   re   r.   ri   rj   s   @r0   ro   ro   ]   s    ^- -r/   ro   c                      ^  \ rS rSrSrS\S\4U 4S jjr SS\R                  S\
\R                  \R                  4   S	\R                  S-  S
\S-  S\\   S\
\R                  \R                  4   4S jjrSrU =r$ )AfmoeAttention|   a6  
Multi-headed attention module with optional sliding window and gating.

This attention mechanism supports both full attention and sliding window attention,
and includes Q/K normalization and gating of the output. It inherits from [`LlamaAttention`] to minimize the amount
of custom logic we need to maintain.
rA   	layer_idxc                   > [         TU ]  X5        UR                  U   S:H  U l        U R                  (       a  UR                  OS U l        [        U R                  UR                  S9U l        [        U R                  UR                  S9U l	        [        R                  " UR                  UR                  U R                  -  SS9U l        g )Nsliding_attentionepsFr=   )r?   r@   layer_typesis_local_attentionsliding_windowr3   head_dimrms_norm_epsq_normk_normr   rF   rG   num_attention_heads	gate_projrJ   rA   r   rK   s      r0   r@   AfmoeAttention.__init__   s    + #)"4"4Y"?CV"V7;7N7Nf33TX"4==f6I6IJ"4==f6I6IJ6#5#5v7Q7QTXTaTa7ahmnr/   NrM   position_embeddingsattention_maskpast_key_valuekwargsreturnc                    UR                   S S n/ UQSPU R                  P7nU R                  U5      R                  U5      nU R	                  U5      R                  U5      n	U R                  U5      R                  U5      n
U R                  U5      nU R                  U5      R                  SS5      nU R                  U	5      R                  SS5      n	U
R                  SS5      n
U R                  (       a  Uu  p[        XX5      u  pUb  UR                  XU R                  5      u  p[        R                  " U R                   R"                  [$        5      nU" U UU	U
4UU R&                  (       d  SOU R(                  U R*                  U R,                  S.UD6u  nnUR                  " / UQSP76 R/                  5       nU[0        R2                  " U5      -  nU R5                  U5      nUU4$ )NrP   r#   r           )r   dropoutscalingr   )rU   r   q_projrV   k_projv_projr   r   	transposer   r   r   updater   r   get_interfacerA   _attn_implementationr    trainingattention_dropoutr   r   
contiguousrX   rZ   o_proj)rJ   rM   r   r   r   r   input_shapehidden_shapequery_states
key_statesvalue_statesgate_statescossinattention_interfaceoutputattn_weightsattn_outputs                     r0   re   AfmoeAttention.forward   s    $))#2.88b8$--8{{=166|D[[/44\B
{{=166|Dnn]3{{<0::1a@[[,66q!<
#--a3""*HC';LVY'_$L%'5'<'<ZW[WeWe'f$J(?(M(MKK,,.E)
  3	
 

 *#}}C$2H2HLL..
 
 
 
 .k.2.99;%--44kk&)L((r/   )r   r   r   r   r   )N)r*   r+   r,   r-   rg   r$   intr@   rX   rh   tupler   r   r   re   r.   ri   rj   s   @r0   r   r   |   s    	o{ 	os 	o  (,.)||.) #5<<#=>.) t+	.)
 .) +,.) 
u||U\\)	*.) .)r/   r   c                     ^  \ rS rSrSrS\S\4U 4S jjr     SS\R                  S\R                  S-  S	\R                  S-  S
\S-  S\S-  S\\R                  \R                  4   S-  S\\   S\R                   4S jjrSrU =r$ )AfmoeDecoderLayer   z
AFMoE decoder layer with dual normalization.

This layer applies self-attention followed by either a dense MLP or MoE block,
with dual normalization (pre and post) around each component.
rA   r   c                   > [         TU ]  5         UR                  U l        X l        [	        XS9U l        [        UR                  UR                  S9U l        [        UR                  UR                  S9U l	        [        UR                  UR                  S9U l
        [        UR                  UR                  S9U l        X!R                  :  U l        U R                  (       a  [        U5      U l        g [!        U5      U l        g )N)rA   r   r   )r?   r@   rG   r   r   	self_attnr3   r   input_layernormpost_attention_layernormpre_mlp_layernormpost_mlp_layernormnum_dense_layersmoe_enabledro   mlpr6   r   s      r0   r@   AfmoeDecoderLayer.__init__   s    !--"'vK  ,F,>,>FDWDWX(4V5G5GVM`M`(a% ".f.@.@fFYFY!Z".v/A/AvGZGZ"[ %(?(??*62DH'DHr/   NrM   r   position_idsr   	use_cacher   r   r   c           
          UnU R                  U5      nU R                  " SUUUUUUS.UD6u  pU R                  U5      nX-   nUnU R                  U5      nU R	                  U5      nU R                  U5      nX-   nU$ )N)rM   r   r   r   r   r   r(   )r   r   r   r   r   r   )
rJ   rM   r   r   r   r   r   r   residualr^   s
             r0   re   AfmoeDecoderLayer.forward   s     ! ,,];>> 
')%) 3
 
 55mD 0 !..}=///> 0r/   )	rG   r   r   r   r   r   r   r   r   )NNNNN)r*   r+   r,   r-   rg   r$   r   r@   rX   rh   
LongTensorr   boolr   r   r   FloatTensorre   r.   ri   rj   s   @r0   r   r      s    ({ (s (2 /304'+!%HL!||! t+! &&-	!
 ! $;! #5<<#=>E! +,! 
		! !r/   r   c                      ^  \ rS rSr% Sr\\S'   SrS/rS/r	\
" \SS9\\S	.r/ S
QrSrSrSr\" 5       rSrSrU 4S jrSrU =r$ )AfmoePreTrainedModeli  zz
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
rA   modelr   past_key_valuesr   )rS   )r`   rM   
attentions)r   r   r   r   r   r   normrN   Tc                   > [         TU ]  U5        U R                  R                  n[	        U[
        5      (       aA  [        R                  " UR                  SUS9  [        R                  " UR                  SUS9  g[	        U[        5      (       a+  [        R                  " UR                  R                  5        g[	        U[        5      (       a!  [        R                  " UR                  5        gg)zInitialize the weightsr   )meanstdN)r?   _init_weightsrA   initializer_range
isinstancerl   initnormal_gate_up_proj	down_projr9   zeros_rH   weightro   rN   )rJ   moduler   rK   s      r0   r   "AfmoePreTrainedModel._init_weights%  s    f%kk++fl++LL,,3C@LL))= 677KK**+ 344KK**+ 5r/   r(   )r*   r+   r,   r-   rg   r$   __annotations__base_model_prefix_no_split_modules_skip_keys_device_placementr   r9   r   r   _can_record_outputs_keep_in_fp32_modules_supports_sdpa_supports_flash_attn_supports_flex_attnr   _can_compile_fullgraph_supports_attention_backendsupports_gradient_checkpointingr   r.   ri   rj   s   @r0   r   r     s    
 ,-#4"5'(>aH*$
	 N!  #'&*#
, 
,r/   r   c                     ^  \ rS rSrSrS\4U 4S jjr\\\	      SS\
R                  S-  S\
R                  S-  S\
R                  S-  S	\
R                  S-  S
\S-  S\S-  S\\   S\\-  4S jj5       5       5       rSrU =r$ )
AfmoeModeli2  z
Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`AfmoeDecoderLayer`]

Args:
    config: AfmoeConfig
rA   c           	        > [         TU ]  U5        UR                  U l        UR                  U l        [
        R                  " UR                  UR                  U R                  5      U l        [
        R                  " [        UR                  5       Vs/ s H  n[        X5      PM     sn5      U l        [        UR                  UR                  S9U l        [#        US9U l        SU l        U R)                  5         g s  snf )Nr   rA   F)r?   r@   pad_token_idpadding_idx
vocab_sizer   	EmbeddingrG   embed_tokens
ModuleListrangenum_hidden_layersr   layersr3   r   r   r&   
rotary_embgradient_checkpointing	post_initr   s      r0   r@   AfmoeModel.__init__;  s     !.. ++LL):):F<N<NPTP`P`ammCHIaIaCbcCbiv1Cbc
 !!3!39L9LM	.f=&+# ds   C?N	input_idsr   inputs_embedsr   r   r   r   r   c           
      .   US L US L-  (       a  [        S5      eU(       a  Uc  [        U R                  S9nUc  U R                  U5      nUcU  Ub  UR	                  5       OSn[
        R                  " UR                  S   UR                  S9U-   nUR                  S5      n[        U=n	[        5      (       d(  U R                  UUUS.n
[        S0 U
D6[        S0 U
D6S.n	UnU R                  R                  (       a  XR                  R                  S-  -  nU R!                  X5      n[#        U R$                  5       H-  u  pU" U4XR                  R&                  U      UUUUS	.UD6nM/     U R)                  U5      n[+        UU(       a  US
9$ S S
9$ )Nz:You must specify exactly one of input_ids or inputs_embedsr   r   r#   )device)rA   r   r   r   )full_attentionr   g      ?)r   r   r   r   r   )last_hidden_stater   r(   )
ValueErrorr   rA   r   get_seq_lengthrX   arangerU   r  	unsqueezer   dictr
   r   mup_enabledrG   r   	enumerater   r   r   r   )rJ   r   r   r   r   r   r   r   past_seen_tokenscausal_mask_mappingmask_kwargsrM   r   idecoder_layers                  r0   re   AfmoeModel.forwardJ  s    -t";<YZZ0*$++>O  --i8MCRC^==?de <<(;(;A(>}G[G[\_ooL'11!4L ?-FF++!."0#2	K #5"C{"C%F%U%U#
 & ;;"")[[-D-Dc-IJM"oomJ )$++ 6A)2;;3J3J13MN).#$7 M !7 		-0%+/8O
 	
>B
 	
r/   )r   r   r   r   r   r   r   )NNNNNN)r*   r+   r,   r-   rg   r$   r@   r   r   r   rX   r   rh   r   r   r   r   r   r   r   re   r.   ri   rj   s   @r0   r   r   2  s    {   .2.22604(,!%<
##d*<
 t+<
 ((4/	<

 &&-<
 <
 $;<
 +,<
 
'	'<
    <
r/   r   c                   N   \ rS rSrSS0rSS0rSS/S/40rS r\\	         SS
\
R                  S	-  S\
R                  S	-  S\
R                  S	-  S\S	-  S\
R                  S	-  S\
R                  S	-  S\S	-  S\S	-  S\\
R                  -  S\\   S\4S jj5       5       rSrg	)AfmoeForCausalLMi  zlm_head.weightzmodel.embed_tokens.weightlm_headcolwise_gather_outputrM   logitsc                     [         R                  X5        [        U5      U l        UR                  U l        [
        R                  " UR                  UR                  SS9U l        U R                  5         g r<   )
r   r@   r   r   r   r   rF   rG   r  r   )rJ   rA   s     r0   r@   AfmoeForCausalLM.__init__  sS    %%d3'
 ++yy!3!3V5F5FUSr/   Nr   r   r   r   r   labelsr   output_router_logitslogits_to_keepr   r   c
                    Ub  UOU R                   R                  nU R                  " SUUUUUUUS.U
D6nUR                  n[	        U	[
        5      (       a  [        U	* S 5      OU	nU R                  US S 2US S 24   5      nS nUb  U R                  " XU R                  40 U
D6n[        UUUR                  UR                  UR                  UR                  S9$ )N)r   r   r   r   r   r   r  )lossr  r   rM   r   r`   r(   )rA   r  r   r  r   r   slicer  loss_functionr   r   r   rM   r   r`   )rJ   r   r   r   r   r   r  r   r  r  r   outputsrM   slice_indicesr  r  s                   r0   re   AfmoeForCausalLM.forward  s      %9$D $++JjJj 	 +/** 	+
)%+'!5	+
 	+
  118B>SV8W8W~ot4]kmA}a,?@A%%fdooPPD(#33!//))!//
 	
r/   )r  r   r   )	NNNNNNNNr   )r*   r+   r,   r-   _tied_weights_keys_tp_plan_pp_planr@   r   r   rX   r   rh   r   r   r   r   r   r   r   re   r.   r(   r/   r0   r  r    s'   *,GH23H_-z:;H  .2.204(,26*.!%,0-.+
##d*+
 t++
 &&-	+

 +
 ((4/+
   4'+
 $;+
 #Tk+
 ell*+
 +,+
 
#+
  +
r/   r  )r  r   r   )Brg   collections.abcr   rX   r    r   r   cache_utilsr   r   
generationr	   masking_utilsr
   r   modeling_layersr   modeling_outputsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   utils.genericr   utils.output_capturingr   r   gpt_oss.modeling_gpt_ossr   llama.modeling_llamar   r   r   r   r    qwen2_moe.modeling_qwen2_moer!   r"   configuration_afmoer$   
get_loggerr*   loggerr&   r3   r6   Moduler9   rl   ro   r   r   r   r   r  __all__r(   r/   r0   <module>r9     s    $   & . ) R 9 Q F & k k 7 E 4  H , 
		H	%	/ 		= 		{ 	;RYY ;<	? 	-")) ->B)^ B)J?2 ?D,,? ,,^ V
% V
 V
r9
')= 9
xr/   