
    Z j]                        S r SSKJr  SSKrSSKJr  SSKJr  SSKJ	r
  SSKJr  SS	KJrJr  SS
KJr  SSKJr  SSKJr  SSKJrJr  SSKJrJr  SSKJr  SSKJrJ r J!r!J"r"  SSK#J$r$  SSK%J&r&J'r'  SSK(J)r)  SSK*J+r+J,r,J-r-J.r.J/r/J0r0J1r1  SSK2J3r3  \"Rh                  " \55      r6 " S S\-5      r7 " S S\.5      r8 " S S\Rr                  5      r: " S S\Rr                  5      r; " S  S!\Rr                  5      r< " S" S#\Rr                  5      r= " S$ S%\Rr                  5      r> " S& S'\)5      r?\  " S( S)\,5      5       r@\  " S* S+\+5      5       rA " S, S-\@\5      rB " S. S/\\@5      rC/ S0QrDg)1zPyTorch JetMoe model.    )CallableN)nn)
functional   )initialization)ACT2FN)CacheDynamicCache)GenerationMixin)create_causal_mask) GenericForSequenceClassification)MoeCausalLMOutputWithPastMoeModelOutputWithPast)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuplelogging)merge_with_config_defaults)OutputRecordercapture_outputs   )LlamaDecoderLayer)MixtralModelMixtralPreTrainedModelMixtralRMSNormMixtralRotaryEmbeddingapply_rotary_pos_embeager_attention_forwardload_balancing_loss_func   )JetMoeConfigc                       \ rS rSrSrg)JetMoeRMSNorm4    N__name__
__module____qualname____firstlineno____static_attributes__r(       z/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/jetmoe/modular_jetmoe.pyr&   r&   4       r/   r&   c                       \ rS rSrSrg)JetMoeRotaryEmbedding8   r(   Nr)   r(   r/   r0   r3   r3   8   r1   r/   r3   c                   B   ^  \ rS rSrS\S\S\SS4U 4S jjrS rS	rU =r$ )
JetMoeParallelExperts<   num_experts
input_sizeoutput_sizereturnNc                    > [         TU ]  5         [        R                  " [        R
                  " XU5      5      U l        Xl        X l        X0l	        g)aS  
Initialize the JetMoeParallelExperts module.
The experts weights are stored in [num_experts, output_size, input_size] format. Such that it's compatible with
many MoE libraries, such as [Megablock](https://github.com/databricks/megablocks) and
[ScatterMoE](https://github.com/shawntan/scattermoe), as well as the
[MoE kernel](https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/fused_moe/fused_moe.py)
used in vllm.

Args:
    num_experts (int):
        Number of experts.
    input_size (int):
        Size of the input.
    output_size (int):
        Size of the output.
N)
super__init__r   	Parametertorchemptyweightr8   r9   r:   )selfr8   r9   r:   	__class__s       r0   r>   JetMoeParallelExperts.__init__=   s<    " 	ll5;;{#TU&$&r/   c                     UR                  USS9n/ n[        U R                  5       H8  nUR                  [        R
                  " X5   U R                  U   5      5        M:     [        R                  " USS9nU$ )z
Forward pass of the JetMoeParallelExperts module.

Args:
    inputs (Tensor):
        Input tensor.
    expert_size:
        Expert size information.

Returns:
    Tensor: Output tensor.
r   dim)	splitranger8   appendFlinearrB   r@   cat)rC   inputsexpert_size
input_listoutput_listiresultss          r0   forwardJetMoeParallelExperts.forwardT   sh     \\+1\5
t''(Aqxx
t{{1~FG )))KQ/r/   )r9   r8   r:   rB   	r*   r+   r,   r-   intr>   rU   r.   __classcell__rD   s   @r0   r6   r6   <   s.    'C 'S 's 't '. r/   r6   c                   >   ^  \ rS rSrS\S\S\4U 4S jjrS rSrU =r$ )JetMoeTopKGatingi   r9   r8   top_kc                 z   > [         TU ]  5         X l        Xl        X0l        [
        R                  " XSS9U l        g)z
Initialize the top-k gating mechanism.

Args:
    input_size (`int`):
        Size of the input.
    num_experts (`int`):
        Number of experts.
    top_k (`int`):
        Number of top experts to select.
FbiasN)r=   r>   r8   r9   r^   r   Linearlayer)rC   r9   r8   r^   rD   s       r0   r>   JetMoeTopKGating.__init__j   s2     	&$
YYzUC
r/   c                 z   U R                  U5      R                  5       nUR                  U R                  SS9u  p4[        R
                  " USS9R                  U5      n[        R                  " UR                  S5      U R                  /UR                  UR                  S9nUR                  SUS5      nUR                  5       R                  S5      nUR                  5       nUR!                  5       n	U	R#                  S5      u  pUR%                  U R                  SS9nUR!                  5       nX[   nXXU4$ )Nr#   rG   r   dtypedevicetrunc)rounding_mode)rc   floattopkr^   r@   softmaxtype_aszerossizer8   rg   rh   scatterlongsumtolistflattensortdiv)rC   hidden_stateslogitstop_k_logitstop_k_indicestop_k_gatesro   gatesrP   top_k_experts_index_sorted_expertsbatch_indexbatch_gatess                 r0   rU   JetMoeTopKGating.forward~   s"   M*002&,kk$**!k&D#mmLa8@@O a $"2"23;;L;LU`UgUg
 a2jjl&&q) "((* &--/"/"4"4Q"7*..tzz.Q "))+!7#+FRRr/   )r9   rc   r8   r^   rW   rZ   s   @r0   r\   r\   i   s-    D3 DS D D(S Sr/   r\   c                   :   ^  \ rS rSrSrS\4U 4S jjrS rSrU =r	$ )	JetMoeMoE   z
A Sparsely gated mixture of experts layer with 1-layer Feed-Forward networks as experts.

Args:
    config:
        Configuration object with model hyperparameters.
configc                 <  > [         TU ]  5         UR                  U l        UR                  U l        [
        UR                     U l        [        R                  R                  [        R                  " U R                  5      5      U l        [        UR                  U R                  U R                  S-  5      U l        [        UR                  U R                  U R                  5      U l        [#        U R                  UR                  UR$                  S9U l        g )Nr   r9   r8   r^   )r=   r>   hidden_sizer9   intermediate_sizer   activation_function
activationr@   r   r?   rA   ra   r6   num_local_expertsinput_linearoutput_linearr\   num_experts_per_tokrouterrC   r   rD   s     r0   r>   JetMoeMoE.__init__   s     ,,!33 !;!;<HH&&u{{4??'CD	1&2J2JDOO]a]m]mpq]qr263K3KTM]M]_c_n_no&00,,
r/   c                    UR                  5       u  p#nUR                  SU5      nU R                  U5      u  pVpxn	X   n
U R                  X5      nUR	                  SSS9nU R                  US   5      US   -  nU R                  X5      nXSS2S4   -  n[        R                  " X#-  U R                  4UR                  UR                  S9nUR                  SXm5      nUR                  X#U R                  5      nXR                  -   nU$ )z
Forward pass of the mixture of experts layer.

Args:
    layer_input (Tensor):
        Input tensor.

Returns:
    Tensor:
        Output tensor.
    Tensor:
        Router logits.
r   rG   r   r#   Nrf   )rp   reshaper   r   chunkr   r   r@   ro   r9   rg   rh   	index_addviewra   )rC   layer_inputbszlengthemb_sizer   r   r   rP   router_logitsexpert_inputsrx   chunked_hidden_statesexpert_outputsro   layer_outputs                   r0   rU   JetMoeMoE.forward   s    !, 0 0 2X!))"h7BF++kBZ?-#0))-E - 3 3A2 3 >(=a(@ADYZ[D\\++MG'ag*>>S\4??;>CWCW`n`u`uvq+F#((dooF#ii/r/   )r   ra   r   r   r9   r   r   )
r*   r+   r,   r-   __doc__r$   r>   rU   r.   rY   rZ   s   @r0   r   r      s    
| 
  r/   r   c                   F   ^  \ rS rSrSrS\4U 4S jjrS rS rS r	Sr
U =r$ )		JetMoeMoA   z
A Sparsely gated mixture of attention layer with pairs of query- and output-projections as experts.

Args:
    config:
        Configuration object with model hyperparameters.
r   c                 d  > [         TU ]  5         UR                  U l        UR                  U l        UR                  UR                  -  U l        UR                  U l	        [        R                  R                  [        R                  " U R
                  5      5      U l        [        U R                  U R
                  U R                  5      U l        [        U R                  U R                  U R
                  5      U l        [%        U R
                  U R                  U R                  S9U l        g )Nr   )r=   r>   r   r8   r   r9   kv_channelsnum_key_value_headsr   r^   r@   r   r?   rA   ra   r6   r   r   r\   r   r   s     r0   r>   JetMoeMoA.__init__   s    !33 ,,!--0J0JJ//
HH&&u{{4??'CD	1$2B2BDOOUYUeUef243C3CTEUEUW[WfWfg&((**
r/   c                    UR                  5       u  p#nUR                  SU5      nU R                  U5      u  pVpxn	XVXx4n
X   nU R                  X5      n[        R
                  " X#-  U R                  -  U R                  4UR                  UR                  S9nUR                  SX\5      nUR                  X#U R                  S5      nXU
4$ )zq
Map inputs to attention experts according to routing decision and compute query projection inside each experts.
r   rf   r   )rp   r   r   r   r@   ro   r^   r   rg   rh   r   r   )rC   r   r   r   r   r   r   r   rP   r   	topo_infor   r   ro   r   s                  r0   mapJetMoeMoA.map   s     !, 0 0 2X!))"h7UYU`U`alUmR;])Q	 $0**=F \DJJ&(8(89AUAU^l^s^s
 q*>O#((djj"EI55r/   c                    UR                  5       u  p4pVUR                  SU5      nUu  pxpX   nU R                  X5      nXSS2S4   -  n[        R                  " X4-  U R
                  4UR                  UR                  S9nUR                  SX5      nUR                  X4U R
                  5      nXR                  -   nU$ )ze
Compute output projection inside each attention experts and merge the outputs of different experts.
r   Nrf   r   )rp   r   r   r@   ro   r9   rg   rh   r   r   ra   )rC   r   r   r   r   kr   r   r   r   rP   r   r   ro   r   s                  r0   reduceJetMoeMoA.reduce  s     '2&6&6&8#Q!))"k:FOC; $9++MG (ag*>> S\4??;>CWCW`n`u`uvq+F#((dooF#ii/r/   c                     [        S5      e)Nz-This module doesn't support call and forward.)NotImplementedError)rC   r   s     r0   rU   JetMoeMoA.forward  s    !"QRRr/   )ra   r   r   r9   r8   r   r   r^   )r*   r+   r,   r-   r   r$   r>   r   r   rU   r.   rY   rZ   s   @r0   r   r      s*    
| 
$6.,S Sr/   r   c                     ^  \ rS rSrSrSS\S\S-  4U 4S jjjr   SS\R                  S\R                  S-  S	\R                  S-  S
\S-  S\\R                  \R                  S-  \\R                     S-  4   4
S jjrSrU =r$ )JetMoeAttentioni   z@
Multi-headed attention from 'Attention Is All You Need' paper.
Nr   	layer_idxc                 Z  > [         TU ]  5         Xl        X l        SU l        Uc-  [
        R                  SU R                  R                   S35        SU l	        UR                  U l        UR                  U l        UR                  UR                  -  U l        UR                  U l        UR                   U l        UR                  U l        U R$                  S-  U l        [)        U5      U l        [,        R.                  R1                  UR2                  U R                  S-  SS	9U l        g)
z
Initialize the JetMoeAttention module.

Args:
    config:
        Configuration object with model hyperparameters.
    layer_idx:
        Index of the layer in the model.
TNzInstantiating z without passing a `layer_idx` is not recommended and will lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` when creating this class.r#   g      r   Fr`   )r=   r>   r   r   	is_causalloggerwarning_oncerD   r*   num_key_value_groupsr   r^   attention_dropoutr   r   kv_projection_sizenum_attention_heads	num_headshead_dimscalingr   expertsr@   r   rb   r   kv_projrC   r   r   rD   s      r0   r>   JetMoeAttention.__init__%  s     	" !8!8 9 :, , %&!//
!'!9!9"("4"4v7Q7Q"Q#)#=#= 33**}}d* (xxv'9'94;R;RUV;V]bcr/   rx   attention_maskposition_embeddingspast_key_valuesr;   c                    UR                   S S n/ UQSPU R                  P7nU R                  R                  U5      u  pn
U R	                  U5      R                  SSS9u  pUR                  U5      R                  SS5      nUR                  U5      R                  SS5      nUR                  U5      R                  SS5      nUu  p[        XX5      u  pUb  UR                  XU R                  5      u  p[        R                  " U R                  R                  [        5      nUR!                  SU R"                  SS5      nUR!                  SU R"                  SS5      nU" U UUUU4U R$                  (       d  SOU R&                  U R(                  S.UD6u  nnUR                  " / UQU R"                  PSP76 nU R                  R+                  UU
5      nUR                  " / UQSP76 nUUU	4$ )Nr   r   rG   r#           )dropoutr   )shaper   r   r   r   r   r   	transposer    updater   r   get_interfacer   _attn_implementationr!   repeatr^   trainingr   r   r   )rC   rx   r   r   r   kwargsinput_shapehidden_shapequery_statesr   r   
key_statesvalue_statescossinattention_interfaceattn_outputattn_weightss                     r0   rU   JetMoeAttention.forwardF  s    $))#2.88b8$--8151A1A-1P.Y#'<<#>#D#DQB#D#O 
#((6@@AF__\2<<QB
#((6@@AF&#7RU#[ &'6'='=jX\XfXf'g$J(?(M(MKK,,.E)
  &&q$**a;
#**1djj!Q?$7	%
  $}}C$2H2HLL	%
 	%
!\ "&&DDTZZDDll))+yA!&&88R8L-77r/   )r   r   r   r   r   r   r   r   r   r   r   r   r^   N)NNN)r*   r+   r,   r-   r   r$   rX   r>   r@   Tensor
LongTensorr	   tuplerU   r.   rY   rZ   s   @r0   r   r      s    d| dd
 d dH /37;(,/8||/8 t+/8 #--4	/8
 /8 
u||U\\D0%2E2LL	M/8 /8r/   r   c                     ^  \ rS rSrSS\S\S-  4U 4S jjjr     SS\R                  S\R                  S-  S\R                  S-  S	\
S-  S
\S-  S\\R                  \R                  4   S-  S\\   S\R                  4S jjrSrU =r$ )JetMoeDecoderLayerix  Nr   r   c                    > [         TU ]  X5        [        UR                  5      U l        [        X5      U l        [        UR                  5      U l        [        U5      U l	        U ?
g r   )r=   r>   r&   r   input_layernormr   self_attentionpost_attention_layernormr   mlp	self_attnr   s      r0   r>   JetMoeDecoderLayer.__init__y  sT    +,V-?-?@-f@(5f6H6H(I%V$Nr/   rx   r   position_idsr   	use_cacher   r   r;   c           
          UnU R                  U5      nU R                  " SUUUUUUS.UD6u  n  n	X-   nUnU R                  U5      nU R                  U5      nX-   nU$ )N)rx   r   r   r   r   r   r(   )r   r   r   r   )
rC   rx   r   r   r   r   r   r   residualr   s
             r0   rU   JetMoeDecoderLayer.forward  s     !,,];"11 
')%+ 3
 
q! !0 !55mD/ 0r/   )r   r   r   r   r   )NNNFN)r*   r+   r,   r-   r$   rX   r>   r@   r   r   r	   boolr   r   r   rU   r.   rY   rZ   s   @r0   r   r   x  s    | d
   /304(,!&HL|| t+ &&-	
  $; #5<<#=>E +, 
 r/   r   c                       \ rS rSr% \" \SS9\" \SS9/\\" \SS9S.r\	\
S'   SrS	rS
/rS/rSrSrS	r\R&                  " 5       S 5       rSrg)JetMoePreTrainedModeli  r   )index   r#   )r   rx   
attentionsr   modelFr   r   Tc                 >   [         R                  " X5        [        U[        5      (       a5  [        R
                  " UR                  SU R                  R                  S9  g[        U[        [        -  5      (       a!  [        R                  " UR                  5        gg)zInitialize the weights.r   )meanstdN)r   _init_weights
isinstancer6   initnormal_rB   r   initializer_ranger   r   zeros_ra   )rC   modules     r0   r   #JetMoePreTrainedModel._init_weights  se     	%%d3f344LLSdkk6S6ST	I 566KK$ 7r/   r(   N)r*   r+   r,   r-   r   r   r\   r   _can_record_outputsr$   __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_can_compile_fullgraphr@   no_gradr   r.   r(   r/   r0   r   r     s     )BNScklDmn+$_A>
 &+#-.#4"5N"
]]_% %r/   r   c                     ^  \ rS rSrS\4U 4S jjr\\\      SS\	R                  S-  S\	R                  S-  S\	R                  S-  S\S-  S	\	R                  S-  S
\S-  S\\   S\4S jj5       5       5       rSrU =r$ )JetMoeModeli  r   c           	        > [         TU ]  U5        UR                  U l        UR                  U l        [
        R                  " UR                  UR                  U R                  5      U l        [
        R                  " [        UR                  5       Vs/ s H  n[        X5      PM     sn5      U l        UR                  U l        [        UR                  UR                   S9U l        g s  snf )N)eps)r=   r>   pad_token_idpadding_idx
vocab_sizer   	Embeddingr   embed_tokens
ModuleListrJ   num_hidden_layersr   layersr   r&   rms_norm_epsnormr   s      r0   r>   JetMoeModel.__init__  s     !.. ++LL):):F<N<NPTP`P`ammDI&JbJbDcdDcy2Dcd
 %+$?$?!!&"4"4&:M:MN	 es   C+N	input_idsr   r   r   inputs_embedsr   r   r;   c           
      B   US L US L-  (       a  [        S5      eU(       a  Uc  [        U R                  S9nUc  U R                  U5      nUcU  Ub  UR	                  5       OSn[
        R                  " UR                  S   UR                  S9U-   nUR                  S5      n[        U R                  UUUUS9n	Un
U R                  X5      nU R                  S U R                  R                    H  nU" U
4UU	UUUS.UD6n
M     U R                  U
5      n
[        U
US9$ )	Nz:You must specify exactly one of input_ids or inputs_embeds)r   r   r#   )rh   )r   r!  r   r   r   )r   r   r   r   r   )last_hidden_stater   )
ValueErrorr
   r   r  get_seq_lengthr@   aranger   rh   	unsqueezer   
rotary_embr  r  r  r   )rC   r   r   r   r   r!  r   r   past_seen_tokenscausal_maskrx   r   decoder_layers                r0   rU   JetMoeModel.forward  sF    -t";<YZZ0*$++>O  --i8MCRC^==?de <<(;(;A(>}G[G[\_ooL'11!4L(;;')+%
 & #oomJ![[)H4;;+H+HIM)$7* /#) M J 		-0%++
 	
r/   )r   r  r  r  r  r  )NNNNNN)r*   r+   r,   r-   r$   r>   r   r   r   r@   r   r   r	   FloatTensorr   r   r   r   rU   r.   rY   rZ   s   @r0   r  r    s    
O| 
O   .2.204(,26!%5
##d*5
 t+5
 &&-	5

 5
 ((4/5
 $;5
 +,5
 
 5
    5
r/   r  c                   :  ^  \ rS rSrSS0rU 4S jr\\         SS\R                  S-  S\R                  S-  S\R                  S-  S	\S-  S
\R                  S-  S\R                  S-  S\S-  S\\R                  -  S\S-  S\4S jj5       5       rSrU =r$ )JetMoeForCausalLMi  zlm_head.weightzmodel.embed_tokens.weightc                 l  > [         TU ]  U5        [        U5      U l        UR                  U l        UR
                  U l        [        R                  " UR                  UR                  SS9U l	        UR                  U l
        UR                  U l        UR                  U l        U R                  5         g )NFr`   )r=   r>   r  r   r  aux_loss_coefr   rb   r   lm_headtie_word_embeddingsr   r8   r   	post_initr   s     r0   r>   JetMoeForCausalLM.__init__  s      (
 ++#11yy!3!3V5F5FUS#)#=#= !33#)#=#=  	r/   Nr   r   r   r   r!  labelsr   logits_to_keepoutput_router_logitsr;   c
                 \   U R                   " SUUUUUUU	S.U
D6nUR                  n[        U[        5      (       a  [	        U* S 5      OUnU R                  US S 2US S 24   5      nS nUb*  U R                  " UU4SU R                  R                  0U
D6nS nU	(       aY  [        UR                  U R                  U R                  U5      nUb*  XR                  UR                  UR                  5      -  -  n[!        UUUUR"                  UR$                  UR&                  UR                  S9$ )N)r   r   r   r   r!  r   r8  r  )lossaux_lossry   r   rx   r   r   r(   )r   r#  r   rX   slicer2  loss_functionr   r  r"   r   r8   r   r1  torh   r   r   rx   r   )rC   r   r   r   r   r!  r6  r   r7  r8  r   outputsrx   slice_indicesry   r:  r;  s                    r0   rU   JetMoeForCausalLM.forward  sN    +/** 	+
)%+'!5	+
 	+
  118B>SV8W8W~ot4]kmA}a,?@A%%  ;;11 	D /%%  ((	H !**X[[-EEE(#33!//))!//
 	
r/   )r1  r2  r   r8   r   r3  r  )	NNNNNNNr   F)r*   r+   r,   r-   _tied_weights_keysr>   r   r   r@   r   r   r	   r-  r   rX   r   rU   r.   rY   rZ   s   @r0   r/  r/    s    *,GH  .2.204(,26*.!%-.,19
##d*9
 t+9
 &&-	9

 9
 ((4/9
   4'9
 $;9
 ell*9
 #Tk9
 
#9
  9
r/   r/  c                       \ rS rSrSrg)JetMoeForSequenceClassificationiR  r(   Nr)   r(   r/   r0   rD  rD  R  s    `cr/   rD  )r/  r  r   rD  )Er   collections.abcr   r@   r   torch.nnr   rL    r   r  activationsr   cache_utilsr	   r
   
generationr   masking_utilsr   modeling_layersr   modeling_outputsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   utils.genericr   utils.output_capturingr   r   llama.modeling_llamar   mixtral.modeling_mixtralr   r   r   r   r    r!   r"   configuration_jetmoer$   
get_loggerr*   r   r&   r3   Moduler6   r\   r   r   r   r   r   r  r/  rD  __all__r(   r/   r0   <module>rY     sS    $   $ & ! . ) / R F & R R 7 E 4   / 
		H	%	N 		2 	*BII *Z.Sryy .Sb7		 7tIS		 ISXU8bii U8p&* &R %2 % %2 E
, E
 E
PK
- K
\ d&FH] c kr/   