
    Z j<                        S r SSKJr  SSKrSSKJr  SSKJr  SSKJr  SSK	J
r
  SS	KJr  S
SKJr  S
SKJrJrJrJrJrJr  SSKJr   " S S\5      r " S S\5      r " S S\R4                  R6                  5      rS'S jr " S S\5      r " S S\R>                  5      r  " S S\RB                  5      r" " S S\5      r# " S S\5      r$ " S  S!\5      r% " S" S#\5      r& " S$ S%\\$5      r'/ S&Qr(g)(zPyTorch Phimoe model.    )CallableN)nn   ) GenericForSequenceClassification)ROPE_INIT_FUNCTIONS)maybe_autocast)OutputRecorder   )LlamaAttention)MixtralDecoderLayerMixtralExpertsMixtralForCausalLMMixtralModelMixtralPreTrainedModelMixtralRotaryEmbedding   )PhimoeConfigc                   0    \ rS rSrSS\4S jjrSS jrSrg)	PhimoeRotaryEmbedding(   Nconfigc                    [         R                  R                  5         UR                  U l        UR                  U l        Xl        U R                  R                  S   U l        U R                  U l
        U R                  S:w  a  [        U R                     U l
        U R                  U R                  U5      u  o0l        U R                  SUSS9  U R                  SUR                  5       SS9  g )N	rope_typedefaultinv_freqF)
persistentoriginal_inv_freq)r   Module__init__max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenr   rope_parametersr   compute_default_rope_parametersrope_init_fnr   attention_scalingregister_bufferclone)selfr   devicer   s       z/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/phimoe/modular_phimoe.pyr   PhimoeRotaryEmbedding.__init__)   s    
		"("@"@$*$B$B!44[A&*&J&J>>Y& 3DNN CD+/+<+<T[[&+Q((ZeD0(..2BuU    c                    Ub%  [        U R                  R                   SU S35      eS n[        R                  " U5      S-   nU R
                  R                  S   S:w  aU  U(       aN  XPR
                  R                  S   :  a  U R
                  R                  S   OU R
                  R                  S   nU R                  U R
                  UR                  U5      u  pgUc  UOUnUS S S 2S 4   R                  5       R                  UR                  S	   S
S5      R                  UR                  5      nUS S 2S S S 24   R                  5       n	[        UR                  R                  [        5      (       a0  UR                  R                  S:w  a  UR                  R                  OSn
[!        U
SS9   UR                  5       U	R                  5       -  R#                  SS5      n[        R$                  " X4S
S9nUR'                  5       U-  nUR)                  5       U-  nS S S 5        WR                  UR*                  5      WR                  UR*                  5      4$ ! , (       d  f       ND= f)Nz3 does not support layer types, but got `layer_type=`r   r   r    original_max_position_embeddingslong_mscaleshort_mscaler   mpscpuF)device_typeenabledr
   dim)
ValueError	__class____name__torchmaxr   r#   r%   r*   floatexpandshapeto
isinstancetypestrr   	transposecatcossindtype)r)   xposition_ids
layer_typemscaleseq_lenr   r&   inv_freq_expandedposition_ids_expandedr6   freqsembrH   rI   s                  r+   forwardPhimoeRotaryEmbedding.forward9   s   !>>**++^_i^jjkl  ))L)A-;;&&{3y@W [[889[\\ ++M:[[00@ 
 '+&7&7QXXw&W#&,n"&$T1d]399;BB<CUCUVWCXZ\^_`ccdedldlm ,QaZ 8 > > @'1!((--'E'E!((--[`J`ahhmmfkUC&,,.1F1L1L1NNYYZ[]^_E))UN3C'')f$C'')f$C	 D
 vvaggqww// DCs   4A.I  
I.)r&   r   r!   r"   r%   r   N)NN)r<   
__module____qualname____firstlineno__r   r   rT   __static_attributes__ r-   r+   r   r   (   s    V| V 0r-   r   c                       \ rS rSrSrg)PhimoeAttentionU   r[   Nr<   rW   rX   rY   rZ   r[   r-   r+   r]   r]   U       r-   r]   c                       \ rS rSr\S\R                  S\R                  S\R                  S\R                  S\R                  4
S j5       r\S\R                  4S	 j5       rS
r	g)PhimoeMultiplierY   scores
multiplierselected_expertsmasked_gatesmask_for_onec                 .    U R                  X#U5        X%-  $ )a  
Forward pass for the custom autograd function.

Args:
    ctx: Context object to save information for backward computation.
    scores (torch.Tensor): Input scores tensor.
    multiplier (torch.Tensor): Multiplier tensor.
    selected_experts (torch.Tensor): Tensor of selected experts.
    masked_gates (torch.Tensor): Masked gates tensor.
    mask_for_one (torch.Tensor): Mask for one tensor.

Returns:
    torch.Tensor: Result of the forward pass.
)save_for_backward)ctxrd   re   rf   rg   rh   s         r+   rT   PhimoeMultiplier.forwardZ   s    . 	jLI((r-   grad_at_outputc                 ~    U R                   u  p#nX-  nXAR                  S5      -  nUR                  SUUS9  USSSS4$ )a
  
Backward pass for the custom autograd function.

Args:
    ctx: Context object with saved tensors from the forward pass.
    grad_at_output (torch.Tensor): Gradient at the output.

Returns:
    tuple[torch.Tensor, None, None, None, None]: Gradients for the inputs.
r3   )r9   indexsrcN)saved_tensorsmulscatter_add_)rk   rm   re   rf   rg   grad_at_scores_expandeds         r+   backwardPhimoeMultiplier.backwardt   sg     695F5F2
l'4".1C1CB1G"G,," 	- 	
 $
 	
r-   r[   N)
r<   rW   rX   rY   staticmethodr=   TensorrT   ru   rZ   r[   r-   r+   rb   rb   Y   sx    )) LL)  ,,	)
 ll) ll) )2 

 
r-   rb   c                 H   [         R                  " 5          U R                  SSS9u  pEU R                  5       R	                  US9nX@-
  U-  SU-  :  nSSS5        U R                  W[        S5      5      nU(       ab  U[         R                  " U[         R                  S9R                  5       R                  5       -
  R                  SS	9S
   R                  S5      nOWn[         R                  " USS	9nUR                  SUS9n	U(       a  UR                  SSS9u  p[         R                  " X:H  [         R                  " U
5      S:  5      n[         R                   " SUSS9R#                  U5      n[$        R'                  U U	UUU5      nOU	n[         R(                  " U SU[        S5      5      n[         R                  " 5          UR                  SSS9u  pEU R                  5       R	                  US9nX@-
  U-  SU-  :  nSSS5        UR                  U[        S5      5      nU(       ab  U[         R                  " U[         R                  S9R                  5       R                  5       -
  R                  SS	9S
   R                  S5      nOWn[         R                  " USS	9nUR                  SUS9nU(       a  UR                  SSS9u  p[         R                  " X:H  [         R                  " U
5      R+                  5       S:  5      n[         R                   " SUSS9R#                  U5      n[$        R'                  U UUUU5      nOUn[         R,                  " UU4SS	9n[         R,                  " X4SS	9nUU4$ ! , (       d  f       GN4= f! , (       d  f       GN= f)u,  
Sparse mixer function to select top-k experts and compute multipliers.
Based on the paper: https://huggingface.co/papers/2409.12136
We first replace the TopK(·) function as random sampling of discrete variables
in model training. Then, following Liu et al. (2023a) and Liu et al. (2023b), we apply Heun's
third order method to approximate the expert routing gradient and construct a modified
back-propagation to give a mathematically sound gradient estimation for expert routing.

Args:
    scores (torch.Tensor): Input scores tensor.
    jitter_eps (float): Jitter epsilon for numerical stability.
    training (bool): Flag indicating if the model is in training mode.
    top_k (int): Number of top experts to select.

Returns:
    tuple[torch.Tensor, torch.Tensor]: Multiplier and selected experts tensors.
r3   T)r9   keepdim)minr
   Nz-inf)memory_formatr8   r   )r9   ro   g      ?gioT?gK=U?)alpha)r=   no_gradr>   absclampmasked_fillr?   
empty_likelegacy_contiguous_formatexponential_log	unsqueezesoftmaxgather
logical_or	rand_likeaddtype_asrb   applyscatteruniform_concat)rd   
jitter_epstrainingtop_kmask_logits_thresholdmax_indfactorrg   rf   multiplier_o
max_scoresrh   re   masked_scoresmasked_gates_top2selected_experts_top2multiplier_top2_omask_for_one_top2multiplier_top2s                      r+   sparsemixerr      s   $ 
)/D)I&##(=#>"7"@F!JqS]~ ^	 
 %%&;U6]KL ""<u?]?]^kkmqqst SRS[	
 Yr] 	 # ==26L&&25E&FL*..2t.D
'''OOJ'$.

 yyVDLL\Z%++

 "
 MM
f	M 
)6):):r4):)P&##(=#>"7"@F!JqS]~ ^	 
 &112GvW """#4EDbDbc
 SRS[ Yr] 	 !(&7R@)00R?T0U/33D3I
!,,!,OOJ'002T9

 "IIf.?vNVVWhi*00!
 ,z?;DJ||%5$MSUV 	 G 
f 
s   =N =N 
N
N!c                       \ rS rSrSrg)PhimoeExpertsi  r[   Nr_   r[   r-   r+   r   r     r`   r-   r   c                      ^  \ rS rSrS\4U 4S jjrS\R                  S\\R                  \R                  4   4U 4S jjr	Sr
U =r$ )PhimoeTopKRouteri  r   c                    > [         TU ]  UR                  UR                  SS9  UR                  U l        UR
                  U l        UR                  U l        g )NFbias)superr   hidden_sizenum_local_expertsrouter_jitter_noiseinput_jitter_noisenum_experts_per_tokr   r)   r   r;   s     r+   r   PhimoeTopKRouter.__init__  sL    ++V-E-EER#)#=#= "(";";//
r-   hidden_statesreturnc                 H  > U R                   (       aS  U R                  S:  aC  U[        R                  " U5      R	                  SU R                  -
  SU R                  -   5      -  n[
        TU ]  U5      n[        X R                  U R                   U R                  S9u  p4X#U4$ )Nr         ?)r   r   r   )
r   r   r=   r   r   r   rT   r   r   r   )r)   r   router_logitsrouting_weightsrf   r;   s        r+   rT   PhimoeTopKRouter.forward  s    ==T44q8U--m<EEd---sT5L5L/L M 6,7&>&>^b^h^h-
) /???r-   )r   r   r   )r<   rW   rX   rY   r   r   r=   rx   tuplerT   rZ   __classcell__r;   s   @r+   r   r     sA    0| 0	@U\\ 	@eELL%,,<V6W 	@ 	@r-   r   c                   f   ^  \ rS rSrSrU 4S jrS\R                  S\R                  4S jrSr	U =r
$ )PhimoeSparseMoeBlocki)  a  
This implementation is
strictly equivalent to standard MoE with full capacity (no
dropped tokens). It's faster since it formulates MoE operations
in terms of block-sparse operations to accommodate imbalanced
assignments of tokens to experts, whereas standard MoE either
(1) drop tokens at the cost of reduced performance or (2) set
capacity factor to number of experts and thus waste computation
and memory on padding.
c                   > [         TU ]  5         UR                  U l        UR                  U l        UR                  U l        UR                  U l	        [        U5      U l        [        U5      U l        UR                  U l        g rV   )r   r   r   
hidden_dimintermediate_sizeffn_dimr   num_expertsr   r   r   routerr   expertsr   r   s     r+   r   PhimoeSparseMoeBlock.__init__5  si     ,,//!33//
&v.$V,"(";";r-   r   r   c                    UR                   u  p#nU R                  (       aS  U R                  S:  aC  U[        R                  " U5      R                  SU R                  -
  SU R                  -   5      -  nUR                   u  p#nUR                  SU5      nU R                  U5      u  pVnU R                  XU5      nUR                  X#U5      $ )Nr   r   r3   )	rA   r   r   r=   r   r   reshaper   r   )	r)   r   
batch_sizesequence_lengthr   _r   rf   final_hidden_statess	            r+   rT   PhimoeSparseMoeBlock.forward?  s    2?2E2E/
Z==T44q8U--m<EEd---sT5L5L/L M 3@2E2E/
Z%--b*=/3{{=/I,,"ll=O\"**:
SSr-   )r   r   r   r   r   r   r   )r<   rW   rX   rY   __doc__r   r=   rx   rT   rZ   r   r   s   @r+   r   r   )  s1    	<TU\\ Tell T Tr-   r   c                   4   ^  \ rS rSrS\S\4U 4S jjrSrU =r$ )PhimoeDecoderLayeriM  r   	layer_idxc                    > [         TU ]  X5        [        R                  " UR                  UR
                  SS9U l        [        R                  " UR                  UR
                  SS9U l        g NT)epselementwise_affine)r   r   r   	LayerNormr   rms_norm_epsinput_layernormpost_attention_layernorm)r)   r   r   r;   s      r+   r   PhimoeDecoderLayer.__init__N  sX    +  "||F,>,>FDWDWlpq(*F$7$7D)
%r-   )r   r   )	r<   rW   rX   rY   r   intr   rZ   r   r   s   @r+   r   r   M  s    
| 
 
 
r-   r   c                   *    \ rS rSr\" \SS9\\S.rSr	g)PhimoePreTrainedModeliX  r   )ro   )r   r   
attentionsr[   N)
r<   rW   rX   rY   r	   r   r   r]   _can_record_outputsrZ   r[   r-   r+   r   r   X  s    '(8B+%r-   r   c                   0   ^  \ rS rSrS\4U 4S jjrSrU =r$ )PhimoeModeli`  r   c                    > [         TU ]  U5        [        R                  " UR                  UR
                  SS9U l        g r   )r   r   r   r   r   r   normr   s     r+   r   PhimoeModel.__init__a  s1     LL!3!39L9Laef	r-   )r   )r<   rW   rX   rY   r   r   rZ   r   r   s   @r+   r   r   `  s    g| g gr-   r   c                   D   ^  \ rS rSrU 4S jr      SU 4S jjrSrU =r$ )PhimoeForCausalLMif  c                    > [         TU ]  U5        [        R                  " UR                  UR
                  U R                  R                  S9U l        g )Nr   )	r   r   r   Linearr   
vocab_sizer   lm_head_biaslm_headr   s     r+   r   PhimoeForCausalLM.__init__g  s:     yy!3!3V5F5FT[[MeMefr-   c                 &  > U(       ap  [        U R                  S5      (       aU  UR                  S   U R                  R                  S-   :  a+  UR	                  5       n	XR                  R                  ::  a  S n[
        TU ]  " SUUUUUUUS.UD6n
U
$ )Nr0   r   )	input_idspast_key_valuesattention_maskinputs_embedsrL   	use_cachelogits_to_keepr[   )hasattrr   rA   r0   get_seq_lengthr   prepare_inputs_for_generation)r)   r   r   r   r   rL   r   r   kwargspast_lengthmodel_inputsr;   s              r+   r   /PhimoeForCausalLM.prepare_inputs_for_generationl  s    " %GHH"dkk&R&RUV&VV)88:KkkJJJ"&w< 	
+)'%)	
 	
 r-   )r   )NNNNTN)r<   rW   rX   rY   r   r   rZ   r   r   s   @r+   r   r   f  s'    g # #r-   r   c                       \ rS rSrSrg)PhimoeForSequenceClassificationi  r[   Nr_   r[   r-   r+   r   r     s    `cr-   r   )r   r   r   r   )r
   ))r   collections.abcr   r=   r   modeling_layersr   modeling_rope_utilsr   utils.genericr   utils.output_capturingr	   llama.modeling_llamar   mixtral.modeling_mixtralr   r   r   r   r   r   configuration_phimoer   r   r]   autogradFunctionrb   r   r   r   r   r   r   r   r   r   r   r   __all__r[   r-   r+   <module>r     s     $   7 + 4 1  /*02 *0Z	n 	;
u~~.. ;
|xv	N 	@ryy @&!T299 !TH
, 
2 g, g)* )X d&FH] cr-   