
    Z j>i                       S SK Jr  S SKJr  S SKJr  SSKJr  SSKJ	r	  SSK
JrJrJrJr  SS	KJr  \" 5       (       a:  S S
Kr\R$                  R'                  \5      r\R$                  R'                  \5      r\R(                  " \5      r  S          S!S jjr          S"S jrS#S jrS#S jrS rS r\" 5       (       aR  \R:                  R=                  S\SS9  \R:                  R?                  S\5        \R:                  RA                  S\\S9  S$S jr!        S#S jr"  S            S%S jjr#          S"S jr$ " S S\	5      r%\%" 5       r&S&S jr' S'\&SSSSS.             S(S jjjr(g
))    )annotations)Callable)wraps   )logging)GeneralInterface)is_torch_availableis_torch_greater_or_equalis_torch_less_or_equalis_torchdynamo_compiling   )sonicmoe_experts_forwardNFc                   U(       a6  [         R                  " U R                  S5      U5      R                  S5      nO4[         R                  " XR                  S5      5      R                  S5      nUb  UR	                  U5        U$ )a  Batched linear layer supporting optional bias and transposed weights.

Args:
    input (`torch.Tensor`):
        Input tensor of shape (batch_size, input_dim).
    weight (`torch.Tensor`):
        Weight tensor of shape (batch_size, output_dim, input_dim) if transposed is `False`,
        else of shape (batch_size, input_dim, output_dim).
    bias (`torch.Tensor`, *optional*):
        Bias tensor of shape (batch_size, output_dim). Default is `None`.
    is_transposed (`bool`, *optional*, defaults to `False`):
        Whether the weight tensor is transposed.
Returns:
    `torch.Tensor`: Output tensor of shape (batch_size, output_dim).
r   )torchbmm	unsqueezesqueezeadd_)inputweightbiasis_transposedouts        n/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/integrations/moe.py_batched_linearr   S   se    * ii*F3;;A> ii 34<<R@J    c                   UR                  S5      nUR                  S5      nUR                  S5      nUR                  USS9nUR                  S5      nUR                  S5      n	U	R                  SU R                  S-
  5        U R
                  (       a2  U R                  U	   n
U R                  (       a  U R                  U	   OS nO1U R                  U	   n
U R                  (       a  U R                  U	   OS n[        XzXR                  S9nU R
                  (       a  U R                  U5      nOU R                  U5      nU R                  U	   n
U R                  (       a  U R                   U	   OS n[        XXR                  S9nXR#                  S5      -  nUR%                  XTU5      R'                  SS9nUR)                  UR*                  5      $ )Nr   r   dimr   r   r   )sizerepeat_interleavereshapeclamp_num_expertshas_gategate_up_projhas_biasgate_up_proj_biasup_projup_proj_biasr   r   _apply_gateact_fn	down_projdown_proj_biasr   viewsumtodtype)selfhidden_statestop_k_indextop_k_weights	num_top_k
num_tokens
hidden_dimselected_hidden_statessample_weights
expert_idsselected_weightsselected_biasesproj_outweighted_outfinal_hidden_statess                  r   batched_mm_experts_forwardrD   u   s      $I##A&J##B'J +<<YA<N"**2.N$$R(J
 a))A-. }},,Z8@D$00<SW<<
3;?==$++J7d VhVhH
 }}##H- ;;x( ~~j19=d))*5DO HZHZH
 66r::L '++J:NRRWXRY!!-"5"566r   c                *   [         R                  " U R                  S5      UR                  S5      U R                  U R                  S9nSn[        UR                  5       5       H*  u  pVXF:X  a  M  [         R                  " XU X   X4U S9  UnM,     U$ )a  
Fallback grouped matrix multiplication used when `torch.nn.functional.grouped_mm` and `torch._grouped_mm`
are unavailable or incompatible with `torch.compile` (e.g. non-bfloat16 weights).

Args:
    input (`torch.Tensor`): Input of shape (S, input_dim), sorted by expert id.
    weight (`torch.Tensor`): Expert weights of shape (num_experts, input_dim, output_dim).
    offs (`torch.Tensor`): Cumulative token counts per expert of shape (num_experts,).
Returns:
    `torch.Tensor`: Output of shape (S, output_dim).
r   r   devicer4   r   )r   zerosr"   rG   r4   	enumeratetolistmm)r   r   offsoutputstartiends          r   _grouped_mm_fallbackrR      s     [[AAu||SXS^S^_FE DKKM*<S!69&s2CD	 + Mr   c                z   U R                  5       S:X  d   S[        U R                  5       35       eUR                  5       S:X  d   S[        UR                  5       35       eUR                  5       S:X  d   S[        UR                  5       35       eUR                  S5      UR                  S5      :X  d+   SUR                  S5       S	UR                  S5       35       eU R                  S5      UR                  S5      :X  d+   S
U R                  S5       SUR                  S5       35       eUR                  [
        R                  [
        R                  4;   d   SUR                   35       e[
        R                  " U R                  S5      UR                  S5      U R                  U R                  S9$ )zRShape/dtype inference stub for `_grouped_mm_fallback` required by `torch.compile`.r   z+input must be 2D (S, input_dim), got shape    zBweight must be 3D (num_experts, input_dim, output_dim), got shape r   z*offs must be 1D (num_experts,), got shape r   zoffs length z must match number of experts zinput_dim mismatch: input has z, weight has z$offs must be an integer tensor, got rF   )
r    tupleshaper"   r4   r   int32int64emptyrG   r   r   rM   s      r   _grouped_mm_fallback_faker[      s   99;!_J5QVQ\Q\K]J^__::<1 
LUSYS_S_M`Lab 88:?\HtzzIZH[\\?99Q<6;;q>)v\$))A,Geflfqfqrsfteu+vv)::a=FKKN* 
(A}V[[QR^DTU* ::%++u{{33h7[\`\f\f[g5hh3;;uzz!}fkk!nU\\QVQ\Q\]]r   c                H    U R                  US   US   5        US   U l        g)zjSaves input and weight for backward; offs is stored directly as it is a non-differentiable integer tensor.r   r   r   N)save_for_backwardrM   )ctxinputsrN   s      r   "_grouped_mm_fallback_setup_contextr`      s%    &)VAY/ayCHr   c                   U R                   u  p#[        R                  " U5      n[        R                  " U5      nSn[        U R                  R                  5       5       HZ  u  pxXh:X  a  M  [        R                  " XU X7   R                  XFU S9  [        R                  " X&U R                  XU XW   S9  UnM\     XES4$ )zuBackward pass for `_grouped_mm_fallback`. Computes grad_input and grad_weight per expert group; offs has no gradient.r   rH   N)saved_tensorsr   
zeros_likerJ   rM   rK   rL   T)	r^   grad_outputr   r   
grad_inputgrad_weightrO   rP   rQ   s	            r   _grouped_mm_fallback_backwardrh      s    %%ME!!%(J""6*KE CHHOO-.<3'*3:OPS!##[s%;P / D((r   z!transformers::grouped_mm_fallback )mutates_args)setup_contextc                Z   [        5       (       a  UR                  [        R                  :w  dW  UR                  R
                  S:X  a>  [        SSS9(       a/  UR                  5       S-  S:w  d  U R                  5       S-  S:w  a  gUR                  R
                  S:X  a  [        [        R                  R                  S	5      (       a,  [        R                  R                  UR                  5      S
:  $ [        [        S5      (       ag  [        SSS9(       a,  [        R                  R                  UR                  5      S
:  $ [        R                  R                  UR                  5      S:  $ g[        [        R                  R                  S	5      =(       d    [        [        S5      $ )a  
Check if torch.nn.functional.grouped_mm or torch._grouped_mm can be used based on availability and compatibility with torch.compile.

Args:
    input (`torch.Tensor`):
        Input tensor of shape (S, input_dim).
    weight (`torch.Tensor`):
        Weight tensor of shape (num_experts, input_dim, output_dim).
    offs (`torch.Tensor`):
        Offsets tensor indicating the boundaries of each group in the input tensor.
Returns:
    `bool`: True if grouped_mm can be used, False otherwise.
cpuz2.10.0T)
accept_dev   r   Fcuda
grouped_mm)   r   _grouped_mmz2.9)	   r   )r   r4   r   bfloat16rG   typer   data_ptrhasattrnn
functionalrp   get_device_capabilityr
   rZ   s      r   _can_use_grouped_mmr|     s&    	!""v||u~~'Ee#"8=__#q(ENN,<r,AQ,F 
 }}V#588&&55::33FMMBfLL5-(((4@zz77F&PPzz77F&PP588&&5V9VVr   c                   [        XU5      (       a  [        [        R                  R                  S5      (       aA  [        R                  R                  R                  U R                  UR                  5      XS9$ [        [        S5      (       a.  [        R                  " U R                  UR                  5      XS9$ [        R                  R                  R                  XUS9$ )a  Grouped matrix multiplication dispatcher that uses torch.nn.functional.grouped_mm if available, else falls back to torch._grouped_mm.

Args:
    input (`torch.Tensor`):
        Input tensor of shape (S, input_dim).
    weight (`torch.Tensor`):
        Weight tensor of shape (num_experts, input_dim, output_dim).
    offs (`torch.Tensor`):
        Offsets tensor indicating the boundaries of each group in the input tensor.
Returns:
    `torch.Tensor`: Output tensor of shape (S, output_dim).
rq   rM   rs   )r|   rx   r   ry   rz   rq   r3   r4   rs   opstransformersgrouped_mm_fallbackrZ   s      r   rs   rs   0  s    $ 5$//
 588&&5588&&11%((6<<2H&1\\UM**$$UXXfll%;VOO99!!55e$5OOr   c                    U(       a  [        XUS9nO[        XR                  SS5      US9nUb  UR                  U5        U$ )a  Grouped linear layer supporting optional bias and transposed weights.

Args:
    input (`torch.Tensor`):
        Input tensor of shape (S, input_dim).
    weight (`torch.Tensor`):
        Weight tensor of shape (num_experts, input_dim, output_dim) if `is_transposed`,
        else of shape (num_experts, output_dim, input_dim).
    offs (`torch.Tensor`):
        Offsets tensor indicating the boundaries of each group in the input tensor.
    bias (`torch.Tensor`, *optional*):
        Bias tensor of shape (num_experts, output_dim). Default is `None`.
    is_transposed (`bool`, *optional*, defaults to `False`):
        Whether the weight tensor is transposed.
Returns:
    `torch.Tensor`: Output tensor of shape (S, output_dim).
r~   r   )rs   	transposer   )r   r   rM   r   r   r   s         r   _grouped_linearr   O  sD    0 %d3 %!1!1"b!9EJr   c                   UR                   nUR                  S5      nUR                  S5      nUR                  S5      nUR                  S5      nUR                  S5      n	[        R                  " U	5      u  pXU-     nX   nUR
                  S;   a  U
R                  5       OU
R                  5       n[        R                  " XR                  SU R                  S-
  S9n[        R                  " US[        R                  S9nXR                  :  R                  S5      nU
R                  U R                  S-
  S9  U R                  (       a/  U R                  nU R                   (       a  U R"                  U
   OS nO.U R$                  nU R                   (       a  U R&                  U
   OS nUR)                  US5        [+        UUUUU R,                  S	9nU R                  (       a  U R/                  U5      nOU R1                  U5      nU R2                  nU R                   (       a  U R4                  U
   OS n[+        UUUUU R,                  S	9nUUR                  S5      -  nUR)                  US5        [        R6                  " U5      n[        R8                  " UR                  S5      US
9UU'   UU   nUR;                  XeU5      R=                  SS9nUR?                  UR@                  5      $ )Nr   r   )rm   mpsr   )binsminmax)r    r4   )r   g        r!   )rG   r   )!rG   r"   r$   r   sortrv   floatinthistcr&   cumsumrW   r   r%   r'   r(   r)   r*   r+   r,   masked_fill_r   r   r-   r.   r/   r0   
empty_likearanger1   r2   r3   r4   )r5   r6   r7   r8   rG   r9   r:   r;   r=   r>   expert_ids_gpermselected_hidden_states_gsample_weights_ghistc_inputtokens_per_expertoffsetssentinel_maskr?   r@   rA   rB   inv_permrC   s                           r   grouped_mm_experts_forwardr   u  s    !!F  $I##A&J##B'J #**2.N$$R(J J/L,Y->?%+ +1++*G,$$&\M]M]M_KK6F6FASWScScfgSghll,!5;;GG" "%5%55@@DMD,,q01 }},,BF--$00>UY<<=A]]$++L9PT ))-=  "2G/aeasasH
 }}##H- ;;x( ~~;?==d)),7dO "G/QUQcQcH
 .88<<L mS1 %H\\$))A,v>HTN)L '++J:NRRWXRY!!-"5"566r   c                  <   ^  \ rS rSrSr\\\S.rSU 4S jjr	Sr
U =r$ )ExpertsInterfacei  z;Interface for registering custom experts forward functions.)
batched_mmrq   sonicmoec                   > Uc  [         R                  S5        OUS:w  a  X;  a  [        SU S35      e[        TU ]  X5      $ )zfReturn the requested `experts_implementation`. Also strictly check its validity, and raise if invalid.a
  You tried to access the `ExpertsInterface` with a `config._experts_implementation` set to `None`. This is expected if you use an Expert Module as a standalone Module. If this is not the case, something went wrong with the dispatch of `config._experts_implementation`eager`zL` is not a valid experts implementation registered in the `ExpertsInterface`)loggerwarning_onceKeyErrorsuperget)r5   experts_implementationdefault	__class__s      r   get_interfaceExpertsInterface.get_interface  s[    !)N
 $w.3I3U*++wx  w{1;;r   ri   )r   strr   r   returnr   )__name__
__module____qualname____firstlineno____doc__rD   r   r   _global_mappingr   __static_attributes____classcell__)r   s   @r   r   r     s"    E 10,O< <r   r   c                N    UR                  SSS9u  p#U R                  U5      U-  $ )a{  
Default gating mechanism: splits the gate_up_out into gate and up parts,
applies the activation function to the gate part, and multiplies it with the up part.
Args:
    gate_up_out (`torch.Tensor`):
        The output tensor from the gate and up projection of shape (S, 2 * intermediate_dim).
Returns:
    `torch.Tensor`: The gated output tensor of shape (S, intermediate_dim).
r   r   r   )chunkr.   )r5   gate_up_outgateups       r   _default_apply_gater     s/        +HD;;tr!!r   T)experts_interfaceis_concatenatedr   r)   r'   c               >   ^^^^^ SUUUUU4S jjnU b  U" U 5      $ U$ )a\  Decorator to modify experts class to support different experts implementations.

Args:
    experts_class (`type[torch.nn.Module]`, *optional*):
        The experts class to modify. If not provided, returns a decorator that can be applied to the class.
    experts_interface (`ExpertsInterface`, *optional*, defaults to `ALL_EXPERTS_FUNCTIONS`):
        The experts interface to use for dispatching the forward method.
    is_concatenated (`bool`, *optional*, defaults to `True`):
        Whether the expert weights are stored in concatenated layout [gate;up]
        or interleaved layout [gate0, up0, gate1, up1, ...].
    is_transposed (`bool`, *optional*, defaults to `False`):
        Whether the expert weights are stored in transposed format.
    has_bias (`bool`, *optional*, defaults to `False`):
        Whether the expert layers include bias terms or not.
    has_gate (`bool`, *optional*, defaults to `True`):
        Whether the experts use a gating mechanism or not.
        Whether it has gate_up_proj weights or just up_proj weights.

Returns:
    `type[torch.nn.Module]`: The modified experts class.
c                   >^^ U R                   mU R                  m[        T5      UUUU	U4S j5       n[        T5      UU4S j5       n[        U S5      (       d  [        U l        Xl         X l        U $ )Nc                b   > T" X/UQ70 UD6  Xl         TU l        TU l        TU l        TU l        g N)configr'   r)   r   r   )	r5   r   argskwargsr)   r'   r   r   original_inits	       r   __init__=use_experts_implementation.<locals>.wrapper.<locals>.__init__)  s8    $888 K$DM$DM!.D#2D r   c                h   > TR                  U R                  R                  T5      nU" U /UQ70 UD6$ r   )r   r   _experts_implementation)r5   r   r   experts_forwardr   original_forwards       r   forward<use_experts_implementation.<locals>.wrapper.<locals>.forward2  s5    /==dkk>a>acstO"49$9&99r   r-   )r   r   r   rx   r   r-   )
experts_classr   r   r   r   r   r)   r'   r   r   s
      @@r   wrapper+use_experts_implementation.<locals>.wrapper%  sy    %..(00	}		3 	3 
	3 
	 	: 
!	: }m44(;M%!) 'r   )r   type[torch.nn.Module]r   r   ri   )r   r   r   r   r)   r'   r   s    ````` r   use_experts_implementationr     s%    > 2  }%%Nr   )NF)
r   torch.Tensorr   r   r   torch.Tensor | Noner   boolr   r   )
r5   ztorch.nn.Moduler6   r   r7   r   r8   r   r   r   )r   r   r   r   rM   r   r   r   )r   r   r   r   rM   r   r   r   )r   r   r   r   rM   r   r   r   r   r   r   r   )r   r   r   r   r   )r   ztype[torch.nn.Module] | Noner   r   r   r   r   r   r)   r   r'   r   r   r   ))
__future__r   collections.abcr   	functoolsr   utilsr   utils.genericr   utils.import_utilsr	   r
   r   r   r   r   r   _dynamoassume_constant_result
get_loggerr   r   r   rD   rR   r[   r`   rh   library	custom_opregister_fakeregister_autogradr|   rs   r   r   r   ALL_EXPERTS_FUNCTIONSr   r   ri   r   r   <module>r      s   # $   ,  / 
 !& D DE^ _"]]AABXY 
		H	%\ !%	  	
 D<7
<7<7 <7  	<7
 <7D4^)& 	MM?AUdfg	MM CE^_	MM##+%8 $ *WZPPP P 	PF !%### # 	#
 # #Le7
e7e7 e7  	e7
 e7P<' <0 )* " 37; +@ ;/; (; 	;
 ; ; ; ;r   