
    Z j-                    `   S r SSKJr  SSKrSSKJr  SSKJr  SSKrSSK	J
r
  SS	KJr  \
R                  " \5      rS
SSS.r\" SS9 " S S5      5       r\R$                  SS j5       r\R(                  R*                                            SS j5       r          SS jrg)zSonicMoE integration: fused MoE using CuteDSL kernels from `kernels-community/sonic-moe`.

Provides `sonicmoe_experts_forward` registered as "sonicmoe" in the ExpertsInterface.
Requirements: CUDA, `kernels`, `nvidia-cutlass-dsl`, has_gate=True.
    )annotationsN)Callable)	dataclass   )logging   )lazy_load_kernelswiglugeglureglu)silugelureluT)frozenc                  .    \ rS rSr% SrS\S'   S\S'   Srg)	SonicMoE'   zAEntry points exposed by the `kernels-community/sonic-moe` kernel.typeactivation_type_enumr   moe_general_routing_inputs N)__name__
__module____qualname____firstlineno____doc____annotations____static_attributes__r       s/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/integrations/sonicmoe.pyr   r   '   s    K ((r   r   c                    [         R                  R                  5       (       d  [        S5      e[         R                  R	                  5       S   n U S:  a  [        SU  S35      e[        S5      nUc  [        S5      e[        [        US	S5      S
S5      n[        USS5      nSU4SU44 VVs/ s H  u  pEUb  M
  UPM     nnnU(       a  [        SSR                  U5       S35      e[        UUS9$ s  snnf )z
Load sonic-moe once and return its entry points.

Raises `ImportError` if CUDA/hardware requirements are not met, or if the kernel or
required symbols are not found.
zdsonic-moe kernel requires CUDA, but CUDA is not available. Use a different `experts_implementation`.r   	   z`sonic-moe requires a Hopper (SM90+) or newer GPU, but the current device has compute capability z-.x. Use a different `experts_implementation`.z	sonic-moeNu}   Failed to load the sonic-moe kernel — check that `kernels-community/sonic-moe` has a build matching the current torch/CUDA.enumsActivationTyper   zenums.ActivationTypez.sonic-moe kernel is missing required symbols: z, zN. Make sure you have the `kernels` package and `nvidia-cutlass-dsl` installed.)r   r   )	torchcudais_availableImportErrorget_device_capabilityr	   getattrjoinr   )majorkernelr   r   nameattrmissings          r    _load_sonicmoe_kernelr1   /   sA    ::""$$r
 	

 JJ,,.q1Eqy&&+W,Y[
 	

 k*F~;
 	

 #767D#ACSUYZ!(1Mt!T
 $%9:)+EF

JD  	
   <TYYw=O<P Q[ [
 	

 1#= s   .	C3;C3c                    [        5       nUR                  n[        U[        R	                  US5      R                  5       UR                  5      nUR                  U UUUUUUUU	UUU
SS9u  nnU$ )uW  Module-level shim around `moe_general_routing_inputs` so `allow_in_graph` can wrap it.

sonicmoe asserts `not torch.compiler.is_compiling()` internally because it dispatches
CuteDSL kernels, which Dynamo can't trace. `allow_in_graph` keeps the call in the FX
graph as a single opaque node (no tracing into the body, no graph break) while still
running the real Python at runtime — autograd through `_UpProjection` / `_DownProjection`
flows normally. The decorator must be applied at module load time, not inside the compiled
function — hence this shim plus the `allow_in_graph` decorator above.
r
   N)Eactivation_typeis_inference_mode_enabledconcat_layout	stream_id)r1   r   r*   ACT_MAPgetupperSWIGLUr   )hidden_statesrouter_scores
expert_ids	token_idxw1b1w2b2act_namenum_expertsr6   r5   sonicmoer   r4   output_s                    r    _sonicmoe_wrapperrI   c   s    0 %&H#88gkk(H=CCEG[GbGbO 33




'";# 4 IFA Mr   c                   U R                   (       d  [        S5      eUR                  R                  S:w  a  [        S5      eUR                  nUR	                  S5      nUR	                  S5      n[
        R                  " XdS9R                  S5      R                  SU5      R                  S5      R                  5       nUR                  S5      R                  UR                  5      nUR                  S5      R                  5       n	U R                  n
U R                  nU R                  (       a  U R                   OS nU R                  (       a  U R"                  OS n[%        U
[
        R&                  R(                  R*                  5      (       aJ  U
R-                  5       n
UR-                  5       nUb  UR-                  5       OS nUb  UR-                  5       OS n[/        U R0                  SS	5      R3                  5       nU R4                  (       a  S
OSnU
R6                  " U6 n
UR6                  " U6 n[9        UUU	UU
UUUUU R:                  U R<                  [
        R>                  " 5       (       + S9$ )Nz/sonicmoe requires gated experts (has_gate=True)r&   zsonicmoe requires CUDA devicer   )devicer   
hidden_actr   )r   r   r   )r   r   r   )r<   r=   r>   r?   r@   rA   rB   rC   rD   rE   r6   r5   ) has_gate
ValueErrorrL   r   sizer%   arange	unsqueezeexpandreshapeinttodtypegate_up_proj	down_projhas_biasgate_up_proj_biasdown_proj_bias
isinstancedistributedtensorDTensorto_localr*   configloweris_transposedpermuterI   rE   is_concatenatedis_grad_enabled)selfr<   top_k_indextop_k_weightsrL   	num_top_k
num_tokensr?   r=   r>   r@   rB   rA   rC   rD   perms                   r    sonicmoe_experts_forwardrn      s     ==JKK  F*899!!F  $I##A&J Z7AA!DKKBPYZbbcefjjlI!))"-001D1DEM$$R(,,.J 
		B	B#'==		dB $		4B"e''..6677[[][[] nR[[]$ nR[[]$ t{{L&9??AH **9	D	T	B	T	B##$$**&+&;&;&="= r   )returnr   )r<   torch.Tensorr=   rp   r>   rp   r?   rp   r@   rp   rA   torch.Tensor | NonerB   rp   rC   rq   rD   strrE   rU   r6   boolr5   rs   ro   rp   )
rh   ztorch.nn.Moduler<   rp   ri   rp   rj   rp   ro   rp   )r   
__future__r   	functoolscollections.abcr   dataclassesr   r%   utilsr   hub_kernelsr	   
get_loggerr   loggerr8   r   cacher1   _dynamoallow_in_graphrI   rn   r   r   r    <module>r      sH   #  $ !   ) 
		H	% Wg
> $) ) ) 0 0f +++ + 	+
 	+ 	+ 	+ 	+ + + +  $+ + +\=
== =  	=
 =r   