
    Z j                       S SK Jr  S SKrS SKJr  S SKJr  S SKrS SKJ	r	  S SKJ
r  SSKJr  SSKJr  SS	KJr  SS
KJr  SSKJrJrJr  SSKJr  SSKJrJr  \R:                  " \5      r\R@                  r!\RD                  " \!5      RF                  r$\RD                  " \!5      RJ                  r&Sr'S r(\" SS9 " S S5      5       r)\RT                  S,S j5       r+\" SS9 " S S5      5       r,\RT                  S-S j5       r-S.S jr.\R^                  4             S/S jjr0 " S S\	Rb                  5      r2          S0S jr3          S0S jr4          S1S jr5          S2S  jr6      S3S! jr7          S0S" jr8 " S# S$\	Rr                  5      r: " S% S&\5      r;\;" 5       r< S4 S5S' jjr= " S( S)\5      r> " S* S+\5      r?g)6    )annotationsN)Callable)	dataclass)
functional   )ACT2FN)ConversionOps)should_convert_module)logging)get_cuda_runtime_versionis_kernels_availableresolve_internal_import   )lazy_load_kernel)ExpertsInterfaceuse_experts_implementation   c                    U H   n[        X5      (       d  M  [        X5      s  $    [        [        U 5      R                   SU 35      e)Nz has none of: )hasattrgetattrAttributeErrortype__name__)objnamesnames      z/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/integrations/finegrained_fp8.py_first_attrr   /   sE    33%%  DI../~eWE
FF    T)frozenc                  B    \ rS rSr% SrS\S'   S\S'   S\S'   S\S'   Srg	)
FineGrainedFP86   zNEntry points exposed by the `kernels-community/finegrained-fp8` Triton kernel.r   
fp8_matmulfp8_act_quantbatched_fp8_matmulgrouped_fp8_matmul Nr   
__module____qualname____firstlineno____doc____annotations____static_attributes__r(   r   r   r"   r"   6   s    X    r   r"   c                    [        5       (       d  [        S5      e[        S5      n U c  [        S5      e[        U SS5      n[        U SS5      n[        U SS5      n[        U SS5      nSU4SU4SU4SU44 VVs/ s H  u  pVUb  M
  UPM     nnnU(       a  [        S	S
R	                  U5       S35      e[        UUUUS9$ s  snnf )z
Load the finegrained-fp8 Triton kernel once and return its entry points.

Raises `ImportError` if the `kernels` package is missing, or the kernel or required
symbols cannot be found.
z`finegrained-fp8 kernel requires the `kernels` package. Install it with `pip install -U kernels`.zfinegrained-fp8Nu   Failed to load the finegrained-fp8 kernel — check that `kernels-community/finegrained-fp8` has a build matching the current torch/CUDA.w8a8_fp8_matmulr%   w8a8_fp8_matmul_batchedw8a8_fp8_matmul_groupedz4finegrained-fp8 kernel is missing required symbols: , A. Please update the `kernels` package (`pip install -U kernels`).)r$   r%   r&   r'   )r   ImportErrorr   r   joinr"   )kernelr$   r%   r&   r'   r   attrmissings           r   _load_finegrained_fp8_kernelr;   @   s!     !!n
 	
 /0F~;
 	

 !2D9JFOT:M )BDI )BDI
 
+m,&(:;&(:;	
	
JD  	
  	 B499WCUBV WN N
 	

 #--	 !	s   9	C C c                  8    \ rS rSr% SrS\S'   S\S'   S\S'   Srg)	DeepGEMMq   zAEntry points exposed by the `kernels-community/deep-gemm` kernel.r   r$   r'   per_token_cast_to_fp8r(   Nr)   r(   r   r   r=   r=   q   s    K  ##r   r=   c                    [        5       (       d  [        S5      e[        R                  R	                  5       (       d  [        S5      e[        R                  R                  5       S   n U S:  a  [        SU  S35      e[        5       u  pUS:  d  US:X  a  US:  a  [        S	U S
U S35      e[        S5      nUc  [        S5      e[        USS5      n[        USS5      n[        USS9nSU4SU4SU44 VVs/ s H  u  pxUb  M
  UPM     n	nnU	(       a  [        SSR                  U	5       S35      e[        UUUS9$ s  snnf )z
Load DeepGEMM once and return its entry points.

Raises `ImportError` if CUDA/hardware requirements are not met, or the kernel or
required symbols are not found.
zYDeepGEMM kernel requires the `kernels` package. Install it with `pip install -U kernels`.zcDeepGEMM kernel requires CUDA, but CUDA is not available. Use a different `experts_implementation`.r   	   z_DeepGEMM requires a Hopper (SM90+) or newer GPU, but the current device has compute capability z-.x. Use a different `experts_implementation`.      z0DeepGEMM requires CUDA runtime 12.3+, but found .zO. Please upgrade your CUDA toolkit or use a different `experts_implementation`.z	deep-gemmNu|   Failed to load the DeepGEMM kernel — check that `kernels-community/deep-gemm` has a build matching the current torch/CUDA.fp8_gemm_nt m_grouped_fp8_gemm_nt_contiguouszutils.per_token_cast_to_fp8)chained_pathz-DeepGEMM kernel is missing required symbols: r4   r5   )r$   r'   r?   )r   r6   torchcudais_availableget_device_capabilityr   r   r   r   r7   r=   )
major
cuda_major
cuda_minorr8   r$   r'   r?   r   r9   r:   s
             r   _load_deepgemm_kernelrO   z   s     !!uvv::""$$q
 	

 JJ,,.q1Eqy&&+W,Y[
 	
 67JB:+
Q>zl!J< X\ \
 	

 k*F~;
 	

 5J )KTR3FIfg
 J'/1CD*,AB

JD
  	
   ;DIIg<N;O PN N
 	

 -3 s   :	E E c                    X-   S-
  U-  $ )zCeiling division.r   r(   )abs     r   _cdivrS      s    EAI!r   c                   Ub  US   US   s=:X  a  S:X  a  O  O [        5       nU R                  SU R                  S   5      nUR                  SUR                  S   5      n[        R                  " UR                  S   UR                  S   U R
                  US9n	UR                  XxR                  5       4XR                  5       4U	5        U	R                  U R                  SS UR                  S   4-   5      $ [        5       n
U
R                  XX#XE5      $ ! [         a    [        R                  S5         N>f = f)uL  FP8 matmul: C = dequant(A, As) @ dequant(B, Bs)^T.

Supports both per-tensor and block-wise quantization:
  - block_size=None or block_size=[N, K]: per-tensor mode (As is scalar/per-row, Bs is scalar)
  - block_size=[block_n, block_k]: block-wise mode (As and Bs are per-block scale grids)

Dispatch order:
  1. DeepGEMM (Hopper+, block_size 128x128) if available
  2. Triton finegrained-fp8 kernel (universal fallback)

Args:
    A:  (M, K) float8_e4m3fn — quantized activations
    B:  (N, K) float8_e4m3fn — quantized weights
    As: block-wise: (M, K//block_k) float32; per-tensor: (M,) per-row scales
    Bs: block-wise: (N//block_n, K//block_k) float32; per-tensor: scalar or (1,) single weight scale
    block_size: [block_n, block_k] for block-wise quantization, or None/[N, K] for per-tensor
    output_dtype: desired output dtype
Nr   r   r   devicedtypea  DeepGEMM kernel is not available or compatible, falling back to Triton finegrained-fp8 kernel. To use DeepGEMM FP8 matmul, ensure you have a Hopper (SM90+) or newer GPU with CUDA runtime 12.3+, and that the `kernels` package is installed and up to date (`pip install -U kernels`).)rO   viewshaperH   emptyrW   r$   floatr6   loggerwarning_oncer;   )ABAsBs
block_sizeoutput_dtypedeepgemmA_2dAs_2doutputfinegrained_fp8s              r   r1   r1      s   4 *Q-:a="GC"G	=,.H 66"aggbk*DGGB-E[[A
188S_`F{{} 588:O;;qwws|qwwqzm;<<24O%%aBJMM  	i	s   
D D=<D=c                  T   ^  \ rS rSrSSS\4         SU 4S jjjrS	S jrSrU =r$ )
	FP8Linear   NdynamicFc                  > [         T	U ]  X5        XPl        X0l        X@l        [
        R                  R                  [
        R                  " X!US95      U l	        U R                  c=  [        R                  " [
        R                  " S[
        R                  S95      U l        OX R                  S   -   S-
  U R                  S   -  nXR                  S   -   S-
  U R                  S   -  n[        R                  " [
        R                  " Xx[
        R                  S95      U l        U R                  S:X  a=  [        R                  " [
        R                  " S[
        R                  S95      U l        OU R                  SS 5        U R                  (       a:  [        R                  " [
        R                  " U R                  5      5      U l        g U R                  SS 5        g )NrX         ?r   r   staticactivation_scalebias)super__init__has_biasrc   activation_schemerH   nn	Parameterr[   weighttensorfloat32weight_scale_invrr   register_parameterout_featuresrs   )
selfin_featuresr   rc   rw   rv   rX   scale_out_featuresscale_in_features	__class__s
            r   ru   FP8Linear.__init__   sU    	3 $!2hh((\V[)\]??"$&LLc1W$XD!".1C"Ca"GDOO\]L^!^!,q/A!AA!E$//Z[J\ \$&LL.W%D! !!X-$&LLc1W$XD!##$6===U[[1B1B%CDDI##FD1r   c           	        U R                   R                  5       S:  a+  [        R                  " XR                   U R                  5      $ U R                   nU R
                  n[        U[        R                  R                  R                  5      (       a   UR                  5       nUR                  5       nU R                  S:X  aG  [        5       nUR                  XR                  b  U R                  S   OUR                   S   5      u  pVOU R                  S:X  aW  U R"                  R%                  [        R&                  5      nX-  R)                  [*        [,        S9R%                  [.        5      nO[1        SU R                   35      e[3        UUUUU R                  UR4                  S9nU R                  b  UR7                  U R                  5        UR%                  UR4                  S9$ )	Nr   rm   rU   rq   minmaxzUnsupported activation scheme: rd   ro   )rz   element_sizeFlinearrs   r}   
isinstancerH   distributedr{   DTensorto_localrw   r;   r%   rc   rZ   rr   tor|   clamp_FP8_MIN_FP8_MAX
_FP8_DTYPENotImplementedErrorr1   rX   add_)r   inputrz   	scale_invri   qinputscalerh   s           r   forwardFP8Linear.forward  s   ;;##%)88E;;		::))	fe//66>>??__&F!**,I!!Y.:<O+99__-Htq)ekkZ\oMFE ##x/)),,U]];Em**xX*FII*UF%(GH^H^G_&`aa OO
 99 KK		"yyu{{y++r   )rr   rw   rs   rc   rv   rz   r}   )
r   intr   r   rc   tuple[int, int] | Nonerw   strrv   bool)r   torch.Tensorreturnr   )	r   r*   r+   r,   r   ru   r   r/   __classcell__r   s   @r   rk   rk      sV    
 .2!*"2"2 "2 +	"2
 "2 "2 "2H!, !,r   rk   c                   U R                   S:X  a  [        S5      e[        5       nUR                  S5      nUR                  S5      nUR                  S5      nUR	                  USS9nUR                  S5      n	UR                  S5      n
U
R                  SU R                  S-
  5        UR                  UU R                  (       a  U R                  OU R                  U R                  (       a  U R                  OU R                  U R                  U
S9nU R                  (       a  U R                  U5      nOU R!                  U5      nUR                  UU R"                  U R$                  U R                  U
S9nXR'                  UR(                  5      R+                  S5      -  nUR-                  XeU5      R/                  SS9nUR'                  UR(                  5      $ )Nrq   zbatched_mm experts dispatch does not support activation_scheme='static'. Use the default eager dispatch or switch to activation_scheme='dynamic'.rU   r   dimr   )rc   
expert_ids)rw   r   r;   sizerepeat_interleavereshapeclamp_num_expertsr&   has_gategate_up_projup_projgate_up_proj_scale_invup_proj_scale_invrc   _apply_gateact_fn	down_projdown_proj_scale_invr   rX   	unsqueezerY   sum)r   hidden_statestop_k_indextop_k_weightsri   	num_top_k
num_tokens
hidden_dimselected_hidden_statessample_weightsr   proj_outweighted_outfinal_hidden_statess                 r   fp8_batched_mm_experts_forwardr   8  s    )!W
 	

 34O  $I##A&J##B'J +<<YA<N"**2.N$$R(J
 a))A-. 11!]]'+}}##$:P:P?? 2 H }}##H- ;;x( 11  ?? 2 H //?II"MML '++J:NRRWXRY!!-"5"566r   c           	        U R                   S:X  a  [        S5      e[        5       nUR                  nUR	                  S5      nUR	                  S5      nUR	                  S5      nUR                  S5      n	UR                  S5      n
[        R                  " U
5      u  pXU-     nX   nUR                  S:X  a  UR                  5       OUR                  5       n[        R                  " XR                  SU R                  S-
  S9n[        R                  " US[        R                  S9nXR                  :  R                  S5      nU R                   (       a  U R"                  OU R$                  nU R                   (       a  U R&                  OU R(                  nU R*                  nU R,                  n[/        U[        R0                  R2                  R4                  5      (       a@  UR7                  5       nUR7                  5       nUR7                  5       nUR7                  5       nUR9                  UUUUU R:                  US	9nU R                   (       a  U R=                  U5      nOU R?                  U5      nUR9                  UUUUU R:                  US	9nUURA                  URB                  5      R                  S5      -  nURE                  US
5        [        RF                  " U5      n[        RH                  " UR	                  S5      US9UU'   UU   nURK                  XvU5      RM                  SS9nURA                  URB                  5      $ )Nrq   zgrouped_mm experts dispatch does not support activation_scheme='static'. Use the default eager dispatch or switch to activation_scheme='dynamic'.rU   r   cpur   binsr   r   )r   rX   )tokens_per_expertrc   offsets        rW   r   )'rw   r   r;   rW   r   r   rH   sortr   r\   r   histcr   cumsumint32r   r   r   r   r   r   r   r   r   r   r{   r   r   r'   rc   r   r   r   rX   masked_fill_
empty_likearangerY   r   )r   r   r   r   ri   rW   r   r   r   r   r   expert_ids_gpermselected_hidden_states_gsample_weights_ghistc_inputr   r   sentinel_maskw_upws_upw_downws_downr   r   inv_permr   s                              r   fp8_grouped_mm_experts_forwardr   y  s    )!W
 	

 34O!!F  $I##A&J##B'J #**2.N$$R(J J/L,Y->?%+
 +1++*>,$$&LDTDTDVKK6F6FASWScScfgSghll,!5;;GG "%5%55@@DM !%44<<D+/==D''d>T>TE^^F&&G$))008899}} """$ 11 +?? 2 H }}##H- ;;x( 11+?? 2 H .11(..AKKBOOL mS1 %H\\$))A,v>HTN)L '++J:NRRWXRY!!-"5"566r   c                   U R                   nU R                  S5      n[        R                  " U R	                  5       USUS-
  S9R                  5       nXb-   S-
  U-  U-  nU[        XQ5      US-
  -  -   nXv-
  n	[        R                  R                  R                  U	R                  S5      S5      n
[        R                  " XTS9X   -   nU(       a   UR                  S5      R	                  5       nOP[        R                  " U4SU[        R                  S9n[        R                  " X:  U R	                  5       S5      X'   XU4$ )a  Build the TMA-aligned layout DeepGEMM's grouped GEMM expects.

Returns `(sorted_to_padded, grouped_layout, total_padded_rows)`. `grouped_layout` encodes
expert boundaries as a cumsum of aligned counts on Blackwell (`use_psum_layout=True`) or
per-row expert ids with -1 for padding on Hopper.

Accepts EP sentinels: values in `expert_ids_sorted` equal to `num_experts` (unclamped sentinels)
are routed past the last aligned expert block and marked `-1` in the Hopper layout (and
excluded from the Blackwell cumsum), so DeepGEMM skips them.
r   r   r   )r   r   r   rU   rV   )rW   r   rH   r   r   longr   rx   r   padr   r   fullr   where)expert_ids_sortedr   	alignmentuse_psum_layoutrW   r   r   aligned_tokens_per_experttotal_padded_rowspadding_per_expertcumulative_paddingsorted_to_paddedgrouped_layouts                r   !_build_deepgemm_contiguous_layoutr     s=    %%F"''*J$5$9$9$;+STZehiZijooq"3"?!"C	!QU^ ^"S%AYQR]%SS 3F,,001C1J1J11MvV||J>ASAff
 399!<@@B %6$8"VSXS^S^_+0;;7H7VXiXmXmXoqs+t(->>>r   c                    [         R                  " X0R                  S   U R                  U R                  S9nXU'   [         R                  " X1R                  S   U R                  [         R
                  S9nXU'   XE4$ )zKPad sorted hidden states and scales into the TMA-aligned contiguous layout.r   rV   )rH   zerosrZ   rW   rX   r|   )r   scalesr   r   hidden_paddedscales_paddeds         r   "_pad_to_deepgemm_contiguous_layoutr   	  sv     KK..q1-:N:NVcViViM '4"#KK 1<<?=K_K_glgtgtuM&,"#''r   c                
    X   $ )z;Remove padding rows from the TMA-aligned contiguous layout.r(   )hidden_states_paddedr   s     r   &_unpad_from_deepgemm_contiguous_layoutr     s      11r   c                   U R                   S:X  a  [        S5      eU R                  c  [        S5      eU R                  S   S:w  d  U R                  S   S:w  a  [        SU R                   35      e[	        5       nUR
                  nUR                  S5      nUR                  S5      nUR                  S5      nUR                  S5      n	UR                  S5      n
[        R                  " U
5      u  pXU-     nX   n[        R                  R                  U5      S   S	:  n[        XR                  [        US
9u  nnnXR                  :  R                  S5      nU R                   (       a  U R"                  OU R$                  nU R                   (       a  U R&                  OU R(                  nU R*                  nU R,                  n[/        U[        R0                  R2                  R4                  5      (       a@  UR7                  5       nUR7                  5       nUR7                  5       nUR7                  5       nUR9                  USS9u  nn[;        UUUU5      u  nn[        R<                  " UUR>                  S   U[        R@                  S9nURC                  UU4UURE                  5       4UUUS9  U R                   (       a  U RG                  U5      nOU RI                  U5      nUR9                  USS9u  nn[        R<                  " UX[        R@                  S9nURC                  UU4UURE                  5       4UUUS9  [K        UU5      nUURM                  URN                  5      R                  S5      -  nURQ                  US5        [        RR                  " U5      n[        RT                  " UR                  S5      US9UU'   UU   nURW                  XvU5      RY                  SS9nURM                  URN                  5      $ )Nrq   zDeepGEMM experts dispatch does not support activation_scheme='static'. Use the default eager dispatch or switch to activation_scheme='dynamic'.zuDeepGEMM requires block-wise quantization (block_size=[128, 128]), but got per-tensor quantization (block_size=None).r   r   r   z-DeepGEMM requires block_size=(128, 128), got rU   
   )r   r   F)	use_ue8m0rV   )r   r   r   r   )-rw   r   rc   
ValueErrorrO   rW   r   r   rH   r   rI   rK   r   r   _DEEPGEMM_M_ALIGNMENTr   r   r   r   r   r   r   r   r   r   r{   r   r   r?   r   r[   rZ   bfloat16r'   r\   r   r   r   r   rX   r   r   r   rY   r   ) r   r   r   r   re   rW   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   act_fp8
act_scalesr   proj_fp8proj_scalesr   r   r   s                                    r   fp8_deepgemm_experts_forwardr     s    )!W
 	
 A
 	
 qS DOOA$6#$=HHYZ[[$&H!!F  $I##A&J##B'J #**2.N$$R(J J/L,Y->?%+jj66v>qARGO:[&&2GYh;7n&7 "%5%55@@DM !%44<<D+/==D''d>T>TE^^F&&G$))008899}} """$ #889Q]b8cGZ<WjRbduvGZ{{,djjmFRWR`R`aH	*ekkm4h`o   
 }}##H-;;x( %::8u:UHk{{,ju~~^H	;	!'    6h@PQH .11(..AKKBOOL mS1 %H\\$))A,v>HTN)L '++J:NRRWXRY!!-"5"566r   c                     ^  \ rS rSrSSSS\4       SU 4S jjjrSS jr        SS jr S         SS	 jjrS
r	U =r
$ )
FP8Expertsi  Nrm   FTc           	     H  > [         TU ]  5         USL d   S5       eXl        X@l        XPl        X l        UR                  U l        X0l        [        USS5      U l
        [        USS5      U l        [        [        USS5         U l        U R                  (       a  S	U R                  -  U R                  p[        R                  " [         R"                  " U R                  XxUS
95      U l        U R
                  b  ['        XpR
                  S   5      OSn	U R
                  b  ['        XR
                  S   5      OSn
[        R                  " [         R"                  " U R                  X[         R(                  S
95      U l        U R-                  SS 5        OU R                  U R                  p[        R                  " [         R"                  " U R                  XUS
95      U l        U R
                  b  ['        XR
                  S   5      OSnU R
                  b  ['        XR
                  S   5      OSn[        R                  " [         R"                  " U R                  X[         R(                  S
95      U l        U R-                  SS 5        U R                  U R                  nn[        R                  " [         R"                  " U R                  UUUS
95      U l        U R
                  b  ['        XR
                  S   5      OSnU R
                  b  ['        UU R
                  S   5      OSn[        R                  " [         R"                  " U R                  UU[         R(                  S
95      U l        U R-                  SS 5        U R                  S:X  a  [        R                  " [         R6                  " U R                  [         R(                  S
95      U l        [        R                  " [         R6                  " U R                  [         R(                  S
95      U l        g g )NFzWFP8Experts does not support bias for now, please open an issue if you want this featurenum_local_expertsr   moe_intermediate_sizeintermediate_sizehidden_activation
hidden_actr   ro   r   r   gate_up_proj_biasup_proj_biasdown_proj_biasrq   )rt   ru   configrv   r   rc   hidden_sizer   rw   r   r   intermediate_dimr   r   rx   ry   rH   r[   r   rS   r|   r   r~   r   r   r   r   onesgate_up_proj_activation_scaledown_proj_activation_scale)r   r  rc   rw   rv   r   rX   gu_proj_out
gu_proj_ingu_scale_outgu_scale_in
u_proj_out	u_proj_inu_scale_out
u_scale_in
d_proj_out	d_proj_ind_scale_out
d_scale_inr   s                      r   ru   FP8Experts.__init__  s    	5  	
e	
    $ ,,!2&v/BMR +F4KM` a[1DlST==&'$*?*?&? "U[[9I9I;jo-p qDEI__E`5ooa.@AfgLCG??C^%
OOA,>?deK*,,,D,,lu}}]+D' ##$7>$($9$94??	<<D4D4Djch(ijDLCG??C^%
OOA,>?deKAEA\y//!*<=bcJ%'\\D,,kU]][&D" ##ND9 $1F1FI
ekk$2B2BJPYaf&gh?C?ZeJ(:;`a=A__=XU9dooa&89^_
#%<<KK((+zW$
  	 0$7!!X-13ejjIYIYafanan>o1pD..0ll5::dFVFV^c^k^k;l.mD+ .r   c                N    UR                  SSS9u  p#U R                  U5      U-  $ )Nr   rU   r   )chunkr   )r   gate_upgateups       r   r   FP8Experts._apply_gate  s*    ===+{{4 2%%r   c                   [         R                  " U[         R                  S9n[         R                  " 5          [         R                  R
                  R                  X R                  S9nUR                  SSS5      n[         R                  " UR                  SS9S5      R                  SS	9R                  S
5      nS S S 5        W GH  nXpR                  :X  a  M  [         R                  " WU   5      u  pX   n
U R                  S:X  a  U R                  U   OS nU R!                  U
U R"                  (       a  U R$                  U   OU R&                  U   U R"                  (       a  U R(                  U   OU R*                  U   US9nU R"                  (       a  U R-                  U5      OU R/                  U5      nU R                  S:X  a  U R0                  U   OS nU R!                  UU R2                  U   U R4                  U   US9nX9US 4   nXR7                  UR8                  5      -  nUR;                  SXR7                  UR8                  5      5        GM     UR7                  UR8                  5      $ ! , (       d  f       GN= f)Nro   )num_classesr   r   r   )rU   r   F)as_tuplerU   rq   )rr   )rH   
zeros_liker|   no_gradrx   r   one_hotr   permutegreaterr   nonzerorY   r   rw   r  r   r   r   r   r   r   r   r   r  r   r   r   rX   
index_add_)r   r   r   r   r   expert_mask
expert_hit
expert_idx	top_k_pos	token_idxcurrent_stategate_up_act_scaler   down_act_scalerouting_weightsr   s                   r   r   FP8Experts.forward  s7   
 $..}EMMR]]_((--55kO_O_5`K%--aA6K{8'DaHPPZ_P`eefhiJ 
 %J---#(;;{:/F#G I)4MBFBXBX\dBd22:>jn  {{15!!*-DLLQ[D\;?==++J7dNdNdeoNp!2	 # H 6:]]t''1T\H]H?C?U?UYa?a//
;gk  {{z*((4!/	 # H ,y$,FGO#&8&8&HHL**1iI\IbIb9cd7 %8 #%%m&9&9::C _s   BI88
Jc           	        UR                  5       S:  a  [        R                  " XS 5      $ U R                  S:X  aP  UbM  UR	                  [
        R                  5      nX-  R                  [        [        S9R	                  [        5      nOF[        5       nUR                  XR                  b  U R                  S   OUR                  S   5      u  pe[        UUUUU R                  UR                   S9nUR	                  UR                   S9$ )Nr   rq   r   rU   r   ro   )r   r   r   rw   r   rH   r|   r   r   r   r   r;   r%   rc   rZ   r1   rX   )	r   r   rz   r}   rr   r   r   ri   rh   s	            r   r   FP8Experts.linear  s      1$88E400!!X-2B2N$''6Em**xX*FII*UF:<O+99__-Htq)ekkZ\oMF !OO
 yyu{{y++r   )r   rw   rc   r  r   r  r   r   r  r   rv   r   r   r  r   r   r   )rc   r   rw   r   rv   r   r   r   )r%  r   r   r   )r   r   r   r   r   r   r   r   N)
r   r   rz   r   r}   r   rr   ztorch.Tensor | Noner   r   )r   r*   r+   r,   r   ru   r   r   r   r/   r   r   s   @r   r  r    s     .2!*7n +7n 	7n
 7n 7n 7nr&(;)(;8D(;Ua(;	(;^ 15,, , '	,
 ., 
, ,r   r  c                  $    \ rS rSrSr\\\S.rSr	g)FP8ExpertsInterfacei  z?Interface for registering custom FP8 experts forward functions.)
batched_mm
grouped_mmre   r(   N)
r   r*   r+   r,   r-   r   r   r  _global_mappingr/   r(   r   r   rB  rB    s    I 540Or   rB  c                P   UR                   (       a  U $ SnU R                  5        GHL  u  pV[        XQ5      (       d  M  U(       a  0 OSS0nSn[        R                  " S5         UR                  S5      (       av  [        USS5      n	[        USS5      n
[        US	U R                  R                  5       5      n[        [        [        U
U	S
9nU" SUUR                  UR                  U
U	S.UD6nOd[        U[        R                   5      (       aE  [#        SUR$                  UR&                  UR                  UR                  UR(                  SLS.UD6nUb  U R+                  XX5        SnSSS5        GMO     U(       d  [,        R/                  S5        U $ ! , (       d  f       GM  = f)a}  
A helper function to replace all `torch.nn.Linear` modules by `FP8Linear` modules.

Parameters:
    model (`torch.nn.Module`):
        Input model or `torch.nn.Module` as the function is run recursively.
    modules_to_not_convert (`list[`str`]`, *optional*, defaults to `None`):
        Names of the modules to not convert. In practice we keep the `lm_head` in full precision for numerical stability reasons.
    quantization_config (`FbgemmFp8Config`):
        The quantization config object that contains the quantization parameters.
    pre_quantized (`book`, defaults to `False`):
        Whether the model is pre-quantized or not
FrX   Nmetaz.expertsr   Trv   r  )experts_classexperts_interfacerv   r   )r  rc   rw   rv   r   )r   r   rc   rw   rv   zYou are loading your model using fp8 but no linear modules were found in your model. Please double check your model architecture.r(   )
dequantizenamed_modulesr
   rH   rW   endswithr   r  get_text_configr   r  ALL_FP8_EXPERTS_FUNCTIONSweight_block_sizerw   r   rx   Linearrk   r   r   rs   set_submoduler]   warning)modelmodules_to_not_convertquantization_configpre_quantizedhas_been_replacedmodule_namemodulemodule_kwargs
new_moduler   rv   r  	new_classs                r   replace_with_fp8_linearr]    s   " %%$224$[II ,'4
\\&!##J//"6:t<"6:u= 5<<3O3O3QR6",&?%%		 ' !2DD&9&K&K%% $
 FBII..&  & 2 2!'!4!42DD&9&K&K#[[4 $
 %##K<$(!= "!  5N <	
 LK "!s   #DF
F%	c                  P    \ rS rSrSrS rS
S jrSS jrSS jr\	SS j5       r
Srg	)Fp8Quantizeia  zV
A quantization operation that creates two tensors, weight and scale out of a weight.
c                    Xl         g r@  hf_quantizerr   rb  s     r   ru   Fp8Quantize.__init__f      (r   c                l   S nU R                   R                  bp  [        U R                   R                  [        5      (       a&  U R                   R                  R	                  S5      nO![        U R                   R                  SS 5      nUc  UR                  S   UR                  S   4n[        U5      $ )NrO  r+  rU   )rb  rU  r   dictgetr   rZ   tuple)r   valuerc   s      r   _resolve_block_sizeFp8Quantize._resolve_block_sizei  s    
00<$++??FF!..BBFFGZ[
$T%6%6%J%JL_aef
++b/5;;r?;JZ  r   c                   UR                   S:  a  X0$ U R                  U5      u  p4UR                  S   UR                  S   peXS-  S:w  d  Xd-  S:w  a  X0$ UR                  S S nXS-  nXd-  n	UR                  n
UR                  [        R
                  5      nUR                  " / UQUPUPU	PUP76 nUR                  5       R                  SS9n[        R                  " US:  U[        R                  " U5      5      n[        U-  n[        R                  " US:  U[        R                  " U5      5      nUR                  S5      R                  S5      nUU-  n[        R                  " U[        [        S9R                  [        5      nUR                  U
5      nS	U-  R                  [        R
                  5      nUR!                  S
5      (       a  UR#                  SS5      S   S-   OUS-   nUUUU0$ )Nr   r+  rU   r   )rU   r   rn  r   rp   rz   rD   r   .weight_scale_inv
_scale_inv)ndimrk  rZ   r   rH   r|   r   absamaxr   	ones_liker   r   r   r   r   rL  rsplit)r   keyrj  block_mblock_nrowscolsleading_shape
rows_tiles
cols_tilesoriginal_shape
value_fp32reshapedmax_abssafe_max_absr   scales_broadcastscaled	quantized
inv_scales	scale_keys                        r   _quantize_oneFp8Quantize._quantize_onet  s    ::><33E:[[_ekk"od>Q$.A"5< CR(_
_
XXemm,
%%_}_j_'_:_W^_,,.%%(%3{{7Q;9QRL(Wq[&%//&2IJ!++B/99"=,,KKH(CFFzR	%%n5	Fl&&u}}5
CF<<PXCYCYCJJsA&q),??_beq_q	Y	:66r   c                    0 nUR                  5        HA  u  pE[        U[        5      (       a  US   OUnUR                  U R	                  XF5      5        MC     U$ )Nr   )itemsr   listupdater  )r   
input_dictkwargsresultrv  rj  r{   s          r   convertFp8Quantize.convert  sS     +-$**,JC!+E4!8!8U1XeFMM$,,S9: - r   c                ,    [        U R                  5      $ r@  )Fp8Dequantizerb  r   s    r   
reverse_opFp8Quantize.reverse_op  s    T..//r   ra  N)rj  r   r   ztuple[int, int])rv  r   rj  r   r   dict[str, torch.Tensor])r  r   r   r  r   r	   )r   r*   r+   r,   r-   ru   rk  r  r  propertyr  r/   r(   r   r   r_  r_  a  s0    )	! 7D 0 0r   r_  c                  n    \ rS rSrSrS rSS jrSrSS jrSS jr	 S     SS	 jjr
\SS
 j5       rSrg)r  i  ux  Dequantize FP8 weights using their per-block ``weight_scale_inv``.

Designed to run as the *first* op in any :class:`WeightConverter` chain when
loading with ``dequantize=True`` — :meth:`update_weight_conversions` on the
FP8 quantizer attaches it to each existing model-specific converter so that
per-expert (weight, scale) pairs are folded into full-precision tensors before
the chain's merge / concat ops collapse the per-expert structure.

Pattern semantics
    Input ``input_dict`` carries one entry per source pattern; each value is a
    list of tensors (one per ``*`` match). For every weight pattern that has a
    sibling ``*.weight_scale_inv`` pattern in the dict, this op pairs them up by
    index, dequantizes per-pair, and emits the dequantized list under the
    original *weight* key. Scale entries are dropped from the output so the
    remaining ops only see weights.
c                    Xl         g r@  ra  rc  s     r   ru   Fp8Dequantize.__init__  re  r   c                    UR                  S5      nU(       a  US S OUnUR                  S5      (       a  US [        S5      *  S-   nOUS:X  a  SnOUS-   nU(       a  US-   $ U$ )N$rU   z.weightro  rz   r}   rp  )rL  len)r   weight_patternanchoredbaser   s        r   _scale_pattern_for Fp8Dequantize._scale_pattern_for  sr    !**3/&.~cr"N==##*C	N?+.AAEX&E<'E&us{1E1r   )r   g      ?rp   g      ?g       @g      @g      @g      @g       g      g      g      g       g      g      g      c                   [         R                  " U R                  [         R                  UR                  S9nUR                  5       R                  [         R                  5      nUS-  R                  5       nUS-	  S-  R                  5       n[         R                  " X$   X%   /SS9nUR                  " / UR                  SS QSUR                  S   -  P76 $ )uR   Two ``e2m1`` FP4 values per byte → float32 tensor twice as wide on the last dim.)rX   rW         rU   r   Nr   )rH   r{   _FP4_E2M1_LUTr|   rW   
contiguousrY   uint8r   stackr   rZ   )r   packedlutu8lowhighunpackeds          r   _unpack_fp4Fp8Dequantize._unpack_fp4  s    ll4--U]]6==Y %%ekk2CxooqC%%';;#)4"=Icr!2IAR8H4HIIr   c                B   [        [        SS 5      nUR                  [        R                  :X  d  Ub"  UR                  U:X  a  U R	                  U5      nOUR                  [        R                  5      nUR                  SS  u  pVUR                  SS  u  pxXW-  (       d	  Xh-  (       a  [        SU SU SU SU S3	5      eXW-  n	Xh-  n
UR                  R                  (       a   UR                  5       S:  a  UR                  O[        R                  nUR                  nUR                  SXyX5      nUR                  [        R                  5      R                  SXx5      R                  S5      R                  S5      nX-  R                  U5      R                  U5      $ )	Nfloat4_e2m1fn_x2r+  zWeight shape (r4   z) not divisible by scale grid (z).r   rU   )r   rH   rX   int8r  r   r|   rZ   r   is_floating_pointr   r   r   r   )r   r  r   	fp4_dtypequantized_fp32ry  rz  
scale_rows
scale_colsrw  rx  	out_dtyper~  qss                  r   _dequantize_oneFp8Dequantize._dequantize_one  so    E#5t<	??ejj(Y-ByZcGc!--i8N&\\%--8N#))"#.
 "(bc!2
 1 b.Mj\Y[\f[ggij  $$ %+LL$B$BvGZGZG\`aGaFLLglgugu	'--""2zJPIIemm$,,RHRRSUV``abczz)$,,^<<r   Nc                   SU;   a]  US   n[        U[        5      (       a  US   OUnSU;   a3  US   n[        U[        5      (       a  US   OUnX R                  XE5      0$ X$0$ 0 nUR                  5        H  u  pxSU;   d  SU;   a  M  U R	                  U5      n	X;  a  XU'   M/  [        U[        5      (       a  UOU/n
X   n[        U[        5      (       a  UOU/n[        U
5      [        U5      :w  a'  [        SU S[        U
5       S[        U5       S35      e[        X5       VVs/ s H  u  pU R                  X5      PM     snnXg'   M     U$ s  snnf )	Nzweight$r   r}   rr   z/Fp8Dequantize: weight/scale count mismatch for z (z weights vs z	 scales).)r   r  r  r  r  r  r   zip)r   r  full_layer_namer  r  r   r  rv  rj  r  weightswr  s                r   r  Fp8Dequantize.convert  sw    
""9-I(29d(C(C	!I!Z/#$67&0&>&>F')=)=i)PQQ#// @B$**,JC!S(,>#,E//4I*#s)%66eUGG*F)&$77VfXF7|s6{* EcU KG~\#f+iI  CFgBVWBV$!4//5BVWFK! -"  Xs   *Ec                ,    [        U R                  5      $ r@  )r_  rb  r  s    r   r  Fp8Dequantize.reverse_op  s    
 4,,--r   ra  )r  r   r   r   )r  r   r   r   )r  r   r   r   r   r   r@  )r  ,dict[str, list[torch.Tensor] | torch.Tensor]r  z
str | Noner   r  r  )r   r*   r+   r,   r-   ru   r  r  r  r  r  r  r  r/   r(   r   r   r  r    s_    ")
2 mMJ=> '+&@& $&
 
6&P . .r   r  )r   r"   )r   r=   )rQ   r   rR   r   r   r   )r_   r   r`   r   ra   r   rb   r   rc   z	list[int]rd   ztorch.dtyper   r   )
r   ztorch.nn.Moduler   r   r   r   r   r   r   r   )
r   r   r   r   r   r   r   r   r   ri  )
r   r   r   r   r   r   r   r   r   z!tuple[torch.Tensor, torch.Tensor])r   r   r   r   r   r   )NNF)rT  zlist[str] | None)@
__future__r   	functoolscollections.abcr   dataclassesr   rH   torch.nnrx   r   r   activationsr   core_model_loadingr	   quantizers.quantizers_utilsr
   utilsr   utils.import_utilsr   r   r   hub_kernelsr   moer   r   
get_loggerr   r]   float8_e4m3fnr   finfor   r   r   r   r   r   r"   cacher;   r=   rO   rS   r|   r1   rP  rk   r   r   r   r   r   r  Moduler  rB  rN  r]  r_  r  r(   r   r   <module>r     s   #  $ !   $   . ?  h h ) = 
		H	%   
;;z"&&;;z"&&  G $! ! ! - -` $$ $ $ = =@ !&,N,N,N 	,N 		,N
 ,N ,N ,N^F,		 F,R>7
>7>7 >7  	>7
 >7Bb7
b7b7 b7  	b7
 b7J(?#(?25(?BE(?X\(?
(?V((( #( 	(
 '( 2&2:F22g7
g7g7 g7  	g7
 g7TC, C,L*  01  ejA#3AHA0- A0Hx.M x.r   