
    Z j3                         S SK Jr  SSKJr  \(       a  SSKJr  SSKJr  SSKJ	r	J
r
JrJrJr  SSKJr  \" 5       (       a
  S S	KrSS
KJr  \R&                  " \5      rS	r " S S\5      rg	)    )TYPE_CHECKING   )HfQuantizer   )PreTrainedModel)Mxfp4Config)is_accelerate_availableis_kernels_availableis_torch_availableis_triton_availablelogging)get_module_from_nameN)WeightConverterc                      ^  \ rS rSr% SrSrS\S'   U 4S jrS rS r	S	S
S\
S\4S jrSS jr SS	S
S\4S jjrS rS rS rS r\S\4S j5       rS rS rSrU =r$ )Mxfp4HfQuantizer*   z'
    """
    MXFP4 (microscaling 4-bit FP4) quantization, backed by triton kernels fetched from the kernels hub.
    """
    quantization_config: Mxfp4Config

    requires_calibration = False

    def __init__(self, quantization_config, **kwargs):
        super().__init__(quantization_config, **kwargs)
        self.triton_kernels_hub = None

    def _lazy_import_kernels(self):
        """Lazy import and initialize kernels only when needed"""
        if self.triton_kernels_hub is None:
            try:
                from kernels import get_kernel

                # Hub repo id as used by the MXFP4 integration; the exact constant is
                # stored in the compiled module and may differ across releases.
                self.triton_kernels_hub = get_kernel("kernels-community/triton_kernels")
            except ImportError:
                raise ImportError("kernels package is required for MXFP4 quantization")
        return self.triton_kernels_hub
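    # Usage sketch (illustrative, not part of this module): loading an MXFP4 checkpoint
    # routes through this quantizer automatically, and `Mxfp4Config(dequantize=True)`
    # opts out of quantized kernels by materializing bf16 weights instead. The
    # checkpoint name below is an illustrative choice of MXFP4 model.
    #
    #     from transformers import AutoModelForCausalLM, Mxfp4Config
    #
    #     model = AutoModelForCausalLM.from_pretrained(
    #         "openai/gpt-oss-20b",
    #         quantization_config=Mxfp4Config(dequantize=True),
    #     )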
    def validate_environment(self, *args, **kwargs):
        if not is_torch_available():
            raise ImportError(
                "Using mxfp4 quantization requires torch. "
                "Please install the latest version of torch ( pip install --upgrade torch )"
            )

        if self.quantization_config.dequantize:
            return

        if not is_accelerate_available():
            raise ImportError("Using mxfp4 requires Accelerate: `pip install accelerate`")

        device = torch.accelerator.current_accelerator() or torch.device("cpu")
        if device.type not in ("cuda", "xpu", "cpu"):
            if self.pre_quantized:
                logger.warning_once(
                    f"Using MXFP4 quantized models requires model on cuda/xpu/cpu, but found {device}, "
                    "we will default to dequantizing the model to bf16. "
                    "To use mxfp4, please disable the current accelerator."
                )
                self.quantization_config.dequantize = True
                return
            else:
                raise RuntimeError(
                    f"Quantizing a model using MXFP4 requires model on cuda/xpu/cpu, but found {device}. "
                    "To use mxfp4, please disable the current accelerator."
                )

        # Probe backend support: XPU and CPU need Triton >= 3.5.0, CUDA needs
        # Triton >= 3.4.0 plus compute capability >= 7.5.
        if torch.xpu.is_available():
            is_device_supported_mxfp4 = True
            triton_available = is_triton_available("3.5.0")
            kernels_installed = is_kernels_available()
        elif torch.cuda.is_available():
            compute_capability = torch.cuda.get_device_capability()
            is_device_supported_mxfp4 = compute_capability >= (7, 5)
            triton_available = is_triton_available("3.4.0")
            kernels_installed = is_kernels_available()
        elif device.type == "cpu":
            is_device_supported_mxfp4 = True
            triton_available = is_triton_available("3.5.0")
            kernels_installed = is_kernels_available()
        else:
            is_device_supported_mxfp4 = False
            triton_available = False
            kernels_installed = False

        if self.pre_quantized:
            # For already-quantized checkpoints, fall back to bf16 instead of failing.
            if not is_device_supported_mxfp4:
                logger.warning_once(
                    "MXFP4 quantization is only supported on GPUs with compute capability >= 7.5 "
                    "(e.g T4, A100, L4, H100, or B200) or XPUs (e.g Intel® Data Center GPU Max Series). "
                    "We will default to dequantizing the model to bf16."
                )
                self.quantization_config.dequantize = True
                return
            if not triton_available:
                logger.warning_once(
                    "MXFP4 quantization requires Triton: CUDA requires Triton >= 3.4.0, "
                    "XPU/CPU requires Triton >= 3.5.0. Please install triton: `pip install triton`. "
                    "We will default to dequantizing the model to bf16."
                )
                self.quantization_config.dequantize = True
                return
            if not kernels_installed:
                logger.warning_once(
                    "MXFP4 quantization requires the `kernels` package: `pip install kernels>=0.12.0`. "
                    "We will default to dequantizing the model to bf16."
                )
                self.quantization_config.dequantize = True
                return
        else:
            # Quantizing on the fly has hard requirements: raise instead of falling back.
            if not is_device_supported_mxfp4:
                raise ValueError(
                    "MXFP4 quantization is only supported on GPUs with compute capability >= 7.5 "
                    "(e.g T4, A100, L4, H100, or B200) or XPUs (e.g Intel® Data Center GPU Max Series) or CPU"
                )
            if not triton_available:
                raise ValueError(
                    "MXFP4 quantization requires Triton: CUDA requires Triton >= 3.4.0, "
                    "XPU/CPU requires Triton >= 3.5.0. Please install triton: `pip install triton`"
                )
            if not kernels_installed:
                raise ValueError(
                    "MXFP4 quantization requires the `kernels` package: `pip install kernels>=0.12.0`"
                )

        if not self.pre_quantized:
            self._lazy_import_kernels()

        device_map = kwargs.get("device_map")
        if device_map is not None:
            if isinstance(device_map, dict) and not self.pre_quantized and "disk" in device_map.values():
                raise ValueError(
                    "You are attempting to load an FP4 model with a device_map that contains a disk device. "
                    "This is not supported when the model is quantized on the fly. "
                    "Please use a quantized checkpoint or remove the disk device from the device_map."
                )
    def param_needs_quantization(self, model: "PreTrainedModel", param_name: str, **kwargs) -> bool:
        from ..integrations import Mxfp4GptOssExperts

        module, tensor_name = get_module_from_name(model, param_name)
        if isinstance(module, Mxfp4GptOssExperts):
            # Expert biases stay in higher precision; only the projection weights
            # are stored as MXFP4 blocks/scales.
            if tensor_name in ("down_proj_bias", "gate_up_proj_bias"):
                return False
            return True
        return False

    def _process_model_after_weight_loading(self, model, **kwargs):
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        elif torch.xpu.is_available():
            torch.xpu.empty_cache()

    def _process_model_before_weight_loading(self, model: "PreTrainedModel", use_kernels: bool = False, **kwargs):
        from ..integrations import replace_with_mxfp4_linear

        device = torch.accelerator.current_accelerator() or torch.device("cpu")
        if use_kernels and device.type not in ("cpu",):
            logger.warning_once(
                "You are using full precision kernels, we will dequantize the model to bf16. "
                "To use the quantized model with quantization kernels, please set use_kernels=False"
            )
            self.quantization_config.dequantize = True
            return
        if not use_kernels and device.type in ("cpu",):
            logger.warning_once(
                "MXFP4 inference on CPU requires use_kernels=True, but use_kernels is disabled. "
                "We will dequantize the model to bf16. "
                "To run MXFP4 natively on CPU, please set use_kernels=True."
            )
            self.quantization_config.dequantize = True
            return

        self.modules_to_not_convert = self.get_modules_to_not_convert(
            model, self.quantization_config.modules_to_not_convert, model._keep_in_fp32_modules
        )
        replace_with_mxfp4_linear(
            model,
            modules_to_not_convert=self.modules_to_not_convert,
            quantization_config=self.quantization_config,
        )

    def update_tp_plan(self, config):
        # Shard the packed blocks and their scales identically, otherwise a rank
        # would hold blocks without the matching scales.
        if "GptOssConfig" in config.__class__.__name__ and getattr(config, "base_model_tp_plan", None) is not None:
            config.base_model_tp_plan.update(
                {
                    "layers.*.mlp.experts.gate_up_proj_blocks": "grouped_gemm",
                    "layers.*.mlp.experts.gate_up_proj_scales": "grouped_gemm",
                    "layers.*.mlp.experts.down_proj_blocks": "grouped_gemm",
                    "layers.*.mlp.experts.down_proj_scales": "grouped_gemm",
                }
            )
        return config

    def update_ep_plan(self, config):
        if "GptOssConfig" in config.__class__.__name__ and getattr(config, "base_model_ep_plan", None) is not None:
            config.base_model_ep_plan.update(
                {
                    "layers.*.mlp.experts.gate_up_proj_blocks": "grouped_gemm",
                    "layers.*.mlp.experts.gate_up_proj_scales": "grouped_gemm",
                    "layers.*.mlp.experts.down_proj_blocks": "grouped_gemm",
                    "layers.*.mlp.experts.down_proj_scales": "grouped_gemm",
                }
            )
        return config

    def get_state_dict_and_metadata(self, model, safe_serialization=False):
        from ..integrations import Mxfp4GptOssExperts

        state_dict = model.state_dict()
        num_local_experts = getattr(model.config, "num_local_experts", 32)
        hidden_size = getattr(model.config, "hidden_size", 2880)

        for name, module in model.named_modules():
            if (
                isinstance(module, Mxfp4GptOssExperts)
                and hasattr(module, "gate_up_proj")
                and hasattr(module, "down_proj")
            ):
                for proj in ("gate_up_proj", "down_proj"):
                    triton_tensor = getattr(module, proj)
                    precision_config = getattr(module, f"{proj}_precision_config")
                    # Undo the kernel-friendly swizzled layout before serializing.
                    blocks = triton_tensor.storage.layout.unswizzle_data(triton_tensor.storage.data).transpose(-1, -2)
                    if proj == "gate_up_proj":
                        blocks = blocks.reshape(num_local_experts, -1, 90, 16)
                    else:
                        blocks = blocks.reshape(num_local_experts, hidden_size, 90, 16)
                    scales = precision_config.weight_scale.storage.layout.unswizzle_data(
                        precision_config.weight_scale.storage.data
                    ).transpose(-1, -2)
                    state_dict[f"{name}.{proj}_blocks"] = blocks
                    state_dict[f"{name}.{proj}_scales"] = scales
        metadata = {}
        return state_dict, metadata

    def is_serializable(self, safe_serialization=None):
        return True

    @property
    def is_trainable(self) -> bool:
        logger.warning_once(
            "MXFP4 quantization doesn't support training, please consider dequantizing the model first by passing "
            "quantization_config=Mxfp4Config(dequantize=True) to .from_pretrained()"
        )
        return False

    def get_quantize_ops(self):
        from ..integrations.mxfp4 import Mxfp4Quantize

        return Mxfp4Quantize(self)

    def get_weight_conversions(self):
        from ..integrations.mxfp4 import Mxfp4Dequantize, Mxfp4Deserialize

        # Each converter fuses the serialized `*_blocks` / `*_scales` pair into a
        # single target parameter: dequantized to bf16, or deserialized into the
        # quantized runtime representation.
        if self.pre_quantized and self.quantization_config.dequantize:
            return [
                WeightConverter(
                    source_patterns=["down_proj_blocks", "down_proj_scales"],
                    target_patterns="down_proj$",
                    operations=[Mxfp4Dequantize(self)],
                ),
                WeightConverter(
                    source_patterns=["gate_up_proj_blocks", "gate_up_proj_scales"],
                    target_patterns="gate_up_proj$",
                    operations=[Mxfp4Dequantize(self)],
                ),
            ]
        return [
            WeightConverter(
                source_patterns=["down_proj_blocks", "down_proj_scales"],
                target_patterns="down_proj$",
                operations=[Mxfp4Deserialize(self)],
            ),
            WeightConverter(
                source_patterns=["gate_up_proj_blocks", "gate_up_proj_scales"],
                target_patterns="gate_up_proj$",
                operations=[Mxfp4Deserialize(self)],
            ),
        ]
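# A minimal, self-contained sketch of the arithmetic the conversions above rely on
# (illustrative only -- the real kernels operate on-device with swizzled layouts).
# MXFP4 stores blocks of 32 FP4 (E2M1) values, two values packed per byte (hence
# the trailing 16-byte dimension: 32 / 2 = 16), plus one E8M0 exponent scale per
# block (hence 2880 / 32 = 90 scale groups for a 2880-wide projection).
#
#     import torch
#
#     FP4_VALUES = torch.tensor(
#         [0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0,          # positive E2M1 codes
#          -0.0, -0.5, -1.0, -1.5, -2.0, -3.0, -4.0, -6.0]  # negative E2M1 codes
#     )
#
#     def dequantize_block(blocks_u8: torch.Tensor, scale_u8: torch.Tensor) -> torch.Tensor:
#         # blocks_u8: (..., 16) packed bytes; scale_u8: (...,) biased E8M0 exponents
#         lo = FP4_VALUES[(blocks_u8 & 0x0F).long()]         # low nibble of each byte
#         hi = FP4_VALUES[(blocks_u8 >> 4).long()]           # high nibble of each byte
#         vals = torch.stack([lo, hi], dim=-1).flatten(-2)   # interleave -> (..., 32)
#         return vals * torch.exp2(scale_u8.float() - 127).unsqueeze(-1)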