
    Z ja&                         S SK Jr  SSKJr  \(       a  SSKJr  SSKJr  SSKJ	r	J
r
JrJrJrJrJr  SSKJr  \" 5       (       a  S S	Kr\R&                  " \5      r " S
 S\5      rg	)    )TYPE_CHECKING   )HfQuantizer   )PreTrainedModel)FbgemmFp8Config)is_accelerate_availableis_fbgemm_gpu_availableis_kernels_availableis_torch_availableis_torch_cuda_availableis_torch_xpu_availablelogging)get_module_from_nameNc                      ^  \ rS rSr% SrSrS\S'   U 4S jrS rSS	 jr	S
SS\
S\4S jrS
SS\
SSS\4U 4S jjr  SS jrS rS rS r\S\4S j5       rS rSrU =r$ )FbgemmFp8HfQuantizer)   z'
FP8 quantization using fbgemm kernels
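    # Orientation note: during `from_pretrained` these hooks run in order --
    # `validate_environment` (hardware/library checks), then
    # `_process_model_before_weight_loading` (swap eligible modules for fbgemm FP8
    # layers), then `_process_model_after_weight_loading` (restore the activation
    # scale upper bound that loading may have clobbered).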

    requires_calibration = False
    quantization_config: "FbgemmFp8Config"

    def __init__(self, quantization_config, **kwargs):
        super().__init__(quantization_config, **kwargs)

    def validate_environment(self, *args, **kwargs):
        if not (is_torch_cuda_available() or is_torch_xpu_available()):
            raise ImportError("Using fbgemm fp8 quantization requires a GPU or XPU")
        if is_torch_xpu_available() and not is_kernels_available():
            raise ImportError("Using FP8 fbgemm on XPU requires kernels (`pip install kernels`)")
        if is_torch_cuda_available() and not is_fbgemm_gpu_available():
            raise ImportError(
                "Loading an FP8 fbgemm quantized model on CUDA requires the fbgemm-gpu library. "
                "Please install the latest version of fbgemm-gpu by following: "
                "https://pytorch.org/FBGEMM/fbgemm_gpu-development/InstallationInstructions.html#fbgemm-gpu-install-libraries"
            )
        if not is_accelerate_available():
            raise ImportError("Loading an FP8 quantized model requires accelerate (`pip install --upgrade accelerate`)")

        if is_torch_cuda_available():
            compute_capability = torch.cuda.get_device_capability()
            major, _ = compute_capability
            if major < 9:
                raise ValueError(
                    "FP8 quantized models are only supported on GPUs with compute capability >= 9.0 (e.g. H100)"
                )

        device_map = kwargs.get("device_map")
        if device_map is None:
            logger.warning_once(
                "You have loaded an FP8 model on CPU and have a CUDA/XPU device available, make sure to set "
                "your model on a GPU/XPU device in order to run your model. To remove this warning, pass "
                "device_map = 'cuda' or 'xpu' or 'auto'."
            )
        elif (
            isinstance(device_map, dict)
            and not self.pre_quantized
            and ("cpu" in device_map.values() or "disk" in device_map.values())
        ):
            raise ValueError(
                "You are attempting to load an FP8 model with a device_map that contains a CPU or disk device. "
                "This is not supported when the model is quantized on the fly. Please use a quantized checkpoint "
                "or remove the CPU or disk device from the device_map."
            )

    def update_dtype(self, dtype: "torch.dtype") -> "torch.dtype":
        if dtype != torch.bfloat16:
            logger.warning_once(
                f"Setting dtype to {dtype}, but only bfloat16 is supported right now. "
                "Overwriting torch_dtype to bfloat16."
            )
            dtype = torch.bfloat16
        return dtype

    def param_needs_quantization(self, model: "PreTrainedModel", param_name: str, **kwargs) -> bool:
        from ..integrations import FbgemmFp8Linear, FbgemmFp8Llama4TextExperts

        module, tensor_name = get_module_from_name(model, param_name)
        if isinstance(module, FbgemmFp8Linear):
            # Pre-quantized checkpoints already ship fp8 weights, and biases are kept unquantized.
            if self.pre_quantized or tensor_name == "bias":
                return False
            return True
        if isinstance(module, FbgemmFp8Llama4TextExperts):
            if self.pre_quantized or tensor_name == "bias":
                return False
            return True
        return False

    def param_element_size(self, model: "PreTrainedModel", param_name: str, param: "torch.Tensor") -> float:
        """Return the element size (in bytes) for `param_name`."""
        if self.param_needs_quantization(model, param_name):
            return 1  # fp8 stores one byte per element
        return super().param_element_size(model, param_name, param)

    def _process_model_before_weight_loading(
        self,
        model: "PreTrainedModel",
        **kwargs,
    ):
        from ..integrations import replace_with_fbgemm_fp8_linear

        self.modules_to_not_convert = self.get_modules_to_not_convert(
            model, self.quantization_config.modules_to_not_convert, model._keep_in_fp32_modules
        )

        model = replace_with_fbgemm_fp8_linear(
            model,
            modules_to_not_convert=self.modules_to_not_convert,
            quantization_config=self.quantization_config,
            pre_quantized=self.pre_quantized,
            tp_plan=model._tp_plan,
        )
    def _process_model_after_weight_loading(self, model, **kwargs):
        """
        Force update the input scale upper bound after weight loading and device dispatch are complete.
        This resolves issues where persistent buffers are zeroed out or overwritten during the loading process.
        """
        from ..integrations.fbgemm_fp8 import FbgemmFp8Linear, FbgemmFp8Llama4TextExperts

        for module in model.modules():
            if isinstance(module, (FbgemmFp8Linear, FbgemmFp8Llama4TextExperts)):
                if hasattr(module, "input_scale_ub"):
                    module.input_scale_ub.fill_(self.quantization_config.activation_scale_ub)
        return model

    def update_tp_plan(self, config):
        if "Llama4" in config.__class__.__name__:
            text_plan = {
                "layers.*.self_attn.q_proj.weight": "colwise",
                "layers.*.self_attn.q_proj.weight_scale": "colwise",
                "layers.*.self_attn.k_proj.weight": "colwise",
                "layers.*.self_attn.k_proj.weight_scale": "colwise",
                "layers.*.self_attn.v_proj.weight": "colwise",
                "layers.*.self_attn.v_proj.weight_scale": "colwise",
                "layers.*.self_attn.o_proj.weight": "rowwise",
                "layers.*.input_layernorm.weight": "sequence_parallel",
                "layers.*.post_attention_layernorm.weight": "sequence_parallel",
                "norm.weight": "sequence_parallel",
                "layers.*.feed_forward.shared_expert.gate_proj.weight": "colwise",
                "layers.*.feed_forward.shared_expert.gate_proj.weight_scale": "colwise",
                "layers.*.feed_forward.shared_expert.up_proj.weight": "colwise",
                "layers.*.feed_forward.shared_expert.up_proj.weight_scale": "colwise",
                "layers.*.feed_forward.shared_expert.down_proj.weight": "rowwise",
                "layers.*.feed_forward.experts.*.gate_proj.weight": "colwise",
                "layers.*.feed_forward.experts.*.gate_proj.weight_scale": "colwise",
                "layers.*.feed_forward.experts.*.up_proj.weight": "colwise",
                "layers.*.feed_forward.experts.*.up_proj.weight_scale": "colwise",
                "layers.*.feed_forward.experts.*.down_proj.weight": "rowwise",
                "layers.*.feed_forward.experts.gate_up_proj": "packed_rowwise",
                "layers.*.feed_forward.experts.gate_up_proj_scale": "packed_rowwise",
                "layers.*.feed_forward.experts.down_proj": "colwise",
            }
            if config.get_text_config() is not None:
                config.get_text_config().base_model_tp_plan = text_plan
            else:
                config.base_model_tp_plan = text_plan
        return config

    def is_serializable(self, safe_serialization=None):
        return True

    @property
    def is_trainable(self) -> bool:
        return False

    def get_quantize_ops(self):
        from ..integrations.fbgemm_fp8 import FbgemmFp8Quantize

        return FbgemmFp8Quantize(self)
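
# Usage sketch (illustrative, not part of this module): this quantizer is selected
# automatically by `from_pretrained` when a `FbgemmFp8Config` is passed. The checkpoint
# name below is a placeholder; any bf16 model on a compute-capability >= 9.0 GPU works.
#
#   from transformers import AutoModelForCausalLM, FbgemmFp8Config
#
#   model = AutoModelForCausalLM.from_pretrained(
#       "meta-llama/Llama-3.1-8B",              # placeholder checkpoint
#       device_map="cuda",                      # avoids the CPU warning in validate_environment
#       quantization_config=FbgemmFp8Config(),  # triggers FbgemmFp8HfQuantizer
#   )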