
    Z jf)                         S SK Jr  SSKJrJrJrJr  SSKJr  SSK	J
r
  \" 5       (       a  S SKr\(       a  SSKJr  SS	KJr  \R                   " \5      r " S
 S\5      rg)    )TYPE_CHECKING   )is_accelerate_availableis_torch_availableis_torch_xpu_availablelogging   )HfQuantizer)get_module_from_nameN)PreTrainedModel)FineGrainedFP8Configc                      ^  \ rS rSr% SrSrS\S'   U 4S jrS rSS	S
\	S\
4S jrSS	S
\	SSS\4U 4S jjr  SS jrS rS r\S\
4S j5       r\S\
4S j5       rS rS rS rSrU =r$ )FineGrainedFP8HfQuantizer   zz
FP8 quantization implementation supporting both standard and MoE models.
Supports both e4m3fn formats based on platform.
Fr   quantization_configc                 (   > [         TU ]  " U40 UD6  g )N)super__init__)selfr   kwargs	__class__s      ڂ/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/quantizers/quantizer_finegrained_fp8.pyr   "FineGrainedFP8HfQuantizer.__init__   s    ,77    c                    [        5       (       d  [        S5      eU R                  R                  (       a  g [        R
                  R                  5       (       dR  [        5       (       dC  U R                  (       a'  [        R                  S5        SU R                  l        g [        S5      e[        R
                  R                  5       (       ab  [        R
                  R                  5       nUu  pEUS:  d  US:X  a4  US:  a.  [        R                  SU SU S	35        SU R                  l        g UR                  S
5      nUc  [        R                  S5        g [        U[        5      (       aT  U R                  (       d#  [!        U5      S:  a  SUR#                  5       ;   d  SUR#                  5       ;   a  [%        S5      eg g )NzMLoading an FP8 quantized model requires accelerate (`pip install accelerate`)zUsing FP8 quantized models requires a GPU or XPU, we will default to dequantizing the model to bf16 since no GPU or XPU is availableTzANo GPU or XPU found. A GPU or XPU is needed for FP8 quantization.   	   ziFP8 quantized models is only supported on GPUs with compute capability >= 8.9 (e.g 4090/H100), actual = `.z`. We will default to dequantizing the model to bf16. Feel free to use a different quantization method like bitsandbytes or torchao
device_mapzYou have loaded an FP8 model on CPU and have a CUDA or XPU device available, make sure to set your model on a GPU or XPU device in order to run your model. To remove this warning, pass device_map = 'cuda' or 'xpu'. r	   cpudiskzYou are attempting to load an FP8 model with a device_map that contains a cpu/disk device.This is not supported when the model is quantized on the fly. Please use a quantized checkpoint or remove the cpu/disk device from the device_map.)r   ImportErrorr   
dequantizetorchcudais_availabler   pre_quantizedloggerwarning_onceRuntimeErrorget_device_capabilityget
isinstancedictlenvalues
ValueError)r   argsr   compute_capabilitymajorminorr   s          r   validate_environment.FineGrainedFP8HfQuantizer.validate_environment   s   &((mnn##..zz&&((1G1I1I!!## [ 7;((3"#fgg::""$$!&!A!A!C-LE	uzeai####('5' 2Z[
 7;((3ZZ-
6
 
D))&&
Oa'Z..00Z..00 k  1 *r   modelr   
param_namereturnc                     SSK JnJn  [        X5      u  pg[	        XeU45      (       a  U R
                  (       d  US:X  a  ggg)Nr   )
FP8Experts	FP8LinearbiasFT)integrations.finegrained_fp8r<   r=   r   r-   r'   )r   r8   r9   r   r<   r=   moduletensor_names           r   param_needs_quantization2FineGrainedFP8HfQuantizer.param_needs_quantizationO   s;    H25Ef*566!![F%:r   paramztorch.Tensorc                 R   > U R                  X5      (       a  g[        TU ]	  XU5      $ )z4Return the element size (in bytes) for `param_name`.r	   )rB   r   param_element_size)r   r8   r9   rD   r   s       r   rF   ,FineGrainedFP8HfQuantizer.param_element_sizeZ   s)    ((;;w)%UCCr   c                     SSK Jn  U R                  XR                  R                  UR
                  5      U l        U" UU R                  U R                  U R                  S9ng )Nr   )replace_with_fp8_linear)modules_to_not_convertr   r'   )r?   rI   get_modules_to_not_convertr   rJ   _keep_in_fp32_modulesr'   )r   r8   r   rI   s       r   $_process_model_before_weight_loading>FineGrainedFP8HfQuantizer._process_model_before_weight_loadinga   s\    
 	K&*&E&E++BBED_D_'
# (#'#>#> $ 8 8,,	
r   c                 h    SUR                   R                  ;   a  SSSSSSSSSSSSSSS.nX!l        U$ )NQwen3colwiserowwise)z layers.*.self_attn.q_proj.weightz*layers.*.self_attn.q_proj.weight_scale_invz layers.*.self_attn.k_proj.weightz*layers.*.self_attn.k_proj.weight_scale_invz layers.*.self_attn.v_proj.weightz*layers.*.self_attn.v_proj.weight_scale_invz layers.*.self_attn.o_proj.weightz*layers.*.self_attn.o_proj.weight_scale_invzlayers.*.mlp.gate_proj.weightz'layers.*.mlp.gate_proj.weight_scale_invzlayers.*.mlp.up_proj.weightz%layers.*.mlp.up_proj.weight_scale_invzlayers.*.mlp.down_proj.weightz'layers.*.mlp.down_proj.weight_scale_inv)r   __name__base_model_tp_plan)r   config	text_plans      r   update_tp_plan(FineGrainedFP8HfQuantizer.update_tp_plans   sT    f&&///4=>G4=>G4=>G4=>G1:;D/89B1:;DI" )2%r   c                     gNT r   s    r   is_serializable)FineGrainedFP8HfQuantizer.is_serializable   s    r   c                     g)NFr[   r\   s    r   is_trainable&FineGrainedFP8HfQuantizer.is_trainable   s    r   c                     grZ   r[   r\   s    r   is_compileable(FineGrainedFP8HfQuantizer.is_compileable   s    r   c                     SSK Jn  U" U 5      $ )Nr   )Fp8Quantize)r?   rf   )r   rf   s     r   get_quantize_ops*FineGrainedFP8HfQuantizer.get_quantize_ops   s    >4  r   c                     SSK Jn  SSKJn  U R                  (       a-  U R
                  R                  (       a  U" / SQSU" U 5      /S9/$ / $ )Nr   )WeightConverterFp8Dequantize)zweight$weight_scale_invactivation_scaleweightsource_patternstarget_patterns
operations)core_model_loadingrj   r?   rl   r'   r   r#   )r   rj   rl   s      r   get_weight_conversions0FineGrainedFP8HfQuantizer.get_weight_conversions   sK    8@$":":"E"E  $W$, -d 34  	r   c           	      f   U R                   (       a  U R                  R                  (       d  XR                  5       -   $ SSKJnJn  SSKJn  U" SSS9nU/[        U5      -   n/ nU GH  n[        Xr5      (       d  UR                  U5        M'  UR                   Vs/ s H  oR                  S5      (       d  M  UPM     n	nU	(       a  U	 Vs/ s H  oS-   PM	     n
nU	 Vs/ s H  oS	[        S5      *  S
-   PM     nnUR                   Vs/ s H  oR                  S5      (       a  M  UPM     nnX-   U-   nU" U 5      /[        UR                  5      -   nU" UUR                   US9nUR                  U5        GM     UR#                  U R                  5       5        U$ s  snf s  snf s  snf s  snf )u]  When loading with ``dequantize=True``, attach an :class:`Fp8Dequantize` op to
every existing :class:`WeightConverter` so that per-block scales are folded into
the weight *before* any later merge/concat ops collapse the per-expert structure.

For each model-supplied converter that has a ``.weight`` source, we:
  1. anchor the existing weight patterns with ``$`` so they don't accidentally
     also match the ``.weight_scale_inv`` keys (the regex is searched, so the
     unanchored prefix would match both, sending scales to the wrong bucket);
  2. add anchored ``*.weight_scale_inv`` sources next to each weight pattern so
     the loader collects scale tensors alongside the weight tensors into the
     *same* converter bucket (both keys rewrite to the same target);
  3. prepend a fresh :class:`Fp8Dequantize` op so dequant runs first, before
     any merge/concat collapses the per-expert structure.

The generic ``weight$ + weight_scale_inv → weight`` converter from
:meth:`get_weight_conversions` is still appended at the end as a fallback for
plain ``nn.Linear`` weights with no model-specific converter.
r   )rj   WeightRenamingrk   z^(.+)\.scale$z\1.weight_scale_inv)rq   rr   z.weight$Nz.weight_scale_inv$rp   )r'   r   r#   ru   rt   rj   rx   r?   rl   listr-   appendrq   endswithr/   rs   _original_target_patternsextend)r   weight_conversionsrj   rx   rl   scale_renameupdatedconvpweight_sourcesanchored_weightscale_sourcesothernew_sourcesnew_opss                  r   update_weight_conversions3FineGrainedFP8HfQuantizer.update_weight_conversions   s   & ""t'?'?'J'J%(C(C(EEEH@ &6FXno*^d3E.FF&D d44t$)-)=)=W)=AIAVa)=NW4B"CNqs7N"CVd eVdQR#4c)n_!58L!LVd e$($8$8V$8q

9@U$8V-=E(./$t2GG&$/$($B$B&
 NN4 % '( 	t2245 X"C eVs$   F9FF$ F)F.(F.)rJ   )r8   r   )rS   
__module____qualname____firstlineno____doc__requires_calibration__annotations__r   r6   strboolrB   floatrF   rM   rW   r]   propertyr`   rc   rg   ru   r   __static_attributes____classcell__)r   s   @r   r   r      s    
 !//8/b	.? 	S 	_c 	D(9 Ds DSa Dfk D
 
$. d     !
 8 8r   r   )typingr   utilsr   r   r   r   baser
   quantizers_utilsr   r$   modeling_utilsr   utils.quantization_configr   
get_loggerrS   r(   r   r[   r   r   <module>r      sI      ` `  2 0@			H	%P Pr   
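

# Illustrative usage sketch, kept as a comment so importing this module has no side effects.
# It assumes the public `FineGrainedFP8Config` API exported by `transformers`; the model id
# below is a placeholder, not a real checkpoint.
#
#     from transformers import AutoModelForCausalLM, FineGrainedFP8Config
#
#     # Quantize a bf16/fp32 checkpoint to fine-grained FP8 on the fly (requires a GPU with
#     # compute capability >= 8.9, see `validate_environment` above).
#     quantization_config = FineGrainedFP8Config()
#     model = AutoModelForCausalLM.from_pretrained(
#         "org/model-id",  # placeholder
#         quantization_config=quantization_config,
#         device_map="cuda",
#     )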
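
# Conceptual sketch of the block-wise dequantization that `Fp8Dequantize` folds into the
# weights while loading (also commented out). The (128, 128) block size is an assumption
# based on the usual fine-grained FP8 layout, and `weight_scale_inv` is treated as the
# factor the FP8 weight is multiplied by; real checkpoints may differ.
#
#     import torch
#
#     def dequantize_block_fp8(weight_fp8, weight_scale_inv, block=(128, 128)):
#         # weight_fp8: (out_features, in_features) tensor in torch.float8_e4m3fn
#         # weight_scale_inv: (ceil(out / block[0]), ceil(in / block[1])) per-block scales
#         w = weight_fp8.to(torch.float32)
#         scale = weight_scale_inv.repeat_interleave(block[0], dim=0)
#         scale = scale.repeat_interleave(block[1], dim=1)
#         # Expand each per-block scale over its 128x128 tile, then trim to the weight shape.
#         return w * scale[: w.shape[0], : w.shape[1]]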