
    Z j(5                     T   S SK JrJr  S SKJrJr  SSKJrJr  SSK	J
r
Jr  SSKJr  \(       a  S SKJr  SS	KJr  \" 5       (       a  S S
Kr\(       d  S SKJr  O\r\R(                  " \5      rS\4S jrS r " S S\5      r " S S\5      rS\\R8                  \R:                  /S.0rg
)    )ABCabstractmethod)TYPE_CHECKINGAny   )is_torch_availablelogging)QuantizationConfigMixinQuantizationMethod   )get_module_from_name)


def get_keys_to_not_convert(model) -> list:
    """
    Function to automatically detect keys to not convert for usage like quantization. For example for CausalLM modules
    we may want to keep the lm_head in full precision for numerical stability reasons.
    """
    tied_keys = set()
    if len(model.all_tied_weights_keys) > 0:
        tied_keys = set(model.all_tied_weights_keys.values()) | set(model.all_tied_weights_keys.keys())

    # The last registered parameter usually belongs to the output head.
    last_module_key = {list(model.named_parameters())[-1][0]}

    # Collect every module name under which the output embedding module is registered.
    output_emb_module = model.get_output_embeddings()
    output_emb_keys = {
        name
        for name, module in model.named_modules()
        if output_emb_module is not None and id(module) == id(output_emb_module)
    }

    modules_to_not_convert = tied_keys | last_module_key | output_emb_keys
    # Some of the collected keys are parameter names; strip the trailing `.weight` to get module names.
    modules_to_not_convert = set(k.removesuffix(".weight") for k in modules_to_not_convert)
    return list(modules_to_not_convert)
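
# A minimal usage sketch (the checkpoint name is only an example): for most CausalLM
# models the returned list names the output head, e.g. something like ["lm_head"], so
# quantization backends can leave it in full precision.
#
#     from transformers import AutoModelForCausalLM
#
#     model = AutoModelForCausalLM.from_pretrained("gpt2")
#     print(get_keys_to_not_convert(model))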


def _assign_is_quantized(model):
    from ..modeling_utils import PreTrainedModel

    for module in model.modules():
        if isinstance(module, PreTrainedModel):
            module.config._is_quantized = True
          \ rS rSrSrSrS\4S jrS-S jrS\	\
\4   S	-  S\	\
\4   S	-  4S
 jrSSS\
SSS\4S jrS\	\
\\
-  4   S\	\
\\
-  4   4S jrSSS\
S\4S jrS rS rS rS rS.S/S jjrS/S jrS/S jrS rS.S jrS.S jrS\
S\
4S jr\   S0SSS\\
   S	-  S \\
   S	-  S!\4S" jj5       r \!S\4S# j5       r"\!S\4S$ j5       r#S% r$\%S& 5       r&\!\%S' 5       5       r'S( r(S) r)S* r*S+ r+S,r,g	)1HfQuantizerI   ac  
    """
    Abstract base class for HuggingFace quantizers. For now it supports quantizing HF transformers models for
    inference and/or quantization. This class is used only within `transformers.PreTrainedModel.from_pretrained`
    and cannot easily be used outside the scope of that method yet.

    Attributes:
        quantization_config (`transformers.utils.quantization_config.QuantizationConfigMixin`):
            The quantization config that defines the quantization parameters of the model that you want to quantize.
        requires_calibration (`bool`):
            Whether the quantization method requires the model to be calibrated before using it.
    """
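
    # A hedged sketch of a minimal concrete subclass (names and behavior are purely
    # illustrative, not a real integration): backends typically implement the two
    # `_process_model_*` hooks plus the `is_serializable` / `is_trainable` abstracts.
    #
    #     class MyQuantizer(HfQuantizer):
    #         requires_calibration = False
    #
    #         def _process_model_before_weight_loading(self, model, **kwargs):
    #             # swap float Linear layers for quantized ones on the meta device
    #             return model
    #
    #         def _process_model_after_weight_loading(self, model, **kwargs):
    #             return model
    #
    #         def is_serializable(self, safe_serialization=None):
    #             return True
    #
    #         @property
    #         def is_trainable(self):
    #             return False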
Fquantization_configc                     Xl         UR                  SS5      U l        U R                  (       d+  U R                  (       a  [	        SUR
                   S35      eg g )Npre_quantizedTzThe quantization method z does require the model to be pre-quantized. You explicitly passed `pre_quantized=False` meaning your model weights are not quantized. Make sure to pass `pre_quantized=True` while knowing what you are doing.)r5   popr7   requires_calibration
ValueErrorquant_method)selfr5   kwargss      r(   __init__HfQuantizer.__init__X   s^    #6 #ZZ>!!d&?&?*+>+K+K*L MN O  '@!r1   r   c                     U$ )a  

    def update_dtype(self, dtype: "torch.dtype") -> "torch.dtype":
        """
        Some quantization methods require the dtype of the model to be explicitly set to a target dtype. Override
        this method if you need to make sure that behavior is preserved.

        Args:
            dtype (`torch.dtype`):
                The input dtype that is passed in `from_pretrained`.
        """
        return dtype

    def update_device_map(self, device_map: dict[str, Any] | None) -> dict[str, Any] | None:
        """
        Override this method if you want to replace the existing device map with a new one. E.g. for bitsandbytes,
        since `accelerate` is a hard requirement, if no device_map is passed, the device_map is set to `"auto"`.

        Args:
            device_map (`dict[str, Any]`, *optional*):
                The device_map that is passed through the `from_pretrained` method.
        """
        return device_map
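
    # A hedged sketch of a typical override (not taken verbatim from any concrete
    # quantizer): default to `"auto"` placement when the caller passed nothing.
    #
    #     def update_device_map(self, device_map):
    #         if device_map is None:
    #             device_map = "auto"
    #         return device_map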

    def param_element_size(self, model: "PreTrainedModel", param_name: str, param: "torch.Tensor") -> float:
        """Size in bytes of one element of `param` (a `float`, since sub-byte schemes may pack several values per byte)."""
        return param.element_size()

    def adjust_max_memory(self, max_memory: dict[str, int | str]) -> dict[str, int | str]:
        """adjust max_memory argument for infer_auto_device_map() if extra memory is needed for quantization"""
        return max_memory

    def param_needs_quantization(self, model: "PreTrainedModel", param_name: str, **kwargs) -> bool:
        """
        Check whether a given param needs to be quantized.
        """
        return False

    def validate_environment(self, *args, **kwargs):
        """
        This method is used to check for potential conflicts with arguments that are passed in `from_pretrained`.
        You need to define it for all future quantizers that are integrated with transformers. If no explicit check
        is needed, simply return nothing.
        """
        return
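
    # A hedged sketch of a typical implementation (illustrative only): most backends
    # verify their runtime dependencies here and raise early.
    #
    #     def validate_environment(self, *args, **kwargs):
    #         if not is_torch_available():
    #             raise ImportError("Loading a quantized model requires torch to be installed.")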

    def update_tp_plan(self, config):
        """Updates the tensor-parallel plan for the scales."""
        return config

    def update_ep_plan(self, config):
        """Updates the expert-parallel plan for the scales."""
        return config

    def _process_model_before_weight_loading(self, model, **kwargs):
        return model

    def preprocess_model(self, model: "PreTrainedModel", **kwargs):
        """
        Setting model attributes and/or converting model before weights loading. At this point
        the model should be initialized on the meta device so you can freely manipulate the skeleton
        of the model in order to replace modules in-place. Make sure to override the abstract method
        `_process_model_before_weight_loading`.

        Args:
            model (`~transformers.PreTrainedModel`):
                The model to quantize
            kwargs (`dict`, *optional*):
                The keyword arguments that are passed along `_process_model_before_weight_loading`.
        """
        setattr(model, "is_quantized", True)
        setattr(model, "quantization_method", self.quantization_config.quant_method)
        if self.pre_quantized:
            self._convert_model_for_quantization(model)
        self._process_model_before_weight_loading(model, **kwargs)

    def _process_model_after_weight_loading(self, model, **kwargs):
        return model
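
    # The two hooks bracket weight loading inside `from_pretrained`; roughly (a
    # simplified sketch of the call order, not the literal loading code):
    #
    #     hf_quantizer.preprocess_model(model)    # swap in quantized modules on the meta device
    #     ... load / quantize the checkpoint weights ...
    #     hf_quantizer.postprocess_model(model)   # attach the config, finalize the quantized model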

    def postprocess_model(self, model: "PreTrainedModel", **kwargs):
        """
        Post-process the model after weight loading.
        Make sure to override the abstract method `_process_model_after_weight_loading`.

        Args:
            model (`~transformers.PreTrainedModel`):
                The model to quantize
            kwargs (`dict`, *optional*):
                The keyword arguments that are passed along `_process_model_after_weight_loading`.
        """
        model.config.quantization_config = self.quantization_config
        if self.pre_quantized and getattr(self.quantization_config, "dequantize", False):
            self.remove_quantization_config(model)
        else:
            _assign_is_quantized(model)
        return self._process_model_after_weight_loading(model, **kwargs)

    def remove_quantization_config(self, model):
        """
        Remove the quantization config from the model.
        """
        if hasattr(model, "hf_quantizer"):
            del model.hf_quantizer
        if hasattr(model.config, "quantization_config"):
            del model.config.quantization_config
        if hasattr(model, "quantization_method"):
            del model.quantization_method
        model.is_quantized = False

    def dequantize(self, model, dtype=None):
        """
        Potentially dequantize the model to retrieve the original model, with some loss in accuracy / performance.
        Note that not all quantization schemes support this.
        """
        if dtype is None:
            dtype = model.config.dtype
        model = self._dequantize(model, dtype=dtype)
        self.remove_quantization_config(model)
        return model

    def _dequantize(self, model, dtype=None):
        raise NotImplementedError(
            f"{self.quantization_config.quant_method} has no implementation of `dequantize`, please raise an issue on GitHub."
        )

    def get_param_name(self, param_name: str) -> str:
        """
        Override this method if you want to adjust the `param_name`.
        """
        return param_name

    @staticmethod
    def get_modules_to_not_convert(
        model: "PreTrainedModel",
        skip_modules: list[str] | None = None,
        keep_in_fp32_modules: list[str] | None = None,
        add_default_skips: bool = False,
    ):
        if skip_modules is None or add_default_skips:
            modules_to_not_convert = get_keys_to_not_convert(model)
        else:
            modules_to_not_convert = []

        if skip_modules is not None:
            modules_to_not_convert.extend(skip_modules)
        if keep_in_fp32_modules is not None:
            modules_to_not_convert.extend(keep_in_fp32_modules)

        # Deduplicate in case the same module was listed twice.
        modules_to_not_convert = list(set(modules_to_not_convert))
        return modules_to_not_convert

    @property
    def is_qat_trainable(self) -> bool:
        """Flag indicating whether the quantized model can carry out quantization aware training"""
        return False

    @property
    def is_compileable(self) -> bool:
        """Flag indicating whether the quantized model can be compiled"""
        return False
    S0 4$ )zcGet state dict and metadata. Useful when we need to modify a bit the state dict due to quantizationNrA   r}   s     r(   get_state_dict_and_metadata'HfQuantizer.get_state_dict_and_metadata	  s    Rxr1   c                     g rM   rA   r   s    r(   is_serializableHfQuantizer.is_serializable  s    "r1   c                     g rM   rA   r   s    r(   is_trainableHfQuantizer.is_trainable  s    r1   c                    UR                  5        H  u  p#UR                  R                  nU[        ;   d  M'  U R                  R
                  [        U   S   ;   d  MM  [        R                  " S5         [        X5      u  pR[        U   S   " UR                  R                  5       5      UR                  U'   S S S 5        M     g ! , (       d  f       M  = f)Nquantization_methodsmetamodule_name)r   	__class____name__!MODULES_TO_PATCH_FOR_QUANTIZATIONr5   r;   torchdevicer   r.   get_text_config_modules)r<   r   r#   r$   module_class_nameparent_modules         r(   rn   +HfQuantizer._convert_model_for_quantization  s    !//1LD & 0 0 9 9 $EE((5545FGH^_` \\&)*>u*K'M3TUf3ghu3v4464M**40 *) 2 *)s   4AC
C	c                 F    [        U R                  R                   S35      e)Nz1 is not available yet and will be supported soon.r   r   s    r(   get_quantize_opsHfQuantizer.get_quantize_ops!  s'    !''4455fg
 	
r1   c                     / $ rM   rA   r   s    r(   get_weight_conversions"HfQuantizer.get_weight_conversions&  s    	r1   c                 &    XR                  5       -   $ )uH  Give the quantizer a chance to rewrite the weight conversion pipeline.

    def update_weight_conversions(self, weight_conversions):
        """Give the quantizer a chance to rewrite the weight conversion pipeline.

        Loading runs ``renamings → converters → (dequant → merge → concat)``. Dequant
        has to happen *before* any merge/concat op because those operations aren't
        aware of per-block scales, so the per-expert (weight, scale) pairs need to be
        collapsed into full-precision tensors first. Subclasses (e.g. the FP8
        quantizer in ``dequantize=True`` mode) override this to inject a dequantize
        op at the start of each model-provided :class:`WeightConverter` and attach the
        matching scale source patterns. Default: no-op.
        """
        return weight_conversions + self.get_weight_conversions()
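
    # A hedged sketch of an override (`WeightConverter` internals are simplified and
    # `DequantizeOp` is hypothetical): prepend a dequantize step to every converter so
    # that downstream merge/concat ops only ever see full-precision tensors.
    #
    #     def update_weight_conversions(self, weight_conversions):
    #         for converter in weight_conversions:
    #             converter.operations.insert(0, DequantizeOp())  # hypothetical op / attribute
    #         return weight_conversions + self.get_weight_conversions()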


class SequentialLlama4TextExperts(ModuleList):
    """
    A module that implements a compressed version of a list of expert modules.
    This is specifically designed to work with Llama4TextExperts in MoE layers.
    """

    def __init__(self, config):
        from transformers.models.llama4.modeling_llama4 import Llama4TextMLP

        super().__init__([Llama4TextMLP(config) for _ in range(config.num_local_experts)])
        self.num_experts = config.num_local_experts

    def forward(self, hidden_states: "torch.Tensor") -> "torch.Tensor":
        # Tokens arrive flattened; regroup them per expert, run each expert MLP on its
        # own slice, and return the per-expert outputs stacked along the first dim.
        hidden_states = hidden_states.reshape(self.num_experts, -1, hidden_states.shape[-1])
        routed_out = torch.zeros_like(hidden_states)
        for expert_idx in range(self.num_experts):
            routed_out[expert_idx] = self[expert_idx](hidden_states[expert_idx])
        return routed_out


MODULES_TO_PATCH_FOR_QUANTIZATION = {
    "Llama4TextExperts": {
        "module_name": SequentialLlama4TextExperts,
        "quantization_methods": [QuantizationMethod.COMPRESSED_TENSORS, QuantizationMethod.BITS_AND_BYTES],
    }
}
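
# A hedged sketch of how the registry is meant to be extended (the entry below is
# hypothetical, shown only to illustrate the expected keys): during
# `_convert_model_for_quantization`, a module is swapped out when its class name is a
# key here and the active quant method is listed under "quantization_methods".
#
#     MODULES_TO_PATCH_FOR_QUANTIZATION["MyMoEExperts"] = {
#         "module_name": SequentialMyMoEExperts,  # hypothetical replacement module
#         "quantization_methods": [QuantizationMethod.BITS_AND_BYTES],
#     }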