
    Z j.%                        S SK Jr  S SKJr  SSKJrJr  SSKJr  SSK	J
r
  SSKJr  \" 5       (       a  S S	Kr\(       a  SS
KJr  \R                   " \5      r " S S\
5      rg	)    )annotations)TYPE_CHECKING   )is_torch_availablelogging)
SinqConfig   )HfQuantizer)get_module_from_nameN)PreTrainedModelc                     ^  \ rS rSr% SrSrS\S'   S\S'   SU 4S jjrSS	 jr\	SS
 j5       r
S rSS jrSS jrSS jrSS jrS rS r S   SS jjr  SS jrSrU =r$ )SinqHfQuantizer!   a|  
    HF v5 quantizer for SINQ.

    Modes:
      - method="sinq" (default):
          * weight-only SINQ
          * param-level ConversionOps (`SinqQuantize`) during load for pure language models
            (each Linear.weight is turned into a SINQLinear module)
          * module-level quantization after load for multimodal models
      - method="asinq":
          * A-SINQ (activation-aware) SINQ quantization
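
    A minimal usage sketch (hedged: `SinqConfig` is assumed to be importable from the
    top-level `transformers` namespace like the other quantization configs, and the
    field values below are illustrative, not defaults):

    ```python
    from transformers import AutoModelForCausalLM, SinqConfig

    quantization_config = SinqConfig(nbits=4, group_size=64, method="sinq")
    model = AutoModelForCausalLM.from_pretrained(
        "org/model-id",  # hypothetical checkpoint id
        quantization_config=quantization_config,
        device_map="auto",
    )
    ```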
Tbool requires_parameters_quantizationr   quantization_configc                D   > [         TU ]  " U40 UD6  S U l        SU l        g )NF)super__init___normalized_device_str_do_param_level_sinq)selfr   kwargs	__class__s      w/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/quantizers/quantizer_sinq.pyr   SinqHfQuantizer.__init__2   s&    ,7726#*/!    c                    gNT r   s    r   is_serializableSinqHfQuantizer.is_serializable8   s    r   c                    gr   r    r!   s    r   is_trainableSinqHfQuantizer.is_trainable;   s    r   c                    Uca  [         R                  R                  5       (       a!  S[         R                  R                  5       0nOSS0n[        R                  SU S35        U$ )N cpuz:The device_map was not initialized. Setting device_map to zJ. If you want to use the model for inference, please set device_map='auto')torchcudais_availablecurrent_deviceloggerinfo)r   
device_maps     r   update_device_map!SinqHfQuantizer.update_device_map?   se    zz&&(( %**";";"=>
 %[
KK))3 5[[
 r   c                8    Uc  [         R                  nXl        U$ N)r*   bfloat16dtype)r   r6   s     r   update_dtypeSinqHfQuantizer.update_dtypeL   s    =NNE
r   c                   SSK Jn  U" 5       (       d  [        S5      e[        R                  R                  5       (       d  [        R                  S5        UR                  S5      n[        U[        5      (       a@  [        UR                  5       5      n[        U5      S:  a  [        S[        U5       S35      eU R                   R"                  S	:X  a  U R$                  (       d  ['        S
5      eg g )Nr   )is_sinq_availablezMThe 'sinq' package is not installed. Please install it with: pip install sinqzNo CUDA device is available. Quantization and inference will run on the CPU. Please note that this will significantly slow down inference speed and increase quantization time.r0   r	   zkSinqHfQuantizer: multi-GPU device_map detected, but SINQ currently supports only a single CUDA device. Got z. Please use device_map=None.asinqzYou are using `method='asinq'` in the quantization config. Right now the calibrated version of SINQ is not supported in Hugging Face, please refer and use the official SINQ repository `to quantize a model with this method. )utilsr:   ImportErrorr*   r+   r,   r.   warningget
isinstancedictsetvalueslenRuntimeErrorsortedr   methodpre_quantized
ValueError)r   argsr   r:   r0   device_map_valuess         r   validate_environment$SinqHfQuantizer.validate_environmentR   s    - ""mnnzz&&((NN B ZZ-
j$'' #J$5$5$7 8$%)"##)*;#<"==Z\ 
 ##**g5d>P>P:  ?Q5r   c                    SSK Jn  UR                  nU" [        UR                  5      UR
                  b  [        UR
                  5      OSSSSS[        UR                  5      US9$ )z9
        Build the dict that SINQLinear expects as quant_config.
        """
        from sinq.sinqlinear_hf import sinq_base_quant_config as sinq_base_quant_config_fn

        method = cfg.method
        return sinq_base_quant_config_fn(
            nbits=int(cfg.nbits),
            group_size=int(cfg.group_size) if cfg.group_size is not None else None,
            quant_zero=False,
            quant_scale=False,
            view_as_float=False,
            axis=1,
            tiling_mode=str(cfg.tiling_mode),
            method=method,
        )

    def param_needs_quantization(self, model: "PreTrainedModel", param_name: str, **kwargs) -> bool:
        """
        Called per-parameter to decide whether to run `SinqQuantize` on it.

        - If `self.pre_quantized`, we do *not* quantize again (handled by SinqDeserialize instead).
        - For method="asinq": return False (A-SINQ is not supported in Hugging Face).
        - For method="sinq": True only for SINQLinear.weight not in modules_to_not_convert.

        Note: After _process_model_before_weight_loading(), the modules are already SINQLinear,
        not nn.Linear. We check for SINQLinear modules that are not yet quantized (ready=False).
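
        Example (illustrative module path, assuming a standard decoder layout):

        ```python
        # q_proj was swapped to an empty SINQLinear before loading, so ready=False:
        quantizer.param_needs_quantization(model, "model.layers.0.self_attn.q_proj.weight")  # True
        # Not a "weight" tensor -> False:
        quantizer.param_needs_quantization(model, "model.layers.0.self_attn.q_proj.bias")  # False
        # Module kept as nn.Linear (e.g. listed in modules_to_not_convert) -> not SINQLinear -> False:
        quantizer.param_needs_quantization(model, "lm_head.weight")  # False
        ```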
        """
        from sinq.sinqlinear_hf import SINQLinear

        if self.pre_quantized:
            return False
        if self.quantization_config.method == "asinq":
            return False
        if not self._do_param_level_sinq:
            return False

        module, tensor_name = get_module_from_name(model, param_name)
        if tensor_name != "weight":
            return False

        # Quantize only SINQLinear modules that have not been packed yet.
        is_sinq = isinstance(module, SINQLinear)
        is_ready = getattr(module, "ready", False)
        result = is_sinq and not is_ready
        return result

    def get_quantize_ops(self):
        """
        Return the ConversionOps used for param-level quantization (Sinq).
        The actual SINQLinear construction is in integrations/sinq.py.
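
        This method only constructs the op; the loading machinery later calls its
        `convert()` on each parameter approved by `param_needs_quantization` (see the note
        in `_process_model_before_weight_loading`).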
        """
        from ..integrations.sinq import SinqQuantize

        return SinqQuantize(self)

    def get_weight_conversions(self):
        """
        If `pre_quantized=True`, interpret a checkpoint produced by SINQLinear.state_dict:

            <prefix>.W_q
            <prefix>.bias
            <prefix>.meta

        via a WeightConverter + SinqDeserialize so that we reconstruct a SINQLinear
        module instead of a plain nn.Linear.
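
        For instance, with a hypothetical prefix `model.layers.0.mlp.up_proj`, the checkpoint
        entries `...up_proj.W_q`, `...up_proj.meta` and `...up_proj.bias` are matched by the
        converter's source patterns and handed to `SinqDeserialize`, rather than being loaded
        as a plain `...up_proj.weight`.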
        """
        from ..core_model_loading import WeightConverter

        if self.pre_quantized:
            from ..integrations.sinq import SinqDeserialize

            return [
                WeightConverter(
                    source_patterns=[".W_q", ".meta", ".bias"],
                    target_patterns=".weight",
                    operations=[SinqDeserialize(self)],
                )
            ]
        return []

    def _process_model_before_weight_loading(
        self,
        model: "PreTrainedModel",
        device_map=None,
        keep_in_fp32_modules: list[str] | None = None,
        **kwargs,
    ):
        """
        Called on the meta-initialized model, before loading any weights.

        For SINQ, we replace nn.Linear modules with empty SINQLinear modules here.
        The actual quantization happens later in SinqQuantize.convert() when weights are loaded.
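
        Schematically (hypothetical module path; only the module class changes at this
        point, no weights are quantized yet):

        ```python
        # before: model.model.layers[0].self_attn.q_proj -> torch.nn.Linear (on meta device)
        # after:  model.model.layers[0].self_attn.q_proj -> SINQLinear (empty, ready=False)
        ```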
        """
        from ..integrations.sinq import replace_with_sinq_linear

        self.modules_to_not_convert = self.get_modules_to_not_convert(
            model, self.quantization_config.modules_to_not_convert or [], keep_in_fp32_modules
        )

        # Param-level SINQ only applies to fresh (non-prequantized) models quantized with method="sinq".
        self._do_param_level_sinq = self.quantization_config.method == "sinq" and not self.pre_quantized

        # In param-level mode the quant dict is built per parameter by SinqQuantize instead.
        sinq_quant_dict = (
            None if self._do_param_level_sinq else self._build_sinq_quant_dict(self.quantization_config)
        )

        if isinstance(device_map, dict):
            first_device = next(iter(device_map.values()))
            if isinstance(first_device, int):
                device_str = f"cuda:{first_device}"
            else:
                device_str = str(first_device)
        else:
            device_str = "cuda:0" if torch.cuda.is_available() else "cpu"

        model = replace_with_sinq_linear(
            model,
            modules_to_not_convert=self.modules_to_not_convert,
            quant_config=sinq_quant_dict,
            compute_dtype=self.dtype,
            device=device_str,
            pre_quantized=self.pre_quantized,
        )

    def _process_model_after_weight_loading(self, model: "PreTrainedModel", **kwargs):
        """
        Called after *all* weights have been loaded.

        For SINQ:
        1. Move non-SINQLinear modules to GPU (embeddings, norms, lm_head, etc.)
           - SINQLinear modules already have GemLite buffers on GPU
           - We skip moving SINQLinear's W_q/meta to avoid memory duplication
        2. Patch HF save/load methods for SINQ serialization
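
        Sketch of the effect of step 2 (assuming the `sinq.hf_io` patch hooks
        `save_pretrained`):

        ```python
        model.save_pretrained("path/to/sinq-model")
        # each quantized Linear is stored as <prefix>.W_q / <prefix>.meta / <prefix>.bias
        ```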
        """
        from sinq.hf_io import patch_hf_pretrained_io

        # Install the SINQ-aware save/load hooks on the Hugging Face pretrained I/O methods.
        patch_hf_pretrained_io()
        return model