
    Z jP                    `   % S SK r S SKrS SKrS SKrS SKrS SKrS SKrS SKrS SKrS SK	J
r
  S SK Jr  S SKJrJr  S SKJr  S SKJrJr  S SKJrJr  S SKJr  S S	KJr  S S
KJrJrJrJrJr  S SK J!r!  S SK"r"S SK#J$r$J%r%J&r&  S SK'J(r(  S SK)J*r*  S SK+J,r-  S SK+J.r/  S SK"J0r0J1r1  S SK2J3r3  S SK4J5r5  SSK6J7r8  SSK9J:r:  SSK;J<r<  SSK=J>r>J?r?J@r@JArA  SSKBJCrC  SSKDJErE  SSKFJGrGJHrH  SSKIJJrJJKrKJLrLJMrMJNrN  SSKOJPrPJQrQJRrRJSrSJTrTJUrUJVrV  SSKWJXrX  SSKYJZrZ  SS K[J\r\  SS!K]J^r^  SS"K_J`r`  SS#KaJbrb  SS$KcJdrdJere  SS%KfJgrg  SS&KhJiri  SS'KjJkrk  SS(KlJmrm  SS)KnJoroJprpJqrqJrrrJsrsJtrtJuru  SS*KvJwrw  SS+KxJyryJzrzJ{r{J|r|  SS,K}J~r~  SS-KJrJr  SS.KJr  SS/KJr  SS0KJr  SS1KJr  SS2KJr  SS3KJrJrJrJrJrJrJrJrJrJrJrJrJrJrJrJrJrJrJrJrJr  SS4KJrJrJr  SS5KJrJrJr  SS6KJrJrJrJrJr  SS7KJrJr  SS8KJrJr  SS9KJr  \" 5       (       a  S S:KJr  S S;KJr  \(       a  SS<KJr  \"R                  GR}                  5       r\" 5       (       a7  S SKJs  J"r  S S=KJr  \(GR                  " \5      \(GR                  " S>5      :  rOS?r\GR                  " \5      r\GR                  GR                  S@SA5      GR                  5       r\GR                  GR                  SBSA5      GR                  5       r\" SCSDSE9rS?qS?q\" SFSG9 " SH SI5      5       rSJ\4SK jrSJ\4SL jrSM r\SN 5       r\SO 5       r\SSP\"GR                  SQ\S-  4SR jj5       rSS rST r\"GR                  \"GR                  \"GR                  \"GR                  \"GR                  \"GR                  \"GR                  \"GR                  \"GR                  \"GR                  \"GR                  \"GR                  \"GR                  \"GR                  \"GR                  SU.rSVSWSJ\4SX jr   SSY\\GR                  -  SZ\\"GR                  -  S[\S\\S-  SJ\\\"R`                  4   4
S] jjrS^\"R`                  SJ\4S_ jrS`\1GR                  SJ\\   4Sa jrSb\\\      Sc\\\"R`                  4   SJ\\\\      \\   4   4Sd jrSb\\\      Sc\\\"R`                  4   SJ\\\\      \\\      4   4Se jrSc\\\"R`                  4   SfSDSJ\\\"R`                  4   4Sg jrSfSDSh\S^\"R`                  4Si jrSSj\Sk\S-  SJ\4Sl jjr   SSm\\GR                  -  S-  Sk\S-  Sn\S-  So\S-  Sp\S-  Sq\Sr\S-  Ss\S-  St\S-  SJ\\\   S-  \S-  4   4Su jjGr  SSP\\"GR                  -  \-  S-  Sv\\   S-  Sw\:Sx\S-  Sc\S-  S[\Sy\S-  SJ\\:\"GR                  4   4Sz jjGr " S{ S|5      Gr " S} S~5      Gr " S SD\1GR                  G\G\\\J5      Gr\" G\GR
                  5      G\Gl        G\GR
                  GR                  b;  G\GR
                  GR                  GR                  SfSSS9G\GR
                  Gl        \SSfG\S\SJG\4S jj5       Gr\SSf\1GR                  S\SJ\1GR                  4S jj5       GrSSf\1GR                  S\SJ\1GR                  4S jjGrS\\-  \"GR                  -  SJ\4S jGr	 SSfG\S\Sy\S-  4S jjGr
SfG\S\Sy\S-  4S jGr " S S\5      GrG\" 5       GrG\G\S'    " S SG\5      Grg)    N)abstractmethod)defaultdict)CallableIterator)contextmanager)	dataclassfield)partialwraps)cycle)Thread)TYPE_CHECKINGAnyTypeVarget_type_hintsoverload)
is_zipfile)create_repois_offline_mode"split_torch_state_dict_into_shards)version)	safe_open)load)	save_file)Tensornn)constraints)
checkpoint   )initialization)PreTrainedConfig)get_model_conversion_mapping)WeightConverterWeightRenaming$convert_and_load_state_dict_in_modelrevert_weight_conversion)DistributedConfig)custom_object_save)CompileConfigGenerationConfig)PeftAdapterMixindeepspeed_confighub_kernelsis_deepspeed_zero3_enabledis_fsdp_enabled)_get_device_mapaccelerate_disk_offloadaccelerate_dispatchcheck_and_set_device_mapexpand_device_map
get_deviceload_offloaded_parameter)!_load_state_dict_into_zero3_model)eager_paged_attention_forward)ALL_FP8_EXPERTS_FUNCTIONS)flash_attention_forward)paged_attention_forward)flex_attention_forward)allow_all_hub_kernels	is_kernel)ALL_EXPERTS_FUNCTIONS)maybe_load_adapters)sdpa_attention_forward)sdpa_attention_paged_forward)ALL_PARALLEL_STYLES_get_parameter_tp_plandistribute_modelgather_state_dict_for_saveinitialize_tensor_parallelismshard_and_distribute_moduleverify_tp_plan)LOSS_MAPPING)$FLASH_ATTENTION_COMPATIBILITY_MATRIXFLASH_ATTN_KERNEL_FALLBACKlazy_import_flash_attention!lazy_import_paged_flash_attention)ROPE_INIT_FUNCTIONS)apply_patchespatch_output_recorders)id_tensor_storage)HfQuantizer)get_hf_quantizer)get_module_from_name)auto_conversion)ADAPTER_SAFE_WEIGHTS_NAMEDUMMY_INPUTSSAFE_WEIGHTS_INDEX_NAMESAFE_WEIGHTS_NAMEWEIGHTS_INDEX_NAMEWEIGHTS_NAMEContextManagersKernelConfigPushToHubMixincached_filecheck_torch_load_is_safe	copy_funchas_fileis_accelerate_availableis_bitsandbytes_availableis_env_variable_trueis_kernels_availableis_torch_flex_attn_availableis_torch_npu_availableis_torch_xpu_availablelogging)GeneralInterfaceis_flash_attention_requestedsplit_attention_implementation)DownloadKwargscreate_and_tag_model_cardget_checkpoint_shard_files)is_flash_attn_greater_or_equal#is_huggingface_hub_greater_or_equalis_sagemaker_mp_enabledis_torch_cuda_available
is_tracing)LoadStateDictInfolog_state_dict_report)_CAN_RECORD_REGISTRYOutputRecorder)QuantizationMethod)add_hook_to_module)extract_model_from_parallel)DeviceMeshLike)__version__z1.10FXLA_USE_BF160XLA_DOWNCAST_BF16SpecificPreTrainedModelTypePreTrainedModel)boundT)frozenc                   p   \ rS rSr% SrSr\S-  \S'   \" \	S9r
\	S-  \S'   Sr\S-  \S'   Sr\\S	'   Sr\S-  \S
'   Sr\S-  \S'   Sr\S-  \S'   Sr\\S'   Sr\R&                  S-  \S'   \" \S9r\\S'   Sr\S-  \S'   SrS\S'   Sr\\S'   Sr\\\-     S-  \S'   Sr\S-  \S'   \S\4S j5       r Sr!g)LoadStateDictConfig   zY
Config for loading weights. This allows bundling arguments that are just
passed around.
Npretrained_model_name_or_path)default_factorydownload_kwargsuse_safetensorsFignore_mismatched_sizessharded_metadata
device_mapdisk_offload_folderoffload_buffersdtype
dtype_planhf_quantizerDeviceMeshLike | Nonedevice_meshTweights_onlyweight_mappingdisable_mmapreturnc                     U R                   S L$ N)r   selfs    l/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/modeling_utils.pyis_quantized LoadStateDictConfig.is_quantized   s      ,,     )"__name__
__module____qualname____firstlineno____doc__r   str__annotations__r	   ro   r   r   boolr   r   dictr   r   r   r   torchr   r   rS   r   r   r   listr#   r$   r   propertyr   __static_attributes__r   r   r   r   r      s   
 15!3:4-2>-RO^d*R#'OTD['$)T)$(dTk("Jt"&*t*!OT! $E5;;$T2J2'+L+$++/K(/L$DHND>9:TAH $L$+$-d - -r   r   r   c                      [         =(       a?    [        [        R                  S5      =(       a    [        R                  R	                  5       $ )Nis_initialized)_torch_distributed_availablehasattrr   distributedr   r   r   r   !_is_torch_distributed_initializedr      s7    $ 	/E%%'78	/,,.r   c                      [        5       (       a  [        [        R                  S5      (       d  g[        R                  R	                  5       $ )Nget_world_sizer   )r   r   r   r   r   r   r   r   !_get_torch_distributed_world_sizer      s6    ,..ge>O>OQa6b6b++--r   c                  |    [        5       =(       a,    [        [        R                  R	                  SS5      5      S:H  $ )N
LOCAL_RANKz-1r   )r   intosenvirongetr   r   r   is_local_dist_rank_0r      s+    ,._3rzz~~lTX7Y3Z^_3__r   c               #   ,   #    Sq  S v   Sq g ! Sq f = f7fNTF)_is_quantizedr   r   r   set_quantized_stater      s      M    c               #   ,   #    Sq  S v   Sq g ! Sq f = f7fr   )_is_ds_init_calledr   r   r   set_zero3_stater      s!      #"Ur   r   model_class_namec              #   *  #    U R                   (       d  Ub	  U SU  S3nOSU  S3n[        U5      e[        R                  " 5       n [        R                  " U 5        Sv   [        R                  " U5        g! [        R                  " U5        f = f7f)z
Locally change the torch default dtype to `dtype`, and restore the old one upon exiting the context.
If `model_class_name` is provided, it's used to provide a more helpful error message if `dtype` is not valid.
Nz% cannot be instantiated under `dtype=z$` as it's not a floating-point dtypezCannot set `z7` as torch's default as it's not a floating-point dtype)is_floating_point
ValueErrorr   get_default_dtypeset_default_dtype)r   r   error_messageoriginal_dtypes       r   local_torch_dtyper      s      ""'#$$I%Ptu  +5'1hiM'',,.N0&//s   ABA8 !B8BBc                      [         R                  " / 5      R                  n [         R                  " 5       nX:X  a  U[         R                  " S5      :w  a  U$ gU $ )z
Test if a device context manager is currently in use, or if it is not the case, check if the default device
is not "cpu". This is used to infer the correct device to load the model on, in case `device_map` is not provided.
cpuN)r   tensordeviceget_default_device)device_in_contextdefault_devices     r   *get_torch_context_manager_or_global_devicer     sM    
 R(//--/N*U\\%00!!r   c                 x   U R                  5        H\  nUR                  5       (       d  M  S[        UR                  5      ;  d  M5  S[        UR                  5      ;  d  MP  UR                  s  $    [	        U 5      S:X  a  [
        R                  $ [        [        U R                  5       5      5      R                  $ )zl
Returns the first found floating dtype in `state_dict` if there is one, otherwise returns the first dtype.
float8_float4_r   )	valuesr   r   r   lenr   float32nextiter)
state_dictts     r   get_state_dict_dtyper     s        Yc!''l%ByX[\]\c\cXdGd77N ! :!}}Z&&()*000r   )BOOLU8I8I16U16F16BF16I32U32F32F64I64U64F8_E4M3F8_E5M2pathzstr | os.PathLikec                    [         R                  R                  S5      (       d  g [        R                  R                  [        R                  " U 5      5      n[        SSS9 n[        S S U 5        5       S S	S
9nSSS5        W H9  u  pEX:X  d*  UR                  UR                  S5      S-   5      (       d  M4  US:H  s  $    g! , (       d  f       NN= f! [        [        4 a     gf = f)a  True if `path` lives on an hf-mount FUSE filesystem (device string 'hf-mount').

hf-mount's mmap + readahead interaction deadlocks under parallel page-faults,
so callers should load the file into memory instead. Linux-only; returns False
on other platforms.
linuxFz/proc/mountsutf-8encodingc              3   X   #    U  H   n[        U5      S :  d  M  US   US   4v   M"     g7f)   r   r   Nr   ).0ps     r   	<genexpr>"_is_on_hf_mount.<locals>.<genexpr>A  s*     N'>!#a&A+!A$!'>s   **c              3   @   #    U  H  oR                  5       v   M     g 7fr   )split)r   ls     r   r   r  A  s     '>2a		2s   c                     [        U S   5      $ )Nr   r   )es    r   <lambda>!_is_on_hf_mount.<locals>.<lambda>B  s    c!A$ir   T)keyreverseN/zhf-mount)sysplatform
startswithr   r   realpathfspathopensortedrstripOSErrorr   )r   realfhentriesdevmps         r   _is_on_hf_mountr  4  s     <<""7++ww		$0.73rN'>2'>N'G 4 GCzT__RYYs^c-ABBj(( 
  43 Z  s5   =C $C?:C =C C 
CC C+*C+checkpoint_filemap_locationr   r   c                    [         R                  " U 5      nUc  [        U5      nUR                  S5      (       Ga8  U(       ao  US:w  ai  [	        US5       n[        UR                  5       5      nSSS5        US:w  a3  WR                  5        VVs0 s H  u  pxXxR                  U5      _M     nnnW$ [        USS9 n	0 nU	R                  5        H  nUS:X  ak  U	R                  U5      n
U
R                  5       nU[        ;   a
  [        U   nO[        SU 35      e[        R                   " U
R#                  5       USS	9Xg'   Mt  U	R%                  U5      R                  U5      Xg'   M     UsSSS5        $ U(       a
  ['        5         0 nUS:w  a  [)        U5      (       a  S
S0n[        R*                  " U4XS.UD6$ ! , (       d  f       GNR= fs  snnf ! , (       d  f       Nl= f)aC  
Reads a `safetensor` or a `.bin` checkpoint file. We load the checkpoint on "cpu" by default.

When `disable_mmap` is True, safetensors files are read fully into memory instead of
being memory-mapped. When `disable_mmap` is None (default), it is auto-detected to True
on hf-mount FUSE filesystems (see `_is_on_hf_mount`).
N.safetensorsmetarbr   pt)	frameworkz)Cannot load safetensors of unknown dtype )sizer   r   mmapTr  r   )r   r  r  endswithr  _safe_load_bytesreaditemstor   keys	get_slice	get_dtypestr_to_torch_dtyper   r   empty	get_shape
get_tensorra   r   r   )r  r  r   r   checkpoint_path_fhr   kvf_slicek_dtyper   
extra_argss                 r   load_state_dictr:  M  s    ii0O&7//LF2ot,-chhj9
 -u$@J@P@P@RS@Ral!33@R
S$71JVVX6)[[^F$..0G"44 27 ;(+TU\T])^__$)KKV5E5E5Gu]c$dJM$%LLO$6$6|$DJM   87   "Jv*_"="=d^
::ojLj_ijj9 -, T77s   F:G<B.G:
G	
G r   c                     U R                  5       (       a5  U R                  S5      S   R                  5       U R                  5       -   nU$ U R                  5       nU$ )N)nelementviewdata_ptrelement_size)r   stops     r   _end_ptrrB    sR    {{2r"++-0C0C0EE K  Kr   modulec           	          / nU R                  5        HX  u  p#[        US0 5      =(       d    0 nUR                  UR                  5        Vs/ s H  oR(       a  U SU 3OUPM     sn5        MZ     U$ s  snf )N_tied_weights_keys.)named_modulesgetattrextendr+  )rC  tied_weight_keysname	submoduletiedr4  s         r   _get_tied_weight_keysrN    ss    "$!//1y"6;Artyy{ S{!$D61#A!={ ST 2  !Ts   A1
tensorsr   c                 @   / nU  H  n[        U5      S:  a  UR                  U5        M%  / nU H2  nX   nUR                  UR                  5       [        U5      U45        M4     UR	                  5         US   u  pxn	UR                  U	15        USS   H4  u  pnX:  a  UR                  U15        OUS   R                  U5        UnM6     M     / n/ nU HD  n [        U 5      S:X  a!  UR                  U R                  5       5        M3  UR                  U 5        MF     X4$ )Nr   r   r   r<  )r   appendr?  rB  sortaddpop)rO  r   filtered_tensorssharedareasrK  r   _	last_stop	last_namestartrA  disjoint_tensorsshared_tensorss                 r   _find_disjointr^    s    v;?##F+D%FLL&//+Xf-=tDE  	

"'(i,!&qrE! ''/ $((.I "+ & N#w<1##GKKM2!!'*	 $
 ++r   c                 j   / n/ nU  H  n[        U5      S:  a  M  [        R                  " [        5      nU H@  nX   nUR                  UR                  5       [        U5      4nXX   R                  U5        MB     [        U5      S:X  a  UR                  U5        M  UR                  U5        M     X#4$ )Nr   r   )	r   collectionsr   setr   r?  rB  rS  rQ  )	rO  r   r]  	identicalrV  rW  rK  r   areas	            r   _find_identicalrd    s     N "Iv;?'',D%FMM6??#4hv6FGDKOOD!  u:?V$!!&)  $$r   modelc                   ^ [         R                  " [        5      nU R                  5        H  u  mn[	        U[
        R                  5      (       d  U[        U5         R                  T5        MD  UR                  R                  S:X  a0  UR                  T5      nU[        U5         R                  T5        M  U[        U5         R                  T5        M     UR                  5        VVs0 s H  u  pE[        U5      S:  d  M  XE_M     nnn[        [        U5      5      n/ n[        5       n	Ubx  UR!                  5        Hd  nSn
[#        U5       HP  m[%        U4S jU 5       5      nU(       d  M!  TU ;   d  M)  U
S-  n
U
[        U5      :  d  M?  U	R'                  T5        MR     Mf     [)        UR!                  5       U 5      u  pU H  mU T   R+                  5       U T'   M     [-        X5      u  pU HS  nUR/                  U	5      nU H  mU T	 M     UR1                  U	5      n[        U5      S:  d  MB  UR                  U5        MU     U(       a  UR3                  U5        [        U5      S:  a  [5        SU SU S35      eU $ s  snnf )aH  
Remove all tied weights from the given `state_dict`, making sure to keep only the main weight that `model`
will expect when reloading (even if we now tie weights symmetrically, it's better to keep the intended one).
This is because `safetensors` does not allow tensor aliasing - so we're going to remove aliases before saving.
r  r   r   c              3   R   >#    U  H  n[         R                  " UT5      v   M     g 7fr   research)r   patrK  s     r   r   6remove_tied_weights_from_state_dict.<locals>.<genexpr>  s!     %fFesbiiT&:&:Fes   $'z8The weights trying to be saved contained shared tensors z\ which are not properly defined. We found all the potential target tied weights keys to be: zo.
This can also just mean that the module's tied weight keys are wrong vs the actual tied weights in the model.)r`  r   r   r)  
isinstancer   r   idrQ  r   typeget_parameterrR   r   ra  rN  r   r  anyrS  r^  clonerd  intersection
differencerI  RuntimeError)r   re  ptrsr   ptrnamesshared_ptrsall_potential_tied_weights_keyserror_namesto_delete_namesfoundmatches_patternshared_namesdisjoint_namesidentical_namesinamesknownunknownrK  s                     @r   #remove_tied_weights_from_state_dictr    se    ""4(D"((*f&%,,// F##D)]]6) ((.FF##D) "6*+2248 +" 15

O*#E
Q:3:KO '**?*F&G#KeO '2 '')EEu"%%fFe%f"f"?tz'9QJEs5z)'++D1 & * $2+2D2D2F
#S L %d+113
4  %4L$M!L!##O4D4  ##O4w<!w' " <(
;!F{m TJJiIj k||
 	
 c Ps   .JJ
param_namec                     [        X5      u  p4XCR                  ;   aA  [        U[        R                  5      (       d"  [        R                  " X"R                  5       S9n[        X4U5        g)zUCast a single parameter or buffer `param_name` into the `model`, with value `tensor`.)requires_gradN)rU   _parametersrm  r   	Parameterr   setattr)re  r  r   parent
param_types        r   _load_parameter_into_modelr    sN    -e@F'''
62<<0P0Pf4L4L4NO F'r   weights_namevariantc                 H    Ub  U R                  SS5      u  p#U SU SU 3n U $ )NrF  r   )rsplit)r  r  r   rK  s       r   _add_variantr    s8    !((a0
q	4&1r   r   	gguf_filer   
user_agentis_remote_codetransformers_explicit_filenamer   
tqdm_classc	                    U=(       d
    [        5       nUR                  S5      n	UR                  SS5      n
UR                  S5      nUR                  SS5      nUR                  S5      nUR                  S5      =(       d    SnUR                  S	S
5      nUR                  S5      nUb@  UR                  S5      (       d*  UR                  S5      (       d  US:w  a  [        SU 35      eSnU Gb  UGc  [	        U 5      n [
        R                  R                  U 5      nU(       Ga  Ub3  [
        R                  R                  XU5      nUR                  S5      nGOUSLa  [
        R                  R                  [
        R                  R                  X[        [        U5      5      5      (       a0  [
        R                  R                  X[        [        U5      5      nGOUSLa  [
        R                  R                  [
        R                  R                  X[        [        U5      5      5      (       a2  [
        R                  R                  X[        [        U5      5      nSnGOU(       d  [
        R                  R                  [
        R                  R                  X[        [        U5      5      5      (       a0  [
        R                  R                  X[        [        U5      5      nGO~U(       d  [
        R                  R                  [
        R                  R                  X[        [        U5      5      5      (       a2  [
        R                  R                  X[        [        U5      5      nSnGOU(       a   [        S[        [        U5       SU  S35      e[        S[        [        U5       S[        [        U5       SU  S35      e[
        R                  R                  [
        R                  R                  X5      5      (       a  U nSnGOVUb  UnUR                  S5      nO&USLa  [        [        U5      nO[        [        U5      nUUUU	US.nU
UUSSUUS.UEn[!        5       (       + =(       a*    [#        S5      (       + =(       a    U(       + =(       a    US
:H  n [%        U U40 UD6nUc  U[        [        U5      :X  a  [%        U [        [        U5      40 UD6nUb  SnOzU(       aV  US:X  a  U(       a  ['        U 40 UD6u  nnnUUS'   Uc0  [        U  S[        [        U5       S[        [        U5       S35      eO[        [        U5      n[%        U U40 UD6nUc4  U[        [        U5      :X  a   [%        U [        [        U5      40 UD6nUb  SnUb`  U(       a  [        O[        nU[        [        4;   a<  [)        U U40 UD6(       d*  U(       a#  [+        [&        U 4SS0UESS9R-                  5         OkUb8  [)        U [        40 UD6(       a"  [        U  S[        [        U5       S U S!35      e[        U  S[        [        U5       S[        [        U5       S35      eU(       a  [0        R3                  S%W 35        UnOd[0        R3                  S%W S&W 35        OHU(       aA  [
        R                  R                  U5      (       a  UnOU	U
UUUUUUSSUS'.n[%        X40 UD6nSnU(       a  [5        U WU	U
UUUUUUUUS(9u  nnUU4$ U b  W/OSnUU4$ ! [         a    e [.         a)  n[        S"U  S#U  S$[        [        U5       S35      UeSnAff = f))zGet all the checkpoint filenames based on `pretrained_model_name_or_path`, and optional metadata if the
checkpoints are sharded.
This function will download the data if necessary.
	cache_dirforce_downloadFproxieslocal_files_onlytokenrevisionmain	subfolder commit_hashNr  z.safetensors.index.jsonzadapter_model.binzThe transformers file in the config seems to be incorrect: it is neither a safetensors file (*.safetensors) nor a safetensors index file (*.safetensors.index.json): TzError no file named z found in directory rF  z, or z, found in directory )r  r  r  r  r  )r  r  r   _raise_exceptions_for_gated_repo%_raise_exceptions_for_missing_entries_commit_hashr  DISABLE_SAFETENSORS_CONVERSIONz& does not appear to have a file named z or zX and thus cannot be loaded with `safetensors`. Please do not set `use_safetensors=True`.ignore_errors_during_conversionzThread-auto_conversion)targetargskwargsrK  z) but there is a file without the variant z;. Use `variant=None` to load this model from those weights.zCan't load the model for 'z'. If you were trying to load it from 'https://huggingface.co/models', make sure you don't have a local directory with the same name. Otherwise, make sure 'z=' is the correct path to a directory containing a file named zloading weights file z from cache at )r  r  r  r  r  r  r  r  r  r  r  )
r  r  r  r  r  r  r  r  r  r  )ro   r   r&  r   r   r   r   isdirjoinisfiler  rZ   rY   r\   r[   r  r   rf   r`   rV   rc   r   r[  	Exceptionloggerinforq   )r   r  r  r   r  r  r  r   r  r  r  r  r  r  r  r  r  
is_shardedis_localarchive_filefilenamehas_file_kwargscached_file_kwargscan_auto_convertresolved_archive_filesafe_weights_namer  r   checkpoint_filess                                r   _get_resolved_checkpoint_filesr  %  s    &9)9O##K0I$(()95AN!!),G&**+=uE(E"":.8&H##K4I!%%m4K%1-66~FFOmOvOv%P
 P
 .1DD `568  J$0Y5F(+,I(J%77==!>?-9!ww||,IVtu;DDE^_
 -"''..:|TegnGop3 3  "ww||1lK\^e>f  !-"''..:|TkmtGuv3 3  "ww||1lKbdk>l  "
$:|T`biGjk* *  "ww||1l<Y`>a  %:|TfhoGpq* *  "ww||1lK]_f>g  "
 *<8I7+S*T U56a9 
 *<8I7+S*TTYZfgsu|Z}Y~ ++H*IL  WW^^BGGLLRSS8LH .99;DDE^_
 -'(97C'g> %"&$4O #1(&499> +(	" "	" $%% $,-MNN$ '&$ O Y )44QS[(r_q(r% )0XN_ahAi5i,75$%<gF- --)
 -8%)
(#v-2BJY =KASKG18Z :B*:608")#@"A B$$01BG$L#MTR^_vx  SA  RB Bz!z#  9 $0g#F0;981GY1-
 )0Xl\cAd5d,75$%7A- --)
 -8%)
 )4CM(?Sd% \3E$FF ()FHY m]l m,#2"?!A$Eu#cPb#c!9	
  %'
 *x5|0GV0 &<= >  ,\7 CD E  'y(ce  &<= >  ,\7 CDDVgipIqHrrsu $ KK/~>?$0!KK/zI^H_`a	77>>)$$$-!
 '"0"$4($&499> +" %00M$o\n$o! -G)!)-!$!.
**" --- 7T6_12ei---}    01N0O P99V8W X::F|U\:]9^^_a
 s    *E\$ A+\$ $]!8$]]!r  configr   r   c                    USLnU Gbo  [        U [        5      (       Ga%  U S:X  a  [        US5      (       a3  UR                  b&  UR                  n [        R                  SU  S35        OU(       a  SU;   a  US   n OUUb  [        U5      n OFUb*  US   R                  S5      (       a  [        R                  n O[        US   SUS	9n[        U5      n [        R                  S
U  S35        O1[        [        U 5      (       a  [        [        U 5      n O[        S5      e[        U [        5      (       a  [        [        U 5      OU n OI[        U [        [        R                  45      (       d  [        SU  35      eO[        R                  " 5       n Ub  UR                  U 5      n [        U [        5      (       af  U R!                  S[        R                  " 5       5      n[        U[        5      (       a  [        [        U5      OUn[        R#                  SU S35        OU nXl        UR$                   H  n	[        X)5      =n
c  M  Xl        M     X(4$ )a  Find the correct `dtype` to use based on provided arguments. Also update the `config` based on the
inferred dtype. We do the following:
1. If dtype is "auto", we try to read the config, else auto-detect dtype from the loaded state_dict, by checking
its first weights entry that is of a floating type - we assume all floating dtype weights are of the same dtype
2. Else, use the dtype provided as a dict or str
Nautor   zWill use dtype=z$ as defined in model's config objectr   z.ggufr  r%  zTSince the `dtype` attribute can't be found in model's config object, will use dtype=z  as derived from model's weightsze`dtype` provided as a `str` can only be `'auto'`, or a string representation of a valid `torch.dtype`z`dtype` can be one of: `torch.dtype`, `'auto'`, a string of a valid `torch.dtype` or a `dict` with valid `dtype` for each sub-config in composite configs, but received r  zUsing different dtypes per module is deprecated and will be removed in future versions Setting different dtypes per backbone model might cause device errors downstream, therefore setting the dtype=z for all modules.)rm  r   r   r   r  r  r   r&  r   r   r:  rH  r   r   r   update_dtyper   warning_oncesub_configs)r   r  r  r   r   r   r   r  
main_dtypesub_config_key
sub_configs              r   
_get_dtyper  1  s2    "-JeS!!67++0H"LLEKK/%8\ ]^!g1A&A 0 9#/ 4Z @)5:J1:M:V:VW^:_:_ %%4,Q/fS_&
 !5Z @KK**/0PR &&u- { 
 .8s-C-CGE5)EED%++#677JJOR  8 '')))%0 %YYr5#:#:#<=
3=j#3N3NWUJ/T^
!!+,=?	
 
 L ,,!&99JF) - r   c                      \ rS rSrSr\SSS\R                  4S j5       r\SSS\R                  4S j5       rSSS\	S\	4S	 jr
\S
 5       r SSSS\	S\\S4   S\R                  S-  S\	4
S jjrSSSS\S\S\4S jjrSrg)ModuleUtilsMixini  z@
A few utilities for `torch.nn.Modules`, to be used as a mixin.
r   r   r   c                 B    [        S U R                  5        5       5      $ )zu
`torch.device`: The device on which the module is (assuming that all the module parameters are on the same
device).
c              3   8   #    U  H  oR                   v   M     g 7fr   r   r   params     r   r   *ModuleUtilsMixin.device.<locals>.<genexpr>  s     @.?ULL.?s   r   
parametersr   s    r   r   ModuleUtilsMixin.device  s     @doo.?@@@r   c                 B    [        S U R                  5        5       5      $ )zg
`torch.dtype`: The dtype of the module (assuming that all the module parameters have the same dtype).
c              3   f   #    U  H'  oR                  5       (       d  M  UR                  v   M)     g 7fr   )r   r   r  s     r   r   )ModuleUtilsMixin.dtype.<locals>.<genexpr>  s!     \->EBYBYB[KEKK->s   11r  r   s    r   r   ModuleUtilsMixin.dtype  s    
 \T__->\\\r   encoder_attention_maskc                     UR                  5       S:X  a  USS2SSS2SS24   nUR                  5       S:X  a  USS2SSSS24   nWR                  U R                  S9nSU-
  [        R                  " U R                  5      R
                  -  nU$ )z
Invert an attention mask (e.g., switches 0. and 1.).

Args:
    encoder_attention_mask (`torch.Tensor`): An attention mask.

Returns:
    `torch.Tensor`: The inverted attention mask.
   Nr   r         ?)dimr*  r   r   finfomin)r   r  encoder_extended_attention_masks      r   invert_attention_mask&ModuleUtilsMixin.invert_attention_mask  s     "%%'1,.DQaQR].S+!%%'1,.DQdTUEU.V+ +J*L*LSWS]S]*L*^'+.1P+PTYT_T_`d`j`jTkToTo*o'..r   c                    UR                   nU u  p4[        R                  " XBS9nUS S S S 24   R                  X4S5      US S S 2S 4   :*  nUR	                  UR
                  5      nUR                  S   UR                  S   :  aU  UR                  S   UR                  S   -
  n[        R                  " [        R                  " X4U4X&R
                  S9U/SS9nUS S 2S S S 2S S 24   US S 2S S S S 24   -  nU$ )Nr  r   r   r   r<  axis)	r   r   arangerepeatr*  r   shapecatones)	input_shapeattention_maskr   
batch_size
seq_lengthseq_idscausal_maskprefix_seq_lenextended_attention_masks	            r   *create_extended_attention_mask_for_decoder;ModuleUtilsMixin.create_extended_attention_mask_for_decoder  s   &&!,
,,z9dD!m,33JANRYZ^`acgZgRhh!nn^%9%9:Q."6"6q"99+11!4{7H7H7KKN))JJ
GPV^o^op K #.aq!m"<~aQUW[]^N^?_"_&&r   Nr  r  .r   c                    Uc  U R                   nUR                  5       S:X  a  USS2SSS2SS24   nOpUR                  5       S:X  a@  [        U R                  SS5      (       a  [        R                  X!5      nO*USS2SSSS24   nO[        SU SUR                   S35      eUR                  US9nS	U-
  [        R                  " U5      R                  -  nU$ )
a  
Makes broadcastable attention and causal masks so that future and masked tokens are ignored.

Arguments:
    attention_mask (`torch.Tensor`):
        Mask with ones indicating tokens to attend to, zeros for tokens to ignore.
    input_shape (`tuple[int]`):
        The shape of the input to the model.

Returns:
    `torch.Tensor` The extended attention mask, with a the same dtype as `attention_mask.dtype`.
Nr  r   
is_decoderz!Wrong shape for input_ids (shape z) or attention_mask (shape )r  r  )r   r  rH  r  r  r  r   r  r*  r   r  r  )r   r  r  r   r  s        r   get_extended_attention_mask,ModuleUtilsMixin.get_extended_attention_mask  s    $ =JJE 1$&4Qa]&C#!Q& t{{L$77*:*e*e+' +9D$9I*J'3K=@[\j\p\p[qqrs  #:"<"<5"<"I#&)@#@EKKPUDVDZDZ"Z&&r   only_trainableexclude_embeddingsc                    U(       aH  U R                  5        VVs/ s H+  u  p4[        U[        R                  5      (       d  M&  U S3PM-     nnn[	        U SS5      nU(       a  SSKnSnU R                  5        H  u  p9U(       a  UW;   a  M  U	R                  (       d	  U(       a  M.  U(       a  [        U	WR                  R                  5      (       af  [        U	S5      (       a  U	R                  5       n
O*[        U	S5      (       a  U	R                  R                  n
OSn
XR                  5       S	-  U
-  -  nM  XR                  5       -  nM     U$ s  snnf )
a  
Get number of (optionally, trainable or non-embeddings) parameters in the module.

Args:
    only_trainable (`bool`, *optional*, defaults to `False`):
        Whether or not to return only the number of trainable parameters

    exclude_embeddings (`bool`, *optional*, defaults to `False`):
        Whether or not to return only the number of non-embeddings parameters

Returns:
    `int`: The number of parameters.
z.weightis_loaded_in_4bitFr   Nr@  quant_storager   r   )rG  rm  r   	EmbeddingrH  bitsandbytesnamed_parametersr  
Params4bitr   r@  r  itemsizenumel)r   r  r  rK  module_typeembedding_param_namesr  bnbtotal_paramsr  	num_bytess              r   num_parametersModuleUtilsMixin.num_parameters  s%    :>:L:L:N%:N%6TR\]hjljvjvRw 4& :N " % $D*=uE&002KD!d.C&C"".. %E366;L;L)M)Mun55$)$6$6$8	 88$)$7$7$@$@	$%	 KKMA$5	$AAL KKM1L 3" 5%s   %E	Er   r   FF)r   r   r   r   r   r   r   r   r   r   r  staticmethodr  tupler   r  r   r	  r   r   r   r   r  r    s     A& A5<< A A ]% ]%++ ] ]/$5 /v /Z` /, ' '2 %)	/'/'/' 38_/' {{T!	/'
 
/'b*. * *bf *sv * *r   r  c                   l    \ rS rSrSrSrS\R                  4S jrS\R                  4S jr	S r
S	 rS
rg)EmbeddingAccessMixini"  z
Base utilities to regroup getters and setters for embeddings.
Introduces the `input_layer_embed` attribute, which indicates
where the input embeddings come from and where they
should be set.
embed_tokensr   c                    [        U SS5      n[        XS5      =nb  U$ [        U SS5      nUb  [        X15      (       a  [        X15      $ [        U SS5      nUb  [        XA5      (       a  [        XA5      $ [        U SS5      nUb%  XPLa!  [        US5      (       a  UR                  5       $ [        SU R                  R
                   S	35      e)
zv
Returns the model's input embeddings.

Returns:
    `nn.Module`: A torch module mapping vocabulary to hidden states.
_input_embed_layerr  N
embeddingsre  
base_modelget_input_embeddingsu.   `get_input_embeddings` not auto‑handled for "; please override in the subclass.)rH  r   r  NotImplementedError	__class__r   )r   rK  default_embeddingr  re  r  s         r   r  )EmbeddingAccessMixin.get_input_embeddings,  s     t1>B ")T!::G$$T<6
!gj&?&?:,,gt,!5!55''T<6
!j&<UkAlAl2244!<T^^=T=T<UUwx
 	
r   valuec                    [        U SS5      n[        X5      (       a  [        XU5        g[        U SS5      =nb  [        X25      (       a  [        X2U5        g[        U SS5      =nb  [        XB5      (       a  [        XBU5        g[        U SS5      =nb'  XPLa#  [        US5      (       a  UR                  U5        g[	        SU R
                  R                   S	35      e)
a  Fallback setter that handles **~70%** of models in the code-base.

Order of attempts:
1. `self.<_input_embed_layer>` (direct attribute)
2. `self.embeddings.<_input_embed_layer>` (nested embeddings for vision/audio models)
3. `self.model.<_input_embed_layer>` (encoder/decoder models)
4. delegate to the *base model* if one exists
5. otherwise raise `NotImplementedError` so subclasses still can (and
    should) override for exotic layouts.
r  r  r  Nre  r  set_input_embeddingsu.   `set_input_embeddings` not auto‑handled for r  )rH  r   r  r  r  r  r   )r   r  rK  r  re  r  s         r   r  )EmbeddingAccessMixin.set_input_embeddingsJ  s     t1>B4D&#D,==jJwWaOhOhJe,tWd33e@WUEYEYE' #4t<<ZI&
$:;;++E2%@AXAX@YY{| r   c                     [        U S5      (       d  g  U R                  5         U R                  $ ! [         a     g f = f)Nlm_head)r   r  r  r   r   s    r   get_output_embeddings*EmbeddingAccessMixin.get_output_embeddingsl  sE    tY''	 %%' || # 		s   0 
==c                 4    [        U S5      (       a  Xl        gg)zU
Sets the model's output embedding, defaulting to setting new_embeddings to lm_head.
r   N)rH  r   )r   new_embeddingss     r   set_output_embeddings*EmbeddingAccessMixin.set_output_embeddingsw  s     4##)L $r   )r   N)r   r   r   r   r   r  r   Moduler  r  r!  r%  r   r   r   r   r  r  "  s:     (
bii 
< "))  D	*r   r  c                   <  ^  \ rS rSr% SrSr\\   S-  \S'   Sr	Sr
\\S'   Sr\\S'   Sr\\   S-  \S	'   S
r\\S'   Sr\\\   -  \S'   Sr\\   \\   -  S-  \S'   Sr\\\   -  S-  \S'   Sr\\   \\   -  S-  \S'   Sr\\   \\   -  S-  \S'   Sr\\\4   \S'   0 r\\\4   \S'   Sr\\   S-  \S'   Sr\\   S-  \S'   Sr\\   S-  \S'   Sr\\S'   Sr\\S'   Sr\\S'   Sr \\   S-  \S'   Sr!\\\4   \S'   Sr"Sr#\\\$\\4   4   \S'   Sr%\\S'   Sr&\\S'   Sr'\\S'   Sr(\S-  \S '   \)\*RV                  RX                  S!\\\-4   4S" j5       5       r.\)S!\\\*R^                  4   4S# j5       r0U 4S$ jr1S%\4U 4S& jjr2S' r3\)S!\\\4   4S( j5       r4\)S!\\\$\\4   4   4S) j5       r5\4Rl                  S*\\\4   S-  4S+ j5       r4\5Rl                  S*\\\$\\4   4   S-  4S, j5       r5SS- jr7S. r8S/\\   \-  S!S4S0 jr9\:S1 5       r;\)S!\<Rz                  4S2 j5       r>\:S!\4S3 j5       r?  SS4\@S5\AS6\AS7\$\$\A\4   S84   S9\$\$\A\4   S84   S:\@S-  4S; jjrBSS4\@S<\S!\4S= jjrCSS<\S!\4S> jjrDS!\4S? jrESS<\S!\4S@ jjrF SSA\S-  S<\SB\S!\4SC jjrGSD\S-  S!\4SE jrHSSF\S-  S<\S!\4SG jjrISH\S-  S!\4SI jrJ\:S!\4SJ j5       rK\:S!\4SK j5       rLSSA\\-  SB\4SL jjrMSD\\-  4SM jrNSN rOSO rPSSP\S-  4SQ jjrQSSP\S-  4SR jjrRSS rSST rT\*R                  " 5       SU 5       rVSSV\4SW jjrW\*R                  " 5       \XR                  " 5       SX 5       5       rZSSY\S!\4SZ jjr[SS\\\   S-  S]\4S^ jjr\S_ r]   SS`\@S-  Sa\@S-  Sb\S!\<R                  4Sc jjr_SSd jr`   SSe\<R                  S`\@S-  Sa\@S-  Sb\S!\<R                  4
Sf jjra   SSg\<R                  S`\@S-  Sh\Sb\S!\<R                  4
Si jjrcSj rd SSh\4Sk jjreSl rfSm rgSn\@4So jrhS!\<R                  \$\<R                     -  4Sp jriSq rjSSr jrkS[\l4Ss\St\A4Su jjrmSv rn\)S!\4Sw j5       ro        SSx\\pR                  -  Sy\Sz\S-  S{\S|\@\-  S}\S-  S~\\-  S-  S\S\4S jjrr\s" \tR                  5      U 4S j5       ruSS jrv\s" \*Rx                  Rz                  R                  5      U 4S j5       rw\s" \*Rx                  Rz                  R                  5      U 4S j5       rxU 4S jryU 4S jrz\:S\*R                  S\S\SB\S-  4S j5       r|S\*R                  S!\4S jr}SS\~S-  4S jjr\:SSSSSSSSS[SSS.S\\   S\\pR                  -  S-  S%\\-  \pR                  -  S-  S\\pR                  -  S-  S\S\S\S~\\-  S-  S\S\S-  S\S\\\\\\4   -  4   S-  S\S-  S!\4S jj5       r\ SSS Sz\S-  S\\   S-  S\S\\   S-  S!\$\\4   4S jj5       r\S\S\S!\4S j5       rSS jr\:SS j5       rS r\)S 5       r\)S 5       r\)S 5       r\)S 5       r\Rl                  S 5       rSS jr\)S!\4S j5       r\Rl                  S\S!S4S j5       rS\S-  S!\A4S jr\:S 5       rS\\\   S\S-  SSS\S-  S!S4
S jrS\S!S4S jrS\S!S4S jrS rS\4S jr SS\S\S!\\$\\*R^                  4      4S jjrSS\4U 4S jjjrS r\:S!\4S j5       rSrU =r$ )r   i  am  
Base class for all models.

[`PreTrainedModel`] takes care of storing the configuration of the models and handles methods for loading,
downloading and saving models as well as a few methods common to all models to:

    - resize the input embeddings

Class attributes (overridden by derived classes):

    - **config_class** ([`PreTrainedConfig`]) -- A subclass of [`PreTrainedConfig`] to use as configuration class
      for this model architecture.
    - **base_model_prefix** (`str`) -- A string indicating the attribute associated to the base model in derived
      classes of the same architecture adding modules on top of the base model.
    - **main_input_name** (`str`) -- The name of the principal input to the model (often `input_ids` for NLP
      models, `pixel_values` for vision models and `input_values` for speech models).
    - **can_record_outputs** (dict):
Nconfig_classr  base_model_prefixF_is_stateful
model_tags	input_idsmain_input_nametextinput_modalities_no_split_modules_skip_keys_device_placement_keep_in_fp32_modules_keep_in_fp32_modules_strictrE  _checkpoint_conversion_mapping_keys_to_ignore_on_load_missing"_keys_to_ignore_on_load_unexpected_keys_to_ignore_on_save_supports_sdpa_supports_flash_attn_supports_flex_attn!_compatible_flash_implementations_tp_plan_pp_plansupports_gradient_checkpointing_can_compile_fullgraph_supports_attention_backend_can_record_outputsr   c                 ,    U R                   =(       d    0 $ )a  
 Maps output names (e.g., "attentions", "hidden_states")
 to either:
     - A module class (e.g., `LlamaDecoderLayer`), using default index conventions:
         * index=0 for "hidden_states"
         * index=1 for "attentions"
     - Or an `OutputRecorder(...)` with `target_class`, optional `index`, and `layer_name`.

 Examples:
     These two are equivalent:

 ```python
     _can_record_outputs = {
         "attentions": LlamaAttention,
         "hidden_states": LlamaDecoderLayer
     }

     _can_record_outputs = {
         "attentions": OutputRecorder(LlamaAttention, index=1),
         "hidden_states": OutputRecorder(LlamaDecoderLayer, index=0)
     }
```

 This means you can record outputs from the same class, by specifying a layer name. Before
 collecting outputs, we check that they come from this layer.

 If you have cross attention that come from `LlamaAttention` and self attention that also
 come from `LlamaAttention` but from `self_attn` you can do this:

 ```python
 class LlamaModel(PreTrainedModel):
     _can_record_outputs = {
         "attentions": OutputRecorder(LlamaAttention, index=1, layer-name="self_attn"),
         "cross_attentions": OutputRecorder(LlamaAttention, index=1, layer_name="cross_attn")
     }

```
)rB  r   s    r   can_record_outputs"PreTrainedModel.can_record_outputs  s    R ''-2-r   c                 :    S[         R                  " [        5      0$ )zN
`dict[str, torch.Tensor]`: Dummy inputs to do a forward pass in the network.
r-  )r   r   rX   r   s    r   dummy_inputsPreTrainedModel.dummy_inputs  s    
 U\\,788r   c                 H  > [         TU ]  " S0 UD6  [        R                  " U 5      R	                  SS 5      nU R
                  R	                  SS 5      n[        U 5      R	                  SS 5      nU R                  nUb  X0l        g Ub  X l        g Ub  XPl        g Ub  X@l        g g )Nr  r)  r   )super__init_subclass__inspectget_annotationsr   __dict__r   r)  )clsr  child_annotationchild_attributefull_annotationfull_attributer  s         r   rK  !PreTrainedModel.__init_subclass__  s    !+F+ #2237;;HdK,,**>4@ )-11(DA)) &.)/'-(. )r   r  c                   > [         TU ]  5         [        U[        5      (       d:  [	        SU R
                  R                   SU R
                  R                   S35      eXl        UR                  U l        U R                  U R                  R                  S[        R                  S9U R                  l        U R                  U R                  R                  5      U R                  l        U R#                  5       (       a  [$        R&                  " U5      U l        U R
                  R                  nU[*        ;  aZ  SSR-                  [*        5       S3n[.        R0                  " XPR
                  R                  5      n[3        U5      S	:  a  US	   nOS nX@l        U R6                  [8        [;        U R
                  5      '   g )
NzParameter config in `zt(config)` should be an instance of class `PreTrainedConfig`. To create a model from a pretrained model use `model = z(.from_pretrained(PRETRAINED_MODEL_NAME)`Tis_init_checkallow_all_kernels(|r  r   )rJ  __init__rm  r!   	TypeErrorr  r   r  name_or_path%_check_and_adjust_attn_implementation_attn_implementationr-   ALLOW_ALL_KERNELS_attn_implementation_internal(_check_and_adjust_experts_implementation_experts_implementation _experts_implementation_internalcan_generater*   from_model_configgeneration_configrJ   r  ri  findallr   	loss_typerB  ry   r   )r   r  inputsr  ri  loss_groupsr  s         r   r[  PreTrainedModel.__init__  ss   &"233'(?(?'@ A NN3344\^ 
 "// 594^4^KK,,);;	 5_ 5
1 8<7d7dKK//8
4 %5%G%G%OD" NN++	L(chh|45Q7K

;0G0GHI9~!%aL	 	"484L4LS01r   c                    0 0 0 sU l         U l        U l        U R                  U L a  U R                  R
                  b$  U R                  R
                  R                  5       O0 U l        U R                  R                  b$  U R                  R                  R                  5       O0 U l         U R                  R                  b$  U R                  R                  R                  5       O0 U l        U R                  SS9U l
        [        U R                  =(       d    / 5      U l        [        U R                  =(       d    / 5      U l        [        U R                  =(       d    / 5      U l        U R                  5        GH3  u  p[!        USS5      =n(       aR  U R                  R#                  UR                  5       R%                  5        VVs0 s H  u  pEU SU 3U_M     snn5        [!        USS5      =n(       aR  U R                   R#                  UR                  5       R%                  5        VVs0 s H  u  pEU SU 3U_M     snn5        [!        USS5      =n(       aR  U R                  R#                  UR                  5       R%                  5        VVs0 s H  u  pEU SU 3U_M     snn5        [!        USS5      =n(       aW  U R                  R#                  UR                  5       R%                  5        VVs0 s H  u  pEU SU 3U SU 3_M     snn5        [!        US	S5      =n(       a  U R                  R#                  U5        [!        US
S5      =n(       a  U R                  R#                  U5        [!        USS5      =n	(       d  GM  U R                  R#                  U	5        GM6     U R'                  5         U R)                  5         gs  snnf s  snnf s  snnf s  snnf )a  
A method executed at the end of each Transformer model initialization, to execute code that needs the model's
modules properly initialized (such as weight initialization).
It is also used to obtain all correct static properties (parallelism plans, tied_weights_keys, _keep_in_fp32_modules, etc)
correctly in the case of composite models (that is, the top level model should know about those properties from its children).
NFall_submodels_ep_planrF  r=  r>  all_tied_weights_keysr3  r4  r1  )r=  rp  r>  r  r  base_model_pp_plancopybase_model_tp_planbase_model_ep_planget_expanded_tied_weights_keysrq  ra  r3  r4  r1  named_childrenrH  updater)  init_weights._backward_compatibility_gradient_checkpointing)
r   rK  rC  planr4  r5  	tied_keys	keep_fp32keep_fp32_strictno_splits
             r   	post_initPreTrainedModel.post_initE  s=    79"b3t}dm??d"EI[[EcEcEoDKK::??AuwDMEI[[EcEcEoDKK::??AuwDMEI[[EcEcEoDKK::??AuwDM%)%H%HW\%H%]"%()C)C)Ir%J",/0Q0Q0WUW,X)!$T%;%;%Ar!B !//1LDvz488t8$$499;CTCTCV%WCV41asmQ&6CV%WXvz488t8$$499;CTCTCV%WCV41asmQ&6CV%WXvz488t8$$499;CTCTCV%WCV41asmQ&6CV%WX#F,CTJJyJ**11\e\j\j\l\r\r\t2u\tTXTUdV1QC=TF!A3-3O\t2uv#F,CTJJyJ**11)<#*63QSW#XXX11889IJ"6+>EExEE&&--h7% 2* 	;;=' &X%W%W 3vs   	O/OO;O$c                     [        U R                  S5      (       a1  U R                  R                  R                  (       a  U R                  $ U R
                  $ )z*
The full tp plan for the model's modules
distributed_config)r   r  r  enable_expert_parallelrp  r=  r   s    r   tp_planPreTrainedModel.tp_planv  s<    
 4;; 455$++:X:X:o:o== }}r   c                     U R                   $ r   )r>  r   s    r   pp_planPreTrainedModel.pp_plan  s    }}r   r{  c                 L   Uc  0 U l         g [        U[        5      (       d  [        S5      eUR	                  5        H?  u  p#U[
        ;  d  M  [        SU SU S[        [
        R                  " 5       5       35      e   U R                  5        VVs/ s H  u  pEUPM	     nnnUR                  5        Hb  nUR                  SS5      nSnU H"  n	[        R                  " Xy5      (       d  M   Sn  O   U(       a  MH  [        R                  " S	U S
35        Md     Xl         g s  snnf )Nz&Can only set a dictionary as `tp_plan`z#Unsupported tensor parallel style 'z' for layer 'z'. Supported styles are *z\d+FTzLayer pattern 'z' does not match any parameters in the model. This rule may not be applied during tensor parallelization, or may lead to dimension mismatches)r=  rm  r   r   r)  rC   r   r+  r   replaceri  matchwarningswarn)
r   r{  layer_patternparallel_stylerK  rX  model_param_namesregex_patternpattern_matchedr  s
             r   r  r    s%   <DM$%%EFF .2ZZ\)M%88 9.9IWdVe f,,01D1I1I1K,L+MO  .: 261F1F1HI1HgdT1HI!YY[M)11#v>M#O/
88M66&*O 0 #?%m_ 5d d ) ! Js   D c                 f    Uc  0 U l         g [        U[        5      (       d  [        S5      eXl         g )Nz&Can only set a dictionary as `pp_plan`)r>  rm  r   r   )r   r{  s     r   r  r    s/    <DM$%%EFFr   c                 V    [        U SS5      nUc  [        S5      eUR                  XS9$ )zv
Potentially dequantize the model in case it has been quantized by a quantization method that support
dequantization.
r   Nz?You need to first quantize your model in order to dequantize itr  )rH  r   
dequantize)r   r   r   s      r   r  PreTrainedModel.dequantize  s8    
 t^T:^__&&t&99r   c                     U R                   (       aD  [        U R                  SS5      (       a'  U R                  5         [	        U R                  S5        g g g )Ngradient_checkpointingF)r?  rH  r  gradient_checkpointing_enabledelattrr   s    r   rz  >PreTrainedModel._backward_compatibility_gradient_checkpointing  sC    //GDKKIach4i4i..0DKK!9: 5j/r   tagsc                     [        U[        5      (       a  U/nU R                  c  / U l        U H/  nX R                  ;  d  M  U R                  R                  U5        M1     g)a  
Add custom tags into the model that gets pushed to the Hugging Face Hub. Will
not overwrite existing tags in the model.

Args:
    tags (`Union[list[str], str]`):
        The desired tags to inject in the model

Examples:

```python
from transformers import AutoModel

model = AutoModel.from_pretrained("google-bert/bert-base-cased")

model.add_model_tags(["custom", "custom-bert"])

# Push the model to your namespace with the name "my-custom-bert".
model.push_to_hub("my-custom-bert")
```
N)rm  r   r,  rQ  )r   r  tags      r   add_model_tagsPreTrainedModel.add_model_tags  sO    , dC  6D??" DOC//)&&s+ r   c                    UR                  SUR                  5      nUR                  SS5      =nb(  [        R                  S5        X1R                  :w  a  UOUn[	        U[
        5      (       a  [        [        U5      nUR                   H  n[        X5      =nc  M  X6l        M     SU;   a  UR                  S5      Ul	        SU;   a  UR                  S5      Ul
        UR                  SS5      n[        5       /nUb$  UR                  [        X0R                  5      5        U(       a  UR                  [!        5       5        [#        5       =(       a    [$        (       + =(       a    [&        (       + n	U	(       ag  [        R)                  S	5        S
SKn
UR-                  [.        R0                  " 5       U
R2                  R5                  [7        5       S9[9        5       /5        [;        U5         U " U40 UD6n[=        U5        SSS5        U	(       a  SSKJ n  U" W5        URC                  5         W$ ! , (       d  f       N5= f)z
All context managers that the model should be initialized under go here.

Args:
    dtype (`torch.dtype`, *optional*):
        Override the default `dtype` and load the model under this dtype.
r   torch_dtypeNz1`torch_dtype` is deprecated! Use `dtype` instead!attn_implementationexperts_implementationrX  F@Detected DeepSpeed ZeRO-3: activating zero.init() for this modelr   config_dict_or_pathr   )initialize_weights_zero3)"rT  r   r  r  rm  r   rH  r   r  r_  rc  r   rP   rQ  r   r   r=   r.   r   r   r  	deepspeedrI  initno_init_weightszeroInitr,   r   r]   rQ   integrations.deepspeedr  tie_weights)rO  r  r  r   r  r  r  rX  init_contextsneeds_zero3_initr  re  r  s                r   _from_configPreTrainedModel._from_config  s    

7FLL1!::mT::KG ST"ll2EEeS!!E5)E %00N%f==
J#(  1
 !F**0**5J*KF' $v--3ZZ8P-QF* #JJ':EB&)  !25,,!GH  !6!8957h<MhVhRhKKZ[   ((*NN''<L<N'O#% ]+)&)E"5) , H$U+ ,+s   +H//
H=c                 .    [        X R                  U 5      $ )z0
`torch.nn.Module`: The main body of the model.
)rH  r*  r   s    r   r  PreTrainedModel.base_model+  s    
 t33T::r   c                 :   S[        U R                  5      ;   a  gU R                   H>  n[        US5      (       d  M  S[        U5      ;  d  M'  UR                  5       (       d  M>    g   [        U S5      (       a"  [        R                  U R                   S35        g)az  
Returns whether this model can generate sequences with `.generate()` from the `GenerationMixin`.

Under the hood, on classes where this function returns True, some generation-specific changes are triggered:
for instance, the model instance will have a populated `generation_config` attribute.

Returns:
    `bool`: Whether this model can generate sequences with `.generate()`.
GenerationMixinTre  r   prepare_inputs_for_generationu6   has generative capabilities, as `prepare_inputs_for_generation` is explicitly defined. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you're using `trust_remote_code=True`, you can get rid of this warning by loading the model with an auto class. See https://huggingface.co/docs/transformers/en/model_doc/auto#auto-classes
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.F)r   	__bases__r   re  r  warningr   )rO  bases     r   re  PreTrainedModel.can_generate2  s     CMM 22MMD400 D	1d6G6G6I6I	 " 3788NN<<. 	!  	  r   flash_attn_versiongeneral_availability_checkpkg_availability_checksupported_devices.custom_supported_devicescuda_min_major_versionc                 V   U H(  u  pxU" 5       (       d  M  [         R                  U5          g   U" 5       (       d  SU S3n	U" 5       (       d  [        U	 SU S35      eUS:X  a!  [        S5      (       d  [        U	 SU S	35      e[	        U6 u  p[        S
 U
 5       5      (       d  [        U	 SU SU S35      eUbl  [        5       (       a\  [        R                  R                  5       u  pX:  a6  [        U	 SU SU S[        R                  R                  5        SU S3
5      egggg)a  
Checks whether the specified Flash Attention version is supported and if not, searches for the specific reason
on why it failed - package import and/or device incompatibility issues.

Args:
    flash_attn_version (`int`):
        The requested version of Flash Attention.
    general_availability_check (`Callable`):
        Checks whether our `is_available` function detects the specific FA version. Failing reasons
        are then checked for one-by-one.
    pkg_availability_check (`Callable`):
        Checks whether the package could theoretically be detected in the environment by the init structures.
        This is not a sure-fire check as device compatibility with FA is just as important.
    supported_devices (`tuple[tuple[Callable, str]]`):
        Essentially a list (for mutable kwargs reasons a tuple) of the supported devices in the format of
        `(device_availability_check, device_name)`, i.e. a pair of the associated device's name and whether
        it is available in the environment.
    custom_supported_devices (`tuple[tuple[Callable, str]]`, *optional*, defaults to `()`):
        Essentially a list (for mutable kwargs reasons a tuple) of the custom supported devices in the format of
        `(device_availability_check, info_message)`. These custom devices have custom logic outside the torch
        ecosystem either via kernels or other packages and hence have early checks for availability.
    cuda_min_major_version (`int`, *optional*):
        The minimum major cuda version supported for this version of Flash Attention. This is mostly
        affecting more recent versions which are more specialized to the features of new hardware.
NFlashAttentionzG has been toggled on, but it cannot be used due to the following error:z the package for FlashAttentionz doesn't seem to be installed.r   z2.3.3z FlashAttentionz# requires at least version `2.3.3`.c              3   .   #    U  H  o" 5       v   M     g 7fr   r   )r   device_availability_checks     r   r   ;PreTrainedModel._flash_attn_import_error.<locals>.<genexpr>  s     sXr;T466Xr   zT is not available on CPU. Please make sure you are on any of the supported devices: rF  z  requires compute capability >= z, but found z with compute capability z.x)
r  r  ImportErrorrr   ziprq  ru   r   cudaget_device_capability)r   r  r  r  r  r  r  r  info_messageprefacedevice_availability_checksdevice_namesmajorrX  s                 r   _flash_attn_import_error(PreTrainedModel._flash_attn_import_errorX  s   F 8P3%(**L) 8P
 *++&'9&:  ;B  CG *++!i>?Q>RRpq  $q(1OPW1X1X!WI_=O<PPs"tuu <?@Q;R8*sXrsss%")?3E2F  G[  \h  [i  ij  k  ,7<S<U<U$zz??AHE5)&i7I6JJj  lB  kC  CO  PU  PZ  PZ  Pp  Pp  Pr  Os  sL  MR  LS  SU  V  6 =V7' ,r   rW  c                 6   U R                   (       d<  [        U R                  R                   SU SU R                  R
                   S35      eUS;  a  [        SU S35      eU R                  " S0 [        U   D6  US:  aN  [        U R                  S5      (       a3  U R                  R                  S	:  a  [        R                  S
U S35        U R                  R                  nUc  [        R                  S
U S35        O]UbZ  U[        R                  [        R                  4;  a6  [        R                  SU SU R                  R                   SU SU S3	5        U(       d  [!        U R#                  5        Vs1 s H  oDR$                  iM     sn5      n['        U5      S:X  al  US	   R(                  S:X  aY  Sn[        U   S    H1  u  pxU" 5       (       d  M  Sn[        R                  S
U SU S35          O   U(       d  [        S
U S35      egs  snf )a^  
Check the availability of Flash Attention for a given model.

Args:
    flash_attn_version (`int`):
        The requested version of Flash Attention.
    is_init_check (`bool`, *optional*):
        Whether this check is performed early, i.e. at __init__ time, or later when the model and its weights are
        fully instantiated. This is needed as we also check the devices of the weights, which are only available
        later after __init__. This allows to raise proper exceptions early before instantiating the full models
        if we know that the model does not support the requested attention.
z" does not support Flash Attention zm yet. Please request to add support where the model is hosted, on its model hub page: https://huggingface.co/zk/discussions/new or in the Transformers GitHub repo: https://github.com/huggingface/transformers/issues/new)r   r     zRequested Flash Attention z which is not supported.r   attention_dropoutr   z*You are attempting to use Flash Attention zv with dropout. This might lead to unexpected behaviour as this is not supported on recent versions of Flash Attention.zD without specifying a dtype. This might lead to unexpected behaviourzFlash Attention zP only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in z is a&  . You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `dtype` argument. Example: `model = AutoModel.from_pretrained("meta-llama/Llama-3.2-1B", attn_implementation="flash_attention_z", dtype=torch.float16)`r   r   Fr  Tz with a model not initialized on GPU. Please make sure to have access to a GPU and either initialise the model on a GPU by passing a device_map or initialising the model on CPU and then moving it to GPU, e.g. with `model.to('z')`.a    with a model not initialized on GPU and with no GPU available. This is not supported yet. Please make sure to have access to a GPU and either initialise the model on a GPU by passing a device_map or initialising the model on CPU and then moving it to GPU.r   )r:  r   r  r   r  _name_or_pathr  rK   r   r  r  r  r   r   float16bfloat16r   r  r   r   ro  )	r   r  rW  r   r  param_devicesfound_devicer  device_names	            r   _flash_attn_can_dispatch(PreTrainedModel._flash_attn_can_dispatch  sn    ((>>**++MN`Ma bWW[WbWbWpWpVq rnn  Y.9:L9MMefgg 	%%a(LM_(`a !t{{$788T[[=Z=Z]^=^##@AS@T U~ ~ !!=<=O<P  QU  V 50O#O"#5"6 7((,(?(?'@UG Lm n@  mA  AYZ  DOO<M!N<M5,,<M!NOM=!Q&=+;+@+@E+I$>bcu>v'?:- 122'+++HI[H\ ]FFQ]RVX
 ? $$DEWDX YV V  / "Os   >Hc                    U R                   (       d"  [        U R                  R                   S35      e[        R
                  R                  b  [        R                  R                  5       S:  az  [
        R                  " [        R                  5      [
        R                  " S5      :  a>  [        R                  S5        [        R                  R                  R                  S5        g)a  
Check the availability of SDPA for a given model.

Args:
    is_init_check (`bool`, *optional*):
        Whether this check is performed early, i.e. at __init__ time, or later when the model and its weights are
        fully instantiated. This is needed as we also check the devices of the weights, which are only available
        later after __init__. This allows to raise proper exceptions early before instantiating the full models
        if we know that the model does not support the requested attention.
a   does not support an attention implementation through torch.nn.functional.scaled_dot_product_attention yet. Please request the support for this architecture: https://github.com/huggingface/transformers/issues/28005. If you believe this error is a bug, please open an issue in Transformers GitHub repository and load your model with the argument `attn_implementation="eager"` meanwhile. Example: `model = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="eager")`r   z2.4.1zUsing the `SDPA` attention implementation on multi-gpu setup with ROCM may lead to performance issues due to the FA backend. Disabling it to use alternative backends.FT)r9  r   r  r   r   r   hipr  device_countparser   r  r  backendsenable_flash_sdpr   rW  s     r   _sdpa_can_dispatch"PreTrainedModel._sdpa_can_dispatch  s     "">>**+ ,O O  MM)

'')A-e//07==3II y NN007r   c                 r    U R                  5       (       d"  [        U R                  R                   S35      eg)z9
Check the availability of Grouped MM for a given model.
z1 does not support setting experts implementation.T)_can_set_experts_implementationr   r  r   r   s    r   _grouped_mm_can_dispatch(PreTrainedModel._grouped_mm_can_dispatch  s6    
 3355 7 788ijkk r   c                     U R                   (       d"  [        U R                  R                   S35      e[	        5       (       d  [        S5      eg)a  
Check the availability of Flex Attention for a given model.

Args:
    is_init_check (`bool`, *optional*):
        Whether this check is performed early, i.e. at __init__ time, or later when the model and its weights are
        fully instantiated. This is needed as we also check the devices of the weights, which are only available
        later after __init__. This allows to raise proper exceptions early before instantiating the full models
        if we know that the model does not support the requested attention.
a   does not support an attention implementation through torch's flex_attention. Please request the support for this architecture: https://github.com/huggingface/transformers/issues/34809. If you believe this error is a bug, please open an issue in Transformers GitHub repository and load your model with the argument `attn_implementation="eager"` meanwhile. Example: `model = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="eager")`z]PyTorch Flex Attention requirements in Transformers are not met. Please install torch>=2.5.0.T)r;  r   r  r   rh   r  r  s     r   _flex_attn_can_dispatch'PreTrainedModel._flex_attn_can_dispatch  sV     ''>>**+ ,t t  ,--o 
 r   r  rX  c           	         [        U5      u  pEUbX  [        U SS5      n[        US9(       a=  Ub:  XV;  a5  U(       a  SUS    3OUS   n[        R	                  SU SU SU S	35        Un[        U5      u  pEUnS
n	[        US9(       aA  [
        R                  " 5        H(  n
USU
 3:X  d  M  [
        U
   S   " 5       (       a  M&  Sn	  O   U R                  (       aQ  U	(       aJ  [        5       (       a;  [        5       (       d,  [        U   n[        5       (       a  US:X  a  S
n	U(       a  SU 3n[        U5      (       a=   U(       a
  [        XS9  O	[        XS9  U	(       a  [        R	                  SU S35        U$ U R%                  X5      n[        US9(       a  [        U5        U$ ! [         a+  nU	(       a  [!        US   5      n
U R#                  XS9  UeSnAff = f)a@  
Check that the `attn_implementation` exists and is supported by the models, and try to get the kernel from hub if
it matches hf kernels pattern.

Args:
    attn_implementation (`str` or `None`):
        The attention implementation to check for existence/validity.
    is_init_check (`bool`, *optional*):
        Whether this check is performed early, i.e. at __init__ time, or later when the model and its weights are
        fully instantiated. This is needed as we also check the devices of the weights, which are only available
        later after __init__. This allows to raise proper exceptions early before instantiating the full models
        if we know that the model does not support the requested attention.
    allow_all_kernels (`bool`, optional):
        Whether to load kernels from unverified hub repos, if `attn_implementation` is a custom kernel outside
        of the `kernels-community` hub repository.

Returns:
    `str`: The final attention implementation to use, including potential fallbacks from sdpa to eager, or from
    None to sdpa (to potentially eager).
Nr<  "requested_attention_implementationzpaged|r   zNThis model is compatible with the following flash attention implementations: `z"`. Automatically falling back to `z` instead of `z`.Fflash_attention_r  Tflash_attention_2)rX  z/You do not have `flash_attn` installed, using `z%` from the `kernels` library instead!r<  r  rW  )rn   rH  rm   r  r  rK   r+  r:  rg   ri   rL   rj   r>   rN   rM   r  r   r  get_correct_attn_implementation)r   r  rW  rX  is_pagedbase_implementation compatible_flash_implementationsdefault_flash_implementationapplicable_attn_implementationrequested_original_flash_attn
fa_versionr  s               r   r^  5PreTrainedModel._check_and_adjust_attn_implementation)  s2   . )GGZ([% */6t=`bf/g,,Pcd4@'O GOf=a@ABTtuvTw - ##d  fF  eG G66R5SSabuavvxz 'C#(FGZ([%)<&(-%'K^_BGGI
 (-=j\+JJ@LMijll481 J %%-$&&*,,-GH[-\*%'',?CV,V 16-39:X9Y1Z.34456 00Nt 1''IJhIi j> >* .- .2-Q-Q..*
 ,Omn+,JK--#  0!$%8%<!=J11Z1m s   <:F$ $
G.&GGr  c                 (    U R                  U5      nU$ )a  
Check that the `experts_implementation` exists and is supported by the models.

Args:
    experts_implementation (`str` or `None`):
        The experts implementation to check for existence/validity.
Returns:
    `str`: The final experts implementation to use.
)"get_correct_experts_implementation)r   r  !applicable_experts_implementations      r   rb  8PreTrainedModel._check_and_adjust_experts_implementation  s     -1,S,STj,k)00r   requested_attentionc                    Uc  SOUnUS/[         R                  5       -   ;  a  SU S3nU R                  (       d  [        U SS5      (       a2  US-  n[        R
                  " 5        H  nUSU S	U S
3-  nM     US S nU R                  (       a  US-  nU R                  (       a  US-  n[        US-   5      e[        US9(       aI  [        R                  " SU5      =n(       a+  [        UR                  S5      5      nU R                  XRS9  U$ SU;   a  U R                  U5        U$ SU;   a   U R!                  U5        U$ U$ ! [        ["        4 a  nUb  SU;   a  UeSn S nAU$ S nAff = f)Nsdpaeager Specified `attn_implementation="zc"` is not supported. The only possible arguments are `attn_implementation="eager"`, `"paged|eager"`_supports_flash_attn_2F, z&`"attn_implementation=flash_attention_z0"`, `"attn_implementation=paged|flash_attention_z"`, zB, `"attn_implementation=sdpa"`, `"attn_implementation=paged|sdpa"`z(, `"attn_implementation=flex_attention"`rF  r  z^flash_attention_(\d)$r   r  flex_attention)ALL_ATTENTION_FUNCTIONS
valid_keysr:  rH  rK   r+  r9  r;  r   rm   ri  rj  r   groupr  r  r  r  )r   r  rW  applicable_attentionmessager  
fa_matchedr  s           r   r  /PreTrainedModel.get_correct_attn_implementation  s   )<)DvJ]y3J3U3U3W'WW23G2H IA A 
 ((GD:RTY,Z,Z4"F"K"K"MJ!G
|  TD  EO  DP  PT   U  UG #N!#2,""__''EEWs]++ (K_`))$=?STTJTZ--a01J))Z)e $# !55((7 $# ++/''6 $### , /&2vAT7TG'.$##/s   0E E-E((E-requested_expertsc                    Uc  SOUnS/[        [        [        R                  " 5       5      [        [        R                  " 5       5      -  5      -   nU Vs/ s H	  nSU S3PM     nnSUS   -   US'   SR                  U5      nX#;  a  SU S	U S
3n[        U5      eUS:X  a   U R                  5         U$ U$ s  snf ! [        [        4 a  nUS:X  a  UeSn S nAU$ S nAff = f)N
grouped_mmr  z`experts_implementation="z"`zand r<  r
  z#Specified `experts_implementation="z5"` is not supported. The only possible arguments are rF  )	r   ra  r?   r+  r9   r  r   r  r  )	r   r  applicable_expertsbase_experts_fnsfnvalid_experts_str_listvalid_experts_strr  r  s	            r   r  2PreTrainedModel.get_correct_experts_implementation  s   ->-F\L]#9tC0E0J0J0L,MPSTmTrTrTtPu,u'vvO_!`O_$=bT"DO_!`%+.DR.H%Hr" II&<=556H5II~$%Q(  W%% ----/ "!!!' "a , -$4G%,"!!-s   B8$B= =C"
CC"c                 \   [         R                  R                  U R                  5      nUb  [	        US5      (       d  gUR
                  n[        USSS9 nUR                  5       nSSS5        [        R                  " SW5      (       a  SU;   =(       a    S	U;   $ g
! , (       d  f       N<= f)zDetect whether the class supports setting its attention implementation dynamically. It is an ugly check based on
opening the file, but avoids maintaining yet another property flag.
N__file__Frr   r   zclass \w+Attention\(nn.Module\)eager_attention_forwardz&ALL_ATTENTION_FUNCTIONS.get_interface(T)
r  modulesr   r   r   r  r  r(  ri  rj  rO  class_module
class_filer6  codes        r   _can_set_attn_implementation,PreTrainedModel._can_set_attn_implementation  s    
 {{s~~6w|Z'H'H!**
*cG4668D 5 997>>,4i9aei9ii  54s   B
B+c                    [         R                  R                  U R                  5      nUb  [	        US5      (       d  gUR
                  n[        USSS9 nUR                  5       nSSS5        SU;   $ ! , (       d  f       SW;   $ = f)zDetect whether the class supports setting its experts implementation dynamically. It is an ugly check based on
opening the file, but avoids maintaining yet another property flag.
Nr  Fr  r   r   z@use_experts_implementation)r  r!  r   r   r   r  r  r(  r"  s        r   r  /PreTrainedModel._can_set_experts_implementation  s{    
 {{s~~6w|Z'H'H!**
*cG4668D 5 -44 54 -44s   A44
Bc                 |   [        U[        5      (       d  UO%UR                  SU R                  R                  5      nX0R                  R                  :w  ac  U R                  5       (       d-  [        R                  U R                  R                   S35        O!U R                  USUS9nX0R                  l        U R                  5        GH`  nX@Ld  M
  [        U[        5      (       d  M!  UR                  R                  U R                  R                  :w  d  MQ  [        UR                  S5      (       a  Mn  UR                  5       (       d-  [        R                  UR                  R                   S35        OUn[        U[        5      (       ag  U R                  R                   HM  n[!        U R                  U5      UR                  L d  M(  UR                  XdR                  R                  5      n  O   UR#                  U5      nXTR                  l        SUR                  l        GMc     U R                  R                   H  n[!        U R                  U5      =nc  M  [        U[        5      (       d  UOUR                  XgR                  5      n[        US5      (       d~  XWR                  :w  ao  US/[&        R)                  5       -   ;  a/  [+        S	U S
U S[-        [&        R)                  5       5       35      eXWl        [        R                  SU SU S35        M  [        US5      (       d  M  U?M     g)aS  
Set the requested `attn_implementation` for this model.

Args:
    attn_implementation (`str` or `dict`):
        The attention implementation to set for this model. It can be either a `str`, in which case it will be
        dispatched to all submodels if relevant, or a `dict` where keys are the sub_configs name, in which case each
        submodel will dispatch the corresponding value.
    allow_all_kernels (`bool`, optional):
        Whether to load kernels from unverified hub repos, if `attn_implementation` is a custom kernel outside
        of the `kernels-community` hub repository.
r  z does not support setting its attention implementation dynamically, because it does not follow the functional approach based on AttentionInterface (see https://huggingface.co/docs/transformers/en/attention_interface)FrV  _attn_was_changedTNr  r  z"` is not supported for zd. The only possible arguments are "eager" (manual attention implementation)or one of the following: z8We set the attention implementation for the sub-config `z` to `z` without finding the associated sub-model. For this reason we could not check if the model supports it. You may encounter undefined behavior.)rm  r   r   r  r_  r&  r  r  r  r   r^  ra  r!  r   r   r  rH  r  r+  r  r  r   r   )r   r  rX  requested_implementationrL  sub_implementationsubconfig_key	subconfigs           r   set_attn_implementation'PreTrainedModel.set_attn_implementation  s    1488  $((T[[-M-MN 	! ${{'G'GG4466~~../ 0\ \ ,0+U+U,EUf ,V ,( =U9 I %y/::$$..$++2G2GG	 0 02EFF !==??NN$..778 9` ` *B&!"5t<<-1[[-D-DM&t{{MBiFVFVV5H5L5L$13C3C3X3X6" 2 !& .E *3)R)RSe)f&EW$$B 6:	  2E (J "[[44M$T[[-@@	M &&94@@ -,00@^@^_ #  	+>??*.L.LL)'=T=_=_=a1aa(>?Q>RRjkxjy z88<=T=_=_=a8b7ce 
 ?Q;NNRS`Raaghzg{ |@ @ y*=>>%79 5r   c                    [        U[        5      (       d  UO%UR                  SU R                  R                  5      nX R                  R                  :w  a!  U R                  U5      nX R                  l        U R                  5        H  nX0Ld  M	  [        U[        5      (       d  M   UR                  R                  U R                  R                  :w  d  MP  Un[        U[        5      (       ag  U R                  R                   HM  n[        U R                  U5      UR                  L d  M(  UR                  XSR                  R                  5      n  O   UR                  U5      nXCR                  l        M     g)a  
Set the requested `experts_implementation` for this model.

Args:
    experts_implementation (`str` or `dict`):
        The experts implementation to set for this model. It can be either a `str`, in which case it will be
        dispatched to all submodels if relevant, or a `dict` where keys are the sub_configs name, in which case each
        submodel will dispatch the corresponding value.
r  N)rm  r   r   r  rc  rb  rd  r!  r   r  r  rH  r  )r   r  r,  rL  r-  r.  s         r   set_experts_implementation*PreTrainedModel.set_experts_implementationj  s2    4d;; #'++B0S0ST 	! ${{'J'JJ'+'T'TUm'n$;SKK8 I %y/::$$..$++2G2GG &>"4d;;)-)@)@"4;;>)BRBRR1G1K1K -/?/?/W/W2. " *A &/%Q%QRd%e"DV  A) (r   c                 4   S n/ n[        5       nSnU R                  5        H  n[        U[        5      (       a  [	        US5      (       d  M+   UR                  5       nUb  [	        US5      (       d  MR  [        U5      nXs;   a  Md  UR                  U5        UR                  UR                  U5      5        SnM     X l        U(       a
  US   U l        U(       d-  [        R                  U R                  R                    S35        gg! [         a     M  f = f)	z
Enables the gradients for the input embeddings. This is useful for fine-tuning adapter weights while keeping
the model weights fixed.
c                 &    UR                  S5        g NT)requires_grad_)rC  inputoutputs      r   make_inputs_require_gradsMPreTrainedModel.enable_input_require_grads.<locals>.make_inputs_require_grads  s    !!$'r   Fr  Nregister_forward_hookTr   a   does not expose input embeddings. Gradients cannot flow back to the token embeddings when using adapters or gradient checkpointing. Override `get_input_embeddings` to fully support those features, or set `_input_embed_layer` to the attribute name that holds the embeddings.)ra  r!  rm  r   r   r  r  rn  rS  rQ  r=  _require_grads_hooks_require_grads_hookr  r  r  r   )r   r;  hooksseen_modulesfound_embeddingsrC  input_embeddingsembedding_ids           r   enable_input_require_grads*PreTrainedModel.enable_input_require_grads  s   	( u llnFv77GFLb<c<c#)#>#>#@   'w7GI`/a/a./L+\*LL)??@YZ[#% %( %*!',QxD$>>**+ ,w w  % ' s   D		
DDc                     [        U SS5      nU(       d  gU H  nUR                  5         M     / U l        [        U S5      (       a  U ?gg)z$
Removes the `_require_grads_hook`.
r>  Nr?  )rH  remover>  r   r?  )r   r@  hooks      r   disable_input_require_grads+PreTrainedModel.disable_input_require_grads  sO     4d;DKKM  %'!4.//( 0r   modalityc                 R   US;   a  / SQnO!US:X  a  / SQnOUc  SS/nO[        SU 35      eU H   n[        X5      (       d  M  [        X5      s  $    U R                  U LaE  [        U R                  S5      (       a*  U R                  R	                  US	9nX@R                  :w  a  U$ U $ )
aA  
Best-effort lookup of the *encoder* module. If provided with `modality` argument,
it looks for a modality-specific encoder in multimodal models (e.g. "image_encoder")
By default the function returns model's text encoder if any, and otherwise returns `self`.

Possible `modality` values are "image", "video" and "audio".
imagevideovision_towervisualvision_modelvision_encoderimage_toweraudio)audio_toweraudio_encoderspeech_encodertext_encoderencoderHUnnrecognized modality, has to be "image", "video" or "audio" but found get_encoderrL  )r   r   rH  r  r^  )r   rL  possible_module_namesrK  base_encoders        r   r^  PreTrainedModel.get_encoder  s     ))$o! $V!%3Y$?!ghpgqrss)Dt""t** * ??$&74??M+R+R??666IL .## r   c                 @   US;   a  / SQnO!US:X  a  SS/nOUc  SS/nO[        S	U 35      eU H!  n[        X5      (       d  M  [        XU5          g   U R                  U La<  [        U R                  S
5      (       a  U R                  R	                  XS9  gXl        gg)zC
Symmetric setter. Mirrors the lookup logic used in `get_encoder`.
rN  rQ  rW  rX  rY  Nr[  r\  r]  set_encoderr_  )r   r   r  r  rd  re  )r   r\  rL  r`  rK  s        r   rd  PreTrainedModel.set_encoder  s     ))$o! %2O$D!%3Y$?!ghpgqrss)Dt""G, *
 ??$&t66++G+G$
	 'r   c                     / SQnU H   n[        X5      (       d  M  [        X5      s  $    U R                  U La5  [        U R                  S5      (       a  U R                  R                  5       $ U $ )ad  
Best-effort lookup of the *decoder* module.

Order of attempts (covers ~85 % of current usages):

1. `self.decoder/self.language_model/self.text_model`
2. `self.base_model`                  (many wrappers store the decoder here)
3. `self.base_model.get_decoder()`    (nested wrappers)
4. fallback: raise for the few exotic models that need a bespoke rule
)language_model
text_modeldecodertext_decoderget_decoder)r   rH  r  rk  )r   r`  rK  s      r   rk  PreTrainedModel.get_decoder	  sc     !\)Dt""t** * ??$&74??M+R+R??..00 r   c                     / SQnU H!  n[        X5      (       d  M  [        XU5          g   U R                  U La>  [        U R                  S5      (       a  U R                  R                  U5        gXl        gg)zC
Symmetric setter. Mirrors the lookup logic used in `get_decoder`.
)rg  rh  ri  Nset_decoder)r   r  r  rn  re  )r   ri  r`  rK  s       r   rn  PreTrainedModel.set_decoder"	  sh    
 !L)Dt""G, *
 ??$&t66++G4$
	 'r   c           	         [        U R                  S5      (       a   U R                  R                  =(       d    SnO[        U R                  S5      (       a  U R                  R                  nOW[        U R                  S5      (       a  U R                  R                  nO%[        U R                  R                  5       SS5      n[        U[        R                  [        R                  [        R                  [        R                  [        R                  [        R                  45      (       ak  [        USS5      b.  [        R                   " UR"                  R%                  5       SUS9  UR&                  b!  [        R(                  " UR&                  5        gg[        U[        R*                  5      (       ay  [        R                   " UR"                  SUS9  UR,                  bK  [        UR"                  S	S
5      (       d.  [        R(                  " UR"                  UR,                     5        ggg[        U[        R.                  5      (       a  UR1                  5         g[        U[        R2                  [        R4                  [        R6                  [        R8                  45      (       d4  SUR:                  R<                  ;   d  SUR:                  R<                  ;   a  [        USS5      b   [        R>                  " UR"                  5        [        USS5      b   [        R(                  " UR&                  5        [        USS5      ba  [        R(                  " UR@                  5        [        R>                  " URB                  5        [        R(                  " URD                  5        ggSUR:                  R<                  ;   a  [        US5      (       a  URF                  S:w  a  [H        URF                     OURJ                  nU" UR                  5      u  pE[        RL                  " URN                  U5        [        RL                  " URP                  U5        ggg)aD  
Initialize the weights. This is quite general on purpose, in the spirit of what we usually do. For more complex
initialization scheme, it should be overridden by the derived `PreTrainedModel` class. In case a model adds an explicit
`nn.Parameter`, this method should also be overridden in order to initialize it correctly.
initializer_rangeg{Gz?init_stdinitializer_factorweightNg        meanstd_is_hf_initializedF	LayerNormRMSNormbiasrunning_meanRotaryEmbeddingoriginal_inv_freqdefault))r   r  rq  rr  rs  rH  get_text_configrm  r   LinearConv1dConv2dConv3dConvTranspose1dConvTranspose2dr  normal_rt  floatr{  zeros_r  padding_idxMultiheadAttention_reset_parameters	GroupNormBatchNorm1dBatchNorm2dBatchNorm3dr  r   ones_r|  running_varnum_batches_tracked	rope_typerO   compute_default_rope_parameterscopy_inv_freqr~  )r   rC  rw  rope_fnbuffer_valuerX  s         r   _init_weightsPreTrainedModel._init_weights3	  s    4;; 344++//74CT[[*--++&&CT[["677++00C $++5579LdSCfryy"))RYY		2K]K]_a_q_qrssvx.:V]]002#F{{&FKK( '--LLSc:!!-gfmmMach6i6iFMM&*<*<=> 7j- 5 566$$& vbnnbnnbnn]^^f..777F,,555 vx.:

6==)vvt,8FKK(v~t4@F//0

6--.F667 A
 &"2"2";";;Pc@d@d ##y0 $F$4$45;; 
 &fmm4OLJJv5JJv//> Ae;r   r  c                    [        USS5      (       a  gU(       aP  [        S UR                  SS9 5       5      (       a,  [        S UR                  SS9 5       5      (       a  SUl        gU R                  U5        SUl        g)z=
Initialize the weights if they are not already initialized.
rx  FNc              3   <   #    U  H  n[        US S5      v   M     g7f)rx  FNrH  r  s     r   r   6PreTrainedModel._initialize_weights.<locals>.<genexpr>y	  s     nMmEGE#7??Mms   )recursec              3   F   #    U  H  nUc  M  [        USS5      v   M     g 7f)Nrx  Fr  )r   buffers     r   r   r  z	  s)      ;F = 4e<<;s   !!T)rH  allr  buffersrx  r  )r   rC  r  s      r   _initialize_weights#PreTrainedModel._initialize_weightsm	  s     6/77 nVM^M^glM^Mmnnn $nnUn;   )-F%6"$(!r   c                 v  ^ [        [        R                  R                  S5      (       db  S[        R                  S[        [        R                  [
        /S4   S[
        4U4S jjm[        [        R                  R                  ST5        [        U S5      nU" U R                  U R                  5       5        g)a  
This is equivalent to calling `self.apply(self._initialize_weights)`, but correctly handles composite models.
This function dynamically dispatches the correct `init_weights` function to the modules as we advance in the
module graph along the recursion. It can handle an arbitrary number of sub-models. Without it, every composite
model would have to recurse a second time on all sub-models explicitly in the outer-most `_init_weights`, which
is extremely error prone and inefficient.
smart_applyrC  r  Nr  c                    > U R                  5        H6  n[        U[        5      (       a  T" X3R                  U5        M-  T" X1U5        M8     U" X5        U $ r   )childrenrm  r   r  )rC  r  r  childr  s       r   r  7PreTrainedModel.initialize_weights.<locals>.smart_apply	  sJ    #__.E!%99#E+D+DnU#E~> / 6*r   )
r   r   r   r'  r   r   r  rH  r  r  )r   smart_apply_fnr  s     @r   initialize_weights"PreTrainedModel.initialize_weights	  s     uxx66BII 8RYY<Mt<S3T fj  EHHOO]K@ !}5t//1D1D1FGr   ro  c                   ^^^ U(       a  0 nU R                  SS9 Hq  u  p4[        U[        5      (       d  M  UR                  SS9nUS:w  a/  UR	                  5        VVs0 s H  u  pgU SU 3U SU 3_M     nnnUR                  U5        Ms     U$ U R                  n[        U R                  SS5      n	U	(       d  0 $ Uc  0 $ [        R                  " S5      m[        U4S jUR                  5       UR                  5       -   5       5      (       a  UR                  5       $ 0 nU R                  SS9 VV
s1 s H  u  pjUiM	     sn
nU R!                  SS9 VV
s1 s H  u  pjUiM	     sn
n-  nUR	                  5        H  u  mmS	T-   mS	T-   m[#        [%        U4S
 jU5      5      n[#        [%        U4S jU5      5      n['        U5      S:  a*  ['        U5      S:  a  ['        U5      ['        U5      -  S:w  a  [)        ST ST SU SU 35      e[+        U[-        U5      5       H$  u  pXR                  5       ;   a  X/   X.'   M   XU'   M&     M     U$ s  snnf s  sn
nf s  sn
nf )a	  
Return the expanded tied weight keys (in case they contain modules or regex patterns) for only the current
model, or recursively for all submodels if `all_submodels=True` (i.e. it will re-check the config values for all
submodels).

For almost all models, we only require to tie the embeddings, so the model has an internal property
`_tied_weights_keys = {"lm_head.weight": "model.embed_tokens.weight"}`. In this case, the mapping is already
"expanded", i.e. it already contains full parameters, and this function will simply return a copy of the property.
For more complex patterns, e.g. for `DFineForObjectDetection`, we have the following attribute
```
_tied_weights_keys = {
    r"bbox_embed.(?![0])\d+": "bbox_embed.0",
    r"class_embed.(?![0])\d+": "class_embed.0",
    "model.decoder.class_embed": "class_embed",
    "model.decoder.bbox_embed": "bbox_embed",
}
```
In this case, the function looks up all the model's parameters and buffers, and matches all the params,
returning the following:
```
{
    'bbox_embed.1.layers.0.bias': 'bbox_embed.0.layers.0.bias',
    'bbox_embed.1.layers.0.weight': 'bbox_embed.0.layers.0.weight',
    'bbox_embed.1.layers.1.bias': 'bbox_embed.0.layers.1.bias',
    'bbox_embed.1.layers.1.weight': 'bbox_embed.0.layers.1.weight',
    'bbox_embed.1.layers.2.bias': 'bbox_embed.0.layers.2.bias',
    'bbox_embed.1.layers.2.weight': 'bbox_embed.0.layers.2.weight',
    'bbox_embed.2.layers.0.bias': 'bbox_embed.0.layers.0.bias',
    'bbox_embed.2.layers.0.weight': 'bbox_embed.0.layers.0.weight',
    ...
    'class_embed.1.bias': 'class_embed.0.bias',
    'class_embed.1.weight': 'class_embed.0.weight',
    'class_embed.2.bias': 'class_embed.0.bias',
    'class_embed.2.weight': 'class_embed.0.weight',
    ...
    'model.decoder.class_embed.0.bias': 'class_embed.0.bias',
    'model.decoder.class_embed.0.weight': 'class_embed.0.weight',
    'model.decoder.class_embed.1.bias': 'class_embed.0.bias',
    'model.decoder.class_embed.1.weight': 'class_embed.0.weight',
    ...
    'model.decoder.bbox_embed.0.layers.0.bias': 'bbox_embed.0.layers.0.bias',
    'model.decoder.bbox_embed.0.layers.0.weight': 'bbox_embed.0.layers.0.weight',
    'model.decoder.bbox_embed.0.layers.1.bias': 'bbox_embed.0.layers.1.bias',
    'model.decoder.bbox_embed.0.layers.1.weight': 'bbox_embed.0.layers.1.weight',
    ...
}
```
i.e. all the parameters matching the regex and modules patterns in `_tied_weights_keys`
F)remove_duplicatern  r  rF  tie_word_embeddingsz ^[A-Za-z0-9_\.]+(weight)|(bias)$c              3   F   >#    U  H  nTR                  U5      v   M     g 7fr   )r  )r   r4  common_case_regexs     r   r   APreTrainedModel.get_expanded_tied_weights_keys.<locals>.<genexpr>	  s"     _3^a &&q))3^s   !^c                 2   > [         R                  " TU 5      $ r   rh  )xsource_names    r   r  @PreTrainedModel.get_expanded_tied_weights_keys.<locals>.<lambda>	      BIIk14Mr   c                 2   > [         R                  " TU 5      $ r   rh  )r  target_names    r   r  r  	  r  r   r   zAThere is an issue with your definition of `tie_weights_keys` for :z. We found z to tie into )rG  rm  r   rv  r)  rx  rE  rH  r  ri  compiler  r+  r   rs  r   named_buffersr  filterr   r   r  r   )r   ro  expanded_tied_weightsprefixrL  submodel_tied_weightsr4  r5  tied_mappingr  rX  all_param_namessource_paramstarget_paramstarget_nsource_nr  r  r  s                   @@@r   rv  .PreTrainedModel.get_expanded_tied_weights_keys	  s   d $&!%)%7%7%7%O!i99,5,T,Tch,T,i)|I^IdIdIf1IfvhasOxq_<If . 1 *001FG &P )(.. &dkk3H%P"I!I JJ'JK_<3D3D3FI\I\I^3^___$$&& !#)-)>)>PU)>)VW)V1)VW,,e,D[
D$!AD[
 
 )5(:(:(<$K+K+K"6*M#_`M"6*M#_`M&*=)A-}%M(::a? WXcWddefqer s  -mM?L 
 '*-}9M&N" 99;;6K6U)3 7?(3 'O! )=6 %$o12 X [
s    I7IITmissing_keysrecompute_mappingc                    U(       d  U R                   nOU R                  SS9n[        UR                  5       5      n[	        U5       GH{  u  nu  pVUb  SnXa;  nXQ;  n	U(       a{  U	(       at  [
        R                  " U R                  U5      U R                  U5      5      (       d9  [        R                  SU SU S35        U R                   R                  U5        M  OcU(       d
  U	(       a  XepeORU(       dK  U	(       dD  X4S-   S  H  u  pX:X  d  M  X;  nU(       d  M  U
n  O    Sn[        R                  S	U SU S
35        U R                  U5      nSU;   a&  UR                  SS5      u  pU R                  U5      nOUnU n[        UX5        U R                  UU5        Uc  GM`  W(       d  GMj  UR!                  U5        GM~     g)a  
Tie the model weights. If `recompute_mapping=False` (default when called internally), it will rely on the
`model.all_tied_weights_keys` attribute, containing the `{target: source}` mapping for the tied params.
If `recompute_mapping=True`, it will re-check all internal submodels and their config to determine the params
that need to be tied. This is the default when `model.tie_weights()` is called on its own, outside of
`__init__`, and `from_pretrained`, in case the config values were changed somewhere.

Note that during `from_pretrained`, tying is *symmetric*: if the mapping says "tie target -> source" but
`source` is missing in the checkpoint while `target` exists, we *swap* source and target so we can still
tie everything to the parameter that actually exists.
Trn  NzDThe tied weights mapping and config for this model specifies to tie z to z, but both are present in the checkpoints with different values, so we will NOT tie them. You should update the config with `tie_word_embeddings=False` to silence this warning.r   FzYThis checkpoint seem corrupted. The tied weights mapping for this model specifies to tie zk, but both are absent from the checkpoint, and we could not find another related tied weight for those keysrF  )rq  rv  r   r)  	enumerater   equalrp  r  r  rT  get_parameter_or_bufferr  get_submoduler  _adjust_biasdiscard)r   r  r  r|  itarget_param_namesource_param_nameremove_from_missingsource_is_theretarget_is_theretarget_backupsource_backuptarget_backup_is_theresource_paramparent_namerK  r  s                    r   r  PreTrainedModel.tie_weights
  s    !22I;;$;OI*+	9B99M5A5!'&*#"3"G"3"G # !;;t'9'9:K'LdN`N`arNsttbctbuuy01 2 22667HI  u )_;L'8(8Aa%'8J4 )=5B5V2  654A 1 % 9K  /4+w016G5H I__  778IJL''$5$<$<S!$D!++K8(FD/fl3',?,?$$%67} :Nr   c                    [        USS 5      b  [        US5      (       az  UR                  R                  n[        R
                  R                  UR                  R                  SUS   UR                  R                  S   -
  4SS5      UR                  l        [        US5      (       a$  [        US5      (       a  UR                  Ul
        g g g )Nr{  rt  r   constantout_featuresnum_embeddings)rH  r   rt  r  r   
functionalpadr{  datar  r  )r   output_embeddingsrC  weight_shapes       r   r  PreTrainedModel._adjust_biasi
  s    $fd3?GL]_gDhDh,3399L*,--*;*;!&&++LO&7&<&<&B&B1&EEF	+""' $n55'BRTd:e:e-=-L-L* ;f5r   new_num_tokenspad_to_multiple_ofmean_resizingc                    U R                  XU5      nUc  Uc  U$ [        U S5      =(       a    U R                  SLn[        5       (       aR  U(       dK  SSKnUR
                  R                  UR                  SS9   UR                  R                  S   nSSS5        OUR                  R                  S   nWU R                  R                  5       l        Xpl        U R                  5         U$ ! , (       d  f       NE= f)ad  
Resizes input token embeddings matrix of the model if `new_num_tokens != config.vocab_size`.

Takes care of tying weights embeddings afterwards if the model class has a `tie_weights()` method.

Arguments:
    new_num_tokens (`int`, *optional*):
        The new number of tokens in the embedding matrix. Increasing the size will add newly initialized
        vectors at the end. Reducing the size will remove vectors from the end. If not provided or `None`, just
        returns a pointer to the input tokens `torch.nn.Embedding` module of the model without doing anything.
    pad_to_multiple_of (`int`, *optional*):
        If set will pad the embedding matrix to a multiple of the provided value.If `new_num_tokens` is set to
        `None` will just pad the embedding to a multiple of `pad_to_multiple_of`.

        This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
        `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128. For more
        details about this, or help on choosing the correct value for resizing, refer to this guide:
        https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc
    mean_resizing (`bool`):
        Whether to initialize the added embeddings from a multivariate normal distribution that has old embeddings' mean and
        covariance or to initialize them with a normal distribution that has a mean of zero and std equals `config.initializer_range`.

        Setting `mean_resizing` to `True` is useful when increasing the size of the embeddings of causal language models,
        where the generated tokens' probabilities won't be affected by the added embeddings because initializing the new embeddings with the
        old embeddings' mean will reduce the kl-divergence between the next token probability before and after adding the new embeddings.
        Refer to this article for more information: https://nlp.stanford.edu/~johnhew/vocab-expansion.html

Return:
    `torch.nn.Embedding`: Pointer to the input tokens Embeddings Module of the model.
Nr   r   modifier_rank)_resize_token_embeddingsr   r   r.   r  r  GatheredParametersrt  r  r  r  
vocab_sizer  )r   r  r  r  model_embedsr   r  r  s           r   resize_token_embeddings'PreTrainedModel.resize_token_embeddingsu
  s    H 44^Yfg!&8&@ t^4V9J9JRV9V%''22<3F3FVZ2[)0066q9
 \[ &,,2215J 4>##%0$ 	 \[s   :C--
C;c                    U R                  5       nU R                  XAX#5      n[        US5      (       a  UR                  n[	        XV5        UR
                  R                  nUR                  U5        U R                  U5        [        U S5      =(       a    U R                  S LnUbz  [        5       (       aR  U(       dK  SS Kn	U	R                  R                  UR
                  S S9   UR
                  R                  S   nS S S 5        OUR
                  R                  S   nU R                  5       b  U R                  5       n
[!        U
["        R$                  R&                  5      (       a  U R                  XUS9nOU R)                  XUS9n[        U
S5      (       a  U
R                  n[	        X5        U
R
                  R                  nUR                  U5        U R+                  U5        U R                  5       $ ! , (       d  f       N= f)N_hf_hookr   r   r  )r  )r  _get_resized_embeddingsr   r  r|   rt  r  r8  r  r   r.   r  r  r  r  r!  rm  r   r   r  _get_resized_lm_headr%  )r   r  r  r  old_embeddingsr$  rI  old_embeddings_requires_gradr   r  old_lm_headnew_lm_headold_lm_head_requires_grads                r   r  (PreTrainedModel._resize_token_embeddings
  s   22455,>
 >:..!**D~4'5'<'<'J'J$%%&BC!!.1t^4V9J9JRV9V ))++L ^^66~7L7L\`6a%3%:%:%@%@%CN ba "0!6!6!<!<Q!? %%'3446K+uxx'9'9::"::;fs:t"77cp7q{J//"++";5(3(:(:(H(H%&&'@A&&{3((**' bas   %G<<
H
r  c           	      	   UbN  [        U[        5      (       d  [        SU S35      eUc  UR                  R                  S   nX#-   S-
  U-  U-  nO[
        R                  SU S35        Uc  U$ [        U S5      =(       a    U R                  SLn[        5       (       aU  U(       dN  SSK
nUR                  R                  UR                  SS	9   UR                  R                  5       u  pxSSS5        OUR                  R                  5       u  pxWU:X  a  [        5       (       d  X!l        U$ [        U[        R                   5      (       d:  [#        S
[%        U5       S[        R                    S[        R                    S35      e[        R                   " UWUR                  R&                  UR                  R(                  S9n	X':  a  U(       d  U R+                  U	5        OX':  a  U(       a  [
        R-                  S5        X'-
  n
[        5       (       aL  U(       dE  SSK
nUR                  R                  UR                  /SS	9   U R/                  XXz5        SSS5        OU R/                  XXz5        [1        Xr5      n[        5       (       a  U(       d|  SSK
nUR                  U	R                  /nUR                  R                  USS	9   UR                  R2                  SU2SS24   U	R                  R2                  SU2SS24'   SSS5        O<UR                  R2                  SU2SS24   U	R                  R2                  SU2SS24'   [        5       (       a  U(       d  SSK
nUR                  U	R                  /nUR                  R                  USS	9   U	R                  Ul        U	R                  R2                  R                  S   Ul        UR4                  b  US-
  UR4                  :  a  SUl        SSS5        U$ U	R                  R2                  UR                  l        U	R                  R2                  R                  S   Ul        UR4                  b  US-
  UR4                  :  a  SUl        U$ ! , (       d  f       GNz= f! , (       d  f       GN'= f! , (       d  f       GN`= f! , (       d  f       U$ = f)a  
Build a resized Embedding Module from a provided token Embedding Module. Increasing the size will add newly
initialized vectors at the end. Reducing the size will remove vectors from the end

Args:
    old_embeddings (`torch.nn.Embedding`):
        Old embeddings to be resized.
    new_num_tokens (`int`, *optional*):
        New number of tokens in the embedding matrix.

        Increasing the size will add newly initialized vectors at the end. Reducing the size will remove
        vectors from the end. If not provided or `None`, just returns a pointer to the input tokens
        `torch.nn.Embedding` module of the model without doing anything.
    pad_to_multiple_of (`int`, *optional*):
        If set will pad the embedding matrix to a multiple of the provided value. If `new_num_tokens` is set to
        `None` will just pad the embedding to a multiple of `pad_to_multiple_of`.

        This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
        `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128. For more
        details about this, or help on choosing the correct value for resizing, refer to this guide:
        https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc
    mean_resizing (`bool`):
        Whether to initialize the added embeddings from a multivariate normal distribution that has old embeddings' mean and
        covariance or to initialize them with a normal distribution that has a mean of zero and std equals `config.initializer_range`.

        Setting `mean_resizing` to `True` is useful when increasing the size of the embeddings of causal language models,
        where the generated tokens' probabilities will not be affected by the added embeddings because initializing the new embeddings with the
        old embeddings' mean will reduce the kl-divergence between the next token probability before and after adding the new embeddings.
        Refer to this article for more information: https://nlp.stanford.edu/~johnhew/vocab-expansion.html


Return:
    `torch.nn.Embedding`: Pointer to the resized Embedding Module or the old Embedding Module if
    `new_num_tokens` is `None`
Nz5Asking to pad the embedding matrix to a multiple of `z@`, which is not and integer. Please make sure to pass an integerr   r   zYou are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embedding dimension will be a.  . This might induce some performance reduction as *Tensor Cores* will not be available. For more details about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tcr   r  zOld embeddings are of type , which is not an instance of zj. You should either use a different resize function or make sure that `old_embeddings` are an instance of rF  r  zThe new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`)rm  r   r   rt  r  r  r  r   r   r.   r  r  r  r#  r  r   r  r\  ro  r   r   r  r  (_init_added_embeddings_weights_with_meanr  r  r  )r   r  r  r  r  r   r  old_num_tokensold_embedding_dimr$  added_num_tokensnparamss                r   r  'PreTrainedModel._get_resized_embeddings
  sf   V )0#66 KL^K_  ``  a  %!/!6!6!<!<Q!?-BQFK]]assNKK&&4%5 6DD !!!t^4V9J9JRV9V%''22>3H3HX\2]4B4I4I4N4N4P1 ^] 1?0E0E0J0J0L-N^+4N4P4P,:)!!.",,77-d>.B-CCabdbnbnao pLL>$  !((// ''--	
 *=~., =  .>)++L ^^668M8M7N^b6cAA& dc
 =="N /%''$++^-B-BCF2262K4B4I4I4N4NrPQrSTu4U%%**2A2q51 LK 1?0E0E0J0J2A2q50QN!!&&rr1u-
 &''$++^-B-BCF2262K(6(=(=%0>0E0E0J0J0P0PQR0S- "--9~PQ?QUcUoUo>o15N. L  *8)>)>)C)CN!!&,:,A,A,F,F,L,LQ,ON)))5>A;MQ_QkQk:k-1*w ^]^ dc$ LK LK s1   Q<R=R A!R2<
R
R 
R/2
Sr  
transposedc           	      h   Uc  U$ [        U S5      =(       a    U R                  SLn[        5       (       a  U(       d}  SSKnUR                  R                  UR                  SS9   U(       d  UR                  R                  5       O'UR                  R                  5       R                  5       u  pxSSS5        OKU(       d  UR                  R                  5       O'UR                  R                  5       R                  5       u  pxWU:X  a  [        5       (       d  X!l	        U$ [        U[        R                  5      (       d:  [        S[        U5       S[        R                   S[        R                   S35      eU(       d  WU4OUW4n	UR                  SLn
[        R                  " U	U
UR                  R                   UR                  R"                  S	.6nX':  a  U(       d  U R%                  U5        OX':  a  U(       a  [&        R)                  S
5        X'-
  n[        5       (       a~  U(       dw  SSKnUR                  /nU
(       a  XR                  /-  nUR                  R                  USS9   U R+                  XXX5        U
(       a  U R-                  XU5        SSS5        O,U R+                  XXX5        U
(       a  U R-                  XU5        [/        Xr5      n[        5       (       ap  U(       di  SSKnUR                  UR                  UR                  UR                  /nUR                  R                  USS9   U R1                  XXU
5        SSS5        OU R1                  XXU
5        [3        USS5        U$ ! , (       d  f       GN= f! , (       d  f       N= f! , (       d  f       N@= f)a  
Build a resized Linear Module from a provided old Linear Module. Increasing the size will add newly initialized
vectors at the end. Reducing the size will remove vectors from the end

Args:
    old_lm_head (`torch.nn.Linear`):
        Old lm head liner layer to be resized.
    new_num_tokens (`int`, *optional*):
        New number of tokens in the linear matrix.

        Increasing the size will add newly initialized vectors at the end. Reducing the size will remove
        vectors from the end. If not provided or `None`, just returns a pointer to the input tokens
        `torch.nn.Linear` module of the model without doing anything. transposed (`bool`, *optional*, defaults
        to `False`): Whether `old_lm_head` is transposed or not. If True `old_lm_head.size()` is `lm_head_dim,
        vocab_size` else `vocab_size, lm_head_dim`.
    mean_resizing (`bool`):
        Whether to initialize the added embeddings from a multivariate normal distribution that has old embeddings' mean and
        covariance or to initialize them with a normal distribution that has a mean of zero and std equals `config.initializer_range`.

        Setting `mean_resizing` to `True` is useful when increasing the size of the embeddings of causal language models,
        where the generated tokens' probabilities will not be affected by the added embeddings because initializing the new embeddings with the
        old embeddings' mean will reduce the kl-divergence between the next token probability before and after adding the new embeddings.
        Refer to this article for more information: https://nlp.stanford.edu/~johnhew/vocab-expansion.html

Return:
    `torch.nn.Linear`: Pointer to the resized Linear Module or the old Linear Module if `new_num_tokens` is
    `None`
Nr   r   r  z#Old language model head is of type r  zg. You should either use a different resize function or make sure that `old_lm_head` are an instance of rF  )r{  r   r   a  The new lm_head weights will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`rx  T)r   r   r.   r  r  r  rt  r#  r   r  rm  r   r  r\  ro  r{  r   r   r  r  r  %_init_added_lm_head_weights_with_mean"_init_added_lm_head_bias_with_meanr  !_copy_lm_head_original_to_resizedr  )r   r  r  r
  r  r   r  r  old_lm_head_dimnew_lm_head_shapehas_new_lm_head_biasr  r  r  num_tokens_to_copys                  r   r  $PreTrainedModel._get_resized_lm_headv  sL   H !t^4V9J9JRV9V%''22;3E3EUY2Z5?K&&++-[EWEWEYEYE[E`E`Eb 0 [Z 2<""'')ASASAUAUAWA\A\A^ ,N ^+4N4P4P'5$+ryy115d;6G5HHfgigpgpfq rII;a!  FP_n=VdfuUv*//t; ii%%%,,$$**	
 *={+, =  .>)++L %,,-'//00F^^66vT6R>>#/Sc ,??Zjk SR ::oO_ (;;KVfg @%''!((+*:*:K<N<NP[P`P`aF2262K66.@Nb LK
 22*<J^ 	148m [Zp SR( LKs%   %AN 2-NN# 
N
N #
N1c                    UR                   R                  R                  [        R                  5      n[        R
                  " USS9nXV-
  nUR                  U-  U-  nSn	[        R                  R                  X-  5      R                  5       n
U
(       a~  [        R                  R                  R                  XiU-  S9nUR                  U4S9R                  UR                   R                  5      UR                   R                  SU-  S 2S S 24'   g US S S 24   R!                  US5      R                  UR                   R                  5      UR                   R                  SU-  S 2S S 24'   g )Nr   r  &.>)covariance_matrix)sample_shaper<  r   )rt  r  r*  r   r   rv  Tr   positive_definitecheckr  distributionsmultivariate_normalMultivariateNormalsampler   r  )r   r  r$  r  r  old_embeddings_weightmean_embeddingsold_centered_embeddings
covarianceepsilonis_covariance_psddistributions               r   r  8PreTrainedModel._init_added_embeddings_weights_with_mean  s]    !/ 5 5 : : = =emm L**%:C"7"I,..1HH>Y
 '99??@TUYY[ ..BBUUZ3G V L FREXEX.0 FY Fb&&,,- !!&&r,<'<'>'AB  a(//0@!DGGH]H]HcHcd !!&&r,<'<'>'ABr   c                    U(       a^  UR                   R                  R                  UR                   l        UR                   R                  R                  UR                   l        U R                  XXE5        U(       a_  UR                   R                  R                  UR                   l        UR                   R                  R                  UR                   l        g g r   )rt  r  r  r  )r   r  r  r  r  r  r
  s          r   r  5PreTrainedModel._init_added_lm_head_weights_with_mean  s     &1&8&8&=&=&?&?K#&1&8&8&=&=&?&?K# 	55kP^q&1&8&8&=&=&?&?K#&1&8&8&=&=&?&?K# r   c                 Z   [         R                  " UR                  R                  S[         R                  S9n[         R
                  " UR                  R                  SS9R                  [         R                  5      nUR                  R                  SU-  S  R                  USU-  S9  g )Nr   )r  r   r  r<  r  ru  )r   rv  r{  r  r   rw  r*  r  )r   r  r  r  	bias_meanbias_stds         r   r  2PreTrainedModel._init_added_lm_head_bias_with_mean(  s    JJ{//441EMMR	99[--22;>>u}}Mb#3356>>ISWZbSb>cr   c                 t   U(       d=  UR                   R                  S U2S S 24   UR                   R                  S U2S S 24'   O<UR                   R                  S S 2S U24   UR                   R                  S S 2S U24'   U(       a1  UR                  R                  S U UR                  R                  S U& g g r   )rt  r  r{  )r   r  r  r  r
  r  s         r   r  1PreTrainedModel._copy_lm_head_original_to_resized-  s     >I>P>P>U>UViWiViklVl>mK##$7%7$7$:;>I>P>P>U>UVWYlZlYlVl>mK##A':(:':$:;  9D9I9I9N9NObPb9cK!!"5#56  r   new_num_position_embeddingsc           	      |    [        SU R                   SU R                   SU R                  R                   S35      e)Nz4`resize_position_embeddings` is not implemented for B`. To implement it, you should overwrite this method in the class  in `modeling_.py`r  r  r   )r   r/  s     r   resize_position_embeddings*PreTrainedModel.resize_position_embeddings:  sH    !B4>>BR S226..1APTP^P^PiPiOjjnp
 	
r   c           	      |    [        SU R                   SU R                   SU R                  R                   S35      e)Nz1`get_position_embeddings` is not implemented for r1  r2  r3  r4  r   s    r   get_position_embeddings'PreTrainedModel.get_position_embeddings@  sH    !??O P226..1APTP^P^PiPiOjjnp
 	
r   c                     [        5       [        R                  " S5      :w  a  U R                  5         U R	                  SS9  g)z
Initialize and tie the weights if needed. If using a custom `PreTrainedModel`, you need to implement any
initialization logic in `_init_weights`.
r  F)r  N)r   r   r   r  r  r   s    r   ry  PreTrainedModel.init_weightsF  s5     675<<;OO##%51r   c                 "   U R                   (       d"  [        U R                  R                   S35      eUc  SS0n[        R
                  " [        40 UD6nS[        R                  " U R                  5      R                  ;   nU(       d  U R                  SUS9  O8U R                  [        U R                  SS95        [        R                  S	5        U R                  S
:H  nU=(       d    [        U SS5      nU(       a  U R!                  5         gg)a  
Activates gradient checkpointing for the current model.

We pass the `__call__` method of the modules instead of `forward` because `__call__` attaches all the hooks of
the module. https://discuss.pytorch.org/t/any-different-between-model-input-and-model-forward-input/3690/2

Args:
    gradient_checkpointing_kwargs (dict, *optional*):
        Additional keyword arguments passed along to the `torch.utils.checkpoint.checkpoint` function.
z) does not support gradient checkpointing.Nuse_reentrantFr  T)enablegradient_checkpointing_funcr  V  You are using an old version of the checkpointing format that is deprecated (We will also silently ignore `gradient_checkpointing_kwargs` in case you passed it).Please update to the new format on your modeling file. To use the new format, you need to completely remove the definition of the method `_set_gradient_checkpointing` in your model.r-  _hf_peft_config_loaded)r?  r   r  r   	functoolsr
   r   rL  	signature_set_gradient_checkpointingr  applyr  r  r.  rH  rE  )r   gradient_checkpointing_kwargsr?  _is_using_old_formatneeds_embedding_gradsenable_input_gradss         r   r  -PreTrainedModel.gradient_checkpointing_enableR  s     33 7 788abcc(0-<e,D)&/&7&7
&dFc&d#  ''*;*;D<\<\*]*h*hh#,,DVq,rJJwt??tLMNNH
 !% 4 4 C2dgdD\^c6d
 ++- r   r>  r?  c                 $   Sn[        U S5      (       a  X l        Xl        SnU R                  5        H2  n[        US5      (       d  M  [	        USU5        [	        USU5        SnM4     U(       d"  [        U R                  R                   S35      eg )NFr  T_gradient_checkpointing_funcz is not compatible with gradient checkpointing. Make sure all the architecture support it by setting a boolean attribute `gradient_checkpointing` to modules of the model that uses checkpointing.)r   rM  r  r!  r  r   r  r   )r   r>  r?  is_gradient_checkpointing_setrC  s        r   rE  +PreTrainedModel._set_gradient_checkpointing|  s    (-% 41220K-*0',0)llnFv788 >@[\ 8&A04-	 % ->>**+ ,] ]  -r   c                 d   U R                   (       a|  S[        R                  " U R                  5      R                  ;   nU(       d  U R                  SS9  O8[
        R                  S5        U R                  [        U R                  SS95        [        U SS5      (       a  U R                  5         gg)z;
Deactivates gradient checkpointing for the current model.
r  F)r>  rA  r@  rB  N)r?  rL  rD  rE  r  r  r  rF  r
   rH  rJ  )r   rH  s     r   gradient_checkpointing_disable.PreTrainedModel.gradient_checkpointing_disable  s     // $+g.?.?@`@`.a.l.l#l '000>L 

74#C#C5QR41599,,. :r   c                 B    [        S U R                  5        5       5      $ )zD
Whether gradient checkpointing is activated for this model or not.
c              3   `   #    U  H$  n[        US 5      =(       a    UR                  v   M&     g7f)r  N)r   r  )r   ms     r   r   <PreTrainedModel.is_gradient_checkpointing.<locals>.<genexpr>  s)     m^lYZ7167TA<T<TT^ls   ,.)rq  r!  r   s    r   is_gradient_checkpointing)PreTrainedModel.is_gradient_checkpointing  s    
 m^b^j^j^lmmmr   save_directoryis_main_processr   push_to_hubmax_shard_sizer  r  save_peft_formatsave_original_formatc
           	      <   Ub  XzS'   [        U SS5      n[        U SS5      nUSL=(       a'    [        U[        5      =(       a    UR                  5       nUb1  U(       d*  U(       d#  [	        SUR
                  R                   S35      eU R                  b  [        S5      (       d  [        S	5      e[        R                  R                  U5      (       a  [        R                  S
U S35        g[        R                  " USS9  [        R                   " U5      nU(       a  U
R#                  SS5      nU
R#                  SUR%                  [        R                  R&                  5      S   5      nU
R#                  SS5      n[)        U4SS0U
D6R*                  nU R-                  U5      n0 nUb  UR/                  U 5      u  nnSUS'   [1        U 5      nUR2                  n[5        U5      R%                  S5      S   UR6                  l        UR8                  R:                  R=                  S5      /UR6                  l        U R@                  b  [C        XU R6                  S9  U(       Ga  U(       d  UR6                  RE                  U5        U RG                  5       (       a  URH                  RE                  U5        U(       a  [        RK                  S5        URM                  US9nU(       a;  [        RK                  S5        0 nURO                  5        H  u  nnUUSU 3'   M     UnU RQ                  5       n[S        U5      S:  a  [	        S5      eUS   nU RT                  U   nURE                  U5        Uc  URW                  5       nSn[Y        U S5      (       a  [S        [[        U R\                  R_                  5       5      5      S:  aT  S U R\                  R_                  5       ;   d  S!U R\                  R_                  5       ;   a  Sn[`        Rb                  " S"5        [d        (       a6  [f        Rh                  Rj                  Rl                   H  u  nnU" U5      nM     U Rn                  b  U Rn                   H  nUU;   d  M  UU	 M     U R                  b+  [q        X0Rr                  U Rt                  U R                  5      n[w        UU5      nU	(       a  U(       d  [y        UU5      nU(       d  [z        n[}        UU5      nO[~        nUR                  S#S$5      R                  S%S&5      n [        UU US'9n!Sn"U!R                  (       a+  S(U R                  5       0U!R                  EU!R                  S).n"[        R                  " U5       GH  n#[        R                  R                  UU#5      n$UR                  S#S*5      R                  S%S*5      n%U#R                  S#S*5      R                  S%S*5      n&[        R                  " S+5      n'U#R                  U%5      (       d  M  [        R                  R                  U$5      (       d  M  U#U!R                  ;  d  M  U(       d  M  U'R                  U&5      c  M  [        R                  " U$5        GM     [        R                  " U!R                  RO                  5       S,S-9 H  u  n(n)[        R                  R                  UU(5      n#0 n*U) HT  n+UR#                  U+5      n,U(       a&  U,R                  R                  S.:X  a  [        UU+5      n,U,R                  5       U*U+'   MV     [        U*U#US/9  A*M     U"c9  [        R                  R                  UU5      n-[        RK                  S0U- 35        O[        n.[        R                  R                  U[}        U.U5      5      n.[        U.S1S2S39 n/[        R                  " U"S4SS59S6-   n0U/R                  U05        SSS5        [        RK                  S7U S8[S        U!R                  5       S9U. S35        U(       aY  [        WU R                  US:9n1U1R                  [        R                  R                  US;5      5        U R                  UUWWUWS<9  gg! , (       d  f       N= f)=a^  
Save a model and its configuration file to a directory, so that it can be re-loaded using the
[`~PreTrainedModel.from_pretrained`] class method.

Arguments:
    save_directory (`str` or `os.PathLike`):
        Directory to which to save. Will be created if it doesn't exist.
    is_main_process (`bool`, *optional*, defaults to `True`):
        Whether the process calling this is the main process or not. Useful when in distributed training like
        TPUs and need to call this function on all processes. In this case, set `is_main_process=True` only on
        the main process to avoid race conditions.
    state_dict (nested dictionary of `torch.Tensor`):
        The state dictionary of the model to save. Will default to `self.state_dict()`, but can be used to only
        save parts of the model or if special precautions need to be taken when recovering the state dictionary
        of a model (like when using model parallelism).
    push_to_hub (`bool`, *optional*, defaults to `False`):
        Whether or not to push your model to the Hugging Face model hub after saving it. You can specify the
        repository you want to push to with `repo_id` (will default to the name of `save_directory` in your
        namespace).
    max_shard_size (`int` or `str`, *optional*, defaults to `"50GB"`):
        The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size
        lower than this size. If expressed as a string, needs to be digits followed by a unit (like `"5MB"`).

        <Tip warning={true}>

        If a single weight of the model is bigger than `max_shard_size`, it will be in its own checkpoint shard
        which will be bigger than `max_shard_size`.

        </Tip>

    variant (`str`, *optional*):
        If specified, weights are saved in the format model.<variant>.safetensors.
    token (`str` or `bool`, *optional*):
        The token to use as HTTP bearer authorization for remote files. If `True`, or not specified, will use
        the token generated when running `hf auth login` (stored in `~/.huggingface`).
    save_peft_format (`bool`, *optional*, defaults to `True`):
        For backward compatibility with PEFT library, in case adapter weights are attached to the model, all
        keys of the state dict of adapters needs to be prepended with `base_model.model`. Advanced users can
        disable this behaviours by setting `save_peft_format` to `False`.
    save_original_format (`bool`, *optional*, defaults to `True`):
        For backward compatibility with the previous versions of `transformers` you can save the checkpoint with
        its reverse mapping. The reverse mapping needs to exists even if the model was loaded from a None legacy
        checkpoint.
    kwargs (`dict[str, Any]`, *optional*):
        Additional key word arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method.
Nr  rB  Fr   zThe model is quantized with z and is not serializable - check out the warnings from the logger on the traceback to understand the reason why the quantized model is not serializable.z0.31.4z[Saving a model with tensor parallelism requires `huggingface_hub` version 0.31.4 or higher.zProvided path (z#) should be a directory, not a fileT)exist_okcommit_messagerepo_idr<  	create_prr`  r!  formatrF  r   FSDP)r  zhDetected adapters on the model, saving the model in the PEFT format, only adapter weights will be saved.)r   zTo match the expected format of the PEFT library, all keys of the state dict of adapters will be prepended with `base_model.model`.zbase_model.model.zMultiple active adapters detected, saving multiple active adapters is not supported yet. You can save adapters separately one by one by iteratively calling `model.set_adapter(adapter_name)` then `model.save_pretrained(...)`r   hf_device_mapr   diskz}Attempting to save a model with offloaded modules. Ensure that unallocated cpu memory exceeds the `shard_size` (50GB default)z.binz{suffix}.binr  z{suffix}.safetensors)filename_patternr\  total_parameters)metadata
weight_mapr  z(.*?)-\d{5}-of-\d{5}zWriting model shards)descr  )rj  zModel weights saved in wr   r   r   )indent	sort_keys
z:The model is bigger than the maximum size per checkpoint (z) and is going to be split in z^ checkpoint shards. You can find where each parameters has been saved in the index located at )r  z	README.md)ra  r  rc  )^rH  rm  rS   is_serializabler   quantization_configquant_method_tp_sizers   r  r   r   r  r  errormakedirsr  rT  r  sepr   rb  _get_files_timestampsget_state_dict_and_metadataunwrap_modelr   r   r  r  r   removeprefixarchitectures_auto_classr(   save_pretrainedre  rg  r  get_adapter_state_dictr)  active_adaptersr   peft_configr   r   ra  rf  r   r  r  IS_SAGEMAKER_MP_POST_1_10smpstatemodule_managertranslate_functionsr8  rF   r=  _device_meshr  r&   rZ   r  rW   r  r   r  r	  rj  tensor_to_filenamelistdirr  ri  r  r  filename_to_tensors	fullmatchrH  rk   tqdmr   ro  r6   
contiguoussafe_save_filerY   r  jsondumpswriterp   r,  save_upload_modified_files)2r   rY  rZ  r   r[  r\  r  r  r]  r^  r  rB  r   quantization_serializablesave_directory_pathra  rb  rc  files_timestampsrj  model_to_saver   peft_state_dictr	  r  active_adaptercurrent_peft_configis_offloaded	smp_to_hfrX  
ignore_keyr  rh  state_dict_splitindexr  full_filenameweights_no_suffixfilename_no_suffixreg
shard_filetensor_namesshard_state_dicttensor_namer   path_to_weightssave_index_filer6  content
model_cards2                                                     r   r~  PreTrainedModel.save_pretrained  s   v #7O!(/G!Ot^T:$qL+)NqS_SoSoSq 	" #,BKd.|/O/O/\/\.] ^u u  ==$-PQY-Z-Zm  77>>.))LL?>*::]^_
NT2 ii7#ZZ(8$?Njj,?,E,Ebggkk,RSU,VWG

;6I!'CDCFCKKG#99.I##/#K#KD#Q J! %T* ##%(Z%5%5c%:1%=" /<.E.E.N.N.[.[\b.c-d* 'tDKKH )$$44^D  ""//??O%~ +AAZAX
#KK ^ ')O&0&6&6&8
UEJ*;C5(AB '9!0J!%!5!5!7~&*$u  "0!2&*&6&6~&F##33NC &113J D/**C**113459$,,33554CUCUC\C\C^9^LMM: %$ #		 8 8 L L	1&z2
 !M ''3"::
+":. ;
 ==$3JtO`O`bfbobopJ 9]S
  (>1-LJ &,L'g>L4L'//GOOP^`vw=)9.
 &&/1D1D1FdJZJcJcd.AAE 

>2HGGLLBM !- 4 4VR @ H HY[ \ "*!1!1&"!=!E!EnVX!Y**45C ##$566GGNN=11$4$H$HH#OMM"45A		-(# 3( )000668?U)
$J ww||NJ?H!+#4
  FMM$6$6&$@5m[QF 170A0A0C -  ,  +XI /)
2 = ggll><HOKK1/1BCD5O ggll><Y`;abOosW=**U1EL  > KKL^L\ ] 0 D DEF G$$3#4A7 27DOOSXYJ OOBGGLLEF'' -# (   >=s   +b
bc                   > U R                   b  U R                   O/ nUR                  S/ 5      n[        U[        5      (       a  U/nU H  nXS;  d  M
  UR	                  U5        M     U(       a  X2S'   [
        TU ]  " U0 UD6$ )Nr  )r,  r   rm  r   rQ  rJ  r[  )r   r  r  r  tags_kwargsr  r  s         r   r[  PreTrainedModel.push_to_hub  sw    "&//"=t2jj,k3''&-KCC   !6Nw"D3F33r   c                     [        S U R                  5        5       5      nU(       a$  [        S U R                  5        5       5      nX#-   nU$ )a  
Get the memory footprint of a model. This will return the memory footprint of the current model in bytes.
Useful to benchmark the memory footprint of the current model and design some tests. Solution inspired from the
PyTorch discussions: https://discuss.pytorch.org/t/gpu-memory-that-model-uses/56822/2

Arguments:
    return_buffers (`bool`, *optional*, defaults to `True`):
        Whether to return the size of the buffer tensors in the computation of the memory footprint. Buffers
        are tensors that do not require gradients and not registered as parameters. E.g. mean and std in batch
        norm layers. Please see: https://discuss.pytorch.org/t/what-pytorch-means-by-buffers/120266/2
c              3   b   #    U  H%  oR                  5       UR                  5       -  v   M'     g 7fr   r=  r@  r  s     r   r   7PreTrainedModel.get_memory_footprint.<locals>.<genexpr>  s%     YGXe.."U%7%7%99GX   -/c              3   b   #    U  H%  oR                  5       UR                  5       -  v   M'     g 7fr   r  )r   bufs     r   r   r    s"     Y.3<<>C,<,<,>>.r  )sumr  r  )r   return_buffersmemmem_bufss       r   get_memory_footprint$PreTrainedModel.get_memory_footprint  s@     YtGXYYY$,,.YYH.C
r   c                   > [        U SS 5      [        R                  :X  ay  SSKJn  [
        TU ]  " U0 UD6  U R                  5        HM  n[        XC5      (       d  M  [        U5      S:  a  US   nOUR                  SS5      nUR                  U5        MO     U $ [        U SS 5      [        R                  :X  a  [        U SS5      (       a  [        S5      e[
        TU ]  " U0 UD6$ )	Nquantization_methodr   	HQQLinearr   r  is_loaded_in_8bitFzCalling `cuda()` is not supported for `8-bit` quantized models.  Please use the model as it is, since the model has already been set to the correct devices.)rH  r{   HQQhqq.core.quantizer  rJ  r  r!  rm  r   r   BITS_AND_BYTESr   )r   r  r  r  rC  r   r  s         r   r  PreTrainedModel.cuda  s    4.59K9O9OO3 GL$)&),,.f004y1}!%a!'Hf!=KK' ) K 4.59K9Z9ZZt0%88 s  w|T,V,,r   c                 \  > SU;   nU(       d,  U H&  n[        U[        R                  5      (       d  M$  Sn  O   [        U SS 5      [        R
                  :X  a  SSKJn  [        T	U ]$  " U0 UD6  U R                  5        HX  n[        Xe5      (       d  M  SU;   a  US   nOUS   nSU;   a  US   nOU(       a  WnOS nUb  Xl        UR                  U5        MZ     U $ U(       a*  [        U SS 5      [        R                  :X  a  [        S5      e[        U SS 5      [        R                  :X  a@  U(       a  [        S5      e[        U S	S
5      (       a  [!        S5      (       d  [        S5      eO1[        U SS 5      [        R"                  :X  a  U(       a  [        S5      e[        T	U ]$  " U0 UD6$ )Nr   Tr  r   r  r   zBCasting a Quark quantized model to a new `dtype` is not supported.zYou cannot cast a bitsandbytes model in a new `dtype`. Make sure to load the model using `from_pretrained` using the desired `dtype` by passing the correct `dtype` argument.r  Fz0.48zsYou need to install `pip install bitsandbytes>=0.48.0` if you want to move a 8-bit model across devices using to().zYou cannot cast a GPTQ model in a new `dtype`. Make sure to load the model using `from_pretrained` using the desired `dtype` by passing the correct `dtype` argument.)rm  r   r   rH  r{   r  r  r  rJ  r*  r!  compute_dtyper  QUARKr   r  re   GPTQ)
r   r  r  dtype_present_in_argsargr  rC  r   r   r  s
            r   r*  PreTrainedModel.to  s    !(6 1$c5;;//,0) 
 4.59K9O9OO3 GJ'',,.f006)!'!1!%a&( &w. # $ (/4,KK'# )$ K WT3H$%OSeSkSk%kabb 4.59K9Z9ZZ$ P 
 t0%88AZ[aAbAb  J  T0$7;M;R;RR$ H  wz4*6**r   c                 X   > [        U SS5      (       a  [        S5      e[        TU ]  " U6 $ )Nr   Fz`.half()` is not supported for quantized model. Please use the model as it is, since the model has already been casted to the correct `dtype`.)rH  r   rJ  halfr   r  r  s     r   r  PreTrainedModel.halfD  s6    4//I 
 7<&&r   c                 X   > [        U SS5      (       a  [        S5      e[        TU ]  " U6 $ )Nr   Fz`.float()` is not supported for quantized model. Please use the model as it is, since the model has already been casted to the correct `dtype`.)rH  r   rJ  r  r  s     r   r  PreTrainedModel.floatN  s6    4//I 
 7=$''r   r   r   r   c                    [        XR                  5      [        R                  " 5       [	        5       /nU(       a  UR                  [        5       5        [        5       (       a  SS KnU(       dl  U(       de  [        R                  S5        UR                  [        R                  " 5       UR                  R                  [        5       S9[!        5       /5        U$ U(       a/  UR                  ["        R$                  " S5      ['        5       /5        U$ UR                  ["        R$                  " S5      [        R(                  " 5       /5        U$ )Nr   r  r  r  )r   r   r  no_tie_weightsrP   rQ  r=   r.   r  r  r  rI  r  r  r  r,   r   r   r   r   meta_device_safe_creation_ops)rO  r   r   r   rX  r  r  s          r   get_init_context PreTrainedModel.get_init_contextX  s    
 +5,,?ATATAVXeXgh  !6!89%''  (:^_$$,,.!++@P@R+S')  $$ell6&:<O<Q%RS    %,,v"68Z8Z8\!]^r   c                    0 nU R                   bQ  U[        R                  :X  a=  UR                  [        R                  U R                   [        R                  5      5        U R                  ba  U[        R                  [        R                  4;   a=  UR                  [        R                  U R                  [        R                  5      5        U$ )z\Create the dtype_plan describing modules/parameters that should use the `keep_in_fp32` flag.)	r3  r   r  rx  r   fromkeysr   r4  r  )r   r   r   s      r   _get_dtype_planPreTrainedModel._get_dtype_planx  s    

 %%1eu}}6LdmmD,F,FVW ,,8Uu}}V[VdVdFe=edmmD,M,Mu}}]^r   kernel_configc                    U(       a  [        5       (       d  [        S5      eSSKJn  SSKJn  U" 5         Ubj  [        U[        5      (       aU  UR                  U 5        UR                  U 5        UR                  (       + nU" UR                  US9   SU l        SSS5        gSU l        gS	U l        g! , (       d  f       g= f)
aw  
Set whether or not to use the `kernels` library to kernelize some layers of the model.
Args:
    use_kernels (`bool`):
        Whether or not to use the `kernels` library to kernelize some layers of the model.
    kernel_config (`KernelConfig`, *optional*):
        The kernel configuration to use to kernelize the model. If `None`, the default kernel mapping will be used.
zk`use_kernels=True` requires kernels>=0.9.0. Please install the latest version with `pip install -U kernels`r   )use_kernel_mappingr   )$register_kernel_mapping_transformersN)inherit_mappingTF)rg   r   kernelsr  integrations.hub_kernelsr  rm  r^   sanitize_kernel_mappingcreate_compatible_mappinguse_local_kernelkernel_mappinguse_kernels)r   r  r  r  r  r  s         r   set_use_kernelsPreTrainedModel.set_use_kernels  s     '))  B  3V02(Z|-T-T55d; 77=
 '4&D&D"D'(D(DVef'+D$ gf $( $D gfs   B22
C r  )r  r  r   r  r  r  r  r   r   fusion_configr   rO  r   r  r   r  r  r  r   r   r  r   c                    UR                  SS5      nUR                  SS5      nUR                  SS5      nUR                  SS5      nUR                  SS5      nUR                  SS5      nUR                  S	S5      nUR                  S
S5      nUR                  SS5      nUR                  SS5      nUR                  SS5      nUR                  SS5      nUR                  SS5      nUR                  SS5      nUR                  SS5      nUR                  SS5      nUR                  S0 5      =(       d    0 R                  5       nUR                  SS5      n UR                  SS5      n!UR                  SS5      n"UR                  SS5      n#UR                  SS5      n$UR                  SS5      n%UR                  SS5      n&UR                  SS5      n'UR                  SS5      n(UR                  SS5      n)UR                  S S5      n*UR                  S!S5      n+U%b  U#c  S"n#S# H  n,UR                  U,S5      n-M     Ub  Ub  UOUnUc  S"n[        5       (       a	  U(       d  S$nUUUUUUUS%.n.0 U.ES&U0En/Ub  Uc  U"b  [        S'5      eUS":X  aC  [	        [
        R                  R                  S(S)5      5      (       a  [        R                  S*5        U#c  U$b  [        U#U$U&US+9u  nn&n$U"b  [        5       (       d  [        S,5      eUc  0 n[        UU/40 UD6u  n0nn[        U5      nS-S.US/.n1Ub  UU1S0'   [        U[        5      (       du  Ub  UOUn2U R                   n3U3c  [        U R"                   S135      eU3R$                  " U24S$U"UUS2.U.DUD6u  nn4SU4;   a  U4R                  S5        U4R                  SU5      nO%[        R&                  " U5      nUn4[)        USU5      nUU/S&'   S3U;   a  UR                  S35      Ul        S4U;   a  UR                  S45      Ul        [/        UUUU
U15      u  n5nnU"(       aK  U5b  [        S55      eUb:  [        U[0        5      (       a  S6UR3                  5       ;   d  S6U;   a  [5        S75      eU*b  U)(       d  [        R7                  S85        S$n)[9        UUU"U	U/U1U R;                  5       [)        US9S5      US:9	u  n6n7U5SLn8[=        UU6UU7XU55      u  nnU"(       a<  S;S<KJ n9  [B        RD                  " S=5         U " U5      n:SSS5        U9" U6S>   S$W:US?9S@   nXl#        Ub  [        R&                  " U5      Ul$        [)        USAS5      nUb  S;SBK%J&n;  U;" XU5        U RO                  UU8[P        U(5      n<[        R&                  " U5      n[S        U<5         U " U/UQ70 U4D6n=[U        U=5        U5b  U5RW                  U=UUU6U)SC9  SSS5        W=RY                  U5      n>[[        U=U+U55      n?[\        (       a  U&b  [_        U=U#U%U&U$5      n=Ub  [a        U=UUU55      n[c        UUU7UUUUU>U5U&U
U?U	U.USD9n@U Re                  U=UU6U@5      u  nAnBU Rg                  U=U@UA5      nAU=Ri                  5         U=Rk                  U)U*5        U=Rm                  5       (       a4  [o        U=SE5      (       a#  U"(       d  U=Rp                  " U!UUU40 U.DSU'0DUD6  Ub6  [s        [u        UR3                  5       5      5      S;:  a  [w        U=U5UUWBU5        U5b  U5U=l<        U5R{                  U=5        U0b  Ub  UUSF'   U=R}                  U0U W@USG9nAU(       a  U=WAR                  5       4$ U=$ ! , (       d  f       GNT= f! , (       d  f       GN= f)Ha:  
Instantiate a pretrained pytorch model from a pre-trained model configuration.

The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated). To train
the model, you should first set it back in training mode with `model.train()`.

The warning *Weights from XXX not initialized from pretrained model* means that the weights of XXX do not come
pretrained with the rest of the model. It is up to you to train those weights with a downstream fine-tuning
task.

The warning *Weights from XXX not used in YYY* means that the layer XXX is not used by YYY, therefore those
weights are discarded.

Parameters:
    pretrained_model_name_or_path (`str` or `os.PathLike`, *optional*):
        Can be either:

            - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
            - A path to a *directory* containing model weights saved using
              [`~PreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
            - `None` if you are both providing the configuration and state dictionary (resp. with keyword
              arguments `config` and `state_dict`).
    model_args (sequence of positional arguments, *optional*):
        All remaining positional arguments will be passed to the underlying model's `__init__` method.
    config (`Union[PreTrainedConfig, str, os.PathLike]`, *optional*):
        Can be either:

            - an instance of a class derived from [`PreTrainedConfig`],
            - a string or path valid as input to [`~PreTrainedConfig.from_pretrained`].

        Configuration for the model to use instead of an automatically loaded configuration. Configuration can
        be automatically loaded when:

            - The model is a model provided by the library (loaded with the *model id* string of a pretrained
              model).
            - The model was saved using [`~PreTrainedModel.save_pretrained`] and is reloaded by supplying the
              save directory.
            - The model is loaded by supplying a local directory as `pretrained_model_name_or_path` and a
              configuration JSON file named *config.json* is found in the directory.
    state_dict (`dict[str, torch.Tensor]`, *optional*):
        A state dictionary to use instead of a state dictionary loaded from saved weights file.

        This option can be used if you want to create a model from a pretrained configuration but load your own
        weights. In this case though, you should check if using [`~PreTrainedModel.save_pretrained`] and
        [`~PreTrainedModel.from_pretrained`] is not a simpler option.
    cache_dir (`Union[str, os.PathLike]`, *optional*):
        Path to a directory in which a downloaded pretrained model configuration should be cached if the
        standard cache should not be used.
    ignore_mismatched_sizes (`bool`, *optional*, defaults to `False`):
        Whether or not to raise an error if some of the weights from the checkpoint do not have the same size
        as the weights of the model (if for instance, you are instantiating a model with 10 labels from a
        checkpoint with 3 labels).
    force_download (`bool`, *optional*, defaults to `False`):
        Whether or not to force the (re-)download of the model weights and configuration files, overriding the
        cached versions if they exist.
    proxies (`dict[str, str]`, *optional*):
        A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
        'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
    output_loading_info(`bool`, *optional*, defaults to `False`):
        Whether or not to also return a dictionary containing missing keys, unexpected keys and error messages.
    local_files_only(`bool`, *optional*, defaults to `False`):
        Whether or not to only look at local files (i.e., do not try to download the model).
    token (`str` or `bool`, *optional*):
        The token to use as HTTP bearer authorization for remote files. If `True`, or not specified, will use
        the token generated when running `hf auth login` (stored in `~/.huggingface`).
    revision (`str`, *optional*, defaults to `"main"`):
        The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
        git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
        identifier allowed by git.

        <Tip>

        To test a pull request you made on the Hub, you can pass `revision="refs/pr/<pr_number>"`.

        </Tip>
    attn_implementation (`str`, *optional*):
        The attention implementation to use in the model (if relevant). Can be any of
            - `"eager"` (manual implementation of the attention)
            - `"sdpa"` (using [`F.scaled_dot_product_attention`](https://pytorch.org/docs/master/generated/torch.nn.functional.scaled_dot_product_attention.html))
            - `"flash_attention_2"` (using [Dao-AILab/flash-attention](https://github.com/Dao-AILab/flash-attention))
            - `"flash_attention_3"` (using [Dao-AILab/flash-attention/hopper](https://github.com/Dao-AILab/flash-attention/tree/main/hopper))
            - `"flash_attention_4"` (using [Dao-AILab/flash-attention/flash_attn/cute](https://github.com/Dao-AILab/flash-attention/tree/main/flash_attn/cute)).
        By default, if available, SDPA will be used. The default is otherwise the manual `"eager"` implementation.

        Accept HF kernel references in the form:
          <namespace>/<repo_name>[@<revision>][:<kernel_name>]

        - <namespace> and <repo_name> are any non-"/" and non-":" sequences.
        - "@<revision>" is optional (branch, tag, or commit-ish), e.g. "@main", "@v1.2.0", "@abc123".
        - ":<kernel_name>" is optional and selects a function inside the kernel repo.
        - Both options can appear together and in this order only: @revision first, then :kernel_name.
        - We intentionally allow a leading "<wrapper>|" prefix (e.g., "flash|...") because the code
          strips it before loading; '|' is not excluded in the character classes here.

        Examples that match:
          "org/model"
          "org/model@main"
          "org/model:custom_kernel"
          "org/model@v1.2.3:custom_kernel"
    experts_implementation (`str`, *optional*):
        The experts implementation to use in the model (if relevant). Can be any of:

        - `"eager"` (sequential implementation of the experts matrix multiplications).
        - `"batched_mm"` (using [`torch.bmm`](https://pytorch.org/docs/stable/generated/torch.bmm.html)).
        - `"grouped_mm"` (using [`torch.nn.functional.grouped_mm`](https://docs.pytorch.org/docs/main/generated/torch.nn.functional.grouped_mm.html)).

        By default, if the model supports it, `"grouped_mm"` will be used. The default is otherwise the manual `"eager"` implementation.

    > Parameters for big model inference

    dtype (`str` or `torch.dtype`, *optional*, defaults to `"auto"`):
        Override the default `torch_dtype` and load the model under a specific `dtype`. The different options
        are:

        1. `torch.float16` or `torch.bfloat16` or `torch.float`: load in a specified
          `dtype`, ignoring the model's `config.dtype` if one exists. If not specified
          - the model will get loaded in `torch.float` (fp32).

        2. `"auto"` - A `dtype` or `torch_dtype` entry in the `config.json` file of the model will be
          attempted to be used. If this entry isn't found then next check the `dtype` of the first weight in
          the checkpoint that's of a floating point type and use that as `dtype`. This will load the model
          using the `dtype` it was saved in at the end of the training. It can't be used as an indicator of how
          the model was trained. Since it could be trained in one of half precision dtypes, but saved in fp32.

        3. A string that is a valid `torch.dtype`. E.g. "float32" loads the model in `torch.float32`, "float16" loads in `torch.float16` etc.

        <Tip>

        For some models the `dtype` they were trained in is unknown - you may try to check the model's paper or
        reach out to the authors and ask them to add this information to the model's card and to insert the
        `dtype` or `torch_dtype` entry in `config.json` on the hub.

        </Tip>

    device_map (`str` or `dict[str, Union[int, str, torch.device]]` or `int` or `torch.device`, *optional*):
        A map that specifies where each submodule should go. It doesn't need to be refined to each
        parameter/buffer name, once a given module name is inside, every submodule of it will be sent to the
        same device. If we only pass the device (*e.g.*, `"cpu"`, `"cuda:1"`, `"mps"`, or a GPU ordinal rank
        like `1`) on which the model will be allocated, the device map will map the entire model to this
        device. Passing `device_map = 0` means put the whole model on GPU 0.

        To have Accelerate compute the most optimized `device_map` automatically, set `device_map="auto"`. For
        more information about each option see [designing a device
        map](https://hf.co/docs/accelerate/main/en/usage_guides/big_modeling#designing-a-device-map).
    max_memory (`Dict`, *optional*):
        A dictionary device identifier to maximum memory if using `device_map`. Will default to the maximum memory available for each
        GPU and the available CPU RAM if unset.
    tp_plan (`Optional[Union[dict, str]]`, *optional*):
        A torch tensor parallel plan, see [here](https://pytorch.org/tutorials/intermediate/TP_tutorial.html). Use `tp_plan="auto"` to
        use the predefined plan based on the model. If it's a dict, then it should match between module names and desired layout.
        Note that if you use it, you should launch your script accordingly with `torchrun [args] script.py`. This will be much
        faster than using a `device_map`, but has limitations.
    tp_size (`str`, *optional*):
        A torch tensor parallel degree. If not provided would default to world size.
    device_mesh (`torch.distributed.DeviceMesh`, *optional*):
        A torch device mesh. If not provided would default to world size. Used only for tensor parallel for now.
        If provided, it has to contain dimension named `"tp"` in case it's > 1 dimensional, this dimension will be used for tensor parallelism
    offload_folder (`str` or `os.PathLike`, *optional*):
        If the `device_map` contains any value `"disk"`, the folder where we will offload weights.
    offload_buffers (`bool`, *optional*):
        Whether or not to offload the buffers with the model parameters.
    quantization_config (`Union[QuantizationConfigMixin,Dict]`, *optional*):
        A dictionary of configuration parameters or a QuantizationConfigMixin object for quantization (e.g
        bitsandbytes, gptq).
    subfolder (`str`, *optional*, defaults to `""`):
        In case the relevant files are located inside a subfolder of the model repo on huggingface.co, you can
        specify the folder name here.
    variant (`str`, *optional*):
        If specified load weights from `variant` filename, *e.g.* pytorch_model.<variant>.bin.
    use_safetensors (`bool`, *optional*, defaults to `None`):
        Whether or not to use `safetensors` checkpoints. Defaults to `None`. If not specified and `safetensors`
        is not installed, it will be set to `False`.
    weights_only (`bool`, *optional*, defaults to `True`):
        Indicates whether unpickler should be restricted to loading only tensors, primitive types,
        dictionaries and any types added via torch.serialization.add_safe_globals().
        When set to False, we can load wrapper tensor subclass weights.
    disable_mmap (`bool`, *optional*):
        Whether to disable memory mapping when loading safetensors checkpoints. When `None` (default),
        it is auto-detected to `True` when the checkpoint lives on an `hf-mount` FUSE filesystem
        (used by HF Spaces/Endpoints), where mmap + parallel page-faults can deadlock. When `True`,
        files are read fully into memory and parsed with `safetensors.torch.load`. When `False`, the
        default memory-mapped loader is always used.
    fusion_config (`dict[str, bool | dict[str, Any]]`, *optional*):
        Optional fusion configuration applied before model instantiation. Each key enables a fusion family and
        its value can either be `True` to enable that fusion with default options or a dictionary of
        family-specific options. For example, `{"patch_embeddings": True}` enables patch embedding fusion.
        This should only be used as an inference optimization, as it can slightly change outputs. If omitted,
        `from_pretrained()` falls back to `config.fusion_config` when available. Refer to the fusion mapping
        guide in `docs/source/en/fusion_mapping.md` for more details.
    key_mapping (`dict[str, str], *optional*):
        A potential mapping of the weight names if using a model on the Hub which is compatible to a Transformers
        architecture, but was not converted accordingly.
    kwargs (remaining dictionary of keyword arguments, *optional*):
        Can be used to update the configuration object (after it being loaded) and initiate the model (e.g.,
        `output_attentions=True`). Behaves differently depending on whether a `config` is provided or
        automatically loaded:

            - If a configuration is provided with `config`, `**kwargs` will be directly passed to the
              underlying model's `__init__` method (we assume all relevant updates to the configuration have
              already been done)
            - If a configuration is not provided, `kwargs` will be first passed to the configuration class
              initialization function ([`~PreTrainedConfig.from_pretrained`]). Each key of `kwargs` that
              corresponds to a configuration attribute will be used to override said attribute with the
              supplied `kwargs` value. Remaining keys that do not correspond to any configuration attribute
              will be passed to the underlying model's `__init__` function.

<Tip>

Activate the special ["offline-mode"](https://huggingface.co/transformers/installation.html#offline-mode) to
use this method in a firewalled environment.

</Tip>

Examples:

```python
>>> from transformers import BertConfig, BertModel

>>> # Download model and configuration from huggingface.co and cache.
>>> model = BertModel.from_pretrained("google-bert/bert-base-uncased")
>>> # Model was saved using *save_pretrained('./test/saved_model/')* (for example purposes, not runnable).
>>> model = BertModel.from_pretrained("./test/saved_model/")
>>> # Update configuration during loading.
>>> model = BertModel.from_pretrained("google-bert/bert-base-uncased", output_attentions=True)
>>> assert model.config.output_attentions == True
```
r   Nr  r  output_loading_infoF_from_pipeline
_from_autor   r  r   
max_memoryoffload_folderr   rr  r  r  r  r  adapter_kwargsadapter_namer  rg  r  r  tp_sizer  r   trust_remote_coderX  r  r  key_mappingr  )mirror
_fast_initlow_cpu_mem_usagefrom_tf	from_flaxoffload_state_dictT)r  r  r  r  r  r  r  r  zq`state_dict` cannot be passed together with a model name or a `gguf_file`. Use one of the two loading strategies.
WORLD_SIZEr   a  You've set device_map=`auto` while triggering a distributed run with torchrun. This might lead to unexpected behavior. If your plan is to load the model on each device, you should set device_map={: PartialState().process_index} where PartialState comes from accelerate library)r  r   r   zIaccelerate is required when loading a GGUF file `pip install accelerate`.re  pytorch)	file_typer"  from_auto_classusing_pipelinezN does not define `config_class`; pass an explicit config to `from_pretrained`.)return_unused_kwargsr  r  r  r  r  zYou cannot combine Quantization and loading a model from a GGUF file, try again by making sure you did not passed a `quantization_config` or that you did not load a quantized model from the Hub.rg  zxOne or more modules is configured to be mapped to disk. Disk offload is not supported for models loaded from GGUF files.zA kernel_config was provided but use_kernels is False; setting use_kernels=True automatically. To suppress this warning, explicitly set use_kernels to True.transformers_weights)	r   r  r  r   r   r  r  r  r  r   )load_gguf_checkpointr  r   )return_tensorsmodel_to_loadr  rO  r  )register_fusion_patches)re  r   r   r  r  )r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   adjust_generation_fnr  )r  load_configr  )@rT  rs  r   r   r   r   r   r   r  r  rG   rd   r@   r3   rm  r!   r)  r   from_pretraineddeepcopyrH  r_  rc  rT   r   r   ru  r  r  r  r  modeling_gguf_pytorch_utilsr  r   r   r]  r  fusion_mappingr  r  r   r]   rQ   preprocess_modelr  r"   r   rE   r0   r   _load_pretrained_model_finalize_model_loadingevalr  re  r   r  r   ra  r2   r   postprocess_modelload_adapterto_dict)CrO  r   r  r  r   r  r  r  r  r   r   r  r   
model_argsr  r   r  r  r  from_pipeliner  r   r  r   r  r  r   rr  r  r  r  r  r  rg  r  r  r  r  r   r  rX  r  r  r  rK  rX  r   download_kwargs_with_commit_adapter_model_pathr  config_pathr)  model_kwargsr   r  r   r   r  dummy_modelr  model_init_contextre  r   weight_conversionsr  loading_infodisk_offload_indexsC                                                                      r   r	  PreTrainedModel.from_pretrained  s   j ZZd3
**Y-ZZd3
$jj)>F

#3T: **\59

7D)jj5ZZd3
ZZd3
$4d; **%6>$jj)>EJJ{B/	jj6**Y- **%5r:@bFFHzz.)<"JJ':DAJJ{D1	**Y-**Y-06

;OQU0Vjj5"JJ':DA"JJ':EBjj6

?D9jj5)goG pD

4&A p "".EKE=E%5# #, 0 "
 'V&U-&U#!'D'PT]Ti D  C

|S(I$J$JKKc '"5/Lkj0,JW  )@)B)Bhii!NM`)'N
 N
J:N
 .j9
#*Wfg
$+8J'( &"233$*$6&<YK++L# ||n$rs  $0#?#?$%)#*,$ "$ $ FL l*  -&**>;GK]]6*F!L!&.+FK5@#M2 !F**0**5J*KF'#v--3ZZ8P-QF*+;'\:,
(fj '  Y  %J--&J<M<M<O2OTZ^hTh". 
 $[ o K-K*G+7!--/+26;QSW+X!
.
** $4/ ##V-=zYe
 I f%!&k & . #DafJ < $#'==#?F   >$?#C? 11%GY[lmv&/0<<|<E"5)'--)%5 + .  1 **51
 :%l[''K,C$UG5GV]^E !(
JUJ **G$;-! .+!%#%-++%
" ,/+E+EeZYikv+w((225+|T

k=9 GE3I$J$JS\&&!-	
 " #4  !c#j.?.?.A*B&Ca&G|ZQcetu#!-E** * */w' --#)'-	 . L ,..000Y &%. 10s   	],2.]>,
];>
^re  r  r  expected_keysc           
         UR                   nUR                  nUSL=(       a7    UR                  R                  [        R
                  [        R                  1;   nUc'  [        U R                  5       R                  5       5      OUn[        R                  [        R                  :  a  [        U[        U SS5      5        SnUR                   ba  SUR                   R#                  5       ;   aC  [%        U UR&                  UUR                   UR(                  UR*                  UR,                  5      nUR                   b3  U(       d,  [/        UR                   U5      n	[1        X	UR                   5        / n
[3        5       (       at  U(       dm  Uc<  0 nU H2  nUR5                  [7        USUR8                  UR:                  S95        M4     Un[=        XU5      u  p[?        UU
[A        5       [A        5       0 S9nX4$ [A        5       nUb  UnGOUb  US   RC                  S5      (       a  Uc  0 nU H  nUR:                  (       d  [E        U5      (       a?  [G        US	5       nUR5                  [I        URK                  5       5      5        SSS5        Mc  [M        US
SS9nURO                  U5        UR                  5        H  nURQ                  U5      UU'   M     M     O<Ub.  0 nU H%  nUR5                  [7        XR:                  S95        M'     O[S        S5      e[U        U UUU RV                  US9u  pU H  nURY                  SSS5        M     X4$ ! , (       d  f       N= f)zzPerform the actual loading of some checkpoints into a `model`, by reading them from disk and dispatching them accordingly.Nr=  rg  r   )r  r   r   )r  
error_msgsunexpected_keysmismatched_keysconversion_errorsr   r  r   r!  )r"  r   )r   z5Neither a state dict nor checkpoint files were found.)re  r   r  r  r  )-r   r   rr  rs  r{   r  r  r   r   r+  r  levelrk   WARNINGrI   rH  r   r   r1   r   r   r   r   r4   caching_allocator_warmupr.   rx  r:  r   r   r7   rw   ra  r&  r  r  r'  r(  r   rS  r,  r   r%   r  __exit__)re  r   r  r  r   r   r   is_hqq_or_quarkr  expanded_device_mapr"  merged_state_dict	ckpt_filer  r  all_pointerfiler3  file_pointerr4  s                       r   r  &PreTrainedModel._load_pretrained_model  s(    #//"//&d2 
|7W7W7d7d""$$i
 8
 <I;PU--/4467Vc<<7??*='%T*JK "!!-&K<R<R<Y<Y<[2[!8// &&,,!!**" !!-o"3K4J4JM"Z$UAYAYZ
%''!$&!!1I%,,'%).)4)A)A)4)A)A	 "2 /
'H\g'h$J,)% # #"$LT //E %K%$.!!-2B12E2N2N~2^2^cmcu$&!,D"//?43H3H!$--445Echhj5QR . #,TT%#PLOOL1)..0/;/E/Ea/H)!, 1 - "-$&!!1I%,,_YUmUm-no "2 !!XYY/S,'#50,L !

4t, ! //7 .-s   )M
M	r  c           
          U R                  U5        U R                  UR                  5       UR                  UR                  UR
                  5        U R                  UR                  5        U R                  UR                  SS9  U R                  U5        [        U UR                  UR                  U[        S9  U$ ! [        U UR                  UR                  U[        S9  f = f)a  Perform all post processing operations after having loaded some checkpoints into a model, such as moving
missing keys from meta device to their expected device, reinitializing missing weights according to proper
distributions, tying the weights and logging the loading report.F)r  r  )re  r   r   r  r  ) mark_tied_weights_as_initialized&_move_missing_keys_from_meta_to_devicemissing_and_mismatchedr   r   r   _initialize_missing_keysr   r  r  #_adjust_missing_and_unexpected_keysrx   r   r   r  )re  r  r  s      r   r  'PreTrainedModel._finalize_model_loading%  s    	22<@ 88335&&''((	 **;+C+CD <+D+DX]^ 55lC!.9.W.W(3(K(K)  ".9.W.W(3(K(K)s   BC   'C'c           
         U Vs1 s H&  nSR                  UR                  S5      S S 5      iM(     nnUR                  U Vs1 s HQ  n[        U5      S:  d  M  US   R	                  5       (       d  M.  SR                  UR                  S5      S S 5      iMS     sn5      n/ nU R                  5        H  u  pxU(       a!  U R                   S3n	UR                  U	5      nO?U(       a8  [        U5      S:  a  SR                  U R                  U/5      OU R                  nXu;   d  Ms  UR                  U5        M     U$ s  snf s  snf )NrF  r<  r   r  )	r  r  unionr   isdigitrG  r*  r{  rQ  )
r   rx  
add_prefixremove_prefixr	  module_keysretrieved_modulesrK  rC  _prefixs
             r   retrieve_modules_from_names+PreTrainedModel.retrieve_modules_from_namesL  s&   @EFsxx		#s 34F "''6;bess3x!|*PSTVPWP_P_Pa*SXXciinSb)*eb
  ..0LD!334A6((1CFt9q=sxx!7!7 >?VZVlVl"!((0 1 ! ) G
 cs   -D>EE4'Ec                     [        U[        5      (       d  UR                  nSSKJs  Jn  [        X!5      (       d  [        U S35      eXl        g)a%  
Register this class with a given auto class. This should only be used for custom models as the ones in the
library are already mapped with an auto class.



Args:
    auto_class (`str` or `type`, *optional*, defaults to `"AutoModel"`):
        The auto class to register this new model with.
r   Nz is not a valid auto class.)	rm  r   r   transformers.models.automodelsr  r   r   r}  )rO  
auto_classauto_modules      r   register_for_auto_class'PreTrainedModel.register_for_auto_classc  sE     *c**#,,J66{//
|+FGHH$r   c           
         [        U5      (       a  gUc  U R                  R                  c  gU R                  R                  USS2SS/4   ;   Ga#  Sn[        U R                  SS5      nU R                  R                  b.  U R                  R                  U R                  R                  :X  da  U R                  R
                  b.  U R                  R
                  U R                  R                  :X  d  Ubg  X@R                  R                  :X  aN  USU R                  R                   SU R                  R                   SU R                  R
                   S	U S
3	-  n[        R                  U5        gg)zf
Shows a one-time warning if the input_ids appear to contain padding and no attention mask was given.
Nr<  r   zWe strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.sep_token_idz5
You may ignore this warning if your `pad_token_id` (z&) is identical to the `bos_token_id` (z), `eos_token_id` (z), or the `sep_token_id` (z ), and your input is not padded.)rv   r  pad_token_idrH  bos_token_ideos_token_idr  r  )r   r-  r  warn_stringrK  s        r   %warn_if_padding_and_no_attention_mask5PreTrainedModel.warn_if_padding_and_no_attention_masky  sK    i  &DKK,D,D,L ;;##yRG'<<F  #4;;EL))5$++:R:RVZVaVaVnVn:nKK,,8T[[=U=UY]YdYdYqYq=q ,AYAY1YLT[[MeMeLf g..2kk.F.F-GGZ[_[f[f[s[sZt u..:^;[] ,- =r   c                     U R                   (       a  gU R                  R                   (       a  gU R                  R                  (       a  gg)z:
Returns whether the model has a tensor parallelism plan.
TF)r=  r  r  rt  r   s    r   supports_tp_plan PreTrainedModel.supports_tp_plan  s1     ==??##;;))r   c                     U R                   $ )z0
Returns the model's tensor parallelism degree.
)rt  r   s    r   r  PreTrainedModel.tp_size  s     }}r   c                     U R                   (       a  gU R                  R                   (       a  gU R                  R                  (       a  ggr   )r>  r  r  rr  r   s    r   supports_pp_plan PreTrainedModel.supports_pp_plan  s1     ==??##;;))r   c                     [        U S5      (       a  U R                  $ [        U SS 5      nUb
  U[        ;  a  [        R                  SU S35        Sn[        U   $ )N_loss_functionri  z`loss_type=zZ` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.ForCausalLM)r   r[  rH  rJ   r  r  )r   ri  s     r   loss_functionPreTrainedModel.loss_function  sh    4)**&&&D+t4		 =i[ )= > &II&&r   c                     Xl         g r   )r[  r   r  s     r   r]  r^    s    #r   c                 r   [        5       (       d  [        S5      eSSKJnJnJn  S nS n U R                  U5        U R                  (       d  UR                  OUc  UR                  OUnU" X" U R                  R                  S9US9  S	U l        U R                  U5        g! U R                  U5        f = f)
zYTemporarily register hidden kernel wrappers so `kernelize` can discover and replace them.z`Kernels are not available. To use kernels, please install kernels using `pip install -U kernels`r   )DeviceMode	kernelizec                     [        U S0 5      R                  5        H5  u  pU[        U R                  5       5      ;  d  M$  U R	                  X5        M7     g N_hidden_kernels)rH  r)  r   rw  register_module)rC  rK  r  s      r   attach_hidden_kernels8PreTrainedModel.kernelize.<locals>.attach_hidden_kernels  sF    #F,=rBHHJtF$9$9$;<<**44 Kr   c                 B    [        U S0 5       H  n[        X5        M     g rf  )rH  r  )rC  rK  s     r   detach_hidden_kernels8PreTrainedModel.kernelize.<locals>.detach_hidden_kernels  s    (92>% ?r   N)ro  )r   modeT)rg   r   r  rb  rc  rd  rF  training	INFERENCETRAININGr   ro  _use_kernels)r   rn  rb  rc  rd  ri  rl  s          r   rd  PreTrainedModel.kernelize  s    #%%r  	43	5
	&	.JJ,-)-4>>T\DMM_cDd6t{{/?/?#@tL $D JJ,-DJJ,-s   A%B# #B6c                     [        U SS5      $ )Nrr  Fr  r   s    r   r  PreTrainedModel.use_kernels  s    t^U33r   r  c                     [        U5      (       a  [        U SS5      (       a  g U(       a  U R                  5         g [        U SS5      (       a  [        R	                  S5        SU l        g )Nrr  FzmDisabling kernels at runtime is a no-op as there is no 'unkernelize' routine; keeping current kernels active.)r   rH  rd  r  r  rr  r`  s     r   r  ru    sX     ;;74??NNt^U33## D !&Dr   compile_configc                    SU R                   R                  ;   a  U R                  $ U=(       d
    [        5       n[	        U R
                  SS5      =(       d
    [        5       n[        U S5      (       a  [	        U SU5      U:w  a:  Xl        [        R                  " U R                  40 UR                  5       D6U l        U R                  $ )at  Return a `torch.compile`'d version of `self.__call__`. This is useful to dynamically choose between
non-compiled/compiled `forward` during inference, especially to switch between prefill (where we don't
want to use compiled version to avoid recomputing the graph with new shapes) and iterative decoding
(where we want the speed-ups of compiled version with static shapes).llama4rw  N_compiled_call_last_compile_config)r  
model_type__call__r)   rH  rg  r   r{  r   r  r  rz  )r   rw  default_configs      r   get_compiled_call!PreTrainedModel.get_compiled_call  s     t{{---== ':=? !7!79I4PcTaTc.//t3^DV(6%"'--"ZAWAWAY"ZD"""r   c                     U R                   $ r   )rA  rO  s    r   is_backend_compatible%PreTrainedModel.is_backend_compatible  s    ...r   r   r   r   r   c                    USLn[        5       (       a  U(       d  g[        5       (       a  [        5       (       d|  U(       du  U R                  5        H&  u  pg[        R
                  " USS9n[        XU5        M(     U R                  5        H&  u  pi[        R
                  " U	SS9n[        XU5        M(     gXR                  R                  5       -
   Ha  nU R                  U5      n[        X&SS9n
[        R                  " XzS9nUb   [        XXvSSUR                  5       U5        MU  [        XU5        Mc     U R                  5        H/  u  pi[        X&SS9n[        R                  " XS9n[        XU5        M1     g)a  Move the missing keys (keys that are part of the model parameters, but were NOT found in the loaded state dicts)
back from meta device to their device according to the `device_map` if any, else cpu. Takes care of sharding those
missing parameters if `device_mesh` is provided, i.e. we are using TP.
All non-persistent buffers are also moved back to the correct device (they are not part of the state_dict, but are
not missing either).
Nr   r  T)valid_torch_deviceF)r.   r/   r   r   r   
zeros_liker  r  rq  r+  r  r5   
empty_likerH   get_local_ranknamed_non_persistent_buffers)r   r  r   r   r   r   r	  r  r  r  param_devicebuffer_devices               r   r4  6PreTrainedModel._move_missing_keys_from_meta_to_device  sT    $4/%'' %9%;%;L"335
((u=*4e< 6  $113((>*4e<  4 
  "<"<"A"A"CCC005E%j$OL$$U@E&+T5+:T:T:VXc
 +4e< D  <<>KC&z4PM$$VBE&t%8 ?r   c           
      N   [        5       (       aF  [        5       (       d7  U R                  5        H  n U R                  U5      nSUl        M     SU l        [        5       (       a  U(       d  SSKn[        U R                  SS9R                  5        Vs1 s H  n[        USS5      (       a  M  UiM     sn5      nUR                  R                  USS9   U R                  5         SSS5        gU R                  5         g! [
         a     M  f = fs  snf ! , (       d  f       g= f)a  
Initialize the missing keys (keys that are part of the model parameters, but were NOT found in the loaded state dicts), according to
`_initialize_weights`. Indeed, since the corresponding weights are missing from the state dict, they will not be replaced and need to
be initialized correctly (i.e. weight initialization distribution).

Also marks non-missing params/buffers with `_is_hf_initialized` and propagates this flag to modules,
so that `_initialize_weights` can skip fully-initialized modules entirely.
Tr   N)	keep_varsrx  Fr  )r/   r   r   r  rx  AttributeErrorr.   r  r   r   rH  r  r  r  )r   r   r	  param_or_bufferr  r5  not_initialized_parameterss          r   r6  (PreTrainedModel._initialize_missing_keysK  s    %9%;%; (&*&B&B3&GO9=O6 ) '+D# &'' *. OOdO;BBDtDqGTUWkmrLsDt*& 223M]^2_'') `_ ##% &  u__s)   D D.DD 
DD
D$c                    [        S U R                  5        5       5      nU(       a  S/O/ n[        S U R                  5        5       5      nU(       a  UR                  S5        U R                  =(       d    / nU R                  =(       d    / U-   nSu  px[        U5      S:  a,  [        R                  " SR                  S U 5       5      5      n[        U5      S:  a,  [        R                  " SR                  S	 U 5       5      5      nUb5  UR                   V	s1 s H  oR                  U	5      b  M  U	iM     sn	Ul	        Ub6  UR                   V	s1 s H  oR                  U	5      b  M  U	iM     sn	Ul        g
g
s  sn	f s  sn	f )zAdjust the `missing_keys` and `unexpected_keys` based on current model's exception rules, to avoid
raising unneeded warnings/errors. This is performed in-place.
c              3   H   #    U  H  u  pUR                  S 5      v   M     g7f)zrotary_emb.inv_freqNr&  r   r  rX  s      r   r   FPreTrainedModel._adjust_missing_and_unexpected_keys.<locals>.<genexpr>v  s!     "p[oif6??3H#I#I[o    "zrotary_emb\.inv_freqc              3   H   #    U  H  u  pUR                  S 5      v   M     g7f)position_idsNr  r  s      r   r   r  y  s      &mXl96v~'F'FXlr  z(^|\.)position_ids$)NNr   rZ  c              3   .   #    U  H  nS U S3v   M     g7frY  r  Nr   r   patterns     r   r   r    s     6gVf7!G9AVfr  c              3   .   #    U  H  nS U S3v   M     g7fr  r   r  s     r   r   r    s     9mYlgQwiq/Ylr  N)rq  r  rQ  r6  r7  r   ri  r  r  r  rj  r#  )
r   r  has_inv_freq_buffersadditional_unexpected_patternshas_position_ids_buffersmissing_patternsunexpected_patternsignore_missing_regexignore_unexpected_regexr	  s
             r   r7  3PreTrainedModel._adjust_missing_and_unexpected_keyso  sc     #"p[_[m[m[o"ppFZ*A)B`b&#&&mX\XjXjXl&m#m #*112HI??E2#FFL"Pnn8B5 1$#%::chh6gVf6g.g#h "#a'&(jj9mYl9m1m&n#  ++88)8<W<WX[<\8)L%
 #.+;;,;?]?]^a?b;,L( /),s   E>6E>F.Fc                 b   [        U S0 5      R                  5        H!  nU R                  U5      n[        USS5        M#     U R	                  5       (       aU  UR
                   Vs1 s H7  nX@R                  ;   d#  [        U R                  U5      SS5      (       a  M5  UiM9     snUl        ggs  snf )a-  Adds the `_is_hf_initialized` flag on parameters that will be tied, in order to avoid initializing them
later as they will be tied (overwritten) anyway.
This is very important as most embeddings are tied, and they are huge params (vocabularies are often 256k), so
running inits on them is very costly.rq  rx  TFN)rH  r+  rp  r  r  r  rq  r  )r   r  
tied_paramr  r	  s        r   r3  0PreTrainedModel.mark_tied_weights_as_initialized  s    
 "$(?DIIKJ&&z2EE/6 L   
 (44)4C444t;;C@BVX]^ 4)L% !)s   $4B,B,r  c                     U R                  U5      $ ! [         a     Of = f U R                  U5      $ ! [         a     Of = f[        X5      u  p#US:X  an  [	        UR
                  S[        R                  R                  R                  5      [        R                  R                  R                  La  UR                  5       $ [        SU S35      e)aI  
Return the parameter or buffer given by `target` if it exists, otherwise throw an error. This combines
`get_parameter()` and `get_buffer()` in a single handy function. If the target is an `_extra_state` attribute,
it will return the extra state provided by the module. Note that it only work if `target` is a leaf of the model.
_extra_stateget_extra_state`z2` is neither a parameter, buffer, nor extra state.)
rp  r  
get_bufferrU   rH  r  r   r   r'  r  )r   r  rC  r  s       r   r  'PreTrainedModel.get_parameter_or_buffer  s    	%%f-- 			??6** 		1$?.(((*;UXX__=\=\]88??223 ))++q(Z[\\s    
  5 
AAr  r  c              #      #    U R                  XS9 HJ  u  p4SU;   a  UR                  SS5      OSU4u  pVU R                  U5      nXeR                  ;   d  ME  X44v   ML     g7f)zSimilar to `named_buffers`, but only yield non-persistent ones. It is handy as it's not perfectly straightforward
to know if they are persistent or not)r  r  rF  r   r  N)r  r  r  _non_persistent_buffers_set)r   r  r  rK  r   r  buf_names          r   r  ,PreTrainedModel.named_non_persistent_buffers  sg     
 !..w.bLD 7:Tkt{{32DzF''/F===l" cs   AA!
A!rn  c                 h   > [         TU ]  U5      nU R                  (       a  U R                  5         U$ r   )rJ  trainr  rd  )r   rn  outr  s      r   r  PreTrainedModel.train  s)    gmD!NN
r   c                 $    U R                  S5      $ )NF)r  r   s    r   r  PreTrainedModel.eval  s    zz%  r   c                     U R                   S L$ r   )r}  r  s    r   r  PreTrainedModel.is_remote_code  s    d**r   )rz  rp  rM  rx  r3  r4  r{  r[  r1  r>  r?  r>  r=  rr  rq  r  rg  r  ri  re  r,  r]  r  r  r   )r   NFr  r7  )NNT)NFT)TNF50GBNNTT)T)	AutoModel)TT)r   r   r   r   r   r)  ro  r!   r   r}  r*  r   r+  r   r,  r   r.  r0  r1  ra  r2  r3  r4  rE  r   r5  r6  r7  r8  r9  r:  r;  r<  r=  rt  r>  r  r?  r@  rA  rB  r   r   compilerallow_in_graphrz   rD  r   rG  rK  r[  r  r  r  setterr  rz  r  classmethodr  r   r'  r  re  r   r   r  r  r  r  r  r^  rb  r  r  r&  r  r0  r3  rE  rJ  r^  rd  rk  rn  no_gradr  r  r  guard_torch_init_functionsr  rv  r  r  r  r  r  r  r  r  r  r  r  r  r5  r8  ry  r  r   rE  rQ  rW  r   PathLiker~  r   r_   r[  r  r  r*  r  r  r   r  r  r^   r  r   r   r	  r  r   rw   r  r  rA  rH  rP  rS  r  rX  r]  rd  r  r)   r  r  rS   r4  r6  r7  r3  r  r   r  r  r  r  r   __classcell__r  s   @r   r   r     s   ( 37L$'(4/6KsL$#'JS	D ' 'OS& )/cDIo. 6:s3x$s)+d29:>tCy4!7>
 :>3s8d3i/$6=@D #c(T#Y"6"=D *.S#X-57"DcN78<#T#Y%5<;?&S	D(8?04T#Y-4 !ND !&$& %%:>%tCy4'7>  $Hd38n#H ,0Hd3c3h'(/ -2#T1#(D( ).-'++
^^""'.Dn)<$= '. # '.R 9d3#45 9 9/0&M/ &MP/>b c3h   c5c?23   ^^!DcNT1 ! !F ^^DeCHo!56=  
:;,49s? ,t ,@ E EN ;BII ; ; #T # #V FH-1@@ %-@ !)	@
 !x}!5s!:;@ #(hm(<c(A"B@ !$d
@DI3 It I`d IV  <	$ 	T d 8 gll.#&:l.>Bl._cl.	l.\1sUYz 1^a 1$$3: $$^b $$or $$L"C$J "SV "0 T  $ 5 5 5d83: d8Z^ d8L*Wt *WX*X)C$J @%S4Z %4.%" ]]_7? 7?r)$ )2 ]]_	$$&H ' H8p%D p%T p%dQ8C4 Q8SW Q8f
M &*)-"	9d
9  $J9 	9
 
9v%+T &*)-"]] d
]  $J	]
 ] 
]D &* "AYYA d
A 	A
 A 
AF@ !@ @,d
d
c 

bll8K)K 

2(.T :>gq $ \d ,/( n4 n n !%"&!$*"#'!%%)\bkk)\ \ 4K	\
 \ c	\ t\ TzD \ \ #\| >%%&4 '4$ 588?? - !-2 588??:+ :+x'( KK/3IMbfimbm >U[[ T  %%,:M %%N 
 ?C.2(-$!&#''+!AE$(M-.M'*R[['84'?M !3&4t;	M
 $t+M "&M M M TzD M M M M CS#X!667$>M TkM  
%!M M^  +/c0 c04Kc0 s)d*c0 )	c0
 Cy4'c0 
 $&	'c0 c0J $/$?P$	$ $L!. % %*#-J     
 
 ' ' $ $.6 4T 4 4 & &$ & &#0D # #$ / //93i/9 4K/9 -	/9
 "D(/9 
/9b"&T "&d "&H @Q  VZ  D8]c ]2 >B##6:#	%U\\)*	+#$  ! +t + +r   r  z
model file)objectobject_classobject_files	recursivec                     g r   r   re  r  s     r   rz  rz    s    VYr   c                     g r   r   r  s     r   rz  rz    s    JMr   c                     [        5       (       a  0 nU(       a  XS'   [        U 40 UD6$ [        U S5      (       a  [        U R                  5      $ U $ )a  
Recursively unwraps a model from potential containers (as used in distributed training).

Args:
    model (`torch.nn.Module`): The model to unwrap.
    recursive (`bool`, *optional*, defaults to `False`):
        Whether to recursively extract all cases of `module.module` from `model` as well as unwrap child sublayers
        recursively, not just the top-level distributed containers.
r  rC  )rd   r}   r   rz  rC  )re  r  r  s      r   rz  rz    sP       "+;*5;F;; 5(##--Lr   r   c                 V    U S:X  a  g[         R                  " U 5      R                  S;  $ )zCheck if the device is an accelerator. We need to function, as device_map can be "disk" as well, which is not
a proper `torch.device`.
rg  F)r  r   )r   r   ro  r  s    r   is_accelerator_devicer    s)     ||F#((??r   accelerator_device_mapc                    [        S 5      nU R                  R                  5       n[        5       (       a  U R                  O/ nUR                  5        H  u  pgXd;   a  M  U R                  U5      nUb  UR                  XU5      n	OUR                  5       n	UR                  5       U	-  n
[        U5      S:  a!  [        XeSS9SLnX(       a
  [        5       OS-  n
X7==   U
-  ss'   M     U$ )z
This utility function calculates the total bytes count needed to load the model on each device.
This is useful for caching_allocator_warmup as we want to know how much cache we need to pre-allocate.
c                      g)Nr   r   r   r   r   r  &get_total_byte_count.<locals>.<lambda>  s    1r   Nr   T)	is_weightr   )r   rq  r+  r   r=  r)  r  param_element_sizer@  r  r   rD   r   )re  r  r   total_byte_counttied_param_namesr  r  r   r  
dtype_sizeparam_byte_countis_part_of_plans               r   get_total_byte_countr    s     #9-22779 A C CennG4::<
)--j9#%88ERJ++-J ;;=:5w<!4ZTXYaeeO!B!D]^^ $44 % =& r   r+  c                    UR                  5        VVs0 s H.  u  p4[        U5      (       d  M  U[        R                  " U5      _M0     nnnU(       d  g[	        XU5      nUR                  5        H  u  pGUR
                  S;   a  [        [        UR
                  5      nUR                  b  UR                  OUR                  5       n	UR                  U	5      u  pUR                  U	5      UR                  U	5      -
  nX|-
  U:  a  X|-
  nOX|-
  S:  a  US-   U
:  a  SnOUS-   nOSn[        X{S-
  5      n[        R                  " [        US-  5      [        R                  USS	9nM     gs  snnf )
a  This function warm-ups the caching allocator based on the size of the model tensors that will reside on each
device. It allows to have one large call to Malloc, instead of recursively calling it later when loading
the model, which is actually the loading speed bottleneck.
Calling this function allows to cut the model loading time by a very large margin.

A few facts related to loading speed (taking into account the use of this function):
- When loading a model the first time, it is usually slower than the subsequent times, because the OS is very likely
to cache the different state dicts (if enough resources/RAM are available)
- Trying to force the OS to cache the files in advance (by e.g. accessing a small portion of them) is really hard,
and not a good idea in general as this is low level OS optimizations that depend on resource usage anyway
- As of 18/03/2025, loading a Llama 70B model with TP takes ~1 min without file cache, and ~13s with full file cache.
The baseline, i.e. only loading the tensor shards on device and adjusting dtype (i.e. copying them) is ~5s with full cache.
These numbers are reported for TP on 4 H100 GPUs.
- It is useless to pre-allocate more than the model size in this function (i.e. using an `allocation_factor` > 1) as
cudaMalloc is not a bottleneck at all anymore
- Loading speed bottleneck is now almost only tensor copy (i.e. changing the dtype) and moving the tensors to the devices.
However, we cannot really improve on those aspects obviously, as the data needs to be moved/copied in the end.
N)r  xpug      Ar   r   g333333Ar   F)r   r   r  )r)  r  r   r   r  ro  rH  r  current_devicemem_get_infomemory_reservedmemory_allocatedr  r/  r   r  )re  r+  r   r  r   r  r  
byte_countaccelerator_moduler  free_device_memorytotal_device_memoryunused_memoryrX  s                 r   r(  r(  4  sg   * :M9R9R9T9TXmntXu#u||F##9T   "+E<X /446;;/)!(!<$*LL$<FLLBTBcBcBeE6H6U6UV[6\3.>>uEHZHkHklqHrrM )M9'7
+m; !1$'99!"J "/!2J 
 Z})LMJKKJ!O,EMM&`efG 7s
   E+E+c                   V   ^  \ rS rSrSr\\\\\\\\\	\
S.
rS\S\S\4U 4S jjrSrU =r$ )	AttentionInterfaceiw  aO  
Dict-like object keeping track of allowed attention functions. You can easily add a new attention function
with a call to `register()`. If a model needs to locally overwrite an existing attention function, say `sdpa`,
it needs to declare a new instance of this class inside the `modeling_<model>.py`, and declare it on that instance.
)
flash_attention_4flash_attention_3r  r  r  zpaged|flash_attention_4zpaged|flash_attention_3zpaged|flash_attention_2z
paged|sdpazpaged|eagerr  r  r   c                    > Uc  [         R                  S5        OUS:w  a  X;  a  [        SU S35      e[        TU ]  X5      $ )zcReturn the requested `attn_implementation`. Also strictly check its validity, and raise if invalid.a	  You tried to access the `AttentionInterface` with a `config._attn_implementation` set to `None`. This is expected if you use an Attention Module as a standalone Module. If this is not the case, something went wrong with the dispatch of `config._attn_implementation`r  r  zP` is not a valid attention implementation registered in the `AttentionInterface`)r  r  KeyErrorrJ  r   )r   r  r  r  s      r   get_interface AttentionInterface.get_interface  s[    &K
 !G+0C0O'((xy  w{.88r   r   )r   r   r   r   r   r:   r<   rA   r;   rB   r8   _global_mappingr   r   r  r   r  r  s   @r   r  r  w  sN     5440&#:#:#:24O9 9x 9H 9 9r   r  r  c                   p    \ rS rSrSr\S\R                  4S j5       r\S\R                  4S j5       r	Sr
g)	PreTrainedAudioTokenizerBasei  a  
Class that additionally defines the behavior of any `audio_tokenizer` to be added.
Characteristic for any of them:
    1. Encode raw audio into discrete audio codebooks (with x channels)
    2. Decode from discrete audio codebooks back to raw audio
It is possible that they can decode in different ways given a different representation
but they are forced to support 2. nonetheless, e.g. see `DAC`.
input_valuesc                     g)zq
Encode raw audio retrieved from a respective `FeatureExtractor` into discrete audio codebooks (with x channels)
Nr   )r   r  r  r  s       r   encode#PreTrainedAudioTokenizerBase.encode      r   audio_codesc                     g)z6Decode from discrete audio codebooks back to raw audioNr   )r   r  r  r  s       r   decode#PreTrainedAudioTokenizerBase.decode  r  r   r   N)r   r   r   r   r   r   r   r   r  r  r   r   r   r   r  r    sH     5<<  
 E%,, E Er   r  r   )r   TN)NNNr  (  r`  rs  rC  rL  r  r   ri  r  r  abcr   r   collections.abcr   r   
contextlibr   dataclassesr   r	   r
   r   	itertoolsr   	threadingr   typingr   r   r   r   r   zipfiler   r   huggingface_hubr   r   r   	packagingr   safetensorsr   safetensors.torchr   r'  r   r  r   r   torch.distributionsr   torch.utils.checkpointr   r  r    r  configuration_utilsr!   conversion_mappingr"   core_model_loadingr#   r$   r%   r&   r   r'   dynamic_module_utilsr(   
generationr)   r*   integrationsr+   r,   r-   r.   r/   integrations.accelerater0   r1   r2   r3   r4   r5   r6   r  r7   integrations.eager_pagedr8   integrations.finegrained_fp8r9   integrations.flash_attentionr:   integrations.flash_pagedr;   integrations.flex_attentionr<   r  r=   r>   integrations.moer?   integrations.peftr@   integrations.sdpa_attentionrA   integrations.sdpa_pagedrB   integrations.tensor_parallelrC   rD   rE   rF   rG   rH   rI   loss.loss_utilsrJ   modeling_flash_attention_utilsrK   rL   rM   rN   modeling_rope_utilsrO   monkey_patchingrP   rQ   pytorch_utilsrR   
quantizersrS   quantizers.autorT   quantizers.quantizers_utilsrU   safetensors_conversionrV   utilsrW   rX   rY   rZ   r[   r\   r]   r^   r_   r`   ra   rb   rc   rd   re   rf   rg   rh   ri   rj   rk   utils.genericrl   rm   rn   	utils.hubro   rp   rq   utils.import_utilsrr   rs   rt   ru   rv   utils.loading_reportrw   rx   utils.output_capturingry   rz   utils.quantization_configr{   accelerate.hooksr|   accelerate.utilsr}   _typingr~   is_availabler   !smdistributed.modelparallel.torchmodelparallelr  smdistributed.modelparallelr   SMP_VERSIONr  r  
get_loggerr   r  r   r   upperr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   uint8int8int16uint16r  r  int32uint32r   float64int64uint64float8_e4m3fnfloat8_e5m2r.  r  r  r   r   r:  rB  r'  r   rN  ra  r  r^  rd  r  r  r  ro  r  r  r  r  r   r[  r   rd  rz  r  r  r(  r  r  r   r  r   r   r   <module>rC     s        	 	 
   # . % ( $   H H   \ \  ! 6 9  + - $ 1 <  + 4 7 v v   F C C A = ? F 3 2 ? A   *  5 B , # - = 3     . j i \ \  K H 9 3<'  %00==? 33F 'k :gmmF>S S % 
		H	%zz~~nc288:JJNN#6<BBD %&CK\]   $- - -64 .3 .`   # # 0U[[ 0C$J 0 0.1  JJ
++
**;;<<==NN;;<<====;;<<""   &- $ 6 (- $	/k2;;&/k$/k /k +	/k
 
#u||
/kdU\\ c ")) S	 ,DSN ,S%,,=N8O ,TYZ^_bcf_gZhjnorjsZsTt ,>%#c(^%)-c5<<.?)@%
4C>4C>)*%*MS%,,&'M0AM	#u||
M`(&7 (S (RWR^R^ (s S4Z 3  26-1"I.#&#4t#;I.4ZI. TzI. D[	I.
 tI. I. %($JI. $d*I. tI. 49tTD[()I.f (,Pt#d*P3i$&P P Tk	P
 tP P $P U[[()Pf[ [|Z* Z*z^9+bii!57GYi ^9+Bs ((C(CD &&2*9*E*E*M*M*T*T[| +U +O'
 
 Y YD Y_ Y 
 Y 
 M		 Md Mryy M 
 M		 d ryy 2@#)ell": @t @ ^b48HSVZHZD@gO @g$ @g^ilp^p @gF"9) "9L /A.B + BE? Er   