
    Z j[                       % S r SSKJr  SSKrSSKrSSKJr  SSKJ	r	  SSK
JrJr  SSKrSSKJr  SSKJr  S	S
KJr  S	SKJr  S	SKJr  S	SKJrJr  S	SKJrJrJrJrJrJ r J!r!J"r"J#r#  \(       a  S	SK$J%r%  S	SKJ&r&  \RN                  " \(5      r)\	 " S S5      5       r*S(S jr+\\*/\,\\-\.\4   4   4   r/S)S jr0 S*               S+S jjr1S,S jr2S,S jr3S,S jr4S,S jr5S,S jr6S,S jr7S,S jr8S,S jr9S,S jr:S,S jr;S,S  jr<S,S! jr=S,S" jr>S,S# jr?S,S$ jr@S,S% jrA\R                  \R                  \R                  \R                  \R                  \R                  \R                  \R                  \R                  \R                  \R                  \R                  \R                  \R                  \R                  /rQ\R                  \R                  \R                  \R                  \R                  \R                  /rX\R                  \R                  /r[\R                  \R                  /r^\R                  \R                  \R                  /rb\R                  \2\R                  \3\R                  \3\R                  \4\R                  \5\R                  \7\R                  \8\R                  \9\R                  \:\R                  \>\R                  \A\R                  \=\R                  \=0\-R                  \Q\65      E\-R                  \X\;5      E\-R                  \[\<5      E\-R                  \^\?5      E\-R                  \b\@5      ErqS&\rS''   g)-z,
Optimizer utilities for the Trainer class.
    )annotationsN)Callable)	dataclass)TYPE_CHECKINGAny)version)nn   )	Adafactor)LayerWiseDummyOptimizer)check_target_module_exists)OptimizerNamesParallelMode)	is_apollo_torch_availableis_bitsandbytes_availableis_galore_torch_availableis_grokadamw_availableis_lomo_availableis_schedulefree_availableis_torch_optimi_availableis_torchao_available	strtobool)PreTrainedModel)TrainingArgumentsc                  L    \ rS rSr% SrS\S'   S\S'   S\S'   S\S	'   S
\S'   Srg)OptimizerContext6   z0Context object passed to all optimizer handlers.r   argszPreTrainedModel | Nonemodeldict[str, Any]optimizer_kwargsadam_kwargsdict[str, str]
optim_args N)__name__
__module____qualname____firstlineno____doc____annotations____static_attributes__r%       o/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/trainer_optimizer.pyr   r   6   s#    :
!!$$r-   r   c                    U (       d  0 $ 0 nU R                  SS5      R                  S5       H  nUR                  S5      u  p4XAU'   M     U$ )z8Parse optimizer arguments from a comma-separated string.  ,=)replacesplit)optim_args_strr$   mappingkeyvalues        r.   _parse_optim_argsr:   A   sP    	J!))#r288=]]3'
3 > r-   c                    [        U [        5      (       a*  [        U [        R                  R
                  5      (       a  gg)a  
Check if the returned value from a handler is a factory rather than an Optimizer class.

Factory callables are used for complex optimizers like Muon or Dion that need to:
- Split parameters between multiple internal optimizers
- Handle complex sharding logic
- Access the full model structure for parameter grouping

Args:
    optimizer_cls_or_factory: The first element returned by an optimizer handler.

Returns:
    `bool`: True if it's not an Optimizer class (i.e., likely a factory), False if it's an Optimizer class.
FT)
isinstancetype
issubclasstorchoptim	Optimizer)optimizer_cls_or_factorys    r.   is_optimizer_factoryrC   P   s2      *D11jAY[`[f[f[p[p6q6qr-   c                   ^ UR                  5       R                  S5      nU(       a4  U R                  [        R                  :X  a  U(       a  [        SU S35      eX2   nU R                  c  [        SU S35      e[        U R                  [        [        45      (       d  [        SU R                   35      eUc  [        SU S35      e[        U R                  [        5      =(       a    U R                  R                  S	S
5      S:H  n	/ n
UR                  5        H  u  p[        U R                  USS9u  p[        U[        R                   5      (       d+  U(       a"  U(       d  ["        R%                  U SU S35        Mf  U(       d	  U	(       d  Mv  U
R'                  US-   5        M     [)        U
5      S:X  a  [        SU SU R                   S35      eUR+                  5        VVs/ s H  u  nnX;   d  M  UPM     nnnUR+                  5        VVs/ s H  u  nnX;  d  M  UPM     nnnSU0SU0UE/nU(       a  U R,                  S:w  a  [        SU S35      e0 mU H  nU" SU/0/40 UD6TU'   M     U H  nU" SU/0UE/40 UD6TU'   M     U4S jnUR/                  5        H'  nUR0                  (       d  M  UR3                  U5        M)     [4        nUR7                  ST05        UR7                  SU05        X4$ s  snnf s  snnf )z
Helper function to set up low-rank optimizers like GaLore and Apollo.

These optimizers apply low-rank projections to specific target modules (typically linear layers).
	layerwisezLayer-wise z" does not support DDP at this timez1You need to define `optim_target_modules` to use z optimizerszX`optim_target_modules` must be a list of strings, a regex string, or 'all-linear'. Got: z'You need to pass a model to initialize z optimizer._-z
all-linearT)return_is_regexz matched but ignored. z only supports linear layers.z.weightr   zNo target modules found for z (z).paramsr
   z
Layerwise z( does not support gradient accumulation!c                n   > U R                   b'  TU    R                  5         TU    R                  5         g g )N)gradstep	zero_grad)paramoptimizer_dicts    r.   optimizer_hook1_setup_low_rank_optimizer.<locals>.optimizer_hook   s4    zz%u%**,u%//1 &r-   rO   )lowerendswithparallel_moder   DISTRIBUTEDNotImplementedErroroptim_target_modules
ValueErrorr<   liststr	TypeErrorr4   named_modulesr   r	   Linearloggerwarningappendlennamed_parametersgradient_accumulation_steps
parametersrequires_grad"register_post_accumulate_grad_hookr   update)r   r   optimizer_nameoptimizer_mappingoptim_kwargsr!   is_layerwise_supportedis_layerwiseoptimizer_cls
all_lineartarget_params_namesmodule_namemoduletarget_module_existsis_regexnptarget_paramsnon_target_paramsparam_groupsrN   rP   rO   s                         @r.   _setup_low_rank_optimizerry   e   s/    "'')22;?L**l.F.FFKa!K/??a"bcc%5M  (L^L\\ghiid//$==--.0
 	

 }B>BRR]^__ 	4,,c2rt7P7P7X7XY\^a7bfr7r  $224)C%%{D*
& &")),,#H+.D^DTTqrs#J"";#:;  5 1$77Gr$JcJcIddfghh#(#9#9#;X#;41aq?WQ#;MX','='='?`'?tq!1C_'?` 
$%	=1L1L
 ++q0z.)99abcc&E$1Hug3F2G$\K[$\N5! '"E$1Hug3V3V2W$l[k$lN5! #	2
 %%'E"""88H ( 0!1> BCX|45**? Y`s   )L9LL
%L
c                b    U R                   R                  SSS.5        [        U R                   4$ )zGet Adafactor optimizer.Fscale_parameterrelative_step)r!   rg   r   )ctxs    r.   _get_adafactorr      s,    EE RSc****r-   c                    SSK Jn  U R                  R                  U R                  5        U R
                  R                  [        R                  :X  a  U R                  R                  SS05        XR                  4$ )z/Get PyTorch AdamW optimizer (regular or fused).r   AdamWfusedT)	torch.optimr   r!   rg   r"   r   r@   r   ADAMW_TORCH_FUSEDr~   r   s     r.   _get_adamw_torchr      sZ    !0
xx~~999##WdO4&&&&r-   c                     SSK Jn  U R                  R                  U R                  5        XR                  4$ ! [
         a    [        S5      ef = f)z'Get Torch XLA syncfree AdamW optimizer.r   r   z7Trainer failed to import syncfree AdamW from torch_xla.)torch_xla.amp.syncfreer   r!   rg   r"   ImportErrorrX   r   s     r.   _get_adamw_torch_xlar      sO    T0##COO4**** TRSST	   7: Ac                     SSK Jn  U R                  R                  U R                  5        XR                  4$ ! [
         a    [        S5      ef = f)zGet NPU Fused AdamW optimizer.r   )NpuFusedAdamWz3Trainer failed to import FusedAdamW from torch_npu.)torch_npu.optimr   r!   rg   r"   r   rX   )r~   r   s     r.   _get_adamw_torch_npu_fusedr      sO    P1##COO42222 PNOOPr   c                   [        5       (       d  [        S5      eSSKJnJnJn  U R                  R                  nSU;   nSU;   a  SOSnSnU R                  nS	U;   a  UnGOS
U;   a2  UnSU R                  R                  U R                  R                  40nGOoSU;   a  UnU R                  nGOYSU;   GaR  SSKJn	  U	n[        U R                  R                  SU R                  R                  5      5      [        U R                  R                  SU R                  R                  5      5      [        U R                  R                  SS5      5      4[        U R                  R                  SS5      5      [        U R                  R                  SU R                  R                  5      5      S.nSU R                  ;   a  [!        U R                  S   5      US'   SU R                  ;   a  [!        U R                  S   5      US'   SU0n
SU;  a  XZS'   U R"                  R%                  U5        U R"                  R%                  U
5        XpR"                  4$ )z;Get bitsandbytes optimizer (AdamW, Lion, RMSprop variants).ziYou need to install `bitsandbytes` in order to use bitsandbytes optimizers: `pip install -U bitsandbytes`r   )r   LionRMSproppaged8bit       Nadamlionbetasrmspropademamix)AdEMAMixbeta1beta2beta3gH.?alphag      @eps)r   r   r   t_alphat_beta3
optim_bitsis_paged)r   r   bitsandbytes.optimr   r   r   r   r@   r"   
adam_beta1
adam_beta2r$   r   floatgetadam_epsilonintr!   rg   )r~   r   r   r   
optim_namer   r   rm   additional_optim_kwargsr   
bnb_kwargss              r.   _get_bitsandbytes_optimizerr      s   $&&w
 	
 87J*$H
*JM!oo	:	#*SXX-@-@#((BUBU,V"W	j	 "%..	z	!/  cnn((#((2E2EFGcnn((#((2E2EFGcnn((&9:
 3>>--gs;<++E3883H3HIJ#
 &14S^^I5N1O#I.&14S^^I5N1O#I.
+J
"!): 78
+....r-   c                    SSK Jn  U R                  R                  U R                  5        U R                  R                  [        U R                  R                  SS5      5      [        [        U R                  R                  SS5      5      [        [        U R                  R                  SS5      5      [        [        U R                  R                  SS	5      5      S
.5        XR                  4$ ! [         a    [        S5      ef = f)z!Get AnyPrecision AdamW optimizer.r   )AnyPrecisionAdamWuse_kahan_summationFalsemomentum_dtypefloat32variance_dtypecompensation_buffer_dtypebfloat16)r   r   r   r   z4Please install https://github.com/pytorch/torchdistx)torchdistx.optimizersr   r!   rg   r"   r   r$   r   getattrr?   r   rX   )r~   r   s     r.   _get_adamw_anyprecisionr     s    Q;##COO4##'01C1CDY[b1c'd")%1C1CDTV_1`"a")%1C1CDTV_1`"a-43>>--.I:V.			
 !"6"666 QOPPQs   C2C5 5Dc                ^   U R                   R                  5       nU R                  (       ag  S H.  nX R                  ;   d  M  [        U R                  U   5      X'   M0     SU R                  ;   a#  U R                  S   R	                  5       S;   US'   [
        R                  R                  U4$ )zGet SGD optimizer.)momentum	dampeningweight_decaynesterovtrue1yes)r!   copyr$   r   rR   r?   r@   SGDr~   kwargsr8   s      r.   _get_sgdr   0  s    !!&&(F
~~<Cnn$#CNN3$78 = '!$
!;!A!A!CG[![F:;;??F""r-   c                    U R                   R                  5       nU R                  (       a4  S H.  nX R                  ;   d  M  [        U R                  U   5      X'   M0     [        R
                  R                  U4$ )zGet Adagrad optimizer.)lr_decayr   r   )r!   r   r$   r   r?   r@   Adagradr   s      r.   _get_adagradr   <  s]    !!&&(F
~~6Cnn$#CNN3$78 7 ;;&&r-   c                ^   U R                   R                  5       nU R                  (       ag  S H.  nX R                  ;   d  M  [        U R                  U   5      X'   M0     SU R                  ;   a#  U R                  S   R	                  5       S;   US'   [
        R                  R                  U4$ )zGet RMSprop optimizer.)r   r   r   r   centeredr   )r!   r   r$   r   rR   r?   r@   r   r   s      r.   _get_rmspropr   F  s    !!&&(F
~~?Cnn$#CNN3$78 @ '!$
!;!A!A!CG[![F:;;&&r-   c                &   [        5       (       d  [        S5      eSSKJnJnJn  [        R                  U[        R                  U[        R                  U[        R                  U[        R                  U[        R                  U0n[        U R                  R                  SS5      5      [        U R                  R                  SS5      5      [!        U R                  R                  SS	5      5      U R                  R                  S
S5      S.n[#        U R$                  U R&                  U R$                  R(                  XEU R*                  5      u  pgU R$                  R(                  [        R                  :X  a  UR-                  SSS.5        Xg4$ )zGet GaLore optimizer.zYou need to install `galore_torch` in order to use GaLore optimizers. Install it with `pip install git+https://github.com/jiaweizzhao/GaLore`r   )GaLoreAdafactorGaLoreAdamWGaLoreAdamW8bitrank   update_proj_gap   scaleg      ?	proj_typestd)r   r   r   r   Fr{   )r   r   galore_torchr   r   r   r   GALORE_ADAMWGALORE_ADAMW_8BITGALORE_ADAFACTORGALORE_ADAMW_LAYERWISEGALORE_ADAMW_8BIT_LAYERWISEGALORE_ADAFACTOR_LAYERWISEr   r$   popr   ry   r   r   r@   r!   rg   )r~   r   r   r   ri   galore_optim_kwargsrm   r!   s           r.   _get_galore_optimizerr   R  sD   $&&V
 	
 KJ 	##[((/''--{22O11? CNN&&vs34s~~112CSIJs~~))'489^^''U;	 '@#))SXX^^->UXUiUi'#M xx~~888EE RS**r-   c           
        [        5       (       d  [        S5      eSSKJn  [        R
                  U[        R                  U0n[        U R                  R                  SS5      5      U R                  R                  SS5      U R                  R                  SS	5      [        U R                  R                  S
S5      5      [        U R                  R                  SS5      5      U R                  R                  SS5      S.nUR                  U R                  5        [        U R                  U R                  U R                  R                   X#U R"                  5      $ )zGet Apollo optimizer.zYou need to install `apollo_torch` in order to use APOLLO optimizers. Install it with `pip install git+https://github.com/zhuhanqing/APOLLO`r   )APOLLOAdamWr   r   projrandom
scale_typechannelr   r   r         ?r   r   )r   r   r   r   r   r   )r   r   apollo_torchr   r   APOLLO_ADAMWAPOLLO_ADAMW_LAYERWISEr   r$   r   r   rg   r"   ry   r   r   r@   r!   )r~   r   ri   apollo_optim_kwargss       r.   _get_apollo_optimizerr   s  s   $&&U
 	
 ) 	##[--{ CNN&&vs34""684nn((yAs~~112CSIJs~~))'378^^''U; s/$#))SXX^^->UXUiUi r-   c                   [        5       (       d  [        S5      eU R                  c  [        S5      eSSKJnJn  SU R                  R                  ;   a  UOUnU R                  R                  SU R                  05        X0R                  4$ )zGet LOMO optimizer.zjYou need to install `lomo_optim` in order to use LOMO optimizers. Install it with `pip install lomo-optim`zMYou need to pass a `model` in order to correctly initialize a LOMO optimizer.r   )AdaLomoLomoadar   )r   r   r   rX   
lomo_optimr   r   r   r@   r!   rg   )r~   r   r   rm   s       r.   _get_lomo_optimizerr     sz    7
 	

 yyhii($6GDM#)) 45....r-   c                   [        5       (       d  [        S5      eSSKJn  U R                  R                  [        U R                  R                  SS5      5      [        U R                  R                  SS5      5      [        U R                  R                  SS	5      5      [        U R                  R                  S
S	5      5      [        U R                  R                  SS5      5      S.5        XR                  4$ )zGet GrokAdamW optimizer.z5Please install grokadamw with `pip install grokadamw`r   )	GrokAdamW
alpha_initg\(\?lamb       @gammag?grokking_signal_decay_rategradient_clippingr   )r   r   r   r   r  )	r   rX   	grokadamwr   r!   rg   r   r$   r   )r~   r   s     r.   _get_grokadamwr    s    !##PQQ# 2 2< FG#..,,VS9:3>>--gs;<*/0B0BC_ad0e*f!&s~~'9'9:Ms'S!T	
 ****r-   c           	        [        5       (       aK  [        R                  " [        R                  R                  S5      5      [        R                  " S5      :  a  [        S5      e[        R                  " [        R                  R                  S5      5      [        R                  " S5      ::  a  [        S5      e[        R                  " [        R                  R                  S5      5      [        R                  " S5      :  a	  SS	KJnJn  OSS	K	JnJn  U R                  R                  [        R                  :X  a  UnOUnU R                  R                  U R                   R#                  S
S5      [%        U R                   R#                  SS5      5      S.5        U R                  R                  U R&                  5        X0R                  4$ )z%Get TorchAO 4-bit or 8-bit optimizer.torchaoz0.4.0zYou need to have `torchao>=0.4.0` in order to use torch 4-bit optimizers. Install it with `pip install torchao` or follow the instructions here: https://github.com/pytorch/aor?   z2.4zYou need to have `torch>2.4` in order to use torch 4-bit optimizers. Install it with `pip install --upgrade torch` it is available on pipy. Otherwise, you need to install torch nightly.z0.11.0r   )	AdamW4bit	AdamW8bit
block_size   bf16_stochastic_roundr   )r  r
  )r   r   parse	importlibmetadatar   torchao.optimr  r  torchao.prototype.low_bit_optimr   r@   r   ADAMW_TORCH_4BITr!   rg   r$   r   r   r"   )r~   r  r  rm   s       r.   _get_torchao_optimizerr    sQ   !!W]]93E3E3M3Mi3X%Y\c\i\ijq\r%r,
 	

 }}Y''//89W]]5=QQ<
 	
 }}Y''//	:;w}}X?VV66H
xx~~888!!..,,\3?%.s~~/A/ABY[b/c%d	
 0....r-   c           	     R   [        5       (       d  [        S5      eSSKJnJn  0 nSnU R
                  R                  [        R                  :X  a2  [        S5      (       d  [        S5      eSSKJ	n  UnU R                  nSnOmU R
                  R                  [        R                  :X  a  UnU R                  nO6U R
                  R                  [        R                  :X  a  UnO[        S	5      eU R
                  R                  US
'   U(       a  U R
                  R                  US'   UR!                  [#        U R$                  R'                  SS5      5      [#        U R$                  R'                  SS5      5      S.5        U R(                  R!                  U5        X`R(                  4$ )zGet ScheduleFree optimizer.zwYou need to install `schedulefree` in order to use schedulefree optimizers. Install it with `pip install schedulefree.`r   )AdamWScheduleFreeSGDScheduleFreeTz1.4.0zYou need to install `schedulefree>=1.4.0` in order to use RAdamScheduleFree optimizer. Install it with `pip install schedulefree.`)RAdamScheduleFreeFzInvalid schedulefree optimizerr   warmup_stepsweight_lr_powerr   rg        )r  r  )r   r   schedulefreer  r  r   r@   r   SCHEDULE_FREE_RADAMr  r"   SCHEDULE_FREE_ADAMWSCHEDULE_FREE_SGDrX   r   r  rg   r   r$   r   r!   )r~   r  r  r   require_warmupr  rm   s          r.   _get_schedule_free_optimizerr    s_   $&&:
 	
 @ N
xx~~;;;(11>  	3)"%//	>==	=)"%//	>;;	;'9::.1hh.C.CN+25((2G2G/""$S^^%7%78I3%OPs~~))#s34	
  78....r-   c                   [        5       (       d  [        S5      eSSKJn  U R                  R                  SS5      nUb  [        U5      nU R                  R                  SS5      nUb  [        U5      nU R                  R                  U R                  S'   [        U R                  R                  SS	5      5      UUS
.nU R                  R                  U R                  5        U R                  R                  U5        XR                  4$ )z,Get StableAdamW optimizer from torch-optimi.zwYou need to install `torch-optimi` in order to use stable_adamw optimizers. Install it with `pip install torch-optimi`.r   )StableAdamWmax_lrN	kahan_sumr   decouple_lrF)r#  r!  r"  )r   r   optimir   r$   r   r   boolr   r   r"   r!   rg   )r~   r   r!  r"  stable_adamw_kwargss        r.   _get_stable_adamwr'    s    $&&:
 	
 #^^$/Fv"";5IO	&)hh&;&;COON#CNN..}eDE 0 34,,,,r-   zdict[str, OptimizerHandler]_OPTIMIZER_HANDLERS)r6   z
str | Nonereturnr#   )rB   r   r)  r%  )T)r   r   r   r   rh   rZ   ri   r    rj   r    r!   r    rk   r%  r)  tuple[Any, dict[str, Any]])r~   r   r)  r*  )sr*   
__future__r   importlib.metadatar  loggingcollections.abcr   dataclassesr   typingr   r   r?   	packagingr   r	   optimizationr   trainer_pt_utilsr   trainer_utilsr   training_argsr   r   utilsr   r   r   r   r   r   r   r   r   modeling_utilsr   r   	getLoggerr&   r^   r   r:   tupledictrZ   OptimizerHandlerrC   ry   r   r   r   r   r   r   r   r   r   r   r   r   r  r  r  r'  	ADAMW_BNB
ADAMW_8BITPAGED_ADAMWPAGED_ADAMW_8BITADEMAMIXADEMAMIX_8BITPAGED_ADEMAMIXPAGED_ADEMAMIX_8BITLION	LION_8BIT
PAGED_LIONPAGED_LION_8BITRMSPROP_BNBRMSPROP_8BITRMSPROP_32BIT_BITSANDBYTES_OPTIMIZERSr   r   r   r   r   r   _GALORE_OPTIMIZERSr   r   _APOLLO_OPTIMIZERSr  ADAMW_TORCH_8BIT_TORCHAO_OPTIMIZERSr  r  r  _SCHEDULE_FREE_OPTIMIZERS	ADAFACTORADAMW_TORCHr   ADAMW_TORCH_XLAADAMW_TORCH_NPU_FUSEDADAMW_ANYPRECISIONr   ADAGRADRMSPROP	GROKADAMWSTABLE_ADAMWLOMOADALOMOfromkeysr(  r+   r%   r-   r.   <module>r]     s   #   $ ! %    # 5 5 7
 
 
 /0			8	$    -.c4S>6I0JJK 8 $(V+
V+V+ V+ &	V+
 !V+ %V+ !V+  V+|+'TP//dQ*	#'	'+B:/$+& /F)/X-D ##  !!&&""   & $$##))..--  ))  ####  &&&&$$  n 0$$&6""$8((*D%%'>LLn!2,/4 mm,.IJ4 mm&(=>4  mm&(=>!4" mm')?@#4$ mm-/KL%4 0 r-   