
    Z j                     |   S r SSKrSSKrSSKrSSKrSSKJr  SSKJ	r	  SSK
JrJrJr  \" 5       (       a
  SSKrSSKJr  \R                   " \5      rS r\" 5       (       a  \" 5       (       a  SS	KJr  OSS
KJr   " S S\5      r " S S\5      rSqS rS rS rS rS rS r SS jr!S r"SS jr#SS jr$SS jr%S r&g)z
Integration with Deepspeed
    N)partialmethod   )dep_version_check)is_accelerate_availableis_torch_availablelogging)nnc                      [         R                  R                  S5      S Ln U (       a!   [         R                  R                  S5      ngg ! [         R                  R                   a     gf = f)N	deepspeedTF)	importlibutil	find_specmetadataPackageNotFoundError)package_exists_s     t/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/integrations/deepspeed.pyis_deepspeed_availabler   $   sc    ^^--k:$FN 	""++K8A  !!66 		s   A A,+A,)HfDeepSpeedConfig)objectc                   ,   ^  \ rS rSrSrU 4S jrSrU =r$ )r   9   a"  
This object contains a DeepSpeed configuration dictionary and can be quickly queried for things like zero stage.

A `weakref` of this object is stored in the module's globals to be able to access the config from areas where
things like the Trainer object is not available (e.g. `from_pretrained` and `_get_resized_embeddings`). Therefore
it's important that this object remains alive while the program is still running.

[`Trainer`] uses the `HfTrainerDeepSpeedConfig` subclass instead. That subclass has logic to sync the configuration
with values of [`TrainingArguments`] by replacing special placeholder values: `"auto"`. Without this special logic
the DeepSpeed configuration is not modified in any way.

Args:
    config_file_or_dict (`Union[str, Dict]`): path to DeepSpeed config file or dict.

c                 f   > [        U 5        [        S5        [        S5        [        TU ]  U5        g )N
accelerater   )set_hf_deepspeed_configr   super__init__selfconfig_file_or_dict	__class__s     r   r   HfDeepSpeedConfig.__init__J   s)    %,'+&,-     )__name__
__module____qualname____firstlineno____doc__r   __static_attributes____classcell__r!   s   @r   r   r   9   s     . .r#   r   c                   `   ^  \ rS rSrSrU 4S jrS rS rSS jr\	" \SS9r
SS	 jrS
 rSrU =r$ )HfTrainerDeepSpeedConfigR   z
The `HfTrainerDeepSpeedConfig` object is meant to be created during `TrainingArguments` object creation and has the
same lifespan as the latter.
c                 @   > [         TU ]  U5        S U l        / U l        g N)r   r   _dtype
mismatchesr   s     r   r   !HfTrainerDeepSpeedConfig.__init__X   s    ,-r#   c                 J    U R                   c  [        S5      eU R                   $ )Nz8trainer_config_process() wasn't called yet to tell dtype)r2   
ValueError)r   s    r   dtypeHfTrainerDeepSpeedConfig.dtype]   s"    ;;WXX{{r#   c                 6    U R                  U5      nUc  gUS:H  $ )NFauto)	get_value)r   ds_key_longvals      r   is_auto HfTrainerDeepSpeedConfig.is_autob   s"    nn[);&= r#   c           
          U R                  U5      u  pVUc  gUR                  U5      S:X  a  X%U'   gU(       d  gUR                  U5      nUb.  Xr:w  a(  U R                  R                  SU SU SU SU 35        ggg)a  
A utility method that massages the config file and can optionally verify that the values match.

1. Replace "auto" values with `TrainingArguments` value.

2. If it wasn't "auto" and `must_match` is true, then check that DS config matches Trainer
config values and if mismatched add the entry to `self.mismatched` - will assert during
`trainer_config_finalize` for one or more mismatches.

Nr:   z- ds =z vs hf )find_config_nodegetr3   append)r   r<   hf_valhf_key
must_matchconfigds_keyds_vals           r   
fill_match#HfTrainerDeepSpeedConfig.fill_matchi   s     ..{;>::f'#6NF#&"2OO""U;-qxqQWPX#YZ #3r#   F)rG   c                    UR                   UR                  -  UR                  -  nU R                  SUR                  SU(       + 5        U R                  SUR                  S5        U R                  SUSU(       + 5        U R                  SUR                  S5        U R                  SUR
                  S	5        U R                  S
UR                  UR                  /S5        U R                  SUR                  S5        U R                  SUR                  S5        U R                  SS5        U R                  SUR
                  S	5        UR                  (       aE  U R                  R                  S0 5      U R                  S'   UR                  U R                  S   S'   U R                  SUR                  =(       d    UR                  S5        U R                  SUR                   =(       d    UR"                  S5        U R%                  S5      (       a  [&        R(                  U l        gU R%                  S5      (       a  [&        R,                  U l        g[&        R.                  U l        g)zr
Adjust the config with `TrainingArguments` values. This stage is run during `TrainingArguments` object
creation.
train_micro_batch_size_per_gpuper_device_train_batch_sizegradient_accumulation_stepstrain_batch_sizeztrain_batch_size (calculated)gradient_clippingmax_grad_normzoptimizer.params.lrlearning_ratezoptimizer.params.betaszadam_beta1+adam_beta2zoptimizer.params.epsadam_epsilonzoptimizer.params.weight_decayweight_decayzscheduler.params.warmup_min_lrr   zscheduler.params.warmup_max_lr
checkpointuse_node_local_storagezfp16.enabledzfp16|fp16_full_evalzbf16.enabledzbf16|bf16_full_evalN)
world_sizerO   rP   rK   rS   rT   
adam_beta1
adam_beta2rU   rV   	fill_onlysave_on_each_noderH   rC   fp16fp16_full_evalbf16bf16_full_evalis_truetorchbfloat16r2   float16float32)r   argsauto_find_batch_sizerQ   s       r   trainer_config_process/HfTrainerDeepSpeedConfig.trainer_config_process   s     ??T-M-MMPTPpPpp,,,)$$		
 	),,)	

 	+$$		
 	+T-?-?Q-t/A/A?S$__doo.#	

 	.0A0A>R79J9JN[7;8$:L:Lo^ !!(,b(IDKK%BFBXBXDKK%&>? 	)Id6I6ILab)Id6I6ILab <<''..DK\\.))--DK--DKr#   c                    / SQnU Vs/ s H  oPR                  U5      (       d  M  UPM     nn[        U5      S:  Ga  Sn[        US5      (       Ga8  [        UR                  S5      (       a  UR                  R                  nGO[        UR                  S5      (       a   [        UR                  R                  5      nO[        UR                  S5      (       aF  [        UR                  R                  S5      (       a!  UR                  R                  R                  nOi[        UR                  S5      (       aN  [        UR                  R                  S5      (       a)  [        UR                  R                  R                  5      nUc  [        SU S	35      eU R                  S
Xw-  5        U R                  5       (       a6  U R                  S[        SU-  U-  5      5        U R                  SSU-  5        U R                  SUS5        U R                  SUR                  U5      S5        [        U R                  5      S:  a*  SR                  U R                  5      n[        SU S35      egs  snf )zx
This stage is run after we have the model and know num_training_steps.

Now we can complete the configuration process.
)$zero_optimization.reduce_bucket_size-zero_optimization.stage3_prefetch_bucket_size4zero_optimization.stage3_param_persistence_thresholdr   NrH   hidden_sizehidden_sizestext_configzThe model's config file has neither `hidden_size` nor `hidden_sizes` entry, therefore it's not possible to automatically fill out the following `auto` entries in the DeepSpeed config file: zb. You can fix that by replacing `auto` values for these keys with an integer value of your choice.rl   rm   g?rn   
   z scheduler.params.total_num_stepsznum_training_steps (calculated)z!scheduler.params.warmup_num_stepswarmup_steps
z]Please correct the following DeepSpeed config values that mismatch TrainingArguments values:
zF
The easiest method is to set these DeepSpeed config values to 'auto'.)r>   lenhasattrrH   ro   maxrp   rq   r6   r\   is_zero3intrK   get_warmup_stepsr3   join)	r   rg   modelnum_training_stepshidden_size_based_keysxhidden_size_auto_keysro   r3   s	            r   trainer_config_finalize0HfTrainerDeepSpeedConfig.trainer_config_finalize   s    "

 -C V,BqllSTo,B V$%)Kuh''5<<77"',,":":KU\\>::"%ell&?&?"@KU\\=99gellF^F^`m>n>n"',,":":"F"FKU\\=99gellF^F^`n>o>o"%ell&>&>&K&K"LK" 55J4K LYY  NNA;C\]}}Ck)K78 J$ 	.-	

 	/!!"45	
 t!#4??3J'L(oq  $a !Ws
   I4I4)r2   r3   )NTF)r%   r&   r'   r(   r)   r   r7   r>   rK   r   r\   ri   r   r*   r+   r,   s   @r   r.   r.   R   s=    


![4 jU;I8(tC Cr#   r.   c                 0    [         R                  " U 5      qg r1   )weakrefref_hf_deepspeed_config_weak_ref)hf_deepspeed_config_objs    r   r   r   	  s    
 %,KK0G$H!r#   c                      S q g r1   )r   r$   r#   r   unset_hf_deepspeed_configr     s
     %)!r#   c                  X    [         b#  [        5       b  [        5       R                  5       $ g)NF)r   rx   r$   r#   r   is_deepspeed_zero3_enabledr     s&    $05R5T5`,.7799r#   c                  P    [         b  [        5       b  [        5       R                  $ g r1   )r   rH   r$   r#   r   deepspeed_configr     s#    $05R5T5`,.555r#   c                 "  ^^^^ SSK mSSKnSSKJn  SSKJm  U R                  5       mUUUU4S jmUR                  " 5          U" 5          T" X R                  5        SSS5        SSS5        g! , (       d  f       N= f! , (       d  f       g= f)a-  
DeepSpeed ZeRO-3 variant of `PreTrainedModel.initialize_weights`. Mirrors the `smart_apply`
dispatch logic but gathers each module's partitioned parameters before calling
`_initialize_weights`, so initialization operates on full tensors instead of empty shards.
Only rank 0 performs the actual init.
r   Nr   )guard_torch_init_functions)PreTrainedModelc                   > U R                  5        H0  n[        UT5      (       a  T" X"R                  5        M(  T" X!5        M2     [        U R	                  SS95      nU(       aK  TR
                  R                  USS9   TR                  R                  5       S:X  a	  U" U T5        S S S 5        g U" U T5        g ! , (       d  f       g = f)NF)recurser   modifier_rank)	children
isinstance_initialize_weightslist
parameterszeroGatheredParameterscommget_rank)model_or_modulefnchildparamsr   _apply_zero3r   is_remote_codes       r   r   .initialize_weights_zero3.<locals>._apply_zero34  s    $--/E%11U$=$=>U'	 0 o000?@2262K>>**,17 LK /	 LKs   ?(B::
C)	r   rc   initializationr   modeling_utilsr   r   no_gradr   )r|   rc   r   r   r   r   r   s      @@@@r   initialize_weights_zero3r   %  sc     ;0))+N0 0 
') 9 9: * 
)) 
s$   B A/B /
A=	9B  
Bc                 Z  ^! [        5       nUb  UR                  S0 5      R                  SS5      nUR                  S0 5      n[        U[        5      (       a+  [	        XER                  S0 5      R                  SS5      5      nUS:  a  [        S5      eSS	KJnJnJ	m!J
n  [        US
S5      n	U R                  n
0 nU R                  5       R                  5        H1  u  p[        R                   " UR"                  UR$                  SS9X'   M3     U Vs/ s H  n[        X5      (       d  M  UPM     nnU Vs/ s H  n[        X5      (       d  M  UPM     nn['        U5      S:X  aC  0 nUR                  5        H!  u  nnU" UU/ X5      u  nnUU;   d  M  UUU'   M#     U	b  U	Ul        U$ U VVs0 s H  nUR*                    H  nUU_M     M     nnn0 n0 n[-        UR/                  5       U!4S jS9nU H  nUR1                  U5      nU" UUUX5      u  nnUU;   d  M*  UbS  UU   nU" UR*                  UR2                  UR4                  S9nUR7                  UU5      nUR9                  UUUU5        M  UUU'   M     UR                  5        H]  u  nn UR;                  UU U R<                  S9nUR                  5        H'  u  nn[        U[>        5      (       a  US   OUnUUU'   M)     M_     U	b  U	Ul        U$ s  snf s  snf s  snnf ! [@         a  n [C        SU SU  35      U eSn A ff = f)z
Apply weight conversions (renaming and merging/splitting operations) to a state dict.
This is a simplified version that handles the conversion without loading into the model.
Ntensor_parallelautotp_size   	inferencetp_sizezWeight conversions (e.g., MoE expert fusion) with DeepSpeed Tensor Parallelism are not yet implemented but support is coming soon. Please disable tensor_parallel in your DeepSpeed config or convert your checkpoint to the expected format first.r   )WeightConverterWeightRenamingdot_natural_keyrename_source_key	_metadatameta)r7   devicer   c                    > T" U 5      $ r1   r$   )kr   s    r   <lambda>9_apply_weight_conversions_to_state_dict.<locals>.<lambda>  s
    /!:Lr#   )key)source_patternstarget_patterns
operations)r|   rH   z'Failed to apply weight conversion for 'zb'. This likely means the checkpoint format is incompatible with the current model version. Error: )"r   rC   r   dictrw   NotImplementedErrorcore_model_loadingr   r   r   r   getattrbase_model_prefix
state_dictitemsrc   emptyshaper7   ru   r   r   sortedkeyspopr   r   
setdefault
add_tensorconvertrH   r   	ExceptionRuntimeError)"r|   r   weight_mapping	ds_configr   inference_configr   r   r   r   prefixmodel_state_dictr   paramentry	renamings
convertersnew_state_dictoriginal_keytensorrenamed_keyr   	converterr   pattern_to_converterconversion_mappingsorted_keyssource_patternnew_convertermappingrealized_valuetarget_nameer   s"                                    @r   '_apply_weight_conversions_to_state_dictr   H  sg    !"I-- 126::=!L$==b9&--'#7#78I2#N#R#RS\^_#`aGQ;%d  ih z;5H$$F &&(..0
 %EKKu{{SY Z 1 %3XN5j6WNIX%3Z^Ez%7Y%^JZ :!$.$4$4$6 L&.|YFeNK...4{+ %7
 '/N$ ;Eh*YiNgNgAyLNgA*h
 N*0LMK#-&7iQ[]c&v#^ **) 1@	 /$-$=$=$-$=$=(33!
 -77]S"";nfU /5{++ $0 !3 8 8 :W	$__|| - N
 '5&:&:&<"U$.ud$;$;a.3{+ '= !;$ #+ K YZ iT  	9+ G 		s7   K9*K96K>K>-!LAL		
L*L%%L*c           	        ^^	^
^ [        USS5      m
UR                  5       nT
b  T
Ul        SnUb  [        USS5      nUb!  [        U5      S:  a  [	        XU5      nX0l        / mU R                  5       n[        UR                  5       5      m[        U SS5      nUR                  5        VVs0 s H&  u  pgUR                  U SU 35      b  U SU 3OUU_M(     nnnSS[        R                  4UU	U
U4S	 jjjm	T	" XSS
9  TT4$ s  snnf )a  
Loads state dict into a model specifically for Zero3, since DeepSpeed does not support the `transformers`
tensor parallelism API.

Nearly identical code to PyTorch's `_load_from_state_dict`

Args:
    model_to_load: The model to load weights into
    state_dict: The state dict containing the weights
    load_config: Optional LoadStateDictConfig containing weight_mapping and other loading options
r   Nr   r   r   .Fmodulec                   > Tc  0 OTR                  US S 0 5      nX4S'   XUS/ / T4n[        5       (       GaL  SS Kn[        U R	                  US S SS95      n/ nU H7  n	X;   d  M
  Xy   n
SU
l        UR                  U
5        TR                  U	5        M9     [        U5      S:  aT  UR                  R                  USS9   [        R                  R                  5       S:X  a  U R                  " U6   S S S 5        [        U R                  US S SS95      nUR!                  5        HZ  u  pX;   d  M  Uc  M  TR                  U	5        [        R"                  " 5          UR%                  X   5        S S S 5        SUl        M\     U R&                  R!                  5        H  u  pUc  M
  T" XX--   S-   U5        M     g ! , (       d  f       N= f! , (       d  f       Nb= f)	Nassign_to_params_buffersTr   F)r   r   r   r   )rC   r   r   r   named_parameters_is_hf_initializedrD   discardru   r   r   rc   distributedr   _load_from_state_dictnamed_buffersr   r   copy__modules)r   r   r   r   local_metadatarg   r   r   params_to_gatherr   r   r   bufnamer   
error_msgsloadr   missing_keyss                  r   r   /_load_state_dict_into_zero3_model.<locals>.load  s   '/X\\&"+r5R5M12ND"b*M &''  $F$;$;6#2;X]$;$^_!%?,/E/3E,$++E2 ((+ & #$q( ^^667GWX6Y((113q844d; Z
 !!5!5VCR[RW!5!XYM'--/?s ((+		*-0 )-1C* 0 "??002KD U(;=UV 3 ZY )s    2F<G<
G

G	)r   ) F)r   copyr   ru   r   _weight_conversionsr   setr   r   rC   r	   Module)model_to_loadr   load_configr   meta_model_state_dictprefix_modelr   vr   r   r   r   s           @@@@r   !_load_state_dict_into_zero3_modelr    s9    z;5H"J'
 N .>E !c.&9A&=<]Xfg
,:)J)446,1134L=*=tDL $$&&DA #8";";|nAaS<Q"R"^L>1#	dehi	i&  )WRYY )W )WV 	UC|##is   --Dc                 0  ^ ^ SSK JnJn  UR                  nSnSU;   a  U" US9nO?UR	                  5       (       a  [
        R                  S5        T R                  5       nSUS'   Sn	S	U;   a  U" U5      n	X4$ [        X5      (       a  UU 4S
 jn
U" XS9n	X4$ )zQ
A convenience wrapper that deals with optimizer and lr scheduler configuration.
r   )
DummyOptimDummySchedulerN	optimizer)r   zDetected ZeRO Offload and non-DeepSpeed optimizers: This combination should work as long as the custom optimizer has both CPU and GPU implementation (except LAMB)Tzero_allow_untested_optimizer	schedulerc                 b   > [         R                   " T5      nS Ul        UR                  TU S9nU$ )N)r}   r  )r   lr_schedulercreate_scheduler)r  trainer_copyr  r}   trainers      r   _lr_scheduler_callable5deepspeed_optim_sched.<locals>._lr_scheduler_callable3  s=    #yy1 -1)+<<'9Y  =   $#r#   )lr_scheduler_callable)	accelerate.utilsr
  r  rH   
is_offloadloggerinfocreate_optimizerr   )r  hf_deepspeed_configrg   r}   model_parametersr
  r  rH   r  r  r  s   `  `       r   deepspeed_optim_schedr    s     < ''F If&67	))++KKV ,,.	26./Lf%i0" "" i,,	$ *)bL""r#   c                    SSK Jn  U R                  nU R                  nU R                  R
                  R                  R                  nUR                  XTU5        UR                  UR                  5       5        U(       aK  UR                  5       (       d  [        S5      eUR                  S5        UR                  S5        Su  pxSn	Xx4$ SU l        UR                  R!                  S0 5      R!                  S	S
5      n
U
S
:  a.  SSKnUR%                  UU
UR'                  5       UR                  S9n[)        [+        S UR-                  5       5      5      n	[/        XXQU	5      u  pxXx4$ )a  
Init DeepSpeed, after updating the DeepSpeed configuration with any relevant Trainer's args.

If `resume_from_checkpoint` was passed then an attempt to resume from a previously saved checkpoint will be made.

Args:
    trainer: Trainer object
    num_training_steps: per single gpu
    resume_from_checkpoint: path to a checkpoint if to resume from after normal DeepSpeedEngine load
    inference: launch in inference mode (no optimizer and no lr scheduler)
    auto_find_batch_size: whether to ignore the `train_micro_batch_size_per_gpu` argument as it's being
        set automatically by the auto batch size finder

Returns: optimizer, lr_scheduler

We may use `deepspeed_init` more than once during the life of Trainer, when we do - it's a temp hack based on:
https://github.com/deepspeedai/DeepSpeed/issues/1394#issuecomment-937405374 until Deepspeed fixes a bug where it
can't resume from a checkpoint after it did some stepping https://github.com/deepspeedai/DeepSpeed/issues/1612

r   )r  zMZeRO inference only makes sense with ZeRO Stage 3 - please adjust your configr  r  )NNNr   r   r   )r|   r   r7   rH   c                     U R                   $ r1   )requires_grad)ps    r   r    deepspeed_init.<locals>.<lambda>{  s    r#   )deepspeed.utilsr  r|   rg   acceleratorstatedeepspeed_pluginhf_ds_configr   setLevelget_process_log_levelrx   r6   del_config_sub_treer  rH   rC   r   tp_model_initr7   r   filterr   r  )r  r}   r   	ds_loggerr|   rg   r  r  r  r  deepspeed_tp_sizer   s               r   deepspeed_initr0  C  sd   * 4MME<<D!--33DDQQ //=OP t1134"++--lmm 	//<//?",	* ""' !/66::;LbQUUVcefgq ++))//1*11	 , E  '@%BRBRBT UV"7$DT#
	 ""r#   c                     SS K n[        UR                  U S35      5      n[        U5      S:  a>  [        R	                  SU 35        U R                  UUSSS9u  pVUc  [        SU 35      eg [        SU 35      e)Nr   z/global_step*zAttempting to resume from T)load_module_strictload_optimizer_statesload_lr_scheduler_statesz-[deepspeed] failed to resume from checkpoint z!Can't find a valid checkpoint at )globr   ru   r  r  load_checkpointr6   )deepspeed_enginecheckpoint_pathr2  r5  deepspeed_checkpoint_dirs	load_pathr   s          r   deepspeed_load_checkpointr;    s    
  &tyyO3DM1R'S T
$%)00ABC'771"&%)	 8 
	 L_L]^__  <_<MNOOr#   c                     U R                   R                  n[        UR                  R                  5      Ul        UR                  R                  Ul        UR                  R                  X5        g)aw  
Sets values in the deepspeed plugin based on the TrainingArguments.

Args:
    accelerator (`Accelerator`): The Accelerator object.
    args (`TrainingArguments`): The training arguments to propagate to DeepSpeed config.
    auto_find_batch_size (`bool`, *optional*, defaults to `False`):
        Whether batch size was auto-discovered by trying increasingly smaller sizes.
N)r&  r'  r.   r(  rH   r   ri   )r%  rg   rh   	ds_plugins       r   propagate_args_to_deepspeedr>    sV     !!22I5i6L6L6S6STI!*!7!7!>!>I11$Mr#   c                   ^^ SU;  a  SU;   a  US   US'   U" S0 UD6nUR                   nUR                  S:X  a'  UR                  S:  a  SSKJn  UR                  5       nO6U R                  b  U R                  S   R                  5       nO[        S5      eUR                  n	[        R                  R                  R                  R                  XhS	9mUS   S
:g  R                  S5      R                  5       n
[        R                  R                  R                  R                  XS	9m[        UU4S j[!        U	5       5       5      n[        T5      nU[#        US5      -  nU(       a  Xe4$ U$ )aA  
Computes the loss under sequence parallelism with `sp_backend="deepspeed"` and `sp_size > 1`.

Performs weighted loss aggregation across SP ranks, accounting for varying numbers of valid tokens per rank
(e.g., when some ranks receive only padding or prompt tokens that are masked with -100).

Args:
    accelerator (`Accelerator`): The accelerator instance with `torch_device_mesh` support.
    model (`torch.nn.Module`): The model to compute the loss for.
    inputs (`dict[str, torch.Tensor | Any]`): The input data for the model. Must include `"shift_labels"` key.
    return_outputs (`bool`): Whether to return the model outputs along with the loss.
    pc (`accelerate.parallelism_config.ParallelismConfig`): The parallelism configuration.

Returns:
    The loss, or a tuple of `(loss, outputs)` if `return_outputs` is `True`.
labelsshift_labelsr   r   r   )groupsspzSequence parallelism is enabled but no SP process group is available. Ensure torch_device_mesh is initialized or sp_backend='deepspeed' with sp_size > 1.)groupir   c              3   P   >#    U  H  nTU   S :  d  M  TU   TU   -  v   M     g7f)r   Nr$   ).0rankgood_tokens_per_ranklosses_per_ranks     r   	<genexpr>,deepspeed_sp_compute_loss.<locals>.<genexpr>  s7      (D%) 	; 4T ::(s   &&r$   )loss
sp_backendsp_sizer$  rB  _get_sequence_parallel_grouptorch_device_mesh	get_groupr6   rc   r   r	   
functional
all_gatherviewsumrangerw   )r%  r|   inputsreturn_outputspcoutputsrL  rB  sp_groupsp_world_sizegood_tokens
total_losstotal_good_tokensrH  rI  s                @@r   deepspeed_sp_compute_lossr`    s\   , v.F":!.1xofoG<<D 
}}#

Q*668		&	&	2006@@Bb
 	
 JJM''**55@@@VO.)T177;??AK ,,//::EEkEb -( J
 01-q11D,D?6$6r#   r1   r   )T)'r)   r   importlib.metadatar   importlib.utilr   	functoolsr   dependency_versions_checkr   utilsr   r   r   rc   r	   
get_loggerr%   r  r   accelerate.utils.deepspeedr   DeepSpeedConfigbuiltinsr   r.   r   r   r   r   r   r   r   r  r  r0  r;  r>  r`  r$   r#   r   <module>rj     s        # 9 H H  
		H	%
 !7!9!9O 3. .2p0 ph !% I) ;FhVW$t3#l@#FP0N"77r#   