
    Z j&                         S SK r S SKrS SKrS SKJr  SSKJrJrJrJ	r	  \	R                  " \5      rS\4S jrS rS	S jrg)
    N)
DataLoader   )WEIGHTS_NAMEPushToHubMixinis_torch_xla_availablelogging
dataloaderc                     [        5       (       ac  SS KJs  Jn  [	        XR
                  5      (       d   S5       eSS KJs  Jn  UR                  UR                  5       S5      nX0R                  S'   U $ U $ )Nr   zPThe dataloader must be a `torch_xla.distributed.parallel_loader.MpDeviceLoader`.)fsdpNinput_sharding)r   %torch_xla.distributed.parallel_loaderdistributedparallel_loader
isinstanceMpDeviceLoadertorch_xla.distributed.spmdspmdShardingSpecget_global_mesh_parallel_loader_kwargs)r	   plxssharding_specs       n/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/integrations/tpu.pytpu_spmd_dataloaderr      sn    ::*&7&788 	
^	
8 	0/(:(:(<nM?L**+;<    c                 \  ^^^^^^ SSK Js  Jm  SSKJs  Jm  SSKJn   SSKJ	m  SSKJ
m  SSKJnJn  T(       a  SSKJm  SnSn[#        U S
S5      nUR$                  R'                  SU5      n	UR$                  S   S:  a"  [(        R*                  " XAR$                  S   S9nORU	bO  [-        5       n
U	 H*  nU" X5      nUc  [/        S5      eU
R1                  U5        M,     [(        R*                  " UU
S9nUR2                  nUR$                  S   (       aJ  U R4                  R6                  (       a&  [8        R;                  S5        SU R4                  l        UUUU4S jnT(       a  U4S jnT" U UUUS9n OT" U 4UUS.UD6n S0 4U4S jjnUTl        U $ ! [          a    [!        S	5      ef = f)a  
    Wraps a model with XLA Fully Sharded Data Parallelism (FSDP).

    Handles both FSDP v1 (`XlaFullyShardedDataParallel`) and v2 (`SpmdFullyShardedDataParallel`),
    including auto-wrap policies, gradient checkpointing, and patching `xm.optimizer_step`.

    Args:
        model (`torch.nn.Module`): The model to wrap.
        args (`TrainingArguments`): The training arguments containing FSDP configuration.
        is_fsdp_xla_v2_enabled (`bool`): Whether FSDP v2 (SPMD) is enabled.

    Returns:
        `torch.nn.Module`: The FSDP-wrapped model.
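
    Example (illustrative sketch, not from the original file; assumes `args.fsdp_config` and
    `args.xla_fsdp_config` have already been populated for an XLA/TPU run):

    ```python
    wrapped = wrap_model_xla_fsdp(model, args, is_fsdp_xla_v2_enabled=False)
    optimizer = torch.optim.AdamW(wrapped.parameters(), lr=1e-5)
    ```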
    """
    import torch_xla.core.xla_model as xm
    import torch_xla.distributed.spmd as xs

    from ..trainer_pt_utils import get_module_class_from_name

    try:
        from torch_xla.distributed.fsdp import XlaFullyShardedDataParallel as FSDP
        from torch_xla.distributed.fsdp import checkpoint_module
        from torch_xla.distributed.fsdp.wrap import (
            size_based_auto_wrap_policy,
            transformer_auto_wrap_policy,
        )

        if is_fsdp_xla_v2_enabled:
            from torch_xla.experimental.spmd_fully_sharded_data_parallel import (
                SpmdFullyShardedDataParallel as FSDPv2,
            )
    except ImportError:
        raise ImportError("Missing XLA FSDP related module; please make sure to use torch-xla >= 2.0.")

    auto_wrap_policy = None
    auto_wrapper_callable = None
    default_transformer_cls_names_to_wrap = getattr(model, "_no_split_modules", None)
    fsdp_transformer_layer_cls_to_wrap = args.fsdp_config.get(
        "transformer_layer_cls_to_wrap", default_transformer_cls_names_to_wrap
    )

    if args.fsdp_config["min_num_params"] > 0:
        auto_wrap_policy = functools.partial(
            size_based_auto_wrap_policy, min_num_params=args.fsdp_config["min_num_params"]
        )
    elif fsdp_transformer_layer_cls_to_wrap is not None:
        transformer_cls_to_wrap = set()
        for layer_class in fsdp_transformer_layer_cls_to_wrap:
            transformer_cls = get_module_class_from_name(model, layer_class)
            if transformer_cls is None:
                raise Exception("Could not find the transformer layer class to wrap in the model.")
            transformer_cls_to_wrap.add(transformer_cls)
        auto_wrap_policy = functools.partial(
            transformer_auto_wrap_policy, transformer_layer_cls=transformer_cls_to_wrap
        )

    fsdp_kwargs = args.xla_fsdp_config
    if args.fsdp_config["xla_fsdp_grad_ckpt"]:
        if model.config.use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
            )
            model.config.use_cache = False

        # Apply gradient checkpointing to auto-wrapped sub-modules if specified.
        def auto_wrapper_callable(m, *args, **kwargs):
            target_cls = FSDP if not is_fsdp_xla_v2_enabled else FSDPv2
            return target_cls(checkpoint_module(m), *args, **kwargs)

    if is_fsdp_xla_v2_enabled:

        def shard_output(output, mesh):
            from ..modeling_outputs import CausalLMOutputWithPast

            real_output = None
            if isinstance(output, torch.Tensor):
                real_output = output
            elif isinstance(output, tuple):
                real_output = output[0]
            elif isinstance(output, CausalLMOutputWithPast):
                real_output = output.logits

            if real_output is None:
                raise ValueError("Something went wrong, the output of the model shouldn't be `None`")
            xs.mark_sharding(real_output, mesh, ("fsdp", None, None))

        model = FSDPv2(
            model,
            shard_output=shard_output,
            auto_wrap_policy=auto_wrap_policy,
            auto_wrapper_callable=auto_wrapper_callable,
        )
    else:
        model = FSDP(
            model,
            auto_wrap_policy=auto_wrap_policy,
            auto_wrapper_callable=auto_wrapper_callable,
            **fsdp_kwargs,
        )

    # Patch `xm.optimizer_step` so it does not reduce gradients here, since FSDP already
    # handles gradient reduction over the sharded parameters.
    def patched_optimizer_step(optimizer, barrier=False, optimizer_args={}):
        loss = optimizer.step(**optimizer_args)
        if barrier:
            xm.mark_step()
        return loss

    xm.optimizer_step = patched_optimizer_step

    return model


def save_tpu_checkpoint(model, args, accelerator, processing_class, is_fsdp_xla_v1_enabled, output_dir=None):
    """
    Saves a model checkpoint on TPU/XLA devices.

    Handles FSDP v1 sharded checkpoints (with consolidation on master), as well as
    standard XLA model saving via `save_pretrained` or `xm.save`.

    Args:
        model (`torch.nn.Module`): The model to save.
        args (`TrainingArguments`): The training arguments.
        accelerator (`Accelerator`): The accelerator instance.
        processing_class: The processing class (tokenizer/processor) to save alongside the model.
        is_fsdp_xla_v1_enabled (`bool`): Whether FSDP XLA v1 is enabled.
        output_dir (`str`, *optional*): The directory to save to. Defaults to `args.output_dir`.
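
    Example (illustrative sketch, not from the original file; assumes a wrapped `model`, a
    configured `accelerator`, and a tokenizer passed as `processing_class`):

    ```python
    save_tpu_checkpoint(
        model, args, accelerator, tokenizer, is_fsdp_xla_v1_enabled=True, output_dir="ckpt-500"
    )
    ```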
    """
    import torch_xla.core.xla_model as xm

    output_dir = output_dir if output_dir is not None else args.output_dir
    logger.info(f"Saving model checkpoint to {output_dir}")
    xm.mark_step()

    if xm.is_master_ordinal(local=False):
        os.makedirs(output_dir, exist_ok=True)
        torch.save(args, os.path.join(output_dir, "training_args.bin"))

    # Save a trained model and configuration using `save_pretrained()`, so that it can be
    # reloaded with `from_pretrained()`.
    supported_classes = (PushToHubMixin,)
    xm.rendezvous("saving_checkpoint")
    if is_fsdp_xla_v1_enabled:
        ckpt = {
            "model": model.state_dict(),
            "shard_metadata": model.get_shard_metadata(),
        }
        ckpt_path = os.path.join(
            output_dir, f"rank{args.process_index}-of-{args.world_size}-{WEIGHTS_NAME}"
        )
        # All ranks save their own sharded checkpoint.
        xm.save(ckpt, ckpt_path, master_only=False)
        # Make sure all ranks have saved before consolidating.
        xm.rendezvous("save_full_checkpoints")
        # The master rank consolidates the shards into a full checkpoint.
        if args.should_save:
            from torch_xla.distributed.fsdp import consolidate_sharded_model_checkpoints

            full_state_dict, _ = consolidate_sharded_model_checkpoints(
                ckpt_prefix=os.path.join(output_dir, ""),
                ckpt_suffix=f"rank*-of-*-{WEIGHTS_NAME}",
                save_model=False,
            )
            model = model.module.module
            unwrapped_model = accelerator.unwrap_model(model)
            if isinstance(unwrapped_model, supported_classes):
                unwrapped_model.save_pretrained(output_dir, state_dict=full_state_dict)
            else:
                logger.info("Trainer.model is not a `PreTrainedModel`, only saving its state dict.")
                xm.save(full_state_dict, os.path.join(output_dir, WEIGHTS_NAME))
    elif not isinstance(model, supported_classes):
        if isinstance(accelerator.unwrap_model(model), supported_classes):
            accelerator.unwrap_model(model).save_pretrained(
                output_dir,
                is_main_process=args.should_save,
                state_dict=xm._maybe_convert_to_cpu(model.state_dict()),
            )
        else:
            logger.info("Trainer.model is not a `PreTrainedModel`, only saving its state dict.")
            state_dict = xm._maybe_convert_to_cpu(model.state_dict())
            xm.save(state_dict, os.path.join(output_dir, WEIGHTS_NAME))
    else:
        model.save_pretrained(
            output_dir,
            is_main_process=args.should_save,
            state_dict=xm._maybe_convert_to_cpu(model.state_dict()),
        )

    if processing_class is not None and args.should_save:
        processing_class.save_pretrained(output_dir)