
    R j              	       	   S SK r S SKrS SKrS SKrS SKrS SKrS SKrS SKJrJ	r	  S SK
Jr  S SK Jr  S SKJr  S SKJrJr  S SKJr  S SKJrJrJrJrJr  S S	KJr  S SKrS SKJr  S SKJ r   S SK!J s  J"r#  S S
K$J%r%  S SK&J'r'  S SK(J)r)J*r*J+r,  S SK-J.r.  S SK/J0r0J1r1  S SK2J3r3  S SK4J5r5J6r6J7r7  S SK8J9r9  S SK:J;r;J<r<J=r=  S SK>J?r?J@r@JArA  S SKBJCrCJDrDJErEJFrF  S SKJGrGJHrH  S SKIJJrK  S SKLJMrMJNrNJOrOJPrPJQrQ  S SKRJSrSJTrTJUrUJVrVJWrWJXrXJYrY  S SKZJ[r[  \X(       a)  \\" S\]" S\R                  R                  5       5      5      r`OSr`\V(       a  SraSrb\R                  R                  5       r`O8\W(       a  SraSrbO,\Y(       a  S raS!rb\R                  R                  5       r`OS"raS#rbS$r` " S% S&\5      rd " S' S(\5      re " S) S*\ R                  \5      rgS+\ R                  S,\R                  S-\4S. jriS/ rj  SS+\ R                  S0\k4S1 jjrlSS2 jrmS3 rnS4 roSS+\ R                  S5\k4S6 jjrpS+\ R                  S7\k4S8 jrqS+\ R                  S9\k4S: jrr " S; S<5      rs " S= S>\g5      rt " S? S@\g5      ru " SA SB\u5      rv " SC SD\u5      rw " SE SF\g5      rx " SG SH\x5      ry " SI SJ\ R                  5      rz " SK SL\u5      r{ " SM SN\ R                  5      r| " SO SP\ R                  5      r~ " SQ SR\ R                  5      r\ GR                   SS\4ST j5       r\ GR                   SU\4SV j5       r\ GR                   SW\4SX j5       r\ GR                   SY\4SZ j5       r\ GR                   S[\4S\ j5       r\\ GR                   S]\4S^ j5       5       r\\ GR                   S_\4S` j5       5       r\\ GR                   Sa\4Sb j5       5       r\\ GR                   Sc\4Sd j5       5       rSe\S-\Sf\Sg\4Sh jr SSi\ R                  Sj\ R                  Sk\\Sl4   4Sm jjr\GR                  " \YSn5       " So Sp\O5      5       r " Sq Sr5      r " Ss St\\N5      r " Su Sv\\M5      rSSw\\   4Sx jjr " Sy Sz\ R                  5      r " S{ S|\ R                  5      r " S} S~\ R                  5      rg)    N)ABCabstractmethod)Callable)nullcontext)deepcopy)autoEnumwraps)Anycastno_type_checkOptionalUnion)mock)
checkpoint)
DeviceMesh)
CPUOffloadfully_shardFullyShardedDataParallel)TrainingState)FSDPParamGroupRegisterPostBackwardFunction)#NO_RESHARD_AFTER_FORWARD_STRATEGIES)BackwardPrefetchMixedPrecisionShardingStrategy)ShardedGradScaler)always_wrap_policyModuleWrapPolicywrap)distribute_tensorDTensorShard)ColwiseParallelparallelize_moduleRowwiseParallelSequenceParallel)TransformerDecoderLayerTransformerEncoderLayer)DistributedDataParallel)MultiProcContinuousTestMultiProcessTestCaseMultiThreadedTestCaserun_subtests
TEST_SKIPS)FILE_SCHEMAget_cycles_per_msset_rng_seed	TEST_CUDATEST_HPUTEST_WITH_ROCMTEST_XPU)
has_triton      cudancclzhpu:0hcclxpuxcclcpugloo   c                   0    \ rS rSr\" 5       r\" 5       rSrg)FSDPInitMode^    N)__name__
__module____qualname____firstlineno__r   NO_FSDP	RECURSIVE__static_attributes__rF       t/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/torch/testing/_internal/common_fsdp.pyrD   rD   ^   s    fGIrN   rD   c                   >    \ rS rSr\" 5       r\" 5       r\" 5       rSrg)DEVICEInitModeg   rF   N)	rG   rH   rI   rJ   r   DEVICE_BEFOREDEVICE_AFTERDEVICE_NEVERrM   rF   rN   rO   rQ   rQ   g   s    FM6L6LrN   rQ   c                       \ rS rSrSr\S\\R                  S4   4S j5       r	\S\R                  4S j5       r
\SS j5       r\\S	\S
\S\R                  4S j5       5       rSrg)FSDPTestModelp   zVThis defines the interface expected from all models used commonly for
FSDP unit tests.return.c                     g)z+Returns an input for the model as as tuple.NrF   selfdevices     rO   	get_inputFSDPTestModel.get_inputt        	rN   c                     g)z,Returns the loss given the input and output.NrF   )r\   inputoutputs      rO   get_lossFSDPTestModel.get_lossy   r`   rN   Nc                     g)z<Runs the backward pass (e.g. including ``loss.backward()``).NrF   r\   losss     rO   run_backwardFSDPTestModel.run_backward~   r`   rN   argskwargsc                      g)z&Initializes an instance of this model.NrF   )rk   rl   s     rO   initFSDPTestModel.init   s     	rN   rF   rY   N)rG   rH   rI   rJ   __doc__r   tupletorchTensorr^   rd   ri   staticmethodr   nnModulern   rM   rF   rN   rO   rW   rW   p   s     5s):#;        C 3 299   rN   rW   modelprocess_group	assert_fnc                 X   U R                  5        VVs/ s H$  u  p4X4R                  5       R                  5       4PM&     nnnUU R                  5        VVs/ s H$  u  pgXgR                  5       R                  5       4PM&     snn-  n[        R
                  " U5      n[        U5       V	s/ s H  n	SPM     n
n	[        R                  " XUS9  U
S   nUc  [        S5      eU
SS  H1  nUc  [        S5      e[        XSS9 H  u  u  pu  pU" X5        M     M3     gs  snnf s  snnf s  sn	f )	z
All-gathers module states across ranks and calls ``assert_fn`` on each pair
of corresponding states from rank 0 and a nonzero rank. For example, if
``assert_fn`` is ``self.assertEqual()``, then this checks that all module
states are equal across ranks.
Ngroupr   z$Expected rank0_states to not be NonerB   zExpected state to not be NoneTstrict)
named_parametersdetachr@   named_buffersdistget_world_sizerangeall_gather_objectAssertionErrorzip)rx   ry   rz   
param_nameparamnamed_module_statesbuffer_namebuffer
world_size_olistrank0_statesstatep1p2s                  rO   _assert_module_statesr      s3    "'!7!7!9!9J 
\\^'')*!9   #(#6#6#8#8K 
mmo))+,#8  $$]3J ,-,aT,E-5]K8LCDDqr= !@AA #L EGQWab !F 
 .s   +D+D!*D'c                  6    [         R                  " [        5      $ N)rs   r]   DEVICE_TYPErF   rN   rO   get_devtyper      s    <<$$rN   zero_buffersc                    U(       a  [         R                  " U 5      O	[        5       nU   U R                  5        H1  n[        R
                  " 5          UR                  5         SSS5        M3     U(       aE  U R                  5        H1  n[        R
                  " 5          UR                  5         SSS5        M3     SSS5        g! , (       d  f       M  = f! , (       d  f       M_  = f! , (       d  f       g= f)zBZeros the parameters and optionally buffers of ``model`` in place.N)FSDPsummon_full_paramsr   
parametersrs   no_gradzero_buffers)rx   r   summon_fullctxr   r   s         rO   _zero_modelr      s     -8$
!
!%
([]C	%%'E ! ( --/]]_LLN %_ * 
  %_ 
s;   )C)C$;C)C0C)
CC)
C&!C))
C7c                     U(       d  U R                  [        5      n U(       a  U R                  5         U R                  5       $ r   )tor   half
state_dict)rx   cpu_offloadr   s      rO   _get_state_dictr      s.    %

rN   c           	      p    SR                  U Vs/ s H  o"b  U [        U5         OSPM     sn5      $ s  snf )Nr   none)joinstr)test_name_mappingrk   ss      rO   subtest_namer      s9    88IMNAm	3q6	"	?N Ns   3c                    UR                  5        H=  u  p#UR                  [        R                  " S5      :w  d  M+  UR                  5       X'   M?     U S:X  a  UOS /n[        R
                  " U5        [        [        [        [        R                  4   US   5      nU H  nX   R                  [        5      X'   M     U$ )Nr@   r   )itemsr]   rs   r@   r   broadcast_object_listr   dictr   rt   r   r   )rankr   r   r   r   s        rO   _broadcast_state_dictr      s     (--/
<<5<<..%*YY[J" 0  19Z$/Eu%d3,-uQx8J 
!+!7!:!:;!G
 !rN   recursec                     [         R                  " XS9   [        [        U R	                  5       5      5      sSSS5        $ ! , (       d  f       g= f)a?  
Returns the full unsharded parameters of ``model``. Any FSDP-managed
parameters offloaded to CPU are moved to GPU in the returned list.

Args:
    recurse (bool): If ``False``, only unshards the parameters immediate to
        ``model``; if ``True``, recurses through the module hierarchy
        rooted at ``model``.
)r   N)r   r   r   listr   )rx   r   s     rO   get_full_paramsr      s4     
	 	 	8U--/01 
9	8	8s   "A
Amove_to_devicec                 >    U(       a  U R                  [        5      $ U $ r   )r   r   )rx   r   s     rO   _move_to_devicer      s    $2588K ==rN   	wrap_fsdpc                 2    U(       d  U $ [        U /UQ70 UD6$ r   r   )rx   r   rk   rl   s       rO   _maybe_wrap_fsdpr      s    !5CtE'CD'CF'CCrN   c                   H    \ rS rSrS\S\4S jrS\4S jrS\4S jrS rS	r	g
)DummyProcessGroup   r   sizec                     Xl         X l        g r   _rank_size)r\   r   r   s      rO   __init__DummyProcessGroup.__init__   s    

rN   rY   c                     U R                   $ r   )r   r\   s    rO   r   DummyProcessGroup.rank       zzrN   c                     U R                   $ r   )r   r   s    rO   r   DummyProcessGroup.size   r   rN   c                 B    [         R                  " 5       nS nXCl        U$ )Nc                  d    [         R                  R                  5       n U R                  S5        U $ )NrB   )rs   futuresFuture
set_result)futures    rO   
get_future/DummyProcessGroup.allreduce.<locals>.get_future  s'    +0==+?+?+AFa MrN   )r   Mockr   )r\   rk   rl   	dist_waitr   s        rO   	allreduceDummyProcessGroup.allreduce  s     IIK		
  *rN   r   N)
rG   rH   rI   rJ   intr   r   r   r   rM   rF   rN   rO   r   r      s2    S  c c 	rN   r   c                      ^  \ rS rSrS\R
                  S\S\S\4U 4S jjrS r	S r
S	 rS
 r\   SS\R
                  S\S\S\\\\4      S\S\S\\R*                  \4   4S jj5       rS rSrU =r$ )TransformerWithSharedParamsi  r}   device_init_modeadd_bndeterministicc                   > [         TU ]  5         UR                  5       U l        UR                  5       U l        U(       a  [
        R                  " S5        SnSn[        R                  " XV5      U l	        [        R                  " USSSSS9U l        [        R                  " Xe5      U l        U R                  R                  U R                  l        U R                  SU R                  R                  R!                  U45      5        U R                  S	[
        R"                  " U R$                  [
        R&                  S
95        SU l        U(       a)  [
        R                  R+                  U R(                  5      O[
        R                  R-                  5       U l        U[0        R2                  :X  a  U R5                  [6        5      n U(       a  U R9                  5         g g )Nr         r:      g?)d_modelnum_encoder_layersnum_decoder_layersdim_feedforwarddropout
vocab_biaslong_buffer)dtype)superr   r   r   r   rs   manual_seedrv   	Embeddingembed_tokensTransformertransformerLinearoutput_projweightregister_buffernew_ones
zeros_liker   longbsBatchNorm1dIdentitybnrQ   rS   r   r   eval)r\   r}   r   r   r   d_vocabr   	__class__s          rO   r   $TransformerWithSharedParams.__init__  s]    	JJL	**,a LL:>>  
 99W6 #'"3"3":":$++22;;WJG	
 	T__EJJ?	

 39%((&&tww/uxx?P?P?R~;;;77;'DIIK rN   c                 $   [         R                  " SU R                  -   5        [         R                  " SUS9R	                  SU R
                  5      n[         R                  " U R
                  S-  US9R	                  SU R
                  5      nX#4$ )NrB      r]      r9   )rs   r   r   arangeviewr   )r\   r]   srctgts       rO   r^   %TransformerWithSharedParams.get_input8  sj    !dii-(ll2f-221dgg>ll477Q;v6;;AtwwGzrN   c                     U R                  U5      nX0R                  -   U R                  R                  U5      -   nU R                  U5      nU R	                  U5      nU R                  X45      nU R                  U5      $ r   )r   r   r   type_asr  r   r   )r\   src_idstgt_idsr  r  xs         rO   forward#TransformerWithSharedParams.forward>  sr    (OO#d&6&6&>&>s&CC(ggclS&""rN   c                     Uu  p4[         R                  R                  UR                  SUR	                  S5      5      UR                  S5      SS9$ )Nsum)	reduction)rv   
functionalcross_entropyr  r   )r\   rb   rc   r   r  s        rO   rd   $TransformerWithSharedParams.get_lossF  sG    }}**KKFKKO,chhrle + 
 	
rN   c                 $    UR                  5         g r   backwardrg   s     rO   ri   (TransformerWithSharedParams.run_backwardL      rN   fsdp_init_modefsdp_kwargsrY   c                 x   Uc  0 nU[         R                  :X  a)  [        U [        5      (       a  U S   nOU n[	        XbXT5      $ U[         R
                  :X  a  SU;  a  [        [        [        15      nOUR                  S5      nSU;   a?  US   [        R                  [        R                  1;   a  [        U [        5      (       d  SnOU n[        U [        5      (       a  U S   n	OU n	[	        XXT5      n
[        U
U4SU0UD6nU[        R                  :X  a  UR!                  ["        5      nU$ [%        SU 35      e)a  
Initializes a :class:`TransformerWithSharedParams` instance.

Args:
    fsdp_init_mode (FSDPInitMode): If ``NO_FSDP``, then does not wrap
        any modules with FSDP. If ``RECURSIVE``, then wraps with
        top-level FSDP. By default, the top-level FSDP uses the
        ``ModuleWrapPolicy`` for encoder and decoder layers, but a
        different auto wrap policy may be specified via
        ``fsdp_kwargs``.
    device_init_mode (DEVICEInitMode): Determines model movement to DEVICE.
    fsdp_kwargs (Optional[Dict[str, Any]]): Optional keyword arguments
        forwarded to the FSDP constructor.
    deterministic (bool): Whether to make the model deterministic
        across constructions.
    add_bn (bool): Whether to include batch norm in the model.
Nr   auto_wrap_policysharding_strategyUnsupported FSDP init mode: )rD   rK   
isinstancerr   r   rL   r    r*   r)   popr   HYBRID_SHARD_HYBRID_SHARD_ZERO2r   rQ   rT   r   r   
ValueError)r}   r"  r   r#  r   r   pgr%  fsdp_pg
tformer_pgm
fsdp_models               rO   rn    TransformerWithSharedParams.initO  sW   6 K\111%''1X.f  |555!4#3//$  $/??3E#F  ${2 34$113C3W3WXY"5%00%''"1X
"
+fA  "2 	J  >#>#>>']];7
77GHIIrN   c                     U R                   /$ r   )r   r   s    rO   get_ignored_modules/TransformerWithSharedParams.get_ignored_modules  s      !!rN   )r  r   r   r   r   r   r   )NFT)rG   rH   rI   rJ   r   ProcessGrouprQ   boolr   r^   r  rd   ri   ru   rD   r   r   r   r   r   rv   rw   r   rn   r4  rM   __classcell__r  s   @rO   r   r     s    (  ( )( 	(
 (T#
 
 15#KJ  KJ$KJ )KJ d38n-	KJ
 KJ KJ 
ryy$	KJ KJZ" "rN   r   c                      ^  \ rS rSrS\R
                  S\S\S\4U 4S jjrS r	S r
S	 rS
 r\  SS\R
                  S\S\S\\\\4      S\S\R(                  4S jj5       rSrU =r$ )NestedWrappedModulei  r}   r   r   r   c                   >^^^ [         TU ]  5         TR                  5       U l        TR                  5       U l        U[
        R                  :H  nUUU4S jnU(       a  [        R                  " S5        [        R                  " [        [        R                  " SS5      U5      U" [        R                  " U" [        [        R                  " SS5      U5      5      [        [        R                  " SS5      U5      5      5      U" [        [        R                  " SS5      U5      5      [        [        R                  " SS5      U5      5      U l        g )Nc                 0   > T(       a  [        U T40 TD6$ U $ r   r   layerr#  r}   r   s    rO   _maybe_wrap1NestedWrappedModule.__init__.<locals>._maybe_wrap      E58K88LrN   r   r   r9   r   )r   r   r   r   r   rQ   rS   rs   r   rv   
Sequentialr   r   module	r\   r}   r   r   r   r#  r   r@  r  s	    ``  `  rO   r   NestedWrappedModule.__init__  s     	JJL	**,)^-I-II	
 a mmBIIaO^<		!R0@. QR#BIIb"$5~F 		"a(8.IJBIIaO^<

rN   c                 v    [         R                  " SU R                  -   5        [         R                  " SSUS94$ )NrB   r9   r   r  )rs   r   r   randr[   s     rO   r^   NestedWrappedModule.get_input  s.    !dii-(

1a/11rN   c                 $    U R                  U5      $ r   rD  r\   r  s     rO   r  NestedWrappedModule.forward      {{1~rN   c                 &    UR                  5       nU$ r   )r  r\   rb   rc   rh   s       rO   rd   NestedWrappedModule.get_loss  s    zz|rN   c                 $    UR                  5         g r   r  rg   s     rO   ri    NestedWrappedModule.run_backward  r!  rN   r"  r#  rY   c                    Uc  0 nU[         R                  :X  a  [        U SUUS9$ U[         R                  :X  a;  [        U 4SUUS.UD6nU[        R
                  :X  a  UR                  [        5      nU$ [        SU 35      e)a  
Initializes a :class:`NestedWrappedModule` instance.

Args:
    fsdp_init_mode (FSDPInitMode): If ``NO_FSDP``, then does not wrap
        any modules with FSDP. If ``RECURSIVE``, then wraps some nested
        modules with FSDP but not the top-level module. The model may
        later be wrapped with a top-level FSDP external to this method
        if desired.
    device_init_mode (DEVICEInitMode): Determines model movement to DEVICE.
    fsdp_kwargs (Optional[Dict[str, Any]]): Optional keyword arguments
        forwarded to the FSDP constructor.
    deterministic (bool): Whether to make the model deterministic
        across constructions.
Fr   r   r   Tr'  )	rD   rK   r;  rL   rQ   rT   r   r   r,  )r}   r"  r   r#  r   r1  s         rO   rn   NestedWrappedModule.init  s    . K\111&!1+	  |555,!1+	
 J  >#>#>>']];7
77GHIIrN   rD  r   r   NF)rG   rH   rI   rJ   r   r6  r7  rQ   r   r^   r  rd   ri   ru   rD   r   r   r   r   rv   rw   rn   rM   r8  r9  s   @rO   r;  r;    s    
  
 
 )	

 
@2 
 15#+J  +J$+J )+J d38n-	+J
 +J 
+J +JrN   r;  c                   v   ^  \ rS rSr\  S	S\R                  S\S\S\	\
\\4      S\4
U 4S jjj5       rSrU =r$ )
AlwaysWrapNestedWrappedModulei  r}   r"  r   r#  r   c                 :  > [         [        [        ]   U [        R                  UUUS9nU[        R                  :X  a  U$ U[        R
                  :X  aH  U=(       d    0 n[        U4S[        0UD6nU[        R                  :X  a  UR                  [        5      nU$ g)z
Initializes a :class:`NestedWrappedModule` instance, but unlike
:meth:`NestedWrappedModule.init`, for the ``RECURSIVE`` init mode, this
wraps with top-level FSDP and the ``always_wrap_policy()`` auto wrap
policy.
)r}   r"  r   r#  r   r%  N)r   rZ  rn   rD   rK   rL   r   r   rQ   rT   r   r   )r}   r"  r   r#  r   rx   r1  r  s          rO   rn   "AlwaysWrapNestedWrappedModule.init   s     )+H
'//-#'  
 	 \111L|555%+KeX6HXKXJ>#>#>>']];7
 6rN   rF   rX  )rG   rH   rI   rJ   ru   r   r6  rD   rQ   r   r   r   r   r7  rn   rM   r8  r9  s   @rO   rZ  rZ    s^    
 15#  $ ) d38n-	
  rN   rZ  c                      ^  \ rS rSrS\R
                  S\S\S\4U 4S jjr\	SS j5       r
\	  SS\R
                  S\S\S	\\\\4      S\4
S
 jj5       rSrU =r$ )NonUniformReqGradNWMi!  r}   r   r   r   c                   >^^^ [         [        U ]  5         TR                  5       U l        TR	                  5       U l        U[        R                  :H  nUUU4S jnU(       a  [        R                  " S5        [        R                  " [        [        R                  " SS5      U5      U" [        R                  " U" [        [        R                  " SS5      U5      5      [        [        R                  " SS5      U5      5      5      U" [        R                  " [        [        R                  " SS5      U5      [        [        R                  " SS5      U5      5      5      5      U l        g )Nc                 0   > T(       a  [        U T40 TD6$ U $ r   r   r>  s    rO   r@  2NonUniformReqGradNWM.__init__.<locals>._maybe_wrap5  rB  rN   r   r   r9   r   )r   r;  r   r   r   r   rQ   rS   rs   r   rv   rC  r   r   rD  rE  s	    ``  `  rO   r   NonUniformReqGradNWM.__init__"  s     	!413 JJL	**,)^-I-II	
 a mmBIIaO^<		!R0@. QR#BIIb"$5~F #BIIb!$4nE#BIIaO^D
rN   c                     U R                  5        H3  u  p#[        R                  " X5      (       a  M"  UR                  S5        M5     g rX  )r   rematchrequires_grad_)rx   req_grad_masknps       rO   _set_nonuniform_req_grad-NonUniformReqGradNWM._set_nonuniform_req_gradL  s4    **,DA88M--  ' -rN   r"  r#  c                    [         R                  " S5      nU[        R                  :X  a#  [	        U SUUS9n[        R                  Xe5        U$ U[        R                  :X  aU  Uc  0 n[	        U 4SUUS.UD6nU[        R                  :X  a  UR                  [        5      n[        R                  Xu5        U$ [        SU 35      e)a  
Initializes a :class:`NestedWrappedModule` instance, but unlike
:meth:`NestedWrappedModule.init`, it wraps a second :class:`torch.nn.Sequential`
container to enable the desired non-uniform ``requires_grad``
``use_orig_params=True`` tests. For both ``RECURSIVE`` and ``NO_FSDP``
init modes, freezes all parameters except the last two to validate
``ShardedGradScaler`` support for ranks with no (non-zero sized) local shards in
FSDP ``use_orig_params=True`` mode.
zmodule\.2.*\.1.*FrU  Tr'  )rd  compilerD   rK   r^  rj  rL   rQ   rT   r   r   r,  )r}   r"  r   r#  r   req_grad_pattern	ddp_modelr1  s           rO   rn   NonUniformReqGradNWM.initR  s    ( ::&9:\111,!1+	I !99)V|555" -!1+	
 J  >#>#>>']];7
 99*W77GHIIrN   rW  rp   rX  )rG   rH   rI   rJ   r   r6  r7  rQ   r   ru   rj  rD   r   r   r   r   rn   rM   r8  r9  s   @rO   r^  r^  !  s    (
  (
 (
 )	(

 (
T ( (
 
 15#+J  +J$+J )+J d38n-	+J
 +J +JrN   r^  c                      ^  \ rS rSrSrS\R                  S\S\4U 4S jjrS r	S r
S	 rS
 r\S\\   S\S\S\S\4
S j5       rSrU =r$ )ModuleWithDelayi  zThis class wraps a :class:`FSDPTestModel` to optionally add a delay
after computing the loss and/or before the gradient reduction.rD  delay_after_loss_msdelay_before_reduction_msc                 F   > [         TU ]  5         X l        X0l        Xl        g r   )r   r   rs  rt  rD  )r\   rD  rs  rt  r  s       rO   r   ModuleWithDelay.__init__  s!     	#6 )B&rN   c                 8    U R                   R                  U5      $ r   )rD  r^   r[   s     rO   r^   ModuleWithDelay.get_input  s    {{$$V,,rN   c                 $    U R                  U5      $ r   rK  rL  s     rO   r  ModuleWithDelay.forward  rN  rN   c                 b   U R                   R                  X5      nU R                  S:  a  [        (       d  [        (       a%  [
        R                  " U R                  S-  5        U$ [        (       a=  [        R                  R                  [        U R                  [        5       -  5      5        U$ Nr     )rD  rd   rs  r5   r7   timesleepr4   rs   r;   _sleepr   r2   rP  s       rO   rd   ModuleWithDelay.get_loss  s}    {{##E2##a'x88

433d:;  

!!#d&>&>ARAT&T"UVrN   c                    ^ ^ [         R                  R                  mUU 4S jn[        R                  " SU5         T R
                  R                  U5        S S S 5        g ! , (       d  f       g = f)Nc                  8  > TR                   S:  a  [        (       a>  [        R                  R	                  [        TR                   [        5       -  5      5        O9[        (       d  [        (       a#  [        R                  " TR                   S-  5        T" U 0 UD6$ r|  )rt  r4   rs   r;   r  r   r2   r5   r7   r~  r  )rk   rl   orig_reduce_scatterr\   s     rO   _delayed_reduce_scatter=ModuleWithDelay.run_backward.<locals>._delayed_reduce_scatter  sq    --19JJ%%D::=N=PPQ XJJt==DE&777rN   z'torch.distributed.reduce_scatter_tensor)rs   distributedreduce_scatter_tensorr   patchrD  ri   )r\   rh   r  r  s   `  @rO   ri   ModuleWithDelay.run_backward  sR    #//EE	8 ZZ57N
 KK$$T*
 
 
s   A  
A.module_class
model_argsmodel_kwargsc                <    [        U R                  " U0 UD6UU5      $ )a  
Args:
    module_class (Type[FSDPTestModel]): Wrapped module class to which
        to add delays.
    model_args: Positional arguments forwarded to the ``module_class``
        ``init()``.
    delay_after_loss_ms (int): Delay after computing the loss/before
        the optimizer step (in ms).
    delay_before_reduction_ms (int): Delay before reduce-scattering
        gradients (in ms).
    model_kwargs: Keyword arguments forwarded to the ``module_class``
        ``init()``.
)rr  rn   )r  rs  rt  r  r  s        rO   rn   ModuleWithDelay.init  s*    * z:\:%
 	
rN   )rs  rt  rD  )rG   rH   rI   rJ   rq   rv   rw   r   r   r^   r  rd   ri   ru   typerW   r   rn   rM   r8  r9  s   @rO   rr  rr    s    F				 !	 $'		-+$ 
=)

 !
 $'	

 
 
rN   rr  c                       \ rS rSr\\R                  SSSS4S\R                  S\	S\S\
\\\4      S	\S
\S\4S jj5       rSrg)NestedWrappedModuleWithDelayi  NFr   r}   r"  r   r#  r   rs  rt  c                 >    [         R                  [        U UUUUUUS9$ )Nr}   r"  r   r#  r   rs  rt  )rr  rn   r;  r  s          rO   rn   !NestedWrappedModuleWithDelay.init  s4     ##)-#' 3&? $ 	
 		
rN   rF   )rG   rH   rI   rJ   ru   rQ   rT   r   r6  rD   r   r   r   r   r7  r   rn   rM   rF   rN   rO   r  r    s     ,:+F+F04##$)*
  
$
 )
 d38n-	

 
 !
 $'
 
rN   r  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )DummyDDPi  c                 .   > [         TU ]  5         Xl        g r   )r   r   rD  )r\   rD  r  s     rO   r   DummyDDP.__init__  s    rN   c                 &    U R                   " U0 UD6$ r   rK  r\   rk   rl   s      rO   r  DummyDDP.forward  s    {{D+F++rN   rK  rG   rH   rI   rJ   r   r  rM   r8  r9  s   @rO   r  r    s    , ,rN   r  c                      ^  \ rS rSrS\R
                  S\S\S\S\4
U 4S jjr	S r
S	 r\   SS\R
                  S
\S\S\\\\4      S\S\4S jj5       rSrU =r$ )MixtureOfExpertsi  r}   r   r   delay_before_free_msr   c                   > [         TU ]  UUUUS9  Xl        X@l        X l        U[
        R                  :H  U l        U(       a#  [        R                  " SU R                  -   5        SnSnSn	[        [        R                  " Xx5      U R                  5      n
[        S U
R                  5        5       5      U l        U
R                  5        H
  nSUl        M     U(       a  [        R                  " S5        [        [        R                  " X5      U R                  5      nU(       aF  [        R$                  R'                  UR                  5       /5      n[)        X40 UD6n
[)        X40 UD6n[        R*                  " [        [        R                  " X5      U R                  5      UU
[        [        R                  " X5      U R                  5      5      U l        g )	N)r}   r   r   r   *   r   r  r   c              3   @   #    U  H  oR                  5       v   M     g 7fr   )numel).0ri  s     rO   	<genexpr>,MixtureOfExperts.__init__.<locals>.<genexpr>
  s     $L8K1WWYY8K   Tr   )r   r   r}   r  r   rQ   rS   r   rs   r   r   r   rv   r   r  r   num_expert_paramsexpertr  	new_groupr   rC  rD  )r\   r}   r   r   r  r   r#  d_expertd_sharedd_inputr  ri  sharedexpert_groupr  s                 rO   r   MixtureOfExperts.__init__  s{    	-'	 	 	
 
$8!"..2N2NNb499n- 8!>@S@ST!$$L8I8I8K$L!L""$AAH % a  8!>@S@ST ,,66L &>+>F&7;7FmmBIIg8$:M:MNBIIh8$:M:MN	
rN   c                   ^ ^ T R                   S:  a  T R                  S   n[        U[        5      (       ag  [        R
                  R                  R                  R                  mUU 4S jn[        R                  " SU5         T R                  U5      sS S S 5        $ T R                  U5      $ ! , (       d  f       N= f)Nr   r:   c                    > [         (       a>  [        R                  R                  [	        TR
                  [        5       -  5      5        O9[        (       d  [        (       a#  [        R                  " TR
                  S-  5        T" U 0 UD6$ )Nr}  )r4   rs   r;   r  r   r  r2   r5   r7   r~  r  )rk   rl   orig_reshardr\   s     rO   _delayed_reshard2MixtureOfExperts.forward.<locals>._delayed_reshard)  sc     y

)) 9 9<M<O OP "XX

4#<#<t#CD'888rN   z.torch.distributed.fsdp._runtime_utils._reshard)r  rD  r(  r   rs   r  fsdp_runtime_utils_reshardr   r  )r\   r  r  r  r  s   `   @rO   r  MixtureOfExperts.forward#  s    $$q([[^F&$''$0055DDMM9 ZZDFV  ;;q> 
 {{1~ s   B//
B=c                    UR                  5         U R                  (       d  [        R                  " 5          U R	                  5        H|  n[        US5      (       a  M  UR                  c  M%  UR                  R                  U R                  5        [        R                  R                  UR                  U R                  S9  M~     S S S 5        g g ! , (       d  f       g = f)Nr  r|   )r  r   rs   r   r   hasattrgraddiv_r   r  
all_reducer}   )r\   rh   ri  s      rO   ri   MixtureOfExperts.run_backward;  s    ~~*Aq(++ vv)DOO4))44QVV4::4N + !  s   2C-AC
C r"  r#  c                 
   Uc  0 nU[         R                  :X  a  [        U SUUUS9$ U[         R                  :X  a<  [        U 4SUUUS.UD6nU[        R
                  :X  a  UR                  [        5      nU$ [        SU 35      e)a  
Initializes a :class:`MixtureOfExperts` instance.

Args:
    fsdp_init_mode (FSDPInitMode): If ``NO_FSDP``, then does not wrap
        any modules with FSDP. If ``RECURSIVE``, then wraps some nested
        modules with FSDP, including the expert and shared layers, but
        not the top-level module. The model may later be wrapped with a
        top-level FSDP external to this method if desired.
    device_init_mode (DEVICEInitMode): Determines model movement to DEVICE.
    fsdp_kwargs (Optional[Dict[str, Any]]): Optional keyword arguments
        forwarded to the FSDP constructor.
    deterministic (bool): Whether to make the model deterministic
        across constructions.
    delay_before_free_ms (int): Delay before resharding expert
        parameters in the forward pass (in ms).
F)r   r   r  r   Tr'  )	rD   rK   r  rL   rQ   rT   r   r   r,  )r}   r"  r   r#  r   r  r1  s          rO   rn   MixtureOfExperts.initG  s    4 K\111#!1%9+  |555)!1%9+ J  >#>#>>']];7
77GHIIrN   )r  r}   rD  r   r  r   )NFr   )rG   rH   rI   rJ   r   r6  r7  rQ   r   r   r  ri   ru   rD   r   r   r   r   rn   rM   r8  r9  s   @rO   r  r    s    2
  2
 2
 )	2

 "2
 2
h0
O 
 15#$%0J  0J$0J )0J d38n-	0J
 0J "0J 0JrN   r  c                      ^  \ rS rSr SSSSS.S\S\\R                     S\S	\S
\4
U 4S jjjjr	S\R                  S\R                  4S jrS rSrU =r$ )MLPi{  TFr9   )biaswith_bufferdim_multiplierdimr]   r  r  r  c                   > [         TU ]  5         [        R                  " XU-  X#S9U l        [        R                  " XQ-  XUS9U l        U(       a'  U R                  S[        R                  " U4US95        g S U l	        g )N)r]   r  r   r  )
r   r   rv   r   in_projout_projr   rs   randnr   )r\   r  r]   r  r  r  r  s         rO   r   MLP.__init__|  sf     	yys&:6U		."6QUV  5;;vf+MNDKrN   r  rY   c                     U R                  U5      n[        R                  " U5      nU R                  U5      n[        R                  " U5      nU R                  b  X R                  -   nU$ r   )r  Frelur  r   )r\   r  zs      rO   r  MLP.forward  sQ    LLOFF1IMM!FF1I;;"KKArN   c                     U R                   b4  [        R                  R                  R	                  U R                   5        g g r   )r   rs   rv   rn   normal_r   s    rO   reset_parametersMLP.reset_parameters  s+    ;;"HHMM!!$++. #rN   )r   r  r  r   )rG   rH   rI   rJ   r   r   rs   r]   r7  r   rt   r  r  rM   r8  r9  s   @rO   r  r  {  s     *.
 ! &
    " %,, / /rN   r  c                   V   ^  \ rS rSrSS.S\S\4U 4S jjjrS\S\S	\S
S 4S jrSr	U =r
$ )MLPStacki  F)with_seq_parallelmlp_dimr  c                   > [        USS9[        U5      [        USS9/nU(       a$  UR                  [        R                  " USS95        [        TU ]  " U6   X l        g )N   )r  Fr  )r  appendrv   	LayerNormr   r   r  )r\   r  r  modulesr  s       rO   r   MLPStack.__init__  sV     *L*	$
 NN2<<e<='"!2rN   tp_meshdp_meshuse_activation_checkpointingrY   c           
         [        SS9[        SS9[        SS9[        SS9[        SS9U R                  (       a  [        [        S5      S9O	[        5       S.nU R                  (       a  [	        SS9US'   [        XUS9  U  HD  n[        U[        R                  5      (       a  M$  U(       a  [        U5        [        U4S	U0UD6  MF     [        U 4S	U0UD6  U $ )
NF)use_local_outputrB   )output_layouts)z	0.in_projz
0.out_projz	1.in_projz
1.out_projz	2.in_projz
2.out_proj)sequence_dim3)device_meshparallelize_planmesh)r%   r'   r  r$   r(   r&   r(  rv   r  r   r   )r\   r  r  r  r#  r  rD  s          rO   parallelizeMLPStack.parallelize  s     )%@)5A(%@)5A(%@%% *qB "
 !!$4!$DS!4GWXF&",,//+6"<W<<  	D6w6+6rN   )rG   rH   rI   rJ   r   r7  r   r   r  rM   r8  r9  s   @rO   r  r    sN    BG 
3 
34 
3 
3  '+	 
 rN   r  c                      ^  \ rS rSrSrS
S\S\4U 4S jjjrS\R                  S\
\\R                  \R                  4   \R                  4   4S jrS	rU =r$ )DoubleLineari  z
This can be used for returning multiple outputs from a module
(``use_second_linear=True``) or for having an unused module (``False``).
r  use_second_linearc                    > [         TU ]  5         [        R                  " X5      U l        [        R                  " X5      U l        [        R                  " 5       U l        X l        g r   )	r   r   rv   r   lin1lin2ReLUr  r  )r\   r  r  r  s      rO   r   DoubleLinear.__init__  sA    IIc'	IIc'	GGI	!2rN   r  rY   c                     U R                   (       a@  U R                  U R                  U5      5      U R                  U R                  U5      5      4$ U R                  U R                  U5      5      $ r   )r  r  r  r  rL  s     rO   r  DoubleLinear.forward  sQ     !!99TYYq\*DIIdiil,CCCyy1&&rN   )r  r  r  r  T)rG   rH   rI   rJ   rq   r   r7  r   rs   rt   r   rr   r  rM   r8  r9  s   @rO   r  r    s^    
3C 3D 3 3''	uU\\5<</0%,,>	?' 'rN   r  new_all_gather_into_tensorc              #      #    [         R                  n[         R                  " 5         U [         l         S v   [         R                  " 5         U[         l        g ! [         R                  " 5         U[         l        f = f7fr   )r   all_gather_into_tensorbarrier)r  orig_all_gathers     rO   patch_all_gatherr    sN     11OLLN"<D6&5# 	&5#   1A>A !A>"A;;A>new_foreach_all_gatherc              #   @  #    [         R                  R                  R                  R                  R
                  n[        R                  " 5         U [         R                  R                  R                  R                  l         S v   [        R                  " 5         U[         R                  R                  R                  R                  l        g ! [        R                  " 5         U[         R                  R                  R                  R                  l        f = f7fr   )rs   r  r  _fully_shard_fsdp_param_groupforeach_all_gatherr   r  )r  orig_foreach_all_gathers     rO   patch_foreach_all_gatherr    s      	++==PP  	LLN 
''99L
# 	++==P 	# 	++==P    BDC A	DA
DDnew_foreach_reducec              #   @  #    [         R                  R                  R                  R                  R
                  n[        R                  " 5         U [         R                  R                  R                  R                  l         S v   [        R                  " 5         U[         R                  R                  R                  R                  l        g ! [        R                  " 5         U[         R                  R                  R                  R                  l        f = f7fr   )rs   r  r  r  r  foreach_reducer   r  )r	  orig_foreach_foreach_reduces     rO   patch_foreach_reducer    s      	++==LL   	LLN 
''99H
' 	++==L 	' 	++==Lr  new_reduce_scatter_tensorc              #      #    [         R                  n[         R                  " 5         U [         l         S v   [         R                  " 5         U[         l        g ! [         R                  " 5         U[         l        f = f7fr   )r   r  r  )r  r  s     rO   patch_reduce_scatterr    sO     44LLN!:D9%8" 	%8"r   new_all_reducec              #      #    [         R                  n[         R                  " 5         U [         l         S v   [         R                  " 5         U[         l        g ! [         R                  " 5         U[         l        f = f7fr   )r   r  r  )r  orig_all_reduces     rO   patch_all_reducer    sI     ooOLLN$DO*) 	)r   new_unshardc              #      #    [         R                  n[        R                  " 5         U [         l         S v   [        R                  " 5         U[         l        g ! [        R                  " 5         U[         l        f = f7fr   )r   unshardr   r  )r  orig_unshards     rO   patch_unshardr  +  P      "))LLLN(N.!- 	!-r   new_reshardc              #      #    [         R                  n[        R                  " 5         U [         l         S v   [        R                  " 5         U[         l        g ! [        R                  " 5         U[         l        f = f7fr   )r   reshardr   r  )r  r  s     rO   patch_reshardr  8  r  r   new_post_backwardc              #      #    [         R                  n[        R                  " 5         U [         l         S v   [        R                  " 5         U[         l        g ! [        R                  " 5         U[         l        f = f7fr   )r   post_backwardr   r  )r  orig_post_backwards     rO   patch_post_backwardr#  E  sQ      (55LLN#4N :'9$ 	'9$r   new_backwardc              #      #    [         R                  n[        R                  " 5         U [         l         S v   [        R                  " 5         U[         l        g ! [        R                  " 5         U[         l        f = f7fr   )r   r  r   r  )r$  orig_backwards     rO   *patch_register_post_backward_hook_backwardr'  R  sS      199MLLN,8 )>0=$- 	0=$-r   r  rk   rl   c                     [        U5      S:  a  US   nOSU;   a  US   nO[        SU SU 35      eU" U5        U" U0 UD6$ )Nr   rc   z,Cannot get reduce-scatter output from
args: z	
kwargs: )lenr   )clsr  rz   rk   rl   rc   s         rO   reduce_scatter_with_assertr+  _  sa     4y1}a	V	!;D6F8T
 	
 f///rN   replicated_modulesharded_moduleprefixes_to_ignore.c                    [        UR                  5       UR                  5       SS9 GH  u  u  pEu  pgUnU H  n	UR                  U	S5      nM     U R                  XH5        U R	                  U[
        5        [        U[
        5      (       d  [        S5      eUR                  UR                  p[        U5      [        S5      [        S5      4:X  a  [        S5      e[        XZU5      nU R                  UR                  5       UR                  5       5        UR                  c  U R                  UR                  5        GM  U R!                  UR                  5        [        UR                  X5      nU R	                  UR                  [
        5        [        UR                  [
        5      (       d  [        S5      eU R                  UR                  R                  5       UR                  5       5        GM     g )NTr~    z&Expected sharded_param to be a DTensorr   zmFSDP's (Shard(0), Shard(0)) layout differs from distribute_tensor(), so we cannot check for equality using itz+Expected sharded_param.grad to be a DTensor)r   r   replaceassertEqualassertIsInstancer#   r(  r   r  
placementsrr   r$   r"   to_localr  assertIsNoneassertIsNotNone)r*  r,  r-  r.  replicated_namereplicated_paramsharded_namesharded_paramclean_sharded_nameprefixr  r4  sharded_ref_paramsharded_ref_grads                 rO   check_sharded_parityr@  r  s    OR**,'')OJ+-Jl
 *(F!3!;!;FB!G )<]G4-11 !IJJ(44m6N6Njq58 44 ;  ..>jQ..02C2L2L2NO  (]//0M../,-=-B-BDU]//9-,,g66 !NOO**3357G7P7P7RS9OrN   znot-support-multithreadc                   J   ^  \ rS rSr\S 5       rU 4S jrS rS rS r	Sr
U =r$ )FSDPTestMultiThreadi  c                     [         $ r   DEVICE_COUNTr   s    rO   r   FSDPTestMultiThread.world_size      rN   c                 B   > [         TU ]  5         U R                  5         g r   )r   setUp_spawn_threadsr\   r  s    rO   rI  FSDPTestMultiThread.setUp  s    rN   c                      [        U /UQ70 UD6$ r   r/   r  s      rO   r/    FSDPTestMultiThread.run_subtests      D242622rN   c                 @    [         R                  R                  5         g r   rs   _dynamoresetr   s    rO   perThreadSetUp"FSDPTestMultiThread.perThreadSetUp      rN   c                 @    [         R                  R                  5         g r   rR  r   s    rO   perThreadTearDown%FSDPTestMultiThread.perThreadTearDown  rW  rN   rF   )rG   rH   rI   rJ   propertyr   rI  r/   rU  rY  rM   r8  r9  s   @rO   rB  rB    s.     3 rN   rB  c            $       d   \ rS rSrSrS rS rS rS r\	S 5       r
       S$S
\R                  S\S\S\S\\   S\S\\   S\S\S\\\\4      4S jjrSSS\" 5       SSSS	S	S	S	SS4S\\   S\S\S\\   S\S\S\S\\   S\\   S\\   S\S \S\S\S!\\\\4      S\\\\4      4 S" jjrS#rg)%FSDPTestMixini  z
Mixin class containing shared test utilities for FSDP tests.
Provides common helper methods for both FSDPTest and FSDPTestContinuous.
c                 :    U R                  X!R                  5        g r   )r2  r   )r\   r1  r   s      rO   _check_cpu_offload FSDPTestMixin._check_cpu_offload  s    &<&<=rN   c                 :    U R                  X!R                  5        g r   )r2  backward_prefetch)r\   r1  rb  s      rO   _check_backward_prefetch&FSDPTestMixin._check_backward_prefetch  s    *,H,HIrN   c                 :    U R                  X!R                  5        g r   )r2  forward_prefetch)r\   r1  rf  s      rO   _check_forward_prefetch%FSDPTestMixin._check_forward_prefetch  s    )+F+FGrN   c                      [        U /UQ70 UD6$ r   rN  r  s      rO   r/   FSDPTestMixin.run_subtests  rP  rN   c                    U " U5      nXl         X6l        UR                  SS5      n[        SUR                    SUR                   35        [
        S:w  a`  [        R                  R                  5       UR                  :  a4  [        R                  " [        SUR                   3   R                  5         U(       a^  [        R                  R                  R                  R                   R#                  5       n[$        R&                  " SUR                  UUS9  OC[$        R&                  " UR(                  [*        [-        UR                  5      UR                   S	9   S n
UR                   [2        -  n[4        (       d  [6        (       a  [        R                  R9                  U5        U/n
[$        R:                  " U
S9  [        R<                  R?                  5         [A        5         URC                  X$5        [        R<                  R?                  5         [$        R:                  " U
S9  [$        RD                  " 5         g ! [.         a@  n	S
U	R0                  S   ;   a'  [        R                  " [        S   R                  5        e S n	A	ff = f)Nfake_pgFdist init r=, world=r@   
multi-gpu-fakebackendr   r   storeinit_methodrr  r   r   	recompiler   backend_unavailable
device_ids)#r   	file_namegetprintr   r   rs   acceleratordevice_countsysexitr0   	exit_codetesting	_internalr  rl  	FakeStorer   init_process_groupru  DISTRIBUTED_BACKENDr   RuntimeErrorrk   rE  r4   r7   set_device_indexr  rS  rT  r3   run_testdestroy_process_groupr*  r   	test_namerz  piperl   r\   rl  rs  ery  	device_ids               rO   _runFSDPTestMixin._run  s   9~	"**Y.TYYKx/@AB%E$5$5$B$B$Dt$VHHZ*T__,= >?IIJ	//;;CCMMO''"#	 '' $ 0 0/"4??3	 
II,	9..y9[

 	
+i&
+""$1  	affQi'$9:DDE		s    7A$I AI 
J;JJNFrx   	num_stepsautocastlrfsdp_cpu_offload
save_modelmixed_precisionenable_sharded_grad_scaleruse_pure_fp16sharded_grad_scaler_kwargsc           	         U=(       a    UR                   n[        UR                  5       5      R                  nU
c  0 n
[	        S	SU0U
D6n[
        R                  R                  UR                  5       USS9n[        U5       GHt  nUR                  5         [
        R                  R                  [        US9   UR                  R                  [
        R                  " [        5      5      nU	(       d  U(       aW  [        U[         5      (       dB  [        U[
        R"                  5      (       a  UR%                  5       nO['        S U 5       5      nU" U6 nU(       ap  [        U[         5      (       a[  UR(                  [*        ;  aG  UR                  5        H3  nU R-                  UR                  [
        R                  " S5      5        M5     UR                  R/                  UU5      R1                  U5      nS S S 5        UR3                  W5      nU(       d1  U	(       d*  UR4                  [
        R6                  :w  a  [9        S5      eOU	(       a+  U R-                  UR4                  [
        R:                  5        Ot[        U[         5      (       a5  Uc  [9        S5      eU R-                  UR4                  UR<                  5        O*U R-                  UR4                  [
        R6                  5        UR                  R?                  U5        U(       a\  [        U[         5      (       aG  UR                  5        H3  nU R-                  UR                  [
        R                  " S5      5        M5     URA                  U5        URC                  5         U(       d  GM  URE                  5       RG                  5        VVs0 s H  u  nnUURI                  5       _M     nnn[K        U5        URM                  U5        GMw     [        U[         5      (       a  URO                  [P        RR                  5        WRU                  5       $ ! , (       d  f       GNN= fs  snnf )
Nenabledg?)r  momentum)r  c              3   @   #    U  H  oR                  5       v   M     g 7fr   )r   )r  r  s     rO   r  9FSDPTestMixin._train_for_several_steps.<locals>.<genexpr>  s     %>1ffhhr  r@   zQloss data type should be float32, as the original parameter data type is float32.z'Expected mixed_precision to not be NonerF   )+offload_paramsnextr   r]   r   rs   optimSGDr   	zero_gradampr  r   rD  r^   r(  r   rt   r   rr   r&  r   r2  rd   r   scaler   float32r   float16param_dtyperi   stepupdater   r   cloner   load_state_dict_assert_stater   IDLEr   )r\   rx   r  r  r  r  r  r  r  r  r  cpu_offload_paramsmodel_devicesharded_grad_scalerr  r   rb   rc   ri  rh   kvr   s                          rO   _train_for_several_steps&FSDPTestMixin._train_for_several_steps  s@    .Q2B2Q2QE,,./66%-)+&/ 
.
2L

  0 0 2rCHy!AOO##K#B..u||K/HI _Zt=T=T!%66 %

 %%>%> > '"5$// //>? #--/((5<<3FG 0 ||,,UF;>>|L- C. ',,T2D"=::.(:  / !$$TZZ?t,,&.,E  $$TZZ1L1LM$$TZZ?LL%%d+!j&=&=))+A$$QXXu||E/BC ,  $$U+&&(z7<7G7G7I7O7O7QR7Qtq!al7Q
R E"%%j1{ "~ eT"" 2 23{{} CBn Ss   9E Q :Q 
Q	r:   Tmodel_classr"  r   ref_init_fn	num_itersr   rb  r&  rf  use_orig_paramsinit_kwargsc                 @   U[         R                  :X  a  [        S5      eUc  0 nSnU R                  R	                  5       nUR
                  " U R                  [         R                  [        R                  4SS0UD6nUcC  [        (       a  [        U[        /[        S9nO+[        S:X  a  [        U5      nO[        UU/US9nOU" U5      nU(       a  UR                  5       nU R                  UUU
SLUUU
UUUS9	n[        UR                  5       5      nUR                  UUU	U
UUS	.5         UR
                  " U R                  UUU4SS0UD6n['        U[(        5      (       d  [)        UU R                  40 UD6nU(       a  UR                  5       nU[        R*                  :X  a  UR-                  [        5      nUSL=(       a    UR.                  nU=(       a    U[        R*                  :H  nU=(       a    U[        R*                  :g  nU(       aI  [0        R2                  " S5      nUR                  5        H  nU R5                  UR2                  U5        M!     U(       a  U R7                  [8        S[         35      O	[;        5       nU   U R                  UUSUUUU
UUUS9
n SSS5        U(       a  gU(       a^  [0        R2                  " S5      nUR                  5        H  nU R5                  UR2                  U5        M!     W R-                  [        5      n [=        U5      n![0        R>                  RA                  UW SS9  U
c  U(       d  U R5                  UU!SSS9  ggg! [          a   n[#        S
U S[%        U5       35      UeSnAff = f! , (       d  f       N= f)a  
Tests FSDP training against a reference, which defaults to DDP but
may be customized with ``ref_init_fn``.

Args:
    model_class (Type[FSDPTestModel]): A model class that inherits from
        ``FSDPTestModel``, which defines the expected interface.
    fsdp_init_mode (FSDPInitMode): The mode to initialize the
        FSDP-wrapped model. This should not be ``NO_FSDP``.
    ref_init_fn (Optional[Callable]): A callable to invoke that wraps a
        non-wrapped model to construct the reference model, where this
        wrapper should provide data parallel semantics. If ``None``,
        then the callable defaults to the DDP constructor.
z.Expects an FSDP init mode that wraps with FSDPN{Gz?r   T)ry  output_devicer@   )r  r  r  r  r  r  r  )r   rb  r&  r  rf  r  zInitializing z raised error zOAn FSDP-managed module with parameter CPU offloading enabled has parameters on F)r  r  r  r  r  r  r  r  )check_dtypezFSDP did not match DDP)exact_devicemsg)!rD   rK   r   ry   r   rn   rQ   rS   r5   DDPr   r   r  r   r   r  	Exceptionr,  r   r(  r   rT   r   r  rs   r]   r2  assertRaisesRegexr  r   r   r  assert_close)"r\   r  r"  r   r  r  r  r   rb  r&  r  rf  r  r  r  r  r  r#  r  r   rx   	ref_modelref_loss
ddp_paramsr1  r  r  expects_device_errorexpects_cpu_device
cpu_devicer   context	fsdp_lossfsdp_unsharded_paramss"                                     rO   _test_fsdp_parityFSDPTestMixin._test_fsdp_parityO  s   D \111 !QRRK!!&&(    ((
 	

 
 x{m;	 %J	4&M	#E*I!(I00$D0(+'A''A 1 

 )..01
*%6%6#2$4#2		

	Y$))"" 	
 # J *d++ j$*<*<LLJ#*J~:::#{3J$D0O[5O5O
 N/>3N3NN 	 N/>3N3NN 	 e,J#..0  z: 1 $ ""%%0M3  	 55!,% /+E++E 6 I    e,J#..0  z: 1![1I /
 ; 	""8YE"J "=%!,	   ,9"K  	Y}[MAxPQWXX	YF Ws$   ."M" N"
N,NN
Nrz  r   )r  NFNFFN) rG   rH   rI   rJ   rq   r_  rc  rg  r/   classmethodr  rv   rw   r   r7  floatr   r   r   r   r   r   r  r  rW   rD   rQ   r   r   r   r  rM   rF   rN   rO   r]  r]    s   
>JH3 4% 4%v 15 48+0#?CYyyY Y 	Y
 Y #:.Y Y ".1Y %)Y Y %-T#s(^$<Y@ +/",,8<8<48!& %+0#04?C#h-(h %h )	h
 h'h h h  h $$45h $$45h ".1h h h %)h h  d38n-!h" %-T#s(^$<#h hrN   r]  c                      ^  \ rS rSrU 4S jr\S 5       r\S 5       r\S\4S j5       r	\S 5       r
\S 5       rS	rU =r$ )
FSDPTesti  c                 h   > [         TU ]  5         S[        R                  S'   U R	                  5         g )N0TORCH_NCCL_DESYNC_DEBUG)r   rI  osenviron_spawn_processesrK  s    rO   rI  FSDPTest.setUp  s)     14

,-rN   c                     [         $ r   rD  r   s    rO   r   FSDPTest.world_size  rG  rN   c                 >    [         R                  R                  5       $ r   )r   distributed_c10d_get_default_groupr   s    rO   ry   FSDPTest.process_group  s    $$7799rN   rY   c                     grX  rF   r   s    rO   destroy_pg_upon_exitFSDPTest.destroy_pg_upon_exit  s     rN   c                 *    [          U R                   3$ r   )r1   rz  r   s    rO   ru  FSDPTest.init_method  s    t~~.//rN   c                    U " U5      nXl         X6l        UR                  SS5      n[        SUR                    SUR                   35        [
        R                  R                  5       UR                  :  a4  [        R                  " [        SUR                   3   R                  5         U(       a^  [
        R                  R                  R                  R                  R!                  5       n["        R$                  " SUR                  UUS9  OC["        R$                  " UR&                  [(        [+        UR                  5      UR                   S9   S n
UR                   [0        -  n[2        (       d  [4        (       a  [
        R                  R7                  U5        U/n
["        R8                  " U
S9  [
        R:                  R=                  5         [?        5         URA                  X$5        [
        R:                  R=                  5         ["        R8                  " U
S9  ["        RB                  " 5         g ! [,         a@  n	S	U	R.                  S
   ;   a'  [        R                  " [        S   R                  5        e S n	A	ff = f)Nrl  Frm  rn  ro  rp  rq  rt  rv  r   rw  rx  )"r   rz  r{  r|  r   rs   r}  r~  r  r  r0   r  r  r  r  rl  r  r   r  ru  r  r   r  rk   rE  r4   r7   r  r  rS  rT  r3   r  r  r  s               rO   r  FSDPTest._run  s   9~	"**Y.TYYKx/@AB))+doo=HHZ*T__,= >?IIJ	//;;CCMMO''"#	 '' $ 0 0/"4??3	 
II,	9..y9[

 	
+i&
+""$1  	affQi'$9:DDE		s    -A$H7 AH7 7
J;I<<Jr  )rG   rH   rI   rJ   rI  r[  r   ry   r7  r  ru  r  r  rM   r8  r9  s   @rO   r  r    ss        : : d   0 0 4% 4%rN   r  c                      ^  \ rS rSr% Sr\r\\S'   \	S\
4S j5       r\	S\
4S j5       r\	U 4S j5       rU 4S jrU 4S	 jr\S
 5       rSrU =r$ )FSDPTestContinuousiL  z
FSDP test base class using MultiProcContinuousTest for faster test execution.
This class reuses worker processes across tests, reducing process spawn overhead.
Use this for tests that don't require fresh process state between tests.
r   rY   c                     [         $ r   )r  r*  s    rO   backend_strFSDPTestContinuous.backend_strU  s    ""rN   c                     [         $ r   )r   r  s    rO   device_typeFSDPTestContinuous.device_typeY  s    rN   c                 `  > S[         R                  S'   [        R                  R	                  5       U:  a*  [
        R                  " [        SU 3   R                  5        U[        -  n[        (       d  [        (       a  [        R                  R                  U5        [        TU ]9  XU5        g )Nr  r  ro  )r  r  rs   r}  r~  r  r  r0   r  rE  r4   r7   r  r   _init_pg)r*  r   r   	rdvz_filer  r  s        rO   r  FSDPTestContinuous._init_pg]  s    
 14

,-))+j8HHZ*ZL 9:DDE<'	9..y995rN   c                    > [         TU ]  5         U R                  U R                  :w  a  [        R
                  " 5         [        R                  R                  5         [        5         g r   )
r   rI  r   MAIN_PROCESS_RANKr   r  rs   rS  rT  r3   rK  s    rO   rI  FSDPTestContinuous.setUpm  s>     99...LLNrN   c                    > U R                   U R                  :w  a  [        R                  " 5         [        TU ]  5         [        R                  R                  5         g r   )	r   r  r   r  r   tearDownrs   rS  rT  rK  s    rO   r  FSDPTestContinuous.tearDownw  s9    99...LLNrN   c                 .    U R                   R                  $ r   )r  r-  r   s    rO   ry    FSDPTestContinuous.process_group~  s    ~~   rN   rF   )rG   rH   rI   rJ   rq   rE  r   r   __annotations__r  r   r  r  r  rI  r  r[  ry   rM   r8  r9  s   @rO   r  r  L  sz     #J"#C # # C   6 6 ! !rN   r  compile_compute_on_modulec                 D   ^ ^^ U 4S jm " S S[         5      mUU4S jnU$ )Nc                     > [         R                  R                  R                  " U 0 UD6  Tb  [	        U S   T5      (       a  U S   R                  5         g g )Nr   )rs   r  r  r   r(  rm  )rk   rl   r   s     rO   !fully_shard_with_compiled_compute=compiled_fsdp_test.<locals>.fully_shard_with_compiled_compute  sS    **D;F;$,
G.1
 1
 GOO1
rN   c                   0    \ rS rSr\" 5       r\" 5       rSrg)*compiled_fsdp_test.<locals>.FullyShardModei  rF   N)rG   rH   rI   rJ   r   EAGERCOMPILED_COMPUTErM   rF   rN   rO   FullyShardModer    s    6rN   r	  c                 6   >^  [        T 5      UUU 4S j5       nU$ )Nc                    > [         R                  R                  R                  nT GH  nUTR                  :w  a&  [        5       (       d  [        R                  " SSS9  M:  [         R                  R                  R                  n[         R                  R                  R                  n[         R                  R                  5         UTR                  :X  a  UnO_UTR                  :X  aA  S[         R                  R                  l
        S[         R                  R                  l        TnO[        SU 35      eUT	R                   UR"                  '   T	" U 0 UD6  [         R                  R                  5         UT	R                   UR"                  '   U[         R                  R                  l
        U[         R                  R                  l        GM     g )Nz0Inductor on GPU needs Triton and recent GPU archr:   )
stacklevelTrB   z!Need to implement FullyShardMode=)rs   r  r  r   r  r8   warningswarnrS  configskip_fsdp_hooks	_inductorcompile_threadsr  r  NotImplementedError__globals__rG   )
rk   rl   original_fully_shardmodeoriginal_skip_fsdp_hooksoriginal_compile_threadsfully_shard_patchr	  r  funcs
          rO   wrapper6compiled_fsdp_test.<locals>.decorator.<locals>.wrapper  sf   (-(9(9(>(>(J(J &>///
MMJWX +0==+?+?+O+O(+0??+A+A+Q+Q(!!))+>///(<%^<<<;?EMM((8=>EOO**:(I%-;D6B  CT  !5!>!>?d%f%!!))+BV  !5!>!>?7O$$49Q&&6= 'rN   r
   )r  r  r	  r  s   ` rO   	decorator%compiled_fsdp_test.<locals>.decorator  s#    	t 	R 
 	RD rN   )r	   )r   r  r	  r  s   ` @@rO   compiled_fsdp_testr    s"    " "$L rN   c                   2   ^  \ rS rSrSU 4S jjrS rSrU =r$ )
SkipModulei  c                 X   > [         TU ]  5         [        R                  " SSSS9U l        g N
   Fr  )r   r   rv   r   linrK  s    rO   r   SkipModule.__init__  s"    99R%0rN   c                 $    U R                  U5      $ r   r%  rL  s     rO   r  SkipModule.forward  s    xx{rN   r(  rp   r  r9  s   @rO   r!  r!    s    1 rN   r!  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )NestedLineari  c                    > [         TU ]  5         U(       a8  [        [        R                  " SSSS9R                  [        5      5      U l        g [        R                  " SSSS9R                  [        5      U l        g r#  )r   r   r!   rv   r   r   r   nested_linear)r\   	fsdp_wrapr  s     rO   r   NestedLinear.__init__  sV    !%biiBU&C&F&F{&S!TD!#2r!>!A!A+!NDrN   c                 $    U R                  U5      $ r   r-  rL  s     rO   r  NestedLinear.forward  s    !!!$$rN   r1  r  r9  s   @rO   r+  r+    s    O% %rN   r+  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )	SkipModeli  c                    > [         TU ]  5         [        R                  " SSSS9R	                  [
        5      U l        [        5       R	                  [
        5      U l        [        [        US9[
        S9U l        g )Nr$  Fr  )r.  )r  )r   r   rv   r   r   r   linearr!  linear_skipr!   r+  r-  )r\   double_nestr  s     rO   r   SkipModel.__init__  sW    iiBU366{C%<??;7!;/;
rN   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ r   r6  r7  r-  rL  s     rO   r  SkipModel.forward  s4    KKNQq!rN   r;  r  r9  s   @rO   r4  r4    s    
 rN   r4  )FT)FFr  )rF   r   )
contextlibr  rd  r  r~  unittestr  abcr   r   collections.abcr   r   copyr   enumr   r	   	functoolsr   typingr   r   r   r   r   r   rs   torch.distributedr  r   torch.nnrv   torch.nn.functionalr  r  torch.distributed._composabler   torch.distributed.device_meshr   torch.distributed.fsdpr   r   r   r   $torch.distributed.fsdp._common_utilsr   5torch.distributed.fsdp._fully_shard._fsdp_param_groupr   r   "torch.distributed.fsdp._init_utilsr   2torch.distributed.fsdp.fully_sharded_data_parallelr   r   r   *torch.distributed.fsdp.sharded_grad_scalerr   torch.distributed.fsdp.wrapr   r    r!   torch.distributed.tensorr"   r#   r$   !torch.distributed.tensor.parallelr%   r&   r'   r(   r)   r*   torch.nn.parallel.distributedr+   r  *torch.testing._internal.common_distributedr,   r-   r.   r/   r0   $torch.testing._internal.common_utilsr1   r2   r3   r4   r5   r6   r7   torch.utils._tritonr8   minmaxr;   r~  rE  r   r  r>   rD   rQ   rw   rW   r6  r   r   r7  r   r   r   r   r   r   r   r   r   r;  rZ  r^  rr  r  r  r  r  rC  r  r  contextmanagerr  r  r  r  r  r  r  r#  r'  r+  rr   r   r@  skipIfrB  r]  r  r  r  r  r!  r+  r4  rF   rN   rO   <module>r[     s    	 	 
    # $ "    < <        4 4 
 ? S 
 I R R F F  F H    + q#a!8!8!:;<LLK ::**,LK K 99))+LK L4 T BIIs 499$$ B% #99##""2299 2t 2>299 >d >DBII D$ D .Q"- Q"h[J- [J|$7 D]J. ]J@J
m J
Z
? 
.,ryy ,JJ* JJZ/")) /@*r}} *Z'299 '6 6 6 6 
X 
 
" 
X 
 
" 9H 9 9 *X * * .x .  . .x .  . :8 :  : >X >  >0!0 0 	0
 0. +-	"Tyy"T II"T c3h	"TJ 
45/  6&L L^
O%}2 O%d4!(? 4!n2(4. 2j 	%299 	%		 rN   