
    Z j%                      % S SK Jr  S SKrS SKrS SKrS SKrS SKJr  SSKJ	r	  SSK
JrJr  SSKJr  SSKJr  \" 5       (       a*  S SKrS SKJr  S S	KJr  \R                  R+                  5       r\R.                  " \5      r SF   SGS
 jjrSHS jrSISJS jjr\" 5       (       a|  \R:                  \R<                  \R>                  \R@                  \RB                  \RD                  \RF                  \RH                  \RJ                  \RL                  \RN                  S.r(SKS jr)S r* SL         SMS jjr+SNSOS jjr,S r- " S S\R\                  R^                  5      r0 " S S\R\                  R^                  5      r1 " S S\R\                  R^                  5      r2 " S S\R\                  R^                  5      r3 " S S\R\                  R^                  5      r4S r5S r6S r7S  r8S! r9   SF   SPS" jjr: " S# S$5      r; " S% S&\;5      r< " S' S(\;5      r= " S) S*\;5      r> " S+ S,\;5      r? " S- S.\<5      r@ " S/ S0\?5      rA " S1 S2\;5      rB " S3 S4\;5      rC " S5 S6\;5      rD " S7 S8\;5      rE " S9 S:\;5      rF " S; S<\;5      rG " S= S>\5      rH\H" 5       rIS>\JS?'           SQS@ jrK        SRSA jrLSB rMSC rNSSSD jrOSE rPg)T    )annotationsN)reduce   )DistributedConfig)is_torch_greater_or_equallogging)GeneralInterface)is_torch_available)nnc                   Ub  U c  [        S5      eU b  Ub  [        S5      eUGc  [        S5      (       d  [        S5      e[        R                  R                  5       R                  nUS:X  a  [        S5      e[        [        U5      n[        R                  R                  5       (       d   [        [        R                  S   5      n[        [        R                  S	   5      n[        [        R                  S
   5      nSSSSSS.n	U	R                  U5      n
[        R                  R                  XUS9  [        [        U5      nUS:w  a  UR!                  U5        US:w  aT  UR!                  [        [        R                  S	   5      5        UR%                  5       n[        R&                  " XL5      nUnO![        R&                  " U5      nU=(       d    0 nUb  UO[        R                  R)                  5       n[        R                  R+                  UR                  U45      nOUR,                  S:  a   SUR.                  ;  a  [        S5      eUS   nUR1                  5       n[        R&                  " UR2                   S[        [        R                  S	   5       35      nX2U4$ ! ["         a  n[        S5      UeSnAff = f)z
Sets up the device mesh and initialized the backend for tensor parallelism.
This function is called when the model is loaded and the TP plan is set to 'auto'.
Nz-tp_plan has to be set when tp_size is passed.zY`tp_plan` and `device_map` are mutually exclusive. Choose either one for parallelization.z2.5z3Tensor parallel is only supported for `torch>=2.5`.mpsz3Tensor parallelism is not supported on MPS devices.RANK
LOCAL_RANK
WORLD_SIZEncclglooxcclhcclneuron)cudacpuxpuhpur   )backendrank
world_sizer   zWe tried to initialize torch.distributed for you, but it failed. Make sure you init torch distributed in your script to use `tp_plan`.   tpzsWhen using `tp_plan` and n-d `device_mesh`, it must contain a 'tp' dimension. Please provide a valid `device_mesh`.:)
ValueErrorr   OSErrortorch_C_get_acceleratortypeRuntimeErrorgetattrdistributedis_initializedintosenvirongetinit_process_group
set_device	Exceptioncurrent_devicedeviceget_world_sizeinit_device_meshndimmesh_dim_namessizedevice_type)tp_plantp_sizedevice_mesh
device_mapr8   r1   r   
local_rankr   backend_mapr   eindex	tp_devices                 z/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/integrations/tensor_parallel.pyinitialize_tensor_parallelismrC   (   s{    wHIIz5tuu(//OPP hh//166%TUU 4  //112::f-. L!9:
 L!9:
'-fVTZfno%//+6!!44W\f4g!(!<%'"--j9 %%%c"**\*B&CD"113E[8I"J[1I$*J$0'e6G6G6V6V6X''88'Ta;555 <  &d+K""$\\[%<%<$=Qs2::lC[?\>]"^_
G++9  W s   /B2J. .
K	8KK	c                4    [         R                  " SS U 5      $ )aS  
Replace the numbers in the `name` by wildcards, only if they are in-between dots (`.`) or if they are between
a dot (`.`) and the end of the string.
This matches how modules are named/numbered when using a nn.ModuleList or nn.Sequential, but will NOT match
numbers in a parameter name itself, e.g. if the param is named `"w1"` or `"w2"`.
z\.\d+(\.|$)c                *    SU R                  S5      -   $ )Nz.*r   group)ms    rB   <lambda>2replace_layer_number_by_wildcard.<locals>.<lambda>p   s    D1771:,=    )resub)names    rB    replace_layer_number_by_wildcardrO   i   s     66."=tDDrK   c                    [        U 5      nX1;   a  X   $ U(       a$  SU;   a  UR                  SS5      S   =oA;   a  X   $ g)a  
Get the TP style for a parameter from the TP plan.

The TP plan is a dictionary that maps parameter names to TP styles.
The parameter name can be a generic name with wildcards (e.g. "*.weight") or a specific name (e.g. "layer_1.weight").

The `is_weight` is important because for weights, we want to support `.weights` and `.bias` cases seamlessly! but
not parent classes for `post_init` calls
.r   r   N)rO   rsplit)parameter_namer9   	is_weightgeneric_param_namemodule_names        rB   _get_parameter_tp_planrW   s   sU     :.I$**	s00EWE^E^_bdeEfghEi6ik5u##rK   )BOOLU8I8I16F16BF16I32F32F64I64F8_E4M3c                    [        U[        5      (       a9  [        U5      nX-  S:X  d   SU  SU 35       eX-  nU Vs/ s H  oCU-  PM	     sn$ X-  S:X  d
   SU 35       eX-  nU/U-  $ s  snf )a  
Convert block count or proportions to block sizes.

This function accepts

- The number of blocks (int), in which case the block size is
  total_size//blocks; or
- A list of block sizes (list[int]).

In the second case, if sum(blocks) < total_size, the ratios between
the block sizes will be preserved. For instance, if blocks is
[2, 1, 1] and total_size is 1024, the returned block sizes are
[512, 256, 256].
r   zCannot split z in proportional blocks: zPrepacked is not divisible by )
isinstancelistsum)
total_sizeblockstotal_blocks	part_sizeblocksingle_sizes         rB   _blocks_to_block_sizesrm      s     &$6{(A-lzlJcdjck/ll-.	/56veE!v66"a'R+I&)RR' *}v%%	 7s   A+c                   U nUR                   U   nUR                  5       n[        USS9n/ n	Sn
U H*  nX-  nX<-  nUS-   U-  nU	[        X-   X-   5      -  n	X-  n
M,     UR	                  5       nSnUS:X  d  US:X  a$  US   R                  [        R                  5      nS	nUS:X  a  XYS4   nO:US:X  d  US
:X  a  USS2U	S4   nO#US:X  d  US:X  a  USU	4   nO[        SU S35      eU(       a  U$ UR                  [        U   5      $ )u  
When weights are packed (gate_up_proj), we need to make sure each shard gets its correct share.
So if you have: gate_proj       ( 16, 5120, 8190)
and             up_proj         ( 16, 5120, 8190)
packed as       gate_up_proj    ( 16, 5120, 2 * 8190)
And you shard along the last dimension, you need to interleave the gate and up values:

Now, if we shard along the last dimension across TP_size (Tensor Parallelism size), we must interleave the values from gate and up projections correctly.

Let's take TP_size = 4 for an example:

Packed tensor `gate_up_proj`
---------------------------------------------------------------
[ G0  G1  G2  G3 | G4  G5  G6  G7 | ... | U0  U1  U2  U3 | U4  U5  U6  U7 | ... ]
 ↑─────────────↑   ↑─────────────↑        ↑─────────────↑  ↑─────────────↑
   Gate Slice 0      Gate Slice 1            Up Slice 0       Up Slice 1

Explanation:
- The first half of the tensor (left of the center) holds the gate_proj values.
- The second half (right of the center) holds the up_proj values.
- For TP=4, we divide each half into 4 slices. In this example, we show two slices for brevity.
- Each shard receives one slice from the gate part and the corresponding slice from the up part.

For instance:
• Shard 0 gets: [ Gate Slice 0, Up Slice 0 ] = [ G0, G1, G2, G3, U0, U1, U2, U3 ]
• Shard 1 gets: [ Gate Slice 1, Up Slice 1 ] = [ G4, G5, G6, G7, U4, U5, U6, U7 ]
• … and so on.

This ensures that each shard receives an equal portion of both gate and up projections, maintaining consistency across tensor parallelism.
r   )rg   rh   r   r   Frb   F8_E5M2.TNzUnsupported dim z", only dim 0, 1 or 2 are supported)
shaper7   rm   range	get_dtypetor"   float16r    str_to_dtype)paramempty_paramr;   r   dimslice_rg   r   block_sizestensors_slicesblock_offset
block_sizeshard_block_sizestartstopslice_dtypecastedtensors                     rB   get_packed_weightsr      sG   > F""3'J!!#J(JqIKNL!
%3'q,,% 4l6IJJ" " ""$K Fi;)#;.
ax+,	SBY>3./	SBY^+,+C50RSTTyyk233rK   c                   US:w  a  [        S5      eUS:  a  UOXR                  -   nU R                  U   nXS-  nXb-  nU R                  SU nU R                  US-   S n	U R                  " / UQUPUPUPU	Q76 n
[	        U5      n[	        U5      S-   n[        [        U
R                  5      5      nX   X   sX'   X'   U
R                  " U6 nUR                  U 5      nU$ )a?  
Reorders a tensor that was reconstructed from sharded packed weights into its canonical packed format.

For example, if a weight was packed (e.g., gate_proj and up_proj) and then sharded,
DTensor.full_tensor() might produce an interleaved layout like [G0, U0, G1, U1, ...]
along the sharded dimension. This function reorders it to [G0, G1, ..., U0, U1, ...].
This is an inverse operation to get_packed_weights.

Args:
    reconstructed_tensor: The tensor reconstructed from DTensor (e.g., via .full_tensor().contiguous()).
    sharded_dim: The dimension index in the reconstructed_tensor that was originally sharded.
    world_size: The tensor parallel world size.
    num_packed_projs: The number of projections that were packed together (e.g., 2 for gate_up_proj).

Returns:
    The reordered tensor in canonical packed format.
r   zNum blocks different from 2 is not supported yet. This is most likely a bug in your implementation as we only pack gate and up projections together.r   Nr   )	r    r5   rr   viewlenre   rs   permute
reshape_as)packed_parametersharded_dimr   
num_blocksactual_sharded_dimtotal_size_on_sharded_dimoriginal_block_size_on_dimshard_chunk_sizeprefix_shapesuffix_shapetensor_viewaxis_ws_absaxis_npp_abspermute_ordertensor_permutedfinal_ordered_tensors                   rB   repack_weightsr      s:   0 Q c
 	
 )4q(8kLaLa>a 0 6 67I J!:!H1?#))*=+=>L#))*<q*@*BCL"'' 	 	 		
 
K l#K|$q(L{//01M>K>Y[h[u;M ;!))=9O +556FGrK   c                d   UR                   nUR                  n[        [        R                  U5      n[        U [        R                  5      (       a  [        U R                  5      OU R                  5       n	US:  a  Xd-   nUR                  5       S:X  a  US:X  a  [        U	5      S:X  a  SnO+UR                  5       S:X  a  US:X  a  [        U	5      S:X  a  Sn[        R                  " X   U-  5      n
X:-  n[        X-   X   5      nXF:  a  [        SU SU 35      eX8:  a  [        SU SU 35      eUba  UR                  5       S:X  aM  US:X  aG  [        U	5      S:X  a8  Xs=::  a  U:  a  O  OU S	S	 $ [        R                   " / [        R"                  US
9$ [%        S	5      /[        U	5      -  nXU   :  aF  [%        X5      X'   U ['        U5         n [        U [        5      (       a  U  Vs/ s H  oS	S	 PM	     n nU $ SX'   [        R                   " ['        U	5      [        R"                  S9$ s  snf )a}  
Generalized tensor sharding across a multi-dimensional device mesh.
Extract only the fraction of the parameter owned by the given `rank` when the parameter would have gone sharding at provided `dim`.
Extraction follows the pytorch `Shard` placement so that sharding and materializing back to full tensor follows `Shard` semantics.
`Shard` follows torch.chunk style sharding of the tensor. We demonstrate some cases below on how sharding happens including some edge cases
such as some ranks having an empty tensor as shard. Below implementation is robut to all these cases.

Case (1)
empty_param                 (16, 5120, 8190)
dim                         0
device_mesh.size()          4
rank 0 gets                                 (4, 5120, 8190)                  (0 ... 4, 5120, 8190)
rank 1 gets                                 (4, 5120, 8190)                  (4 ... 8, 5120, 8190)
rank 2 gets                                 (4, 5120, 8190)                  (8 ... 12, 5120, 8190)
rank 3 gets                                 (4, 5120, 8190)                  (12 ... 16, 5120, 8190)

Case (2)
empty_param                 (16, 5120, 8190)
dim                         0
device_mesh.size()          14
rank 0 gets                                 (2, 5120, 8190)                  (0 ... 2, 5120, 8190)
rank 1 gets                                 (2, 5120, 8190)                  (2 ... 4, 5120, 8190)
rank 2 gets                                 (2, 5120, 8190)                  (4 ... 6, 5120, 8190)
rank 3 gets                                 (2, 5120, 8190)                  (6 ... 8, 5120, 8190)
rank 4 gets                                 (2, 5120, 8190)                  (8 ... 10, 5120, 8190)
rank 5 gets                                 (2, 5120, 8190)                  (10 ... 12, 5120, 8190)
rank 6 gets                                 (2, 5120, 8190)                  (12 ... 14, 5120, 8190)
rank 7 gets                                 (2, 5120, 8190)                  (14 ... 16, 5120, 8190)
rank 8 gets                                 (0, 5120, 8190)
rank 9 gets                                 (0, 5120, 8190)
rank 10 gets                            (0, 5120, 8190)
rank 11 gets                                (0, 5120, 8190)
rank 12 gets                                (0, 5120, 8190)
rank 13 gets                                (0, 5120, 8190)

Case (3)
empty_param                 (16, 5120, 8190)
dim                         0
device_mesh.size()          3
rank 0 gets                                 (6, 5120, 8190)                  (0 ... 6, 5120, 8190)
rank 1 gets                                 (6, 5120, 8190)                  (6 ... 12, 5120, 8190)
rank 2 gets                                 (4, 5120, 8190)                  (12 ... 16, 5120, 8190)

In case (2), empty shards are returned with appropriate dimension to allow for operations to work smoothly.
Args:
    param (torch.Tensor): The tensor to shard.
    empty_param (torch.Tensor): A tensor used for shape reference.
    device_mesh (torch.Tensor): Shape [d_0, ..., d_n] representing the mesh.
    rank (int): Global rank of the current process/device.
    dim (int): Dimension along which to shard the tensor.
r      r   r   zdim z* is out of bounds for tensor of dimension zRank z  is out of bounds for mesh size Ndtyper2   )r   )r5   rr   r   operatormulrd   r"   Tensorre   	get_shaperz   r   mathceilminr    emptyint64slicetuple)rx   ry   r;   r   rz   
tensor_idx	param_dim
mesh_shaper   param_shape
shard_sizer   endslice_indicesps                  rB   get_tensor_shardr   9  s   h   I""Jj1J'1%'F'F$u{{#EOOL]K
QwoA#(s;/?1/D		a	C1H[1AQ1F;+j89JE
e +"2
3C
4u$NykZ[[5&FzlSTT +//"3q"8SAX#kJZ^_J_$$8O;;rTBB4[MC$44M3"5.eM*+eT""#()5aqT5E)K;;u[)==	 *s   +H-c                ,    [         R                  " XSS9$ )z9Split tensor along last dimension into world_size chunks.rq   rz   )r"   chunk)xr   s     rB   _split_along_last_dimr     s    ;;q"--rK   c                  8    \ rS rSrSr\S 5       r\S 5       rSrg)_AllReduceBackwardi  zRIdentity forward, all-reduce backward. Used before colwise layers (f in Megatron).c                    X l         U$ N)r;   ctxr   r;   s      rB   forward_AllReduceBackward.forward  s    %rK   c                    U R                   nUR                  5       S:X  a  US 4$ UR                  5       n[        R                  " U[        R
                  R                  UR                  5       S9  US 4$ Nr   oprG   )r;   r7   
contiguousdist
all_reduceReduceOpSUM	get_group)r   grad_outputr;   s      rB   backward_AllReduceBackward.backward  sc    oo"$$!,,.(9(9AVAVAXYD  rK    N	__name__
__module____qualname____firstlineno____doc__staticmethodr   r   __static_attributes__r   rK   rB   r   r     s+    \  ! !rK   r   c                  8    \ rS rSrSr\S 5       r\S 5       rSrg)_AllReduceForwardi  zQAll-reduce forward, identity backward. Used after rowwise layers (g in Megatron).c                    UR                  5       S:X  a  U$ [        R                  " U[        R                  R                  UR                  5       S9  U$ r   )r7   r   r   r   r   r   r   s      rB   r   _AllReduceForward.forward  s@    "Hdmm//{7L7L7NOrK   c                
    US 4$ r   r   )r   r   s     rB   r   _AllReduceForward.backward  s    D  rK   r   Nr   r   rK   rB   r   r     s+    [  ! !rK   r   c                  8    \ rS rSrSr\S 5       r\S 5       rSrg)
_AllGatheri  z<All-gather forward, split backward. Gathers sharded outputs.c                   X l         UR                  5       nUS:X  a  U$ UR                  5       S-
  nUR                  5       nUR	                  5       nUR                  5       n[        U5       Vs/ s H  n[        R                  " U5      PM     nnXU'   [        R                  " XUS9  [        R                  " XS9R                  5       $ s  snf Nr   rF   r   r;   r7   rz   get_local_rankr   r   rs   r"   
empty_liker   
all_gathercat)	r   r   r;   r   last_dimr   rG   _tensor_lists	            rB   r   _AllGather.forward  s    % %%'
?H557Q;))+%%'LLN49*4EF4Equ''*4EFDe4yy3>>@@ Gs   / Cc                    U R                   nUR                  5       nUS:X  a  US 4$ UR                  5       n[        X5      nXT   R	                  5       S 4$ Nr   r;   r7   r   r   r   )r   r   r;   r   r   chunkss         rB   r   _AllGather.backward  sY    oo %%'
?$$))+&{?|&&($..rK   r   Nr   r   rK   rB   r   r     s-    FA A" 	/ 	/rK   r   c                  8    \ rS rSrSr\S 5       r\S 5       rSrg)_Spliti  z>Split forward, all-gather backward. Scatters replicated input.c                    X l         UR                  5       nUS:X  a  U$ UR                  5       n[        X5      nXT   R	                  5       $ r   r   )r   r   r;   r   r   r   s         rB   r   _Split.forward  sJ    % %%'
?H))+&q5|&&((rK   c                   U R                   nUR                  5       nUS:X  a  US 4$ UR                  5       S-
  nUR                  5       nUR	                  5       nUR                  5       n[        U5       Vs/ s H  n[        R                  " U5      PM     nnXU'   [        R                  " XUS9  [        R                  " XS9R                  5       S 4$ s  snf r   r   	r   r   r;   r   r   r   rG   r   r   s	            rB   r   _Split.backward      oo %%'
?$$??$q())+%%'!,,.>CJ>OP>Ou''4>OP'D>yy3>>@$FF Q   7 Cr   Nr   r   rK   rB   r   r     s-    H	) 	) G GrK   r   c                  8    \ rS rSrSr\S 5       r\S 5       rSrg)_ReduceScatteri   zCReduce-scatter forward, all-gather backward. For sequence parallel.c                   X l         UR                  5       nUS:X  a  U$ UR                  5       S-
  nUR                  5       n[	        UR                  X4S95      n[	        UR                  5      nXt==   U-  ss'   [        R                  " XqR                  UR                  S9n[        R                  " X[        R                  R                  US9  U$ )Nr   r   r   r   )r;   r7   rz   r   re   r   rr   r"   r   r   r2   r   reduce_scatterr   r   )	r   r   r;   r   r   rG   input_chunksoutput_shapeoutputs	            rB   r   _ReduceScatter.forward#  s    % %%'
?H557Q;%%'AGGJG=>AGG}:-\JFT]]5F5FeTrK   c                   U R                   nUR                  5       nUS:X  a  US 4$ UR                  5       S-
  nUR                  5       nUR	                  5       nUR                  5       n[        U5       Vs/ s H  n[        R                  " U5      PM     nnXU'   [        R                  " XUS9  [        R                  " XS9R                  5       S 4$ s  snf r   r   r   s	            rB   r   _ReduceScatter.backward6  r   r   r   Nr   r   rK   rB   r   r      s-    M $ G GrK   r   c                ,    [         R                  X5      $ )zAIdentity forward, all-reduce backward. Use before colwise layers.)r   applyr   r;   s     rB   all_reduce_backwardr  N  s    ##A33rK   c                ,    [         R                  X5      $ )z@All-reduce forward, identity backward. Use after rowwise layers.)r   r  r  s     rB   all_reduce_forwardr  S  s    ""122rK   c                ,    [         R                  X5      $ )z#All-gather forward, split backward.)r   r  r  s     rB   r   r   X  s    A++rK   c                ,    [         R                  X5      $ )z#Split forward, all-gather backward.)r   r  r  s     rB   splitr
  ]  s    <<''rK   c                ,    [         R                  X5      $ )z,Reduce-scatter forward, all-gather backward.)r   r  r  s     rB   r   r   b  s    //rK   c                p   ^^^ Tb  U R                  UU4S j5        Tb  U R                  UU4S j5        U $ )z
Copy pasted from torch's function but we remove the communications (partitioning)
as well as buffer registering that is similarly not efficient.
c                   > T" XT5      $ r   r   )modinputsr;   input_fns     rB   rI   #distribute_module.<locals>.<lambda>r  s    XcS^=_rK   c                   > T" XT5      $ r   r   )r  r  outputsr;   	output_fns      rB   rI   r  t  s    )CZeBfrK   )register_forward_pre_hookregister_forward_hook)moduler;   r  r  s    ```rB   distribute_moduler  g  s4     (()_`$$%fgMrK   c                  r    \ rS rSrSrSrSrSrSS jrS r	S r
 S     SS jjrSS jrSS	 jrSS
 jrSrg)TensorParallelLayerix  z.General tensor parallel layer for transformersNc                (    X l         Xl        X0l        g r   )r   r;   ry   )selfr;   r   ry   s       rB   __init__TensorParallelLayer.__init__  s    	&&rK   c                    [         er   NotImplementedErrorr  r  r  r;   s       rB   _prepare_input_fn%TensorParallelLayer._prepare_input_fn      !!rK   c                    [         er   r   r  r  r  r;   s       rB   _prepare_output_fn&TensorParallelLayer._prepare_output_fn  r%  rK   c                    [         er   r   r  rx   r   r2   r   s        rB   shard_tensor TensorParallelLayer.shard_tensor  s
     "!rK   c                H    [        UUU R                  U R                  5        g r   )r  r#  r(  r  r  r;   kwargss       rB   prepare_module_tp%TensorParallelLayer.prepare_module_tp  s"    ""##		
rK   c                    [        U5      $ )z
Compute the expected shape after TP sharding for a given full shape.

Args:
    full_shape: The full (unsharded) parameter shape

Returns:
    The expected sharded shape for this rank
)r   )r  
full_shapes     rB   get_expected_sharded_shape.TensorParallelLayer.get_expected_sharded_shape  s     Z  rK   c                    g)z
Update module attributes (e.g. in_features, out_features) to reflect sharded dimensions.

Args:
    module: The module to update

Returns:
    None, update the module in-place
Nr   r  r  s     rB   update_module_attributes,TensorParallelLayer.update_module_attributes  s     	rK   )r;   ry   r   NNNrx   torch.Tensorr   
int | Nonereturnr=  r  	nn.Moduler?  rA  r4  ztuple[int, ...] | torch.Sizer?  ztuple[int, ...]r  rA  )r   r   r   r   r   r;   r   ry   r  r#  r(  r,  r1  r5  r9  r   r   rK   rB   r  r  x  sV    8KDK'
"" VZ"!"/9"	"

!
rK   r  c                  n   ^  \ rS rSrSrS
SU 4S jjjrS rS r S     SS jjrSS jr	SS jr
S	rU =r$ )ColwiseParalleli  z
Column-wise parallel: weight is sharded on dim -2 (output features).
Forward: input replicated -> output sharded on last dim.
If gather_output=True, output is all-gathered to produce full tensor.
c                2   > [         TU ]  " S0 UD6  Xl        g Nr   )superr  gather_output)r  rI  r0  	__class__s      rB   r  ColwiseParallel.__init__  s    "6"*rK   c                4    U(       a  US   OUn[        XC5      $ Nr   r  r  r  r  r;   input_tensors        rB   r#  !ColwiseParallel._prepare_input_fn  s    $*vay"<==rK   c                >    U R                   (       a  [        X#5      $ U$ r   )rI  r   r'  s       rB   r(  "ColwiseParallel._prepare_output_fn  s    g33rK   c                n   [        U[        R                  5      (       a  UR                  5       O[	        UR                  5       5      nUS:X  a-  [        XR                  U R                  U R                  S5      nO,[        XR                  U R                  U R                  S5      nUR                  X4S9$ Nr   rq   rp   r2   r   rd   r"   r   rz   r   r   r   ry   r;   r   ru   r  rx   r   r2   r   rz   	parameters          rB   r,  ColwiseParallel.shard_tensor  s     (u||<<eiik#eooFWBX!8(0@0@$BRBRTXT]T]_abI(0@0@$BRBRTXT]T]_abI||6|77rK   c                4   U R                   R                  5       n[        U5      n[        U5      S:X  a  SOSnUS:  a  [        U5      U-   OUn[        R
                  " X4   U-  5      nU R                  U-  n[        Xe-   X4   5      nXv-
  X4'   [        U5      $ )Nr   rq   rp   r   )	r;   r7   re   r   r   r   r   r   r   r  r4  r   rr   rz   r   r   r   s           rB   r5  *ColwiseParallel.get_expected_sharded_shape  s    %%**,
Z J!Ob"%'c%j3sYYuzJ67
		J&%$ej1[
U|rK   c                    U R                   (       d7  [        US5      (       a%  U R                  UR                  45      S   Ul        g g g )Nout_featuresr   )rI  hasattrr5  r_  r8  s     rB   r9  (ColwiseParallel.update_module_attributes  sD     !!gfn&E&E"&"A"A6CVCVBX"YZ["\F 'F!rK   rI  F)rI  boolr;  r<  rB  rC  r   r   r   r   r   r  r#  r(  r,  r5  r9  r   __classcell__rJ  s   @rB   rE  rE    sQ    + +> VZ	8!	8/9	8		8
] ]rK   rE  c                  4    \ rS rSrSrS rS rS	S jrS rSr	g)
ReplicatedWithGradAllReducei  a5  
Replicated parameter with gradient all-reduce.

For parameters like q_norm/k_norm that sit between colwise and rowwise
layers. The parameter is replicated (not sharded), but its gradient
accumulates from local heads only in TP mode. This class registers a
backward hook to all-reduce the parameter gradient.
c                    U$ r   r   r"  s       rB   r#  -ReplicatedWithGradAllReduce._prepare_input_fn  s    rK   c                    U$ r   r   r'  s       rB   r(  .ReplicatedWithGradAllReduce._prepare_output_fn  s    rK   Nc                &    US   R                  X4S9$ N.rV  ru   r+  s        rB   r,  (ReplicatedWithGradAllReduce.shard_tensor      Sz}}F}88rK   c                2    U4S jnUR                  U5        g )Nc                |    U R                  5        H(  nUR                  c  M  [        UR                  U5        M*     g r   )
parametersgradr  )r  
grad_inputr   meshrx   s        rB   _backward_hookEReplicatedWithGradAllReduce.prepare_module_tp.<locals>._backward_hook  s+    )::)&uzz48 *rK   )register_full_backward_hook)r  r  r;   r0  ry  s        rB   r1  -ReplicatedWithGradAllReduce.prepare_module_tp  s     ?J 	9
 	**>:rK   r   r;  )
r   r   r   r   r   r#  r(  r,  r1  r   r   rK   rB   ri  ri    s    9;rK   ri  c                  2    \ rS rSrSrS rSS jrS	S jrSrg)
MlaKvAProjParalleli  a  
For MLA attention used in DeepSeek-V2 style models (deepseek_v2, longcat_flash, glm_moe_dsa, glm4_moe_lite):
kv_a_proj_with_mqa output is [kv_lora_rank + qk_rope_head_dim] (can have different naming but important thing
to understand is that it is split)
Example below (from modeling_longcat_flash.py):

kv_a_proj_with_mqa
        |
        split
        /            k_pass    k_rot  <-- "bypasses kv_b_proj"
    |          |        (goes straight to attention,
kv_a_layernorm |         never touches kv_b_proj)
    |          |
kv_b_proj      |
(colwise)      |
    |          |
    k_pass     k_rot
        \      /
           cat
            |
        key_states

k_pass is passed to kv_b_proj (colwise) which has built-in all_reduce_backward so we don't have a partial gradient for it.
However, k_rot goes straight to attention, never touches kv_b_proj. So we need to average gradient across all ranks otherwise we only get gradient for one rank (partial gradient).
c                2   [        UR                  S5      (       d"  [        S[        U5      R                   S35      eUR                  R
                  nUR                  UR                  S   U-
  U/SS9u  pV[        Xc5      n[        R                  " XV/SS9$ )Nqk_rope_head_dimzConfig for z does not have `qk_rope_head_dim`. MlaKvAProjParallel requires `qk_rope_head_dim` to be defined in the model config. Please add it to the model's config or update the TP plan mapping.rq   r   )r`  configAttributeErrorr%   r   r  r
  rr   r  r"   r   )r  r  r   r;   rope_dimpass_outputrope_outputs          rB   r(  %MlaKvAProjParallel._prepare_output_fn  s    szz#566 d3i001 2U U 
 ::..#)<<b1AH1Lh0W]_<#` )+Cyy+3<<rK   Nc                &    US   R                  X4S9$ ro  rp  r+  s        rB   r,  MlaKvAProjParallel.shard_tensor)  rr  rK   c                8    X1l         [        XU R                  S9  g )N)r  )r  r  r(  )r  r  r;   r  r0  s        rB   r1  $MlaKvAProjParallel.prepare_module_tp,  s    &9P9PQrK   r   r;  r   )	r   r   r   r   r   r(  r,  r1  r   r   rK   rB   r~  r~    s    6
=9RrK   r~  c                  n   ^  \ rS rSrSrS
SU 4S jjjrS rS r S     SS jjrSS jr	SS jr
S	rU =r$ )RowwiseParalleli1  a  
Row-wise parallel: weight is sharded on dim -1 (input features).
Forward: input (optionally split) -> output partial -> all-reduce to replicate.

Args:
    split_input: If True, splits replicated input before matmul. Use when input
                 comes from a non-parallelizable operation (chunk/slice).
                 Default False (expects pre-sharded input from colwise layer).
c                2   > [         TU ]  " S0 UD6  Xl        g rG  )rH  r  split_input)r  r  r0  rJ  s      rB   r  RowwiseParallel.__init__<  s    "6"&rK   c                    [        US5      (       a%  UR                  b  UR                  Ul        S Ul        U(       a  US   OUnU R                  (       a  [	        XC5      $ U$ )Nbiasr   )r`  r  _biasr  r
  rO  s        rB   r#  !RowwiseParallel._prepare_input_fn@  sQ    3CHH$8CICH$*vay33rK   c                t    [        X#5      n[        US5      (       a  UR                  b  X!R                  -   nU$ )Nr  )r  r`  r  r'  s       rB   r(  "RowwiseParallel._prepare_output_fnL  s3    $W:3  SYY%:		)GrK   c                    [        U[        R                  5      (       a  UR                  5       O[	        UR                  5       5      nUS:X  a  US   nO,[        XR                  U R                  U R                  S5      nUR                  X4S9$ )Nr   .rq   rV  rW  rX  s          rB   r,  RowwiseParallel.shard_tensorR  sr     (u||<<eiik#eooFWBX!8c
I(0@0@$BRBRTXT]T]_abI||6|77rK   c                F   [        U5      S:X  a  [        U5      $ U R                  R                  5       n[	        U5      nSnUS:  a  [        U5      U-   OUn[
        R                  " X4   U-  5      nU R                  U-  n[        Xe-   X4   5      nXv-
  X4'   [        U5      $ Nr   rq   r   )	r   r   r;   r7   re   r   r   r   r   r\  s           rB   r5  *RowwiseParallel.get_expected_sharded_shape]  s    z?a$$%%**,
Z "%'c%j3sYYuzJ67
		J&%$ej1[
U|rK   c                v    [        US5      (       a(  SUR                  4nU R                  U5      S   Ul        g g )Nin_featuresr   )r`  r  r5  )r  r  rr   s      rB   r9  (RowwiseParallel.update_module_attributesk  s>    6=)) **+E!%!@!@!G!JF	 *rK   r  rc  )r  rd  r;  r<  rB  rC  re  rg  s   @rB   r  r  1  sQ    ' '
 VZ	8!	8/9	8		8K KrK   r  c                  2    \ rS rSrSr S     SS jjrSrg)PackedColwiseParallelis  z@Packed column-wise parallel for fused weights like gate_up_proj.Nc                0   [        U[        R                  5      (       a  UR                  5       O[	        UR                  5       5      nUS:X  a-  [        XR                  U R                  U R                  S5      nOU R                  U R                  R                  5      nU[	        U5      :  a-  [        XR                  U R                  U R                  S5      nO,[        XR                  U R                  U R                  S5      nUR                  X4S9$ rU  )rd   r"   r   rz   r   r   r   ry   r;   r   r5  rr   r   ru   )r  rx   r   r2   r   rz   rY  expected_shapes           rB   r,  "PackedColwiseParallel.shard_tensorv  s     (u||<<eiik#eooFWBX!8(0@0@$BRBRTXT]T]_abI!<<T=M=M=S=STNS(( -U4D4DdFVFVX\XaXacef	 /u6F6FHXHXZ^ZcZcegh	||6|77rK   r   r;  r<  r   r   r   r   r   r,  r   r   rK   rB   r  r  s  s.    J VZ8!8/98	8 8rK   r  c                  2    \ rS rSrSr S     SS jjrSrg)PackedRowwiseParalleli  z=Packed row-wise parallel for fused weights like gate_up_proj.Nc                   [        U[        R                  5      (       a  UR                  5       O[	        UR                  5       5      nUS:X  a  US   nO[        U[        R                  5      (       a  UR                  OUR                  5       nU R                  R                  5       S:  a  U R                  R                  S   OSn[	        U5      S:  a  US   OSn	X:  a-  [        XR                  U R                  U R                  S5      nO,[        XR                  U R                  U R                  S5      nUR                  X4S9$ )Nr   .rq   r   rV  )rd   r"   r   rz   r   r   rr   ry   r   r;   r   r   ru   )
r  rx   r   r2   r   rz   rY  r   expected_packed_dim
actual_dims
             rB   r,  "PackedRowwiseParallel.shard_tensor  s
    (u||<<eiik#eooFWBX!8c
I *4E5<<)H)H%++eooN_K@D@P@P@T@T@VZ[@[$"2"2"8"8"<ab,/,<,ARqJ/,U4D4DdFVFVX\XaXacef	 /u6F6FHXHXZ^ZcZcegh	||6|77rK   r   r;  r<  r  r   rK   rB   r  r    s.    G VZ8!8/98	8 8rK   r  c                  r   ^  \ rS rSrSrSS.SU 4S jjjrS rS r S     SS jjrSS	 jr	SS
 jr
SrU =r$ )EmbeddingParalleli  zXEmbeddingParallel: shards embedding table, handles masked lookups for vocab parallelism.r   embedding_dim_shardingc               2   > [         TU ]  " S0 UD6  Xl        g rG  )rH  r  r  )r  r  r0  rJ  s      rB   r  EmbeddingParallel.__init__  s    "6"&<#rK   c                    U(       a  US   OUnU R                   S:X  aY  UR                  5       nUR                  R                  S   nXV-  nXv-   nXG:  XH:  -  n	Xl        UR                  5       U-
  n
SX'   U
$ U$ rM  )r  r   weightrr   _input_maskclone)r  r  r  r;   rP  r   per_partition_sizevocab_start_indexvocab_end_index
input_maskmasked_inputs              rB   r#  #EmbeddingParallel._prepare_input_fn  s    $*vay &&!+--/D
 "%!1!1!!4 $ 9/DO ':|?^_J(O (--/2CCL'(L$rK   c                    U R                   S:X  a]  [        US5      (       aL  UR                  nUR                  S5      R	                  U5      nX%) R                  UR                  5      -  nU?[        X#5      $ )Nr   r  rq   )r  r`  r  	unsqueeze	expand_asru   r   r  )r  r  r  r;   r  mask_expandeds         rB   r(  $EmbeddingParallel._prepare_output_fn  si    &&!+]0K0KJ&004>>wGM 3 3GMM BBG!'77rK   c                   [        U[        R                  5      (       a  UR                  5       O[	        UR                  5       5      nUS:X  a-  [        XR                  U R                  U R                  S5      nO7[        UU R                  U R                  U R                  U R                  5      nUR                  X4S9$ )Nr   rq   rV  )rd   r"   r   rz   r   r   r   ry   r;   r   r  ru   rX  s          rB   r,  EmbeddingParallel.shard_tensor  s     (u||<<eiik#eooFWBX!8(0@0@$BRBRTXT]T]_abI(    		++I ||6|77rK   c                H   U R                   R                  5       n[        U5      n[        U5      S:X  a  SOU R                  nUS:  a  [        U5      U-   OUn[
        R                  " X4   U-  5      nU R                  U-  n[        Xe-   X4   5      nXv-
  X4'   [        U5      $ r  )
r;   r7   re   r   r  r   r   r   r   r   r\  s           rB   r5  ,EmbeddingParallel.get_expected_sharded_shape  s    %%**,
Z  J!Ob)D)D"%'c%j3sYYuzJ67
		J&%$ej1[
U|rK   c                   [        US5      (       a4  U R                  S:X  a$  U R                  UR                  45      S   Ul        [        US5      (       a6  U R                  S:X  a%  U R                  UR                  45      S   Ul        g g g )Nnum_embeddingsr   embedding_dimr   )r`  r  r5  r  r  r8  s     rB   r9  *EmbeddingParallel.update_module_attributes  s    6+,,1L1LPQ1Q$($C$CVEZEZD\$]^_$`F!6?++0K0Kq0P#'#B#BFDXDXCZ#[\]#^F  1Q+rK   )r  r*   r;  r<  rB  rC  re  rg  s   @rB   r  r    sR    b89 = =4	8 VZ8!8/98	8"_ _rK   r  c                  Z   ^  \ rS rSrSrSS	U 4S jjjrS rS r S
     SS jjrSr	U =r
$ )SequenceParalleli  zX
Sequence Parallel: input/output sharded on sequence dimension.
Weights are replicated.
c                2   > [         TU ]  " S0 UD6  Xl        g rG  )rH  r  sequence_dim)r  r  use_local_outputuse_dtensorr0  rJ  s        rB   r  SequenceParallel.__init__  s    "6"(rK   c                4    U(       a  US   OUn[        XC5      $ rM  )r   rO  s        rB   r#  "SequenceParallel._prepare_input_fn  s    $*vay ,44rK   c                    [        X#5      $ r   )r   r'  s       rB   r(  #SequenceParallel._prepare_output_fn  s    g33rK   c                &    US   R                  X4S9$ ro  rp  r+  s        rB   r,  SequenceParallel.shard_tensor       Sz}}F}88rK   )r  )r   FF)r  r*   r  rd  r;  r<  r   r   r   r   r   r  r#  r(  r,  r   rf  rg  s   @rB   r  r    sE    
) )54 VZ9!9/99	9 9rK   r  c                  Z   ^  \ rS rSrSrU 4S jr S     S	S jjrS
S jrSS jrSr	U =r
$ )GroupedGemmParalleli  zZ
Applies Expert Parallelism to MoE experts by loading the correct experts on each device.
c                &   > [         TU ]  " S0 UD6  g rG  rH  r  r  r0  rJ  s     rB   r  GroupedGemmParallel.__init__      "6"rK   c                v   U R                   R                  S   nXPR                  R                  5       -  S:w  a*  [	        SU SU R                  R                  5        S35      eXPR                  R                  5       -  nUnU R
                  U-  nU R
                  S-   U-  n	[        U[        R                  5      (       d  UR                  5       OUR                  n
Ub!  Xs=::  a  U	:  a  O  OUS S  R                  US9$ Uc  XU	 R                  X4S9$ [        U
5      S:  a  Ub  g US S  R                  X4S9$ )Nr   zAGlobal number of experts must be divisible by number of devices:  %  != 0r   )r2   rV  )ry   rr   r;   r7   r    r   rd   r"   r   r   ru   r   )r  rx   r   r2   r   global_num_expertslocal_num_expertsr   r   r   rr   s              rB   r,   GroupedGemmParallel.shard_tensor  sM    "--33A6 0 0 5 5 771<STfSggjkok{k{  lA  lA  lC  kD  DI  J  /2B2B2G2G2II&
		J&yy1}
*)3E5<<)H)H!ekk!e&?C&?8;;f;--s#&&f&BBZ1_!78;;f;::rK   c                z    U R                   R                  5       n[        U5      nUS   U-  nXCS'   [        U5      $ rM  )r;   r7   re   r   )r  r4  r   rr   r  s        rB   r5  .GroupedGemmParallel.get_expected_sharded_shape-  s@    %%**,
Z !!H
2$aU|rK   c                    [        US5      (       a2  U R                  U R                  R                  S   45      S   Ul        g g )Nnum_expertsr   )r`  r5  ry   rr   r  r8  s     rB   r9  ,GroupedGemmParallel.update_module_attributes5  sB    6=))!%!@!@$BRBRBXBXYZB[A]!^_`!aF *rK   r   r;  r<  rB  rC  )r   r   r   r   r   r  r,  r5  r9  r   rf  rg  s   @rB   r  r    sB    # VZ;!;/9;	;0b brK   r  c                  R   ^  \ rS rSrSrU 4S jrS rS r S     S	S jjrSr	U =r
$ )
RouterParalleli:  zI
Allows to reshape the router scores to support running expert parallel.
c                &   > [         TU ]  " S0 UD6  g rG  r  r  s     rB   r  RouterParallel.__init__?  r  rK   c                    U(       a  US   $ U$ rM  r   r"  s       rB   r#   RouterParallel._prepare_input_fnB  s    "vay..rK   c                2   UR                  5       UR                  5       pT[        USS5      nUc  [        [        USS5      SS5      nUc"  [        S[	        U5      R
                   S35      eXe-  S:w  a  [        SU SU S	35      eXe-  nUu  pn
X-  U:g  nU	R                  US
5      n	U
R                  US5      n
US:  a  [        R                  " X5      n
O(U
R                  U
S:  S5      R                  U
S:  S5      n
U
R                  U
S:H  U5      n
XU
4$ )u  
Remap global expert indices to local and zero out non-local scores.

Example: 4 tokens, top_k=4, 128 experts, EP=8. num_local_experts = 128/8 = 16.

Router produces (all ranks see the same values):
    router_scores:  (4, 4)  — top-k routing weights
    router_indices: (4, 4)  — global expert IDs
        [ 52,  42, 119,  67],
        [102,  89,  61,  40],
        [ 82, 103,   4,  34],
        [ 93,  23, 109,  11],

Each index maps to a rank: index // 16 gives the owning rank.
        [3, 2, 7, 4],
        [6, 5, 3, 2],
        [5, 6, 0, 2],
        [5, 1, 6, 0],

For rank 0 (owns experts 0-15), we remap local indices with fmod and
fill non-local with sentinel=16 (used for one_hot masking):
    router_indices (rank 0):
        [ 16, 16, 16, 16],
        [ 16, 16, 16, 16],
        [ 16, 16,  4, 16],
        [ 16, 16, 16, 11],

Scores for non-local experts are zeroed out via masked_fill:
    router_scores (rank 0):
        [0.0, 0.0, 0.0, 0.0],
        [0.0, 0.0, 0.0, 0.0],
        [0.0, 0.0, 0.3, 0.0],    ← only expert 4 (local) keeps its score
        [0.0, 0.0, 0.0, 0.1],    ← only expert 11 (local) keeps its score

both router_scores and router_indices stay (seq, top_k) shape.
They are paired element-wise: scores[i] is the weight for indices[i].
All expert forward implementations (grouped_mm, batched_mm, eager) flatten
both with reshape(-1) and rely on this pairing. Changing the shape of one
without the other breaks routing!

Each rank believes it is alone and computes only its part of the hidden states.
The sentinel index (num_local_experts) is skipped by one_hot encoding or clamped
+ masked in grouped_mm/batched_mm. After the expert forward, an all_reduce sums
partial outputs across EP ranks to produce the full result.
r  Nr  zRouter module z. is missing num_experts and config.num_expertsr   z>The number of experts must be divisible by number of ep_size: r  r  g        rq   r   )
r   r7   r'   r  r%   r   r    masked_fillr"   fmod)r  r  r  r;   ep_rankep_sizer  num_local_expertsrouter_logitsrouter_scoresrouter_indicesnon_local_masks               rB   r(  !RouterParallel._prepare_output_fnE  sJ   \ '5579I9I9Kc=$7!'#x">tTK >$s)2D2D1EEs!tuu A%PQ\P]]`ah`iino  (27>4n(='I%11.#F'33NBGq "ZZJN+778JANZZ[ilm[moqrN'33Nb4HJ[\^;;rK   c                &    US   R                  X4S9$ ro  rp  r+  s        rB   r,  RouterParallel.shard_tensor  r  rK   r   r;  r<  r  rg  s   @rB   r  r  :  sB    #/D<N VZ9!9/99	9 9rK   r  c                  R   ^  \ rS rSrSrU 4S jrS rS r S     S	S jjrSr	U =r
$ )
MoeTensorParalellExpertsi  a#  
Note: For tensor parallel, the MoEExpertsParallel TP layer handles gradient sync:
    - all_reduce_backward on hidden_states (for colwise gate_up_proj gradient)
    - all_reduce_backward on top_k_weights (for router gradient)
    - all_reduce_forward on output (for partial expert outputs)
c                &   > [         TU ]  " S0 UD6  g rG  r  r  s     rB   r  !MoeTensorParalellExperts.__init__  r  rK   c                T    US   nUS   nUS   n[        XC5      n[        Xc5      nXEU4$ )Nr   r   r   rN  )r  r  r  r;   hidden_statestop_k_indextop_k_weightss          rB   r#  *MoeTensorParalellExperts._prepare_input_fn  s@    q	Qiq	 ,MG
 ,MGM::rK   c                    [        X#5      $ r   )r  r'  s       rB   r(  +MoeTensorParalellExperts._prepare_output_fn  s    !'77rK   c                &    US   R                  X4S9$ ro  rp  r+  s        rB   r,  %MoeTensorParalellExperts.shard_tensor  s    
 Sz}}F}88rK   r   r;  r<  r  rg  s   @rB   r  r    s@    #; 8
 VZ9!9/99	9 9rK   r  c                  .    \ rS rSrSrS rSS jrS rSrg)	MoeIdentityExpertParalleli  a5  
TP class for zero/identity experts in MoE layers.

Under TP, the parent MoeTensorParalellExperts does all_reduce_forward (sum)
on the expert module output. Identity experts produce the same output on
every rank, so the sum gives world_size * output. This class divides the
input by world_size to compensate.
c                B    U(       a  US   OUnXCR                  5       -  $ rM  )r7   rO  s        rB   r#  +MoeIdentityExpertParallel._prepare_input_fn  s!    $*vay..000rK   Nc                &    US   R                  X4S9$ ro  rp  r+  s        rB   r,  &MoeIdentityExpertParallel.shard_tensor  rr  rK   c                ,    [        XU R                  S9  g )N)r  )r  r#  r/  s       rB   r1  +MoeIdentityExpertParallel.prepare_module_tp  s    &8N8NOrK   r   r;  )	r   r   r   r   r   r#  r,  r1  r   r   rK   rB   r
  r
    s    1
9PrK   r
  c                  f   \ rS rSr% \" 5       (       a`  \(       aY  \" SS9\" SS9\" SS9\" 5       \" 5       \" SS9\	" 5       \
" 5       \" 5       \" 5       \" 5       \" 5       \" 5       \" 5       \" 5       S.O0 rS	S	S	S
S
S
SSSSSS.rS\S'   S
S
S
SSSSSSSSS.rS\S'   \SS j5       r\SS j5       rSrg)ParallelInterfacei  r   r  r   Trb  r  )embedding_rowwiseembedding_colwisecolwise_gather_outputcolwiserowwiserowwise_split_inputpacked_colwisepacked_rowwisesequence_parallelgrouped_gemm	ep_routermoe_tp_expertsmoe_identity_expertreplicated_with_grad_allreducemla_kv_a_projrp   rq   N)r  r  r  r  r  r  r  r  r  r   r!  zdict[str, int | None]plan_to_weight_dimplan_to_bias_dimc                     X R                   U'   g r   )r"  clskeyvalues      rB   register_plan_to_weight_dim-ParallelInterface.register_plan_to_weight_dim  s    &+s#rK   c                     X R                   U'   g r   )r#  r%  s      rB   register_plan_to_bias_dim+ParallelInterface.register_plan_to_bias_dim  s    $)S!rK   r   )r'  strr(  r>  )r   r   r   r   r
   _torch_distributed_availabler  rE  r  r  r  r  r  r  r  r
  ri  r~  _global_mappingr"  __annotations__r#  classmethodr)  r,  r   r   rK   rB   r  r    s   * $@! "3!!L!2!!L%44%H&(&(#2t#D3535!1!3/1')68#<#>.I.K/1	
$ ' 4 !#!!*.1-   !##!!!*./+  , , * *rK   r  ALL_PARALLEL_STYLESc                z   UR                  5       nSUR                  =(       d    0 ;   a  UR                  S5      OSnUS:  a  U R                  U-   n[	        U5       Vs/ s H  n[
        R                  " U 5      PM     nn[        R                  " X`R                  5       US9  [
        R                  " XaS9$ s  snf )a^  
All-gather a sharded tensor along the specified dimension to reconstruct the full tensor.

Args:
    local_tensor: The local shard of the tensor on this rank
    shard_dim: The dimension along which the tensor was sharded
    device_mesh: The device mesh for distributed communication

Returns:
    The full reconstructed tensor (same on all ranks)
r   Nr   rF   r   )r7   r6   r   r5   rs   r"   r   r   r   r   r   )local_tensor	shard_dimr;   r   process_groupr   gathered_tensorss          rB   gather_full_tensorr9    s     !!#J37K<V<V<\Z\3]K))$/cgM 1} %%	1	 AFj@QR@Q1((6@QROO$&=&=&?}U 99%55	 Ss    B8c                   [         R                  n[         R                  n0 nU R                  5        GH  u  pxSU;   a  UR	                  SS5      S   OUn	SU;   a  UR	                  SS5      S   OSn
[
        R                  " SSU	5      n[
        R                  " SSU5      nSnX;   a  X   nO.X;   a  X   nO$SU;   a  UR	                  SS5      S   nX;   a  X   nUb  X;  a  XU'   M  U
S:X  a  UR                  U5      nOUR                  U5      nUc  XU'   M  [        XU5      nUS;   a  [        UXS	5      nUR                  5       Xg'   GM     U$ )
a  
Gather sharded tensors to reconstruct full tensors for saving.

This function all-gathers each sharded tensor along its shard dimension
to reconstruct the full unsharded tensor for checkpoint saving.

Args:
    state_dict: The model state dict with local sharded tensors
    tp_plan: The tensor parallel plan mapping layer patterns to shard styles
    device_mesh: The device mesh for distributed communication
    tp_size: The tensor parallel world size

Returns:
    State dict with full (gathered) tensors
rQ   r   r   N\d+*r  )r  r  r   )r3  r"  r#  itemsrR   rL   rM   r-   r9  r   r   )
state_dictr9   r;   r:   r"  r#  resultr'  r   
param_name
param_typerU   generic_full_keycurrent_planparent_param_namer6  full_tensors                    rB   gather_state_dict_for_saverF  6  su   , -??*;;F!'').1SjSZZQ'*c
.1SjSZZQ'*d
VVFC<66&#s3 &"4L*"6L&& 2 9 9#q A! D +&9<#I 3K (,,\:I*..|<I 3K )KH??(i!LK!,,.Q *T MrK   c           	        ^^ Tb?  [         T   n UR                  TX@R                  S9  TTl        UTl        UU4S jTl        gg! [         a(  n[        R                  SU ST SU 35         SnANISnAff = f)a  
This function is called in `PretrainedModel.post_init()`. It is responsible of adding hooks
to the modules of the `model`, based on the `PretrainedModel._tp_plan`.

This is the place where we add the `pre_forward` and `post_forwards` hooks. These are defined
for each `TensorParallelLayer` as `_prepare_input_fn` and `_prepare_output_fn`.

Args:
    model (`PretrainedModel`): The model containing the modules.
    module (`nn.Module`): The current module to which we want to add the hooks.
    current_module_plan (`str` or `None`): The tensor parallel plan for the current module, if any.
    layer_name (`str`): The qualified name of the current module.
    device_mesh (`dist.device_mesh.DeviceMesh`): The device mesh for distributed communication.

N)r  Trying to prepare 0, but it's not supported. Corresponding module: z Fix it's TP plan: c                 .   > TR                  5        ST  3$ )Nz

TP Plan: )__repr__)current_module_planr  s   rB   rI   5add_tensor_parallel_hooks_to_module.<locals>.<lambda>  s    V__%6$7{CVBW"XrK   )	r3  r1  r  r!  loggerwarning_hf_tp_plan_hf_device_meshrK  )modelr  rL  
layer_namer;   tp_layerr?   s    ``    rB   #add_tensor_parallel_hooks_to_modulerU  }  s    , &&':;	&&v{<<&P 1!,X ' # 	NN$ZL0`ag`h i 	s   A 
A8A33A8c                X   SU;   a  UR                  SS5      OUu  pU R                  =(       d    0 n
U R                  U5      n[        U5      n[	        X:5      n[
        R                  " 5       S:X  a8  Uc  [        R                  SU S35        O[        R                  SU SU 35        SnUbE   [        U   nX-l
        X}l        Xml        UR                  USXFS9nU(       a  UR                  5       nOUSS R#                  U5      n[%        U[&        R(                  R*                  5      (       d+  [&        R(                  R+                  XR-                  5       S9n[/        XU5        Ub  UR1                  U5        U$ ! [         a!  n[!        S	U S
U SU SU 35         SnANSnAff = f)a  
This function is called in `from_pretrained` when loading a model's checkpoints.
It receives the pointer to the parameter (or the parameter itself) and takes care of "sharding".
All process run this function, so they just load the partition of the tensor that they require.

Main uses cases:
- column / rowise parallelism, you just shard all the weights of the layer (weight and bias)
- packed layers: you slice the weights, then shard like above
- custom operation:
    - you want to add an all-gather at the end of a local layer.
    - you want to have a layer that is isolated from the rest of the world (because torch.DTensor does not work well with `.view` for instance)

rQ   r   r   NzTensor sharding plan for z+ not found, using default 'replicate' plan.z: )r   r   r2   rH  rI  z" Fix it's TP plan, current layer: z : )requires_grad)rR   r9   get_submoduler*   rW   r   get_rankrN  infor3  ry   r;   r   r,  r   r!  printru   rd   r"   r   	Parameteris_floating_pointsetattrr9  )rR  rx   ry   rS   param_casting_dtypeis_contiguousr   r;   r@  rA  r9   module_to_tpcurrent_shard_planrT  r?   s                  rB   shard_and_distribute_modulerc    s     ?B^>S^223:YgJmm!rG&&z2Lt9D/H}}!%KK3J<?jklKK3J<rBTAUVWH%	*+=>H#. #.  M))%DH[)iE((* a/0 eUXX//00""58U8U8W"XLe,)),7L # 	$^$44deqdr  sU  V^  U_  _b  cd  be  f 	s   0AE> >
F)F$$F)c                   Uc  gU  Vs1 s H  n[        U5      iM     nn[        U5      nUR                  5       nU H  nSU;   a  UR                  SS5      S   OUn[        R
                  " SSU5      nXq;   a%  UR                  US5        UR                  U5        Mb  SU;   d  Mj  UR                  SS5      S   =o;   d  M  UR                  US5        UR                  U5        M     [        U5      S:  a  [        R                  SU 35        [        U5      S:  a(  [        R                  SS	R                  U5       35        ggs  snf )
zy
Verify the TP plan of the model, log a warning if the layers that were not sharded and the rules that were not applied.
NrQ   r   r   r;  r<  z>The following TP rules were not applied on any of the layers: z'The following layers were not sharded: z, )rO   setcopyrR   rL   rM   popdiscardr   rN  rO  join)	expected_keysr9   r'  generic_keysunsharded_layersunused_rulesr@  rU   rD  s	            rB   verify_tp_planrn    sA   
 ERS]c4S9]LS<(<<>L.1SjSZZQ'*c
VVFC<(/6$$S)&&ASAZAZ[^`aAbcdAe,e,=+q.5$$S)  <1WXdWefg
q @K[A\@]^_ !# Ts   E	c                   X@l         X0l        Ub;  [        U[        5      (       a  [        R
                  " U5      nX R                  l        [        U[        5      (       a  Xl        U R                  nUb  [        (       a  UR                  5        H"  nU[        ;  d  M  [        SU S[         35      e   U R                  5        H7  u  px[        USS5      (       d  [        XuSS9n	[!        U UU	UU5        SUl        M9     U $ )z,Distribute a model according to the TP plan.z"Unsupported tensor parallel style z. Supported styles are 
_is_hookedF)rS   r9   rT   T)_tp_size_device_meshrd   dictr   	from_dictr  distributed_configr9   r/  valuesr3  r    named_modulesr'   rW   rU  rp  )
rR  r9   ru  r;   r:   
model_planvrN   r  plans
             rB   distribute_modelr{    s    N$%($//!2!<!<=O!P*<''4  J">">""$A++ #EaSH_`s_t!uvv % "//1LD6<77-Tafg3 !%F 2 LrK   r;  )r9   zstr | dict[str, str] | Noner:   r>  )rN   r.  r?  r.  )T)rS   r.  r9   dict[str, str]r?  z
str | None)rg   r*   rh   zint | list[int]r?  z	list[int])r   )
r   r=  r   r*   r   r*   r   r*   r?  r=  r   )r   r>  r@  )r5  r=  r6  r*   r;   zdist.device_mesh.DeviceMeshr?  r=  )r>  dict[str, torch.Tensor]r9   r|  r:   r*   r?  r}  )rj  z	list[str]r9   zdict[str, str] | None)Q
__future__r   r   r   r+   rL   	functoolsr   r(   r   utilsr   r   utils.genericr	   utils.import_utilsr
   r"   torch.distributedr   r   is_availabler/  
get_loggerr   rN  rC   rO   rW   rd  uint8int8int16rv   bfloat16int32float32float64r   float8_e4m3fnrw   rm   r   r   r   r   autogradFunctionr   r   r   r   r   r  r  r   r
  r   r  r  rE  ri  r~  r  r  r  r  r  r  r  r  r
  r  r3  r1  r9  rF  rU  rc  rn  r{  r   rK   rB   <module>r     s3   #   	 	  + 6 , 3 $ $)#4#4#A#A#C  
		H	% dh>,(>,3=>,BE. 

kkjj{{}}{{}}}}{{&&L&4A4P 	> "> >  >  	> 
 > Be>P.4!00 !$!// !/(( /DGU^^$$ GD&GU^^,, &G\4
3
,
(
0 	
 "6 6r/]) /]d;"5 ;<-R, -R`?K) ?KD8O 8,8O 84N_+ N_b9* 92*b- *bZT9( T9n$92 $9NP 3 P,?*( ?*D *;)< & <66+.6=X66<D'DD 	D
 DN"YJ4n`:rK   