
    N jo                      S SK Jr  S SKrS SKrS SKrS SKrS SKrS SKrS SKJ	r	  S SK
Jr  S SKJrJrJrJr  S SKrS SKJr  S SKJr  S SKJr  S	S
KJrJrJr  S	SKJr  \(       a  S	SKJrJr  S	SK J!r!J"r"J#r#J$r$  S	SK%J&r&J'r'J(r(J)r)J*r*J+r+J,r,  S	SK-J.r.  \R^                  " \05      r1\Rd                  Rg                  \0S5      r4\(       a  S SK5J6r6    SES jr7SFS jr8SFS jr9    SFS jr:    SFS jr;\ " S S5      5       r<SGS jr=SHS jr>SIS jr?SJSKS jjr@S rASLS jrBS rC    SMS  jrD    SNS! jrE    SOS" jrFS# rGS$ rHS% rI        SPS& jrJ          SQS' jrK            SRS( jrL              SSS) jrM              STS* jrN                  SUS+ jrO                      SVS, jrP          SWS- jrQ    SXS. jrR                SYS/ jrS    SZS0 jrT          S[S1 jrU    S\S2 jrV\ " S3 S45      5       rW    SXS5 jrX                    S]S6 jrY                    S^S7 jrZ              STS8 jr[                S_S9 jr\          SWS: jr]    S`S; jr^SFS< jr_SaS= jr`S> raS? rb    SFS@ jrcSbSA jrdScSB jreSC rf        SdSD jrgg)e    )annotationsN)defaultdict)	dataclass)AnyOptionalTYPE_CHECKINGUnion)trace_structured)StorageWeakRef)
OrderedSet   )configconfig_commsir)WeakDep)IRNode	Operation)estimate_peak_memory_allocfreeFreeableInputBufferget_freeable_input_bufSNodeMemory)contains_collectivecontains_waitfind_recursive_deps_of_nodefind_recursive_users_of_nodeis_collectiveis_fallback_opis_wait)Voverlap)BaseSchedulerNodec                V   SSK Jn  0 n0 nU  H&  nUR                  5       X$'   U" U5      c  M   X$   X4'   M(     SS KJn  SSKJn  UR                  5       nU" 5       n[        U5       V	s/ s H  n	/ PM     n
n	UR                  U
[        UR                  5       5      U5        [        R                  " [        R                  " U
5      SS9R                  R                  5       n[!        UR#                  5       5       H  u  pX   X4'   M     U  H  nXC;   a  X4   X$'   X$   Ul        M     g s  sn	f )Nr   )_get_mm_like_fn)_get_default_group)dim)torch._inductor.schedulerr#   get_estimated_runtimetorch.distributeddistributed"torch.distributed.distributed_c10dr$   get_world_sizerangeall_gather_objectlistvaluestorchmediantensortolist	enumeratekeysoverride_estimated_runtime)snodesr#   runtime_estimationsruntime_estimations_for_mmssnodedistr$   
world_sizepg_$gathered_runtime_estimations_for_mms"median_runtime_estimations_for_mmsidxs                f/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/torch/_inductor/comms.py6align_runtime_estimations_across_all_distributed_ranksrC   4   s3    :"$%*%@%@%B"5!-1D1K'. 
 %E$$&J		B*%?%q% ) ? 	,(//12

 */9:*fVVX '   ; @ @ BC
-O-T#* D /)D)K&+>+E( ?s   'D&c                    [        U SSSS9$ )z/
Greedily schedules waits as late as possible.
FTraise_comms
sink_waitsreorder_for_overlap_schedule_for_commr7   s    rB   rG   rG   Z   s     Ed     c                    [        U SSSS9$ )z0
Greedily schedules comms as early as possible.
TFrE   rI   rK   s    rB   rF   rF   c   s     DU rL   c                    [        U SSSS9$ )a  
This achieves the following overall scheduling procedure:
    Step 1: Given that we've currently scheduled comm N, we now schedule all compute nodes
        that are required for comm N + 1 but do not depend on comm N, to run at the same time with comm N.
    Step 2: If all those compute nodes are sufficient to overlap comm N, we're done.
        Otherwise, we now need to look elsewhere to find compute that overlaps with comm N.
        We prioritize compute nodes that are needed sooner.
    Step 3: We schedule the compute nodes dependent on comm N and required for comm N + 1.
    Step 4: We schedule comm N + 1.
    Repeat this for subsequent comm nodes.
TrE   rI   rK   s    rB   reorder_compute_for_overlaprO   l   s     DTt rL   c                     [        U 5      u  pU$ )a+  
Reorders communication ops relative to computation ops to improve communication-compute overlapping and hide comm
latency.  Stops moving a particular op if it reaches a point that would have increased the peak memory footprint.

Currently, follows these heuristics (subject to change or tune):
- never reorders collectives relative to one another, for SPMD safety
- has an option for per-collective prefetch limit, but does not enable it by default
- limits the total number of reorder steps to some factor of the graph size to prevent worst-case quadratic
  performance

Prerequisite: sink_comms_and_waits - ensure comm and wait nodes are scheduled as late as possible, respecting data
dependencies.  That allows reorder_communication_preserving_peak_memory to take a best case peak-memory snapshot,
and then monotonically improve latency by moving collectives backward in time.

Peak memory impact is computed in an iterative fashion.  First, memory use at each timestep is computed, and global
peak memory is computed as a max over timesteps.  Then, when swapping any two adjacent nodes, only the curr-memory
for the earlier of the nodes after the swap is affected.  This enables checking step by step whether a swap is
peak-memory-safe, and bailing out if not.  Example:

0   n0      C0
1   n1      C0 + Allocs(n1) - Frees(n1)
2   n2      C0 + Allocs(n1) - Frees(n1) + Allocs(n2) - Frees(n2)

0   n0      C0
1   n2      C0 + Allocs(n2) - Frees(n2)    <-- After moving n2 to Time 1, only time1 memory changes
2   n1      C0 + Allocs(n2) - Frees(n2) + Allocs(n1) - Frees(n1)

)6_reorder_communication_preserving_peak_memory_internal)r7   reordered_snodes
node_statss      rB   ,reorder_communication_preserving_peak_memoryrT      s    @ 	?vF ! rL   c                      \ rS rSr% SrSrS\S'   SrS\S'   SrS\S	'   S
r	S\S'   Sr
S\S'   SrS\S'   SrS\S'   SrS\S'   SrS\S'   \S 5       rSrg)ReorderInfo   z=
Debug info describing how an individual snode was reordered
Nonestrlimiting_factorr   intmovesgrouped grouped_info      float	comm_time	comp_timeinitial_exposedfinal_exposedoverlap_infoc                4    U R                   U R                  -
  $ Nrd   re   selfs    rB   improvementReorderInfo.improvement       ##d&8&888rL    N)__name__
__module____qualname____firstlineno____doc__rZ   __annotations__r\   r]   r_   rb   rc   rd   re   rf   propertyrl   __static_attributes__ro   rL   rB   rV   rV      st     "OS!E3NGSL#IuIu!OU!M5L#9 9rL   rV   c                    U c  g[        U [        R                  R                  R                  R
                  5      (       a  g[        U SS 5      =n(       a  SU;   a  gg)NFTpython_kernel_nameextern_kernels)r   r0   opsaten#_scaled_dot_product_flash_attentiondefaultgetattr)nodery   s     rB   is_gemm_liker      s]    |		::BB   &d,@$GGG
0
0rL   c                    SSK Jn  [        X5      (       a  [        S U R                   5       5      $ [        U R                  5      $ )Nr   GroupedSchedulerNodec              3  8   #    U  H  n[        U5      v   M     g 7frh   )contains_gemm_like).0xs     rB   	<genexpr>%contains_gemm_like.<locals>.<genexpr>   s     ?,Q%a((,   )r&   r   
isinstanceanyr7   r   r   )r:   r   s     rB   r   r      s4    >%..?%,,???EJJ''rL   c                    SSK Jn  [        X5      (       a-  U R                  (       a  U R                   H  nU" U5        M     g U" U 5        g )Nr   r   )r&   r   r   temp_groupingr7   )r:   fnr   _snodes       rB   _temp_group_visit_leavesr      s6    >%..53F3FllFvJ # 	5	rL   c                    SnU R                    HG  nU(       a  US-  nX#R                  5       -  nU(       d  M*  U[        U R                  5       5       -  nMI     U$ )Nr^   r>   )r7   get_namer.   get_buffer_names)r:   	with_bufsretns       rB   _group_namer      sV    
C\\3JCzz|9d5113456C  JrL   c                H    [        U [        5      =(       a    U R                  $ rh   )r   r   is_fake)ds    rB   _is_fake_depr      s    a!/aii/rL   c                j    SR                  U  Vs/ s H  oR                  5       PM     sn5      $ s  snf )N~)joinr   )gnsgns     rB   _group_namesr      s'    88S1Sr[[]S1221s   0c                    [        X5      n[        XU5      u  pEpg[        [        X5      5      nSUS'   [	        U5      n	UUUUUU	4$ )z*Initialize memory tracking data structures)r   r   N)r   r   dictzip_build_candidate_buffer_map)
r7   graph_inputsgraph_outputsname_to_freeable_input_bufpeak_memorysnodes_curr_memorysnodes_allocfreebuf_to_snode_last_use_curr_memorycandidate_buffer_maps
             rB   _initialize_memory_trackingr      sl    !7!M&	
 MK%5
 F78LL 77LM 	" rL   c                    0 n0 n[        U 5       H3  u  p4US:  a  XS-
     OSX'   U[        U 5      S-
  :  a  XS-      OSX$'   M5     U S   nXU4$ )z/Create double-linked list structure from snodesr   r   N)r4   len)r7   _prev_nextir:   _heads         rB   _initialize_double_linked_listr   
  sh     EEf%()Av!e}4()CK!O(;v!e} & 1IErL   c                    [        [        5      nU  H3  nUR                  R                   H  nX   R	                  U5        M     M5     [        U5      $ )ae  
Build inverted index: node -> set of buffers where node appears in successors.

This optimization reduces buffer iteration from O(total_buffers) to O(buffers_per_node).
Since buffer successors are immutable during reordering, this map doesn't need updates.

Returns:
    dict mapping each node to the set of buffers that have this node in their successors
)r   r   
mpi_buffer
succ_nodesaddr   )r   node_to_candidate_bufsbuf	succ_nodes       rB   r   r     sT     CNC %22I"-11#6 3 %
 &''rL   c           	     n    U  Vs0 s H#  o[        S UR                  5        5       5      _M%     sn$ s  snf )z
Pre-compute output name sets for all nodes.

This optimization avoids creating OrderedSet objects repeatedly during
exposed time calculations.

Returns:
    dict mapping each node to a set of its output names
c              3  @   #    U  H  oR                  5       v   M     g 7frh   r   r   os     rB   r   /_precompute_node_output_sets.<locals>.<genexpr>@  s     D0C1**,,0C   )r   get_outputs)r7   r:   s     rB   _precompute_node_output_setsr   3  s?     SYRXzD0A0A0CDDDRX  s   *2c                b    [        U 5      (       a  [        R                  $ [        R                  $ rh   )r   r   *reorder_sink_runtime_estimations_comm_mult.reorder_sink_runtime_estimations_non_comm_multr:   s    rB   _op_runtime_estimate_multr   D  s'     5!!FFFFFFrL   c                N    [        U R                  SS5      =n(       a  SU;   a  gg)z
Filtering out ops that contain Collective and Wait inside and considered as Collectives.
See contains_collective function.
If the op contains Wait inside - consider as Synchronous compute.
ry   Nz-torch.ops._dtensor.shard_dim_alltoall.defaultFT)r   r   )r:   ry   s     rB   is_async_collectiver   M  s-     %UZZ1EtLLL:>PPrL   c                "    [        U [        5      $ rh   )r   r   r   s    rB   contains_async_collectiver   Z  s    u&9::rL   c                P    / nU n Ub  UR                  U5        XA:X  a   U$ X$   nM"  )a!  
Traverse doubly-linked list from head to tail and return nodes as a list.

Args:
    head: Starting node of the segment
    tail: Ending node of the segment (inclusive)
    next_dict: Dictionary mapping each node to its next node

Returns:
    List of nodes from head to tail (inclusive)
)append)headtail	next_dictr   r   s        rB   _group_nodes_from_linked_listr   ^  s<      CA
=JJqM9J L rL   c                ,    X    nX1   n[        XT-  5      $ )zf
Check if a wait node corresponds to a given collective node.
Uses pre-computed sets for O(1) lookup.
)bool)collective_snode
wait_snodenode_output_setsnode_dep_setscollective_outs
unmet_depss         rB   !_is_corresponding_collective_waitr   y  s"     '8O*J
,--rL   c                  ^^ TU    nSmX0   nSn/ nX   n	U	b  XI   n
X-  (       a  USU	R                  5        S3-  nO[        U	5      (       a(  [        U	5      (       d  OUR                  U	5        X   n	M`  [	        U	5      (       a(  SnU H  n[        X	X45      (       d  M  Sn  O   U(       a  O=TnS
UU4S jjn[        X5        TnUSU	R                  5        S	X-
   S3-  nX   n	U	b  M  UTU4$ )z
Calculate exposed communication time by iterating directly over linked list.
Avoids O(N) list construction for each call.

The collective_snode is the starting point, iteration continues via next_dict.
        r^   z->W[]FTc                   > TTU    -  mg rh   ro   r   rc   runtimess    rB   accumulate_time9_coll_exposed_communication_time.<locals>.accumulate_time      &))IrL   +[r   r!   returnrX   )r   r   r   r   r   r   r   )r   r   r   r   r   rb   r   rf   collectives_foundr:   r   has_wait_for_collectives_found_collcomp_time_beforer   comp_time_afterrc   s     `             @rB    _coll_exposed_communication_timer     s;    )*II&8OL13'E

")
'd5>>#3"4A66Lu%%,U33!((/!(-2**4$-=  6:2 + .$	* 	* 	!8#!ENN,-Q/Q.RRSTT G 
J i--rL   c                  ^^ SnSmSn/ nX    n	U	b  [        U	5      (       a  UR                  U	5        [        U	5      (       aj  [        XXE5      (       a  TU	   nUSU	R	                  5        S3-  nOy[        U	5      (       d  SmX)   n	Mz  U H  n
[        XXE5      (       d  M  Sm  O   X)   n	M  TnSUU4S jjn[        X5        TnUSU	R	                  5        SX-
   S3-  nX)   n	U	b  M  UTU4$ )	z
Calculate exposed communication time for a wait operation by iterating
directly over linked list backwards. Avoids O(N) list construction.

Iterates from wait_snode backwards using prev_dict to find corresponding collective.
r   r^   z->C[r   c                   > TTU    -  mg rh   ro   r   s    rB   r   9_wait_exposed_communication_time.<locals>.accumulate_time  r   rL   r   r   r   )r   r   r   r   r   r   r   )r   r   	prev_dictr   r   r   rb   rf   waits_foundr:   wr   r   r   rc   s      `          @rB    _wait_exposed_communication_timer     s<    IIL+-K!E

u%u%%0#3  %UO	$u~~'7&8 ::,U33	!($A8"2  %(	 % "($	* 	* 	!8#!ENN,-Q/Q.RRSTT G 
J i--rL   c                p    X0   nU(       a  XU'   XcU'   XB   nU(       a  XU'   XtU '   X#U '   XU'   XP:X  a  U$ U$ )aQ  
Swap positions of candidate and group in doubly-linked list.

Transforms:
candidate_prev -> candidate -> group_head...group_tail -> group_tail_next
Into:
candidate_prev -> group_head...group_tail -> candidate -> group_tail_next

Args:
    candidate: Node to swap with group
    group_head: First node of group
    group_tail: Last node of group
    prev_dict: Dictionary mapping nodes to their previous nodes
    next_dict: Dictionary mapping nodes to their next nodes
    head: Current head of the linked list

Returns:
    New head of the linked list (may change if candidate was the head)
ro   )	candidate
group_head
group_tailr   r   r   candidate_prevgroup_tail_nexts           rB    _perform_double_linked_list_swapr     sa    8 )N$..!*j  +O%./"*i &i%j KrL   c                n   0 nSn	U(       d&  [        X4-
  Xr   S   U-
  UR                  -   5      n	X4$ U* n
U HS  nX{   S   U
-   nXU'   [        X5      n	UR                  U5      nUc  M2  U H  nXR                  R                  -  n
M     MU     Xr   S   U
-   UR                  -   nXU '   [        X5      n	X4$ )a  
Calculate potential peak memory after swapping candidate with group (reorder version).

Computes new memory levels for all affected nodes and returns the potential
peak memory along with cached post-allocation memory values for each node.

Args:
    candidate: Node being moved
    gns: Group nodes
    group_tail: Last node of group
    group_peak_memory: Current peak memory within the group
    candidate_delta_mem: Net memory change from candidate (alloc - free)
    candidate_allocfree: Candidate's allocation/free info
    group_n_to_bufs_after_swap_dealloc_by_candidate: Buffers whose deallocation moves to candidate
    curr_memory: Current memory state dict

Returns:
    Tuple of (potential_peak_memory, post_alloc_update_dict)
r   r   )max
size_allocgetr   	size_free)r   r   r   group_peak_memorycandidate_delta_memcandidate_allocfree/group_n_to_bufs_after_swap_dealloc_by_candidatecurr_memory_post_alloc_updatepotential_peakmem_after_reorder_deltar   gn_post_alloc_membufsr   candidate_mem_post_allocs                   rB   (_calculate_potential_peak_memory_reorderr  1  s    > 8:N:3#A&!"!,,-
 11 %8#7'OA.1HH!22^?>BB2F'>>+C+CC'   	"
!	"

(
(	) 
 %=y!BN--rL   c
                   U(       dH  U H  n
Xz   nUS   U-
  US   U-
  4Xz'   M     Xr   S   UR                   -   nXR                  -
  nUU4Xp'   gUR                  5        H  nU H  nXU'   M	     M     SnU HS  nUU   n[        S UU    5       5      nUU-  nU	U   =R                  U-  sl        UU	U   R                  -
  nUU4UU'   MU     X`   nX   =R                  U-  sl        XU    R                  -
  nUU4Xp'   g)a  
Update memory tracking structures after swap (reorder version).

Updates curr_memory, buf_to_snode_last_use, and snodes_allocfree dictionaries
to reflect the new memory state after swapping candidate with group.

Args:
    candidate: Node that was moved
    gns: Group nodes
    group_tail: Last node of group
    candidate_delta_mem: Net memory change from candidate (alloc - free)
    candidate_allocfree: Candidate's allocation/free info
    group_n_to_bufs_after_swap_dealloc_by_candidate: Buffers whose deallocation moves to candidate
    post_alloc_update: Cached post-allocation memory values
    curr_memory: Current memory state dict (mutated)
    buf_to_snode_last_use: Buffer to last-use node mapping (mutated)
    snodes_allocfree: Node allocation/free info dict (mutated)
r   r   Nc              3  N   #    U  H  nUR                   R                  v   M     g 7frh   r   r  r   r   s     rB   r   =_update_memory_tracking_after_swap_reorder.<locals>.<genexpr>  s!      2
I NN$$I   #%)r  r  r/   sum)r   r   r   r  r	  r
  post_alloc_updater  r   r   r   cm_candidate_post_alloc_mem_candidate_post_free_memr  r   "size_free_to_move_to_candidate_sumr   _gn_post_alloc_memsize_free_to_move_to_candidategn_post_free_memcandidate_post_free_mems                         rB   *_update_memory_tracking_after_swap_reorderr#  t  sy   < ;BB1++1++KO  #A&)<)G)GG 	" &(E(EE 	! &$"
 	 @FFHC)2#&  I /0&"3A"6.1 2
FqI2
 /
& 	+.LL*%%)GG% 25Ea5H5R5R R,.>?A  !2 <))-OO)!Y$?$I$II  	"KrL   c                    [        [        5      nUR                  U [        5       5      n[        U5      nU H!  nX'   nX;   d  M  XH   R	                  U5        M#     U$ )a^  
Find buffers whose last use will change after swapping candidate with group.

When we swap [candidate [group]] to [[group] candidate], some buffers that
were last used by a group node will now be last used by candidate instead.
This affects memory deallocation timing.

Args:
    candidate: The node being moved
    gns: Group nodes being swapped with candidate
    buf_to_snode_last_use: Mapping of buffers to their current last-use nodes
    candidate_buffer_map: Pre-computed map of node -> buffers using that node

Returns:
    Dict mapping group nodes to buffers that will change their last-use node
)r   r.   r  r   r   )	r   r   r   r   r
  candidate_bufsgns_setr   snode_last_uses	            rB   #_find_buffers_with_changed_last_user(    sb    0 	D 4 *--iFNoG.3$;KRRSVW 
 ;:rL   c                    [         R                  (       d%  [        U 5      (       a  SSU R                  5        34$ [         R                  (       d  [        U 5      (       a  gg)aA  
Check if a candidate node can be grouped with collective during reordering.

This pass processes collectives left to right, so we avoid grouping with
already-processed collectives based on configuration.

Args:
    candidate: Node to check for groupability

Returns:
    Tuple of (is_groupable, reason_if_not_groupable)
Fzcandidate contains_collective )Fr   TN)r   (reorder_iterative_group_with_collectivesr   r   )reorder_iterative_use_runtime_estimationsr   r   s    rB   _is_node_groupable_for_reorderr.    sU    $ @@$Y//01C1C1E0FG  AAi((.rL   c                  ^ U nU Vs0 s H  oXx   R                   _M     n	n[        U	 Vs/ s H  oU   PM	     sn5      n
[        U Vs/ s H  oU   R                  PM     sn5      nSU
 SU S3m/ SQnUR                  5        VVs/ s H  u  p[	        U5      UR
                  S-  UR                  S-  UR                  S-  UR                  S-  UR                   S-  UR                  UR                  UR                  UR                  UR                  /PM     nnn[        R                  R                  S5      (       a  SSKJn  TU" UUS	9-  mO8TS
-  mT[#        U5      S-   -  mTSR%                  ['        ["        U5      5      -  m[)        USU5      n[+        U5      U:X  d   e[-        UXV5      u  n    nTSU 3-  mTSU 3-  m[.        R1                  T5        [3        SS U4S jS9  U$ s  snf s  snf s  snf s  snnf )a  
Format reordering statistics, log them, and return final node list.

Computes improvement metrics, creates a formatted table (using tabulate if
available), validates the reordered node count, recalculates peak memory,
and logs all information.

Args:
    stats: Per-node reordering statistics
    head: Head of the reordered linked list
    next_dict: Linked list next pointers
    original_snodes_num: Original number of nodes (for validation)
    peak_memory: Initial peak memory before reordering
    name_to_freeable_input_buf: Buffer memory tracking info
    graph_outputs: Graph output names

Returns:
    Final reordered list of scheduler nodes
zAreorder_communication_preserving_peak_memory improved overlap by z
 ns after z reorders.
)zCollective nodecomm_time(us)comp_time(us)initial exposed(us)final exposed(us)improvement(us)limiting factorr\   r]   r_   rf        @@tabulater   r7  headers>Please `pip install tabulate` to nicely render overlap stats.

Nz
 peak_memory_before:z
 peak_memory_after:artifactc                     SSS.$ )NrT   stringnameencodingro   ro   rL   rB   <lambda>2_format_and_log_reordering_stats.<locals>.<lambda>a  s    B 
rL   c                    > T $ rh   ro   )reorder_log_strs   rB   rC  rD  e  s    ?rL   metadata_fn
payload_fn)rl   r  r\   itemsnode_summaryrb   rc   rd   re   rZ   r]   r_   rf   	importlibutil	find_specr7  rY   r   mapr   r   r   overlap_loginfor
   )statsr   r   original_snodes_numr   r   r   rS   r:   rl   total_improvementtotal_movesr:  	node_inforowsr7  
new_snodesnew_peak_memoryr>   rF  s                      @rB    _format_and_log_reordering_statsrZ    sL   8 JEOPZE*+777ZKP[I[E/[IJJGJ5%(..JGHK LL]K^ _l	, G6 !+ 0 0 2 !3E #%#%%%+##c)!!C'%%OO""""	
 !3 	   ~~
++%8
 	

 	M	
 	3w<$..499Sd^44.tT9EJz?1111=. OQ1 /}==O..?@@O_%
 + K QIG&s   G3G8	G=BHc                   SnU  H  n[        U5      (       d  M  Sn  O   U(       d  U 0 4$ [        U 5      n[        [        R                  R
                  R                  5       5      n[        [        R                  R                  5       5      n[        XU5      u  nnnn	n
nU  Vs0 s H  nU[        U5      [        U5      -  _M     nnU  Vs0 s H#  o"[        S UR                  5        5       5      _M%     nnU  Vs0 s H   nU[        S UR                   5       5      _M"     nn0 nSn[        U 5      u  nnn[        R                   nSnUn[        R"                  nSnUGb  UU   Gb  UU   nU(       a  GO[%        U5      (       d  UnM-  Ub  UU:  a  GOUS-  n['        5       =nUU'   [)        UUXU5      u  nnnUUl        UUl        UU-
  =Ul        Ul        UUl        UU   nUnUn 0 n!Sn"UU   S   n#[        UU   5      n$[        UU   5      n%UGbY  [        R4                  (       a5  UR0                  [        R6                  * UR*                  -  :  a	  S	Ul        GO[        R:                  (       d  [        U5      (       a	  S
Ul        GOU$U%-
  n&U&(       d  Sn'OUU   n([=        U(U&-  5      n'U'(       Ga  [?        U5      u  n)n*U)(       a  UnU$RA                  UU   5        U%RA                  UU   5        [        R4                  (       aC  [C        U5      (       a  [E        UUUUUU5      u  nnn+UU4U!U'   [%        U5      (       d  U"UU   -  n"[G        U#UU   S   5      n#U=RH                  S-  sl$        UU   nGMl  SURK                  5        SUR                  5        V,s/ s H  n,U,RK                  5       PM     sn, SU* 3n-U-Ul        GO[        R4                  (       Gab  [%        U5      (       d  UU   n.U.S:  a  [        U!5      S:  a  [G        SUR*                  UR,                  -
  5      n/[G        SUR*                  UR,                  -
  U.-
  5      n0U0U/-
  n1U!RM                  5        H/  u  n2n3[G        SU2U3-
  5      n4[G        SU2U3-
  U.-   5      n5U1U5U4-
  -  n1M1     U1S:  a  SU. SU1 3Ul        GOU!RO                  5        H  u  n6u  n2n3U2U3U.-
  4U!U6'   M     OjU"S:  ad  [)        UUXU5      u  nnn+[G        SUU-
  5      n/[G        SUU-
  U"-   5      n0U0U/-
  n1U1S:  a&  SURK                  5        SU" SU1 SU SU 3
Ul        GO*[Q        UU U5      n7UU   n8U8RR                  U8RT                  -
  n9[W        UU7X5      n:[Y        UU7U U#U9U8U:U5      u  n;n<U;U-
  U[        RZ                  -  :  a  SU; SU 3Ul        OU=R\                  S-  sl.        US-  n[_        UUU UUU5      n[)        UUXU5      u  nnnUUl        UUl        UUl        UU-
  Ul        [a        UU7U U9U8U:U<UU	U5
        U(       a4  SSK1J2n=  U=" UU7[g        U75      [Q        USU5      U
UUUUSU:5      nU(       a  OUU   nUb  GMY  UnUb	  UU   b  GM  [        Rh                  (       d  [Q        USU5      n>U>U4$ [k        UUUUUU
U5      n>U>U4$ s  snf s  snf s  snf s  sn,f )z|
Internal testing helper that also returns debug info.
Returns:
    - reordered snodes list
    - dict {snode: ReorderInfo}
FTc              3  @   #    U  H  oR                  5       v   M     g 7frh   r   r   s     rB   r   I_reorder_communication_preserving_peak_memory_internal.<locals>.<genexpr>       C/B!/Br   c              3  ^   #    U  H#  n[        U5      (       a  M  UR                  v   M%     g 7frh   r   rA  r   r   s     rB   r   r]           
4qLOFAFF4   --r   Nr   r    unexposed by runtime estimationszcollective ordering$data dependency detected
 candidate:z(outs:z)
 non_group_reason:candidate has compute z,, group contains waits, total_exposed_delta z
candidate z is collective, group_runtime:z, exposed_delta:z c_comm_time:z c_comp_time:peak memory new:	 vs base:!_debug_iterative_memory_recomputerT   )6r   r   r   r   graphr   r5   get_output_namesr   estimate_op_runtimer   	frozensetr   unmet_dependenciesr   r   (reorder_iterative_debug_limit_to_reorder(reorder_iterative_debug_memory_recomputer   rV   r   rb   rc   rd   re   rf   r,  )reorder_iterative_extra_comm_comp_overlaprZ   ,reorder_iterative_unsafe_collectives_reorderr   r.  updater   r   r  r]   r   r/   rJ  r   r  r  r(  r  $reorder_iterative_peak_memory_budgetr\   r  r#  comms_debugrj  r   reorder_sink_verbose_loggingrZ  )?r7   has_collectivesr:   rS  r   r   r   r   r   r   r   r   r   r   r   rR  rU  r   r   r    debug_num_collectives_to_reordernum_processed_collectivescurr debug_iterative_memory_recomputeiterative_recompute_error
_next_currrQ  rb   rc   rf   r   r   r   group_waitsgroup_runtimer  group_unmet_deps_namesgroup_output_namesdata_deps_namesdata_depcandidate_out_namesis_groupable_resultgrouping_reasonr>   r   msg	c_runtimeexposed_beforeexposed_afterexposed_deltagw_comm_timegw_comp_timegw_exposed_beforegw_exposed_aftergwr   r	  r  r
  r  r  rj  rX  s?                                                                  rB   rQ   rQ   k  s    Ou%%"O  rzf+$.qww/C/C/H/H/J$KL%/0H0H0J%KM 	$F-H"
 0E 	"5),Ee,LLL  0 RXAQWyCu/@/@/BCCCQW  A 	> E 	y 
!44
 
 	
 	  > 35EK8@E5% 	== % &'(-D== % !&

uT{64[
$(..D+7%)II!Q&!(]*uT{-M%].
*	9l #"4=	4IIt1($K	

(.q1 ",M$,?!@'(8(>?#EE""III..!! (J$ !MM'	22'<$ 57IIO"  '7y&A# 3o EF7U84#_ '!*J +11-	2JK&--.>y.IJ#MM(336V ) % % ( 0 -73Iy! 6?	5IK	28CC)Xi-@@M(+)<	+B1+E)% LLA%L %i 0I((1(:(:(<'=V[d[p[p[rDs[rVWQZZ\[rDsCt u//>.?A 
 ,/D(EEE0;; ( 3I 1}[)9A)=),Q0O)P(+t~~>J) )6(F:E:L:L:N6L,03A|l7R0S-/2 !<,#>#J0, *-=@Q-QQM ;O )1,"8 DNN[_!^ !0 " "-!2!2!4! % , , %1$09$<3"B "5 %q(2R%uh-3/	9a *-Q	I0E)F(+Ay9/D}/T(U(5(F(1,",Y-?-?-A,B C22? A22?i[Xefoep!r !0
 " ,IJ,C 0@	/J#..1D1N1NN    4s$9 < 9%''C 	 /N. , Q QQR '~&6i}M $ JJ!OJ1K4:z5%E 2ReX2.Iy, 'DN&DN ,D!*Y!6D6##?"%  0 K,M %1%uE.! $BC-) -j)IC #D K 
uT{6N 4425$F
5  1"J uo	0A>p Ets   %"Z:*Z?='[$[	
c                  ^^^^^^^^^^^^ 0 n0 m0 0 0 smmm[        U 5       H|  u  pVUR                  5        H  nXdU'   M	     UR                  5        H  nUTU'   M
     UTUR                  5       '   UR                  5       n	[        R
                  TU	'   STU	'   UTU	'   M~     Sn
U  H  nU(       ab  [        U5      (       aR  U
TUR                  5       '   UR                   H(  nTU   R                  5       n[        TU   U
5      TU'   M*     U
S-  n
Ml  U(       d  Mu  [        U5      (       d  M  STUR                  5       '   M      " UUUU4S jS5      mU  Vs0 s H   nU[        S UR                   5       5      _M"     snm/ m[        [        5      mU  Vs0 s H  of[        U5      _M     snmTR                  5        HN  u  pm[        U5      S:X  a  [         R"                  " TT" U5      5        U H  nTU   R%                  U5        M     MP     / mUUUUU4S jmU4S jmUUUU4S jnT(       aQ  [         R&                  " T5      R(                  nU(       a  [        U5      (       a	  U" U5        OT" U5        T(       a  MQ  TR+                  5        H  n[        U5      S:X  a  M   S	T 35       e   T$ s  snf s  snf )
a  
Schedule `snodes` for various comm optimization objectives.

Args:
    snodes: the nodes to be scheduled.
    raise_comms: whether to greedily schedule collectives as early as possible
    sink_wait: whether to greedily schedule waits as late as possible
    reorder_compute_for_overlap: whether to reorder compute nodes to
        optimize for compute/communication overlapping.

Returns:
    The new schedule order.

Some notes on the synergy between different options:
    - `raise_comms` provides more overlapping oppurtunies for `reorder_compute_for_overlap`.
    - When both `raise_comms` and `sink_waits` is `True`, `raise_comms` is prioritized.
r   r   c                  2   > \ rS rSrSU UUU4S jjrS rSrg)$_schedule_for_comm.<locals>.Runnablei  c                   > Xl         [        [        UR                  5       5      5      nTU   R	                  5       nTU   TU   TU   4U l        g rh   )r:   nextiterget_operation_namesr   score)rk   r:   rA  
fused_namename_to_fused_nodescores_0scores_1scores_2s       rB   __init__-_schedule_for_comm.<locals>.Runnable.__init__  sT    JU6689:D+D1::<J$$$DJrL   c                4    U R                   UR                   :  $ rh   r  )rk   others     rB   __lt__+_schedule_for_comm.<locals>.Runnable.__lt__  s    ::++rL   )r  r:   N)r   rX   )rp   rq   rr   rs   r  r  rw   )r  r  r  r  s   rB   Runnabler    s    	 		,rL   r  c              3  8   #    U  H  oR                   v   M     g 7frh   )rA  )r   deps     rB   r   %_schedule_for_comm.<locals>.<genexpr>  s     G.Fs((.Fr   c                   > TR                  U 5        U R                  5        HT  nTU    HH  n TU    R                  U5        [        TU    5      S:X  d  M+  [        R
                  " TT" U 5      5        MJ     MV     g)zE
Schedules `snode` and put all unblocked nodes onto the ready queue.
r   N)r   r   remover   heapqheappush)r:   buf_namer  buffer_usersready	scheduledr   s     rB   schedule$_schedule_for_comm.<locals>.schedule#  sl     	..0H%h/5!((2z%()Q.NN5(5/: 0 1rL   c                    > T V s/ s H=  n [        U R                  5      (       a  M  [        U R                  5      (       a  M;  U PM?     nn [        U5      S:X  a  g[	        US S9$ s  sn f )zP
Return the next node in the ready queue that's neither a collective or
a wait.
r   Nc                    U R                   $ rh   r  r   s    rB   rC  G_schedule_for_comm.<locals>.get_overlapping_candidate.<locals>.<lambda>:  s    QWWrL   key)r   r:   r   r   min)r   
candidatesr  s     rB   get_overlapping_candidate5_schedule_for_comm.<locals>.get_overlapping_candidate.  sf     
&qww/ 8Eagg8N  	 

 z?a:#455
s   A'A'A'c                  > [        U 5      (       d   eT" U 5        TU    nUS:  aQ  T" 5       =nbG  TR                  U5        T" UR                  5        UTUR                     -  nUS:  a  T" 5       =nb  MG  [        R                  " T5        g)z
Schedules collective node `snode`, along with one or more compute nodes
to overlap with it. The strategy is described in the comment of
`reorder_compute_for_overlap`.
r   N)r   r  r:   r  heapify)r:   collective_costr   r  r  r  snode_to_costs      rB   schedule_collective_for_overlap;_schedule_for_comm.<locals>.schedule_collective_for_overlap<  s     #5))))'.a799FLL#Y__%}Y__==O a799F 	erL   z;Detected unscheduled nodes. Nodes with unmet dependencies: )r4   r   r  r   sysmaxsizer   	ancestorsr  r   r   ro  r   rm  rJ  r   r  r  r   heappopr:   r/   )r7   rF   rG   rH   buf_name_to_snoderA   r:   r  op_name	node_namecomm_idxancestoranc_fused_namedepsr  r  r  r  r  r  r  r  r  r  r  r  r  r   s                   @@@@@@@@@@@@rB   rJ   rJ     s   L #%r2 Hh'
..0H*/h' 1 002G*/w' 3/45>>+,NN$	!kk! ( H.u55)1HU^^%&!OO!3H!=!F!F!H+.x/G+R( , MHZM%00)*HU^^%& , ,  <E 	zGe.F.FGGG<J
 E=H=TLDJKF5/66FKM!'')t9>NN5(5/2C!!%(  * I	; 	;6 * e$**#6u#=#=+E2UO % !!#4yA~ 	
I*V	
~ $ U< Ls   'J7J<c           
        [         R                  R                  5       (       d  U $ U  Vs/ s H  n[        U5      (       d  M  UPM     nn[	        S[        U5      5       H^  n[        [        XE   R                  5       5      5      nXES-
     R                  5        H  nXE   R                  [        XvSS95        M      M`     U $ s  snf )z
Decide global ordering of comms, by just enforcing the ordering that's in the input graph
(might not be the same ordering as the eager mode program).
TODO: Come up with a better approach
r   Tmutating_bufr   )r0   r)   is_availabler   r,   r   r  r  r   add_fake_depr   )nodesname_to_bufr  r   
comm_nodesr   r  r   s           rB   decide_global_ordering_of_commsr  _  s     ))++"=U&9!&<!UJ=1c*o&D!?!?!ABC!e$557CM&&E 8 ' L >s   CCc                      \ rS rSr% SrS\S'   SrS\S'   SrS\S'   SrS\S	'   S
r	S\S'   Sr
S\S'   SrS\S'   SrS\S'   SrS\S'   S
rS\S'   \S 5       rSrg)SinkWaitInfoiw  r   r[   r]   r^   rY   r_   r\   
moves_inforX   rZ   r`   ra   rb   rc   rd   re   rf   c                4    U R                   U R                  -
  $ rh   ri   rj   s    rB   rl   SinkWaitInfo.improvement  rn   rL   ro   N)rp   rq   rr   rs   r]   ru   r_   r\   r  rZ   rb   rc   rd   re   rf   rv   rl   rw   ro   rL   rB   r  r  w  sw    GSL#E3NJ!OS!IuIu!OU!M5L#9 9rL   r  c                V   [        U 5      (       a  SSU R                  5        34$ [        U 5      (       a  SSU R                  5        34$ [        R                  (       dJ  [        U 5      (       a  SSU R                  5        34$ [        U 5      (       a  SSU R                  5        34$ g)a0  
Check if a candidate node can be grouped during sink_waits pass.

Sink Waits traverses waits right to left, so we don't group with
processed waits on the right or with async collectives.

Args:
    candidate: Node to check for groupability

Returns:
    Tuple of (is_groupable, reason_if_not_groupable)
Fzcandidate contains wait $candidate contains_async_collective zcandidate contains collective zcandidate contains gemm_like r*  )r   r   r   r   &sink_iterative_use_runtime_estimationsr   r   r-  s    rB   !_is_node_groupable_for_sink_waitsr    s    " Y01C1C1E0FGGG ++293E3E3G2HI
 	

 >>
 y))01C1C1E0FG  i((/	0B0B0D/EF  rL   c	                `   US   n	Xy   S   X   R                   -
  n
U(       d@  XR                   -   nUXR                  -
  4Xp'   U H  nX|   nUS   U-   US   U-   4X|'   M     gU /UQ HC  nX^   nX   =R                  UR                  US5      -  sl        UXU   R                  -
  4X~'   ME     g)a  
Update memory tracking structures after swap (sink_waits version).

Updates curr_memory and snodes_allocfree dictionaries to reflect the new
memory state after swapping candidate with group.

Args:
    candidate: Node that was moved
    gns: Group nodes
    candidate_delta_mem: Net memory change from candidate (alloc - free)
    candidate_allocfree: Candidate's allocation/free info
    group_n_to_bufs_after_swap_dealloc_instead_of_candidate: Buffers whose deallocation moves from candidate to group
    post_alloc_update: Cached post-allocation memory values
    size_free_delta_update: Cached size-free delta values
    curr_memory: Current memory state dict (mutated)
    snodes_allocfree: Node allocation/free info dict (mutated)
r   r   N)r  r  r  )r   r   r  r	  7group_n_to_bufs_after_swap_dealloc_instead_of_candidater  size_free_delta_updater  r   r   pre_group_memcandidate_post_allocr   r  r   
post_allocs                   rB   -_update_memory_tracking_after_swap_sink_waitsr    s    8 QJ+A.1A1M1X1XXMB,/M/MM  #@#@@"
 BB1++1++KO  	#&)
%%)?)C)CAq)II%!,666
 rL   c	                   Xr   S   X   R                   -
  n	0 n
0 nSnU(       d  [        X4-   XR                   -   5      nXU4$ XR                   -   nXU '   Un[        S [        R                  R                  UR                  5       5       5       5      nU* X'   XN-   nU HX  nUU   S   U-   nUU
U'   [        UU5      nSnUU;   a,  UU   nU H  nUUR                  R                  -  nM     UUU'   UU-  nMZ     XU4$ )aO  
Calculate potential peak memory after swapping candidate with group (sink_waits version).

Computes new memory levels for all affected nodes and returns the potential
peak memory along with cached post-allocation and size-free delta values.

Args:
    candidate: Node being moved
    gns: Group nodes
    group_head: First node of group
    group_peak_memory: Current peak memory within the group
    candidate_delta_mem: Net memory change from candidate (alloc - free)
    candidate_allocfree: Candidate's allocation/free info
    group_n_to_bufs_after_swap_dealloc_instead_of_candidate: Buffers whose deallocation moves from candidate to group
    curr_memory: Current memory state dict
    snodes_allocfree: Allocation/free info for all nodes

Returns:
    Tuple of (potential_peak_memory, post_alloc_update_dict, size_free_delta_update_dict)
r   c              3  N   #    U  H  nUR                   R                  v   M     g 7frh   r  r  s     rB   r   >_calculate_potential_peak_memory_sink_waits.<locals>.<genexpr>  s%      &
C 	  
r  )	r  r  r  	itertoolschainfrom_iterabler/   r   r  )r   r   r   r  r  r	  r  r  r   r  r  _size_free_delta_updater  r  candidate_size_free_to_move	delta_memr   gn_post_allocgn_size_free_to_addr  r   s                        rB   +_calculate_potential_peak_memory_sink_waitsr    sT   >  +A.1A1M1X1XXM79<>NB3:::
 3JJJ(+I+II$8y!)N"% &??00CJJL
& # +F)E&#AI#B*Y6!.2^];HHJ2ND#s~~'?'??# *=#B'((	  /FFFrL   c                p    X1   nU(       a  XU'   XcU '   X@   nU(       a  X#U'   XtU'   XU'   XU '   X:X  a  U $ U$ )a  
Swap positions of candidate and group in doubly-linked list (sink_waits version).

Transforms (moves candidate to the left):
group_head_prev -> group_head...group_tail -> candidate -> candidate_next
Into:
group_head_prev -> candidate -> group_head...group_tail -> candidate_next

Args:
    candidate: Node to swap with group
    group_head: First node of group
    group_tail: Last node of group
    prev_dict: Dictionary mapping nodes to their previous nodes
    next_dict: Dictionary mapping nodes to their next nodes
    head: Current head of the linked list

Returns:
    New head of the linked list (may change if group_head was the head)
ro   )r   r   r   r   r   r   group_head_prevcandidate_nexts           rB   +_perform_double_linked_list_swap_sink_waitsr  /  sa    8  +O%./"*i )N$..!*j &j%i KrL   c                .  ^ / SQnU R                  5        VV	s/ s H  u  p[        U5      U	R                  S-  U	R                  S-  U	R                  S-  U	R
                  S-  U	R                  S-  U	R                  U	R                  U	R                  U	R                  U	R                  U	R                  /PM     n
nn	Sm[        R                  R                  S5      (       a  SSKJn  TU" U
US9-  mO8TS-  mT[#        U5      S	-   -  mTS	R%                  ['        ["        U
5      5      -  m[(        R+                  T5        [-        US
U5      n[/        U5      U:X  d   e[1        XU5      u  n    nTSU 3-  mTSU 3-  m[3        SS U4S jS9  U$ s  sn	nf )a  
Format sink_waits statistics, log them, and return final node list.

Computes improvement metrics, creates a formatted table (using tabulate if
available), validates the reordered node count, recalculates peak memory,
and logs all information.

Args:
    stats: Per-node sink_waits statistics
    head: Head of the reordered linked list
    next_dict: Linked list next pointers
    original_snodes_num: Original number of nodes (for validation)
    peak_memory: Initial peak memory before reordering
    name_to_freeable_input_buf: Buffer memory tracking info
    graph_outputs: Graph output names

Returns:
    Final reordered list of scheduler nodes
)z	Wait noder0  r1  r2  r3  r4  r5  r]   r_   r\   r  rf   r6  r^   r7  r   r8  r9  r;  r<  Nz*
 sink_waits_iterative peak_memory_before:z)
 sink_waits_iterative peak_memory_after:r=  c                     SSS.$ )Nsink_waits_iterative_infor?  r@  ro   ro   rL   rB   rC  2_format_and_log_sink_waits_stats.<locals>.<lambda>  s    / 
rL   c                    > T $ rh   ro   )log_strs   rB   rC  r    s    7rL   rG  )rJ  rK  rb   rc   rd   re   rl   rZ   r]   r_   r\   r  rf   rL  rM  rN  r7  rY   r   rO  rP  rQ  r   r   r   r
   )rR  r   r   rS  r   r   r   r:  r:   rQ  rW  r7  rX  rY  r>   r  s                  @rB    _format_and_log_sink_waits_statsr  `  s   8G: !;;= )KE NNS NNS   3&$s"  LLJJOO	
 ) 	 " G~~
++%8
 	

 	TT3w<$&&499Sd^,,W.tT9EJz?1111= OQ1 <[MJJG;O;LMMG
 # [s   B Fc                   [        [        5      nUR                  U [        5       5      nU HP  nX&   nXp:w  a  M  UR                  R
                  nSn	U H  n
X;   d  M
  U
n	M     U	c  M=  XI   R                  U5        MR     U$ )a}  
Find buffers whose last use will change after swapping in sink_waits pass.

When we swap [group] candidate to candidate [group], some buffers that
were last used by candidate will now be last used by a group node instead.
This is the opposite direction from the reorder version.

Args:
    candidate: The node being moved (currently last use)
    gns: Group nodes being swapped with candidate
    buf_to_snode_last_use: Mapping of buffers to their current last-use nodes
    candidate_buffer_map: Pre-computed map of node -> buffers using that node

Returns:
    Dict mapping group nodes to buffers that will change their last-use node
N)r   r.   r  r   r   r   r   )r   r   r   r   r  r%  r   r'  r   last_succ_gnr   s              rB   ._find_buffers_with_changed_last_use_sink_waitsr    s    0 	D < *--iFN.3& ^^..
B!   	@MTT	
% , CBrL   c                   [        U 5      nUS:X  a  U 0 4$ [        [        R                  R                  R                  5       5      n[        [        R                  R                  5       5      n[        XU5      u  nnnnnn	[        U 5      u  pn0 nU  Vs0 s H  nU[        U5      [        U5      -  _M     nnU  Vs0 s H#  o[        S UR                  5        5       5      _M%     nnU  Vs0 s H   nU[        S UR                   5       5      _M"     nnU S   n[        5       n[        R                  n[        R                   nSnUGby  U
U   Gbr  U
U   nU(       a  GOdUb  [        U5      U:  a  GOP[#        U5      (       a  UU;  d  UnMG  UR%                  U5        ['        5       =nUU'   [)        UXUUU5      u  nnnUU-
  =Ul        Ul        UUl        UUl        UUl        UU   nUnUn0 nSn UU   S   n![        UU   5      n"[5        U5      n#UGb  [        R6                  (       a5  UR,                  [        R8                  * UR.                  -  :  a	  SUl        GOSU"(       d  Sn$OUU   n%[=        U%U"-  5      n$[        R>                  (       d*  [A        U5      (       a  SURC                  5        3Ul        GOU#=(       a    [5        U5      n&U$(       d  U&(       Ga  [E        U5      u  n'n(U'(       a  UnU"RG                  UU   5        U#=(       d    [5        U5      n#[        R6                  (       aA  [5        U5      (       a1  [I        UXUU5      u  nnn)UU4UU'   [A        U5      (       d  U UU   -  n [K        U!UU   S   5      n!U=RL                  S	-  sl&        UU   nGM  U$(       d7  [        RN                  (       d!  U&(       a  S
URC                  5        3Ul        GOOSURC                  5        SU( 3Ul        GO[        R6                  (       GaS  [Q        URR                  5      (       aJ  [)        UUU
UUU5      u  nnn)[K        SUU-
  5      n*[K        SUU-
  U -   5      n+U+U*:  a  SU* SU+ 3Ul        GO;[A        U5      (       d  UU   n,U,S:  a  [        U5      S:  a  [K        SUR.                  UR0                  -
  5      n-[K        SUR.                  UR0                  -
  U,-
  5      *   URU                  5        H+  u  n.n/U-[K        SU.U/-
  5      [K        SU.U/-
  U,-   5      -
  -  n-M-     U-S:  a  SU, SU- 3Ul        GOsURW                  5        H  u  n0u  n.n/U.U/U,-
  4UU0'   M     [Y        UUU5      n1UU   n2U2RZ                  U2R\                  -
  n3[_        UU1Xy5      n4[a        UU1UU!U3U2U4UU5	      u  n5n6n7U5U-
  U[        Rb                  -  :  a  SU5 SU 3Ul        OU=Rd                  S	-  sl2        U=Rf                  SURC                  5        3-  sl3        [i        UUUXU5      n[)        UXUUU5      u  nnnUUl        UUl        UU-
  Ul        UUl        [k        UU1U3U2U4U6U7UU5	        U(       a4  S	SK6J7n8  U8" UU1[q        U15      [Y        US U5      UUUUUSU45      nU(       a  OUU   nUb  GM  UnUb	  U
U   b  GMr  [        Rr                  (       d  [Y        US U5      n9U9U4$ [u        UUUUUUU5      n9U9U4$ s  snf s  snf s  snf )Nr   c              3  @   #    U  H  oR                  5       v   M     g 7frh   r   r   s     rB   r   1_sink_waits_iterative_internal.<locals>.<genexpr>  r^  r   c              3  ^   #    U  H#  n[        U5      (       a  M  UR                  v   M%     g 7frh   r`  ra  s     rB   r   r    rb  rc  Fr   rd  r  r   z$collective ordering
 with candidate:re  z
 non_group_reason:z"candidate is wait, exposed_before:z vs exposed_after:rf  z2, group contains collectives, total_exposed_delta rg  rh  r   ri  sink_waits_iterative);r   r   r   rk  r   r5   rl  r   r   rm  r   rn  r   ro  r   rq  (sink_waits_iterative_debug_limit_to_sinkr   r   r  r   rd   re   rb   rc   rf   r   r  &sink_iterative_extra_comm_comp_overlaprZ   r   *sink_waits_iterative_swap_with_collectivesr   r   r  rt  r   r  r]   /sink_waits_iterative_unsafe_collectives_reorderr   r   r/   rJ  r   r  r  r  r  !sink_iterative_peak_memory_budgetr\   r  r  r  rv  rj  r   rw  r  ):r7   rS  r   r   r   r   r   r   r   r   r   r   r   rR  r:   r   r   r   r{  processed_waitsr|  debug_num_sink_waits_to_reorderr}  
_prev_currrQ  rb   rc   rf   r   r   r   group_collsr  r  r  group_contains_collectiver  candidate_dep_namesboth_contain_comms_is_groupablegroupable_reasonr>   r  r  r  r  gc_comm_timegc_comp_timegcr   r	  r  r  r  r  r  rj  rX  s:                                                             rB   _sink_waits_iterative_internalr    s    f+arz$.qww/C/C/H/H/J$KL%/0H0H0J%KM 	$F-H" 9@E%35E 0E 	"5),Ee,LLL  0 RXAQWyCu/@/@/BCCCQW  A 	> E 	y 
!44
 
 	
 	  > )/r
D lO== % 	== $ !&

uT{64[
$+7O$(GGd##O(CDD!)^+uT{-M%*:M.
*	9l 5>	4IIt1""($K	

(.q1 ((8(>?$7$=!#BB""FFFWX (J$ &  '4I&># 36H HI  JJ,Y77>y?Q?Q?S>TU (  "; "?R@ -2S3// !!*J '--.>y.IJ1S5H5S .
 %KK/	::2R%u8H-3/	9a 3<Y1GI.8CC)Xi-@@M(+)<	+B1+E)% LLA%L %i 0I!(XX.11:1C1C1E0FH , ((1(:(:(<'=./?.@B (
 BBB9>>** /O! (%/+Iy! &)I	,A%BN$'9y+@=+P$QM %~5//=.>>PQ^P_a ,  1;; !) 3I 1}[)9A)= ), NNT^^;)
 Q ?) KLL:E:L:L:N6L,)SL<4O-PSV !<,#>#JT . M ;O
 )1,"8 D88E!H !0 " "-!2!2!4! % , , %1$09$<3"B "5 ,IJ,C 0@	/J#..1D1N1NN   ?s$9 D <%''K $
 HN.0G , N NNO '~&6i}M $ JJ!OJOO9#5#5#7"899O?:z5E 2ReH.>2.Iy, 'DN&DN!*Y!6D ,D9##G"' 
 0J,M %1%uE.! $*K-) -j)IQ #R Y 
uT{6\ 4425$F
5  1"J uo	0A>s   "[?=*\-'\	c                    [        U 5      S   $ )a_  
Similarly to reorder_communication_preserving_peak_memory this pass will try to iteratively
push Wait nodes later, recomputing estimated peak memory before each swap,
and preventing peak memory regressions.

Pass will be applied to every Wait node. If there are immediate dependencies with next node,
pass will try to group them together and on the next step to swap the group with next candidate.

If _inductor.config_comms.sink_iterative_use_runtime_estimations is set True,
pass will stop reordering of Wait once corresponding Collective is unexposed,
based on runtime estimations.

inductor.config_comms.sink_iterative_peak_memory_budget allows to tune how much pass
can regress initial peak memory.
E.g.:
sink_iterative_peak_memory_budget == 0.0 - No regression of initial peak memory is allowed
sink_iterative_peak_memory_budget == 0.2 - Pass can improve comm-compute overlap, sacrificing
20% of initial peak memory value.

inductor.config_comms.sink_iterative_extra_comm_comp_overlap config allows to more aggressively
sink waits, stopping only when overlap_compute >= (1 + extra_comm_comp_overlap) * comm_time
r   )r  rK   s    rB   r  r  @  s    . *&1!44rL   c                    [         R                  S:X  a  U R                  5       nU$ [        [         R                  5      (       d   e[         R                  " U 5      nU$ )z3
Returns estimated op runtime in milliseconds (ms)
r~   )r   rm  r'   callable)r:   runtimes     rB   rm  rm  Z  sU     !!Y.--/ N 223333,,U3NrL   c           
        U R                  5       n[        U5      S:X  Ga  Sn[        U R                  [        R
                  [        R                  45      (       a  SU R                  5        Vs/ s H  o3R                  5       PM     sn 3nSU R                   Vs/ s H  oUR                  PM     sn 3nSU R                  5        SU R                  R                   SU SU S	3	nU R                  5        Vs/ s H  owR                  R                  5       PM     nnS
R                  U V	s/ s HA  n	[        U	[        R                  5      (       a  SU	R                   SU	R                    S	3OSPMC     sn	5      n
 U R                  R#                  5       nU R                  R&                  R(                   U U
 SU SU R+                  5       S S3$ / nU H  nUR-                  [/        U5      5        M     U R&                  R(                   SSR                  U5       3$ s  snf s  snf s  snf s  sn	f ! [$         a    Sn Nf = f)Nr   r^   zouts:zins: z (z)
 (),z (size=z	, stride=z.0fz ns): z, )	get_nodesr   r   r   r   ExternKernelOut_CollectiveKernelr   r   ro  rA  ry   get_output_specr   Layoutsizestridemaybe_get_nameAttributeError	__class__rp   r'   r   rK  )r:   r7   detailr   outs_strr   ins_strchildlayoutslayoutout_tensor_infor  	summarieschild_snodes                 rB   rK  rK  f  s&   __F
6{aejj2#5#5r7K7K"LMMe6G6G6IJ6I

6IJKLHe.F.FG.Fff.FGHIG)*"UZZ-J-J,K4PXzYZ[bZccdeF=B__=NO=NE::--/=NO((
 &	 &F fbii00 &++ia@ &	
	

113I **&&//08II;VXY^YtYtYvwzX{{  A  	A Ik23 oo&&'r$))I*>)?@@/  KGO  	I	s+   /H"H'9#H,-AH1=H6 6IIc                X   SnS nS n[        U 5       H  u  pEUci  [        U5      (       a  U[        U5      -  nUR                  nO)[	        UR                  5      (       a  OU[        U5      -  nU" U[        U5       5        Mq  [        U5      (       a/  U[        U5      -  nUR                  nU" U[        U5       5        M  [	        UR                  5      (       a  U" U[        U5       5        S nM  U" US[        U5       35        M     [        R                  SUS-  S-   35        g )Nr   c                :    [         R                  U S SU 35        g )Nz>6r#  )rP  debug)stepr  s     rB   step_log#visualize_overlap.<locals>.step_log  s    T"IRu-.rL   z| zEst. runtime (ms): i  )r4   r   rm  r   r   rK  rP  r9  )ordertotal_est_runtimecur_comm_noder;  r:  r:   s         rB   visualize_overlapr@    s#     #M/ !' "5))!%8%??! %

$$ !%8%??!Tl5124"5))!%8%??! %

,u"5!68$$,u"5!68 $L$7#89:- (. 
/$6=>?rL   c                    U n[         R                   HV  n[        U[        5      (       a  U[	        5       ;   a  [	        5       U   n[        U5      (       d   SU S35       eU" U5      nMX     U$ )Nz3Invalid reorder_compute_and_comm_for_overlap pass: z is not callable)r   'reorder_for_compute_comm_overlap_passesr   rY   globalsr  )r7   r=  ps      rB   $reorder_compute_and_comm_for_overlaprE    sm     E;;a!wy.	!A{{ 	
A!DTU	
{ % < LrL   c           
     <	  ^^^^^^ [        U R                  5      m[        [         5      m[        [         5      m[        T5       H  u  pUR                  S:X  d  M  UR
                  [        R                  R                  R                  R                  L d  MT  UR                  S   R                  S:X  d   SU SUR                  S    S35       eUR                  S   nUR                  S   nUS:  a  TU   R                  U5        M  TU   R                  U5        M     UUU4S jn[        [         5      n[        T5       H  u  pUR                  S:X  d  M  UR
                  [        R                  R                  R                  R                  L d  MT  UnUR                  S   mTR                  S:X  d   S	T S
U  S35       eU" T5      (       d  M  UT   R                  U5        M     S nS mT H  nUR                  S:X  d  M  [        UR
                  [        R                   R"                  5      (       d  MJ  UR
                  R$                  R&                  (       d  Mq  U" U5      (       a  M  T" X&R)                  5       5      (       d  M   SU S35       e   UR+                  5        GH3  u  mn	[        U	5       GH  u  pTU   nUR                  S   TL d   eUR                  u  nmUS-   nU
[-        U	5      S-
  :  a  XS-      O[-        T5      S-
  nTX n[/        UU4S jU 5       5      (       a   ST SU SU  S35       eU H  nUR                  S:X  d  M  TUR                  ;   d  M'  UR
                  [        R                  R                  R                  R                  :w  d  Me  [1        UU4S jUR                   5       5      nUUl        M     GM     GM6     UR3                  5        H"  n	U	 H  nTU   nU R5                  U5        M     M$     T Hx  nUR                  S:X  d  M  UR
                  [        R                  R                  R                  R                  L d  MR  UR                  S   U;   d  Mg  U R5                  U5        Mz     g)ab  
This FX graph pass replaces uses of FSDP2 unsharded params with their corresponding
graph intermediates that were fsdp.copy_ into the unsharded params in the original graph.

NOTE: Can only apply this pass to any of the FSDP2 unsharded params that have this pattern
(or repetition of): `resize_(full) -> copy_ -> resize_(0)`. Because of this, for partial-graph case
where `resize_(full) -> copy_` is in one graph and `resize_(0)` is in another graph, we can't
remove these resize and copy ops and thus we will have worse performance there.

In other words, "do we try to remove all the resize_(full) -> copy_ -> resize_(0) nodes for this unsharded param"
is actually a per-unsharded-param decision, since for each unsharded param, we look at its resize sequence pattern
(in `check_resize_pattern()`) to determine if its set of resize and copy nodes can be removed.
call_functionr   placeholderz1Resize can only operate on graph inputs, but got z# which is resizing non-graph-input r<  r   c                n  > TR                  U / 5      nTR                  U / 5      n[        U5      [        U5      :w  a2  [        R                  SU  S[        U5       S[        U5       S35        g[	        X5       H7  u  p4X4:  d  M  [        R                  SU  STU    SU S	TU    SU S
35          g   g)NzH
Unequal number of resize-to-full and resize-to-0 nodes for graph input z:
z vs. zK.
Skipping `remove_fsdp2_unsharded_param_graph_input_usage` FX graph pass.
Fz
For graph input z: resize-to-full node z
 at index z 
happens after resize-to-0 node zd.
Skipping `remove_fsdp2_unsharded_param_graph_input_usage` FX graph pass for that unsharded param.
T)r  r   logwarningr   )graph_inputresized_to_full_idxesresized_to_0_idxesresize_to_full_idxresize_to_0_idx&graph_input_to_resized_to_0_node_idxes)graph_input_to_resized_to_full_node_idxes	node_lists        rB   check_resize_patternLremove_fsdp2_unsharded_param_graph_input_usage.<locals>.check_resize_pattern  s    !J M M!
 DGGUWX$%-?)@@KKHHS} U E#&8"9!: ;  47!4
/ "43I>P4Q3RR\]o\p q  )/ :;:oEV W 4
 rL   z\
Assumed all FSDP2 `unsharded_param`s to be graph input, but it's not true!
Offending node: z	. Graph: c                    U R                   [        R                  R                  R                  R
                  L =(       d:    U R                   [        R                  R                  R                  R
                  L $ rh   )targetr0   r{   fsdpcopy_r~   inductorresize_storage_bytes_)r   s    rB   is_allowed_mutationKremove_fsdp2_unsharded_param_graph_input_usage.<locals>.is_allowed_mutation	  sO    KK599>>//777 O{{eii00FFNNN	
rL   c           	        [        U R                  [        R                  R                  5      (       aj  [        U R                  R                  R                  5       VVs/ s H3  u  p#UR                  c  M  UR                  R                  (       d  M1  UPM5     snnO/ n[        U Vs/ s H6  n[        U R                  U   R                  S   R                  5       5      PM8     sn5      n[        U Vs/ s H)  n[        UR                  S   R                  5       5      PM+     sn5      n[        XW-  5      S:  $ s  snnf s  snf s  snf )Nvalr   )r   rW  r0   _ops
OpOverloadr4   _schema	arguments
alias_infois_writer   r   argsmetauntyped_storager   )r   unsharded_paramsr   r   mutated_arg_idxesmutated_node_arg_storagesunsharded_paramstorages_of_unsharded_paramss           rB   -is_node_mutating_unsharded_param_or_its_aliaseremove_fsdp2_unsharded_param_graph_input_usage.<locals>.is_node_mutating_unsharded_param_or_its_alias	  s)    $++uzz'<'<== &dkk&9&9&C&CDDDA<< 010E0E D  	 %/ +*A tyy|007GGIJ*%
! (2 (8'7O 33E:JJLM'7(
$ ,KLqPP)s    D=7D=D=)=E60EzdUser mutation on FSDP2 unsharded param is not allowed when Traceable FSDP2 is used. Violating node: c              3  8   >#    U  H  nT" UT/5      v   M     g 7frh   ro   )r   r   rn  rl  s     rB   r   Aremove_fsdp2_unsharded_param_graph_input_usage.<locals>.<genexpr>Y	  s%      *D >d_DUVV*s   z(Assumed no ops mutating unsharded param z in subgraph z, but it's not true!
Graph: c              3  6   >#    U  H  nUTL a  TOUv   M     g 7frh   ro   )r   argreplacementrl  s     rB   r   rq  f	  s$      %#,C (+o'=3F#,s   N)r.   r  r   r4   oprW  r0   r{   rZ  r[  r~   rf  r   rX  rY  r   r`  ra  rb  
is_mutabler5   rJ  r   r   tupler/   
erase_node)rk  rA   r   rL  new_sizerT  'unsharded_param_to_fsdp_copy_node_idxesfsdp_copy_noder\  fsdp_copy_node_idxesr   fsdp_copy_node_idxr>   subgraph_start_idxsubgraph_end_idxsubgraph_nodesnew_argsrQ  rR  rn  rS  rt  rl  s                    @@@@@@rB   .remove_fsdp2_unsharded_param_graph_input_usager    sS    U[[!I 1<D0A--8->*y)	GG&uyy11GGOOO99Q<??m3  :2267Z[_[d[def[gZh i6 3 ))A,Kyy|H!|9+FMMcR6{CJJ3O *"J /:$.?+y)	77o%$++9M9M9U9U*U!N"iilO"%%6  = !5' 29 6 $O447HOOPST *
Q4 GG&4;;

(=(=>>##...'--DBBD  eeidj k  : 
1	6	6	8	%./C%D!A&'9:N!&&q)_<<<+00NA{!3a!7 s/0144 %U+^a' 
 ''9KN *   ))8(9~FV Ww   'GG.'4994uyy'9'9'O'O'W'WW$ %#'99%  H !)DI ') &E 
9D !H N N P"6&'9:N^, #7 !Q GG&uyy11GGOOO		! GGT" rL   c                  ^	  SS K m	T	R                  R                  5       (       d   eT	R                  R                  R
                  (       a%  T	R                  R                  R                  (       d   e SSK
JnJnJnJnJn   U	4S jnU" 5       nU" U" T	R                  R                  R
                  R                   U" ["        R$                  U" T	R                  R&                  R(                  R                   U" S5      U" S5      U" S5      U" S5      U" S	5      5      U" S
5      5      U" S5      U" S5      5      US S9SU	4S jj5       nU" U 5        UR+                  U 5        g ! [        [        [        4 a     g f = f)Nr   r   )CallFunction
KeywordArgMatchPatternMatcherPassregister_graph_patternc                X  > [        U R                  5      nU H  nUR                  [        R                  L d  M"  UR
                  S   R                  TR                  R                  R                  R                  L d  Mh  UR
                  S   S:X  d  M}  U R                  U5        M     g )Nr   r   )r.   r  rW  operatorgetitemrf  r{   rX  all_gather_copy_inr~   rx  )grS  r   r0   s      rB   remove_unused_getitem8reinplace_fsdp_all_gather.<locals>.remove_unused_getitem	  sp    M	AH,,,FF1I$$		(I(I(Q(QQFF1INQ rL   all_gather_inputsall_gather_outputinp_split_sizesall_gather_input_numelrankitem_idx
group_size
group_namec                &    U R                   S   S:H  $ )Nr  r   )kwargs)matchs    rB   rC  +reinplace_fsdp_all_gather.<locals>.<lambda>	  s    %,,z":a"?rL   )	pass_dictextra_checkc                n   > U4S jnU R                  UUS   US   US   US   US   US   US   /5        g )	Nc                    > U S S nU S   nU S   nTR                   R                  R                  R                  " U6 nUS   nUS   nTR                   R                  R
                  R                  XRX6S9nU$ )Nr  r   r   )out)r{   rX  r  r~   _c10d_functionalall_gather_into_tensor_out)	rf  copy_in_argsr  r  r  r  	getitem_1all_gather_into_tensorr0   s	           rB   replEreinplace_fsdp_all_gather.<locals>.reinplace_all_gather.<locals>.repl	  s      9LbJbJ!&!B!B!J!J" )+G*1-I		**EEMM N  #
 *)rL   r  r  r  r  r  r  r  )replace_by_example)r  rf  r  r  r0   s       rB   reinplace_all_gather7reinplace_fsdp_all_gather.<locals>.reinplace_all_gather	  s[    .	*$ 	  *+*+()/0v|$|$	
rL   )r  r  )5torch.distributed.fsdp._fully_shard._fsdp_collectivesr)   r  r{   r  r  r  ImportErrorr,  AssertionErrorpattern_matcherr  r  r  r  r  r~   r  r  rX  r  apply)
rk  r  r  r  r  r  r  
graph_passr  r0   s
            @rB   reinplace_fsdp_all_gatherr  |	  sg   
D  --//// II&&==		**EE	
FE
  	  $%JII&&==EE  IINN55==23230178v& :& |$|$	
$ ?),
-,
B % UA 8 s   A1E) )F Fc                    [        U [        R                  R                  R                  [        R                  R                  R
                  45      (       a   e[        U R                  5       SS  5      $ )N   )r   r0   	_inductor	schedulerFusedSchedulerNoder   r[   r   r   s    rB   
get_op_idxr  	  sb    OO%%88OO%%::	
    u~~#$$rL   c           
     	  ^^^ ^! SSK Jm   / n[        [           " 5       nSnSn0 n0 n0 m!U U!4S jn	U  GH"  n
[	        U
R
                  [        R                  R                  R                  R                  S9(       Ga  [        U4S jU
R                   5       5      (       Ga  SnU
n[        5       n[        UUUT5        [        [        R                  R                  R                  R                  [        R                  R                  R                  R                  [        R                  R                  R                   R                  /5      m[#        UUUTUU 4S jS	9  [%        US
 S9n['        U5      nSn[)        ['        U5      5       H^  nUU   n[+        UR
                  [        R                  R                  R                   R                  5      (       a  US-  nUS:  d  M\  Un  O   US U nS n[)        ['        U5      S-
  5       H9  n[-        UUS-      R
                  [.        R0                  5      (       d  M4  US-   n  O   Uc   eU	" US U 5      nU	" UUS  5      nUUU'   GM;  [+        U
R
                  [        R                  R                  R2                  R                  5      (       d  GM  SnU
n[        5       n[#        UUUT5        [%        US S9nS n[)        ['        U5      S-
  5       H9  n[-        UUS-      R
                  [.        R0                  5      (       d  M4  US-   n  O   Uc   eU	" US U 5      nU	" UUS  5      nUUU'   GM%     ['        T!5      S:  d   eU(       a  ['        U5      S:  d   eU(       a  ['        U5      S:  d   eU  HS  n
U
R5                  5       T!;   a  T!U
R5                  5          n
X;   a  M1  UR7                  U
5        UR9                  U
5        MU     S nUR;                  5        Hl  u  nnUba  [=        [?        URA                  5       5      5      nURC                  5        H+  nURE                  [G        UR5                  5       USS95        M-     UnMn     S nUR;                  5        Hl  u  nnUba  [=        [?        URA                  5       5      5      nURC                  5        H+  nURE                  [G        UR5                  5       USS95        M-     UnMn     U$ )Nr   )r  Fc                   > TR                   R                  U 5      nU  H  nUTUR                  5       '   M     UTUR                  5       '   U$ rh   )r   creater   )snodes_to_group
group_noder:   r  snode_name_to_final_snodes      rB   _create_group_node:enforce_comm_ordering_for_fsdp.<locals>._create_group_node
  sO    33::?K
$E:D%enn&67 %;E!*"5"5"78rL   )ru  c              3     >#    U  HJ  n[        TU   R                  [        R                  R                  R
                  R                  5      v   ML     g 7frh   )r   r   r0   r{   rX  r  r~   )r   r   r  s     rB   r   1enforce_comm_ordering_for_fsdp.<locals>.<genexpr>
  sJ      
 % "1%**EIINN,M,M,U,U  %s   AATc                   > [        U TR                  5      =(       d6    [        U TR                  5      =(       a    U R                  R                  T;   (       + $ rh   )r   NopKernelSchedulerNodeExternKernelSchedulerNoder   op_overload)r   allowed_opsr  s    rB   rC  0enforce_comm_ordering_for_fsdp.<locals>.<lambda>.
  sD    q)"B"BC "1i&I&IJ >FF..+=	'rL   )criteria_cbc                    [        U 5      $ rh   r  r  s    rB   rC  r  9
      JqMrL   r  r   c                    [        U 5      $ rh   r  r  s    rB   rC  r  j
  r  rL   r  )$r^   r  r   r   r   r   r0   r{   r  r  r~   r   r  r   wait_tensorrX  split_with_sizes_copyr   sortedr   r,   r   r   r   _WaitKernel	chunk_catr   r   r   rJ  r  r  r   r   r  r   )"r7   r  r  	new_orderr  	ag_exists	rs_exists$ag_grouped_node_to_wait_grouped_node$rs_grouped_node_to_wait_grouped_noder  r:   ag_snodeag_related_snode_setag_related_snodesend_idx_of_current_ag_blockcopy_out_countr   	cur_snodewait_node_idxag_group_nodeag_wait_group_noders_snoders_related_snode_setrs_related_snodesrs_group_noders_wait_group_nodeprev_ag_waitwait_group_noder  r   prev_rs_waitr  r  r  s"     `                            @@@rB   enforce_comm_ordering_for_fsdpr  	  s   
 )+I3!III+-(+-( " JJ59955PPXX
 
 
 __	
 
 
 IHLVL  ($"	 %II..IIQQII..::BBIINN88@@K )$" !'$*A! +..?*@'N3012-a0	!NNEIINN$H$H$P$P  #a'N!A%23/ 3 !22N3N O !M301A56/A6;;R^^LL$%EM 7 !,,,./@-/PQM "44Emn4U!VBT0? EJJ		(@(@(H(HIIIH MWL ($"	 !'$*A!
 !M301A56/A6;;R^^LL$%EM 7 !,,,./@-/PQM "44Emn4U!VBT0?] ` ()A---781<<<781<<< >>88-enn.>?Ee  L*N*T*T*V&#]%C%C%E FGL!--/**AJJL|TR 0 ' +W L*N*T*T*V&#]%C%C%E FGL!--/**AJJL|TR 0 ' +W rL   )r7   list[BaseSchedulerNode])r7   r  r   r  )r   z"Optional[Union[IRNode, Operation]]r   r   )r:   r!   r   r   )r:   r!   )F)r   rY   )r   r  r   rY   )r7   r  r   ztuple[dict[BaseSchedulerNode, Optional[BaseSchedulerNode]], dict[BaseSchedulerNode, Optional[BaseSchedulerNode]], BaseSchedulerNode])r   r   r   #dict[BaseSchedulerNode, OrderedSet])r7   r  r   z(dict[BaseSchedulerNode, OrderedSet[str]])r   Optional[BaseSchedulerNode]r   r  r   4dict[BaseSchedulerNode, Optional[BaseSchedulerNode]]r   r  )
r   r!   r   r!   r   'dict[BaseSchedulerNode, frozenset[str]]r   r  r   r   )r   r!   r   r  r   dict[BaseSchedulerNode, float]r   r  r   r  r   tuple[float, float, str])r   r!   r   r!   r   r  r   r  r   r  r   r  r   r  )r   r!   r   r!   r   r!   r   r  r   r  r   r!   r   r!   )r   r!   r   r  r   r!   r  r[   r  r[   r	  r   r
  r   r  r   r   z(tuple[int, dict[BaseSchedulerNode, int]])r   r!   r   r  r   r!   r  r[   r	  r   r
  r   r  dict[BaseSchedulerNode, int]r  r   r   r   r   r   r   rX   )
r   r!   r   r  r   r   r   r  r   z>dict[BaseSchedulerNode, list[Union[FreeableInputBuffer, Any]]])r   r!   r   ztuple[bool, Optional[str]])rR  z$dict[BaseSchedulerNode, ReorderInfo]r   r!   r   r  rS  r[   r   r[   r   r   r   OrderedSet[str]r   r  )r7   r  r   zDtuple[list[BaseSchedulerNode], dict[BaseSchedulerNode, ReorderInfo]])
r7   r  rF   r   rG   r   rH   r   r   r  )r  r  r   r  )r   r!   r   r  r  r[   r	  r   r  r   r  r  r  r  r  r   r   r   r   rX   )r   r!   r   r  r   r!   r  r[   r  r[   r	  r   r  r   r  r   r   r   r   zFtuple[int, dict[BaseSchedulerNode, int], dict[BaseSchedulerNode, int]])rR  z%dict[BaseSchedulerNode, SinkWaitInfo]r   r!   r   r  rS  r[   r   r[   r   r   r   r  r   r  )r7   r  r   zEtuple[list[BaseSchedulerNode], dict[BaseSchedulerNode, SinkWaitInfo]])r:   r!   r   ra   )rk  torch.fx.Graph)rk  r  r   rX   )r7   1list[torch._inductor.scheduler.BaseSchedulerNode]r  z4dict[str, torch._inductor.scheduler.SchedulerBuffer]r  zdict[str, BaseSchedulerNode]r   r  )h
__future__r   r  rL  r  loggingr  r  collectionsr   dataclassesr   typingr   r   r   r	   r0   torch._loggingr
    torch.multiprocessing.reductionsr   torch.utils._ordered_setr   r^   r   r   r   dependenciesr   r   r   memoryr   r   r   r   utilsr   r   r   r   r   r   r   virtualizedr   	getLoggerrp   rJ  _logginggetArtifactLoggerrP  r&   r!   rC   rG   rF   rO   rT   rV   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r  r#  r(  r.  rZ  rQ   rJ   r  r  r  r  r  r  r  r  r  r  rm  rK  r@  rE  r  r  r  r  ro   rL   rB   <module>r     s   #      
 # ! 6 6  + ; / & & ! %     !nn..xC;#F##FL#&####L 9 9 9("(032#"((((0#-"G
;
%
% D 	6.'.!. >. ;	.
 
.9.'9.C9. -9. >	9.
 ;9. 9.x9.!9.
9. D9. -	9.
 >9. ;9. 9.x. .!. ". D	.
 D. . .b@. @.	 @. "@. 	@.
 @. %@. 6:@. @. .@.FJ J	 J "J 	J
 %J 6:J 4J J  J J 
JZ$; $;	 $;  $; >	$;
 D$;N <b/b
b Db 	b
 b !%b #b bJU#UIUp
Y#YY Y 	Y
 Yx"0 9 9 9"( ((V2
 2
	 2
 2
 %	2

 >B2
 42
 92
 2
 2
 
2
jCG CG	 CG "CG 	CG
 CG %CG >BCG CG CG LCGL. .!. ". D	.
 D. . .bW0W
W DW 	W
 W !%W #W Wt4C 4C	 4C  4C >	4C
 D4CnL#LJL^
54	A>#L# ~#BjZ%n=nEn 5n 7	nrL   