
    N jnI                      % S SK Jr  S SKrS SKrS SKrS SKrS SKrS SKrS SKrS SK	r	S SK
r
S SKrS SKrS SKrS SKrS SKrS SKJrJr  S SKJrJr  S SKJrJrJrJrJrJrJr  S SKJr  S SKJr  SS	K J!r!  \(       a  S S
K"J#r#J$r$J%r%  S SK&J'r'  S SK(r(S SK)r)S SK*r)S SK+J,s  J-r.  S SK/J0r0J1r1  S SK2J3r3  S SK4J5r5J6r6  S SK7J8r8  S SK9J:r:J;r;  S SK<J=r=  S SK>J?r?J@r@JArA  S SKBJCrC  SSKDJErEJFrFJGrGJHrHJ r JIrI  SSKJJKrK  SSKLJMrMJNrNJOrO  SSKPJQrQJRrR  SSKHJSrSJTrTJUrUJVrV  SSKWJXrXJYrY  SSKZJ[r[  SSK J\r\J]r]J^r^J_r_J`r`Jara  SSKbJcrc  SSKdJereJfrf  SSKgJhrh  SSKiJjrjJkrk  SS KlJmrm  SS!K,JnrnJoroJprpJqrqJrrrJsrsJtrtJuruJvrvJwrwJxrxJyryJzrzJ{r{J|r|J}r}J~r~Jr  SS"KJr  \GR                  " \5      r\)GR
                  GR                  \S#5      r\)GR
                  GR                  \S$5      r\)GR
                  GR                  \S%5      r\)GR
                  GR                  \S&5      r\S'   rS(\S)'   \" S*5      r\" S+5      r\GR                    " S, S-5      5       r\GR                    " S. S/5      5       r " S0 S15      r\GR                    " S2 S35      5       r\GR                    " S4 S5\5      5       r " S6 S'5      r\GR.                  ScS7 j5       rSdS8 jrSeS9 jrSfS: jr\GR                   " S;S<9 " S= S>5      5       rSgS? jr " S@ SA5      r        ShSB jr " SC SD\5      r " SE SF\5      r " SG SH\5      r    SiSI jr        SjSK jr " SL SM\5      r " SN SO\5      r " SP SQ\5      r " SR SS\5      r Sk       SlST jjr      SmSU jrSnSV jr\GR                    " SW SX5      5       r\GRZ                  " 5       rSoSY jrSpSZ jr    SqS[ jrSrS\ jrSrS] jrSrS^ jrSrS_ jr " S` SJ5      r " Sa Sb5      rg)s    )annotationsN)Counterdefaultdict)as_completedFuture)AnyGenericOptionalTYPE_CHECKING	TypeAliasTypeVarUnion)	ParamSpec
OrderedSet   )ComputedBuffer)CallableIteratorSequence)
ModuleType)countersdynamo_timed)use_pipelined_autotuning)LambdaFuturePyCodeCache)TritonTemplateCallerBase)get_metric_tableis_metric_table_enabled)free_symbols)free_symbol_is_typesymbol_is_typeSymT)
has_triton)commsconfigconfig_commsdependenciesirmetrics)can_codegen_without_upcasts)BackendFeatureget_scheduling_for_deviceKernel) estimate_nccl_collective_runtime/estimate_nccl_collective_runtime_nccl_estimator)Dep	MemoryDepStarDepWeakDep)GPUTooOldForTritonTritonMissing)count_flops_fx)assign_origin_nodeget_device_typeGraphPartitionSignatureMultiOutputMultiOutputLayout
NoneLayout)LoopBody)MemoryPlanningInfoForBufferMemoryPlanningInfoForNode)ReductionHint)
green_textred_text)SimplifyIndexing)&_unstable_customized_partition_wrappercache_on_selfcmpdevice_need_guardget_current_backendget_device_tflopsget_dtype_sizeget_gpu_dram_gbpsget_op_namesGraphPartitionMapIndentedBufferis_collectiveis_cudagraph_unsafe_opis_gpuis_multi_outputs_template#is_output_of_multi_outputs_templateis_waitsympy_product)Vfusionloop_orderingcompute_dependencies
cudagraphsBaseSchedulerNoder   PartitionType_T_Pc                  z    \ rS rSr% SrS\S'   SrS\S'   SrS\S'   S	 r\	SS
 j5       r
\	 S   SS jj5       rSrg)FusionResulth   NzOptional[bool]should_fusezOptional[Callable[[], bool]]callable_fnOptional[LambdaFuture]futurec                V    U R                   S LU R                  S L-  (       d   S5       eg )NzLFusion result should contain either fusion decision or callable_fn, not both)rc   rd   selfs    j/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/torch/_inductor/scheduler.py__post_init__FusionResult.__post_init__n   s0      ,1A1A1MN 	
Z	
N    c                    [        US9$ )N)rc   ra   )clsrc   s     rj   fuseFusionResult.fuses   s    44rm   c                    [        XS9$ )Nrd   rf   ro   )rp   rd   rf   s      rj   from_callableFusionResult.from_callablew   s     CCrm    )rc   boolN)rd   Callable[[], bool]rf   re   )__name__
__module____qualname____firstlineno__rc   __annotations__rd   rf   rk   classmethodrq   ru   __static_attributes__rw   rm   rj   ra   ra   h   sf    "&K&04K-4%)F")

 5 5 OSD,D6LD Drm   ra   c                  L    \ rS rSr% S\S'   S\S'   S\S'   SrS\S	'   SS
 jrSrg)PendingFusion~   rz   rd   r\   node1node2Nre   rf   c                2    U R                   U R                  4$ ry   r   r   rh   s    rj   get_fusion_nodesPendingFusion.get_fusion_nodes   s    

DJJ''rm   rw   )return+tuple[BaseSchedulerNode, BaseSchedulerNode])r{   r|   r}   r~   r   rf   r   r   rw   rm   rj   r   r   ~   s$    ##%)F")(rm   r   c                  D   \ rS rSrSr\SS j5       r\SS j5       r\      SS j5       r	\SS j5       r
\      SS j5       r\      SS j5       r\SS	 j5       r\      SS
 j5       r\SS j5       r\      SS j5       r\SS j5       r\SS j5       rSrg)MixOrderReduction   z
This class contains utility functions to decide if we should fuse reductions
reducing across different dimensions of the same input tensor.
c                p    U R                  5       =(       a     [        S U R                  5        5       5      $ )Nc              3     #    U  Hl  n[        U[        5      (       d  M  UR                  5       (       d  M1  [        UR                  [        5      (       d  MR  UR                  R
                  S Lv   Mn     g 7fry   )
isinstanceSchedulerNodeis_reductionnoder   _split_size.0subnodes     rj   	<genexpr>7MixOrderReduction.is_split_reduction.<locals>.<genexpr>   sZ      +
+'=1 1 $$& 1 7<<8	 1GLL$$D0+s   A6A6A6A6)r   all	get_nodesr   s    rj   is_split_reduction$MixOrderReduction.is_split_reduction   s3      " 
s +
>>++
 (
 	
rm   c                   U R                  U5      (       Ga  S nS nUR                  5        GH  n[        U[        5      (       a4  UR	                  5       (       a  [        UR
                  [        5      (       d  MO  UR
                  R                  c   e[        R                  R                  R                  [        UR
                  R                  5      5      nUR
                  R                  c   e[        R                  R                  R                  [        UR
                  R                  5      5      nUc  UnUnGM  [        R                  R                  R                  X%5      (       d   U SU 35       e[        R                  R                  R                  X65      (       a  GM   U SU 35       e   Uc   eX#4$ UR                  S   $ )N v.s. r   )r   r   r   r   r   r   r   _original_rangesrW   graphsizevarssimplifyrV   _original_reduction_rangesstatically_known_equalsgroup)rp   r   xnumelrnumelr   	curxnumel	currnumels          rj   get_numel_rnumel"MixOrderReduction.get_numel_rnumel   s   !!$''FF>>+w66,,.."7<<@@||44@@@GG,,55!',,"?"?@	 ||>>JJJGG,,55!',,"I"IJ	 >&F&F77++CC  4 	{34  77++CC  4 	{34 1 ,8 %%%##::a= rm   c                    U R                  U5      nU R                  U5      n[        U5      S:w  d  [        U5      S:w  d  X4:X  a  g[        U5      [        [        U5      5      :H  $ )N   F)r   lentuplereversed)rp   r   r   g1g2s        rj   has_mix_reduction_orders*MixOrderReduction.has_mix_reduction_orders   sX     !!%(!!%(r7a<3r7a<28RyE(2,///rm   c                   SnUR                   R                   H.  n[        U[        5      (       d  M  UR                  U:X  d  M,  Un  O   U(       d  gUR
                  nUR                   R                  nU(       dI  [        U[        5      (       d   [        U5       5       eUR                  S   R                   R                  nU(       d   e[        U5      [        UR                  5      -
  (       d  g[        R                  R                  R                  [!        UR"                  5      [!        UR%                  5       5      5      (       a  gg)z0
The access to 'buf' is not a broadcast access.
NFr   T)read_writesreadsr   r2   nameindex
var_rangesFusedSchedulerNodetypesnodesr   r    rW   r   r   r   rV   sizevalues)rp   bufr   	found_depdepr   r   s          rj   _is_full_access!MixOrderReduction._is_full_access   s   
 	##))C#y))chh#o	 *
 %%00
d$677HDJ<H7Q33>>Jz:&E4F4F)GG
 7733)..)=9J9J9L+M
 
 rm   c                    / nUR                  5       UR                  5       -  nU HD  nU R                  XQ5      (       d  M  U R                  XR5      (       d  M3  UR                  U5        MF     U$ ry   )used_buffer_namesr   append)rp   r   r   outcommon_readsr   s         rj   get_common_read!MixOrderReduction.get_common_read   sb     ..053J3J3LLC""3..33F3Fs3R3R

3   
rm   c                <    [        U R                  X5      5      S:  $ Nr   )r   r   rp   r   r   s      rj   has_common_read!MixOrderReduction.has_common_read   s     3&&u4599rm   c                    U R                  U5      n[        R                  R                  R	                  US   US   -  SS9$ )Nr   r   fallback)r   rW   r   r   optimization_hint)rp   r   r   s      rj   	get_numelMixOrderReduction.get_numel   s>    !!$'ww11"Q%"Q%-!1LLrm   c                $    U R                  U5      $ ry   )r   r   s      rj   get_fusion_score"MixOrderReduction.get_fusion_score  s    
 }}U##rm   c                   [         R                  R                  (       d  g[        R                  R
                  (       a  gUR                  5       (       a  UR                  5       (       d  gUR                  5       R                  nUS;  d  [        U5      S:w  a  gUR                  5       (       a  UR                  5       (       d  gUR                  UR                  5       -  (       d"  UR                  UR                  5       -  (       a  gU R                  X5      (       d  g[        R                  X5      n[!        U5      S:X  a  gU R#                  U5      (       a  XpeOU R#                  U5      (       a  X!peOgU R%                  U5      nUu  p[         R                  R&                  (       d  Sn
[        R                  R(                  R+                  [,        R.                  " X-  U
5      5      (       d  g[        R                  R(                  R+                  [,        R.                  " XS-  5      5      (       d  g[        R                  R(                  R+                  [,        R.                  " US5      5      (       d  g[1        S UR3                  5        5       5      (       a  g[        R                  R(                  R5                  U	S	5      (       d  g[7        S
 UR3                  5        5       5      nU$ )z@
Check whether we can fuse two reductions with mix loop orders.
F)cudaxputritonr   i  P r   i   c              3     #    U  H]  nUR                  5       (       d  M  UR                  R                  R                  [        R
                  [        R                  4;  v   M_     g 7fry   )r   r   datareduction_hintrA   INNERDEFAULTr   s     rj   r   -MixOrderReduction.can_fuse.<locals>.<genexpr>V  sU      
 7##%GLL,,##%%
 7s   A'AA'i @  c              3     #    U  H9  nUR                  5       (       d  M  UR                  R                  5       S ;   v   M;     g7f)>   sumprodN)r   r   get_reduction_typer   s     rj   r   r   i  s@      
 2##%GLL++-
 2s
   A$A)r&   r   mix_order_reductionrW   r   cpp_wrapperrR   
get_devicer   rI   r   	ancestorsget_operation_namesr   r   r   r   is_contiguous_noder   #mix_order_reduction_non_strict_moder   guard_or_truesympyGeanyr   statically_known_leqr   )rp   r   r   device_typer   contiguous_node
other_noder   nrowncol
size_thresr   s               rj   can_fuseMixOrderReduction.can_fuse  sK   
 }}00 77||~~U\\^^&&(--.";/8;!!##5+=+=+?+?OOe7799OOe7799  ++E99 )88F|!!!%((*/Z##E***/Z!!/2
 }}@@ #J
 77##11%((4;
2STT
 77##11%((42JKK
 77##11%((42FGG  
 +446
 
 
 
 ww44T9EE  
 &//1
 
 
rm   c                $    U R                  X5      $ ry   )r   r   s      rj   are_mix_order_reductions*MixOrderReduction.are_mix_order_reductionst  s     ||E))rm   c                h   ^ ^ [        U U4S jTR                  R                   5       5      (       d  gg)Nc              3  \   >#    U  H!  nTR                  UR                  T5      v   M#     g 7fry   )is_contiguous_loadr   )r   r   rp   r   s     rj   r   7MixOrderReduction.is_contiguous_node.<locals>.<genexpr>|  s)      
>TsC""388T22>Ts   ),FT)r   r   r   )rp   r   s   ``rj   r   $MixOrderReduction.is_contiguous_nodez  s1     
>B>N>N>T>T
 
 
 rm   c                :   SSK Jn  UR                  5        H  n[        U[        5      (       d   eUR
                  nUR                  UR                     nU Vs/ s H   owR                  U:X  d  M  UR                  PM"     nn[        U5      S:X  a  M}  U Hy  n	UR                  U	   n
UR                  n[        UR                  5       5      n[        R                   R"                  R%                  U
UU5      nUS   S:X  a  Mm  US   S:X  a  Mx      g   M     gs  snf )Nr   )MemoryUsageTyper   FT)torch._inductor.loop_bodyr	  r   r   r   _bodymemory_usageLOADbuffer_name
index_namer   indexing_exprsr   listkeysrW   r   r   stride_vars)rp   r   parent_noder	  r   	loop_bodyentrieseindex_namesr  
index_exprr   var_symbolsr  s                 rj   r  $MixOrderReduction.is_contiguous_load  s   =))+DdM2222

I,,_-A-ABG18QAMMS<P<1<<KQ;1$ *
&55jA
&11
 #:??#45gg..:: $B1,B10D  * ,2 + Rs   D1Drw   Nr   r\   r   rx   )r   r\   r   ztuple[sympy.Expr, sympy.Expr]r   r\   r   r\   r   rx   )r   strr   r\   r   rx   )r   r\   r   r\   r   	list[str])r   r\   r   intr   r\   r   r\   r   r!  )r   r  r  r\   r   rx   )r{   r|   r}   r~   __doc__staticmethodr   r   r   r   r   r   r   r   r   r   r  r   r  r   rw   rm   rj   r   r      sv   
 
 
 #! #!J 	0%	0.?	0		0 	0  B 	%	.?			 	 :%:.?:	: :
 M M $%$.?$	$ $ e eN *%*.?*	* *
    rm   r   c                      \ rS rSr% S\S'   S\S'   S\S'   \R                  " \S9rS	\S
'   \R                  " \	S9r
S\S'   SS jrSS jrSS jrSS jrSS jrSS jrSS jrSS jrSS jrSS jrSrg) SchedulerBufferi  	Scheduler	schedulerz	ir.Bufferr   Optional[BaseSchedulerNode]defining_op)default_factorylist[NodeUser]usersr?   
mpi_bufferc                D    U R                   nUc   eUR                  5       $ ry   )r*  get_name)ri   ops     rj   defining_op_name SchedulerBuffer.defining_op_name  s#    ~~{{}rm   c                @    [        U R                  R                  5      $ ry   )hashr   r   rh   s    rj   __hash__SchedulerBuffer.__hash__  s    DIINN##rm   c                   [        5       nU R                  5       nUR                  U S[        U R                  5      R
                   35        UR                  U SU R                  R                   35        U R                  5       (       a-  UR                  U S[        U R                  5       5       35        U R                  5       (       a-  UR                  U S[        U R                  5       5       35        [        U R                  5      S::  a0  UR                  U SU R                   35        UR                  5       $ UR                  U S35        UR                  S5         U R                   H  nUR                  U S35        M     S S S 5        UR                  S	5        UR                  5       $ ! , (       d  f       N/= f)
N: z
.layout = z.aliases = z.mutations = r   z	.users = z
.users = [,])rO   r0  	writeliner   r   r{   layoutget_aliasespformatget_mutationsr   r-  indentgetrawvalue)ri   resultr   users       rj   	debug_strSchedulerBuffer.debug_str  s   !}}D6DO$<$<#=>?D6DII,<,<+=>?v[9I9I9K1L0MNOv]74;M;M;O3P2QRStzz?avYtzzl;< !!## vZ01q! JJD$$vQZ0 ' " S!!!##	 "!s   *(F;;
G	c                6    U R                   R                  5       $ ry   r   r0  rh   s    rj   r0  SchedulerBuffer.get_name      yy!!##rm   c                0   U R                   c   eU R                   R                  5       (       d  g U R                   R                  5       (       dV  U R                   R                  5       (       d7  [	        U R                   R                  5       [        R                  5      (       a4  [        R                  R                  R                  U R                   5        g [        [        R                  S5      (       a  U R                  5       [        R                  R                  ;   a  [        R                  R                  U R                  5          nXR                   R"                  ;   a$  U R                   R"                  U   R                   nO#U R                   R$                  U   R                   n[        R                  R                  R'                  UU R                   5        g [        R                  R                  R                  U R                   5        g )Nargs)r   should_allocateget_inputs_that_alias_outputget_mutation_namesr   get_output_specr)   CommBufferLayoutrW   r   wrapper_codecodegen_allocationhasattrkernelr0  inplace_update_buffersr(  name_to_donated_buffername_to_bufcodegen_inplace_reuse)ri   input_buffer_nameinput_buffers      rj   allocateSchedulerBuffer.allocate  sc   yy$$$yy((** II2244yy++--$))335r7J7JKKGG  33DII> AHHf%%188#B#BB !" ? ? P NN$I$II#~~DD% $   $~~99:KLQQGG  66		
 GG  33DII>rm   c                &   U R                   c   e[        U R                   R                  [        R                  5      (       d  [        U R                   5      (       a  gU R                   H$  n[        UR                   [        5      (       d  M$    g   gNFT)r   r   r=  r)   r=   rS   r-  
OutputNode)ri   uses     rj   can_freeSchedulerBuffer.can_free  sm    yy$$$dii&&66:SII;
 ;
 ::C#((J//  rm   c                4   0 nU Hr  n[        UR                  5      U;   a?  UR                  U[        UR                  5         5      U[        UR                  5      '   M[  X2[        UR                  5      '   Mt     [        UR	                  5       5      U l        g ry   )idr   merger  r   r-  )ri   r-  rC  ra  s       rj   	set_usersSchedulerBuffer.set_users  sm    &(C#((|v%'*yy3881E'Fr#((|$'*r#((|$	 
 &--/*
rm   c                T    U R                   c   eU R                   R                  5       $ ry   )r   rN  rh   s    rj   r>  SchedulerBuffer.get_aliases  s%    yy$$$yy5577rm   c                T    U R                   c   eU R                   R                  5       $ ry   )r   rO  rh   s    rj   r@  SchedulerBuffer.get_mutations  %    yy$$$yy++--rm   c                R    U R                   R                  5       R                  5       $ ry   )r   rP  r   rh   s    rj   r   SchedulerBuffer.get_device
  s    yy((*5577rm   )r-  Nr   r  r   r!  r   Noner   rx   )r-  r,  r   rs  r   zSequence[str]r   Optional[torch.device])r{   r|   r}   r~   r   dataclassesfieldr  r-  r?   r.  r2  r6  rE  r0  r\  rb  rg  r>  r@  r   r   rw   rm   rj   r&  r&    sv    
O,,'--dCE>C.9.?.?3/J+ 
$$($?B
+8.8rm   r&  c                  $    \ rS rSr% SrS\S'   Srg)SchedulerDonatedBufferi  Nr)  r*  rw   )r{   r|   r}   r~   r*  r   r   rw   rm   rj   r{  r{    s    /3K,3rm   r{  c                  t   \ rS rSr% S\S'   S\S'   S\S'   S\S'   S\S	'   S
\S'   S\S'   SrS\S'   S\S'   S\S'   SrS\S'   S\S'   S\S'   SrS\S'   STS jrSUS jr	SVS  jr
SVS! jrSVS" jrSWS# jrSVS$ jrSXS% jr      SYS& jrSZS' jrS[S( jrS\S) jrS]S* jr      S^S+ jrSXS, jrS_S- jrS_S. jrSXS/ jrSXS0 jr    S`S1 jrSVS2 jrSVS3 jr\S_S4 j5       r\S_S5 j5       r \S\S6 j5       r!\S\S7 j5       r"SaS8 jr#SbS9 jr$ScS: jr%SdS; jr&S\S< jr'S\S= jr(S\S> jr)S\S? jr*S\S@ jr+S\SA jr,S\SB jr-S\SC jr.SeSD jr/S\SE jr0SXSF jr1 Sf     SgSG jjr2\ShSH j5       r3\ShSI j5       r4\ShSJ j5       r5      SiSK jr6      SjSL jr7\SkSM j5       r8SlSN jr9\SlSO j5       r:SmSP jr;SnSQ jr<\=    SoSR j5       r>SSr?g)pr\   i  OrderedSet[str]r   z7tuple[torch.device, tuple[tuple[sympy.Expr, ...], ...]]r   
last_usager!  	min_order	max_orderr@   mpi_nodedict[str, str]mutation_renamesNzOptional[ir.Operation]r   list[SchedulerBuffer]outputsdict[str, SchedulerBuffer]outputs_by_nameOptional[float]override_estimated_runtimedependencies.ReadWritesr   OrderedSet[Dep]unmet_dependenciesFrx   writtenc                     Xl         S U l        g )Nc                     / $ ry   rw   )rL  kwargss     rj   <lambda>,BaseSchedulerNode.__init__.<locals>.<lambda>+  s    Brm   )r(  debug_device_str)ri   r(  s     rj   __init__BaseSchedulerNode.__init__(  s    $-& 	rm   c           	     ^   Xl         [        5       U l        [        [           " 5       U l        SU l        UR                  5        Vs/ s H  n[        U R                  UU S9PM     snU l	        U R                   Vs0 s H  o3R                  5       U_M     snU l        0 U l        g s  snf s  snf )NF)r(  r   r*  )r   r   r   r  r~  r  get_outputsr&  r(  r  r0  r  r  )ri   r   outputr   s       rj   _init_from_node!BaseSchedulerNode._init_from_node.  s    	#$
   **,
 - .. 
 -
 @D||L| 3|L !#
  Ms   B%;B*c                V    [        U 5      R                   SU R                  5       < S3$ )Nz(name=)r   r{   r0  rh   s    rj   __repr__BaseSchedulerNode.__repr__F  s'    t*%%&fT]]_,?qAArm   c                P   U R                  5       n[        5       nUR                  U S[        U 5      R                   S[        [        U SS5      5      R                   SU S[        U R                  R                  5       SU S[        U R                  5       SU S	[        U R                  R                  U R                  -
  5       SU S
35        UR                  5          U R                  5        H"  nUR                  UR                  5       5        M$     SSS5        UR                  S5         UR                  U R                  5       5        UR'                  5       R)                  5       $ ! , (       d  f       N]= f! [          a    ["        R%                  SSS9   NOf = f)#Longer form printout for trace logsr9  (r   N)

.writes = 
.unmet_dependencies = .met_dependencies = z.outputs = [
        r;  Ignoring error in debug_str()Texc_info)r0  rO   splicer   r{   getattrr?  r   writesr  r   rA  r  rE  r<  debug_str_extra	ExceptionlogwarningrB  rstrip)ri   r   r   r   s       rj   rE  BaseSchedulerNode.debug_strI  sv   }}

bd		QtGD&$$?@IIJ Kj))0012 3WT%<%<=> ?74#3#3#9#9D<S<S#STU V 		
 ZZ\'')

3==?+ *  	c	HJJt++-.  '')) \  	HKK7$KG	Hs   %7E36F 3
FF%$F%c                    g)N rw   rh   s    rj   r  !BaseSchedulerNode.debug_str_extrab      rm   c                $    U R                  U 5      $ ry   )r  rh   s    rj   _debug_str_for_device'BaseSchedulerNode._debug_str_for_devicee  s    $$T**rm   c                   [        U R                  SS 5      nSn[        U[        R                  R
                  R                  5      (       a$  SUR                  UR                  5       /SSS9-   nOe[        U[        R                  R
                  R                  5      (       a2  SUR                  UR                  5       UR                  5       /SSS9-   nU  U 3$ )Nr   r  , F)shorten	multiline)r  r   r   torch	_inductorr)   	Pointwise
str_helperget_size	Reductionget_reduction_sizer   )ri   
maybe_datadata_strs      rj   debug_str_short!BaseSchedulerNode.debug_str_shorth  s    TYY5
j%//"4"4">">??j33$$&'% 4  H 
EOO$6$6$@$@AAj33..0*2O2O2QR 4  H
 z""rm   c                p    [         R                  SU U R                  U R                  R                  5        g )Nz(%s: unmet_dependencies = %s, writes = %s)r  infor  r   r  rh   s    rj   log_detailsBaseSchedulerNode.log_detailsw  s,    6####		
rm   c                    gNFrw   )ri   self_dep	other_deps      rj   reorder_loops_by_dep_pair+BaseSchedulerNode.reorder_loops_by_dep_pair       rm   c                    S U R                   R                  5        5        Vs0 s H  nX!;   d  M
  X!U   _M     snU l        U R                  U R                   R	                  U R                  5      5        g s  snf )Nc              3  8   #    U  H  oR                   v   M     g 7fry   r   r   r   s     rj   r   9BaseSchedulerNode.update_mutated_names.<locals>.<genexpr>  s     Q-Pc-P   )r   reads_and_writesr  set_read_writesrename)ri   renamesr   s      rj   update_mutated_names&BaseSchedulerNode.update_mutated_names  ss     RT-=-=-N-N-PQ!
Q  D$-Q!

 	T--44T5J5JKL!
s
   	A7	A7c                X    U R                  U R                  R                  U5      5        g ry   )r  r   	with_readri   r   s     rj   add_fake_depBaseSchedulerNode.add_fake_dep  s!    T--77<=rm   c                B    [        S U R                  5        5       5      $ )Nc              3  n   #    U  H+  oR                  5       =(       d    UR                  5       v   M-     g 7fry   )r>  r@  )r   r   s     rj   r   =BaseSchedulerNode.has_aliasing_or_mutation.<locals>.<genexpr>  s*      
@ROO4!2!2!44@Rs   35)r   r  rh   s    rj   has_aliasing_or_mutation*BaseSchedulerNode.has_aliasing_or_mutation  s%     
@D@P@P@R
 
 	
rm   c                f    Xl         U R                   R                  U l        U R                  5         g ry   )r   r   r  
prune_deps)ri   rws     rj   r  !BaseSchedulerNode.set_read_writes  s&    "&"2"2"8"8rm   c                b   ^ U R                  5       n[        U4S jU 5       5      nX1-
  U l        g )Nc              3  F   >#    U  H  nTR                  X5      v   M     g 7fry   )get)r   kmutation_real_names     rj   r   3BaseSchedulerNode.set_last_usage.<locals>.<genexpr>  s      !U1"4"8"8">">   !)used_or_aliased_buffer_namesr   r~  )ri   future_used_buffersr  used_bufferss     ` rj   set_last_usage BaseSchedulerNode.set_last_usage  s-     88:!!U!UU&<rm   c                J    U R                    H  nUR                  5         M     g ry   )r  r\  )ri   r   s     rj   mark_runBaseSchedulerNode.mark_run  s    <<CLLN  rm   c                    [        S [        R                  " U R                  R                  U R                  R
                  5       5       5      $ )Nc              3  :   #    U  H  nUR                   v   M     g 7fry   r  r  s     rj   r   6BaseSchedulerNode.used_buffer_names.<locals>.<genexpr>  s      
W HHW   )r   	itertoolschainr   r   r  rh   s    rj   r   #BaseSchedulerNode.used_buffer_names  s?     
 t'7'7'='=t?O?O?V?VW
 
 	
rm   c                  ^ [        5       m[        R                  " U R                  R                  U R                  R
                  5       Vs/ s H7  n[        U[        5      (       a  UR                  (       a  M+  UR                  PM9     nn[        U5      S:  a  UR                  5       nTR                  U5        [        R                  R                  R!                  U5      (       aD  UR#                  U4S j[        R                  R                  U   R%                  5        5       5        [        U5      S:  a  M  T$ s  snf )z
Returns buffer names used by this node, including aliases.

Note: is_fake WeakDeps are excluded since they are purely for ordering
and should not affect buffer lifetime.
r   c              3  8   >#    U  H  nUT;  d  M  Uv   M     g 7fry   rw   )r   alias
used_namess     rj   r   ABaseSchedulerNode.used_or_aliased_buffer_names.<locals>.<genexpr>  s(      "5 J.	 E"5s   
	)r   r  r  r   r   r  r   r4   is_faker   r   popaddrW   r   name_to_bufferr  extendrN  )ri   r   depsr  s      @rj   r  .BaseSchedulerNode.used_or_aliased_buffer_names  s     '1l
 !t'7'7'='=t?O?O?V?VW
WsG,, CHHW 	 

 $i!m((*CNN3ww%%))#.. !"!7!7"224"5 	 $i!m !
s   *E;Ec                N   ^  [        U 4S jT R                   5       5      T l        g )Nc              3  t   >#    U  H-  nUR                   TR                  R                  ;  d  M)  Uv   M/     g 7fry   )r   r(  available_buffer_namesr   r   ri   s     rj   r   /BaseSchedulerNode.prune_deps.<locals>.<genexpr>  s0      -
.xxt~~DDD C.s   (8	8r   r  rh   s   `rj   r  BaseSchedulerNode.prune_deps  s#    ", -
..-
 #
rm   c                   ^ ^ SU 4S jjm[        U4S jT R                  R                   5       5      nT R                  T R                  R	                  U5      5        g )Nc                  > [        U [        5      (       d  gU R                  TR                  R                  ;  a  gTR                  R                  U R                     R                  5       nU[        R                  R                  ;   $ r  )	r   r4   r   r(  rX  r2  rW   r   removed_operations)r   op_nameri   s     rj   should_prune7BaseSchedulerNode.prune_weak_deps.<locals>.should_prune  sb    c7++xxt~~999nn00:KKMGagg8888rm   c              3  F   >#    U  H  nT" U5      (       d  M  Uv   M     g 7fry   rw   r   r   r  s     rj   r   4BaseSchedulerNode.prune_weak_deps.<locals>.<genexpr>  s      
1C\#5FCC1   !	!r   r1   r   rx   )r   r   r   r  remove_reads)ri   	to_remover  s   ` @rj   prune_weak_deps!BaseSchedulerNode.prune_weak_deps  sN    	9  
++11
 
	 	T--::9EFrm   c                D    [        XU R                  R                  5        g ry   )_prune_redundant_depsr(  rX  )ri   name_to_fused_nodes     rj   prune_redundant_deps&BaseSchedulerNode.prune_redundant_deps  s     	d8R8RSrm   c                T    U R                   c   eU R                   R                  5       $ ry   )r   get_operation_namerh   s    rj   r0  BaseSchedulerNode.get_name  rm  rm   c                "    U R                  5       $ ry   r0  rh   s    rj   get_first_name BaseSchedulerNode.get_first_name  s    }}rm   c                B    [        S U R                  5        5       5      $ )Nc              3  @   #    U  H  oR                  5       v   M     g 7fry   r0  r   r   s     rj   r   8BaseSchedulerNode.get_operation_names.<locals>.<genexpr>  s     G6Fd--//6F   )r   r   rh   s    rj   r   %BaseSchedulerNode.get_operation_names  s    Gdnn6FGGGrm   c                :    [        S U R                   5       5      $ )Nc              3  @   #    U  H  oR                  5       v   M     g 7fry   r0  r   r   s     rj   r   5BaseSchedulerNode.get_buffer_names.<locals>.<genexpr>  s     ALS,,..Lr7  )r   r  rh   s    rj   get_buffer_names"BaseSchedulerNode.get_buffer_names  s    ADLLAAArm   c                B    [        S U R                  5        5       5      $ )Nc              3  d   #    U  H&  n[        U[        5      =(       a
    [        US S9v   M(     g7f)T)disallow_fp32_opsNr   r   r+   r   ns     rj   r   ABaseSchedulerNode.can_codegen_in_low_precision.<locals>.<genexpr>  s6      
 & q-( G+AFG%s   .0r   r   rh   s    rj   can_codegen_in_low_precision.BaseSchedulerNode.can_codegen_in_low_precision  s%     
 ^^%
 
 	
rm   c                B    [        S U R                  5        5       5      $ )Nc              3  f   #    U  H'  n[        U[        5      =(       a    [        U5      v   M)     g 7fry   rB  rC  s     rj   r   @BaseSchedulerNode.can_codegen_without_upcasts.<locals>.<genexpr>  s,      
% q-(K-H-KK%s   /1rF  rh   s    rj   r+   -BaseSchedulerNode.can_codegen_without_upcasts  s#     
^^%
 
 	
rm   c                    U /$ ry   rw   rh   s    rj   r   BaseSchedulerNode.get_nodes  s	    vrm   c                    U R                   $ ry   )r  rh   s    rj   r  BaseSchedulerNode.get_outputs  s    ||rm   c                     U R                   U   $ ry   )r  )ri   buf_names     rj   
get_outputBaseSchedulerNode.get_output  s    ##H--rm   c                T    U R                   c   eU R                   R                  5       $ ry   )r   r   rh   s    rj   r   BaseSchedulerNode.get_device  s%    yy$$$yy##%%rm   c                V    U R                  5       nUS L=(       a    UR                  S:H  $ Ncpu)r   r   ri   devices     rj   is_cpuBaseSchedulerNode.is_cpu  s'    "T!:fkkU&::rm   c                b    U R                  5       nUS L=(       a    [        UR                  5      $ ry   )r   rR   r   rZ  s     rj   rR   BaseSchedulerNode.is_gpu  s'    "T!9fV[[&99rm   c                    gr  rw   rh   s    rj   r   BaseSchedulerNode.is_reduction      rm   c                    gr  rw   rh   s    rj   is_native_matmul"BaseSchedulerNode.is_native_matmul  rb  rm   c                    gr  rw   rh   s    rj   is_split_scanBaseSchedulerNode.is_split_scan  rb  rm   c                    gr  rw   rh   s    rj   is_templateBaseSchedulerNode.is_template  rb  rm   c                    gr  rw   rh   s    rj   	is_externBaseSchedulerNode.is_extern   rb  rm   c                    gr  rw   rh   s    rj   
is_foreachBaseSchedulerNode.is_foreach#  rb  rm   c                    gr  rw   ri   read_deps     rj   can_inplaceBaseSchedulerNode.can_inplace&  rb  rm   c                    gr  rw   rh   s    rj   has_side_effects"BaseSchedulerNode.has_side_effects)  rb  rm   c                X  ^  SSK Jn  [        T [        5      (       a  [        R
                  (       a  [        R                  R                  T R                  5       [        R                  5      (       a  [        [        R                  [        R                  R                  R                   R"                  5      (       a  [%        [        R                  SS5      b  ['        [        R                  S5      (       d  gT R(                  [        R                  R*                  -  T R,                  R.                  -  nSU 4S jjnT R1                  5        GHQ  nUR2                  nUc   eUR5                  5       (       aV  UR7                  5       (       dA  UR9                  5       (       d,  UR;                  5       [        R                  R<                  ;   a  M  T R>                  R@                   GH  nURB                  T R,                  RD                  ;   a$  T R,                  RD                  URB                     nO/T R,                  RF                  RI                  URB                  5      nU(       d  M  [        R                  RJ                  RM                  UT 5      (       d  M  [        URN                  [P        5      (       a  M  URR                  c   eURR                   Vs/ s H%  nUR2                  R;                  5       U;  d  M#  UPM'     n	n[U        U	5      S:X  d  GM3  U	S   RV                  (       d  GMJ  U	S   R2                  T L d  GM_  UR2                  c  GMo  [        UR2                  RY                  5       [Z        R\                  [Z        R^                  [Z        R`                  45      (       a  GM  URN                  (       am  [        URN                  R2                  [Z        Rb                  [Z        Rd                  45      (       a*  [U        UR2                  R7                  5       5      S:  a  GMF  U" UR2                  UR2                  5      (       d  GMk  U" U5      (       d  GM{  [        R                  Rf                  Ri                  UR;                  5       UR;                  5       5        [        [        R                  [        R                  R                  R                   R"                  5      (       an  [        R                  Rj                  Rm                  UR;                  5       5        [        R                  Rj                  Rm                  UR;                  5       5        UR;                  5       [        R                  Rn                  UR;                  5       '     GMO     GMT     gs  snf )	zf
Decide if there should be inplace updates for the node
and record the decision in the active kernel.
r   )can_match_buffer_size	mutationsNrL  c                  >^ U R                   R                  T5      nU R                  5       m[        5       nU R                   H  nUR
                  n[        U[        5      (       d  M&  UR                  5       U R                   R                  ;  d  U R                   R                  U5      ULa  Mn  UU4S jUR                  R                  5        5       -  n[        U5      S:  d  M    g   g)Nc              3  L   >#    U  H  nUR                   T:X  d  M  Uv   M     g 7fry   r  )r   orR  s     rj   r   ^BaseSchedulerNode.decide_inplace_update.<locals>.single_index_in_fused_node.<locals>.<genexpr>_  s&      Evv) AEs   $	$r   FT)r(  get_fused_noder0  r   r-  r   r   r\   r1  r)  r   r  r   )buf_to_be_inplaced
fused_noder  rD  	user_noderR  ri   s        @rj   single_index_in_fused_nodeKBaseSchedulerNode.decide_inplace_update.<locals>.single_index_in_fused_nodeG  s    
 ,55DDTJJ)224H %/LD*00 II	!)->?? ,,.-77JJK)33BB9M%&  &22CCE 
 t9q= ' 1* rm   r   )r  r&  r   rx   )8codegen.wrapperr{  r   r   r&   inplace_buffersrW   r   has_featurer   r,   INPLACE_BUFFERSrU  r  r  codegensimd
SIMDKernelr  rT  r   r  r(  completed_operationsr  r   rM  rN  rO  r0  removed_buffersr   r   r   rW  rX  r  rR  	can_reuser*  NopKernelSchedulerNoder-  r   ru  rP  r)   r=   r<   MutationLayoutSHOULDREMOVEFallbackKernelr;   rL  make_inplacer|  r  rV  )
ri   r{  inconsequential_nodesr  r   buf_noderead	input_bufxremaining_usess
   `         rj   decide_inplace_update'BaseSchedulerNode.decide_inplace_update,  s   
 	; t]++&&##DOO$5~7U7UVVqxx)@)@)E)E)P)PQQ188[$7C &)) NNgg(()nn112 	 	D ##%CxxH''',,..88::..00<<>QWW%<%<<((..99 E EE $ E Edii PI $ : : > >tyy II I,,66y$GG&y'<'<>TUU$??666 "+&!0A66??,4II !0 # & N+q0*1-999*1-22d:%NN6 *%NN::< " " 4 4 " = =! ! &11 * ) 5 5 : :!#!2!2BNN C! ! !$INN$O$O$Q RUV V1)..#((KK6yAA
 2293E3E3GX%HHeoo&=&=&B&B&M&M  HH..2293E3E3GHHH..223<<>B &..0 77G q / &0&s   "V'V'c                   [         R                  (       d  g U(       a  U R                  (       a  g U R                  c   eU R                  R	                  5       n/ nU GH5  nUR
                  S:X  a  M  UR                  S5        UR                  S5        SUR
                   SUR                   3nSUR                  ;   a  USUR                  S    3-   nUR                  U5        SUR                  ;   d  M  UR                  S    nUR                  S	S
S9S   nUR                  SUR                  SS5      R                  SS5      R                  SS5      R                  SS5      -   5        UR                  S5        UR                  S5        GM8     [        U5      S:X  a  g UR                  U5        SU l        g )Nr  r  z#pragma CMT ORIGIN:z#pragma CMT  seq_nrz seq_nr:stack_trace|r   )maxsplitr
  {z{{}z}}r  \z\\z#pragma CMT END ORIGINr   T)r&   comment_originr  r   get_originsr1  r   targetmetarsplitreplacer   
writelines)	ri   buffer	only_onceorigins	out_linesr  op_info_strr  stack_trace_last_lines	            rj   codegen_originating_info*BaseSchedulerNode.codegen_originating_info  s    $$yy$$$))'')	AttxR 23(az:K166!)hqvvh7G6H,II[)&!"!6 7(3(:(:3(:(KB(O%  "+33C>WS$'WT4(Wf	   !9:  $3 6 y>Q 	)$rm   c                "    U R                  SSS9$ )NTinclude_readsinclude_writes!get_read_write_buffers_sizes_implrh   s    rj   get_read_write_buffers_sizes.BaseSchedulerNode.get_read_write_buffers_sizes  s    55t 6 
 	
rm   c                "    U R                  SSS9$ )NTFr  r  rh   s    rj   get_read_buffer_sizes'BaseSchedulerNode.get_read_buffer_sizes  s    55u 6 
 	
rm   c                "    U R                  SSS9$ )NFTr  r  rh   s    rj   get_write_buffer_sizes(BaseSchedulerNode.get_write_buffer_sizes  s    55 6 
 	
rm   c                L    [        U R                  XS9R                  5       SS9$ )Nr  r   )start)r   get_read_write_buffer_accessesr   )ri   r  r  s      rj   r  3BaseSchedulerNode.get_read_write_buffers_sizes_impl  s1     //+ 0 fh	
 	
rm   c                  ^ ^^^^^ [        T [        5      (       a  0 $ [        T [        5      (       a!  [        T R                  [        5      (       a  0 $ [        T [        5      (       af  [        T R                  [
        R                  5      (       a=  T R                  R                  [        R                  R                  R                  L a  0 $ SS jm[        T [        5      (       a@  T" [        T R                  5       S   5      [        T R                  5       S   5      -  5      mO[        S5      m[         R"                  " [$        5      nU(       a:  T R&                  R(                   H   nX4R*                     R-                  U5        M"     U(       a:  T R&                  R.                   H   nX4R*                     R-                  U5        M"     U(       a&  [1        S T R&                  R(                   5       5      O	[1        5       nU(       a&  [1        S T R&                  R.                   5       5      O	[1        5       nSU 4S jjm[        T [2        5      (       a  [1        UU 4S jU 5       5      nXg-
  nXW-
  n0 nXV-   H  n	[5        U4S	 jX9    5       5      mU	[6        R8                  R:                  ;   a  [6        R8                  R:                  U	   n
O>U	[6        R8                  R<                  ;   a  [6        R8                  R<                  U	   n
OM      SUUU U4S
 jjmT" U
5      nX;  a  XU	'   M  X==   U-  ss'   M     U$ )a  
Counting the number of bytes accessed for a kernel is
surprisingly tricky. In particular, there is a differentiation
between 'theoretical' memory accesses and practical memory
accesses. For example, a layernorm kernel may actually access an
input 3 times, but in theory, it only needs to access its input
once (and may be optimized to do so through say, persistent
reductions)

Another example is that even though a buffer is passed in, we may
not access the entire buffer. This may occur if we are accessing
a slice of the buffer. Another tricky case is for indirect
indexing, where the amount of bytes accessed depends on the
values of the input.

What this function aims to compute is the memory accesses for
worst-case inputs, best-case optimization. What this means is
that for each buffer we compute the amount of potential accesses in two ways and take the minimum.

1. Numel in ranges multiplied by number of deps the buffer has
2. The buffer size

Returns memory accesses per buffer.
c                R    [         R                  R                  R                  U SS9$ )Nr   r   )rW   r   r   r   )ss    rj   try_size_hintGBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.try_size_hint#  s"    77##55a!5DDrm   r   r       eAc              3  8   #    U  H  oR                   v   M     g 7fry   r  r  s     rj   r   CBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.<genexpr>8  s     B+ACxx+Ar  c              3  8   #    U  H  oR                   v   M     g 7fry   r  r  s     rj   r   r  =       C+BCxx+Br  c                   > TR                   R                  U    R                  n[        S U 5       5      n[	        U[        U5      -
  5      S:  $ )Nc              3  8   #    U  H  oR                   v   M     g 7fry   r   )r   rD  s     rj   r   \BaseSchedulerNode.get_read_write_buffer_accesses.<locals>.is_materialized.<locals>.<genexpr>D  s     !>))r  r   )r(  rX  r-  r   r   )r   r   r-  buf_usesri   s       rj   is_materializedIBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.is_materializedB  sG    NN..s399E!!>!>>Hx*V"44599rm   c              3  \   >#    U  H!  nT" UTR                   5      (       a  M  Uv   M#     g 7fry   r   )r   r   r  ri   s     rj   r   r  H  s#      )%_S$++-Nvs   ,	,c              3  (   >#    U  H  nTv   M	     g 7fry   rw   )r   r   
node_numels     rj   r   r  Q  s     $R;QCZ;Qs   c                  > U (       d  g[        U [        R                  5      (       a  U R                  5       $ [        U R                  [
        5      (       a  TR                  R                  U R                  5          R                  nSnU H  n[        UR                  [        5      (       a  M$  [        UR                  [        5      (       d   e[        UR                  R                  [        5      (       a8  UR                  R                  5        H  nUT" UR                  5      -  nM     M    g   U$ [        U R                  [        R                  5      (       a#  [!        U4S jU R#                  5        5       5      $ T	" [%        U R'                  5       5      5      n[)        U R+                  5       5      [-        TU5      -  $ )Nr   c              3  n   >#    U  H*  nT" [         R                  R                  U5      5      v   M,     g 7fry   )rW   r   
get_buffer)r   mut_nameget_buf_bytess     rj   r   ZBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.get_buf_bytes.<locals>.<genexpr>u  s/      (@H &agg&8&8&BCC(@   25)r   r)   TorchBindObjectr  r=  r<   r(  rX  r0  r-  r   r`  r\   r;   r  r=   r   rO  rV   r  rK   	get_dtypemin)
r   r-  totrD  	sched_buf	buf_elemsbuf_accessed_elemsr  ri   r  s
         rj   r  GBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.get_buf_bytesZ  s[    c2#5#566,,..

,=>> !NN66s||~FLLEC %%dii<<$)$))5FGGGG%diinnkBB-1YY-B-B-D	 #}Y^^'D D .E $% !& J

BMM:: (+(>(>(@  
 !.mCLLN.K LI)#--/:S*I>  rm   )r  z
sympy.Exprr   r!  )r   r  r   Sequence[BaseSchedulerNode]r   rx   )r   z<Optional[Union[ir.Buffer, ir.TensorBox, ir.TorchBindObject]]r   r!  )r   r  ExternKernelSchedulerNoder   r;   r)   r  op_overloadr  _prims	rng_primsgraphsafe_run_with_rng_stater   rV   
get_rangesr!  collectionsr   r  r   r   r   r   r  r   r   r   rW   r   r  graph_inputs)ri   r  r  buf_accessesr   r   r  r  buf_byte_accessesrR  r   	buf_bytesr  r  r  r  r  s   `           @@@@@rj   r  0BaseSchedulerNode.get_read_write_buffer_accesses  s   6 d233Id566:II{<
 <
 It677499b&7&788		%%||%%BBC I	E dM**&doo/23 1! 456J
 SJ"..t4''--XX&--c2 . ''..XX&--c2 /
  B4+;+;+A+ABB 	  C4+;+;+B+BCC 		:
 d.//( )%) O -F+E,.H!$$R<;Q$R!R177111gg,,X6QWW111gg**84#Q## #J &c*I0.7(+!+y8+g 'j ! rm   c                ^   U R                   c  g U R                   R                  5       nUc  g [        U5      nUc  g [        U[        R
                  5      (       a  UR                   R                  n[        R                  R                  R                  USS9n[        S   S==   U-  ss'   U$ )Nr   r   inductor
flop_count)r   get_origin_noder7   r   r  SymIntexprrW   r   r   r   r   )ri   fx_nodeflopsresolved_flopss       rj   estimate_flops BaseSchedulerNode.estimate_flops  s    99))++-?w'=eU\\**JJOOE));;EA;N\*n<*rm   c                T    U R                   b  U R                   $ U R                  5       $ ry   )r  _get_estimated_runtimerh   s    rj   get_estimated_runtime'BaseSchedulerNode.get_estimated_runtime  s)    **6222**,,rm   c                ,   U R                  5       S   R                  5       S   nUR                  R                  5       n[	        [        U5      5      (       d  g[        U R                  5      (       a  [        U R                  [        R                  5      (       d   e [        R                  (       av  [        U 5      n[        5       nUR                  U5      nUb  [        U[        5      (       d   eU$ [!        U 5      nUc  [#        U R                  5      nUR%                  X6S9  U$ [#        U R                  5      $ [/        U R                  5      (       a  g[1        U 5      nUb  U$ UR                  R3                  5       n	 [5        5       n
[7        U	5      S-  nU
S::  a  [9        SU
 35      eUS::  a  [9        SU 35      e U R=                  5       nUS:X  d  Uc  U R?                  5       U
-  nUS-  nU$ SnU R?                  5       nUc  SOUnX-  U-  S	-  nX-  n[A        UU5      nUS-  nU$ ! [&         a  n[(        R+                  U5         SnAgSnAf[,         a  n[(        R+                  U5         SnAgSnAff = f! [:         a     gf = f)
z3
Returns estimated op runtime in milliseconds (ms)
r   Nvaluel    J)z-gpu_memory_bandwidth cannot be <= 0, but got z"gpu_flops cannot be <= 0, but got g    .Ag      ?r  )!r   r  r   rP  rR   r9   rP   r   r)   IRNoder'   ,runtime_estimations_use_nccl_lib_estimations)get_estimate_runtime_cache_key_from_snodeget_estimate_runtime_cachelookupfloatr0   r/   	set_value
ValueErrorr  r  	TypeErrorrU    maybe_estimate_runtime_benchmarkmaybe_get_dtyperL   rJ   AssertionErrorr  r   r  max)ri   r   r=  	cache_keycache	cache_valmsr  retdtypegpu_memory_bandwidth	gpu_flops	flops_estnsfactorcounted_bytescompute_timetransfer_times                     rj   r  (BaseSchedulerNode._get_estimated_runtime  s   
 nnq!--/2))+of-.. ##dii3333LL I$ OI68E %Y 7I ,))U;;;;((HNBz=diiHOOIO8I7		BB TYY
 .t4?J((*	#4#6 )%069I $q($CDXCYZ  A~$'I)%UVV 
 '')	>Y.2247KKBcBI 99;*2*Y6#=%< }-#X	o    :  		sD   AH3 63H3 *H3 A J 3
J=IJ$I>>J
JJc                    g ry   rw   rh   s    rj   get_template_node#BaseSchedulerNode.get_template_node      rm   c                0    U R                  5       nUc   eU$ ry   r&  )ri   templates     rj   get_template_node_or_throw,BaseSchedulerNode.get_template_node_or_throw  s!    ))+###rm   c                `    [        S [        U 5       5       5      nU SU nX   nXS-   S nX#U4$ )zA
For the list of nodes, get the prologue, template, and epilogue
c              3  X   #    U  H   u  pUR                  5       (       d  M  Uv   M"     g 7fry   rj  )r   irD  s      rj   r   CBaseSchedulerNode.get_prologue_template_epilogue.<locals>.<genexpr>  s     P,<DAaa,<s   *	*Nr   )next	enumerate)nodestemplate_indexprologuetemplate_nodeepilogues        rj   get_prologue_template_epilogue0BaseSchedulerNode.get_prologue_template_epilogue   sH     PIe,<PP.)-!+-.00rm   )r   r  r~  r  r   r  r  r   r(  r  r  )r(  r'  r   rs  )r   ir.Operationr   rs  rp  )r   r   rr  r  r2   r  r2   r   rx   r  r  r   rs  )r   r1   r   rs  rt  )r  r  r   rs  r  r}  r  r  r   rs  r   r}  r)  dict[str, BaseSchedulerNode]r   rs  r   r  )r   zSequence[SchedulerBuffer])rR  r  r   r&  rv  rt  zdependencies.Depr   rx   T)r  rO   r  rx   r   rs  rq  )r  rx   r  rx   r   r!  )r  rx   r  rx   r   zdict[str, int]r   z
int | Noner   r  r   zOptional[ir.TemplateBuffer])r   zir.TemplateBuffer)r5  list[BaseSchedulerNode]r   zJtuple[list[BaseSchedulerNode], BaseSchedulerNode, list[BaseSchedulerNode]])@r{   r|   r}   r~   r   r   r  r  r  r  r  rE  r  r  r  r  r  r  r  r  r  r  r  r   r  r  r%  r*  r0  r1  rF   r   r=  rG  r+   r   r  rS  r   r\  rR   r   rd  rg  rj  rm  rp  ru  rx  r  r  r  r  r  r  r  r   r  r  r&  r,  r$  r:  r   rw   rm   rj   r\   r\     s   BB NN''$$#'D
 '""//266((''GT
#0B*2+#
!.7	
M>


=#2=HV=	=
6
G T">T	T
. H H B B 
 
 
 
.&;:@F 9=-$-15-	-^ 
 

 
 

 
 


!
37
	
L!!L!37L!	L!\  $- U Un
 1&1	S1 1rm   c                 R    [         R                  R                  R                  5       $ ry   )r  r  	codecache
LocalCacherw   rm   rj   r  r    s    ??$$//11rm   c                  ^ [        U R                  SS5      nU R                  R                  nU R                  R                  / UQU R                  R                  QU R                  R
                  5      nU R                  R
                  n[        R                  " X#45      u  pESS jm[        U4[        U4S jU 5       5      -   5      nU$ )Npython_kernel_namer  c                    [        U [        R                  5      =(       a    [        U [        R                  5      (       + $ ry   )r   r)   r	  GeneratorStater  s    rj   _is_tensor_ir@get_estimate_runtime_cache_key_from_snode.<locals>._is_tensor_ir  s(    !RYY'P
1b>O>O0P,PPrm   c              3  t   >#    U  H-  nT" U5      (       a  [        UR                  5       5      OS v   M/     g 7fry   )r   r  )r   arR  s     rj   r   <get_estimate_runtime_cache_key_from_snode.<locals>.<genexpr>#  s+     U9a}Q'7'7ajjl#TA9s   58rt  )
r  r   inputsfill_non_provided_argsconstant_argsr  pytreetree_flattenr  r   )snoderN  rL  r  	flat_argsflat_args_pytree_specr  rR  s          @rj   r  r    s     -A2F::D::,,*$*))*

D ZZF'-':':D>'J$IQ 	
U9U
U	VI rm   c                   [        U [        5      (       d  g [        R                  R                  R
                  [        R                  R                  R                  [        R                  R                  R                  S.n[        U R                  SS5      nX!;  a  g [        U R                  [        R                  5      (       d  g X   $ )N)zextern_kernels.mmzextern_kernels.bmmzextern_kernels.addmmrN  r  )r   r  r  opsatenmmbmmaddmmr  r   r)   ExternKernel)r\  mms_fnsrN  s      rj   _get_mm_like_fnrg  (  s    e677"YY^^..#iinn00 %		 4 4G
 !-A2F(ejj"//22&&rm   c           	     d  ^ ^ S nS n[         R                  (       a  [        T 5      nUc  g UnU U4S jnOg [        T 5      n[	        5       nUR                  U5      nUb  [        U[        5      (       d   eU$ SSKJ	m  U" 5       u  pxSSK
Jn	  U	R                  UUUSSSS9n
UR                  XJS	9  U
$ )
Nc                    > T" T 5      $ ry   rw   )r\  snode_args_kwargss   rj   r  2maybe_estimate_runtime_benchmark.<locals>.<lambda>A  s    !25!9rm   r   )rj  r   )benchmarker   
   )memory_warmup_itersbenchmark_itersmax_benchmark_durationr  )r&   !runtime_estimations_mms_benchmarkrg  r  r  r  r   r  utilsrj  $torch._inductor.runtime.benchmarkingrl  	benchmarkr  )r\  bench_fnargs_kwargs_fnmm_fnr  r  r  rL  r  rl  r  rj  s   `          @rj   r  r  8  s    HN//&=99%@I&(EY'I)U++++(!#LD@			! 
 
B 
OOIO(Irm   T)slotsc                  \    \ rS rSr% S\S'   S\S'   S\S'   S\S'   SS jrSS	 jrSS
 jrSrg)	WhyNoFusei^  r  name1name2reasonztuple[Any, ...]rL  c                X    UR                  5       U l        UR                  5       U l        g ry   )r0  r|  r}  ri   r   r   s      rj   r  WhyNoFuse.__init__e  s    ^^%
^^%
rm   c                F    Xl         X l        [        R                  U 5        g ry   )r~  rL  
fusion_logdebug)ri   r~  rL  s      rj   __call__WhyNoFuse.__call__i  s    	rm   c                p    SU R                    SU R                   S3U R                  U R                  -  -   $ )Nzcannot fuse z with r9  )r|  r}  r~  rL  rh   s    rj   __str__WhyNoFuse.__str__n  s6    djj\

|2>KK$))#
 	
rm   )rL  r|  r}  r~  Nr   r\   r   r\   r   rs  )r~  r  rL  r   r   rs  rp  )	r{   r|   r}   r~   r   r  r  r  r   rw   rm   rj   r{  r{  ^  s&    JJK
&

rm   r{  c                    [        U [        [        45      (       a  [        U [        S9n [
        R                  " U SS9nSU;   a  S[        R                  " US5       3$ U$ )Nkey   )rA  r      )	r   r   setsortedr  pprintr?  textwraprA  )objrC  s     rj   r?  r?  t  sU    #
C())Sc"^^C*Fv~HOOFG4566Mrm   c                  @    \ rS rSrSS jrS	S jrS
S jrSS jr\rSr	g)r`  i~  c                &    [        U/5      U l        g ry   r  r  s     rj   r  OutputNode.__init__  s    ",cU"3rm   c                    gr  rw   rh   s    rj   r   OutputNode.is_reduction  rb  rm   c                    g)Nrw   rw   rh   s    rj   rN  'OutputNode.get_inputs_that_alias_output  r  rm   c                    g)NOUTPUTrw   rh   s    rj   r0  OutputNode.get_name  s    rm   )r  N)r   r3   r   rs  rt  ru  rp  )
r{   r|   r}   r~   r  r   rN  r0  r  r   rw   rm   rj   r`  r`  ~  s    4 Hrm   r`  c                  ^ ^^^^ [         R                  " 5       mT R                   HU  n[        U[        5      (       a  M  TUR
                     R                  5       nTTU   R                  5       ==   S-  ss'   MW     SUUUU 4S jjm[        U4S jT R                   5       5      nU(       a?  T R                  U-
  T l        T R                  T R                  R                  U5      5        gg)aU  
Prunes weakdeps intended for mutation ordering
on an upstream fused node if after fusion there is another dependency
on the fused upstream node, making the weakdep redundant

In essence this enforces an ordering on fusions. As fusions occur, weakdeps will
be incrementally removed, enabling other fusions, ensuring they are fused in order.
r   c                  > [        U [        5      (       ap  TU R                     R                  5       nTTU   R	                  5          S:  =(       a     TR
                  R                  U TU   T5      nTU   T:H  nU=(       d    U$ g)Nr   F)r   r4   r   r2  r0  r(  fusable_weak_dep)r   r  is_redundantis_self_deprX  name_to_dep_countr)  r   s       rj   r  +_prune_redundant_deps.<locals>.should_prune  s    c7##!#((+<<>G,"7+446 nn55'0$  -W5=K.;.rm   c              3  F   >#    U  H  nT" U5      (       d  M  Uv   M     g 7fry   rw   r  s     rj   r   (_prune_redundant_deps.<locals>.<genexpr>  s      .,s2C.r!  Nr"  )r  r   r  r   r4   r   r2  r0  r   r  r   r#  )r   r)  rX  r   r  deps_to_pruner  r  s   ```   @@rj   r(  r(    s     '2&9&9&;&&#w''!#((+<<>G09BBDEJE '
    .. M "&"9"9M"IT--::=IJ rm   c                  J   ^  \ rS rSrSU 4S jjrSS jrS	S jrS	S jrSrU =r	$ )
r  i  c                   > [         TU ]  U5        U R                  U5        U R                  UR	                  5       5        g ry   superr  r  r  get_read_writesri   r(  r   	__class__s      rj   r  "ExternKernelSchedulerNode.__init__  5    #T"T1134rm   c                V    U R                  5        S[        U R                  SS 5       3$ )Nz.node.kernel = rN  )r0  r  r   rh   s    rj   r  )ExternKernelSchedulerNode.debug_str_extra  s*    --/"/'$))EY[_2`1abbrm   c                    gNTrw   rh   s    rj   rm  #ExternKernelSchedulerNode.is_extern  r(  rm   c                    U R                   c   e[        U R                   S5      =(       a    U R                   R                  5       $ )Nrx  )r   rT  rx  rh   s    rj   rx  *ExternKernelSchedulerNode.has_side_effects  s6    yy$$$tyy"45V$)):T:T:VVrm   rw   r(  r'  r   r<  r   rs  rp  rt  )
r{   r|   r}   r~   r  r  rm  rx  r   __classcell__r  s   @rj   r  r    s    5
cW Wrm   r  c                  ,   ^  \ rS rSrSU 4S jjrSrU =r$ )r  i  c                   > [         TU ]  U5        U R                  U5        U R                  UR	                  5       5        g ry   r  r  s      rj   r  NopKernelSchedulerNode.__init__  r  rm   rw   r  )r{   r|   r}   r~   r  r   r  r  s   @rj   r  r    s    5 5rm   r  c                    ^  \ rS rSr% SrS\S'   S\S'         S#U 4S jjr  S$     S%S jjr  S$     S&S	 jjr      S'S
 jr	S(S jr
S)S jrS*S jrS)S jr      S+S jrS)S jr      S,S jrS-S jrS.S jrS/S jrS/S jrS/S jrS/S jrS0S jrS1S jr    S2S jrS3S jr S4   S5S jjr\S6S j5       r\S6S j5       rS7S jr\S8S  j5       r \S/U 4S! jj5       r!S"r"U =r#$ )9r   i  zi
A SchedulerNode is a node for scheduling that encapsulates either
a ComputedBuffer or a TemplateBuffer.
z tuple[Sequence[sympy.Expr], ...]_sizesr>   r  c                f   > [         TU ]  U5        U R                  U5        U R                  5         g ry   )r  r  r  _compute_attrsr  s      rj   r  SchedulerNode.__init__  s,    
 	#T"rm   c                   [        U R                  [        R                  [        R                  45      (       d   eU R                  R                  UUS9u  U l        nX0l        U R                  R                  5       nU R                  R                  U5      R                  nXE" U R                  5      4U l        [        R                  (       + =(       d    [        UR                   5      (       + n[        U R                  [        R                  5      (       a)  U R#                  U R                  R%                  US95        g U R#                  [&        R$                  " U R                  /U R                  Q7SU065        g )Nextra_indexing_constraintsrecompute_sizes_body_func)	normalizer  )r   r   r)   r   TemplateBuffersimplify_and_reorderr  r  get_device_or_errorr(  get_backendgroup_fnr   r&   loop_ordering_after_fusionrR   r   r  extract_read_writesr(   )ri   r  r  bodyr[  r  should_normalizes          rj   r  SchedulerNode._compute_attrs  s;   
 $))b&7&79J9J%KLLLL II::'A&? ; 
T 
..0>>--f5>>ht{{34
  &@@@ 
KKI
 E
 dii!2!233  		--8H-I   00JJ!%8Hrm   c                $    U R                  UUS9  g )Nr  )r  )ri   r  r  s      rj   recompute_size_and_body%SchedulerNode.recompute_size_and_body  s    
 	'A&? 	 	
rm   c                   [        S U R                  R                   5       5      nU R                  [        R
                  " U R                  /U R                  Q7SU06R                  U5      R                  U R                  5      5        U R                  R                  U 5        U(       a!  SSKJn  UR                  R!                  5         g g )Nc              3  `   #    U  H$  n[        U[        [        45      (       d  M   Uv   M&     g 7fry   )r   r4   r3   r  s     rj   r   5SchedulerNode.refresh_dependencies.<locals>.<genexpr>  s$      0
1CZgwEW5XCC1s   .	.r  r   SIMDScheduling)r   r   r   r  r(   r  r  r  r  r  r  pointwise_read_writesclear_cachecodegen.simdr  candidate_tilingscache_clear)ri   r  need_clear_tiling_cache	fake_depsr  s        rj   refresh_dependencies"SchedulerNode.refresh_dependencies  s    
 &0 0
++110
 &
	 	,,

![[4= Yy!VD))*	
 	""..t4"4 ,,88: #rm   c                    U R                   R                  U5      U l         U R                   R                  U l        U R	                  SSS9  g )NFTr  r  )r  reorder_iter_loopssizesr  r  )ri   	new_orders     rj   apply_new_loop_order"SchedulerNode.apply_new_loop_order.  sA    ZZ22

 jj&&!!E4!Prm   c                   U R                   R                  5       n[        U R                   R                  5      U-
  n[	        [        U5      5      n[	        [        X"U-   5      5      nU R                  XC-   5        [        U R                  S   5      S:X  d   eU R                  S   U R                  S   S   U R                  S   S   44U l        g )Nr   r   r   )r  get_original_num_rdimsr   	iter_varsr   ranger  r   )ri   	num_rdims
num_pwdimspwdimsrdimss        rj   swap_pw_red_dimension#SchedulerNode.swap_pw_red_dimension6  s    JJ557	--.:
uZ()eJY(>?@!!%.14::a=!Q&&&ZZ]TZZ]1%5tzz!}Q7G$HH
rm   c                D    U R                   R                  5       U l         U $ ry   )r  extract_pw_from_reductionrh   s    rj   r  'SchedulerNode.extract_pw_from_reduction@  s    ZZ99;
rm   c                   [         R                  U 5      (       d  g [        U R                  [        R
                  5      (       d   eU R                  R                  5          U R                  5         S S S 5        g ! , (       d  f       g = fry   )r   r   r   r   r)   r   with_original_inner_fnr  rh   s    rj   cancel_reduction_split$SchedulerNode.cancel_reduction_splitD  s[     33D99$))R%6%67777YY--/! 0//s   !A;;
B	c                   [        U R                  [        R                  [        R                  45      (       d   eU R
                  R                  X5      U l        U R
                  R                  U l        U R                  R                  5       nU R                  R                  U5      R                  nX4" U R                  5      4U l        U R                  SSS9  g )NTr  )r   r   r)   r   r  r  #expand_dimension_for_pointwise_noder  r  r  r(  r  r  r   r  )ri   	dimension	new_ranger[  r  s        rj   r  1SchedulerNode.expand_dimension_for_pointwise_nodeK  s     $))b&7&79J9J%KLLLLZZCC

 jj&&..0>>--f5>>ht{{34
 	!!D$!Orm   c                    U R                   R                  5       U l         U R                   R                  U l        U R	                  SSS9  g )NTFr  )r  merge_loopsr  r  r  rh   s    rj   r  SchedulerNode.merge_loops\  s<    ZZ++-
jj&& 	!!D%!Prm   c                   S nU R                   S   n[        U5      UR                  s=:X  a  UR                  :X  a  O  OUR                  U5      nU(       aP  [        =R
                  S-  sl        [        R                  SU R                  5       U5        U R                  U5        g[        R                  SU R                  5       5        g)Nr   r   z"Reorder loops for %s with order %sTzEDon't reordering %s because we can not decide the suitable loop orderF)
r  r   num_varsdecide_loop_order_to_matchr*   num_loop_reorderingloop_ordering_logr  r0  r  )ri   r  r  r  
self_sizess        rj   r  'SchedulerNode.reorder_loops_by_dep_pairh  s     	[[^
z?h//E93E3EE ;;IFI''1,'##4dmmoy %%i0##W rm   c                N   U R                  5       nU SU R                  S    3U SU R                  S    3U SU R                   3/nU R                  R	                  5        H  n[        U[        5      (       a  M  UR                  n[        R                  R                  U5      n[        U[        R                  5      (       a  Mf  UR                  U S[        UR                  5       35        M     [        U R                   ["        5      (       aS  UR                  SU S35        UR                  [$        R&                  " U R                   R)                  5       S	5      5        U R*                  c   eUR-                  U R/                  5       5        S
R1                  U5      $ )Nz.group.device = r   z.group.iteration = r   z	.sizes = z
_layout = zclass z_loop_body:r  r  )r0  r   r  r   r  r   r4   r   rW   r   r  r)   r  r   r?  r=  r  r>   r  rA  rE  r   r  r  join)ri   r   linesr   rR  r   s         rj   r  SchedulerNode.debug_str_extra  sM   }}f$TZZ]O4f'

17fIdkk]+

 ##446Cc7++88gg((2!#r'9'9::LLH:Z

8K7L!MN 7 djj(++LL6${34LL)=)=)?HIyy$$$T//12yyrm   c                    U R                   $ ry   )r  rh   s    rj   r  SchedulerNode.get_ranges      {{rm   c                d   [        U R                  [        R                  [        R                  45      (       d   S[        U R                  5      < 35       e[        U R                  R                  5       5      =(       a0    U R                  S L =(       d    U R                  R                  (       + $ Ntype(self.node)=)
r   r   r)   r   r  r   rx   r   r  has_partial_accumulaterh   s    rj   r   SchedulerNode.is_reduction  s    $))b&7&79J9J%KLL 	
tDII !	
L DII0023 
JJ$Gdjj&G&G"G	
rm   c                    [        U R                  [        R                  5      (       d   S[	        U R                  5      < 35       eU R                  R                  5       S:H  $ )Nr  dot)r   r   r)   r   r   r   rh   s    rj   rd  SchedulerNode.is_native_matmul  sM    $))R%6%677N<LDO;M9NN7yy++-66rm   c                b   [        U R                  [        R                  [        R                  45      (       d   S[        U R                  5      < 35       e[        U R                  [        R                  5      =(       a.    [        U R                  R                  [        R                  5      $ r  )r   r   r)   r   r  r   r   	SplitScanrh   s    rj   rg  SchedulerNode.is_split_scan  s|    $))b&7&79J9J%KLL 	
tDII !	
L $))R%6%67 
JIINNBLL=
 	
rm   c                J    [        U R                  [        R                  5      $ ry   r   r   r)   r  rh   s    rj   rj  SchedulerNode.is_template  s    $))R%6%677rm   c                p    [        U R                  [        R                  5      (       a  U R                  $ S $ ry   r  rh   s    rj   r&  SchedulerNode.get_template_node  s'    &tyy"2C2CDDtyyN$Nrm   c                f    U R                  5         U R                  5         U R                  U5        g ry   )r  r  r  )ri   
index_varss     rj   runSchedulerNode.run  s#    ""$Z rm   c                (   U R                   n[        [        [        U5      5      [        [        [        U5      5      :X  d   e[	        [        [        R                  R                  U5      [        R                  R                  U5      5      5      nU$ ry   )	r  r   mapr   dictzipr  r  from_iterable)ri   r#  r  r   s       rj   ranges_from_index_vars$SchedulerNode.ranges_from_index_vars  sp     3sE?#s3sJ+?'@@@@--j9--e4

 rm   c                   U R                  U5      n [        R                  " [        [        R                  " 5       U5      5         [        R
                  R                  U 5         U R                  " U6   SSS5        SSS5        g! , (       d  f       N= f! , (       d  f       g= f! [         a"    [        R                  SU R                  5        e f = f)a  
Generate code for this node using the provided index variables.

This method sets up the appropriate context for code generation, including
simplifying indexing expressions based on the variable ranges, and then
calls the node's body function with the index variables.

Args:
    index_vars: A sequence of sequences of sympy expressions representing
                the index variables for each dimension of the computation.
NzError in codegen for %s)r+  rW   set_ops_handlerrD   get_ops_handlerrU  set_current_noder  r  r  fatalr   )ri   r#  r   s      rj   r  SchedulerNode.codegen  s     00<
	!!"213D3D3F
"ST))$/

J' 0 UT// UT  	II/;	sA   3B)  B&B6B>B) 
B	B
B&"B) &B) ),Cc                    U(       a  U R                   O[        U R                   5      u  p#[        R                  " U R                  U[
        R                  R                  /[        U5      -  /S9$ )zL
Get the memory dependencies in either the pointwise or the reduction axes.
)hidden_args)	r  r   r(   r  r  r   SZeror   )ri   	pointwise
keep_sizesignore_sizess       rj   "pointwise_or_reduction_read_writes0SchedulerNode.pointwise_or_reduction_read_writes  sR     3<4;;$++AV 
//JJ
%'',,#lBS1S0T
 	
rm   c                     U R                  SS9$ )z8
Get the memory dependencies in the non-reduction axes.
Tr7  r:  rh   s    rj   r  #SchedulerNode.pointwise_read_writes  s    
 666FFrm   c                     U R                  SS9$ )z4
Get the memory dependencies in the reduction axes.
Fr=  r>  rh   s    rj   reduction_read_writes#SchedulerNode.reduction_read_writes  s    
 666GGrm   c                (   U R                  5       (       a  g[        S U R                  5        5       5      (       a  g[        U R                  R
                  5      S:X  a  [        U[        R                  5      (       a  [        [        U R                  R
                  5      5      n[        U[        R                  5      (       d   S[        U5      < 35       eUR                  UR                  :H  =(       a    UR                  UR                  :H  $ g)NFc              3  @   #    U  H  oR                  5       v   M     g 7fry   )r>  r;  s     rj   r   ,SchedulerNode.can_inplace.<locals>.<genexpr>  s     ?,>S  ,>r7  r   ztype(write_dep)=)rj  r   r  r   r   r  r   r(   r2   r3  iterr   r   r   )ri   rt  	write_deps      rj   ru  SchedulerNode.can_inplace  s    ?D,<,<,>???t&&'1,l,,2
 2
 T$"2"2"9"9:;Ii)?)?@@WEUT)_DVBWW@>>Y__4X)..9XXrm   c                8   [        5       n[        U R                  [        5      (       a  U R                  R	                  5        H  nUR
                  S:X  d  M  UR                  S:X  d  M'  SUR                  ;   a  UR                  S   S:X  d0  [        UR                  5      S:X  d  Me  UR                  S   S:X  d  Mz  UR                  SUR                  ;   a  UR                  S   O)[        UR                  5      S:  a  UR                  S	   OS
5        M     U$ )Ncall_methodstoremode
atomic_addrm  r  r   r   r   r  )r   r   r  r>   r   r1  r  r  r   rL  r  )ri   buffers_store_as_atomic_addr   s      rj   _get_atomic_add_buffers%SchedulerNode._get_atomic_add_buffers  s    7A|#djj(++

,,.GG},w.4;;.4;;v3F,3V		Na/DIIaLL4P 033!T[[0 F+.1$))n.Adiilr / +*rm   c                |   > U R                   b!  U R                   R                  S5      (       a  g[        TU ]  5       $ )Ndevice_assert_asyncT)r  has_opr  rx  ri   r  s    rj   rx  SchedulerNode.has_side_effects  s5     ::!djj&7&78M&N&Nw'))rm   )r  r  r   )r(  r'  r   z+Union[ir.ComputedBuffer, ir.TemplateBuffer]r   rs  NN)r  *Optional[tuple[dict[Any, Any], list[Any]]]r  zOptional[Callable[_P, _T]]r   rs  )r  rW  r  zOptional[Callable[..., Any]]r   rs  )r  rx   r  rx   r   rs  )r  Sequence[int]r   rs  rr  r   r\   )r  r!  r  r!  r   rs  r=  rp  )r   Sequence[Sequence[sympy.Expr]]rt  rH  )r#  Sequence[sympy.Expr]r   rs  )r#  rZ  r   zdict[sympy.Expr, sympy.Expr])r#  rZ  r   rs  rE  )r7  rx   r   r  )r   r  rD  r@  )$r{   r|   r}   r~   r#  r   r  r  r  r  r  r  r  r  r  r  r  r  r  r   rd  rg  rj  r&  r$  r+  r  r:  rF   r  rA  ru  rO  rx  r   r  r  s   @rj   r   r     s   
 -,O : 
	 RV@D$N $> 
	F RVBF
$N
 $@
 
	
;;8<;	;<QI"PP),P	P"
Q!.7	. ,
7
8O!
8	%0 !%	
	
	 	
 G G H H + +& * *rm   r   c           	     z  ^  T R                   nT R                  [        R                  R	                  U Vs/ s H  o"R
                  PM     sn5      5        [        U 4S j[        R                  " U Vs/ s H  o"R                  PM     sn6  5       5      T R
                  R                  -
  T l        g s  snf s  snf )Nc              3  h   >#    U  H'  nUR                   TR                  5       ;  d  M#  Uv   M)     g 7fry   r   r=  )r   r   group_snodes     rj   r   2refresh_group_node_dependencies.<locals>.<genexpr>+  s/      
Pxx{;;== CP   "2	2)
r   r  r(   
ReadWrites
merge_listr   r   unionr  r  )r_  r   r  s   `  rj   refresh_group_node_dependenciesre  "  s     F**6+J6aMM6+JK
 	 
!'')O1*>*>)OP
 	

 
!
!
(
(	) " ,K *Ps   B34B8r'  c                   [        U [        [        45      (       d   eX l        Xl        S U l        [        R                  " U Vs/ s H  o3R                  c  M  UR                  PM     sn6 U l        [        U 5        [        S U R                   5       5      U l        [        S U R                   5       5      U l        U R                  5        Vs0 s H  oDR                  5       U_M     snU l        g s  snf s  snf )Nc              3  8   #    U  H  oR                   v   M     g 7fry   r  r   r  s     rj   r   "init_group_node.<locals>.<genexpr>C       H5G5Gr  c              3  8   #    U  H  oR                   v   M     g 7fry   )r  ri  s     rj   r   rj  D  rk  r  )r   r   GroupedSchedulerNoder   r(  r   r   rd  r   re  r  r  r  r  r  r0  r  )r_  r(  r   r  r   s        rj   init_group_nodern  4  s    
 k$68L#MNNNN%K&,,%	Av!+!++v	AK $K0H[5G5GHHKH[5G5GHHK'2'>'>'@#'@'@#K 
B#s   C4C4C9c                    ^  \ rS rSr% SrS\S'   \      S#S j5       rS$S jrS%S jr	\
S&S j5       r      S'S	 jrS(U 4S
 jjr\
S)S j5       rS)S jr\
S*S j5       rS+S jrS)S jrS)S jr      S,U 4S jjr\
S*S j5       r\
S*S j5       rS-S jrS)S jr\
S.S j5       r\
S.S j5       r\
S.S j5       r\
S.S j5       r\
S/S j5       rS0S jr\
S.S j5       rS1S jr S2S jr!S3S jr"S)S  jr#\
S.U 4S! jj5       r$S"r%U =r&$ )4r   iJ  z
This is a "fake" scheduler node that represents a group of scheduler nodes
that are meant to be fused together. The way it does this is by maintaining
its unmet dependencies as the union of its constituent nodes.
rI  r   c           	        UR                   UR                   L d   e[        U[        [        45      (       d   eUR	                  5       (       Ga  [        U[
        5      (       Ga  [        UR                  [        5      (       d   e[        UR                  R                  5      S:X  d   e[        [        [        UR                  R                  5      5      [        5      (       d   e[        [        UR                  R                  5      5      R                  nUR                  5        Vs/ s H  oDR	                  5       (       d  M  UPM     nn[        U5      S:X  d   eUS   n[        UR                  R                  5      S:X  d   e[        [        UR                  R                  5      5      n[        U[         5      (       d   e[#        [!        X7R$                  UR&                  UR(                  UR*                  5      /5      UR                  l
        O[        U[        [        45      (       d   e[-        [.        R0                  " UR                  5       UR                  5       5      5      nU " UR                   U5      $ s  snf )Nr   r   )r(  r   r   r   rj  r  r   r;   r   r   r  r3  rF  r3   r   r   r2   r   r   	var_namesr   rL  r  r  r  )	rp   r   r   r   r   template_nodesr8  writer5  s	            rj   rq   FusedSchedulerNode.fuseS  s    %//111%-1C!DEEEE:e5N#O#O ejj+6666u((//0A555d4(9(9(@(@#ABGLLLLU..5567<<D/4/@W/@tDTDTDVd/@NW~&!+++*1-M}00778A===m77>>?@EeY////'1kk5??EJJ

(E$ em5G%HIIIIY__U__%68IJK5??E**! Xs   ,JJc                    U R                    HA  n[        U[        5      (       d   eUR                  5       (       d   eUR	                  5         MC     U $ ry   )r   r   r   r   r  ri   r   s     rj   r  ,FusedSchedulerNode.extract_pw_from_reductionu  sK    {{Gg}5555''))))--/ # rm   c                x    U R                    H*  n[        U[        5      (       d   eUR                  5         M,     g ry   )r   r   r   r  rv  s     rj   r  (FusedSchedulerNode.swap_pw_red_dimension|  s/    {{Gg}5555))+ #rm   c                    [        [        S S U R                  5        5       5      5      n[        U5      S:X  a  g [	        U5      nU$ )Nc              3     #    U  HA  nUR                  5       (       d  UR                  5       (       d  M/  UR                  5       v   MC     g 7fry   rj  rm  r   r5  s     rj   r   4FusedSchedulerNode.estimate_flops.<locals>.<genexpr>  =       0''))T^^-= *D'')) 0
   .AAr   r  filterr   r   r   ri   fpsr  s      rj   r   !FusedSchedulerNode.estimate_flops  K      $ 0	
 s8q=#h
rm   c                   U R                  5       (       a  gSnU R                   Hh  n[        U[        5      (       d   eUb<  [	        U5      [	        UR
                  S   5      :w  a  [        R                  S5          gUR
                  S   nMj     SnUc   e[        U5      UR                  s=:X  a  UR                  :X  a  O  OUR                  U5      nU(       d%  [        R                  SU R                  5       5        g[        =R                  S-  sl        [        R                  SU R                  5       U5        U R                   H+  n[        U[        5      (       d   eUR                  U5        M-     [        U 5        g)	z0
Return true if a loop reordering is performed.
FNr   z1Can not reorder fused node due to different sizeszODont reordering fused node %s because we can not decide the suitable loop orderr   z-Reorder loops for fused node %s with order %sT)rj  r   r   r   r   r  r  r  r   r  r  r0  r*   r  r  re  )ri   r  r  r	  r\  r  s         rj   r  ,FusedSchedulerNode.reorder_loops_by_dep_pair  sK    
[[Ee]3333%%
*;uU\\RS_?U*U!''G aJ ! 	%%%z?h//E93E3EE ;;IFI##a ##q(#;T]]_i	
 [[Ee]3333&&y1 ! 	(-rm   c                ~   > [         TU ]  U5        [        XU5        / U l        [	        US S9R
                  U l        g )Nc                4    [        U R                  5       5      $ ry   )r!  r   rQ  s    rj   r  -FusedSchedulerNode.__init__.<locals>.<lambda>  s    s1>>3C/Drm   r  )r  r  rn  r-  r  r   )ri   r(  r   r  s      rj   r  FusedSchedulerNode.__init__  s6    #0%'
%DEKK
rm   c                ~    SR                  U R                   Vs/ s H  oR                  5       PM     sn5      $ s  snf N_r  r   r0  ri   r  s     rj   r0  FusedSchedulerNode.get_name  +    xxt{{;{!{;<<;   :c                <    U R                   S   R                  5       $ r   r   r0  rh   s    rj   r1  !FusedSchedulerNode.get_first_name      {{1~&&((rm   c                    [         R                  " U R                   Vs/ s H  oR                  5       PM     sn6 $ s  snf ry   r   rd  r   r=  r  s     rj   r=  #FusedSchedulerNode.get_buffer_names  0    !L1"4"4"6!LMM!L   <c                n    / nU R                    H"  nUR                  UR                  5       5        M$     U$ ry   r   r  r  ri   rC  r   s      rj   r  FusedSchedulerNode.get_outputs  /    (*KKDMM$**,-  rm   c           
        [        U R                  5       VVs/ s H+  u  pU R                  5        SU SUR                  5        3PM-     nnnU R                  S   R                  nUb  UR                  U R                  5       5        [        R                  " SR                  U5      R                  5       S5      $ s  snnf )Nz.snodes[z] =
r   r  r  )r4  r   r0  rE  r   r  r  r  rA  r  r  )ri   r1  r   r  s       rj   r  "FusedSchedulerNode.debug_str_extra  s     %T[[1
1 }}xs%0@/AB1 	 
 {{1~""LL3356tyy/668&AA
s   2B=c                l    U R                    Vs/ s H  oR                  5       PM     nnU  SU 3$ s  snf )Nz
, snodes: )r   r  )ri   r   
snodes_strs      rj   r  "FusedSchedulerNode.debug_str_short  s8    9=E**,
Ez*.. Fs   1c                   > [         TU ]  X5        [        5       n[        U R                  5       H/  nUR                  X5        UR                  UR                  5        M1     g ry   )r  r  r   r   r   updater~  )ri   r  r  r   r  s       rj   r  !FusedSchedulerNode.set_last_usage  sQ    
 	2G 0:|T[[)D 3H&&t7 *rm   c                    [         R                  " U R                   Vs/ s H  oR                  5       PM     sn6 $ s  snf ry   )r   rd  r   r   r  s     rj   r   $FusedSchedulerNode.used_buffer_names  s0    !MA"5"5"7!MNN!Mr  c                    [         R                  " U R                   Vs/ s H  oR                  5       PM     sn6 $ s  snf ry   )r   rd  r   r  r  s     rj   r  /FusedSchedulerNode.used_or_aliased_buffer_names  s5    8<D1,,.D
 	
Dr  c                    U R                   $ ry   r  rh   s    rj   r   FusedSchedulerNode.get_nodes  r  rm   c                T    [        U 5      R                   SU R                  5        S3$ )Nz(nodes=r  r  rh   s    rj   r  FusedSchedulerNode.__repr__  s'    t*%%&gdmmo->a@@rm   c                :    [        S U R                   5       5      $ )Nc              3  @   #    U  H  oR                  5       v   M     g 7fry   )r   ri  s     rj   r   2FusedSchedulerNode.is_reduction.<locals>.<genexpr>  s     9[>>##[r7  r   r   rh   s    rj   r   FusedSchedulerNode.is_reduction   s    9T[[999rm   c                :    [        S U R                   5       5      $ )Nc              3  @   #    U  H  oR                  5       v   M     g 7fry   )rd  ri  s     rj   r   6FusedSchedulerNode.is_native_matmul.<locals>.<genexpr>  s     =A%%''r7  r  rh   s    rj   rd  #FusedSchedulerNode.is_native_matmul  s    ====rm   c                :    [        S U R                   5       5      $ )Nc              3  @   #    U  H  oR                  5       v   M     g 7fry   )rg  ri  s     rj   r   3FusedSchedulerNode.is_split_scan.<locals>.<genexpr>
  s     :k??$$kr7  r  rh   s    rj   rg   FusedSchedulerNode.is_split_scan  s    :dkk:::rm   c                :    [        S U R                   5       5      $ )Nc              3  @   #    U  H  oR                  5       v   M     g 7fry   r0  ri  s     rj   r   1FusedSchedulerNode.is_template.<locals>.<genexpr>  s     8Kq==??Kr7  r  rh   s    rj   rj  FusedSchedulerNode.is_template  s    8DKK888rm   c                x    U R                    H*  nUR                  5       (       d  M  UR                  5       s  $    g ry   )r   rj  r&  ri   r   s     rj   r&  $FusedSchedulerNode.get_template_node  s3    KKD!!--//   rm   c                     U R                   S   $ r   )r   rh   s    rj   r   FusedSchedulerNode.get_device  s    zz!}rm   c                :    [        S U R                   5       5      $ )Nc              3  @   #    U  H  oR                  5       v   M     g 7fry   )r  ri  s     rj   r   >FusedSchedulerNode.has_aliasing_or_mutation.<locals>.<genexpr>  s     EA--//r7  r  rh   s    rj   r  +FusedSchedulerNode.has_aliasing_or_mutation  s    EEEErm   c                    [         ery   NotImplementedError)ri   r  s     rj   r  'FusedSchedulerNode.update_mutated_names       !!rm   c                    [         ery   r  )ri   r   s     rj   r  FusedSchedulerNode.add_fake_dep#  r  rm   c                    [         ery   r  rs  s     rj   ru  FusedSchedulerNode.can_inplace&  r  rm   c                X   U R                  5       nSR                  S U R                   5       5      n[        5       nUR	                  U S[        U 5      R                   SU SU S[        U R                  R                  5       SU S[        U R                  5       SU S	[        U R                  R                  U R                  -
  5       SU S
35        UR                  5          U R                  5        H"  nUR	                  UR                  5       5        M$     SSS5        UR                  S5         UR	                  U R!                  5       5        UR)                  5       R+                  5       $ ! , (       d  f       N]= f! ["         a    [$        R'                  SSS9   NOf = f)r  r:  c              3  L   #    U  H  n[        U5      R                  v   M     g 7fry   )r   r{   rC  s     rj   r   /FusedSchedulerNode.debug_str.<locals>.<genexpr>,  s     F+QQ 0 0+s   "$r9  r  r  r  r  r  r  z.outputs = [
            Nr;  r  Tr  )r0  r  r   rO   r  r   r{   r?  r   r  r  r   rA  r  rE  r<  r  r  r  r  rB  r  )ri   r   node_typestrr   r   s        rj   rE  FusedSchedulerNode.debug_str)  sx   }}xxF$++FF

bd		Q|n -j))0012 3WT%<%<=> ?74#3#3#9#9D<S<S#STU V 	
 ZZ\'')

3==?+ *  	c	HJJt++-.  '')) \  	HKK7$KG	Hs   )7E7:F 7
FF)(F)c                r   > U R                   b  [        S U R                    5       5      $ [        TU ]  5       $ )Nc              3  @   #    U  H  oR                  5       v   M     g 7fry   )rx  r5  s     rj   r   6FusedSchedulerNode.has_side_effects.<locals>.<genexpr>F  s     G;4,,..;r7  )r   r   r  rx  rT  s    rj   rx  #FusedSchedulerNode.has_side_effectsC  s0    ;;"G4;;GGGw'))rm   )r   r-  r   r\   r   r\   r   r   rY  rr  rF  r=  )r(  r'  r   rI  r   rs  rp  r@  r   r  r?  rC  rt  rH  )r   torch.devicer>  )r   r1   r   rs  rD  )'r{   r|   r}   r~   r#  r   r   rq   r  r  rF   r   r  r  r0  r1  r=  r  r  r  r  r   r  r   r  r   rd  rg  rj  r&  r   r  r  r  ru  rE  rx  r   r  r  s   @rj   r   r   J  s    $#+%+.?+	+ +B,
  "(!(.7(	(TL = =) N N	B/8#28HV8	8 O O 
 

A : : > > ; ; 9 9   F F
"""*4 * *rm   r   c                  V   ^  \ rS rSrSU 4S jjr      SS jrS	S jrS	S jrSrU =r	$ )
FusedMixOrderReductionsiJ  c                l  > [         R                  U5      (       d  [         R                  U5      (       d   eX!p!Xl        X l        [        TU ]  UR                  [        UR                  5       5      [        UR                  5       5      -   5        [         R                  U R                  5      U l
        g ry   )r   r   r   r   r  r  r(  r  r   r   numel)ri   r   r   r  s      rj   r   FusedMixOrderReductions.__init__K  s     33E::$77>>>> 5

OOT%//"34tEOO<M7NN	
 '00<
rm   c           	     H   [        U[        5      (       a   e[        U[        5      (       a   eU R                  R                  XSS9(       d  g[        R                  U5      (       a  [        R                  U5      (       d  gSS jn    SS jnU(       a/  U" X45      U" U5      -  (       d  U" U5      U" X45      -  (       a  gUR                  5       (       + =(       d@    [        R                  " [        U R                  R                  XSS95      U R                  :  $ )z
node1 is from the current mix order reduction; node2 is another node we want to fuse in.

other_nodes are passed in to check if fusion will introduce producer/consumer relationship
between the inner and outer reduction. If yes, we don't fuse.
Fallow_mix_order_reductionc                B    [        5       nUR                  " S U  5       6 $ )Nc              3  8   #    U  H  oR                   v   M     g 7fry   )r   rC  s     rj   r   TFusedMixOrderReductions.sub_node_can_fuse.<locals>._get_ancestors.<locals>.<genexpr>u  s     :Eq{{Er  r   rd  r5  r   s     rj   _get_ancestorsAFusedMixOrderReductions.sub_node_can_fuse.<locals>._get_ancestorss  s    ,C99:E:;;rm   c                B    [        5       nUR                  " S U  5       6 $ )Nc              3  @   #    U  H  oR                  5       v   M     g 7fry   )r   rC  s     rj   r   ZFusedMixOrderReductions.sub_node_can_fuse.<locals>._get_operation_names.<locals>.<genexpr>{  s     F14466r7  r  r  s     rj   _get_operation_namesGFusedMixOrderReductions.sub_node_can_fuse.<locals>._get_operation_namesw  s      ,C99FFGGrm   )count_bytes)r5  tuple[BaseSchedulerNode, ...]r   r}  )r   r  r(  r   r   r   r   typingcastr!  score_fusion_memoryr  )ri   r   r   other_nodesr  r  s         rj   sub_node_can_fuse)FusedMixOrderReductions.sub_node_can_fuseW  s    e%<====e%<====
 ~~&&uu&U //
 
#66u==	<	H0	H	H ~.1Ek1RR{+.BE>.RR ""$$ {{T^^77RW7X zz	
rm   c                   [        U[        5      (       dU  U R                  U R                  XR                  45      =(       d'    U R                  U R                  XR                  45      $ U R                  U R                  UR                  U R                  UR                  45      =(       a/    U R                  U R                  UR                  [        5       5      $ ry   )r   r  r  r   r   r   ri   others     rj   can_fuse_with%FusedMixOrderReductions.can_fuse_with  s    %!899))

EJJ= J''

EJJ=IJ ))

EKK$**ekk)B K((U[[%'JKrm   c                b   U R                   R                  5       nU R                  R                  U5      n[	        U[
        5      (       aW  UR                  U R                   UR                   5      nUR                  U R                  UR                  5      n[        XE5      $ U R                  U R                   XR                  45      (       a1  UR                  U R                   U5      n[        X`R                  5      $ UR                  U R                  U5      n[        U R                   U5      $ ry   )	r   r   r(  r  r   r  rq   r   r  )ri   r  r[  backendfused_node1fused_node2r  s          rj   	fuse_with!FusedMixOrderReductions.fuse_with  s    &&(..,,V4e455!,,tzz5;;?K!,,tzz5;;?K*;DD%%djj%**GG$\\$**e<
.z::FF$\\$**e<
.tzz:FFrm   )r   r   r  r  )r   r\   r   r\   r  r  )r  r\   )
r{   r|   r}   r~   r  r  r   r  r   r  r  s   @rj   r  r  J  s<    
=2
 2
 !2
 3	2
h
KG Grm   r  c                  z  ^  \ rS rSr% Sr    SS jr    SS jr\SS j5       r\      SS j5       r	   S             SU 4S jjjr
\    SS j5       r\    SS	 j5       r\rS
\S'   \    SS j5       r\    SS j5       rSS jrSS jrS S jrS!S jrS"S jrS#S jr    S$S jrSrU =r$ )%ForeachKernelSchedulerNodei  z
This is a schedular node that consists of a set of scheduler nodes that
has no data dependencies among them and can be executed in parallel.
c                    UR                  5        H@  nUR                  5       U R                  ;   d  M#  U R                  UR                  5          s  $    g ry   )r  r0  read_to_node)ri   producerr   s      rj   get_consumer_subnode_for3ForeachKernelSchedulerNode.get_consumer_subnode_for  sG     '')C||~!2!22((88 * rm   c                   [         [           " 5       nUR                  R                   H  nUR                  U R
                  R                  ;  a  M)  U R
                  R                  UR                     R                  5       nX@R                  ;   d  Mk  UR                  U R                  U   5        M     [        U5      S:X  a  [        [        U5      5      $ g Nr   )r   r\   r   r   r   r(  rX  r2  name_to_noder  r   r3  rF  )ri   consumer	producersrd	node_names        rj   get_producer_subnode_for3ForeachKernelSchedulerNode.get_producer_subnode_for  s     013	&&,,Bwwdnn88822277;LLNI---d//	:; - y>QY((rm   c                  ^ [        TU5      nTR                  5       (       a  UR                  5       (       a  [        R                  " [        T5      m[        R                  " [        U5      n[        TR                  5      [        UR                  5      :H  nU(       d  U" S5        U=(       a3    [        U4S j[        TR                  UR                  5       5       5      $ UR                  5       (       ar  TR                  5       (       a	  U" S5        g[        R                  " [        U5      nUR                  T5      nUb  UR                  R                  TU5      $ U" S5        gTR                  5       (       aq  UR                  5       (       a	  U" S5        g[        R                  " [        T5      mTR                  U5      nUb  TR                  R                  Xb5      $ U" S5        g[        S5      e)	Nzforeach do not have same lengthc              3  ^   >#    U  H"  u  pTR                   R                  X5      v   M$     g 7fry   )r(  r   )r   lrr  s      rj   r   6ForeachKernelSchedulerNode.can_fuse.<locals>.<genexpr>  s.      )ADA ""++A11A   *-zXcandidate producer is a reduction, foreach ops cannot be fused with reductions currentlyFz5candidate producer is not dep of any foreach consumerzXcandidate consumer is a reduction, foreach ops cannot be fused with reductions currentlyz5candidate consumer has no dep in any foreach producerzXAt least one node passed to ForeachKernelSchedulerNode.can_fuse should be a foreach node)r{  rp  r  r  r	  r   r   r   r)  r   r  r(  r   r  r  )rp   r  r  whyforeach_matchconsumer_subnodeproducer_subnodes    `     rj   r   #ForeachKernelSchedulerNode.can_fuse  s   (+  X%8%8%:%:{{#=xHH{{#=xHH0C4HHM 56  S )A) &    ""$$&&n {{#=xHH'@@J+))228=MNNGH  ""$$&&n {{#=xHH'@@J+))223CNNGHf
 	
rm   c           	     `   UR                  5       (       d  UR                  5       (       d   eUR                  5       (       a4  [        R                  " [        U5      nUR                  nUR
                  nO3[        R                  " [        U5      nUR                  nUR
                  nS nS nUR                  5       (       a  UR                  5       (       a  [        R                  " [        U5      n[        R                  " [        U5      n[        UR                  UR                  5       VVs/ s H  u  px[        R                  Xx5      PM     n	nnGO?UR                  5       (       a  [        R                  " [        U5      nUR                  U5      n
/ n	UnS nUR                   HB  nXL a*  [        R                  X5      nUnU	R                  U5        M1  U	R                  U5        MD     OUR                  5       (       a  [        R                  " [        U5      nUR                  U5      n/ n	UnS nUR                   HB  nXL a*  [        R                  X5      nUnU	R                  U5        M1  U	R                  U5        MD     O[        S5      eU " UR                  U	UUUUS9$ s  snnf )NzTAt least one node passed to ForeachKernelSchedulerNode.fuse should be a foreach node)use_custom_partition_algoprev_node_1prev_node_2enable_autotune)rp  r  r  r	  r$  r'  r)  r   r   rq   r  r   r  r  r(  )rp   r  r  r$  r'  r%  r&  r  r  fused_nodesr!  r   new_noder   s                 rj   rq   ForeachKernelSchedulerNode.fuse  s\    ""$$(;(;(=(===  {{#=xHH(0(J(J%&66O{{#=xHH(0(J(J%&66O  X%8%8%:%:{{#=xHH{{#=xHH  AADA #''-A  K   ""{{#=xHH'@@JK"KK +166tFH"*K&&x0&&t, (   ""{{#=xHH'@@JK"KK +166xFH"*K&&x0&&t, ( !f  &?##+
 	
Ks   0!J*c                  >^  0 T l         0 T l        Ub  Ucv  [        TT ]  X5        U H_  nUR                  R
                   H  nUT R                   UR                  '   M     UR                  5        H  n	UT R                  U	'   M     Ma     GOUT l        UT l	        S T l
        / T l        T R                  [        R                  R                  UR                  UR                  /5      5        [!        U 4S j[         R"                  " UR$                  UR$                  5       5       5      T R                  R&                  -
  T l        [)        UR*                  UR*                  /5      T l        [-        UR.                  UR.                  /5      T l        UR1                  5       (       a  [3        U[4        5      (       d   eXEpO[3        U[4        5      (       d   eXTpU
R6                  T l        T R6                  R9                  UR6                  5        U
R                  T l        UR                  5        H  n	UT R                  U	'   M     T R                   VVVs0 s H(  oR:                  R=                  5         H  u  pX_M	     M*     snnnT l        UT l        US   RA                  5       nU(       d   eU[B        RD                  " S5      444T l#        [         [H        RJ                  RL                     " 5       T l'        UT l(        g s  snnnf )Nc              3  h   >#    U  H'  nUR                   TR                  5       ;  d  M#  Uv   M)     g 7fry   r^  r  s     rj   r   6ForeachKernelSchedulerNode.__init__.<locals>.<genexpr>\	  s5        xxt'<'<'>>	 C ra  r   combo_kernel))r  r  r  r  r   r   r   r   r(  r   r   r-  r  r(   rb  rc  r   rd  r  r  r  r  r  r  rp  r   r	  r   r  r  itemsr$  r   r   Exprr   r  fxNoder  r'  )ri   r(  r   r$  r%  r&  r'  r   r  r   foreach_noder   r\  r  vr[  r  s   `               rj   r  #ForeachKernelSchedulerNode.__init__:	  s    +"5GY/ ,,22D37D%%dii0 3 !446D.2D%%d+ 7	  'DN DKDI)+DJ  ''22 ,,k.E.EF  )//#668V8V   ""))* # !+"7"79N9N!OPDN +"7"79N9N!OPDN%%''!+/IJJJJ+6j!+/IJJJJ+6j)33DNNN!!*"6"67 , 9 9D"668*4!!$' 9 #'++@"-:O:O:U:U:W$!:W+@D  *C&%%'v

> :<>?
!%((--02.@s   /Lc           
        U Vs/ s H  n[        U[        5      (       d  M  UPM     nnU(       aW  [        R                  S[	        U5      U Vs/ s H+  oDR
                  c  M  UR
                  R                  5       PM-     sn5        U Vs/ s H  n[        U[        5      (       d  M  UPM     nnU(       a  [        R                  S[	        U5      5        U Vs/ s H  n[        U[        5      (       d  M  UPM     nnU(       a  [        R                  S[	        U5      5        U Vs/ s H,  n[        U[        [        [        [        45      (       a  M*  UPM.     nnU Vs/ s H  n[        U[        5      (       d  M  UPM     nnU(       a  [        R                  S[	        U5      5        U Vs/ s H  n[        U[        5      (       a  M  UPM     nnU Vs/ s H  o"R                  5       (       d  M  UPM     n	nU	(       a   [        R                  S[	        U	5      U	5        U Vs/ s H  o"U	;  d  M
  UPM     nn[        R                  (       av  U Vs/ s H  o"R                  5       (       d  M  UPM     n
nU
(       a  [        R                  S[	        U
5      5        U Vs/ s H  o"R                  5       (       a  M  UPM     nnU$ s  snf s  snf s  snf s  snf s  snf s  snf s  snf s  snf s  snf s  snf s  snf )Nz/ComboKernels: %d external nodes are filtered %sz+ComboKernels: %d grouped nodes are filteredz;ComboKernels: %d FusedMixOrderReductions nodes are filteredz+ComboKernels: %d foreach nodes are filteredz0ComboKernels: %d template nodes are filtered: %szCComboKernels: %d reduction nodes are filtered (pointwise_only mode))r   r  r  r  r   r   r  rm  r  r  r	  rj  r&   combo_kernels_pointwise_onlyr   )rp   r5  r  externr   grouped	mix_orderfiltered_nodesforeach_nodesrr  reduction_nodess              rj   combinable_nodes+ForeachKernelSchedulerNode.combinable_nodes	  sl    #OUj4M&N!UOIIAF5;UVTyy(&&(VU
 $Kez!5I'J1eKII=G !&P1A7N)OQ	PIIMI 
*-(+	  	 
 &
%!A7Q)RA~ 	 
 IICSEWX%
%!Z;U-VA~ 	 
 &4G^}}!^GIIBN#
 &4O^7N!^O ..*8M.QNN<Lq.OM		Y( *8PA~~?OaNPy P
 VK Q



 H P N Qs   KKK#KK)KK8K*)K K #K% K%2K*K*K/7K/*	K47K4K94K9&K>K>c                   U R                  5       n/ nSn[        U VVVs/ s H>  nU  H4  n[        U[        5      (       d  M  UR	                  5         H  nUPM     M6     M@     snnn5      nU H  n[        [        5      n	U Hi  nUR                  5       n
U
(       a"  U
R                  S:X  d  U
R                  S:X  a  M<  UR                  5       U-  (       a  MV  X   R                  U5        Mk     U	R                  5        H=  nUR                  [        S[        U5      U5       Vs/ s H	  nXX-    PM     sn5        M?     M     U$ s  snnnf s  snf )zC
Returns a list of lists of nodes that are to be grouped together.
   mpsrY  r   )_topological_sort_nodesr   r   r  r=  r   r  r   r   r   r   r   r  r  r   )r(  sorted_nodesgrouped_nodesmax_num_nodesr   r   rR  excluded_buffer_namesr5  device_groupsr[  device_nodesr1  s                rj   &_default_group_nodes_for_combo_kernelsAForeachKernelSchedulerNode._default_group_nodes_for_combo_kernels	  sZ    !88:1; *)E!Dd$;<  !% 5 5 7H	  !8	 ! )2
 "E D!  *v{{e3v{{e7K ))+.CC%,,T2  !. 4 4 6$$ "'q#l*;]!K!KA %):;!K !7! ". ?4s   E"E'E4Callable[[Scheduler], list[list[BaseSchedulerNode]]]!group_algorithm_for_combo_kernelsc                    U [         l        g ry   r	  rM  )custom_group_algorithms    rj   %set_group_algorithm_for_combo_kernels@ForeachKernelSchedulerNode.set_group_algorithm_for_combo_kernels	  s    
 # 	#Drm   c                ,    [         R                  U 5      $ ry   rO  r(  s    rj   group_nodes_for_combo_kernels8ForeachKernelSchedulerNode.group_nodes_for_combo_kernels	  s     *KKIVVrm   c                    [         ery   r  rh   s    rj   r  #ForeachKernelSchedulerNode.mark_run
  r  rm   c                    [         ery   r  rh   s    rj   r  "ForeachKernelSchedulerNode.codegen
  r  rm   c                    gr  rw   rh   s    rj   rp  %ForeachKernelSchedulerNode.is_foreach	
  r(  rm   c                ,    [        U R                  5      $ )z]Returns a list of nodes which comprise the combo kernel.
These nodes may be vertically fused.)r  r   rh   s    rj   get_subkernel_nodes.ForeachKernelSchedulerNode.get_subkernel_nodes
  s     DKK  rm   c                t    [        [        R                  R                  S U R                   5       5      5      $ )ziReturns all nodes contained in this kernel, unpacking fused nodes
into their constituent scheduler nodes.c              3  @   #    U  H  oR                  5       v   M     g 7fry   )r   ri  s     rj   r   7ForeachKernelSchedulerNode.get_nodes.<locals>.<genexpr>
  s     1UA++--r7  )r  r  r  r*  r   rh   s    rj   r   $ForeachKernelSchedulerNode.get_nodes
  s(     IOO111U1UUVVrm   c                <    U R                   S   R                  5       $ r   )r   r1  rh   s    rj   r1  )ForeachKernelSchedulerNode.get_first_name
  s    {{1~,,..rm   c                    [        XU R                  R                  5        U R                   H  nUR	                  U5        M     g ry   )r(  r(  rX  r   r*  )ri   r)  r   s      rj   r*  /ForeachKernelSchedulerNode.prune_redundant_deps
  s5     	d8R8RSKKD%%&89  rm   )r   r'  r   r  r  r  r   r  r  r  r(  r   r  r$  r-  )r  r\   r   r)  )r  r\   r   r)  r  r\   r  r\   r   rx   )r  r\   r  r\   r   r	  )NNF)r(  r'  r   rI  r$  rx   r%  r)  r&  r)  r'  rx   r   rs  r5  rI  r   rI  )r(  r'  r   list[list[BaseSchedulerNode]])rP  rL  r   rs  rr  rt  r   rI  rC  rp  rA  )r{   r|   r}   r~   r#  r  r  r   r   rq   r  r>  r$  rJ  rM  r   rQ  rU  r  r  rp  r^  r   r1  r*  r   r  r  s   @rj   r	  r	    s   
)	$)	$& ,
 ,
\ >
(>
4E>
	#>
 >
J 4837 %F/F/ (F/ $(	F/
 1F/ 1F/ F/ 
F/ F/P ?+?	 ? ?B **	&* *\ 	/ & ( / 
 T
	
 
 WW	&W W
""!
W
/:">:	: :rm   r	  c                     ^  \ rS rSr% SrS\S'   \SS j5       r S       SU 4S jjjrSS jr	SS jr
\SS	 j5       rSS
 jr\SS j5       rSS jr\SS j5       rSS jrSS jr\SS j5       rSrU =r$ )rm  i"
  a'  
This is a "fake" scheduler node that represents a group of scheduler nodes
that are meant to be *grouped* together (it does not allow another node to be scheduled
in between its constituent nodes, nor does it allow another node to fuse into any of its constituent nodes).
The way it does this is by maintaining its unmet dependencies as the union of its constituent nodes.
Fusion will still happen among the nodes within each GroupedSchedulerNode.
At codegen time, this scheduler node will be unpacked and codegen is called on each constituent node.
rI  r   c                   ^ US   R                   m[        U4S jU 5       5      (       d   eU " TU5      nU H   nUTR                  UR                  5       '   M"     UTR                  UR                  5       '   U$ )Nr   c              3  >   >#    U  H  oR                   TL v   M     g 7fry   rT  )r   r   r(  s     rj   r   .GroupedSchedulerNode.create.<locals>.<genexpr>1
  s     B64>>Y.6s   )r(  r   r)  r0  )rp   r   grouped_snoder\  r(  s       @rj   createGroupedSchedulerNode.create.
  su    1I''	B6BBBBBIv.E=JI(()9: AN	$$]%;%;%=>rm   c                H   > [         TU ]  U5        [        XU5        X0l        g ry   )r  r  rn  temp_grouping)ri   r(  r   rt  r  s       rj   r  GroupedSchedulerNode.__init__8
  s$     	#0 +rm   c                B   U R                   (       a  U R                  $ U R                   H)  nXR                  R                  UR	                  5       '   M+     U R                  R                  U R	                  5       	 U R                  R                  U R                  5      $ )zw
Do fusion among nodes within this GroupedSchedulerNode,
and then unpack this GroupedSchedulerNode into regular nodes.
)rt  r   r(  r)  r0  
fuse_nodes)ri   r\  s     rj   unpackGroupedSchedulerNode.unpackG
  so    
 ;;[[EBGNN--enn.>? !NN--dmmo>~~((55rm   c                    U R                  U R                  R                  U5      5        U R                  R	                  U5        g ry   )r  r   r  r  r  )ri   fake_deps     rj   r  !GroupedSchedulerNode.add_fake_depT
  s5    T--77AB##H-rm   c                ~    SR                  U R                   Vs/ s H  oR                  5       PM     sn5      $ s  snf r  r  r  s     rj   r0  GroupedSchedulerNode.get_nameX
  r  r  c                <    U R                   S   R                  5       $ r   r  rh   s    rj   r1  #GroupedSchedulerNode.get_first_name\
  r  rm   c                    [         R                  " U R                   Vs/ s H  oR                  5       PM     sn6 $ s  snf ry   r  r  s     rj   r=  %GroupedSchedulerNode.get_buffer_names_
  r  r  c                n    / nU R                    H"  nUR                  UR                  5       5        M$     U$ ry   r  r  s      rj   r   GroupedSchedulerNode.get_outputsc
  r  rm   c                    [        [        S S U R                  5        5       5      5      n[        U5      S:X  a  g [	        U5      nU$ )Nc              3     #    U  HA  nUR                  5       (       d  UR                  5       (       d  M/  UR                  5       v   MC     g 7fry   r|  r5  s     rj   r   6GroupedSchedulerNode.estimate_flops.<locals>.<genexpr>o
  r~  r  r   r  r  s      rj   r   #GroupedSchedulerNode.estimate_flopsi
  r  rm   c                    U R                   $ ry   r  rh   s    rj   r   GroupedSchedulerNode.get_nodes{
  r  rm   c                b    U R                   (       a  U R                   S   R                  5       $ S $ r   )r   r   rh   s    rj   r   GroupedSchedulerNode.get_device~
  s$    .2kkt{{1~((*CtCrm   c                    gr  rw   )rp   r  r  s      rj   r   GroupedSchedulerNode.can_fuse
  r  rm   )rt  )r   rI  r   rm  )F)r(  r'  r   rI  rt  rx   r   rs  rk  )r{  r1   r   rs  rp  r@  r  rF  rC  rv  rh  )r{   r|   r}   r~   r#  r   r   rq  r  rx  r  rF   r0  r1  r=  r  r   r   r   r   r   r  r  s   @rj   rm  rm  "
  s     $#  $	++ (+ 	+
 
+ +6. = =) N N  "D  rm   rm  c           
     0  ^ ^ [         R                  SUU 4S jj5       n[        [        [	        [        T S   5      5      5      5      n[        U5      S:  a  U Vs/ s H  nT U   PM
     snm [        R                  (       a  UR                  US9  U$ s  snf )zu
A heuristic to decide loop iteration orders.  This has not been well
tuned and may be something we should autotune.
c                z  > TU    S:X  d	  TU   S:X  a  [        TU    S:H  TU   S:H  5      $ T Vs/ s H  n[        X    5      PM     nnT Vs/ s H  n[        X!   5      PM     nn[        S [        X45       5       5      n[        S [        X45       5       5      nXV:  a  gXe:  a  g[        X5      $ s  snf s  snf )Nr   c              3  F   #    U  H  u  pUS :H  =(       d    X:  v   M     g7fr   Nrw   r   sl_asl_bs      rj   r   5pick_loop_order.<locals>.index_cmp.<locals>.<genexpr>
  $      
7VDAI$$7V   !c              3  F   #    U  H  u  pUS :H  =(       d    X!:  v   M     g7fr  rw   r  s      rj   r   r  
  r  r  r
  )rG   absr   r)  )	rU  bslstride_len_astride_len_ba_firstb_firstr  stride_lengthss	          rj   	index_cmp"pick_loop_order.<locals>.index_cmp
  s    8q=E!HMuQx1}eAh!m44 .<<^rBE
^<-;<^rBE
^<  
7:<7V
 
  
7:<7V
 
  1y# =<s   B3B8r   r  )rU  r!  r  r!  r   r!  )		functools
cmp_to_keyr  r   r  r   r&   pick_loop_orderssort)r  r  priority_idxr  orderpis   ``    rj   pick_loop_orderr  
  s      4 %N1$5 6789E
<17CD|.,|D

y
!L Es   Bc                   UR                  5       nU R                  5       n[        U[        5      (       a  [        U[        5      (       d   eUR                  5       nU R                  5       n[        U[        5      (       a  [        U[        5      (       d   e[        R
                  R                  U	 X1l        [        R
                  R                  U	 XQl	        [        R
                  R                  R                  U 5      n[        R
                  R                  R                  U5        U[        R
                  R                  U'   U[        R
                  R                  U'   [        R
                  R                  R                  U 5      n[        R
                  R                  R                  U5        U[        R
                  R                  U'   U[        R
                  R                  U'   g ry   )r0  r   r  r-  rW   r   r  r   
name_to_opoperation_namebuffersr   remove
operations)	orig_noder)  replaced_buf_nameorig_buf_namereplaced_op_nameorig_op_nameorigs          rj   _replace_operation_bufferr  
  s]    !))+&&(MmS))j9JC.P.PPP224//1LlC((Z8H#-N-NNN	01!M	+,*77??  +DGGOO8$$AGGOOD,4AGG=)77##I.DGGh''AGGt'/AGG|$rm   c                j    UR                  5       nU R                  5       nX4-
  nXT-  nUSU-   -  nXr-  $ r  )r  r  )r   r   epilogue_runtimetotal_read_bytestemplate_write_bytesextra_bytesextra_bytes_ratioextra_memory_ratios           rj    _estimate_fused_epilogue_runtimer  
  sO     224 779"9K#: +a2C.CD00rm   c                  d    \ rS rSr% S\S'   SrS\S'   SrS\S'   SS jrSS	 jrSS
 jr	SS jr
Srg)NodeUseri
  $Union[BaseSchedulerNode, OutputNode]r   Frx   ru  is_weakc                v    [        U R                  R                  5       U R                  U R                  45      $ ry   )r5  r   r0  ru  r  rh   s    rj   r6  NodeUser.__hash__
  s+    TYY'')4+;+;T\\JKKrm   c                    [        U[        5      =(       aa    U R                  5       UR                  5       :H  =(       a9    U R                  UR                  :H  =(       a    U R                  UR                  :H  $ ry   )r   r  r0  ru  r  r  s     rj   __eq__NodeUser.__eq__
  s[    uh' .5>>#33.  E$5$55. -		
rm   c                6    U R                   R                  5       $ ry   rH  rh   s    rj   r0  NodeUser.get_name
  rJ  rm   c                    U R                   UR                   L d   e[        U R                   U R                  =(       a    UR                  U R                  =(       a    UR                  5      $ ry   )r   r  ru  r  r  s     rj   rf  NodeUser.merge
  sP    yyEJJ&&&II2!2!2LL*U]]
 	
rm   rw   Nrq  )r  objectr   rx   rp  )r  r  r   r  )r{   r|   r}   r~   r   ru  r  r6  r  r0  rf  r   rw   rm   rj   r  r  
  s3    
..K GTL
$
rm   r  c                 "    [         R                  $ ry   )r&   rr  rw   rm   rj   *used_non_deterministic_runtime_estimationsr    s    333rm   c                   [        5       nU R                  5       n[        U[        R                  5      (       a  UR                  [        UR                  5      [        UR                  5      -  [        UR                  5      -  5        [        U[        R                  5      (       a$  UR                  [        UR                  5      5        U$ Ub
   SU 35       eU$ )z=Get free symbols from a node's layout (size, stride, offset).z*Expect layout to be None but found layout=)r   maybe_get_layoutr   r)   Layoutr  r    r   strideoffsetr  get_layout_symintsr  )r   free_symbol_usesr=  s      rj   r  r    s    1;""$F&"))$$%6==)*6==)*	

 fb;;<<##$6v}}$EF  ~T!KF8TT~rm   c                .   [        U [        5      (       a(  [        5       R                  " S U R                   5       6 $ U R
                  c   eU R
                  R                  5       nUR                  " S U R
                  R                  5        5       6   U$ )z{
Gets symbols used in a scheduler node, including free symbols from
the node's operations and layout symints from outputs.
c              3  8   #    U  H  n[        U5      v   M     g 7fry   get_scheduler_node_symbol_uses)r   r\  s     rj   r   1get_scheduler_node_symbol_uses.<locals>.<genexpr>   s     M,U33r  c              3  8   #    U  H  n[        U5      v   M     g 7fry   )r  )r   ir_nodes     rj   r   r  %  s     	M5L'
W
%
%5Lr  )	r   r   r   rd  r   r   get_free_symbol_usesr  r  )r   r  s     rj   r  r    s     $*++|!!MM
 	
 99   yy557	MTYY5J5J5L	M rm   c                    U R                  5       =(       a,    [        R                  =(       a    UR                  5       (       + $ ry   )rj  r&   epilogue_fusionr   s     rj   is_epilogue_fusionr  *  -    U6#9#9U%BSBSBU>UUrm   c                    UR                  5       =(       a,    [        R                  =(       a    U R                  5       (       + $ ry   )rj  r&   prologue_fusionr   s     rj   is_prologue_fusionr  .  r  rm   c                <    [        X5      =(       d    [        X5      $ ry   )r  r  r   s     rj   is_template_fusionr  2  s    e+O/A%/OOrm   c                *    [        X5      (       a  U$ U $ ry   )r  r   s     rj   template_fusion_pw_noder  6  s    &u445?%?rm   c                  &  ^  \ rS rSrSrS^S jrS^U 4S jjrS_S jr\S`S j5       r	\	R                  SaS j5       r	SbS jrScS	 jrSdS
 jrSbS jrSbS jrSbS jrSbS jrSeS jr    SfS jrSgS jrShS jrSbS jrSbS jrSfS jrSbS jr    SiS jr Sj       SkS jjr      SlS jr    SmS jrSbS jr          SnS jrSoS jr  Sj     SpS jjr!      SqS jr"SrS  jr#        SsS! jr$        StS" jr%      SuS# jr&          SvS$ jr'    SwS% jr(    SxS& jr)      SyS' jr*SjSzS( jjr+S{S) jr,      S|S* jr-      S}S+ jr.      S}S, jr/        S~S- jr0      S}S. jr1        SS/ jr2      SS0 jr3      SS1 jr4SS2 jr5        SS3 jr6      SS4 jr7  S         SS5 jjr8      S}S6 jr9        SS7 jr:SS8 jr;SSS9 jjr<   S           SS: jjr=    SS; jr>    SS< jr?SbS= jr@SbS> jrASbS? jrBSS@ jrCSSA jrDSSB jrESSC jrF      SSD jrGSSE jrH\ISSF j5       rJ    SSG jrK  SSH jrL    SSI jrM      SSJ jrN      SSK jrO    SSL jrP    SfSM jrQ    SfSN jrR    SfSO jrS  SSP jrT      SSQ jrUSSR jrVSbSS jrW      SST jrX      SSU jrY      SSV jrZSbSW jr[S{SX jr\    SSY jr]SSZ jr^SS[ jr_SbS\ jr`S]raU =rb$ )r'  i:  z
A Scheduler is a graph of BaseSchedulerNodes. It is responsible for
optimizations such as fusion, reorder, and graph partition.
c                p    [        S5         U R                  U5        S S S 5        g ! , (       d  f       g = f)NzScheduler.__init__)r   _initri   r5  s     rj   r  Scheduler.__init__@  s#    ./JJu 0//s   '
5c           
       >^  [         TT ]  5         T [        R                  l        0 T l        [        [        5      T l        [        R                  " 5       T l        [        5       T l        [        / [        R                  R                  R                  5       Q[        R                  R                   R                  5       Q[        R                  R"                  R                  5       Q5      T l        U Vs/ s H  nT R'                  U5      PM     snT l        S T l        S T l        T R/                  5         T R$                  R1                  [        R                  R                   R                  5       5        T R(                   H  nUR3                  5         M     S T l        T R7                  5       T l        T R(                   Vs0 s H  o"R;                  5       U_M     snT l        T R(                   VVs0 s H*  o3R?                  5         H  oDR;                  5       U_M     M,     snnT l         T R<                  RC                  5       T l"        0 T l#        0 T l$        [        5       T l%        [L        RN                  " T R(                  T R@                  T RD                  5      T l        T RQ                  5         T RS                  T R(                  5      T l        T RU                  5         T R(                   Vs0 s H  o"R;                  5       U_M     snT l"        T RW                  5         [X        =RZ                  []        T R(                  5      -  sl-        SSK/J0nJ1n  U" T R(                  5        []        T R(                  5      T l2        T Rg                  5         T RS                  T R(                  5      T l        [        [h        [j        [j        4      " 5       T l6        [n        Rp                  b%  [n        Rp                  " T R(                  5      T l        [n        Rr                  (       a'  SSK:J;n  URy                  T 5        T RW                  5         T R{                  T R(                  5      T l        [n        R|                  b%  [n        R|                  " T R(                  5      T l        T R                  5         T R                  5         [n        R                  (       d  [n        R                  (       aA  [        5       (       a2  [        R                  R                  R                  R                  5         [n        R                  (       a#  [        SSSS9   T R                  S S9  S S S 5        [n        R                  (       a  SS	KMJLn  U" T R(                  T R@                  T RD                  [        [        R                  R                  R                  5       5      [        [        R                  R                  5       5      5      T l        [n        R                  (       Gd;  [n        R                  (       Ga%  [n        R                  (       d#  SS
KMJQn	  U	" T R(                  T R@                  5        [        5       (       a  [        R                  (       a|  [n        R                  (       d  [        R                  (       aR  Sn
T R(                   H!  n[        UR                  5      (       d  M  Sn
  O   U
(       a  SSK&JYn  U" T R(                  5        [        R                  (       a  SSK[J\n  U" SS U 4S jS9  [L        R                  " T R(                  5      T l        T R                  5         [n        R                  (       a~  [n        R                  R                  (       a_  [n        R                  R                  (       a@  T R                  T R(                  5      T l        T R                  T R(                  5      T l        T R                  5         [        R                  Rn                  R                  R                  (       a  T R                  5         U" T R(                  5        [        R                  R                  T R(                  5        T R                  5         [        5       T ll        0 T lm        [        S5      R                  U 4S j5        [        5       T lp        g s  snf s  snf s  snnf s  snf ! , (       d  f       GN= f)Nr   )log_ir_post_fusionlog_ir_pre_fusionr   )distributed_autotunez#Scheduler.create_combo_kernel_nodesTlog_pt2_compile_eventlog_waitcounter)num_ck_nodes)reorder_for_peak_memory)1assign_memory_planning_info_for_scheduler_buffersF)6align_runtime_estimations_across_all_distributed_ranks)trace_structuredartifactc                     SSS.$ )N#scheduler_nodes_before_comm_overlapstring)r   encodingrw   rw   rm   rj   r  !Scheduler._init.<locals>.<lambda>  s     E$,)rm   c            
        > SR                  [        TR                  5       V Vs/ s H0  u  pSU  S3UR                  5       -   SUR	                  5        3-   PM2     snn 5      $ s  snn f )Nz

zsnode[r;  z buffer_names:)r  r4  r5  rE  r=  )r1  rD  ri   s     rj   r  r    so    v{{
 )2$**(=	 )> %QCqMkkm, .q/A/A/C.DEF )>	(s   7A$
)metadata_fn
payload_fngraph_statsc                 ^   > T R                   T R                  [        T R                  5      S.$ )N)graph_idnum_nodes_before_fusionnum_nodes_after_fusion)post_grad_graph_idnum_orig_nodesr   r5  rh   s   rj   r  r    s%     33+/+>+>*-djj/rm   )qr  r  rW   r   r(  backendsr3  _post_grad_graph_counterr
  r  count_graph_partition_counterr   r  r  r  	constantstorchbind_constantsr  create_scheduler_noder5  previous_nodecurrent_nodeupdate_zero_dim_cpu_tensorr  r  default_device_contextget_donated_buffersrW  r0  r  r  rX  copyr)  r  r  seen_template_fusionsr%   decide_global_ordering_of_commsrZ   topological_sort_scheduledead_node_eliminationcompute_ancestorsr*   ir_nodes_pre_fusionr   torch._inductor.debugr  r  r  create_foreach_nodesr   r  logged_slow_fusionr&   _pre_fusion_custom_passdistributed_max_autotune_gemmr  r  schedulerw  _post_fusion_custom_passr  finalize_multi_template_buffersmax_autotune_gemmmax_autotuner   r  r  select_algorithmPrecompileThreadPoolshutdown_instancecombo_kernelsr   create_combo_kernel_nodesr  memoryget_output_namesdeterministic reorder_for_compute_comm_overlapr  r  r'   6runtime_estimations_align_across_all_distributed_ranksrr  r
  rP   r   r  reorder_sink_verbose_loggingtorch._loggingr  $reorder_compute_and_comm_for_overlapprocess_grouped_nodesgraph_partitionr   r[   %reorder_for_reducing_graph_partitions&maybe_reorder_for_minimizing_partition,reorder_for_partition_with_simple_dependencycompute_last_usagetest_configstrack_memory_lifecycleinsert_memory_check_nodesr  graph_diagramdebug_draw_graphbuffer_names_to_freeorigin_to_indexr   add_rowremoved_ops)ri   r5  rD  r   r   r  r  r  r  r  has_collectivesr  r  r  s   `            rj   r  Scheduler._initD  sN    <>"&'?"@(1(9%5?\!&0%%**,""'') ,,113'
# >CCUd003UC
:>9='')##**177+<+<+A+A+CDJJDOO  ?C# $$& 	# &*ZZ;
%/JJL!OZ;

 -1JJ8
,6DBRBRBT3LLNCBTNJ8
 AE@Q@Q@V@V@X 35 13 L 	"
 ::JJ##

 	!!#33DJJ?
""$<@JJ"GJq::<?J"G  	##s4::6#O$**%!$**o!!#33DJJ?
",U38_"="?))577

CDJ//. ))$/""$__TZZ0
**688DDJ,,.$$(;(;&((OO,,AASSU5&* $
 ..D.A ))70

  ''177//44671773356DJ ###(O(O(O11UAJJ 0 0
 ;<< WW<<#PP #( JJD$TYY//*. ' # K4::V 88; !  CCDJJODJ""$ ""((CCDDTZZPDJJJ4::VDJ!??!!..EE**,4::&	djj) 6@\! :<'//	
 -7LM D;
8
H #HB s$   6cc61c$5c*c//
c>c                   0 n[         R                  R                   Hg  n[        [         R                  R                  U   [        R
                  5      (       d  M?  [        U [         R                  R                  U   S S9X'   Mi     U$ )N)r*  )rW   r   graph_inputs_originalr   r)   DonatedBufferr{  )ri   name_to_donated_bufr   s      rj   r  Scheduler.get_donated_buffers  sl     GG11D!''77=r?O?OPP,BGG11$7 $-#) 2 #"rm   c                6    [         R                  R                  $ ry   rW   r   current_devicerh   s    rj   rN  Scheduler.current_device&  s    ww%%%rm   c                .    U[         R                  l        g ry   rM  rZ  s     rj   rN  rO  *  s    !'rm   c                |    [         R                  R                  SS5      S:X  a  SSKJn  U" U R
                  SS9  gg)z,Generate an image of the graph for debuggingINDUCTOR_WRITE_SCHEDULER_GRAPHN1r   )draw_buffersT)print_graph)osenvironr  r  rT  r5  )ri   rT  s     rj   r@  Scheduler.debug_draw_graph.  s1    ::>>:DASH+6 Irm   c                    [         R                  [        R                  5      (       a:  [         R	                  SU5        U R
                   H  nUR                  5         M     g g )Nz%s:)r  isEnabledForloggingINFOr  r5  r  )ri   labelr   s      rj   debug_print_nodesScheduler.debug_print_nodes5  sD    GLL))HHUE"

  " # *rm   c                P   UR                  5       c   S5       eUR                  5       (       a  [        X5      $ [        U[        R
                  [        R                  45      (       a  [        X5      $ [        U[        R                  5      (       a  [        X5      $ [        U5      e)Nz2All nodes passed to scheduling must have an origin)r  is_no_opr  r   r)   r   r  r   re  r  r  r  s     rj   r  Scheduler.create_scheduler_node;  s    !- 	
@	
- ==??)$55r00"2C2CDEE ,,boo..,T88%d++rm   c                   [        5       n/ nU R                  R                  5       n[        R                  R
                  R                  5        H  nU Vs/ s H0  nXS;   d  M
  [        U R                  U   [        5      (       a  M.  UPM2     nnU(       d  MI  UR                  U5        U Vs/ s H  oPR                  U   PM     nn[        R                  S:  n[        U USUS9nUR                  U5        U H  nXR                  U'   M     M     U R                   V	s/ s H  oR!                  5       U;  d  M  U	PM     sn	[#        U5      -   U l        g s  snf s  snf s  sn	f )Nr   Fr$  r'  )r   r)  r  rW   r   listsr   r   r  r  r  r&   combo_kernels_autotuner	  r   r5  r0  r  )
ri   removed_node_namesfe_nodeskept_node_namesnamesr   r   r'  fe_noder   s
             rj   r   Scheduler.create_foreach_nodesH  sK   .8l11668WW]]))+E "!D*  #4#4#4T#:<RS !   %%e,:?@%$''-%F@$;;a?O0*/ /	G OOG$07''- 1 ,8 "ZZ
'T==?BT+TDZ
N
5 A
s$   	E# EE-E E ;E c                X  ^ ^'^(^)  " U'4S jS[         [           5      m'[        R                  " T'5      m(T R                   H  nUR                  5        H  nUR                  5       n[        UR                  R                  [        R                  5      (       a  [        UR                  5       5      S:  a  Me  UR                  5        HW  nUT(;   a6  UT(;   a0  T(U   nT(U   nXV-   nT( H  nT(U   UL d
  T(U   UL d  M  UT(U'   M     M?  UT(;   a
  T(U   T(U'   MO  T(U   T(U'   MY     M     M     SU)U 4S jjm)  S         SU(U)4S jjjn	0 n
[        R                  R                   R#                  5        H  n[        U[$        R&                  5      (       a  UR(                   H  nSX'   M	     M;  [        U[        R*                  5      (       d  M\  UR-                  5        Vs/ s H&  n[        U[$        R&                  5      (       d  M$  UPM(     nnU H  nUR(                   H  nSX'   M	     M     M     SnT R                   Hz  nUR                  c   e[/        UR                  R1                  5       S S	9nU H?  n[        U[$        R2                  5      (       d   eS
nX;  d  M-  UR                  5       X'   MA     M|     T R                   GHd  n[4        R7                  SUR                  5        U(       a  UR                  c   e[/        UR                  R9                  S
S9S S	9nU Hi  nX;   d   U SU
 35       eX   =nc  M  T R:                  U   R                  5        H+  nUR=                  [?        UR                  5       5      5        M-     Mk     [        UR@                  RB                  5      S:X  aQ  [E        [G        UR@                  RB                  5      5      =n(       a"  [        U[H        5      (       a  URJ                  nOSnUR                  5        GHB  n[        URM                  5       5      S::  d   eURM                  5        GH
  nT)" U5      nU	" UU5        UR=                  [?        UUS95        T(U   RN                   H  nUR                  5       UR                  5       :X  a  M'  [        UR                  [P        5      (       d   eUR                  R                  5        Hc  nUR                  5       nT)" U5      nUUR                  5       ;   nUR=                  [S        UUR                  5       U(       + S95        U	" UUS
S9  Me     M     GM     GME     [        R                  RT                  UR                  5           H3  nU	" UUS
S9  UR=                  [S        UUR                  5       S
S95        M5     [        R                  RV                  UR                  5           H%  nU	" UUSS9  UR=                  [?        U5      5        M'     UR@                  RX                   H<  n[        U[R        5      (       a  M  U	" URZ                  XR]                  U5      5        M>     UR_                  T R`                  5        UR                  5        H  nURM                  5        Hz  nUR                  5       T R`                  T)" U5      '   UR                  5       T R`                  U'   T Rb                  Re                  UU5      T Rb                  UR                  5       '   M|     M     GMg     [        R                  Rg                  5        H4  n[4        R7                  SU5        U	" U[i        [?        U5      5      5        M6     U(       a  [        R                  Rj                   H  nUR9                  S
S9 H  nX;   d   U SU
Rm                  5        35       eX   =n(       d  M/  T R:                  U   Ro                  5        H5  n[4        R7                  SUU5        U	" U[i        [?        U5      5      5        M7     M     M     T R`                   H  nU[        R                  R                   ;   aF  U	" U[i        [?        U5      5      5        [        R                  Rp                  Rs                  U5        Mg  U[        R                  Rt                  ;   d  M  U	" U[i        [?        U5      5      5        M     [w        [        R                  R                   Rm                  5       5       V Vs0 s H	  u  n nUU _M     n!n n[        R                  Rp                   Vs/ s H  nU!U   PM
     sn[        R                  l<        T R                   HF  nUR                  5        H/  nUR{                  T(UR                  5          RN                  5        M1     MH     T R|                   H.  nT R|                  U   R{                  T(U   RN                  5        M0     [        5       n"U"R                  S5        T(RO                  5        Ha  u  nn#U"R                  5          U#RN                   V$s/ s H  n$U$R                  5       PM     n%n$U"R                  SU SU% S35        SSS5        Mc     U"R                  S5        U"R                  5       R                  5       n&[        R7                  S5        [        R7                  SU&5        gs  snf s  snn f s  snf s  sn$f ! , (       d  f       M  = f)zQ
Create dependency edges between nodes, handling aliasing and
mutation properly.
c                  P   > \ rS rSrSr  S     S	S jjrS
S jrSU 4S jjrSrg)1Scheduler.compute_dependencies.<locals>.DedupListir  a  
This data structure behaves like a list except it makes sure the
elements remain unique.
Normally one could use a OrderedSet/dict for this purpose however
the list in question gets elements appended as it is being
iterated over which means that we need to keep the list
semantics.
Nc                T    U=(       d    / U l         U=(       d
    [        5       U l        g ry   )r/  r   
membership)ri   r/  rq  s      rj   r  :Scheduler.compute_dependencies.<locals>.DedupList.__init__|  s    
 #[b
","<
rm   c                    XR                   ;   a  g U R                  R                  U5        U R                   R                  U5        g ry   )rq  r/  r   r  )ri   	node_users     rj   r   8Scheduler.compute_dependencies.<locals>.DedupList.append  s3    /

!!),##I.rm   c                   > [         R                  " U R                  UR                  5      nU R                  UR                   Vs/ s H  o3U R                  ;  d  M  UPM     sn-   nT" XB5      $ s  snf ry   )r   rd  rq  r/  )ri   r  new_membershipr  	new_items	DedupLists        rj   __add__9Scheduler.compute_dependencies.<locals>.DedupList.__add__  sc    !+!1!1$//5CSCS!T JJ${{**!t.FA{* 	 !;;*s   A0A0)r/  rq  rV  )r/  zOptional[list[_T]]rq  zOptional[OrderedSet[_T]]r   rs  )rt  r^   r   rs  )r  DedupList[_T]r   r|  )	r{   r|   r}   r~   r#  r  r   rz  r   )ry  s   rj   ry  ro  r  s@     -17;=)= 5= 	=/< <rm   ry  r   c                R   > U TR                   ;   a  T" TR                   U    5      $ U $ ry   )r  )rD  r  ri   s    rj   r  .Scheduler.compute_dependencies.<locals>.rename  s,    D)))d33A677Hrm   Fc                N   > TT" U 5         R                  [        XU5      5        g ry   )r   r  )used_by_namer  ru  r  name_to_usersr  s       rj   add_user0Scheduler.compute_dependencies.<locals>.add_user  s'     &./669rm   Nc                    U R                   $ ry   r  rQ  s    rj   r  0Scheduler.compute_dependencies.<locals>.<lambda>  s    AFFrm   r  Tzscheduling %s)unbacked_onlyc                    U R                   $ ry   r  rQ  s    rj   r  r    s    !&&rm   z not in )rL  )mutating_bufr
  )r  )r
  zscheduling output %sz+scheduling output %s for unbacked symint %sr  'z': r:  r  zBUFFER USER LIST
z===== AFTER SCHEDULING =====
%s)rD  r  r   r  )FF)
r  r  r  r  ru  rx   r  rx   r   rs  )Er	   r^   r  r   r5  r  r0  r   r   r=  r)   r=   r   r>  rW   r   r  r   r   r0  r    	TensorBoxr  r  get_unbacked_symbol_defsSymbolr  r  r  r  r  r3   r   r  r3  rF  r2   rL  r@  r/  r\   r4   additional_buffer_depsadditional_star_depsr   r   ru  r  r  r  r  r/  r`  graph_outputsr  r=  mutated_inputsr  r  r4  mutated_input_idxsrg  rW  rO   r  rA  rB  r  compute_dependencies_log)*ri   r   buf1	buf1_name	buf2_namelist1list2combinedr  r  unbacked_symbol_to_origin_nodevalfsr  sym_sizehas_non_input_unbacked_defsunbacked_symbol_defsunbacked_symbol_usesr  r   r   	node_modealt_namerD  out_buf
other_nameis_aliasadd_depr  rR  r   r   r   	inp_nameslogbufr  r4  r-  r  ry  r  r  s*   `                                      @@@rj   rZ   Scheduler.compute_dependenciesl  s	   	< 	<@ @K?V?V@
 JJD((* MMO	 tyy//??D,,./!3!%!1!1!3I M1i=6P -i 8 -i 8#(=#0C -c 2e ;#0#5#>5=c 2 $1 #m33@3Ki03@3Ki0 "4 + <	 	 !&!				 <		 			
 		 		 		 MO&
 77''..0C#uzz****B9=26 +C.. (+||~S~!Auzz9RA~S!Ann=A6: - " 1 ',#JJD99((( $*		224:J$  *!!U\\2222 /3+:8<25 *   JJDIIotyy1*yy,,,'-II222F(($
 .A> #X&D%EF> <>>K#'#4#4Q#7#C#C#EC --gclln.EF $F . D$$++,1 d&6&6&=&=!>??S?sI..HH	 	 '')3,,./1444 # 1 1 3H%h/HXt,%%ghY&GH -h 7 = ===?dmmo=$)$))5FGGGG'+yy'<'<'>G)0)9)9);J)/
);J (073F3F3H'HH -- '$.1408L!" %ZtD% (? !> !4 *B 7799$--/J$5 !!''4==?D"QR	 K 7777H$6!!''"23 I
 ((..!$00TYY.>.>t.DE / %%d&;&;< '') # 1 1 3H>AllnD))&*:;69llnD))(3//33HhG ++CLLN; !4 *a r 002HII,h7Xz'(*;<= 3
 'ww,,111EA> #X&D&I&I&K%LM> ;==q=(,(9(9!(<(M(M(OHII M ( !
 %Xz'(:K/LM )P F - ))Dqww+++z'$-89&&**40***z'$-89 * ,5QWW5I5I5N5N5P+Q
+QKE4D%K+Q 	 
 )*(>(>&
(>IdO(>&
"
 JJD'')mCLLN;AAB *  //D''-77d8K8Q8QR 0  !c'--/JC/4{{;{!{;#c%23 ! 0 	c  "))+ &&';< &&'I3OK TX
&
" < !s6   "#l	l2l
!ll*lll
l)	c           
     2  ^ ^ SSK JnJnJnJn  [        [        R                  R                  R                  5       5      nU" T R                  U5      n[        R                  R                  R                  (       d  U" T R                  T R                  5        [        [        R                  R!                  5       5      nU" T R                  UU5      u  n  n	[#        [%        T R                  5      5       V	s/ s H  n	/ / 4PM	     sn	mU H  n
U
R&                  S:X  a  U
R(                  S:X  a  M%  U
R*                  R-                  5       nTU
R.                     S   R1                  U5        TU
R2                     S   R1                  U5        M     SSKJn  U" 5               SU U4S jjn/ n[9        T R                  5       HE  u  nnUR1                  U5        UR1                  U" X[%        T R                  5      S-
  :H  S95        MG     UT l
        g s  sn	f )Nr   )r  compute_memory_timelineFreeableInputBufferget_freeable_input_bufr   )register_check_mem_opc                N  > TU    S   nTU    S   nX#U/n[         R                  " [        [        R                  " S5      S9[        R
                  R                  R                  R                  / US S9nSTR                  U    R                  5        3Ul        [        TU5      $ )Nr   r   rY  )r[  c                $    U US   US   US   S.4$ )Nr   r   r   )alivedeadis_final_steprw   )tensor_argsrY  s     rj   r  WScheduler.insert_memory_check_nodes.<locals>.construct_mem_check_node.<locals>.<lambda>  s(    !.q!1 -a 0)6q)9Crm   )r=  rU  r  nontensor_argsunflatten_args
mem_check_)r)   MemoryCheckKernelr=   r  r[  r`  _inductor_debugcheck_memory_stepdefaultr5  r0  r  r  )step_idxr  expected_newly_aliveexpected_newly_deadr  r   ri   step_allocs_deallocss         rj   construct_mem_check_nodeEScheduler.insert_memory_check_nodes.<locals>.construct_mem_check_node  s     $8#A!#D "6x"@"C2WN''!e)<=yy00BBJJ- D %/tzz(/C/L/L/N.O"PD,T488rm   )r  )r  r!  r  rx   r   r  )r.  r  r  r  r  r   rW   r   r  r  r5  r  r  r&   r  rX  r/  r  r   
size_alloc	size_freer  r0  
start_stepr   end_step#torch._inductor.runtime.debug_utilsr  r4  )ri   r  r  r  r  r  name_to_freeable_input_bufr  buf_info_listr  buf_inforR  r  r  	new_nodesr1  r   r  s   `                @rj   r>  #Scheduler.insert_memory_check_nodes{  s   	
 	
 )31773G3G3L3L3N(O"4::|< 	# %%===

D,, *4AGG4L4L4N)O5JJ&
q! $C

O4C
4RH4C
 &H""a'H,>,>!,C//1H !4!45a8??I !2!23A6==hG & 	N	9	9*.	9&	9 	92 	 ,GAtT"(DJJRS@S;SU - 
eC
s   8Hc                  ^	 [         R                  (       d  g/ n[        U R                  5       GH  nSS jm	SnUR	                  5        H  n[        U	4S jUR                   5       5      nU(       a]  [        R                  SUR                  5       5        [        R                  R                  R                  UR                  5       5        M  SnM     UR                  5       (       + =(       a    U(       + nU(       d  UR                  U5        M  [        R                  SUR                  5       5        [        R                  R                   R                  UR                  5       5        UR"                  R$                   H  nUR&                  U R(                  ;   d  M  U R(                  UR&                     R                  nU Vs/ s H2  oR*                  R                  5       UR                  5       :w  d  M0  UPM4     snU R(                  UR&                     l        M     GM     [-        [        U5      5      U l        U R                   H  nUR/                  5         M     gs  snf )	z 
Remove any nodes without users
Nc                ~    U R                   =(       d+    U R                  5       [        R                  R                  ;   $ ry   )r  r0  rW   r   r  )rD  s    rj   can_eliminate_user;Scheduler.dead_node_elimination.<locals>.can_eliminate_user  s&    ||Tt}}!'':T:T'TTrm   Fc              3  4   >#    U  H  nT" U5      v   M     g 7fry   rw   )r   ur  s     rj   r   2Scheduler.dead_node_elimination.<locals>.<genexpr>  s     #M9a$6q$9$99   zremoved dead buffer: %sTzremoved dead operation: %s)rD  r  r   rx   )r&   use_dcer   r5  r  r   r-  r  r  r0  rW   r   r  r  rx  r   r  r   r   r   rX  r   r  r%  )
ri   updated_nodesr   active_buffersr   can_eliminater  r-  r  r  s
            @rj   r  Scheduler.dead_node_elimination  s    ~~
 TZZ(DU #N'') ##M399#M M II7HGG++//?%)N * !% 5 5 77N<NM $$T* 		6H**..t}}? ,,22DyyD$4$44 $ 0 0 ; A A',=',!0AT]]_0TAu=((39 3- )8 (=12
 JJD  " =s   
/I(=I(c                
    USL$ )z:Check if store mode requires cross-thread synchronization.Nrw   )ri   rL  s     rj   mode_requires_synchronization'Scheduler.mode_requires_synchronization  s    4rm   c                   ^^^^ [         [           " 5       m[        5       m/ mSUUUU4S jjmU H  nUR                  5        H  nUTU'   M
     M!     U H  nT" U5        M     T$ )z/
Ensure nodes is in topologically sorted order
c                   > U T;  af  TR                  U 5        [        U R                  S S9 H*  nUR                  T;  a  M  T" TUR                     5        M,     TR	                  U 5        g g )Nc                    U R                   $ ry   r  )ds    rj   r  DScheduler.topological_sort_schedule.<locals>.visit.<locals>.<lambda>  s    affrm   r  )r  r  r  r   r   )rD  r   r  rC  seenvisits     rj   r  2Scheduler.topological_sort_schedule.<locals>.visit  sa    }!!"6"6<LMCxx|3 ,sxx01	 N
 a  rm   )rD  r\   r   rs  )r   r\   r(  r=  )ri   r5  r   r   r  rC  r  r  s       @@@@rj   r  #Scheduler.topological_sort_schedule  sj     +,.59V*,	! 	! D--/%)T" 0  D$K rm   c                N  ^  [        5       n[        U[        [        [        [
        [        45      (       a/  UR                   H  nUR                  UR                  5        M      O[        S[        U5       S35      eU 4S jU 5       n[        [        U 4S jU 5       5      5      $ )Nz+get_unmet_dep_nodes is not implemented for .c              3  ^   >#    U  H"  nTR                   U   R                  5       v   M$     g 7fry   )rX  r2  r  s     rj   r   1Scheduler._get_unmet_dep_nodes.<locals>.<genexpr>&  s(     XZc))#.??AAZr  c              3  B   >#    U  H  nTR                   U   v   M     g 7fry   r)  )r   rD  ri   s     rj   r   r  '  s     Q=at66q9=s   )r   r   r   r  r  r   rm  r  r  r   RuntimeErrorr   r  )ri   r\  
unmet_depsr   unmet_dep_opss   `    rj   _get_unmet_dep_nodesScheduler._get_unmet_dep_nodes  s    &0l
)&"$	
 	
 //sxx( 0 =d5k]!L  YZXJQ=QQRRrm   c                   / n[         R                  U R                  S5      n0 nU R                   HQ  nU R                  U5      n[	        U5      X$'   U H*  nUR                  U/ 5      nUR                  U5        XsU'   M,     MS     UR                  5        VV	s/ s H  u  pU	S:X  d  M  UPM     n
nn	U
(       a  UR                  U
5        U
 H9  nUR                  U/ 5       H  nX+==   S-  ss'   M     UR                  U5        M;     UR                  5        VV	s/ s H  u  pU	S:X  d  M  UPM     n
nn	U
(       a  M  U(       a   S5       eU$ s  sn	nf s  sn	nf )zE
Sort nodes by their topological order, return a list of node lists.
r   r   zTopological sort failed!)	r(  fromkeysr5  r  r   r  r   r/  r  )ri   r  r5  childrenr   r  r   crD  r4  zero_deg_nodesrD  s               rj   rC  !Scheduler._topological_sort_nodes)  s,    djj!,#%JJD,,T2Dd)EKLLb) !   ).@a!@LL(#$LLB/DK1$K 0		! $ -2KKMDMDAQ!VaMND n 444y A Es   E)EE,Ec                j   0 nU R                    Hw  n[        5       nUR                   HB  nU R                  UR                     R                  5       nUR                  U5        X1U   -  nMD     X1UR                  5       '   X2l        My     [        U R                   5       H  u  pbXbl
        Xbl        M     g)z
Populate each node.ancestors
N)r5  r   r  rX  r   r2  r  r0  r   r4  r  r  )ri   name_to_ancestorsr   r   r   dep_node_namer  s          rj   r  Scheduler.compute_ancestorsC  s    
 9;JJD)3I.. $ 0 0 : K K Mm,}==	 / 2;dmmo.&N  %TZZ0KE"N"N 1rm   c                   [         R                  (       d  g U R                   H  n[        U[        [
        45      (       a)  UR                  5       (       d  [         R                  S:w  a  MI  UR                  5        H?  n[        U[        5      (       a  UR                  5       (       a  M/  UR                  5         MA     M     g )Nhalide)r&   r  r5  r   r   r   rR   cpu_backendr   rj  r  )ri   r   r\  s      rj   r  Scheduler.merge_loopsV  s    00JJD d]4F$GHHKKMMf&8&8H&D)!%775;L;L;N;N!!# * rm   c                   [        SSSS9   [        S5       H  n[        U5      n[        R	                  SUS-   U5        U R                  USS9n[        U5      n[        R	                  S	US-   UU5        XC:X  d  US:X  d  Mk  [        R	                  S
US-   5          O   [        R                  (       d  [        R                  (       a  U R                  USS9nUsSSS5        $ ! , (       d  f       g= f)z2
Combine eligible nodes into FusedSchedulerNodes.
zScheduler.fused_nodesTr  rn  z/===== attempting fusion (%d/10): %d nodes =====r   F)is_reorder_roundz=completed fusion round (%d/10): fused %d nodes into %d nodes
z+===== fusion complete (%d iterations) =====N)	r   r  r   r  r  fuse_nodes_oncer&   r  loop_index_inversion_in_fusion)ri   r5  r1  old_lennew_lens        rj   rw  Scheduler.fuse_nodesq  s     #4QU
 2Ye*  EE
 ,,UU,Ke*  TE	 %A$$Eq1u ' , 1188,,UT,J;
 
 
s   A3C%AC%%
C3c                    / nU R                    H:  nUR                  [        U[        5      (       a  UR	                  5       OU/5        M<     Xl         g)z1
Unpack GroupedSchedulerNode into regular nodes.
N)r5  r  r   rm  rx  )ri   r  r   s      rj   r6  Scheduler.process_grouped_nodes  sF     .0	JJD!+D2F!G!GdV  
rm   c                    [        U5      S:  d   eUS   R                  5       nX l        U R                  U5      n[	        SSSS9   UR                  U5      sSSS5        $ ! , (       d  f       g= f)k
Benchmark fused list of nodes and return the execution time
in milliseconds on randomly generated inputs.
r   benchmark_fused_nodesTcompile_time_autotune_time_us)r  dynamo_compile_column_usN)r   r   rN  r  r   r	  )ri   r5  r[  r  s       rj   r	  Scheduler.benchmark_fused_nodes  sm     5zA~~q$$&$""6*#"&%D

 007
 
 
s   A""
A0c                    [        U5      S:  d   eUS   R                  5       nX@l        U R                  U5      n[	        S5         UR                  XUS9sSSS5        $ ! , (       d  f       g= f)r  r   generate_kernel_code_from_nodeshint_overrideN)r   r   rN  r  r   r  )ri   r5  benchmark_kernelr  r[  r  s         rj   r  )Scheduler.generate_kernel_code_from_nodes  si     5zA~~q$$&$""6*;<::} ;  =<<s   A!!
A/c                    X l         U R                  U5      n[        S5         UR                  U5      sSSS5        $ ! , (       d  f       g= f)r  benchmark_codegened_moduleN)rN  r  r   r  )ri   moduler[  r  s       rj   r  $Scheduler.benchmark_codegened_module  s=     %""6*6755f= 877s	   >
Ac                   [         R                  R                  nU(       d  g[        R	                  SX5        UR
                   H  nUR                  5       n[        USS5      (       a  XB;  a  M,  UR                  nX$   n[        U[        R                  5      (       a'  UR                  UR                  5        UR                  n[        U[        R                  5      (       d  M  Xe:w  d  M  [        R                  SUUU5          g   g)z
Check if selecting a Triton template would cause layout conflicts.
Returns True if there's a conflict and we should fall back to ATen.
FzNode %s has constraints %sr=  NzOLayout conflict detected for %s: template expects %s but layout is frozen to %sT)rW   r   buffer_layout_constraintsr  r  rW  r0  r  r=  r   r)   FlexibleLayout freeze_layout_with_exact_stridesr  FixedLayoutr  )ri   
multi_nodeconstraintsinpinp_namer=  expected_layouts          rj   !_has_layout_conflict_for_template+Scheduler._has_layout_conflict_for_template  s     gg77		.
H$$C||~H3$//83NZZF)3O&""3"344 44_5K5KL&"..11o6Oe#	 - %0 rm   c           
     0   [        U R                  5       GHe  u  p[        U[        5      (       d  M  [        UR                  [
        R                  5      (       d  MH  UR                  n[        R                  R                  (       d  UR                  5       u  pEO [        S UR                  5        5       5      n[        U[        R                  R
                  R                  5      (       a  U R!                  U5      (       a  UR                  5        H:  n[        U[        R                  R"                  R$                  5      (       d  M8  Un  O   [        W[        R                  R"                  R$                  5      (       d   S5       e[        U[        R                  R
                  R                  5      (       a  [        R&                  (       a  0 nXGS'   [        R&                   Hm  nUR                  US9n	U	R)                  5        V
Vs0 s H  u  p[        U
[        5      (       d  M  X_M      nn
n[+        UR)                  5       S S9S   nXgU'   Mo     UR                  R-                  U5        OUR                  R/                  U5        GM  [
        R0                  R3                  UR4                  5         UR7                  5       nSSS5        WR8                  n[        U[
        R:                  5      (       d   eUR8                  n[        U[
        R<                  5      (       d   eUR>                  (       a  [A        XR>                  5        URB                  Ul!        U RE                  XX5        GMh     gs  snn
f ! , (       d  f       N= f)aP  
Finalize a backing choice for MultiTemplateBuffers which did not already have a
choice finalized through fusion. In the case of an extern choice, this will result
in replacing the SchedulerNode.

If a MultiTemplateBuffer did not have any fusion opportunities, finalizing a choice
will force completion of compilation and benchmarking.
c              3     #    U  H<  n[        U[        R                  R                  R                  5      (       d  M8  Uv   M>     g 7fry   )r   r  r  r)  ExternKernelCaller)r   timings     rj   r   <Scheduler.finalize_multi_template_buffers.<locals>.<genexpr>	  s7      *E) & % @ @ S S  #F*Es
   7A	AzZNo extern kernel detected to fallback to when layout constraints fail for Triton templatesNr  c                    U S   $ r  rw   rQ  s    rj   r  ;Scheduler.finalize_multi_template_buffers.<locals>.<lambda>9  s	    qQRtrm   r  r   )#r4  r5  r   r   r   r)   MultiTemplateBufferr&   r<  %force_extern_kernel_in_multi_templateget_min_choicer3  choice_timingsr  r  r   r!  r)  r%  multi_kernel_hintsr/  r  finalize_as_triton_callersfinalize_as_triton_callerr	  current_originsr  output_noder   
StorageBoxOperationBufferorigin_noder8   r=  _replace_node)ri   r1  r   r  min_node_unfusedr  choicecallershinttimingsr  r4  triton_timingsout_tensorboxout_storage
out_buffers                   rj   r&  )Scheduler.finalize_multi_template_buffers  s    !,GA$..:		2114 4 "YY
**PP*4*C*C*E'$a'+*4*C*C*E	($ $OO&&?? 
 ==jII&0&?&?&AF) & % @ @ S S    4: 0 % 'B  *"EOO$D$D$W$W     y   $OO&&?? 
 00QS(8 %+$=$=D&0&?&?d&?&SG -4MMO.,;DA#-a1I#J !%,; + .
 &))=)=)?^%TUV%WF,2DM %> 		<<WE		;;<LMYY..z/A/AB$4$@$@$BM C+00!+r}}====(--
!*b.@.@AAAA))&}6L6LM$.$5$5
!"":1CY -h. CBs   N
/N
?N
N	c                  ^ [        X!5        U R                  U5      nXPR                  U'   XPR                  UR	                  5       '   XPR
                  UR	                  5       '   0 m[        R                  " UR                  R                  UR                  5       HA  nU R                  R                  UR                  S 5      =n(       d  M2  UR                  TU'   MC     SU4S jjnU" UR                  5      Ul
        U" UR                  R                  5      UR                  l	        [        UR                  5       UR                  5       5       H2  u  pXR                   U
R	                  5       '   U
R"                  U	l        M4     UR$                  Ul        UR&                  Ul        UR(                  Ul        UR*                  Ul        g )Nc                .   > [        U4S jU  5       5      $ )Nc              3  D   >#    U  H  oR                  T5      v   M     g 7fry   )r  )r   r   r  s     rj   r   ?Scheduler._replace_node.<locals>.rename_deps.<locals>.<genexpr>c  s     Kdsjj)9::ds    r   )r  r  s    rj   rename_deps,Scheduler._replace_node.<locals>.rename_depsb  s    KdKKKrm   )r  r  r   r  )r  r  r5  r  r0  r)  r  r  r   r   r  r  r  r   r)  r  rX  r-  r  r  r   r~  )ri   r?  r  r1  r   new_scheduler_noder   	real_namerE  new_outold_outr  s              @rj   r6  Scheduler._replace_nodeN  s{    	"*9!77
C*

1-?$--/*3E0 ??4#3#3#9#94;R;RSC 3377$GGyG.1hh + T	L 1<111
- 0;**000
&&, !$**,d.>.>.@!
G 4;W--/0#MMGM	!
 (,~~$'+~~$'+~~$(,%rm   c                &    [        S U 5       5      $ )Nc              3    #    U  H  n[        UR                  S 5      =(       a_    UR                  SL=(       aJ    [        UR                  R                  S5      =(       a#    UR                  R                  R                  S:H  v   M     g7f)r   Nscatter_moderM  )rT  r   r   rN  rC  s     rj   r   ,Scheduler._any_atomic_add.<locals>.<genexpr>x  sp      

 	 AFFF# 9d"9^49 ((L89 s   B	B)r   )ri   	node_lists     rj   _any_atomic_addScheduler._any_atomic_addw  s     

 
 
 	
rm   c                (   U R                  USUS9n[        R                  " U5      n[        R                  R
                  R                  5       nUR                  5       (       d  S nXd4$ UR                  SUS9n[        U[        5      (       d   eXd4$ )NT)r  r  triton_)kernel_namesource_code)r  r   loadr  r  async_compileAsyncCompileuse_process_poolr   r   r   )ri   r5  r  src_codemodrX  futs          rj   compile_kernelScheduler.compile_kernel  s     77D 8 
 x(55BBD--//C
 z  &&9(&SCc<0000zrm   c                  ^ ^^^^^^^^^^ ^!^"^#^$^%^&^'^(^)^* [        S TT4 5       5      n[        R                  (       d  U(       d  [        R	                  S5      $ TR                  5       (       a-  [        TR                  5       [        R                  5      (       a*  TR                  5       (       d  TR                  5       (       a  [        R	                  S5      $ TR                  5       nUS   R                  5       mT(       d   eTR                  S:X  a)  [        R                  S:w  a  [        R	                  S5      $ TR                  5       n[        [         R"                  " XE5      5      nT R%                  U5      (       a  [        R	                  S5      $ SSKJm  [+        TT5      m*US   R                  5       mTc   eSUU4S jjm$U(       Gau  [        S	 TT4 5       5      (       Ga[  TR                  5       SLmT(       a  TR                  5       OTR                  5       m)[        T)[        R,                  5      (       d   eT R/                  T)5      (       a  [        R	                  S
5      $ 0 m#/ m![        R0                   GHI  nT)R3                  U5      m[5        TR7                  5       S S9 H  u  p[        U[8        R:                  R<                  R>                  5      (       d  M:  T)RA                  U5         T!RC                  U/T RE                  XhRF                  S9Q75        SSS5        M     [I        S5      n
Sn0 nT! HW  u  pn Ub  URK                  5         T)RA                  U5         T R[                  UT5      u  nnUX'   UU
:  a  Un
UnSSS5        MY     UT)R\                  U'   [        U[^        5      (       d   eUT#U'   GML     [        R`                  m[c        S T)Rd                   5       5      n[g        5       =(       a!    T(       + =(       a    U[        Rh                  :*  m"[I        S5      [I        S5      sm&m'Sm%T"(       dP  T)R3                  5       mT)Rk                  5       u  m%m&[5        TR7                  5       [l        Rn                  " S5      S9nOT)Rd                   Vs/ s H  nUS4PM	     nnT(       a-  T(       a  T Rq                  U5      OT Rq                  U5      u  m'nO9T(       d  [        R	                  S
5      $ TRs                  5       m'[u        TTT'5      m(/ m!SnU H  u  nn[        U[^        5      (       d  M  T(       d-  [w        US5      (       a  URx                  T)Rx                  :w  a  MQ  T(       a  UT&T'-   :  a    O]US-  nU[        Rh                  :  a    OBT)RA                  U5         T!RC                  U/T RE                  U5      Q75        SSS5        M     [{        T!5      S:X  a  [        R	                  S
5      $ SUUUUU!U"U#U$U%U&U'U(U)U 4S jjn[        R}                  UT!S   S   5      $ T RE                  U5      mT RE                  U5      m T RE                  U5      mSUUUUU U$U U*4S jjn[        R}                  UTS   S9$ ! , (       d  f       GM  = f! [L         a]  n[N        RQ                  [R        RT                  5      (       a)  [N        RW                  ST(       d  SOS[Y        U5      5         SnAGM  SnAff = f! , (       d  f       GM  = fs  snf ! , (       d  f       GM  = f)o
If config.benchmark_fusion is False, always return True.
Otherwise, return True if fusion can brings speedup.
c              3     #    U  HD  nUR                  5       =(       a(    [        UR                  5       [        R                  5      v   MF     g 7fry   )rj  r   r&  r)   r*  rC  s     rj   r   .Scheduler.speedup_by_fusion.<locals>.<genexpr>  sD       
 $ MMO J1..0"2H2HIJ#s   AATr   rY  r   CompilationErrorNc           
     z  > [         R                  [        R                  5      (       a  XU-   :  aE  [         R	                  STR                  5       TR                  5       [        X-   U -  S 5      5        g [         R	                  STR                  5       TR                  5       [        XU-   -  S 5      5        g g )Nz9can fuse (benchmark): fusing %s with %s cause %sx speedup.3fz=cannot fuse (benchmark): fusing %s with %s cause %sx slowdown)r  rZ  r[  DEBUGr  r=  rB   rC   )ms_fusedms1ms2r   r   s      rj   
log_fusion/Scheduler.speedup_by_fusion.<locals>.log_fusion  s    &&w}}55Ci'$$S..0..0"syH&<S%AC	 $$W..0..0 Hc	$:3#?A	 6rm   c              3  D   #    U  H  oR                  5       S Lv   M     g 7fry   r*  rC  s     rj   r   rc    s      %
7E!!-~s    Fc                    U S   $ r  rw   rQ  s    rj   r  -Scheduler.speedup_by_fusion.<locals>.<lambda>  s	    aPQdrm   r  r  infException in compiling %s: %sr7  r9  c              3  B   #    U  H  n[        U[        5      v   M     g 7fry   )r   r   )r   r  s     rj   r   rc    s      %ASA
1677AS   r   allowed_prologue_inpsc            	       > [        S5      n S n0 nT(       aY  T(       a  [        T[        R                  5      (       d   eTR	                  5       mTR                  5       u  mm[        TU4S jS9mT H  u  p4n Ub  UR                  5       nO'T(       d  UR                  nUR                  5         OS n T(       a=  TR#                  U5         TR%                  UT5      u  pXU'   X:  a  Un UnS S S 5        M  TU:H  =(       d    TT-   TU   T-   :  n
U(       d  M  ['        UR(                  5      S:X  d  M  UR(                  S   R*                  S	::  d  M  U
(       d  M  Un  O   T(       a
  T" U TT5        T(       a	  U TT-   :  aP  UbM  [,        R.                  (       a  UTS '   TR1                  T5        OTR3                  U5        UTR4                  S '   g
g! [         a]  n[        R                  [        R                  5      (       a)  [        R                  ST(       d  SOS[!        U5      5         S nAGM  S nAff = f! , (       d  f       GM  = f)Nrq  c                   > TU S      $ r   rw   )r  r-  s    rj   r  KScheduler.speedup_by_fusion.<locals>.benchmark_when_ready.<locals>.<lambda>f  s    nQqT&:rm   r  rr  r7  r9  r   r   rA  TF)r  r   r)   r*  r-  r,  r  rC  rT  
precompiler  r  rZ  r[  rh  r  r  swap_as_triton_callerr  r   	launchersn_spillsr&   r.  r/  r0  _choice_timings)min_ms_fusedms_fused_choicenew_timingsr8  rf   	mod_fusedresr  ri  pathfusible_choicebench_epiloguer-  r[  r  future_choicesget_choice_timings_async hint_override_best_fusion_choicerl  
min_choicerj  rk  	ms2_fusedr  ri   s              rj   benchmark_when_ready9Scheduler.speedup_by_fusion.<locals>.benchmark_when_readyY  s,   $U|"& +%*ZAWAW*X*XXX%/%>%>%@N&0&?&?&AOJ%+&:&N
 2@-FI!!-"(--/C!/"+"3"3CNN,"&C &'==fE-1-L-L ) &.NH
 3;/'6/728 FE '&0 N"Sy>&+AI+MM '  C #CMM 2a 7 #a 0 9 9Q > ..4O!a 2@d "|S#6 ',#)*D%100AP8>"==<
 #<<_M 8CJ..t4 u % !%227==AA&,, ?2A
z #A
 !! FEs1   :G#G2G"I
H>!AH99H>
I	c                 (  >^^^^^^ SSK Jn    TS   TS   TS   4 H  nUc  M  UR                  5         M     TR                  TS   T
5      u  mm[        R
                  " T5      (       a	  T" S5        gTR                  TS   T
5      u  mm[        R
                  " T5      (       a	  T" S5        gTR                  TS   T
5      u  mm[        R
                  " T5      (       a	  T" S5        gT" TTT5        [        S5      (       a[  TTT-   :  aR  TT4TR                  ;  a@  TR                  R                  TT45        [        S5      R                  UUUUUU4S	 j5        TTT-   :  $ ! U  a     gT	 a  nS
[        U5      ;   a   S nAge S nAff = f)Nr   )NoTritonConfigsErrorr   z%register spilling of the first kernelFz&register spilling of the second kernelz%register spilling of the fused kernelslow_fusionc            	     $   > TT TTTTTT T-   -  S.$ )N)kernel1_pathkernel1_latencykernel2_pathkernel2_latencyfused_kernel_pathfused_kernel_latencyslow_down_ratiorw   )rj  rk  ri  path1path2
path_fuseds   rj   r  rx    s&    053605365?8@3;sSy3I%rm   Loop-carried variableT))torch._inductor.runtime.triton_heuristicsr  rC  r  mathisinfr   r!  r  r   rC  r  )r  r]  r  rj  rk  ri  r  r  r  re  r[  future_and_mod_l1future_and_mod_l1_fusedfuture_and_mod_l2rl  ri   r  s      @@@@@@rj   r  r    s   A *!,)!,/2 
 ?JJL  "&!@!@)!,"JC
 zz#CD$!%!@!@)!,"JC
 zz#DE$+/+J+J/2,(Hj
 zz(++CD$xc2 0>>$c	1"EN$2I2II//33UENC(7?? 
 $cCi//+ ! ' .#a&8#s<   E* AE* 5;E* 1;E* -A<E* *F2F7FFFrt   )ri  r  rj  r  rk  r  r   rs  rt  )?r   r&   benchmark_fusionra   rq   rj  r   r&  r)   TritonTemplateBufferrp  r   r   r   r  r  r  r  rQ  triton.compiler.errorsre  r{  r*  r!  r.  r-  r  r/  r  r  r)  TritonTemplateCallerrz  r   r^  r  r  rC  r  r  rZ  r[  rh  r  r  r  r}  r   benchmark_epilogue_fusionr   choicesr    max_epilogue_benchmarked_choicesr,  operator
itemgetterr	  r  r  rT  ru  r   ru   )+ri   r   r   is_multi_templatenode_list_1node_list_2node_list_fusedr  r8  r  r~  r  r  rf   r  r  ri  r  num_triton_callerschoice_timings_iterr  r  triton_choicesunfused_timer  re  r  r-  r[  r  r  r  r  r  r  r  rl  r  rj  rk  r  r  r  s+   ```                      @@@@@@@@@@@@@@@@@@rj   speedup_by_fusionScheduler.speedup_by_fusion  s       
 U^ 
 

 &&/@$$T** u668":Q:QRR!!!!  $$T**oo'Q**,v ;;%F$6$6($B$$T**oo'y{HI
 00$$T**;u% #..0!!!	 	"  %
8=u~%
 "
 "
 $557tCO # ''),,. 
 j"*@*@AAAA55jAA#((//  - TVN!'!:!:!+!:!:=!I!'(<(<(>N!SIF% @ @ U U  !#99&A&-- &!%!4!4$3CWCW "5 "" BA "T  %U|FJ 1?-FI
!!-"MMO $99&A)-)H)H%v*$ /7+#l2+3L.4O BA 2@( =H
**=9!/3KLLLLBQ0?U ";X $==N!$ %AKASAS% "
 )* R&&R&&*Q*QQ % U|U5\HC15J+!+!:!:!<",";";"=
C&,"((*0C0CA0F'# 8B7I7I&J7I!1v7I#&J ' ..{;33K@ U '',,U33224<UE3O	 TVNN(;$!&*BCC ((?@@44
8X8XX!lcCi&?!#!F$K$KK55f="))G$"5"5o"FG >=/ )<8 >"a'#((//V! V! V!p  --$nQ&7&:  !% 3 3K @ $ 3 3K @&*&9&9/&J#F FP  --09PQR9S .  q BA" % !%227==AA&,, ?2A
z #A
 !! BAF 'KT >=sC   -Z$Z77$\!3\4%\9$
Z47
\A\\!
\19
]		c                <    U R                   UR                  5          $ )z0Look up the node in Scheduler name_to_fused_node)r)  r1  r  s     rj   r  Scheduler.get_fused_node  s    &&t':':'<==rm   c                   [         R                  SUR                  5       UR                  5       5        UR                  5       nUR                  5       U:X  d   eU R	                  U5      R                  X5      nUR                  U5        UR                  U5        UR                  U5        U R                  R                  UR                  5        Vs0 s H  ofR                  5       U_M     sn5        U$ s  snf )Nzfusing %s with %s)r  r  r0  r   r  rq   r  r  r)  r  r   )ri   r   r   r(  r[  node3rD  s          rj   fuse_two_nodesScheduler.fuse_two_nodes  s     	,enn.>@PQ!!#!V+++  (--e;5!5!&&U__EV'WEV

e(;EV'WX (Xs   C7c                    U R                  X5      (       a5  U R                  X5      (       d  U" 5       (       a  U R                  XU5        ggNTF)r   will_fusion_create_cycler  )ri   r   r   
speedup_fnr(  s        rj   fuse_if_speedupScheduler.fuse_if_speedup  sA     MM%''11%??k:rm   c                   U(       Ga  / n0 n[        5       nU GH(  nXa;   a  [        X   5      S:  d   eX   R                  S5      n[        X   5      S:X  a  UR                  U5        UR	                  5       u  pX:X  a  [        X5      (       d   eUn
OX:X  d   e[        X5      (       d   eU	n
U R                  U
5      U
La  M  UR                  (       a3  UR                  R                  nUc   eUR                  U5        Xv4XK'   M  U R                  XUR                  U5      (       d  GM  UR                  U5        GM+     [        U5       Hq  nXK   u  p|U R                  U R                  UR                  5      U R                  UR                  5      UR                  U5      (       d  M`  UR                  U5        Ms     U H  nUR                  U5        M     U(       a  GM  gg)z
Evaluate pending template fusions for a set of fusion candidate nodes.
The fusion candidate nodes are pointwise nodes as potential epilogue
or prologue fusions
r   r   N)r   r   r  r  r   r  r  r  rf   r   r  rd   r   r   r   )ri   template_fusion_candidatesr(  template_futuresfuture_to_pending_fusionfusions_to_remove	candidatepending_fusionr   r   r8  fcands                rj   "_evaluate_pending_template_fusions,Scheduler._evaluate_pending_template_fusions-  s    )-/  % @J|7	;6ABaGH "<!F!J!J1!M1<=B%)))4->>@%-e;;;;$)M ----e;;;;$)M &&}5]J!((&--44A=(=$++A.3A2M,/ ++n&@&@+  *--i8K 8P ""23'?'B$''''(<(<=''(<(<="..	  &))$/ 4 '*..q1 'q )(rm   c                  ^ ^^       SUUU 4S jjnU GH\  u  pxU" Xx5        T R                  U5      nT R                  U5      n[        Xx5      (       a  Xx4T R                  ;   a  MS  T R                  XxU5      (       d  Ml  T R	                  Xx5      (       a  M  T R                  Xx5      n	U	R                  b  [        U	R                  UUU	R                  S9n
[        Xx5      (       aW  Xx4T R                  ;  d   eT R                  R                  Xx45        [        Xx5      nX;  a  / X;'   X;   R                  U
5        O
U
TU'   U
TU'   GM6  U	R                  (       d  GMJ  T R                  XxT5        GM_     g )Nc                  > TR                  U 5      T;   d  TR                  U5      T;   Ga  TR                  TR                  U 5      TR                  TR                  U5      5      5      nUc   eUR                  5       u  p4UR                  nTR	                  US 5        TR	                  US 5        TR                  U5      UL d   eTR                  U5      UL d   eU" 5       (       a  TR                  X5      (       a  GM  TR                  X4T5        TR                  U 5      T;   a  GM  TR                  U5      T;   a  GM  g g ry   )r  r  r   rd   r  r  r  )	r   r   r  	node_key1	node_key2
is_speedupr(  pending_fusionsri   s	         rj   resolve_pending_fusions<Scheduler._try_fusion_pairs.<locals>.resolve_pending_fusions{  s3   
 ##E*o=&&u-@!0!4!4''.#''(;(;E(BC" &111'5'F'F'H$	+77
##It4##It4**95BBB**95BBB!||t'D'DU'R'R##I+F+ ##E*o=&&u-@@rm   )rd   r   r   rf   r  )r  r  r  r   r  r  rd   r   rf   r  r  r   rc   r  )ri   possible_fusion_pairsr  template_fusion_nodesr(  r  r  r   r   
fusion_resr  template_pw_nodes   ` ` `       rj   _try_fusion_pairsScheduler._try_fusion_pairss  s   	G$	G$	G 	G 	G8 2LE $E1''.E''.E #500Nd&@&@@}}. 33EAA!33EA
))5%2$.$:$:##)00	&N *%77 %~T5O5OOOO2266~F+B5+P(+HFH1C-?FF~V1?.1?.!--##E+>W 2rm   c                N   [        5       nUR                  5        H  nUR                  5       u  pVUR                  nXs;   d  [	        XV5      (       a  M8  UR                  U5        U R                  U5      UL d   eU R                  U5      UL d   eU R                  XVXq5        M     g ry   )r   r   r   rd   r  r  r  r  )ri   r(  r  seen_pair_speedup_fnr  r  r  is_speedup_fns           rj   _finish_pending_fusions!Scheduler._finish_pending_fusions  s    
 @J| .446N#1#B#B#D I*66M48J9 9  $$]3&&y1Y>>>&&y1Y>>>  }R 7rm   c           
        [        U VVs/ s H  u  p4[        X45      (       d  M  UPM     snn5      n/ nU H@  u  p4[        X45      (       a  X5;   a  UR                  X445        M.  UR                  X445        MB     Ung s  snnf ry   )r   r  r  r   )ri   possible_fusionsdeferred_prologue_fusionsn1n2epilogue_template_nodesnew_possible_fusionss          rj   _handle_template_overlap"Scheduler._handle_template_overlap  s     #-.M.FB2DR2LR.M#
  "&FB!"))b.K)00":$++RH5	 ' 0 Ns
   A>
A>
c                ,   U R                  U5        [        U5      n[        R                  [        R
                  5      (       aB  [        R                  S5        U H'  n[        R                  SUR                  5       5        M)     0 n0 n/ nU R                  UU5      n[        R                  (       d  [        R                  (       a;  [        R                  (       a&  [        R                  (       a  U R                  X5        U R                  UUUUU5        U R!                  X55        U R#                  Xc5        UR%                  5         U(       a&  U R                  UUUUU5        U R#                  Xc5        ['        US S9nU R)                  U5      nU$ )z
Combine eligible nodes into FusedSchedulerNodes.

This relies on two key functions to control the logic:
    - self.can_fuse(): checks if a fusion is legal
    - self.score_fusion(): assigns priority to a given fusion
zfuse_nodes_once, candidates:z  %sc                    U R                   $ ry   rh  rQ  s    rj   r  +Scheduler.fuse_nodes_once.<locals>.<lambda>3  s    !++rm   r  )r*  r   r  rZ  r[  rh  r  r  get_possible_fusionsr&   r'  r(  r  r  r  r  r  r  clearr  r  )	ri   r5  r  r(  r   r  r  r  r  s	            rj   r   Scheduler.fuse_nodes_once  si    	!!%( '""7==11;<#  )=)=)?@ $  	
 OQ  	"  44
 %%)<)<&&&&))*:V!	
 	$$[B//0ES##%$"")%  334IW{(=>..u5rm   c                   [        U R                  5      nSn[        U R                  5      n[        R	                  SU5        [        [        R                  U 5      5       GH(  u  pV[        R                  U5      n[        U5      S:  a  M,  Ub  X1:  a    OU R                  U5      (       d  [        R	                  SU5        Md  US-  n[        R                  S:  n[        US   R                  USUS9n[        R                  S	[        U5      U5        U H  n	UR                  U	5        M     UR                  U5        U R                   R#                  UR%                  5        V
s0 s H  oR'                  5       U_M     sn
5        GM+     [)        US
 S9U l        U R+                  U R                  5      U l        [        R                  SUU[        U R                  5      5        U R-                  U R                  5        gs  sn
f )z
Groups parallel nodes
r   z2ComboKernels: Generating with num_ck_nodes = %s...r   Nz)ComboKernels: Not speeding up %d-th groupr   Trd  z0ComboKernels: Combining %d nodes for %d-th groupc                    U R                   $ ry   rh  rQ  s    rj   r  5Scheduler.create_combo_kernel_nodes.<locals>.<lambda>]  s    q{{rm   r  zDGenerated ComboKernel nodes: %d ComboKernels, totally %d -> %d nodes)r   r5  r   r  r  r4  r	  rU  r>  speedup_by_combo_kernelr&   rf  r(  r  r  r  r)  r  r   r0  r  r  r*  )ri   r  r(  r  num_nodes_orignumrP  r'  r_  r   rD  s              rj   r-  #Scheduler.create_combo_kernel_nodes7  s    !,TZZ		FU'&DDTJ
NC 3CCINI9~!'E,@//	::		EsKQJE$;;a?O4!&&*. /	K HHBI
 """4( "OOK(##**4?4I4I4KL4Kq{*4KL7
< K-BC
33DJJ?
R

O		
 	!!$**- Ms   (H
c                L    U H  nUR                  U R                  5        M      g ry   )r*  r)  )ri   r5  r   s      rj   r*  Scheduler.prune_redundant_depsg  s     D%%d&=&=> rm   c                  ^ ^^
^ / m
[         [        [        [        4      " 5       mSUU
UU 4S jjn[        R                  " [
        5      nU HE  nT R                  U5      (       a  M  UR                  5        H  nXF   R                  U5        M     MG     UR                  5        H  nU" U5        M     [        R                  (       ak  [        R                  " [
        5      nU H,  n[        USS5      n	U	(       d  M  X   R                  U5        M.     UR                  5        H  nU" U5        M     T R                  T
5      m
T
R                  T R                  SS9  [         R#                  S[%        T
5      5        T
$ )zN
Helper to find all legal fusion opportunities, sorted by self.score_fusion()
c                  > [        U 5       H  u  pU US-   US-   [        R                  -     H  nX#4nUT;   a  M  TR                  U5        TR	                  X#T5      (       a  TR                  U5        MI  UR                  5       (       d  UR                  5       (       d  Mu  TR	                  X2T5      (       d  M  TR                  X245        M     M     g r  )r4  r&   )max_fusion_buffer_group_pairwise_attemptsr  r   r   rj  rp  )	r5  node1_indexr   r   r  r  r  r  ri   s	        rj   check_all_pairs7Scheduler.get_possible_fusions.<locals>.check_all_pairsv  s    &/&6""!Ok'FF'GE
 !.Cd{ HHSM}}U3CDD(//4++--1A1A1C1C&6J J )//?! '7rm   r   NT)r  reversezfound %d possible fusionsr5  rI  r   rs  )r   r   r\   r  r   r  unfusable_noder   r   r   r&   aggressive_fusionr  *get_possible_fusions_with_highest_priorityr  score_fusion_keyr  r  r   )ri   r5  r  r  buffer_names_groupingr   r   node_groupinggroup_groupingr   r  r  s   ` `       @@rj   r  Scheduler.get_possible_fusionsk  sV    % 13D DEFH	@ 	@( !, 7 7 =D""4((--/%*11$7 0 
 399;MM* < ##(44T:Ngt45")006  "0!6!6!8. "9  JJ
 	$"7"7F4c:J6KLrm   c                  ^ ^^^^ [         [           " 5       mSUUUU U4S jjmUR                  5       R                  R	                  5       UR                  5       R                  R	                  5       -  mUR
                  R                  R	                  5       UR
                  R                  R	                  5       -  T-
  m[        UU 4S jT 5       5      nU(       a  [        X5      " S5        U$ )zf
Finds whether there's a path from node1 to node2 (or vice-versa)
caused indirectly by other fusions.
c                ,  > [        U [        5      (       a~  U T;  ax  TR                  U 5        U R                  5       R	                  T5      (       a  g[        TU R                  -  5      =(       d#    [        UU4S jU R                  T-
   5       5      $ g)NFc              3  N   >#    U  H  nT" TR                   U   5      v   M     g 7fry   r  r   rD  
found_pathri   s     rj   r   IScheduler.will_fusion_create_cycle.<locals>.found_path.<locals>.<genexpr>  s,      H!DA #4#:#:1#=>>!D   "%)r   r   r  r   issubsetrx   r   r   )r   combined_ancestorscombined_namesr  ri   visiteds    rj   r  6Scheduler.will_fusion_create_cycle.<locals>.found_path  s    $ 233G8KD!++-667IJJ !   ?@ C H!%2D!DH E  rm   c              3  N   >#    U  H  nT" TR                   U   5      v   M     g 7fry   r  r
  s     rj   r   5Scheduler.will_fusion_create_cycle.<locals>.<genexpr>  s&     WDVqJt66q9::DVr  zwill create cycler  )r   r   r   _dictr  r   r   r{  )ri   r   r   cycler  r  r  r  s   `   @@@@rj   r  "Scheduler.will_fusion_create_cycle  s     /02	 	2 %%'--224'')//4467 	
 OO!!&&(5??+@+@+E+E+GG WDVWWe#$78rm   c                  ^ ^ SSK Jm      SU 4S jjnU" U5      nU" U5      n[        U4S jU 5       5      n[        U4S jU 5       5      nUR                  U5      nSn	U H  n
 U	[	        U
S   5      -  n	M     T R                  X5      n[        R                  R                  R                  U	S	U-  5      (       a  g
g! [
         a       gf = f)a  
Return true if fusing the two nodes can potentially increasing peak memory.

The implementation is more like a heuristic since we don't really know if we are at peak
or not when trying to fuse these two nodes. The order of nodes may change later which makes the
peak memory estimation hard.

Here is how we decide the LOWER BOUND of extra memory allocation if we fuse these 2 nodes:
1. find all buffers read by each node with a single user. These buffers are supposed to
   be reused if we don't fuses these 2 nodes
2. find the intersection of these buffers for the two node and sum the total buffer size.
   If we don't fuse these two nodes, we can at lease avoid this much memory allocation.
   Note that the extra memory allocation is not necessarily causing peak memory increase.
   This is just a heuristic.

We return true only if the saving for fusion can not trade off the extra memory allocation.
r   )buffer_reuse_keyc                P  > / nU R                   R                   H  nTR                  R                  UR                  5      nU(       d  M1  [        UR                  5      S:X  d  ML  UR                  R                  5       (       d  Mm  UR                  UR                  5        M     U$ r  )
r   r   rX  r  r   r   r-  r   has_tensor_outputr   )r   r  r  r   ri   s       rj   _find_single_user_inputsKScheduler.can_fusion_increase_peak_memory.<locals>._find_single_user_inputs  sw     F&&,,&&**277333syy>Q.3883M3M3O3OMM#((+ - Mrm   c              3  4   >#    U  H  nT" U5      v   M     g 7fry   rw   r   r   r  s     rj   r   <Scheduler.can_fusion_increase_peak_memory.<locals>.<genexpr>       #S]c$4S$9$9]r  c              3  4   >#    U  H  nT" U5      v   M     g 7fry   rw   r  s     rj   r   r     r!  r  r   r   F    T)r   r\   r   zlist[ir.Buffer])r  r  r   intersectionr!  r  r  rW   r   r   statically_known_gt)ri   r   r   r  lhs_dep_nodesrhs_dep_nodeslhs_reuse_keysrhs_reuse_keyscommon_reuse_keysmemory_overheadr  	bw_savingr  s   `           @rj   can_fusion_increase_peak_memory)Scheduler.can_fusion_increase_peak_memory  s    * 	6	#		 1707##S]#SS##S]#SS*77G$C3s1v;. % ,,U:	 77//iPP  s   (C
CCc                   [        UR                  5        Vs/ s H  oDR                  5       PM     snUR                  5        Vs/ s H  oDR                  5       PM     sn-   5      n[        S UR                  R                   5       5      n[        S UR                  R
                   5       5      nXv-  n[        5       n	UR                  R                   HA  n
U R                  U
R                  U5      (       d  M&  U	R                  U
R                  5        MC     [        S UR                  R
                   5       5      [        S UR                  R
                   5       5      -  n[        S UR                  R                   5       5      [        S UR                  R                   5       5      -  nX-
  nX-
  nX-  n[        U5      U:  $ s  snf s  snf )Nc              3  8   #    U  H  oR                   v   M     g 7fry   r  r  s     rj   r   EScheduler.fusion_prevent_too_many_reads_and_writes.<locals>.<genexpr>  s     &T;SCxx;Sr  c              3  8   #    U  H  oR                   v   M     g 7fry   r  r  s     rj   r   r1    s     %R:Q3hh:Qr  c              3  8   #    U  H  oR                   v   M     g 7fry   r  r  s     rj   r   r1  &  s      $
 7HH 7r  c              3  8   #    U  H  oR                   v   M     g 7fry   r  r  s     rj   r   r1  (  r  r  c              3  8   #    U  H  oR                   v   M     g 7fry   r  r  s     rj   r   r1  +  s      %
 8HH 8r  c              3  8   #    U  H  oR                   v   M     g 7fry   r  r  s     rj   r   r1  -  s     D+CCxx+Cr  )
r   r   r0  r   r  r   $can_buffer_be_removed_through_fusionr   r  r   )ri   r   r   	thresholdr   fused_node_namesnode1_write_namesnode2_read_namesreads_removed_through_fusionwrites_removed_through_fusionrG  all_read_namesall_write_namesunique_readsunique_writesunique_io_bufferss                   rj   (fusion_prevent_too_many_reads_and_writes2Scheduler.fusion_prevent_too_many_reads_and_writes
  s    &).):;):]]_):;+0??+<=+<4}}+<=>
 '&T5;L;L;S;S&TT%%R%:K:K:Q:Q%RR'7'K$ :D%**11I88 0  .11)..A	 2 $ $
 % 1 1 7 7$
 
C5+<+<+B+BCCD
 % %
 % 1 1 8 8%
 
D5+<+<+C+CDDE
 &D (G )8$%	11M <=s   GG
c                    [        [        UR                  UR                  -
  5      [        UR                  UR                  -
  5      5      nUS:  $ )a  
This function prevents fusion for nodes that can increase memory
footprint. This problem is more common in horizontal fusion, where nodes
that are far apart in the original order get fused, lengthening the live
intervals of tensors. This is very evident in models with activation
checkpointing, where the recomputed nodes from different checkpointed
regions get fused and significantly increase the memory footprint.

The current attempt is a quick, possibly hacky, heuristic to prevent the
fusion of nodes that are far away in the original order.

A better but difficult to implement heuristic would be to use live
intervals of the buffers, find region of peak pressure in the original
program and prevent fusion that crosses that peak region. We might need
special care or good approximation in this implementation, as fusion of
node changes live intervals, and re-computing live intervals and peak
memory after each fusion can introduce large compilation overhead.
@   )r  r  r  r  )ri   r   r   proximity_scores       rj   are_long_distant_nodes Scheduler.are_long_distant_nodes:  sE    * %//12%//12
 ##rm   c                (   0 nUR                   R                  5        Vs0 s H  oUR                  U_M     nnUR                   R                  5        Vs0 s H  oUR                  U_M     nnU GH  n[        R                  R                  U5      n	Xh   n
Xx   n[        U
[        5      (       a  [        U[        5      (       d  S[        U
5       S[        U5       3XH'   Ms  U
R                  5       UR                  5       :w  a(  SU
R                  5        SUR                  5        3XH'   M  [        U
R                  5      [        UR                  5      :w  a  SXH'   M  U
R                  5       nUR                  5       nX:w  a  SU SU 3XH'   GM!  U
R                  5       UR                  5       :X  a  SU
 SU 3XH'   GMP  Sn[        U	[        R                  5      (       d  SU	R                    3nS	U
 SU S
U 3XH'   GM     [#        U5      $ s  snf s  snf )ze
Try to decide reasons why fusion fail due to no shared memory even though
there are common buffers.
znot MemoryDep: r   zdifferent numel: 	broadcastzdifferent offset: zMismatch loop orders: r  zLayout: zUnknown reason: z. )r   r  r   rW   r   r  r   r2   r   r   rV   r   
get_offsetnormalize_with_stride_orderr)   r  r=  r  )ri   r   r   common_buf_namesreasonsr   node1_name2depnode2_name2deprR  r   lhs_deprhs_deplhs_offrhs_off
layout_strs                  rj   decide_fusion_fail_reason#Scheduler.decide_fusion_fail_reasonU  s    383D3D3U3U3WX3WC((C-3WX383D3D3U3U3WX3WC((C-3WX(H''$$X.C$.G$.Ggy11GY9W9W%d7m_F4=/J !   "g&7&7&99'(9(9(;'<F7CTCTCVBWX !  W\\*mGLL.II$/!((*G((*G! '9	y$Q! 3356689 '=WIVG9$U! Jc2#5#566'

|4
"7)6'"ZLI U )\ 7|c YXs   H
Hc                .	   [         R                  (       d  g[        S X4 5       5      (       a  gUR                  R	                  5       nUR                  R	                  5       nX4-  nU(       d  g[        S UR                   5       5      nXc-
  (       a  g[        U5      S:  a  g[        UR                  R                  5      S:  d#  [        UR                  R                  5      S:  a  g[        [        UR                  R                  5      5      n[        [        UR                  R                  5      5      n[        U[        5      (       a  [        U[        5      (       d  gUR                  R                   V	s0 s H  oR                  U	_M     n
n	UR                  U
;  a  gXR                     n[        U[        5      (       d  gUR                  5       nUR                   UR                   :w  a  UR"                  UR"                  :w  a  gUR"                  UR"                  :w  d  [        UR$                  5      S:w  a  g[        UR&                  R(                  5      S:w  a  gUR&                  R*                  (       a  gSUR&                  R(                  ;   a  SUR&                  R(                  ;   d   e[        S UR&                  R-                  5        5       5      n[        U5      S:w  a  g[        [        U5      5      nXR&                  R(                  S   :X  a  SnSnO"XR&                  R(                  S   :X  d   eSnSnS	S
KJn  UR&                  R2                  S	   n[        U5      S:w  a  g/ n[4        R6                  R9                  U5       H;  nUR;                  [<        R>                  R@                  RC                  U5      5        M=     [E        U5      nU" UUS	   5      nUc  gUR&                  R(                  U   UR&                  R(                  U'   UUR&                  R(                  U'   URG                  SS5        U RI                  X5      n[        U[J        5      (       d   e[L        RO                  SU5        U$ s  sn	f )a  
Attempts to enable fusion between two nodes by inverting indexing patterns.

This optimization targets cases where node1 has a contiguous write and
node2 has a contiguous write but discontiguous read. By inverting the
indexing in node2's read and write operations, we can make them compatible
with node1 for potential fusion.

Args:
    node1: First scheduler node (source)
    node2: Second scheduler node (target for inversion)

Returns:
    int: Fusion score if successful, 0 if optimization not applicable
r
  c              3  @   #    U  H  oR                  5       v   M     g 7fry   r\  rC  s     rj   r   AScheduler.shared_data_after_inverting_indexing.<locals>.<genexpr>  s     2>axxzz>r7  c              3  8   #    U  H  oR                   v   M     g 7fry   r  r  s     rj   r   r\    s      .
 8HH 8r  r   r   index0index1c              3  $   #    U  H  ov   M     g 7fry   rw   )r   r  s     rj   r   r\    s     %T7Std7Ss   r   )generate_inverse_formulaTFz!Shared memory after inversion: %d)(r&   r  r   r   buffer_namesr   r  r   r   r  r3  rF  r   r2   r   r  r   r   rq  r  r  	subblocksget_read_exprs$torch._inductor.invert_expr_analysisra  varsr   Add	make_argsr   rW   r   r   combine_modular_indexing_pairsr   r  r  r!  r  r  )ri   r   r   node1_buffer_namesnode2_buffer_namescommon_buffer_namesnode2_unmet_dependencies
node2_readnode2_writer   node1_writesnode1_writenode2_read_exprs	read_exprread_expr_indexwrite_expr_indexra  r#  simplified_termstermsimplified_read_exprinverse_formulascores                          rj   $shared_data_after_inverting_indexing.Scheduler.shared_data_after_inverting_indexing  s   & 442E>222 #..;;="..;;=0E" $. .
 % 8 8.
 $
  $8'(1, u  &&'!+s53D3D3K3K/Lq/P$u006678
4 1 1 8 89:*i00
9
 9
 161B1B1I1IJ1I##1IJ??,."??3+y11 "++- !2!22  K$4$44??k...#j6J6J2Kq2P u{{))*a/ ;;   222EKK666	
7
 &%Tu{{7Q7Q7S%TT A%./0	 228<<&O' : :8 DDDD&O'Q[[%%a(
z?aII''	2D##  ??E 3  ##3423GTUW "
 7<kk6P6P7
""?3 8G""#34 	""4/((6%%%%%;UCm Ks    Rc                   [         R                  (       a  [        S X4 5       5      (       a  gUR                  5       (       d  UR                  5       (       a  gUR                  R                  5       nUR                  R                  5       nX4-  nU(       d  gUR                  R                  5        Vs0 s H  ofR                  U_M     nnUR                  R                  5        Vs0 s H  ofR                  U_M     nn/ n	U Hw  n
Xz   nX   nUR                  5       UR                  5       :X  d  M/  U	R                  [        R                  R                  R                  UR                  5       SS9UU45        My     [        U	5      S:X  a  g[!        U	["        R$                  " S5      S9u  pn['        U[(        5      (       a  ['        U[(        5      (       d  gUR*                  UR*                  :w  a4  UR-                  5       UR-                  5       :X  a  U R/                  U5      $ gSnUR1                  5       (       d  UR3                  X5      nOZUR1                  5       (       d  UR3                  X5      nO3[4        R7                  SUR9                  5       UR9                  5       5        U(       a*  [:        R<                  " [>        U RA                  X5      5      $ S$ s  snf s  snf )as  
Right now just greedily reorder the loop of node1 to be compatible with node2,
but ideally we should have some heuristics to reorder the loop for node2
to be compatible with node1 if that's more efficient.

Return the amount of shared data re-computed in this method.
If no such recomputation happens, return -1 (not return 0 since 0 is a valid
amount of shared data).

c              3  @   #    U  H  oR                  5       v   M     g 7fry   r[  rC  s     rj   r   >Scheduler.shared_data_after_reordering_loop.<locals>.<genexpr>2  s      8
 .1HHJJr7  r
  r   r   r  Fz?Don't reorder loops since both nodes are reductions: %s v.s. %s)!r&   r  r   rj  r   rb  r  r   rM  r   rW   r   r   	size_hintr   r   r  r  r  r   r2   r  r  dep_size_hintr   r  r  r  r0  r  r  r!  r  )ri   r   r   rj  rk  rl  r   rP  rQ  
candidatesr  rR  rS  _numel	reordereds                  rj   !shared_data_after_reordering_loop+Scheduler.shared_data_after_reordering_loop"  s     00C 8
!&8
 5
 5
 
 %"3"3"5"5"..;;="..;;=0E"383D3D3U3U3WX3WC((C-3WX383D3D3U3U3WX3WC((C-3WX 
.K$1G$1G3356689 !!((2273D3D3FQR2S / z?a $'zx7J7J17M#N '9--Z5S5Sw///
   "g&7&7&99))'22	!!##77II##%%77II##Q    KKT55eCD	
 	
g YXs   6K!*K&c                    [        U[        [        45      =(       a6    UR                  5       (       + =(       a    [	        UR
                  5      (       + $ )z.
Is this node unfusable under any conditions.
)r   r  r  rj  rT   r   r  s     rj   r  Scheduler.unfusable_node}  sD    
 t79OPQ C$$&&C7		BB	
rm   c                   UR                  5       [        R                  R                  ::  a  gUR	                  5       nUR                  5       nSnXEU-  :  a	  U" S5        g[        S UR                  5        5       5      nU[        R                  R                  R                  R                  4:X  a	  U" S5        gS	S jnU" UR                  5       R                  5      (       a  UR                  5       (       d	  U" S5        gg)
zD
Heuristics to avoid benchmarking predictably slow prologue fusions
T皙?z@prologue fusion will not increase amount of bytes read in kernelFc              3     #    U  HT  nUR                   c  M  UR                   R                  5         H#  nUR                  S:X  d  M  UR                  v   M%     MV     g 7f)Ncall_function)r   r  r1  r  )r   rD  r  s      rj   r   EScheduler.check_prologue_fusion_heuristics_fusable.<locals>.<genexpr>  sS      
.vv  VV'')tt&	 AHH * .s   A,AAz\prologue fusion will not increase attempt to fuse in padding bc it increases unaligned readsc                F    U R                   S:*  =(       a    U R                  $ )Nr   )itemsizeis_floating_point)r  s    rj   low_prec_fpGScheduler.check_prologue_fusion_heuristics_fusable.<locals>.low_prec_fp  s    >>Q&B5+B+BBrm   zVprologue fusion that must be upcast to fp32 not profitable for low precision templates)r  ztorch.dtyper   rx   )r   rW   r   invoke_quant_opsr  r  r   r   r  r`  ra  constant_pad_ndr  r,  r  rG  )	ri   prologue_noder8  r  
read_byteswrite_bytesBYTES_THRESHOLD_MULTIPLIERr  r  s	            rj   (check_prologue_fusion_heuristics_fusable2Scheduler.check_prologue_fusion_heuristics_fusable  s     ,,.!''2J2JJ"88:
#::< &)"'AABRS  
",,.
 
 uyy~~55==??n 	C @@BHHII!>>@@h rm   c                  ^  [        U[        5      (       a  [        U[        5      (       d  g[        UR                  [        R                  5      (       a)  [        UR                  [        R                  5      (       d  gUR                  5       (       d  UR                  5       (       a  g[        R                  S:X  a  gUR                  UR                  pCUu  pVUu  pxUR                  5       (       d2  UR                  5       (       d  Xh:w  d  [        U5      [        U5      :w  a  g[        UR                  R                  5      S:  d#  [        UR                  R                  5      S:  a  gT R                  [        [        UR                  R                  5      5      5      n	T R                  [        [        UR                  R                  5      5      5      n
[!        X5      [        R"                  :  a  gSU 4S jjnU" U5      (       d  U" U5      (       a  g/ n[%        ['        XW5      5       H   u  nu  pX:w  d  M  UR)                  U5        M"     [        U5      S:w  a  gUS   nUU   UU   nn[*        R,                  R.                  R1                  UU5      (       a  UUU4$ [*        R,                  R.                  R1                  UU5      (       a  UUU4$ g)a?  
Fusing two small pointwise nodes significantly reduces kernel overhead
and launch overhead. However, slightly different sizes would prevent fusion.
Here, we decide if expanding sizes of one node is profitible by allowing
fusion, and returns the dimension to expand, node with smaller sizes,
and new size after expand.
Nr  r   c                  > U R                   R                   H  nUR                  TR                  ;   a  TR                  UR                     nO%TR                  R                  UR                  5      nU(       d  Me  [        R                  R                  R                  X 5      (       d  M  [        UR                  [        5      (       a  M    g   gr  )r   r   r   rW  rX  r  rW   r   rR  r  r   r*  r  )r   r  r  ri   s      rj   has_reusable_bufferIScheduler.get_expand_dim_for_pointwise_nodes.<locals>.has_reusable_buffer  s    ((..99 ; ;; $ ; ;DII FI $ 0 0 4 4TYY ?I I,,66yGG&y'<'<>TUU / rm   r   r  )r   r   r   r)   r   r  r&   r  r  r   r   r   r  r  r3  rF  r  small_memory_access_thresholdr4  r)  r   rW   r   r   statically_known_lt)ri   r   r   n1_sizesn2_sizesn1_iter_sizesn1_reduce_sizesn2_iter_sizesn2_reduce_sizesnode1_write_memorynode2_write_memoryr  mismatch_dimensionsidxn1_sizen2_sizemismatch_dimmismatch_size1mismatch_size2s   `                  rj   "get_expand_dim_for_pointwise_nodes,Scheduler.get_expand_dim_for_pointwise_nodes  sl    %//z%7W7W uzz2#4#4555::r'8'899 ))++u/M/M/O/O ) #\\5<<()1&)1&  !!##1=!S%77 u  ''(1,E4E4E4L4L0MPQ0Q "//T%:K:K:R:R5S0TU!//T%:K:K:R:R5S0TU"7223 	  u%%)<U)C)C !'0]1R'S#C#'!#**3/ (T "#q(*1-,',' ' 77//OO66WW11..QQ66rm   c                  ^ XL a  g[        U[        5      (       a  UR                  U5      $ [        U[        5      (       a  g[        X5      nUR	                  5       (       a4  U R                  UR                  5       5      R                  X5      (       a  g[        U[        5      (       d  [        U[        5      (       a	  U" S5        g[        U[        [        45      (       a  UR	                  5       (       d	  U" S5        g[        U[        [        45      (       a  UR	                  5       (       d	  U" S5        gUR                  5       UR                  -  (       a	  U" S5        gUR	                  5       (       Gac  [        R                  (       d	  U" S5        gUR                  5       (       d  UR	                  5       (       a	  U" S5        gUR!                  5       n[        U["        R$                  5      (       d	  U" S	5        gUR'                  5       n[)        S
 UR*                   5       5      U-
  nUR-                  5       U-  (       a	  U" S5        gUR/                  5       (       d  UR/                  5       (       a	  U" S5        gUR1                  5       mTSS  HK  n	U	R3                  5       n
U
 H2  n[5        U4S jUR6                   5       5      (       a  M)  U" S5            g   MM     [        U[8        5      (       d  U/O2UR:                   Vs/ s H  oR	                  5       (       d  M  UPM     snn[=        U5      S:X  d   eUS   n[=        TS   R>                  5      S:X  aU  [=        TS   R>                  S   R6                  5      S:X  a,  TS   R>                  S   R6                  S   R@                  UL d	  U" S5        gU RC                  XU5      (       d  gUR	                  5       (       aH  UR/                  5       (       d*  UR                  5       (       d  [        RD                  (       d	  U" S5        gUR-                  5       [F        RH                  RJ                  -  (       d0  UR-                  5       [F        RH                  RJ                  -  (       a	  U" S5        gUR                  5       nUR                  5       nUU:w  a  U" SUU5        gAU RM                  XUS9n[        U[N        5      (       d   eU(       aB  U[        RP                  :  a.  [        RR                  (       a  U RU                  X5      nUS:  a  Un[        RV                  (       aX  U RY                  X5      =n(       a@  Uu  nnnUR[                  UU5        U RM                  X5      n[        U[N        5      (       d   e[        R\                  (       a-  U[        RP                  :  a  U R_                  X5      nUS:  a  Un[`        Rc                  [d        Rf                  5      (       a4  [`        Ri                  SURk                  5       URk                  5       U5        [F        Rl                  Ro                  XUU5      (       d  gUR                  5       UR                  -  (       a`  U Rq                  X5      =(       aH    [F        Rl                  Rq                  XUU5      =(       a     U R                  U5      Rq                  X5      $ [F        Rl                  Rs                  XUU5      =(       a     U R                  U5      Rs                  X5      $ s  snf )zR
Determine if it is possible to combine node1 and node2 into a
single fused node.
FTz/grouped node must not be fused with other nodesznode1 is extern or nopznode2 is extern or nopznode1 must go before node2zprologue fusion turned offz2prologue fusion only supported for pointwise nodesz2prologue fusion only supported for TritonTemplatesc              3  @   #    U  H  oR                  5       v   M     g 7fry   r0  )r   r  s     rj   r   %Scheduler.can_fuse.<locals>.<genexpr>g  s     E_c<<>>_r7  z;prologue fusion not implemented for kernel for these inputsz:template prologue can only fuse functional pointwise nodesNr
  c              3  @   >#    U  H  oR                   T;   v   M     g 7fry   r   )r   rD  prologue_nodess     rj   r   r  w  s     QytyyN:ys   z7template prologue can only fuse nodes with a single user   r   zEtemplate prologue can only fuse nodes with a single use into templateztemplate epilogue not satisfiedz#fusion for buffer explicit disabledzdevice mismatch (%s vs %s)r  z%s and %s has %s shared data):r   r  r   r{  rj  r  r   can_fuse_multi_outputs_templaterm  r  r  r   r   r&   r  r   r,  r)   r  get_allowed_prologue_inpsr   rW  r=  r  r   r  r   r-  r   r   r   r  r   r  r  rW   r   no_fuse_buffer_namesr  r!  score_fusion_memory_thresholdr  r  $expand_dimension_for_pointwise_nodesr  r  r  r{  r  rZ  r[  rh  r  r0  r  r   can_fuse_verticalcan_fuse_horizontal)ri   r   r   can_reorderr  r  r+  ru  unsupported_prologue_argsr   	node_outsr   rD  template_snodestemplate_snoder[  device2shared_data_scorenew_shared_data_scoreexpand_analysis
expand_dimsmaller_nodeexpand_sizer  s                          @rj   r   Scheduler.can_fuse$  s.    >e455&&u--e455 %4#3#3$

)
)%
7$8 e122j'7
 7
 ABu8:PQRR%%''()u8:PQRR%%''()$$&8,-))01!!##u'8'8':':HI779Hh(?(?@@HI$,$F$F$H! EX__EE'( &
 %%'*CCQR--//53Q3Q3S3SPQ"__.N&s+ ,,.	$CQsyyQQQUV$ % , "%);<< !&AAaA 
 '1,,,,Q/N N2&../14r*2215;;<A"2&..q177:??>Q[ @@sSS**,,!!##))12""$qww'C'CC""$qww'C'CC56!!#""$W,fg> 444M 5 
 +S1111 !F$H$HH11$($J$J5$X!$)$9!66#FFuTTOT6E3Z{<<ZU $ 8 8 F/5555 11!F$H$HH$($M$M%! %)$9!))'--88##.  !	 yy!!$u6GHH$$&8 &&u4 MII//UDUVM$$V,>>uL 9900U$5 M""6*>>uLMs Bs   ]/]c                   UR                  5       n[        X5      n[        [        5      nUR                   Ht  nU R
                  R                  UR                  UR                  5      n[        U[        5      (       a  U R                  XaU5      (       a  Ma  XW   R                  U5        Mv     UR                  R                   H  n[        U[        5      (       d  M  UR                  U R
                  R                  UR                  UR                  5      5      n	U	(       d  Mb  U	 H,  n
U R                  X5      (       d  M  U	R!                  U
5        M.     M     [#        S [$        R&                  R)                  UR+                  5       5       5       5      nX-  (       a	  U" S5        gUR-                  5       nU HJ  nU R.                  U   R1                  5       nXR2                  U   R4                  -  (       d  MB  U" S5          g   g)z
Check if it is legal to fuse a consumer (node2) into a producer (node1).

We can fuse them if all the reads of node2 either match
corresponding writes in node1, or are written by nodes that can
be scheduled before the fusion of node1 and node2.
c              3  :   #    U  H  nUR                   v   M     g 7fry   r  r  s     rj   r   .Scheduler.can_fuse_vertical.<locals>.<genexpr>  s      $
U HHUr  zmemory deps did not matchFz(intermediate nodes between node1 & node2T)r=  r{  r   r  r  r  r  r   r   r4   r  r   r   r  r2   fusable_read_and_writer  r   r  r  r*  r   r   rX  r2  r)  r   )ri   r   r   node1_buf_namesr  remaining_deps_by_namer   r   cd	remainingr  remaining_depsnode1_op_namesr  s                 rj   r  Scheduler.can_fuse_vertical  s     002%7B47H++C((,,SXXsxx@D#w''D,A,A#e,T,T"(//4	 , ##**Bb),,.22%%))"''277;I y#B222::!((, $ + $ $
 445K5R5R5TU$
 

 +
 +,224"D&&t,==?G 7 7 @ J JJJ>?	 # rm   c                  ^ UR                   UR                  5       ;  a  gUR                  R                   Vs/ s H!  nUR                   UR                  :X  d  M  UPM#     nn[        U5      S:w  a  gUS   m[        T[        5      (       a  g[        T[        5      (       d   e[        TR                  [        R                  5      (       a  gU R                  UR                     nU/n[        U[        5      (       a  UR                  nSnU He  n	U	R                  R                    V
s/ s H  n
U
R                   U:X  d  M  U
PM     nn
U(       d  MD  US-  n[#        U4S jU 5       5      (       a  Me    g   US:*  $ s  snf s  sn
f )NFr   r   c              3  $  >#    U  H  n[        U[        5      =(       ai    [        UR                  [        R
                  5      (       + =(       a9    UR                  TR                  :H  =(       a    UR                  TR                  :H  v   M     g 7fry   )r   r2   r!   r   r#   TMPr   )r   r  rs  s     rj   r   -Scheduler.fusable_weak_dep.<locals>.<genexpr>5  sn      
 +D	 4+ ,+DJJAA,JJ%++-, II+, +s   BB)r   r=  r   r  r  r   r   r3   r2   r!   r   r#   r  r  r	  r   r   r   )ri   weak_depr   r   rs  mutating_writesrH  relevant_reading_nodesnum_concurrent_readsreading_noder  relevant_readss       `       rj   r  Scheduler.fusable_weak_dep  s    == 6 6 88 **11
1zzX222 1 	 

 1$"eW%%%++++u{{DHH55++H,A,AB	"'e788%*\\" 2L )44:::D99	) :  
 " A%  
 +   ! 3" $q((K
*s   E>E>+FFc                z   [        U[        5      (       Ga  U R                  R                  UR                  UR                  5      nX2R                  :w  dR  [        UR                  [        R                  5      (       d)  [        UR                  [        R                  5      (       a  g[        R                  (       a:  UR                  UR                  :w  a   UR                  5       nUR                  5       nU R                  UR                  5      (       a  gUR                  UR                  :H  =(       aa    [        UR                   5      [        UR                   5      :  =(       a/    UR                   S [        UR                   5       UR                   :H  $ [        U["        5      (       a  U R                  R                  UR                  UR                  5      nU R                  R                  UR                  UR                  5      nUR                  UR                  :X  a  UR                  b  X4:X  a  ggr_  )r   r2   r  r  r   r!   r   r#   r  r&   r  r  r  r  rL  r   r   r3   )ri   r  rs  	read_name
write_names        rj   r   Scheduler.fusable_read_and_writeC  s   dI&&--11$))TYYGI ZZ'&tzz488<<&u{{DHH==00T]]enn5T ~~') 11%**== 

ekk) ?		Nc%**o5?II/EJJ0EJJ>
 g&&--11$))TYYGI..225::uzzJJ		UZZ'JJ*+rm   c                @    [         R                  R                  X5      $ ry   )rW   r   get_dep_size_hint)ri   r   r  s      rj   r  Scheduler.dep_size_hintj  s    ww((::rm   c                  ^ ^^ U4S jnU(       a8  [         R                  X5      (       a  [         R                  X5      nU" US5      $ [        UR                  R
                  5      [        UR                  R                  5      -   n[        UR                  R
                  5      [        UR                  R                  5      -   n	[        X5      S-  [        X5      :  a  X:  a  X!p!UR                  R
                  UR                  R                  -   V
s/ s H9  n
XR                  R
                  ;   d  XR                  R                  ;   d  M7  U
PM;     nn
U" [        UU 4S jU 5       5      S5      $ UR                  R
                  UR                  R                  -  UR                  R
                  UR                  R                  -  -  nU" [        U 4S jU 5       5      S5      $ s  sn
f )zV
The first term in our fusion score that estimates number of saved
memory operations.
c                   > T(       a  X4$ U $ ry   rw   )rz  is_mix_order_reductionreturn_is_mix_order_reductions     rj   _construct_return_value>Scheduler.score_fusion_memory.<locals>._construct_return_valuez  s     1 / rm   Tr  c              3  H   >#    U  H  nTR                  UT5      v   M     g 7fry   r  )r   r   r  ri   s     rj   r   0Scheduler.score_fusion_memory.<locals>.<genexpr>  s!     IDSD&&sK88D   "Fc              3  F   >#    U  H  nTR                  U5      v   M     g 7fry   r  r  s     rj   r   r    s!     F3EC""3''3Er  )
r   r   r   r   r   r   r  r  r  r   )ri   r   r   r  r  r  r  rz  node1_dep_lennode2_dep_lenr   r  common_memory_depss   `  ``        rj   r  Scheduler.score_fusion_memorym  s   	 %):)C)CE)Q)Q
 &66uDE*5$77E--334s5;L;L;S;S7TTE--334s5;L;L;S;S7TT },q03}3TT,$u !,,22U5F5F5M5MMMC++111S<M<M<T<T5T M   +IDII5  $//558I8I8P8PP##e&7&7&>&>>
 'F3EFF
 	
s   6G.G.c                   [        U5      S:X  a  U$ 0 nU H  u  p4UR                  5       UR                  5       :X  d   eUR                  5       n[        U R                  U5      R	                  X45      5      nXb;  a  X44/X&'   Mo  X&   R                  X445        M     [        UR                  5       [        R                  " S5      S9S   n[        U5      S:  d   eU$ )Nr   r  r   )
r   r   r!  r  get_fusion_pair_priorityr   r  r/  r  r  )ri   r  "possible_fusions_group_by_priorityr   r   r[  fusion_pair_priority&possible_fusions_with_highest_prioritys           rj   r  4Scheduler.get_possible_fusions_with_highest_priority  s    
  A%##  	+ -LE##%)9)9);;;;%%'F#&  (AA%O$  $MNL2H 3HOON - 25.446H<O<OPQ<R2

2. 9:Q>>>55rm   c                D    [         R                  R                  " U /UQ76 $ )z
Shim for list.sort(key=...)
)rW   r  score_fusionr  s     rj   r  Scheduler.score_fusion_key  s     yy%%d3U33rm   c                    [        [        R                  R                  5       5      n[	        U R
                  5       H9  nUR                  XR                  5        UR                  UR                  5        M;     g)zW
Populate node.last_usage recursively (also for the nodes within a FusedSchedulerNode)
N)
r   rW   r   r/  r   r5  r  r  r  r~  )ri   r  r   s      rj   r;  Scheduler.compute_last_usage  sV    
 ))A)A)CDTZZ(D 35L5LM&&t7 )rm   c                   [        U R                  [        R                  R                  -
  [        R                  R
                  R                  -
  5       GH  nXR                  ;   a[  U R                  U   nUR                  5       (       a5  [        R                  R
                  R                  UR                  5        Ml  Mn  U[        R                  R                  ;   d  M  [        R                  R                  U   n[        U[        R                  5      (       a+  [        R                  R
                  R                  U5        M  [        U[        R                  5      (       a  GM  UR                   n[        U[        R"                  5      (       a  UR%                  5       (       d   e[        R                  R
                  R                  UR                   5        GM     U R                  R'                  5         g)z*Free any buffers that are no longer neededN)r  rA  rW   r   r  rR  freedrX  rb  codegen_freer   r  r   r)   r  rP  r   r3  is_input_bufferr  )ri   r   r   r  storages        rj   free_buffersScheduler.free_buffers  sU   %%gg%%&gg""(()
D
 '''&&t,<<>>GG((55chh? "---gg**40c2#5#566GG((55c:R%6%677!hhG"7BMM::w?V?V?X?XXGG((55gllC)
, 	!!'')rm   c                    U R                   R                  5        H  nUR                  5         M     U R                  5         g ry   )r  r   flushr	  )ri   r  s     rj   r
	  Scheduler.flush  s.    }}++-GMMO .rm   c                   [        U[        5      (       d   e[        S   S==   S-  ss'   [        R                  " [        SS95         UR                  5         UR                  5         S S S 5        UR                  n[        U[        R                  5      (       d   S[        U5      < 35       eUR                  [        R                  R                  5        U R                  5         g ! , (       d  f       N= f)Nr  extern_callsr   F)increase_kernel_countztype(node)=)r   r  r   rW   set_kernel_handlerr.   r  r  r   r)   re  r   r  r   rR  r	  )ri   scheduler_noder   s      rj   codegen_extern_callScheduler.codegen_extern_call  s    .*CDDDD
 	^,1,!!&u"EF002##% G ""$00B[T$ZM2BB0QWW))* GFs   	!C++
C9c                |   [        UR                  5      (       a  UR                  c
   U S35       e[        R                  R                  U5        [        UR                  5      nUc  [        SUR                   35      e[        5       (       d  UR                  S:X  aN  [        R                  R                  U5      =nR                  S:  a  [        U[        R                  " 5       5      e[        UR                  5      (       a.  UR                  S:X  d  [!        [        R                  " 5       5      eU" U 5      $ )Nz( should have been normalized in loweringzUnsupported device type: r      rB  )rR   r   r   rW   r   add_device_infor-   r  r$   r  r   get_device_propertiesmajorr5   inspectcurrentframer6   )ri   r[  device_schedulingdevice_propss       rj   create_backendScheduler.create_backend  s    &++&&&,,*B 	
h>?	
B 	
'5fkkB$!:6;;-HII||v%%*ZZ%E%Ef%MM\TTWXX(w7K7K7MNN$$V[[E-A#G$8$8$:;; &&rm   c                    Uc   eXR                   ;  a  U R                  U5      U R                   U'   U R                   U   $ ry   )r  r	  rZ  s     rj   r  Scheduler.get_backend  s@    !!!&$($7$7$?DMM&!}}V$$rm   c                  ^  SU 4S jjnUR                  5        VVs0 s H?  nUR                  c  M  UR                  R                  5         H  nU" U5      U4S _M     MA     nnn[        UR	                  5       5      nU(       aJ  [        U[        R                  " S5      S9u  pg[        R                  R                  R                  U5        g g s  snnf )Nc                   > U TR                   ;  aM  TR                   R                  [        U R                  R                  5       VV s0 s H  u  pX_M	     sn n5        TR                   W    $ s  sn nf ry   )rB  r  r4  r   r5  )rD  r1  ri   s     rj   	get_order*Scheduler.enter_context.<locals>.get_order  s^    ,,,$$++i>V,W>VdaQT>V,WX''** -Xs   	A.
r   r  )rD  ztorch.fx.Noder   r!  )r   r   r  r  r  r  r  r  rW   r   rR  enter_context)ri   r   r"	  rD  r  r  r  lasts   `       rj   r$	  Scheduler.enter_context  s    	+ ^^%
%vv  VV'') q\1t# * % 	 
 w||~&'x':':1'=>GAGG  ..t4 
s
   C1Cc                   ^  U R                   U   R                  n[        U4S jU 5       5      =(       a#    XR                  ;  =(       a    XR
                  ;  $ ! [         a     gf = f)NFc              3  n   >#    U  H*  oR                   =(       d    UR                  5       T;   v   M,     g 7fry   )r  r0  )r   rD  r9  s     rj   r   AScheduler.can_buffer_be_removed_through_fusion.<locals>.<genexpr>8  s)     VPUC3C CCPUr  )rX  r-  KeyErrorr   r  r  )ri   r   r9  r-  s     ` rj   r7  .Scheduler.can_buffer_be_removed_through_fusion0  sj    	$$T*00E VPUVV 41114333	
  		s   A 
A('A(c                   UR                   n[        U[        R                  R                  R
                  5      (       ax  UR                  =n(       ae  [        U5      u  pEU[        R                  ;   d  U[        R                  ;   a0  [        U[        R                  R                  5      (       d   eSU 3$ [        R                  R                  R                  R                  (       d  [        R                  c  g[        U[         5      (       a0  UR"                   H  nU R%                  U5      nU(       d  M  Us  $    gUR                   c   eUR'                  5       (       d  UR)                  5        S3$ [        UR                   [        R*                  5      (       a  g[        UR                   [        R,                  5      (       a  g[/        UR                   SS5      (       a  g[1        UR                   5      (       a  g	U R3                  U5      =n(       a  U$ [        R                  R4                  (       a  [7        U5      (       a  g
g)zr
Return the reason why we should partition the inductor graph on this node,
or None if the node is cudagraphable.
zcustom partition op: Nz6partition includes all ops when cudagraphs is disabledz opszDeviceCopy opszConditional opsunbacked_bindingszunbacked binding opszCUDAGraph-unsafe custom opszdynamic shape ops)r   r   r  r  r)   r  r  rM   r&   custom_should_partition_ops_ops
OpOverloadr   r[   rE   wrapperr   r   should_partitionrR   r   
DeviceCopyConditionalr  rQ   &_uses_cudagraph_unsafe_unbacked_symintcudagraph_skip_dynamic_graphsr  )ri   r   r  r1  op_overload_packet_nameop_overload_namer\  r~  s           rj   r2	  Scheduler.should_partition=  s    ))gu11@@AA%%%B%8DR8H5#'6+M+MM#v'I'II!"ejj&;&;<<<<./?.@AA &&--886>>FKd.//..u56!M % yy$$${{}}oo'(--dii//#dii00$4991488)!$)),,0@@FF6FM ==66-d33*rm   c                   [        5       n[        R                  (       d  U$ U R                   GH  nUR                  nUc  M  [        U[        R                  R                  R                  5      (       d  MJ  UR                  nUc  M[  [        U5      u  pVU[        R                  ;  a  U[        R                  ;  a  M  UR                  5        Hn  n[        R                  R                  R!                  U5      n[#        U[$        R&                  [$        R(                  45      (       d  M]  UR+                  U5        Mp     GM     U$ )zS
Collect output unbacked symints from ops in config.cudagraph_unsafe_unbacked_ops.
)r   r&   cudagraph_unsafe_unbacked_opsr5  r   r   r  r  r)   r  r  rM   r  rW   r   r   r   r"   r#   UNBACKED_INTUNBACKED_FLOATr  )ri   unsafe_symintsr   r  r1  r7	  r8	  syms           rj   &_get_cudagraph_unsafe_unbacked_symints0Scheduler._get_cudagraph_unsafe_unbacked_symints|  s    
 4><33!!JJDiiGgu'9'9'H'HII$$Bz8DR8H5#'v/S/SS$F,P,PP779gg&&//4!#(9(94;N;N'OPP"&&s+ :' 0 rm   c                    U R                  5       nU(       d  g [        U5      nU HM  n[        R                  R                  R                  U5      nUR                   H  nXb;   d  M
  SU 3s  s  $    MO     g )Nz'uses cudagraph-unsafe unbacked symint: )r@	  r  rW   r   r   r   r    )ri   r   r>	  node_symbolsr?	  simplified_symfree_syms          rj   r5	  0Scheduler._uses_cudagraph_unsafe_unbacked_symint  sn     DDF5d;CWW--66s;N*77-DXJOO 8   rm   c                    0 nUR                  [        R                  R                  5        U R                   H4  nUR
                  R                  5        H  u  p4UR                  X'   M     M6     U$ )zf
Return a mapping from name strings to the corresponding graph inputs or
base scheduler node outputs.
)r  rW   r   r  r5  r  r/  r   )ri   r  r   r   scheduler_buffers        rj   get_name_to_nodesScheduler.get_name_to_nodes  sd     UWAGG001JJD*.*>*>*D*D*F&%5%:%:" +G  rm   c           	        [        [        R                  R                  5       VVs0 s H  u  p#X2_M	     nnn[        [        R                  R	                  5       5       VVs0 s H  u  p#X2_M	     nnn/ [        R                  l        [        U5       H  u  pgUR                  (       a  M  / nUR                   H#  nUR                  UR                  U5      5        M%     / n	UR                   H1  n
U	R                  UR                  U
R                  5       5      5        M3     [        R                  R
                  R                  [        UUU	UR                  5      5        M     gs  snnf s  snnf )zj
computes a mapping from partition input/output indices to graph input/output
indices for each partition.
N)r4  rW   r   r  r/  partition_mapsskip_cudagraphinput_nodesr   r  output_nodesr0  rN   constant_names)ri   
signaturesr  r   name_to_graph_input_indexname_to_graph_output_indexpartition_id	signatureinput_mappingoutput_mappingr   s              rj   compute_graph_partition_maps&Scheduler.compute_graph_partition_maps  s;    (11E1E'F%
'F)#DI'F 	" %
 (11I1I1K'L&
'L)#DI'L 	# &
 "$'0'<#L''
 M!--$$%>%B%B4%HI .  N!..%%&@&D&DT]]_&UV / GG""))! !",,	! (=%
&
s   E'"E-c                  ^     SS jm    SS jn[        5       R                  " S U 5       6 nUR                  " U4S jUR                  5        5       6   U" U5      n[        5       nU HG  n[        R
                  R                  R                  U5      nUR                  UR                  5        MI     [        [        U[        R                  " S5      S95      $ )	a9  
Returns all symbol inputs which are required to be in scope to successfully
perform codegen for this graph partition, including:
- free symbols used in partition nodes
- free symbols in partition input/node shapes, strides, and offsets. This is needed
  for recording cudagraphs for tensors with dynamic shapes.
c                    [        U [        R                  5      (       a
  [        5       $ [        U [        R                  5      (       a  [        U 5      $ [        S[        U 5       35      e)z?
Gets symbols used in input node shapes, strides, and offsets.
zUnsupported input node type: )r   r)   r  r   r	  r  r  r   r   s    rj   get_input_node_symbolsKScheduler.get_graph_partition_symbol_inputs.<locals>.get_input_node_symbols  sT     $ 2 233!|#D")),,)$// *,I$t**VWWrm   c                &    [        S U  5       5      $ )z
Filters a set of symbols that are required for codegen. Skip symbols
that are always internal to kernels, such as SymT.TMP, SymT.INDEX,
and SymT.R0_INDEX.
c              3     #    U  HV  n[        U[        R                  [        R                  [        R                  [        R
                  45      (       d  MR  Uv   MX     g 7fry   )r"   r#   SIZEFLOATr<	  r=	  r   r  s     rj   r   VScheduler.get_graph_partition_symbol_inputs.<locals>.filter_symbols.<locals>.<genexpr>  sI       A!		

))++	  s   AA 	A r   )symbolss    rj   filter_symbolsCScheduler.get_graph_partition_symbol_inputs.<locals>.filter_symbols  s         rm   c              3  8   #    U  H  n[        U5      v   M     g 7fry   r  r5  s     rj   r   >Scheduler.get_graph_partition_symbol_inputs.<locals>.<genexpr>  s     Iyt,T22yr  c              3  8   >#    U  H  u  pT" U5      v   M     g 7fry   rw   )r   r  r   r\	  s      rj   r   rh	  !  s     N:Mwq$T**:Ms   r   r  )r   z0Union[ir.IRNode, sympy.Expr, ir.TorchBindObject]r   OrderedSet[sympy.Symbol])rd	  rj	  r   rj	  )r   rd  r  r/  rW   r   r   r   r    r  r  
attrgetter)	ri   	partitionrN	  re	  candidate_symbolsr  r  symplified_sr\	  s	           @rj   !get_graph_partition_symbol_inputs+Scheduler.get_graph_partition_symbol_inputs  s    	XB	X%	X 	-	%	, 7Al6H6HIyI7
 	  N+:K:K:MN	
 ++<=(2"A77++44Q7LJJ|001 #
 &(*=*=f*EFGGrm   c           
       ^ ^ / n[        [        R                  R                  5       5      nT R	                  5       nSUU 4S jjm[        [        U5      [        U5      5       GHj  u  pg[        5       nU H,  n	UR                  U	R                  R                  5       5        M.     UR                  U5      n
[        R                  R                  U V	s/ s H  oR                  PM     sn	5      n[        UR                  UR                   -   Vs/ s H&  n[#        U[$        5      (       a  M  UR&                  PM(     sn5      U-
  n[        U 4S jU 5       5      n[        5       nU H  n	UR                  U	R(                  5        M      X-
   Vs/ s H  nX;   d  M
  UPM     nnUR                  U5        U Vs0 s H  nX;   d  M
  XU   _M     nnU Vs0 s H  nX;   d  M
  XU;   _M     nnU Vs/ s H  nX;   d  M
  X;  d  M  UPM     nnU
R                  U5        [        U 4S jU
 5       5      n
U
 Vs/ s H  nT" U5      (       a  M  X_   PM     nnU Vs/ s H$  o[        R                  R*                  ;   d  M"  UPM&     nnT R-                  UU5      n[/        UUUUUU5      nUR1                  U5        UR3                  XJ-
  5      nGMm     USSS2   $ s  sn	f s  snf s  snf s  snf s  snf s  snf s  snf s  snf )z
Gets signature for each graph partition, including input nodes, output nodes, and
whether deallocating an input within graph partition.
c                   > TR                   R                  U S5      nUc  g[        UR                  R                  [
        5      (       a,  TR                  R                  U S5      =n(       a  T" U5      $ gg)z
Checks if buf_name resolves to a NoneLayout buffer (following mutation_real_name).
Buffers with NoneLayout are not allocated so graph partition should not
take them as inputs or outputs.
NFT)rX  r  r   r   r=  r=   r  )rR  r   rH  is_unallocated_bufferri   s      rj   rs	  FScheduler.get_graph_partition_signature.<locals>.is_unallocated_buffer:  sk     ""&&x6C{#((//:66 !% 7 7 ; ;Hd KK9K0;;rm   c              3  Z   >#    U  H   nTR                   R                  X5      v   M"     g 7fry   r  r  r   r   ri   s     rj   r   :Scheduler.get_graph_partition_signature.<locals>.<genexpr>m  ,      /1D ''++D771   (+c              3  Z   >#    U  H   nTR                   R                  X5      v   M"     g 7fry   rv	  rw	  s     rj   r   rx	    ry	  rz	  Nr
  )rR  r  r   rx   )r   rW   r   r/  rI	  r)  r   r  r  r  r$  r(   rb  rc  r   r   r  r   r4   r   r~  r  ro	  r:   r   rd  )ri   
partitionsskip_cudagraphsrQ	  unmet_output_namesr  rl	  rM	  output_namesr   returned_output_namesr   r  partition_input_namesrA  r   extra_input_namesrN	  input_deallocationextra_output_namesrO	  rP	  symbol_inputspartition_signaturers	  s   `                       @rj   get_graph_partition_signature'Scheduler.get_graph_partition_signature.  sZ    
'(@(@(BC--/	 	, *-Z (?";*
%I -7LL!##D$8$8$=$=$?@ " %1$=$=>P$Q! '11<<.78id!!i8K  "-!2!2[5G5G!G!GA)!W5 !G  " %/ /1/ %!
 5?L !$++DOO< " 2@!@D' @  !
 "(():; 21D' )4((1   2"1D' 32221  " 2"1D' ,0,L 1  " "(();<$. /1/ %! 21D,T2 #"1   "7!6!''BSBS:S!6   !BB;M #:"# 12!6!<!<":"K*
R $B$y 9*!
""s`   K
K
,K
	K$K$9	K)	K)	K."	K.1	K3>K3K37K8K8!K=?K=c                   UR                   R                  5        VVs0 s H'  u  p#U[        R                  R                  ;  d  M%  X#_M)     nnnUR
                  R                  5        VVs0 s H'  u  p%U[        R                  R                  ;  d  M%  X%_M)     nnnUR                   Vs/ s H3  nUR                  5       [        R                  R                  ;  d  M1  UPM5     nnUR                   Vs/ s H%  nU[        R                  R                  ;  d  M#  UPM'     n	n[        UR                  UUUUR                  U	5      $ s  snnf s  snnf s  snf s  snf )z
Updates the partition signature by removing buffers specified in
V.graph.removed_buffers. See [Note: Removed Graph Partition Arguments]
)rN	  r/  rW   r   r  r	  rO	  maybe_get_namerP	  r:   r	  rM	  )
ri   rU	  r   r  rN	  r  r	  r   rO	  rP	  s
             rj   .clean_removed_buffer_from_partition_signatures8Scheduler.clean_removed_buffer_from_partition_signatures  sR    !* 5 5 ; ; =
 =177222 DL = 	 
 '99??A
A	177222 DIA 	 
 "..
.""$AGG,C,CC . 	 
 "00
0177222 0 	 

 '##$$
 	
)






s/   $EE,$EE+0EE5"EEc                  ^ ^^	^
^^^ SSK m	[        5       m/ m/ m[        U5       VVs0 s H  u  p#X2_M	     snnmSUU	UUU 4S jjm
SU
U4S jjnU H8  n[        UR                  R
                  5      TU'   TU   S:X  d  M0  T
" U5        M:     / nSnU[        U5      :  a  T(       d  T(       a  T(       a5  T	R                  T5      u  psUR                  U5        U" U5        T(       a  M5  T(       a5  T	R                  T5      u  psUR                  U5        U" U5        T(       a  M5  US-  nU[        U5      :  a  T(       a  M  T(       a  M  U[        U5      :  a  [        S5      eU$ s  snnf )ad  
Reorder nodes to minimize the number of partitions via a bfs
topological sort. This is the optimal reordering such that the
number of partitions cannot be reduced further. This may be
sub-optimal for other metrics such as peak memory. This does not
change relative orders of two cudagraphable nodes, nor the
relative order of two non_cudagraphable nodes.
r   Nc                   > TU    U 4nTR                  U 5      (       a  TR                  TU5        g TR                  TU5        g ry   )r2	  heappush)r   node_with_indexcudagraphable_nodesheapqnode_to_indexnon_cudagraphable_nodesri   s     rj   insert_pending_nodesHScheduler.reorder_for_minimizing_partition.<locals>.insert_pending_nodes  sA    ,T2D9O$$T**6H2ODrm   c                   > U R                   R                   H.  nTU   S:  d   eTU==   S-  ss'   TU   S:X  d  M&  T" U5        M0     g )Nr   r   )r  
succ_nodes)r   	succ_noder	  node_to_indegrees     rj   update_indegreeCScheduler.reorder_for_minimizing_partition.<locals>.update_indegree  sO    !]]55	'	2Q666 +q0+#I.!3(3	 6rm   r   z
                Failed to schedule, while loop ran too long when
                reordering for minimizing the num of partitions
                r   r\   r   rs  )	r	  r(  r4  r   r  
pred_nodesheappopr   r  )ri   r5  r  r   r	  r$  	num_itersr  r	  r	  r	  r	  r	  r	  s   `       @@@@@@rj    reorder_for_minimizing_partition*Scheduler.reorder_for_minimizing_partition  s_    	9=CEGI4=e4DE4Dys4DE	E 	E	4 	4 D%()A)A%BT"%*$T* 
 -/	#e*$#':)--(?@%% *)
 &--(;<%% &%
 NI #e*$##':': s5z!  ] Fs   E(c           	     R   SSK JnJn  [        [        R
                  R                  5       5      nU" UU R                  U R                  [        [        R
                  R                  R                  5       5      U5      u  pVU R                  U5      nU" XvU5      u  pXS-  :  a  U$ U$ )z`
Reorder nodes to minimize the number of partitions if this only slightly
increase peak memory.
r   )estimate_peak_memoryprepare_planning_infor  )r.  r	  r	  r   rW   r   r/  rX  r)  r  r  r	  )
ri   r5  r	  r	  r  default_peak_memoryr  reordered_nodesreorder_peak_memoryr  s
             rj   r9  0Scheduler.maybe_reorder_for_minimizing_partition   s     	H"177#;#;#=>:O##qww++0023;
7 ??F!5"

 s!::""rm   c                4   / n/ n/ nSS jnU H  nU R                  U5      SLnU(       a,  [        UR                  5      S:X  a  UR                  U5        MI  U(       a   U" U5      (       a  UR                  U5        Mp  UR                  U5        M     X#-   U-   $ )z
Reorder a node if it should be partitioned and has simple dependency:
1. move a partitioned node to the front if it has no dependency
2. move a partitioned node to the back if it is only used by OutputNode
3. otherwise do not reorder
c                    U R                  5        H8  nUR                   H%  n[        UR                  [        5      (       a  M$      g   M:     gr_  )r  r-  r   r   r`  )r   r   ra  s      rj   only_output_userPScheduler.reorder_for_partition_with_simple_dependency.<locals>.only_output_userM  s<    '')99C%chh
;;$ % * rm   Nr   r  )r2	  r   r  r   )ri   r5  frontmiddlebackr	  r   r2	  s           rj   r:  6Scheduler.reorder_for_partition_with_simple_dependency?  s     *,*,(*	 D#44T:$FC(?(?$@A$ET"!&6t&<&<D!d#  ~$$rm   c                   / nSn/ n/ nU R                    HY  nU R                  U5      SLnU(       a)  X&:w  a$  UR                  U5        UR                  U5        / nUnUR                  U5        M[     U(       a"  UR                  U5        UR                  U5        U R                  XS9nU R	                  U5        U R                  X5        X4$ )zz
Given a list of BaseSchedulerNodes, split into a list of
graph partitions and compute partition input/output signatures.
TN)r|	  r}	  )r5  r2	  r   r	  rX	  _log_graph_partitions)ri   r|	  rM	  cur_partitionr}	  r   node_should_partitionrQ	  s           rj   r7  Scheduler.graph_partition_  s     +-
')JJD$($9$9$$?t$K!!H!!-0&&~6 "2N  &  m,"">277! 8 

 	))*5""::%%rm   c                   [         R                  [        R                  5      (       d  g [	        S [
        R                  R                   5       5      nU(       d  g [        S U 5       5      n[        U5      U-
  n[         R                  S[        U5      UU5        [        [        X5      5       H  u  nu  px[         R                  SU[        U5      UR                  (       a  SOS[        UR                  5      [        UR                  5      5        UR                  (       d  Mw  U H  n	U R!                  U	5        M     M     g )Nc              3  8   #    U  H  n[        U5      v   M     g 7fry   )rR   )r   r[  s     rj   r   2Scheduler._log_graph_partitions.<locals>.<genexpr>  s     O:NVF^^:Nr  c              3  J   #    U  H  oR                   (       a  M  S v   M     g7f)r   N)rM	  rb	  s     rj   r   r	    s     !PZ?O?O!!Zs   #	#zCCreated %d graph partitions: %d cudagraphable, %d non-cudagraphablez3  Partition %d: %d nodes, %s, inputs=%d, outputs=%dznon-cudagraphablecudagraphable)cudagraphs_logrZ  r[  rh  r   rW   r   device_typesr   r   r  r4  r)  rM	  rN	  rO	  _log_non_cudagraphable_node)
ri   r|	  rQ	  has_gpu_devicecudagraphable_countnon_cudagraphable_countr1  rl	  rU	  r   s
             rj   r	  Scheduler._log_graph_partitions  s   
 **7==99 O!'':N:NOO!!PZ!PP"%j/4G"GQ
O#		
 *33z3N)O%A%	  EI'0'?'?#_I))*I**+ '''%D44T: & *Prm   c                   U R                  U5      nU(       d  gUR                  5       nUR                  b  UR                  R                  5       OSnSU 3/n[	        UR                  5      R
                  nUR                  SU 35        UbF  UR                   SSR                  S UR                   5       5       S3nUR                  SU 35        [        R                  S	USR                  U5      5        Uba  UR                  R                  S
S5      nU(       a=  UR                  5       R                  S5       H  n	[        R                  SU	5        M     ggg)z)Log details for a non-cudagraphable node.Nzreason=zir=r  r  c              3  8   #    U  H  n[        U5      v   M     g 7fry   )r  )r   rU  s     rj   r   8Scheduler._log_non_cudagraphable_node.<locals>.<genexpr>  s     2P<a3q66<r  r  zfx=z
    %s: %sr  r  z         %s)r2	  r0  r   r  r   r{   r   r  r  rL  r	  r  r  r  stripsplit)
ri   r   r~  r  r  partsir_typefx_strr  lines
             rj   r	  %Scheduler._log_non_cudagraphable_node  s,   &&t,MMO	151F$))++-D6(#$tyy/**s7)_%'q2P7<<2P)P(QQRSFLL3vh(\9dii6FG !,,**=$?K'--/55d;D"((= <  rm   c                    [        S5         [        R                  R                  R                  (       a  U R                  5       OU R                  U R                  5       sS S S 5        $ ! , (       d  f       g = f)NzScheduler.codegen)r   r  r  r&   r7  _codegen_partitions_codegenr5  rh   s    rj   r  Scheduler.codegen  sO    -. ??))99 ((*]]4::. /..s   AA++
A9c                l   SSK Jn  [        R                  R                  n[        U R                  5      n[        R                  R                  5          [        R                  R                  SSU 3UUS9  U R                  U5        [        [        R                  R                  U5      (       d   eU R                  U5      nU[        R                  R                  l        [        R                  R                  R                  5         [        R                  R                  n[        R                  R                  R                  [        R                  R                   5      u  pxSSS5        [        R                  R                  R#                  WW5        [        R                  R                  R%                  XR5        [        R                  R                  R&                  R)                  UR*                   V	s/ s H  oR-                  5       PM     sn	5        g! , (       d  f       N= fs  sn	f )z,Codegen a partition given its inputs/outputsr   )SubgraphPythonWrapperCodegenT
partition_)is_subgraphsubgraph_nameparent_wrapper_codepartition_signaturesN)r  r	  rW   r   rR  r3  r  set_current_wrapper_codeinit_wrapper_coder	  r   r	  r	  write_prefixr   generateis_inferencedefine_subgraph_launcher_fncodegen_partition_call	allocatedr  rO	  r0  )
ri   rl	  rU	  r	  r	  graph_partition_id
graph_namepartition_coder  r   s
             rj   _codegen_partition_wrapper$Scheduler._codegen_partition_wrapper  s    	Bgg22!$"?"?@WW--/GG%%  *+=*>?$7%.	 &  MM)$ agg224PQQQQKKIVI8AAGG  5GG  --/J ! 4 4 = =agg>R>R SN/ 02 	
88^T	334FR	&&--)2)?)?@)?]]_)?@	
9 0/: As   DH ?H1 
H.c                P   ^ ^^ [         R                  SUU U4S jj5       nU" 5       $ )Nc               3    >#    TR                  T T5        TR                  (       a  [        TR                  R                  5      (       a[  TR                  R                  c   S5       e[
        R                  R                  R                  TR                  R                  5         S v   TR                  (       aL  [        TR                  R                  5      (       a(  [
        R                  R                  R                  5         S Tl        g ! TR                  (       aL  [        TR                  R                  5      (       a(  [
        R                  R                  R                  5         S Tl        f = f7f)Ndevice should have an index)
%update_graph_partition_default_devicer  rH   r   r   rW   r   rR  codegen_device_guard_entercodegen_device_guard_exit)r|	  ri   rQ	  s   rj   ctx1Scheduler.use_default_device_context.<locals>.ctx  s    66z:N**/@++000 0 2288D 1D $$??//553..3D//444 4 GG((BBD.2+	 ..3D//444 4 GG((BBD.2+s    B#E9'D +A%E9A&E66E9)r   zIterator[None])
contextlibcontextmanager)ri   r|	  rQ	  r	  s   ``` rj   use_default_device_context$Scheduler.use_default_device_context  s+     
	"	"	3 	3 
#	3* urm   c                N   [        U5      S:X  a  US   R                  (       d  g SS jn      SS jnS n[        X5       H   u  pgUR                  (       a  M  U" U5      n  O   Uc  g [        X5       H'  u  pgUR                  (       d  M  U" Xe5      (       a  M'    g    XPl        g )Nr   r   c                6    U S   R                  5       nUc   eU$ r   r   )rl	  partition_devices     rj   get_cudagraph_partition_deviceWScheduler.update_graph_partition_default_device.<locals>.get_cudagraph_partition_device  s'    (|668#///##rm   c                D    U  H  nUR                  5       nX1:w  d  M    g   gr_  r	  )rl	  target_devicer   r[  s       rj   all_on_target_deviceMScheduler.update_graph_partition_default_device.<locals>.all_on_target_device  s(     "**  " rm   )rl	  r]   r   r  )rl	  r]   r	  r  r   rx   )r   rM	  r)  r  )ri   r|	  rQ	  r	  r	  cudagraph_partition_devicerl	  rU	  s           rj   r	  /Scheduler.update_graph_partition_default_device  s     z?a
1(D(D 	$
	$	5A		 &*"$'
$? I+++-KI-V* %@ &-$'
$? I'''0D1 1 	 %@ 'A#rm   c                   U R                  5       u  p[        U5      S:  a  [        S   S==   [        U5      -  ss'   U R                  X5         [	        X5       H\  u  p4[        U5      S:  d   S[        U5       35       eUR
                  (       a  U R                  U5        MK  U R                  X45        M^     SSS5        [        U R                  5      n[        R                  R                  R                  U5        US:  as  [        R                  R                  c   eU[        [        R                  R                  5      :X  d.   SU S[        [        R                  R                  5       35       egg! , (       d  f       N= f)	z
Split nodes into partitions and codegen each partition into separate functions.
This allows further applying different optimizations (e.g., cudagraph) to
each function.
r   r  cudagraph_partitionsz5Each partition must have at least one node but found Nr   zExpect z partition maps but got )r7  r   r   r	  r)  rM	  r	  r	  r3  r  rW   r   rR  set_all_partition_namesrL	  )ri   r|	  rQ	  rl	  rU	  num_partitionss         rj   r	  Scheduler._codegen_partitions:  sO    "&!5!5!7
z?QZ !78C
OK8,,ZD(+J(C$	9~* KCPYNK[\* ++MM),33II )D E d;;<	44^D A77))555!S)?)?%@@ .))A#aggF\F\B]A^_@  EDs   A,E<<
F
c                   [         R                  (       a  SS Kn[        R                  " 5       n[        5       n[        U5       H  nUR                  S:X  a0  UR                  UR                  R                  R                  :X  a    OTUR                  UR                  4nXd;  d"   SUR                   SUR                   S35       eUR                  U5        M     U R                  U l        U R                   b   eU R                  (       aG  [         R"                  R$                  (       a(  [&        R(                  R*                  R-                  5         U GH  n[.        R1                  [2        R4                  5      (       a4   [.        R7                  SUR9                  5       UR;                  5       5        U R?                  U5        URA                  5       =n(       Ga  XR                  :w  d*  URC                  5       (       d  URE                  5       (       a  U RG                  5         XR                  :w  a  U R                  (       aL  [I        U R                  RJ                  5      (       a(  [&        R(                  R*                  RM                  5         Xl        [I        URJ                  5      (       aG  URN                  c   S5       e[&        R(                  R*                  RQ                  URN                  5        Xpl)        U RT                  RW                  URX                  5        URE                  5       (       aN  UR[                  []        UR_                  5       5      5      u  pnU Ra                  U5      Rc                  XU	5        GOhURC                  5       (       a.  [d        Rf                  " [h        U5      nU Rk                  U5        GO%URm                  5       (       aw  [d        Rf                  " [n        U5      nU Ra                  U5      nS	S
K8J9n  S	SK:J;n  [y        XU45      (       a  UnO[{        S[K        U 5      < 35      eUR}                  U5        O[y        U[~        5      (       a!  U Ra                  U5      R                  U5        Oc[y        U[        [        45      (       a!  U Ra                  U5      R                  U5        O'[y        U[        5      (       d   eUR                  5         [         R"                  R                  (       a  U Ra                  U5      R                  5         U R                  RW                  UR                  5       5        U R                  RW                  UR                  5       5        [y        U[        5      (       dW  URA                  5       nUbD  URJ                  S:w  a4  U Ra                  U5      R                  5       (       a  U RG                  5         [        S UR_                  5        5       5      (       a	  Xpl        GM  S U l        GM     U R                  U R                  :w  a[  U R                  c   e[I        U R                  RJ                  5      (       a(  [&        R(                  R*                  RM                  5         S U l        U RG                  5         g ! [<         a(    [.        R7                  SUR9                  5       5         GN/f = f)Nr   _compile_innerzDuplicate stack frame :zs; did you add a decorator to one of the functions in this stack trace?  If so, try using a context manager instead.z5Generating code for node %s with estimated runtime %fz6Generating code for node %s with estimated runtime 0.0r	  r   )CUDACombinedSchedulingr  ztype(self)=r  c              3  B   #    U  H  n[        U[        5      v   M     g 7fry   )r   r   rC  s     rj   r   %Scheduler._codegen.<locals>.<genexpr>  s     J9IA:a//9Irt  )Nr&   "check_stack_no_cycles_TESTING_ONLYtorch._dynamo.convert_frame	tracebackextract_stackr   r   r   filename_dynamoconvert_frame__file__linenor  r  rN  r  r   autotune_at_compile_timerW   r   rR  write_get_raw_stream_headerr  rZ  r[  rh  r  r0  r  r  r$	  r   rm  rj  r
	  rH   r   r	  r   r	  r  rA  r  r~  r:  r  r   r  codegen_templater  r  r  r	  rp  r	   codegen.cuda_combined_schedulingr
  r  r  r   r  codegen_combo_kernelr  codegen_mix_order_reductionr   r   codegen_noder  r  debug_sync_kernelcodegen_syncr  r=  r  r   ready_to_flushr   )ri   r5  r  stackr  framer  r   r[  r7  r8  r9  backend_r
  r  r  s                   rj   r	  Scheduler._codegenZ  s   44.++-E7A|D!% JJ"22%--*E*E*N*NN~~u||4 ,U^^,<Aell^ LJ J
  ) #99!!))) &&6==+Q+QGG  <<>D..
IIO224 t$**v*111~~''''))JJL000**/@++000 0 ,,FFH*0'(55%||7V9VV7,,GGU $%%,,T__=!!484W4W)*51   (99!X !!{{#<dC((.""{{#=tD++F3T8h9O(PQQ&G(KDJ=)9::,,T2D"9::  (DDTJD#5}"EFF  (55d;!$(>???? }}..  (557''..t/D/D/FG%%,,T-E-E-GHd$:;;*&v-((0??AAJJLJ9IJJJ%)"%)"s v $"="== &&222 !4!4!9!9:: $$>>@!

} ! IIPs   3Z33.[%$[%c                    US   R                  5       nU [        R                  l        X0l        Uc   eU R                  U5      nUR                  X5      $ )r  r   )r   rW   r   r(  rN  r  benchmark_combo_kernel)ri   rP  node_benchmark_resultsr[  r  s        rj   r 
   Scheduler.benchmark_combo_kernel  sU     1((* $!!!""6*--iPPrm   c                \  ^ UnUS   R                  5       m[        U4S jU 5       5      (       d   S5       e[        R                  (       d  gSSKJn  S/ pT0 n[        U5       H  u  pxUR                  5       n	U R                  U	5      (       a  [        R                  S5         U R                  U	5      u  pX4Xh'   [        R                  " U
5      (       a  [        R                  SU5          g	 XJ-  nUR                  U5        M      U R                  X&5      u  pnX-
  S:  =(       d    US:  n[        R!                  ["        R$                  5      (       aS  XM:  d  U(       a$  [        R                  S['        XM-  S 5      5        O#[        R                  S[)        XM-  S 5      5        X-
  U:  =(       d    U$ ! U a0  nS
[        U5      ;   a  [        R                  S5         SnA  ge SnAff = f! U a/  nS
[        U5      ;   a  [        R                  S5         SnAge SnAff = f)ra  r   c              3  H   >#    U  H  oR                  5       T:H  v   M     g 7fry   r	  )r   r   r[  s     rj   r   4Scheduler.speedup_by_combo_kernel.<locals>.<genexpr>  s     K?4??$.?r  z<All nodes in a combo kernel group must be on the same deviceTrd  g        z<ComboKernel: benchmarking may not accurate due to atomic_addz;ComboKernel benchmark: register spilling of %d-th subkernelFr  zCComboKernel benchmark: return True because of loop-carried variableNg333333?z/can fuse (benchmark): fusing causes %sx speeduprg  z3cannot fuse (benchmark): fusing causes %sx slowdown)r   r   r&   r 
  r  re  r4  r   rQ  r  r  r	  r  r  r  r   rZ  r[  rh  rB   rC   )ri   r5  subkernel_nodesre  rj  
path1_listr!
  r1  r\  rP  r  r  r  rk  	ms2_clone_path2_listsmall_kernelr[  s                    @rj   r  !Scheduler.speedup_by_combo_kernel  s      #..0K?KKK 	
J	
K ,,;rZ!#!/2HA)I ##I..  R55i@13
&-::b>>$$U ! " ICd#9 3<	*.*E*E+'CK ,9c	""7==11yL  E#)C2
   I	#0
 $44Q $ *c!f4$$]      	&#a&0  Y 	s=   %A	F=G6 =G3$G.-G..G36H+<$H&%H&&H+c                r    U R                   U   nUR                  c   eUR                  R                  5       $ ry   )rX  r   
get_layout)ri   rR  r   s      rj   get_buffer_layoutScheduler.get_buffer_layout<  s5    x(xx###xx""$$rm   c                   U R                    H  nUR                  5       (       d  M  UR                  R                   H  n[        R
                  R                  R                  UR                  5      nU(       d  M?  [        U5      S:X  d  MP  [        UR                  [        [        45      (       a  Mw  UR                  5       / :X  d  M  [        R
                  R                  R!                  UR                  5        M     M     g rX  )r5  rR   r   r   rW   r   r  r  r   r9   r   r=  r=   r<   r  zero_dim_cpu_tensor_listr  )ri   r   r  r  s       rj   r  $Scheduler.update_zero_dim_cpu_tensorA  s    JJD{{}} ,,22DWW3377		BF+F3u< *"MMJ8I+J! ! #OO-388<<TYYG 3 rm   )r  r  r  rA  r  rN  r  r  r!  r  r  rX  rW  r)  r  r5  r  rB  r
  r  rD  r  )r5  zlist[ir.Operation]r   rs  )r   z!dict[str, SchedulerDonatedBuffer]rv  )r[  rw  r   rs  rr  )r]  r  r   rs  )r   r<  r   r\   )rL  Optional[str]r   rx   ri  )r\  r\   r   rI  )r   rj  r5  r  r   tuple[float, str]ry   r5  r  r  rx   r  Optional[int]r   r  )r  r   r[  r  r   r5
  )r  ir.MultiTemplateBufferr   rx   )
r?  ir.OperationBufferr  r8
  r1  r!  r   r   r   rs  )rP  r  r   rx   )r5  r  r  r7
  r   z)tuple[Optional[LambdaFuture], ModuleType])r   r\   r   r\   r   ra   )r   r\   r   r\   )r   r\   r   r\   r(  OrderedSet[BaseSchedulerNode]r   r\   )r   r\   r   r\   r  rz   r(  r:
  )r  ,dict[BaseSchedulerNode, list[PendingFusion]]r(  r:
  r   rs  )
r  1list[tuple[BaseSchedulerNode, BaseSchedulerNode]]r  &dict[BaseSchedulerNode, PendingFusion]r  r;
  r(  r:
  r  rx   )r(  r:
  r  r=
  )r  r<
  r  r<
  )r5  rI  r  rx   r   rI  )r  r7
  r   rs  r  )r5  rI  r  rx   r   r<
  r  )r   r\   r   r\   r8  r!  r   rx   )r   r\   r   r\   rN  z'Union[tuple[str, ...], OrderedSet[str]]r   r  r"  r  )r  r\   r8  r\   r  r{  r   rx   )r   r\   r   r\   r   z/Optional[tuple[int, SchedulerNode, sympy.Expr]])FT)
r   r\   r   r\   r  rx   r  rx   r   rx   )r  r4   r   r\   r   r\   r   rx   )r  r1   rs  r2   r   rx   rE  )r   r1   r  rx   r   r!  )TFT)r   r\   r   r\   r  rx   r  rx   r  rx   r   zint | tuple[int, bool])r  r<
  r   r<
  )r5  r   r   r   )r	  r  r   rs  )r[  r  r   BaseScheduling)r[  rw  r   r>
  r	  )r   r  r9  r}  r   rx   )r   r\   r   r3
  )r   rj	  )r   ;dict[str, Union[ir.IRNode, ir.TorchBindObject, sympy.Expr]])rQ	  list[GraphPartitionSignature]r   rs  )rl	  r]   rN	  r?
  r   rj	  )r|	  list[PartitionType]r}	  z
list[bool]r   r@
  )rU	  r:   r   r:   )r   z9tuple[list[PartitionType], list[GraphPartitionSignature]])r|	  rA
  rQ	  r@
  r   rs  )rl	  r]   rU	  r:   r   rs  )r|	  rA
  rQ	  r@
  r   z'contextlib.AbstractContextManager[None]rP  r  r   z(tuple[float, float, list[Optional[str]]])r5  rI  r   rx   )rR  r  r   z	ir.Layout)cr{   r|   r}   r~   r#  r  r  r  propertyrN  setterr@  r^  r  r   rZ   r>  r  r  r  r  rC  r  r  rw  r6  r	  r  r  r!  r&  r6  rQ  r^  r  r  r  r  r  r  r  r  r   r-  r*  r  r  r-  rC  rH  rW  r{  r  r  r  r  r   r  r  r  r  r  r  r  r;  r	  r
	  r	  r	  r  r$	  r7  r2	  rF   r@	  r5	  rI	  rX	  ro	  r	  r	  r	  r9  r:  r7  r	  r	  r  r	  r	  r	  r	  r	  r 
  r  r.
  r  r   r  r  s   @rj   r'  r'  :  sX   
U9n	# & & ( (7#,"HMP^KZ+#Z ,	 6S*4#&$6!F	808	8, (,	*  %	
 
&
> 
>*6
>	
>$0$	$LVDp'8&'8 +'8 	'8
 '8 
'8R
 RV0AN	2 u&u/@u	un>  ! 3	
 
"  ! '	
 3"D2$PD2 3D2 
	D2LO?PO? @O?  L	O?
 3O? O?bS2S @S00K0 $U0(E&E E 
!	EN..`?6 &6  6  
;	6 p,&,/@,	,\7&7/@7	7r.2&.2/@.2MP.2	.2`$&$/@$	$6< < !< B	<
 
<|M&M/@M	M^Y
&Y
/@Y
	Y
v
9(9 )9 	9
 
9v`&`/@`	8`L "*.uM uM !uM 	uM
 $(uM 
uMn3&3/@3	3j-)-)(9-)BS-)	-)f%N; !.3*.3
 3
 !3
 	3

 (,3
 $(3
 
 3
j6 Q6	:6@4@4	4	8*4
'*%5$

+:
	
=~ ! !F%	"	D '1' 
'RBH BH QBH 
"	BHHK -K @JK 	&K Z"
0"
	 "
H?&? 
!?B& 
!>%,%	 %@ &	B &D";'"; 2"; 
	";H>0)
 )
 +)
 
	)
V-;X	06-A--A;X-A	-A^@BHQ4Q	1QN5`%
H Hrm   c                  r  ^  \ rS rSrSU 4S jjrSS jrSS jr      SS jr      SS jr      SS jr	      SS jr
    SS	 jr        SS
 jr S       SS jjrS S jrS!S jrSS jrS"S jrSS jr    S#S jrS$S jr      S%S jr    S&S jr S     S'S jjrSrU =r$ )(r>
  iQ  c                .   > [         TU ]  5         Xl        g ry   )r  r  r(  )ri   r(  r  s     rj   r  BaseScheduling.__init__R  s    "rm   c                \    U R                   (       a  U R                   R                  5         g g ry   )r(  r	  rh   s    rj   free_buffers_in_scheduler(BaseScheduling.free_buffers_in_schedulerV  s    >>NN'') rm   c                    [        5       $ )z0Return a set of .codegen.common.BackendFeature()r   rZ  s     rj   get_backend_features#BaseScheduling.get_backend_featuresZ  s
    |rm   c                    [         e)z?
Check whether node1 and node2 can be vertically fused or not.
r  r  s      rj   r   BaseScheduling.can_fuse_vertical^  
     "!rm   c                    [         e)zA
Check whether node1 and node2 can be horizontally fused or not.
r  r  s      rj   r  "BaseScheduling.can_fuse_horizontalf  rP
  rm   c                    g)aE  
A Multi-Output Template (referenced in #144012) is a template node
with MultiOutputLayout, and its output buffers are instances of MultiOutput.
In this context, we verify whether node1 represents the Multi-Output Template
and node2 corresponds to one of its outputs. If so, we further check if
backend supports this fusion.
Frw   r  s      rj   r  .BaseScheduling.can_fuse_multi_outputs_templaten  s     rm   c                @   UR                  5       (       d  UR                  5       (       a  [        R                  X5      $ [        R	                  X5      (       a  [        X5      $ [        U[
        5      (       a  UR                  U5      $ [        R                  X5      $ )z
Fuse two nodes
)	rp  r	  rq   r   r  r  r   r  r   r  s      rj   rq   BaseScheduling.fusez  s|     !1!1!3!3-225@@77EE*588677??5))%**588rm   c                    [         e)zK
Process the iteration sizes in case a transformation needs to be applied.
r  )ri   r  s     rj   r  BaseScheduling.group_fn  rP
  rm   c                    [         e)z
Given a template node, generate a kernel.

This function is only available for triton now. If the third-party backend behaves as a sub-class
of TritonScheduling, it can override it or reuse it.
r  )ri   r8  epilogue_nodesr  s       rj   r
  BaseScheduling.codegen_template  s
     "!rm   c                    [         ez4
Generate a kernel given a list of pre-fused nodes.
r  )ri   r5  r  r  s       rj   r  .BaseScheduling.generate_kernel_code_from_nodes  s
     "!rm   c                    [         er]
  r  r  s     rj   r
  BaseScheduling.codegen_node  
     "!rm   c                    [         ery   r  r  s     rj   r
  *BaseScheduling.codegen_mix_order_reduction  r  rm   c                    [         e)zd
Generate synchronization code for the kernel. This method depends on the hardware characteristics.
r  rh   s    rj   r
  BaseScheduling.codegen_sync  ra
  rm   c                    g)z}
Check whether the backend is requesting the scheduler to flush the generated kernel.
If not supported, please return False.
Frw   rh   s    rj   r
  BaseScheduling.ready_to_flush  s    
 rm   c                    [         e)zM
Flush the generated kernel and python wrapper code to the source code file.
r  rh   s    rj   r
	  BaseScheduling.flush  ra
  rm   c                    [         e)r  r  r  s     rj   r	  $BaseScheduling.benchmark_fused_nodes  
     "!rm   c                    [         e)zi
Benchmark a compiled module and return the execution time
in milliseconds on randomly generated inputs.
r  )ri   r  s     rj   r  )BaseScheduling.benchmark_codegened_module  s
    
 "!rm   c                    g)zt
Return an unsigned integer which represents the priority of this fusion pair.
The smaller is with higher priority.
r   rw   r  s      rj   r  'BaseScheduling.get_fusion_pair_priority  s     rm   c                    [         e)z
Benchmark the list of nodes to combine and return the execution time
and memory copy time in milliseconds on randomly generated inputs.
r  )ri   rP  r!
  s      rj   r 
  %BaseScheduling.benchmark_combo_kernel  rl
  rm   c                    U(       a9  SSK Jn  U" UU5      n[        R                  R                  R                  X$5        g g )Nr   )'set_kernel_post_grad_provenance_tracing)r  rt
  rW   r   rR  write_provenance_debug_handle)ri   node_schedulerU  rt
  debug_handles        rj   codegen_commentBaseScheduling.codegen_comment  s<    
 UBL GG  >> rm   rT  )r(  zOptional[Scheduler]rr  )r[  r  r   zOrderedSet[BackendFeature]r  r  )r  rZ  r   z"tuple[tuple[sympy.Expr, ...], ...])r8  r\   rZ
  r  r  r  r   r3
  ry   r6
  )r   z(Union[FusedSchedulerNode, SchedulerNode]r   rs  )r   r  r   rs  rt  r4
  )r  r   r   r5
  r"  rB
  )rv
  r  rU  r3
  r   rs  )r{   r|   r}   r~   r  rI
  rL
  r  r  r  rq   r  r
  r  r
  r
  r
  r
  r
	  r	  r  r  r 
  rx
  r   r  r  s   @rj   r>
  r>
  Q  s   #*"&"/@"	""&"/@"	"
&
/@
	
9&9/@9	9"3"	+""(" 4" 4	"
 
"$ (,		"*	" 	" %		"
 
	""""""0"	""&/@	"4"	1" &*2 # 
	 rm   r>
  )r   z$torch._inductor.codecache.LocalCache)r\  r\   r   r  )r\  r\   r   zOptional[Callable[[Any], Any]])r\  r\   r   r  )r  r   r   r  )r   r\   r)  rB  rX  r  r   rs  )r_  /Union[FusedSchedulerNode, GroupedSchedulerNode]r   rs  )r_  rz
  r(  r'  r   rI  r   rs  )rw   )r  zlist[list[int]]r  r[  r  rX  r   z	list[int])r  r8
  r)  r9
  r   rs  rG  rt  )r   z	ir.IRNoder   rj	  )r   r\   r   rj	  )r   r\   r   r\   )
__future__r   r  r	  rx  r  r	  r  r[  r  r  rV  r  r  r

  r  r   r   concurrent.futuresr   r   r   r	   r
   r   r   r   r   typing_extensionsr   torch.utils._ordered_setr   r)   r   collections.abcr   r   r   typesr   r   r  torch._inductor.async_compiletorch.utils._pytreers  _pytreerZ  torch._dynamo.utilsr   r    torch._inductor.autotune_processr   torch._inductor.codecacher   r   torch._inductor.irr   torch._inductor.metricsr   r   %torch.fx.experimental.symbolic_shapesr    torch.utils._sympy.symbolr!   r"   r#   torch.utils._tritonr$   r  r%   r&   r'   r(   r*   analyze_preserves_zero_maskr+   codegen.commonr,   r-   r.   comm_analysisr/   r0   r1   r2   r3   r4   excr5   r6   fx_utilsr7   r8   r9   r:   r;   r<   r=   r  r>   r.  r?   r@   runtime.hintsrA   runtime.runtime_utilsrB   rC   r   rD   rE   rF   rG   rH   rI   rJ   rK   rL   rM   rN   rO   rP   rQ   rR   rS   rT   rU   rV   virtualizedrW   	getLoggerr{   r  _logginggetArtifactLoggerr  r  r  r	  r  r]   r   r^   r_   	dataclassra   r   r   r&  r{  r\   r  r  r  rg  r  r{  r?  r`  r(  r  r  r   re  rn  r   r  r	  rm  r  r  r  r  r  r  r  r  r  r  r  r  r  r'  r>
  rw   rm   rj   <module>r
     s   "          	     , 3 S S S ' /  <<    $ $ $ 6 E ? 7 M > O O * D D D M M ; : 2 $    J ( 7 &    (  !^^--hA
NN44XO  >>;;$  11(LI 34y 4T]t_ D D D* ( ( (V Vr h8 h8 h8V 4_ 4 4y1 y1x 2 2(' #L T"
 
 #
*  *K
*K4*K ,*K 
	*KZW 1 W"5. 5L*% L*^
@	$@ $ 
	,}** }*@[G0 [G|w:!3 w:tb, bP #%+#++  + 	+\0%01C0	08
1 
 
 
> %??, 4$
&VVP@T@H T@Hn@e erm   