
    N j                      S SK Jr  S SKrS SKrS SKrS SKrS SKrS SKrS SKrS SK	r	S SK
r
S SKJr  S SKJrJrJrJrJrJr  S SKJr  S SKrS SKrS SKrS SKJr  S SKJr  S SKJr  S S	KJr  S S
K J!r!  S SK"J#r#  S SK$J%r%J&r&J'r'  S SK(J)r)J*r*J+r+J,r,  SSK-J.r.  SSK/J0r0J1r1J2r2  SSK3J4r4  SSK5J6r6J7r7  SSK8J9r9J:r:J;r;  \(       a  S SK<J=r=  SSK1J>r>  SSK?J@r@  SSKAJBrB  SSKCJDrD  SSKEJFrFJGrGJHrH  SSK2JIrIJJrJJKrK  SSKLJMrMJNrNJOrOJPrPJQrQJRrRJSrSJTrTJUrUJVrV  SSKWJXrXJYrYJZrZ  SSK[J\r\  SS K]J^r^J_r_J`r`Jara  SS!KbJcrcJdrd  SS"KeJfrfJgrgJhrhJiriJjrj  \(       a  S S#K<JkrkJlrlJmrm  S S$KJnrn  \R                  " \p5      rq\R                  R                  \pS%5      rt\R                  R                  \pS&5      ru\R                  R                  \pS'5      rv\a" 5       R                  rx\#" / S(Q5      rySASBS) jjrz\R                   " S* S+5      5       r| " S, S-\|5      r} " S. S/\|5      r~SCS0 jr\" S1\^\^S29r\R                   " S3 S45      5       r " S5 S6\5      r " S7 S8\`\   \\   5      r " S9 S:\J5      r\R                  " S;S<9 " S= S>5      5       r " S? S@\5      rg)D    )annotationsN)Counter)AnyGeneric
NamedTupleOptionalTYPE_CHECKINGUnion)TypeVar)metrics)MultiTemplateBuffer)analyze_memory_coalescing)free_unbacked_symbols)immutable_dict)
OrderedSet)FloorDivIdentityModularIndexing)free_symbol_is_type
prefix_strsymbol_is_typeSymT   )counters   )configir	scheduler)prologue_preserves_zero_mask)	code_hashPyCodeCache)	MemoryDepStarDepWeakDep)CallableIRNode)!indexing_dtype_strength_reduction)CoordescTuner)DeviceProperties)
green_textlast_power_of_2yellow_text)BaseSchedulerNodeBaseScheduling	WhyNoFuse)
cache_property_on_selfexpr_fits_within_32bitget_dtype_sizeIndentedBufferPlaceholderprefix_is_reductionsympy_index_symbolsympy_product
sympy_subsunique)ops
OpsWrapperV   )BlockPatternMatcher)CSEVariableindex_prevent_reorderingKernelPythonPrinter)MultiKernelSizeHintMultiKernel)DisableReductionEnableReductionNodeScheduleEntryNodeScheduleMarkerSIMDKernelFeatures)IterableIteratorSequence)CoalesceVarAnalysis
perf_hintsschedulefusion)zyxr0_r1_c                l    [         R                  R                  R                  R                  nUb  U$ U $ N)torch	_inductorr   triton	max_tiles)defaultr\   s     m/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/torch/_inductor/codegen/simd.pyget_max_tilesr_   ^   s-    &&--77I!-9:7:    c                     ^  \ rS rSrSr\R                  R                  \R                  R                  S.               S	U 4S jjjr\	\
S
S j5       5       rSS jr\	\
SS j5       5       rSrU =r$ )IterationRangesc   a  
Each range tree represents multiple sets of iteration indexing
in a single tiled dimension in the output kernel.

If you have two loops ranges one (4, 3, 2) and another (4, 6),
then the range tree will be:
        4 (i0)
    3 (i1)  6 (i3)
    2 (i2)
Where i0 is shared between both loops, but then the split into
different indexing vars.  All loop ranges must iterate over
the same number of elements.
)divisorlengthc                  > [         T
U ]  5         Xl        X l        X0l        X@l        XPl        Xpl        Xl        X`l	        Xl
        g rX   )super__init__namevar_list
var_rangesnumelprefixrd   re   kernelroot)selfri   rj   rk   rl   rm   rn   rd   re   ro   	__class__s             r^   rh   IterationRanges.__init__s   s=     		 $
	r`   c                ,    [        U R                  5      $ rX   )r6   rm   rp   s    r^   is_reductionIterationRanges.is_reduction   s     #4;;//r`   c                ,    [        U R                  5      $ rX   )r7   ri   rt   s    r^   symbolIterationRanges.symbol   s    !$)),,r`   c                |    [         R                  " 5        VVs0 s H  u  pX!_M	     nnnX0R                     $ s  snnf rX   )r   itemsrm   )rp   symtrm   prefix_to_symts       r^   r|   IterationRanges.symt   s;     <F;K;K;MN;M<4&,;MNkk** Os   8)	rd   rn   re   ri   rl   rm   ro   rj   rk   )ri   strrj   list[sympy.Symbol]rk   dict[sympy.Symbol, sympy.Expr]rl   
sympy.Exprrm   r   rn   
SIMDKernelro   IterationRangesRootreturnNoner   boolr   zsympy.Symbol)r   r   )__name__
__module____qualname____firstlineno____doc__sympySOnerh   propertyr1   ru   rx   r|   __static_attributes____classcell__rq   s   @r^   rb   rb   c   s    . ww{{ % 3	
    " 
 0 0  0- +  +r`   rb   c                     ^  \ rS rSrSr S                     SU 4S jjjrSS jrSS jrSS jrSS jr	    SS jr
SS	 jr    SS
 jrSrU =r$ )r      z
Root of a iteration range tree that represents a single
tiled dimension in the output kernel. It contains multiple
sets of iteration represented with IterationRangesEntry.
c          
        > Uc  0 n[         TU ]  U/ 0 UUUU S9  X@l        0 U l        X`l        U(       a  U R
                  (       a  U	b   eXpl        Xl        Xl        Xl	        g )N)ri   rj   rk   rl   rm   rn   ro   )
rg   rh   indexnodes	pid_cacheru   is_loop
tensor_dimgrid_dimhas_zdim)rp   ri   rl   rm   r   rn   r   r   r   r   r   rq   s              r^   rh   IterationRangesRoot.__init__   sx     I 	 	
 
=?
 *3
 t00X5EFF$  r`   c                >    SU R                   < SU R                   S3$ )NzIterationRangesRoot(, z, ...))ri   rl   rt   s    r^   __repr__IterationRangesRoot.__repr__   s    %dii]"TZZLGGr`   c                f    U R                   R                  5        H  nUR                  5         M     g rX   )r   valuescache_clear)rp   nodes     r^   r   IterationRangesRoot.cache_clear   s%    JJ%%'D (r`   c                2    [        U R                   S35      $ )Nr   )r7   rm   rt   s    r^   	index_symIterationRangesRoot.index_sym   s    !T[[M"788r`   c                   [         R                  R                  R                  X-  U R                  5      (       a  [        U R                  5       U5      nO[        U R                  5       X5      nX0R                  ;  a  [        U R                   [        [         R                  R                  5       3UUUU 5      nU[         R                  R                  UR                  5       '   U R                   R#                  UR                  5       5        X R$                  UR                  5       '   X@R                  U'   U R                  U   $ )z6
Lookup a given RangeTreeEntry, creating it if needed
)r=   graphsizevarsstatically_known_equalsrl   r   r   r   r   IterationRangesEntryrm   nextrn   iter_vars_countrange_tree_nodesrx   rj   appendrk   )rp   rd   re   exprr   s        r^   lookupIterationRangesRoot.lookup   s     7733G4DdjjQQDNN,g6D"4>>#3WEDzz!';;-QXX%=%= >?@D 8<AHH%%dkkm4MM  /-3OODKKM*#JJtzz$r`   c                    [         R                  R                  n/ n[        U5       H'  nUR	                  U R                  X$5      5        X$-  nM)     / [        U5      Q$ rX   )r   r   r   reversedr   r   )rp   lengthsrd   itervarsre   s        r^   construct_entries%IterationRangesRoot.construct_entries   sT     ''++w'FOODKK89&G ( %(#$$r`   c                j    U R                  U5       Vs/ s H  o"R                  5       PM     sn$ s  snf rX   )r   rx   )rp   r   es      r^   	constructIterationRangesRoot.construct   s+    $($:$:7$CD$Cq
$CDDDs   0c           
       ^^^	^
 SS jmUR                    Vs/ s H,  n[        R                  R                  R	                  U5      PM.     nnU Vs/ s H)  oD(       d  M  UR
                  U R
                  :X  d  M'  UPM+     nnUR                  U4S jS9  [        R                  R                  m/ m	/ m
UU	U
4S jnU H|  n[        R                  R                  R                  UR                  T5      (       d8  U" U R                  T[        UR                  T5      5      5        UR                  mU" U5        M~     [        R                  R                  R                  U R                   T5      (       d,  U" U R                  T[        U R                   T5      5      5        / [#        T	5      Q/ [#        T
5      Q4$ s  snf s  snf )z,Figure out vars from this tree used in indexc                    [         R                  R                  R                  U R                  5      n[         R                  R                  R                  U R
                  5      S:H  nX(       + 4$ )z
Gets the key for sorting nodes. When two nodes have the
same divisor, the node with length as 1 should be handled
first so the current divisor is not changed after multiplied
node.length. Returns `not length_is_one_hint` for ascending
sort.
r>   )r=   r   r   optimization_hintrd   re   )rT   divisor_hintlength_is_one_hints      r^   get_sort_key8IterationRangesRoot.vars_and_sizes.<locals>.get_sort_key   sS     77++==aiiHL!"!1!1!C!CAHH!MQR!R "899r`   c                   > T" U 5      $ rX    )rT   r   s    r^   <lambda>4IterationRangesRoot.vars_and_sizes.<locals>.<lambda>  s	    ar`   keyc                   > TR                  U R                  5       5        TR                  U R                  5        TU R                  -  mg rX   )r   rx   re   )r   rd   
index_varssizess    r^   add/IterationRangesRoot.vars_and_sizes.<locals>.add  s5    dkkm,LL%+Gr`   )rT   r   r   ztuple[int, bool])free_symbolsr=   rn   r   getrm   sortr   r   r   r   r   r   rd   r   r   rl   r   )rp   r   sr   nr   r   rd   r   r   r   s          @@@@r^   vars_and_sizes"IterationRangesRoot.vars_and_sizes   sR   

	: <A;M;MN;Ma**..q1;MN!CEqQ188t{{+BEC

0
1''++
	, D77##;;DLL'RRDKK$,,)HIJ,,I  ww77

GLLGXdjj'%BCD&*%&(:(5/(:::/ OCs   3F=
GG;G)r   r   r   r   r   r   r   rX   )ri   r   rl   r   rm   r   r   intrn   r   r   Optional[dict[str, str]]r   r   r   Optional[int]r   r   r   r   r   r   r   r   r   r   r   )rd   r   re   r   r   r   )r   list[sympy.Expr]r   zlist[IterationRangesEntry])r   r   r   r   )r   r   r   z+tuple[list[sympy.Symbol], list[sympy.Expr]])r   r   r   r   r   rh   r   r   r   r   r   r   r   r   r   r   s   @r^   r   r      s     /3)!)! )! 	)!
 )! )! ,)! )! ")!  )! )! 
)! )!VH9 .%'%	#%E(;(;	4(; (;r`   r   c                     ^  \ rS rSr            SU 4S jjrSS jrSS jrSS jrSS jrSS jr	SS jr
SS	 jrS
rU =r$ )r   i%  c                  > [         TU ]  UUR                  U-  UR                  UR                  UR
                  UUUR                  UR                  S9	  XPl        [        R                  " S 5      " U R                  5      U l        X@l        g )N)	ri   rl   rj   rk   rm   rd   re   rn   ro   )rg   rh   rl   rj   rk   rm   rn   ro   parent	functools	lru_cache_codegencodegenr   )rp   ri   rd   re   r   r   rq   s         r^   rh   IterationRangesEntry.__init__&  sx     	,,'__((==== 	 
	
  **40?	r`   c                    SU R                    SU R                   SU R                   SU R                   SU R                   S3$ )NzIterationRangesEntry(r   ))ri   rd   re   r   rk   rt   s    r^   r   IterationRangesEntry.__repr__=  sH    &tyykDLL>DKK=PRSWS\S\R]]_`d`o`o_ppqrrr`   c                N   ^ U4S jU l         S U R                   l        TU l        g )Nc                    > T $ rX   r   )ri   s   r^   r   /IterationRangesEntry.set_name.<locals>.<lambda>A  s    tr`   c                     g rX   r   r   r`   r^   r   r   B  s    4r`   )r   r   ri   )rp   ri   s    `r^   set_nameIterationRangesEntry.set_name@  s    ##/ 	r`   c                8    U R                   R                  5         g rX   )r   r   rt   s    r^   r    IterationRangesEntry.cache_clearE  s      "r`   c                X    [         R                  R                  U 5        U R                  $ rX   )r=   rn   codegen_iteration_ranges_entryri   rt   s    r^   r   IterationRangesEntry._codegenH  s    	//5yyr`   c                   / n[        U R                  [        R                  5      (       a  U$ [        U R                  [        [
        45      (       d   [        U R                  5      5       eU R                  R                  SS   H{  n[        U[        R                  [        R                  45      (       a  M4  UR                  n[        U5      S:  d  MQ  [        S U 5       5      (       d  Mj  UR                  U5        M}     U$ )Nr>   r   c              3  V   #    U  H  n[        U[        R                  5      v   M!     g 7frX   )r   r   SIZE.0r   s     r^   	<genexpr>8IterationRangesEntry.precomputed_args.<locals>.<genexpr>U  s!      ,:AQN1dii00's   '))
isinstancer   r   Symbolr   r   typeargsIntegerr   lenallr   )rp   precomputed_argsargsymbolss       r^   r  %IterationRangesEntry.precomputed_argsL  s    -/dii..##$))h%@AAR4		?RA99>>!"%CcEMM5<<#@AA**w<!# ,:A, ) ) %++C0 &  r`   c                ,    [        U R                  5      $ rX   )hashri   rt   s    r^   __hash__IterationRangesEntry.__hash__[  s    DIIr`   c                b    [        U[        5      (       d   eU R                  UR                  :H  $ rX   )r   r   ri   )rp   others     r^   __eq__IterationRangesEntry.__eq__^  s)    %!56666yyEJJ&&r`   )r   r   ri   r   )ri   r   rd   r   re   r   r   r   r   rb   r   r   r   )ri   r   r   r   r   )r   r   r   r   )r  objectr   r   )r   r   r   r   rh   r   r   r   r   r  r  r  r   r   r   s   @r^   r   r   %  sk      	
    
.s
# ' 'r`   r   c                    U [        S5      :X  a  gU [        S5      :X  a  g[        R                  " U 5      (       a  g[        U 5      $ )Ninfzfloat("inf")z-infzfloat("-inf")zfloat("nan"))floatmathisnanrepr)values    r^   constant_reprr  c  s<    e	%-		E		;r`   CSEVariableType)boundr]   c                  4    \ rS rSr% S\S'   S\S'   S\S'   Srg)	PartialAccumulateip  r   buffer_namereduction_typer   r  r   N)r   r   r   r   __annotations__r   r   r`   r^   r  r  p  s    Jr`   r  c                  V    \ rS rSr% SrS\S'   S\S'   S\S'   S\S	'   S
\S'   S\S'   Srg)NodeInfoiw  z>
Pre-computed node information for combo kernel partitioning.
listnode_scheduledicttilingr   rl   rnumelrJ   featuresr   is_persistent_reductionr   N)r   r   r   r   r   r"  r   r   r`   r^   r$  r$  w  s*     LJK  !!r`   r$  c                    ^  \ rS rSr% Sr\rS\S'   S\S'   SrS\S'   S	\S
'        S?               S@U 4S jjjr	    SAS jr
SBS jrS r\\SCS j5       5       rSDS jrSES jr\SFS j5       rSGS jr            SHS jrSIS jrSJS jrSKS jrSGS jrSGS jrSLS jrSCS jrSBS jrSMS jrSFS jrSFS jrSNS  jr       SOS! jr!      SOS" jr"SPS# jr#SQS$ jr$\%      SRS% j5       r&\'\(RR                  RT                  4       SSS& jj5       r+\'\(RR                  RT                  4       STS' jj5       r,    SUS( jr-\'      SVS) j5       r.SWS* jr/SWS+ jr0SXS, jr1    SNS- jr2SYS. jr3SZS/ jr4S[S0 jr5S1 r6 S\       S]S2 jjr7\8Rr                        S^S3 j5       r:S_S4 jr;\%S5 5       r<S`S6 jr=S7 r>S8 r?S9 r@S: rAS; rBS< rCSaS= jrDS>rEU =rF$ )br   i  zg
Common base class for Triton/Halide codegen which both use flattened indexing rather than loop nests.
zCallable[[sympy.Expr], str]sexprkexprFr   allow_block_ptrr   kernel_namec                  >^  Uc  0 n[         TT ]  5         UT l        UR                  5       T l        [        5       T l        [        5       T l        UR                  5        VV	s0 s H/  u  pU[        R                  R                  R                  U	5      _M1     sn	nT l        / T l        0 T l        [         R"                  " 5       T l        UR'                  5       T l        Ub  UOT R+                  5       T l        UT l        UT l        Ub  UOT R3                  5       T l        UT l        T R9                  5       T l        S T l        [         R"                  " 5       T l        ST l         [B        RD                  RF                  (       a  T R                  RH                   Hv  n
[K        U
[L        RN                  5      (       d  M$  [K        U
RP                  [R        RT                  5      (       d  MO  U
RP                  RW                  5       S:X  d  Mo  ST l           O   [X        RZ                  SU 4S jj5       nUT l.        T R_                  U5        ST l0        / T l1        g s  sn	nf )NFdotTc                   > [         R                  R                  R                  U TR	                  5       5      n TR
                   H  nTR                  X5      n M     TR                  U 5      $ rX   )r=   r   r   simplify_with_rangesrk   range_treescombine_contiguous_dimscombine_modular_indexing_pairs)r   treerp   s     r^   simplify_indexing.SIMDKernel.__init__.<locals>.simplify_indexing  sY    GG$$99%ARSE((44UA ) 66u==r`   r   )r   r   )2rg   rh   r*  get_mutations	mutationsr4   bodyindexing_coder{   r=   r   r   simplifynumelsr5  r   	itertoolscountr   ru   inside_reduction should_use_cooperative_reductioncooperative_reductiontiling_scoresr(  should_use_persistent_reductionpersistent_reductionmix_order_reductionwant_no_x_dimno_x_dimr    store_output_ctris_native_matmulr   r[   native_matmulr&  r   r   SchedulerNoder   r   ComputedBufferget_reduction_typer   cacher9  initialize_range_treersplit_sizesaved_partial_accumulate)rp   r(  r*  r   override_persistent_reductionoverride_cooperative_reductionrF  rI  rm   valr   r9  rq   s   `           r^   rh   SIMDKernel.__init__  s    I !//1"$	+-FLlln
FT{vFAGG$$--c22n
 79JL(0 ( 5 5 7 .9 +668 	"
 ?L-3 -8 *557 	!
 *= **,(, ) 1 %==&&33tY%<%<=="499b.?.?@@		446%?,0D) 4 
	> 
	> "3""9-AC%a
s   #6Ic	                    g)zOverride template codegen. Return None to use default flow.

External template handlers (e.g. Helion) can override this method
to implement custom code generation.
Nr   )	rp   
schedulingtemplate_nodeepilogue_nodesprologue_nodesbuf_name_to_prologue_groupprologue_preserves_zero_mask_fnrenderonly_gen_src_codes	            r^   codegen_template_override$SIMDKernel.codegen_template_override  s      r`   c                    SU S3$ )Nz<STORE_OUTPUT_>r   )rp   is     r^   _get_store_output_subgraph_name*SIMDKernel._get_store_output_subgraph_name  s    s!$$r`   c                j    [        U R                  5      n[        R                  " US-
  SS9U l        U$ )Nr>   )startstep)r   rL  rA  rB  )rp   totals     r^   get_store_output_count!SIMDKernel.get_store_output_count  s.    T**+ )eaia Hr`   c                :    [        S U R                   5       5      $ )Nc              3  8   #    U  H  n[        U5      v   M     g 7frX   )r6   )r   rm   s     r^   r   0SIMDKernel.num_reduction_dims.<locals>.<genexpr>  s     I[6&v..[   )sumr@  rt   s    r^   num_reduction_dimsSIMDKernel.num_reduction_dims  s     IT[[IIIr`   c                    [         erX   NotImplementedError)rp   dtypes     r^   dtype_to_strSIMDKernel.dtype_to_str      !!r`   c                6    U R                   R                  5       $ rX   )r*  select_index_dtypert   s    r^   get_index_dtype_as_torch_dtype)SIMDKernel.get_index_dtype_as_torch_dtype  s    }}//11r`   c                @    U R                  U R                  5       5      $ rX   )r{  r  rt   s    r^   index_dtypeSIMDKernel.index_dtype  s      !D!D!FGGr`   c                    gNFr   rt   s    r^   rJ  SIMDKernel.want_no_x_dim      r`   c                  ^ [        U4S j[         5       5      nU(       + =(       d    U(       + nS	S jn/ SQn	[        [        U	5      5      n
SS/nU(       a  UnOU(       a  U
nOX-   nU" X5      nU" U	[        5      n/ n[	        U5       H|  u  nn[        U5      nUR                  U5      nUR                  U5      nUc  UOUnUR                  [        U S3TU   UUU UU=(       a    U R                  (       + UUST;   S9
5        M~     U$ )
Nc              3  6   >#    U  H  oT;   d  M
  Uv   M     g 7frX   r   )r   rm   r@  s     r^   r   3SIMDKernel.construct_range_trees.<locals>.<genexpr>  s      %
!-v61AFF   		c                d   ^ [        U4S jU  5       5       VVs0 s H  u  p#X2_M	     snn$ s  snnf )Nc              3  6   >#    U  H  oT;   d  M
  Uv   M     g 7frX   r   )r   rX  masks     r^   r   OSIMDKernel.construct_range_trees.<locals>.filtered_index_map.<locals>.<genexpr>  s     2U#3PT33#r  )	enumerate)seqr  idxrX  s    `  r^   filtered_index_map<SIMDKernel.construct_range_trees.<locals>.filtered_index_map  s4    )22U#2U)U)UXS)U  s   ,)rT   rS   rR   rU   rV   r   rR   )r   r   r   r   r   )r   zdict[Any, int])
r   all_prefixesr%  r   r  r6   r   r   r   rH  )rp   r   rC  ru   r@  rK  active_prefixesno_r_dimr  	grid_dimspointwise_tensor_dimsreduction_dimstensor_dimstensor_dim_mapgrid_dim_mapr5  rg  rm   r   r   r   s       `                r^   construct_range_trees SIMDKernel.construct_range_trees  s*    % %
!-%
 
 (';|+;	
 $	 $Xi%8 9(K/K/@K ,KI))\B"?3IAv.v6L'++F3J#''/H!)AxE#he$6N'(J1J1J-J)% F] 4& r`   c                    U R                  UU R                  U R                  R                  5       U R                  U R
                  5      nU R                  R                  U5        g rX   )r  rC  r*  ru   r@  rK  r5  extend)rp   r   r5  s      r^   rS   SIMDKernel.initialize_range_tree5  sR    00!!MM&&(KKMM
 	,r`   c                    g)zZ
Hook called right before codegen with every index that will be
used in the fused kernel.
Nr   )rp   indicess     r^   finalize_indexingSIMDKernel.finalize_indexing?  s    r`   c                p    U R                   nSU l          U R                  XU5      X@l         $ ! X@l         f = fr  )rC  store)rp   ri   r   r  priors        r^   store_reductionSIMDKernel.store_reductionE  s5    %% %	*::d51$)!E!s   - 5c                    gr  r   rt   s    r^   rD  +SIMDKernel.should_use_cooperative_reductionM  r  r`   c                    gr  r   rt   s    r^   rG  *SIMDKernel.should_use_persistent_reductionP  r  r`   c                t    [        [        R                  R                  S U R                   5       5      5      $ )Nc              3  T   #    U  H  oR                   R                  5       v   M      g 7frX   )rk   r{   r   r8  s     r^   r   (SIMDKernel.var_ranges.<locals>.<genexpr>U  s"      *4DD%%''4Ds   &()r'  rA  chainfrom_iterabler5  rt   s    r^   rk   SIMDKernel.var_rangesS  s4    OO)) *484D4D* 
 	
r`   c                :    [        S U R                   5       5      $ )Nc              3  P   #    U  H  n[        UR                  S L5      v   M     g 7frX   )r   r   r  s     r^   r   0SIMDKernel.triton_tensor_ndim.<locals>.<genexpr>[  s#     Q@P3td233@Ps   $&)rt  r5  rt   s    r^   triton_tensor_ndimSIMDKernel.triton_tensor_ndimZ  s    Q@P@PQQQr`   c                \    S/U R                  5       -  nSX!'   SSR                  U5       S3$ )Nr   :[r   ])r  join)rp   rg  r   s      r^   indexing_size_strSIMDKernel.indexing_size_str]  s7    42244499U#$A&&r`   c                   S/U R                  5       -  nU R                   H_  nUR                  c  M  UR                  (       a  U R                  (       d  M6  UR
                  R                  5        S3XR                  '   Ma     U$ )N1BLOCK)r  r5  r   ru   rC  rm   upper)rp   r   r8  s      r^   dense_size_listSIMDKernel.dense_size_listb  sp    //11$$D&$$(=(=(=,0KK,=,=,?+@)Foo& % r`   c                    UR                   nUR                  c  U R                  5       nU SU S3$ S/U R                  5       -  nSXAR                  '   SR	                  U5      nU SUR                  5        SU S3nU$ )	Nzmask = tl.full(z, True, tl.int1)r   r  r   zmask = tl.full([zBLOCK], True, tl.int1)[r  )rm   r   dense_size_strr  r  r  )rp   entryrT   sizestrr   suffixouts          r^   create_constant_maskSIMDKernel.create_constant_maskl  s    LL#))+GSy0@AA42244"%5!#AGGI;.EfXQO
r`   c                L    U R                  5       nSSR                  U5       S3$ )Nr  r   r  )r  r  rp   r   s     r^   r  SIMDKernel.dense_size_strw  s)    $$&499U#$A&&r`   c                   [        U[        5      (       d  U$ UR                  S   nU R                  R	                  U5      =nc  U$ [        XUR                  05      n[        R                  R                  R                  U5      n[        UUR                  R                  5       UR                  R                  [        R                  R                   UR                  R"                  5      R%                  5       05      $ Nr   )r   r   r  r   r   r9   r   r=   r   r   r7  ro   r   r   r   r   r   rl   rx   )rp   r   rT   	tree_node	new_indexs        r^   r7  )SIMDKernel.combine_modular_indexing_pairs{  s    %11LJJqM..22155I>Lu)..&9:	GG$$CCIN	((*INN,A,AGGKK!5!5-&(
 	
r`   c                    [         R                  R                  R                  U5      =n(       a  Uu  pE[	        U R                  XB5      U5      $ U R                  X5      $ rX   )r=   r   r   expand_floor_divr   _combine_contiguous_dims)rp   r   r8  
expand_resr  denominators         r^   r6  "SIMDKernel.combine_contiguous_dims  sU     ))::5AA:A%/"ID99)JKXX00==r`   c                   [        U[        R                  [        R                  45      (       a  U$ UR	                  U5      u  p4[        U5      S::  a  U$ [        R                  R                  R                  X4[        U/X45      5      u  pVnXT:X  a  U$ UR                  U5      n[        U[        [        X6" U5      5      5      5      n	U	$ )z9
More aggressive simplification to merge contiguous dims
r>   )r   r   r  r   r   r  r=   r   r   _simplify_loopsrA   r   r9   r'  zip)
rp   r   r8  r   r   	new_sizesreindex_prunenew_index_varsr  s
             r^   r  #SIMDKernel._combine_contiguous_dims  s     eemmU\\:;;L //6
u:?L%&WW%5%5%E%E7S&
"	F L	2ud3z7>;R+S&TU	r`   c                   ^ ^ T R                   S   R                  =(       d    T R                  m[        R                  U U4S j5       nU" 5       $ )Nc               3    >#    T R                   R                  5       (       d  T R                  (       a   eS v   g T(       a  T R                  5         ST l         S v   T(       a  T R                  5         ST l        g ! ST l        f = f7f)NFT)r*  ru   rC  codegen_body)rp   should_flushs   r^   ctx)SIMDKernel.disable_reduction.<locals>.ctx  sn     ==--//0000 !!#$)D!-%%'(,%%s   AB	A= 5B	=	BB	)r5  r   rE  
contextlibcontextmanager)rp   r  r  s   ` @r^   disable_reductionSIMDKernel.disable_reduction  sE    ''+33Qt7Q7Q		"	"	- 
#	-$ ur`   c                    [        U5      [        U R                  5      :X  d   e[        XR                  5       VVs/ s H  u  p#UR                  U5      PM     snn$ s  snnf rX   )r  r5  r  r   )rp   r   re   rangess       r^   
set_rangesSIMDKernel.set_ranges  s^    7|s4#3#34444 #&g/?/?"@
"@ V$"@
 	
 
s   Ac                  ^^^^ [        S U 5       5      (       a  U  Vs/ s H  n/ PM     sn/ 4$ [        R                  R                  mU  Vs/ s H  n/ PM     snmU  Vs/ s H  nTR	                  U5      PM     snm[
        R                  " 5       mSUUUU4S jjn      SS jn/ nSnU GHt  n	/ n
U	 GHV  nTR                  US5      (       a  U
R                  S 5        M/  U[        T5      :  aJ  TR                  TU   S5      (       a0  US-  nU[        T5      :  a  TR                  TU   S5      (       a  M0  [        T5      S:H  =(       a    TS   S:H  nUS	-   [        T5      :  a  TR                  UTU   TUS-      -  5      (       a  U(       a}  TR                  UTU   TUS-      -  5      (       d  [        eTU   nTUS-      n[        XU-  5      nU
R                  U" X/U" X5      U" US-   U5      U" US	-   U5      /5      5        GM^  US-   [        T5      :  a  TR                  UTU   5      (       d$  TR                  [        UTU   5      S5      (       ak  TR                  UTU   5      (       d  [        UTU   5      eTU   n[        UTU   5      nU
R                  U" U/U" X5      U" US-   U5      /5      5        GM  U[        T5      :  d  GM+  U
R                  [        R                  " U" X5      5      5        GMY     UR                  U
5        GMw     [        S
 T 5       5      (       d   ST SU 35       eTU4$ s  snf s  snf s  snf )Nc              3  >   #    U  H  n[        U5      S :H  v   M     g7fr   Nr  )r   re   s     r^   r   5SIMDKernel._split_iteration_ranges.<locals>.<genexpr>  s     6gFs6{ags   c                   > TR                  U5      nTR                  TU    U5      (       d  [        TU    U5      e[        TU    U5      TU '   TU    R	                  U5        [        T5      $ rX   )r?  statically_known_multiple_of	CantSplitr   r   r   )rg  r   
new_ranges	remainingsv	var_counts     r^   	add_range5SIMDKernel._split_iteration_ranges.<locals>.add_range  si    ;;t$D229Q<FF	!d33#IaL$7IaLqM  &	?"r`   c                V   ^ ^ [        T5      [        T 5      S-   :X  d   eSUU 4S jjnU$ )z`
Builds the nested expression:
  ((...((s1*v[i1] + v[i2]) * s2 + v[i3]) ... ) * sk + v[i(k+1)])
r>   c                Z   > U TS      n[        TTSS  5       H  u  p#X!-  X   -   nM     U$ )Nr   r>   )r  )	flat_varsr   r   r  idxsr   s       r^   getterISIMDKernel._split_iteration_ranges.<locals>.make_combined.<locals>.getter  s=     a)!%ab2FA8in4D 3r`   )r  r   r   r   r  )r   r  r	  s   `` r^   make_combined9SIMDKernel._split_iteration_ranges.<locals>.make_combined  s0     t9E
Q...  Mr`   r   r>   c                6    [         R                  R                  $ rX   )r   r   Zero)_s    r^   r   4SIMDKernel._split_iteration_ranges.<locals>.<lambda>  s    EGGLLr`      r  r   c              3  z   #    U  H1  n[         R                  R                  R                  U5      S :H  v   M3     g7f)r>   N)r=   r   r   	size_hintr   s     r^   r   r  V  s*     Iy!177##--a0A5ys   9;zfailed to set ranges  )rg  r   r   r   r   r   )r   r   r  z	list[int]r   z(Callable[[list[sympy.Expr]], sympy.Expr])r  r=   r   r   r?  rA  rB  r   r   r  statically_known_gtr  r  r   operator
itemgetter)groupsr   groupr  gr  r  return_getters_groupscurrent_grouplength_groupreturn_getterssizeis_bmm_then_pwsize1size2size3r  r   r  r  s                   @@@@r^   _split_iteration_ranges"SIMDKernel._split_iteration_ranges  s    6g666$*+F5BF+R//WW:@-A&Qb&-A
-34VR[[^V4	OO%		# 	#	#	+4	5	" !##LN$--dA66"))*@A#c)n49S9Sm,: :
 "Q&M $c)n49S9Sm,: :$ "%Y1!4!K2!9K!A%I6..i6=STCT9UU  ' ::i6=STCT9UU  (%m4E%ma&78E$T5=9E"))%"N )- ? )-!*;U C )-!*;U C
 #Q&Y7**4=1IJJ **8D)M:R+SUVWW ::i6  (i.FGG%m4E$T9]+CDE"))%"G )- ? )-!*;U C	 %s9~5&--$//	-0NOu %| "((8A $D IyIII 	
#I;ay9	
I 000S , .B4s   MM"Mc                *   [         R                  R                  n[        US   5      S:X  af  UR	                  U[
        R                  R                  5      (       d7  UR	                  [        U5      [        US   5      U-  5      (       a  US   U/4$ U$ )z1Fill in the reduction numel of lengths if missingr>   r   )	r=   r   r   r  r   r   r   r   r8   )clsr  r   reduction_numelr   s        r^   prepare_split_iteration_lengths*SIMDKernel.prepare_split_iteration_lengths\  s     77##wqz?a00%''++NN00f%gaj)O; 
 AJ 122r`   c                l    U R                  XU5      n U R                  X5        g! [         a     gf = fNTF)r)  r$  r  )r'  r  r   r(  s       r^   is_compatibleSIMDKernel.is_compatiblep  s>     55fW	''8 		s   & 
33c                X   U R                    Vs0 s H  o"R                  UR                  _M     nnU R                  (       d7  U H1  n[	        U5      (       d  M  [
        R                  R                  X4'   M3     / UR                  5       QnU R                  XQU R                  5      $ s  snf )a  
Split and set iteration ranges for the kernel based on the provided lengths.

This method maps the kernel's tiling structure to the node's iteration space,
handling both pointwise and reduction dimensions appropriately.

Args:
    lengths: A sequence of sequences of symbolic expressions representing
            the sizes of different dimensions for each node.

Returns:
    A list of lists of symbolic expressions representing the mapped
    iteration variables for each dimension.
)r5  rm   rl   rC  r6   r   r   r   r   map_kernel_groups_to_node_sizesr  )rp   r   rtr(  rm   r  s         r^   split_and_set_rangesSIMDKernel.split_and_set_ranges  s    $ 150@0@A0@"))RXX%0@A $$ &v..%*WW[[FN !
 $6==?# 33FT__UU Bs    B'c           
     T   [        U5      [        U5      :X  a%  [        S [        X!5       5       5      (       a  U" U6 $ U R                  X5      u  pE/ [        R
                  R                  U" U6 5      QnU VVs/ s H  ow Vs/ s H
  o" U5      PM     snPM     snn$ s  snf s  snnf )aY  
We may want to fuse `for i0 in s0*s1` into a tiled kernel with groups (s0, s1).

To do this we need to split up the iteration space of i0 into something like:
    for i1 in s0:
      for i2 in s1:
        i0 = i1*s1 + i2
        ....

This function matches and resplits lengths to the groups of
this kernel to enable tiled + non-tiled fusions.
c              3     #    U  H?  u  p[         R                  R                  R                  [	        U5      U-
  5      S :H  v   MA     g7fr  r=   r   r   r?  r8   )r   rT   r  s      r^   r   =SIMDKernel.map_kernel_groups_to_node_sizes.<locals>.<genexpr>  s=      /
, GG%%mA&6&:;q@,s   AA	)r  r  r  r$  rA  r  r  )	r'  r  r   r  r  r  r   fnsfns	            r^   r0  *SIMDKernel.map_kernel_groups_to_node_sizes  s    & w<3v;&3 /
G,/
 ,
 ,
 w'',/,G,G,X)
LY__22:z3JKL8MN8M,"H,8MNN,Ns   :	B$BB$B$c                6    [        U[        R                  5      $ rX   )r   r   TMPrp   r   s     r^   is_indirect_indexingSIMDKernel.is_indirect_indexing  s    "5$((33r`   c                  ^ U R                  U5      (       a  gS/[        U R                  5      -  nUR                   Hn  nX0R                  ;  a  M  U R                  U   n[        UR                  [        5      (       d   eX$R                  R                  ==   UR                  -  ss'   Mp     [        R                  R                  R                  m[        U4S j[        X R                  R!                  5       5       5       5      $ )NFr>   c              3  J   >#    U  H  u  pT" U5      T" U5      :g  v   M     g 7frX   r   )r   	idx_range
iter_ranger?  s      r^   r   ,SIMDKernel.is_broadcasted.<locals>.<genexpr>  s*      
)P%	 Y8J#77)Ps    #)r>  r  r@  r   r   r   r   r   r   re   r=   r   r   r?  anyr  r   )rp   r   index_numelsrx   r  r?  s        @r^   is_broadcastedSIMDKernel.is_broadcasted  s    $$U++sS--((F222))&1Eell,?@@@@++,<, ) 77##,, 
),\;;;M;M;O)P
 
 	
r`   c                    [        U[        5      (       a)  SSR                  [        U R                  U5      5       S3$ U R                  U R                  U5      5      $ )a`  
Convert an index expr to a string that can be used in output code.
e.g. a sympy expression "s2" may actually appear as "ks1" in the generated kernel.

Index expressions often need to be passed in as arguments to the triton kernel.
Rename_indexing and codegen_indexing keep track of the needed indices and add
new parameters to the function signature.
r  r   r  )r   r%  r  mapindex_to_strr.  rename_indexingr=  s     r^   rK  SIMDKernel.index_to_str  sQ     eT""tyyT%6%6!>?@BBzz$..u566r`   c                   U R                  U5      n[        U[        R                  R                  R
                  5      n[        UR                  [        R                  5      5      (       d-  [        UR                  [        R                  5      5      (       a3  UR                  [        R                  R                  R
                  5      n[        UR                  [        R                  5      5      (       a  UR                  [        R                  5       Ho  nUR                  n[        U5      S:  d  M   [        S U 5       5      (       d  M9  U[        R                  R                  R                  U5      0n[        X5      nMq     U R                  U5      n[        U[         5      (       d  UOUR"                  S   nU R%                  U5      $ )Nr   c              3  v   #    U  H/  n[        U[        R                  [        R                  45      v   M1     g 7frX   )r   r   r   PRECOMPUTED_SIZEr   s     r^   r   .SIMDKernel.prepare_indexing.<locals>.<genexpr>  s0      ,$ #1tyy$2G2G&HII$s   79)r9  r9   r=   r   r   precomputed_replacementsr  atomsr   floorceilingsubsr   r  lookup_precomputed_sizer   r   r  codegen_indexing)rp   r   ar  replacements
simp_indexs         r^   prepare_indexingSIMDKernel.prepare_indexing  sQ    &&u-5!''"2"2"K"KLu{{5;;'((CEMM0J,K,KJJqww//HHIE u{{5==)**[[/ ..w<!# ,$, ) ) %&qww'7'7'O'OPQ'R#SL&u;E 0 ++E2
 )X>>JJOOTUDV 	 $$Z00r`   c                    U R                    Vs/ s H(  oR                  (       a  U R                  (       d  M&  UPM*     sn$ s  snf rX   )r5  ru   rC  )rp   ts     r^   active_range_treesSIMDKernel.active_range_trees  s6    ''
'!~~AVAVA'
 	
 
s
   %AAc                8   [         R                  R                  R                  XR	                  5       5      n[        UR                  [        S9 H  nX R                  ;   d  M  0 nU R                  U   R                  5        H.  n[         R                  R                  R                  U5      X4'   M0     [        U5      S:  a5  [        U R                  U   R                  U5      U R                  U   l        U R                  U   R                  5         M     U$ )Nr   r   )r=   r   r   r4  rk   sortedr   r   r   r  rW  r  r9   r   r   )rp   r   symrZ  pss        r^   rX  SIMDKernel.codegen_indexing  s    ww44T??;LM$++5C+++  "//4EEGB'(ww'7'7'O'OPR'SL$ H|$q(6@--c277$7D))#.3 %%c*224 6 r`   c                    [        S5      e)NzNYI: codegen_nan_checkrx  rt   s    r^   codegen_nan_checkSIMDKernel.codegen_nan_check!  s    !":;;r`   c                    [         R                  R                  n[        U R                  R
                  5       H  nUR                  U5        M     g rX   )r=   r   wrapper_coder   r  workspace_argsgenerate_workspace_deallocation)rp   wrapperwss      r^   deallocate_workspaces SIMDKernel.deallocate_workspaces$  s8    ''&&499334B33B7 5r`   c                    [        S5      e)NzNYI: call_kernelrx  )rp   ri   r   deallocate_wss       r^   call_kernelSIMDKernel.call_kernel)  s     ""455r`   c              #     #    U R                   nU R                  nU(       a  [        R                  " X5      n[        R
                  " U5      nXl         X l         Uv   X0l         X@l        g! X0l         X@l        f = f7f)z:Context manager to add an additional mask to tl.load/storeN)
_load_mask_load_otherr;   logical_andr<   _unwrap)rp   r  r  r  	prior_vals        r^   
mask_loadsSIMDKernel.mask_loads.  sj     
 $$	??4/D!!$' 	)J#O( $O(s   AA=A, A=,A::A=c                &   U R                   R                  5        VVs0 s H  u  p#X#R                  _M     nnn[        X5      n0 nU R                   H5  n[        UR                  5      n[        XXS05      [        XXS05      -
  Xh'   M7     U$ s  snnf )a  
This gets the stride of the index for each of the tiling variables
(technically, it does it at index 0)

For example, if
xindex = x0 + 512*x1 + 1024*r0
x0 = (xindex//512)
x1 = (xindex % 512)
r0 = rindex // 1024

this function would return
{xindex: 512, rindex: 1024}
r>   r   )r   r{   r   r9   r5  r7   ri   )	rp   r   kvindex_to_tile_indexesindex_in_tile_varsstrides
range_treer   s	            r^   get_strides_of_loadSIMDKernel.get_strides_of_loadB  s     8<7L7L7R7R7T U7TtqFF7T U'E**J":??3A#$6A?*"FC GJ +
  !Vs   Bc                d    [        U[        5      (       a  [        [        X5      5      $ U " U5      $ rX   )r   tuplerJ  )r9  r  s     r^   _map_tuple_or_scalarSIMDKernel._map_tuple_or_scalarZ  s(    eU##R((%yr`   c                    [         R                  " U R                  R                  5       Vs/ s H  nUR	                  5       PM     nn[        [        S U5      5      $ s  snf rX   )rI   
only_nodesr*  r&  estimate_flopsrt  filter)rp   r   flopss      r^   r  SIMDKernel.estimate_flops`  s[     +55dmm6Q6QR
R !R 	 
 6$&''	
s   Ac           	     "   / n[        [        U R                  R                  R	                  5       5      5      nU R                  R                  5       u  p4  nU R                  R                  5       n[        R                  R                  R                  [        U R                  R	                  5       5      5      n[        U5       GH;  u  pxX;  a  UR                  S5        M  [        R                  R!                  U5      n	[        R                  R                  R                  U	5      n
X:  a  ["        [$           " 5       nSnXX    HT  n['        U[(        [*        45      (       a  UR-                  SU 35        US-  nM9  UR-                  UR.                  5        MV     [        U5      U-  nOU
n[        R                  R1                  U5      n[3        U5      nUR                  UU-  S[5        Xr:  5      -   -  5        GM>     [7        U5      $ )a  
Try the best to estimate the total size (in bytes) of the
kernel's inputs and outputs, which is used for estimating the memory
throughput of this kernel. This information is used for checking how
far we are from the peak memory bandwidth. It's important that
we want to avoid overestimating the sizes of the inputs and outputs,
because it can wrongfully give us a very large memory traffic value,
which may be even larger than the theoretical bandwidth and thus
become very misleading. This is particularly problematic for cases
where we slice some inputs. In those cases, we should only count
the size of the "slices" instead of the original inputs, because
only the slices contribute to the real memory traffic.
r   no_index_dep_r>   )r  r:   r  inplace_buffersr   python_argdefsr*  buf_accessesr=   r   r   r   r8   r@  r  r   	get_numelr   r   r   r#   r$   r   r   	get_dtyper3   r   rt  )rp   nbytesninplace_argsr  	call_argsr  	out_numelrg  r  	arg_numelbuf_sizer  no_index_dep_countdeprl   rz  
dtype_sizes                    r^   estimate_kernel_num_bytes$SIMDKernel.estimate_kernel_num_bytesg  s    F499#<#<#C#C#EFG!YY557a}}113 GG$$66$++,,./
	  	*FA &a ))#.Iww''99)DH# %S/+%&"',C!#'9::m4F3G$HI*a/*CII. - Gy0 GG%%c*E'.JMM%*,C8I4J0JKL; +< 6{r`   c           	        [        U R                  R                  5      S:X  aG  [        U R                  R                  5      S:X  a$  [        U R                  R                  5      S:X  a  gU R                  R                  5       u  p#pESnU GHr  n[        R                  R                  U5      nU(       d  M,  UR                  5       n	[        U	R                  5      S:X  d  MW  [        U	R                   V
s/ s H  oS:X  d  M
  U
PM     sn
5      S:X  a  M  [        R                  " U	R                  5      nUc  UnM  Xk:w  d  M  [        SU S3SU S	U 3-   5      n[        R!                  U5        U Vs/ s Ht  n[        R                  R                  U5      (       aK  [        R                  " [        R                  R#                  U5      R                  5       R                  5      OSPMv     nnU Vs/ s H`  n[        R                  R                  U5      (       a7  [        R                  R#                  U5      R                  5       R                  OSPMb     nnU Vs/ s HE  nU[        R                  R$                  ;   a  S
O!U[        R                  R&                  ;   a  SOSPMG     nnU V
s/ s H  oR(                  PM     nn
[        SU SU SU 3SU SU S3-   5      n[        R!                  U5          g   [+        SU S35      n[        R!                  U5        gs  sn
f s  snf s  snf s  snf s  sn
f )zZ
Print message if the kernel have mixed layout inputs.
Only care about 4D tensor for now.
r>   r   Nr  r   zExpected stride order z, but found stride orderr  z for kernel 
GraphInputIntermediateBufferz  param names z
  buf names z
  strides z	
  sizes z
  sources 
z%All the inputs for the triton kernel z have uniform layout)r  r  input_buffersoutput_buffersr  r  r=   r   try_get_buffer
get_layoutr  r   get_stride_orderstrider-   logwarning
get_buffergraph_inputsname_to_bufferri   r+   )rp   r0  argdefsr  
_signaturer  uniform_stride_orderarg_namebuflayoutrT   stride_ordermsgri   stride_order_list	size_listsource_listargdef_namess                     r^   warn_mix_layoutSIMDKernel.warn_mix_layout  s    		''(A-DII,,-2DII--.!3
 ,0II,D,D,F)J#!H''((2C^^%F6;;1$6;;9;aq&;9:a?!226==A'/+7()9%01E0FF^_l^<}EFC KK$ %.) %.D 7711$77 ++GG..t4??AHH "	"
 %. & ) %.	! %.D 7711$77 **40;;=BB!" %.	  ! %.# %.D	  177#7#77 %  177#9#99 2!	"
 %.   # 5<#<GqFFGL#<%(nYK|\m[no&ykk]"MNC KK$a "b 3K=@TU
 	C[ :)!# $=s'   6	L(
L(
5A;L-6A'L2#AL75L<c                   [         R                  " XSU5      nSU l        [         R                  " U R                  R
                  U5      n[         R                  " X45      nSU l        [         R                  " X%5      n[         R                  " Xf5      n[         R                  " XSU5      n[        R                  " XXU45      $ )Nrt  FT)r;   	reductionrC  
index_exprr*  r(  truedivsubmulr<   rz  )	rp   rz  r  sum_r)  meandxdx2m2s	            r^   welford_reduce_fallback"SIMDKernel.welford_reduce_fallback  s    }}U5%8 % = =uE{{4( $WWU!ggbo]]54!!4V"455r`   c                    [         R                  " XSU5      n[         R                  " X#5      n[         R                  " U5      n[         R                  " XSU5      n[        R
                  " X645      $ )Nmaxrt  )r;   r  r  expr<   rz  )rp   rz  r  vmaxr  r  vsums          r^    prepare_softmax_twopass_fallback+SIMDKernel.prepare_softmax_twopass_fallback  sT    }}U5%8gge"ggcl}}U5#6!!4,//r`   c                    [         erX   rx  rt   s    r^   codegen_kernelSIMDKernel.codegen_kernel  r}  r`   c                    g rX   r   rt   s    r^   r  SIMDKernel.codegen_body      r`   c                    g rX   r   )rp   r  s     r^   r   )SIMDKernel.codegen_iteration_ranges_entry
  r  r`   )rw  rx  r=  r    rE  r*  r>  rC  rM  r   rI  r<  rK  r@  rH  r   r5  rT  rU  r9  rL  r(  rF  )NNNNF)r(  dict[str, sympy.Expr]r*  rJ   r   r   rV  Optional[bool]rW  r  rF  Optional[dict[str, sympy.Expr]]rI  r   r   r   )rb  r   r   z
str | None)rg  r   r   r   r  )rz  torch.dtyper   r   )r   r  r   r   )r   r   rC  r   ru   r   r@  r  rK  r   r   list[IterationRangesRoot])r   zdict[str, str]r   r   )r  Sequence[sympy.Expr]r   r   )ri   r   r   r   r  r@   r   r   )r   r   )r   z	list[str])r   r   r   r   )r   r   r8  r   r   r   )r   z'contextlib.AbstractContextManager[None])r   r   r   r   )r  Iterable[sympy.Expr]r   Sequence[Sequence[sympy.Expr]]r   zStuple[list[list[sympy.Expr]], list[list[Callable[[list[sympy.Expr]], sympy.Expr]]]])r  r  r   r  r(  r   r   r  )r  r  r   r  r(  r   r   r   )r   r  r   list[list[sympy.Expr]])r  r  r   r  r   r  )r   r   r   r   )r   r   r   r   )r   r  )r   r   r   r   r   )NT)ri   r   r   zOptional[IRNode]rs  r   r   r   )r  zUnion[str, OpsWrapper]r  Union[int, float]r   zIterator[str])r   r   r   r   )r   r   )r  r   )Gr   r   r   r   r   pexprr-  r"  r/  rh   rc  rh  rn  r   r1   ru  r{  r  r  rJ  r  rS  r  r  rD  rG  rk   r  r  r  r  r  r7  r6  r  r  r  staticmethodr$  classmethodr   r   r   r)  r-  r2  r0  r>  rG  rK  r\  r`  rX  rh  rp  rt  r  r  r|  r  r  r  r  r  r  r  r  r  r   r   r   r   s   @r^   r   r     s    */E&.&&!OT! /38<9=9=$)AD%AD %AD ,	AD
 (6AD )7AD 7AD "AD 
AD ADF   
$%
 J  J"2 H H5+5 5 	5
 &5 5 
#5n-*
R'
	'
$>>':>	>':	(0
 P1$P1/MP1
P1 P1d 
 ',ggkk	$ 0 $	
 
( & 
 ',ggkk	$ 0 $	
 
  V5 V	 VD O$O 0O
 
 O O84
,7$1$1 
$1L

"<8 OS66/6GK6	6
 )*)3D)	) )&0  
(@DFP
60" r`   r   c                  f   \ rS rSr% Sr\rS\S'   S rS r	\	r
\	rS rS rS	 rS
 r S0   S1S jjrS r S2   S3S jjr  S4S jr\      S5S j5       r S6       S7S jjrS8S jr    S9S jrS rSS.S jr    S:S jrS;S jr      S<S jrSSS.   S=S jjrS r S6           S>S jjr S r!\"\#RH                  " S 5      S?S! j5       5       r%\"      S@S" j5       r&\"      SAS# j5       r'\"        SBS$ j5       r(\"  SCS% j5       r)\"          SDS& j5       r*\"        SES' j5       r+\"        SFS( j5       r,\"\-R\                  R^                  S4   SGS) jj5       r0\"\-R\                  R^                  S4   SHS* jj5       r1S+ r2SIS, jr3 SJ SKS- jjr4S. r5S/r6g)LSIMDSchedulingi  zc
Single Instruction Multiple Data parent class used for fusion across
multiple different backends.
z	type[Any]kernel_typec                &    [        S U 5       5      $ )Nc              3     #    U  H7  n[         R                  R                  R                  [	        U5      5      v   M9     g 7frX   r6  r   s     r^   r   *SIMDScheduling.group_fn.<locals>.<genexpr>  s-     P%QQWW%%..}Q/?@@%s   ?Ar  r  s     r^   group_fnSIMDScheduling.group_fn  s    P%PPPr`   c                
  ^^^^ [        U[        R                  5      (       d  [        U[        R                  5      (       a  [        R                  R                  X5      $ UR                  u  nu  nmUR                  u  nu  mm[        X5      nUR                  5       (       a3  UR                  5       (       d  UR                  5       (       a  U" S5        OGUR                  5       (       a2  UR                  5       (       d  UR                  5       (       a  U" S5        UR                  5       (       a  UR                  5       (       a  UT:H  =(       a    TT:H  nU(       d  SSKJ	n  UR                  X5      nU(       d  U" SUTTT5        U(       a  UR                  5       (       d  UR                  5       (       aj  UR                  5       (       d  X!p!U R                  UR                  5       UT5      m[        UU4S jUR                  5        5       5      (       d	  U" S5        gU$ UR                  5       (       Gd  UR                  5       (       Gd  UT:X  a  TT:X  d  UR                  5       (       d  U" SUTTT5        gUR                  5        Hl  nUR                  5       (       a    OVUR                  5       UR!                  5       -  (       d  MB  UR                  u  nu  pXI:X  a  TU
:X  a  M`  U" S	UU	TU
5          g   X4 H  nUR                  5       (       d  M    g
   U R                  UR                  5       UT5      nU R                  UR                  5       UT5      nU R                  UR                  5       UR                  5       -   UT5      n["        R$                  R&                  (       a`  S
n[)        U5      S:  a)  [)        U5      S:  a  Xs=:H  =(       a    U:H  Os  nOX:H  nO[)        U5      S:  a  X:H  nU(       d  U" SUUU5        gg
UR                  5       (       d  UR                  5       (       a  TS:X  a  TS:w  d   eUTT-  :X  a  [        UU4S jUR                  5        5       5      (       d	  U" S5        g["        R$                  R*                  (       ag  UR                  5       (       dR  [-        U R                  UR                  5       U5      R/                  5       5      US4TTS44;   nU(       d  U" S5        U$ g
UT:w  a  U" S5        UT:H  $ UR                  5       (       a  UR                  5       (       a   eU R1                  X!5      $ )z
Hook called by Scheduler to determine if the Triton backend
can fuse node1 and node2.  These nodes might already be
FusedSchedulerNodes.
z&Split scan cannot fuse with reductionsr   )MixOrderReductionz1numel/rnumel mismatch (reduce) (%s, %s), (%s, %s)c              3     >#    U  H6  n[         R                  TR                  5       UR                  5       TS 9v   M8     g7f)r(  N)r   r-  r   
get_ranges)r   n2rnumel1r(  s     r^   r   *SIMDScheduling.can_fuse.<locals>.<genexpr>Q  s?       0 ,,' -  0s   >Az/invalid loop order and tiling for native matmulFz5numel/rnumel mismatch (non-reduce) (%s, %s), (%s, %s)z:numel/rnumel mismatch prologue mismatch (%s, %s), (%s, %s)Tr   ztiling mismatch (%s, %s, %s)r>   c              3  p   >#    U  H+  n[         R                  TT4UR                  5       5      v   M-     g 7frX   )r   r-  r  )r   r   numel2rnumel2s     r^   r   r    s3      . ,,fg->OO.s   36z"nodes numel/rnumel incompatibilityzinvalid tiling for reductionznodes numel incompatibility)r   r   ForeachKernelSchedulerNodecan_fuser  r0   is_split_scanru   torch._inductor.schedulerr  rM  select_tiling	get_nodesr  is_templateused_buffer_namesget_buffer_namesr   r[    tiling_prevents_pointwise_fusionr   tiling_prevents_reduction_fusionr  r   can_fuse_horizontal)rp   node1node2r  numel1whyreduction_can_fuser  r   	pro_numel
pro_rnumelr   tiling1tiling2tiling3condis_reduction_tiling_validr  r  r  r(  s                    @@@@r^   r  SIMDScheduling.can_fuse  s    eYAABBj977G
 G
 77@@NN${{FG${{FG%  )<)<)>)>!!##<=  ""5+>+>+@+@!!##<=E$6$6$8$8!'6!1!Hg6H%G%6%?%?%M"%G "&&((E,B,B,D,D --//#(5 ++EOO,=vwO  $oo/	   IJ %%!!##E,>,>,@,@f$G);((**O ! !& 1++--!  $557%:P:P:RR$59ZZ22I & 3:8M \ & ) ' * $)# !2& ^==?? $
 (():FGLG(():FGLG((!EOO$55vwG }}==w<!#7|a'&<<W<&1\A%"-D6	 !!!##(:(:(<(<a<GqL00')) "__.   <= MMBB!--//05**5??+<fELLN1  !,1- 5:;4412V##!!##E,>,>,@,@@@''55r`   c           
       ^^^^^^^ / m[         [        R                     " 5       m[        5       m[        5       mS mUU4S jnUU4S jnU4S jnUUUU4S jn[        R                  UUUU4S j5       nUU4S jn	U H  n
U
T;   a  M  TR                  U
5        U" U
5      (       aT  U	" U
T5      (       a  U" 5           S S S 5        T(       a"  U" U
5      (       d  T=(       d    [        T5      mOS mU" U
5        M}  U" U
5      (       a#  U" 5          TR                  U
5        S S S 5        M  [        ST ST S	U
R                  S
    35      e   T$ ! , (       d  f       N= f! , (       d  f       M  = f)Nc                ~   > U R                   u  nu  p#UT:H  =(       a    UT:H  =(       d    UTT-  :H  =(       a    US:H  $ Nr>   r  r   r  
node_numelnode_rnumelrl   r)  s       r^   fits_in_main_body@SIMDScheduling.generate_node_schedule.<locals>.fits_in_main_body  sF    +,77(A(
%'AK6,A efn,A1Ar`   c                `   > U R                   u  nu  p#UT:H  =(       a    US:H  =(       a    TS:g  $ r  r  r  s       r^   fits_outside_reductionESIMDScheduling.generate_node_schedule.<locals>.fits_outside_reduction  s2    +,77(A(
&K;!+;K!Kr`   c                d   > U R                   R                   H  nUR                  T;   d  M    g   gr,  )read_writesreadsri   )r   readcurrent_loop_buffer_usages     r^   expect_improved_memory_usageKSIMDScheduling.generate_node_schedule.<locals>.expect_improved_memory_usage  s,    ++99 99 , r`   c                  > TR                  U 5        TR                  U 5        TR                  U R                  R                   Vs/ s H  oR
                  PM     sn5        U R                  5       (       a  [        U [        R                  5      (       a|  [        U R                  [        R                  5      (       aS  [        U R                  R                  [        R                  5      (       d   TR                  U R                  5       5        g TR                  U R                  R                    Vs/ s H  oR
                  PM     sn5        g s  snf s  snf rX   )r   r   updater#  r$  ri   ru   r   r   rO  r   r   rP  dataScanget_namewrites)r   rT   r&  doner&  not_ready_yet_nodess     r^   schedule_node_in_loopDSIMDScheduling.generate_node_schedule.<locals>.schedule_node_in_loop  s    HHQK  #%,,amm>Q>Q-R>Qff>Q-RS
   q)"9"9::qvvr'8'899"166;;88#''

5)00!--BVBV1WBVQ&&BV1WX .S 2Xs   E6Ec               3  b  >#    T(       a  TS   [         L a  TR                  5         OTR                  [        5        T(       a1  TR	                  T[        5        TR	                  TS-   [         5        S mS v   TR                  [         5        TR                  5         T R                  5         g 7f)Nr  r>   )rG   popr   rF   insertclear)r&  maybe_split_indexr&  r0  s   r^   end_current_reduction_loopISIMDScheduling.generate_node_schedule.<locals>.end_current_reduction_loop  s      r!2o!E!!#$$%56 $$%68HI$$%6%:OL$(!  1%%'%++-s   B,B/c                   > TS:X  a  gTU R                   -  (       d  gU(       a  [        US   [        [        45      (       a   e[	        T5      $ )Nr>   Fr  )	ancestorsr   rG   rF   r   )r   r&  r0  r)  s     r^   #requires_closing_previous_reductionRSIMDScheduling.generate_node_schedule.<locals>.requires_closing_previous_reduction  sS    {&7 b!O5E#F* *   +,,r`   zunexpected group: (r   z) != r>   )
r   r   r.   r  r  r   r  r   ry  r  )rp   r   rl   r)  r  r   r'  r1  r8  r<  r   r&  r/  r7  r&  r0  s     ``       @@@@@r^   generate_node_schedule%SIMDScheduling.generate_node_schedule  sT   #%)5568 0:|5?\!+/		L		Y 	Y" 
	"	"	. 
#	.	- Dt|HHTN &&6t]KK35 6 -5QRV5W5W(9(OS=O% )-%%d+'--/1!((. 21 *)%6(%

1O - 4 ' 65 21s   <EE!
E	!
E0	c                    UR                   UR                  p2UR                  UR                  5       -  (       d"  UR                  UR                  5       -  (       a   eU R	                  X#5        g rX   )r	  r
  r;  get_operation_names_codegen_mix_order_reduction)rp   r   r	  r
  s       r^   codegen_mix_order_reduction*SIMDScheduling.codegen_mix_order_reduction  sW    zz4::u OOe&?&?&AAOOe7799	
 
 	))%7r`   c                    UR                  5       n/ n/ nU H<  nUR                  5       (       a  UR                  U5        M+  UR                  U5        M>     X44$ rX   )r  ru   r   )rp   r   r   
reductions	epiloguess        r^   #_split_mix_order_reduction_epilogue2SIMDScheduling._split_mix_order_reduction_epilogue(  sX     
	D  ""!!$'  &	 
 $$r`   c           	        UR                   UR                  pTUR                  nU R                  UXES./USSSS.5      S   nUR                  (       d   eUR
                  (       d   eX'l        U R                  Xg5        UR                  R                  [        UR                  5      UR                  S   -  UR                  S   UR                  -   S-
  UR                  -  -  S	[        R                  S
9u  pn
U
S:X  d   SU
< 35       eU   UR                  5         SSS5        [         R"                  " 5       n[$        R&                  " U5         U   U(       a#  UR)                  [*        R,                  " SS95        UR/                  5       nSSS5        SSS5        U(       a)  WR1                  [3        [4        R6                  5      S5      nXyW4$ ! , (       d  f       N= f! , (       d  f       N[= f! , (       d  f       Nd= f)z
for_benchmark:
    True if the generated code is for benchmarking. We need make
    sure benchmark harness code is generated.
)rT   rU   NT)r*  rF  rI  rV  r   rU   rT   r>   F)rz  zws_off=)benchmark_kerneltriton_)rl   r(  r&  create_kernel_choicesrH  rI  rT  !codegen_node_schedule_with_kernelr  	workspacer  rU  r@  rY   r  r  r  	ExitStackr=   set_kernel_handlerenter_contextr   patchr  replacer   r5   KERNEL_NAME)rp   kernel_features
split_sizefor_benchmarkrl   r)  r&  rn   r  ws_namews_offstacksrc_codes                r^   -_generate_kernel_code_for_mix_order_reduction<SIMDScheduling._generate_kernel_code_for_mix_order_reduction4  s    (--/N/Nv'55++()+!%'+15		
 	 ****))))'..}E $[[22//0mmE"#c"V%7%77!;@R@RRT ++ 3 
F {(wviL({!  $$&!!&)5##FLL$$GH,,.H ,1)
 
  ''K,C,C(DiPH(( V ,15))s0   GG6;G%G6
G"%
G3	/G66
HNc                    [         erX   rx  )rp   modn_spills_threshold
node_namess       r^   benchmark_codegened_module)SIMDScheduling.benchmark_codegened_moduleh  s
     "!r`   c                t  ^ ^^!^" [         R                  R                  T5      u  m"nUU"4S jnU" 5       n[        =R                  S-  sl        T R                  U5      u  pg/ nU HD  n	U	R                  5         U	R                  5       n
U
R                  5         UR                  U
5        MF     T R                  TR                  5       U-   T"U5      n[        UT"U5      m![        R                  R                  R                   (       d  [        R"                  R$                  ch  [        R"                  R&                  (       d*  [        R(                  (       d  [        R*                  (       a  U!U 4S jn[,        R.                  " UUS5      nT R1                  T!USS9u  pn[3        US   R4                  R6                  5      n0 nU(       Gac  U GH  n	U	R9                  5       S   R4                  R;                  5       nU	R9                  5       S   R<                  S   R4                  R9                  5       S   R4                  R;                  5       nUUU'   T R                   (       d   eT R                   R>                  RA                  U	R9                  5       S   R<                  S   R4                  R;                  5       5        [B        RD                  RF                  RA                  U5        GM!     URH                   H.  nURK                  URL                  URL                  5      Ul&        M0     T RO                  XU5      nUUl(        [S        U5      Ul)        [B        RT                  " U5         T!RW                  5        HD  nUR9                  5       S   R4                  R;                  5       U;  d  M4  URY                  5         MF     S S S 5        [B        RD                  RZ                  R]                  S5        T R_                  US 5        URa                  URP                  SS	9  [B        RD                  =RF                  URF                  -  sl#        [B        RD                  =Rb                  URb                  -  sl1        [e        U5      [e        URH                  5      :X  d   e[B        RD                  RZ                  Rg                  T"U-   S-
  U-  5      n[i        URH                  5       GH  u  nnURL                  nU S
U 3nU S
U 3nSU SU 3nSSS.nURK                  URj                  URj                  5      nU SU SU SU SU SU SU S3n[B        RD                  Rm                  U5      =n [        Rn                  :w  a	  USU  S3-  n[B        RD                  RZ                  Rq                  U5        [B        RD                  RZ                  Rr                  RA                  U5        GM     URu                  5         U(       a  T Rw                  U5        T Ry                  5         g ! , (       d  f       GNj= f)Nc                 t  > [         R                  R                  b  [         R                  R                  $ [        R                  " TR                  5       5      n U R                  nUS-  n[        R                  R                  R                  T5      n[        [        X2-  5      S5      n[        US5      nU$ )N         )r   r[   mix_order_reduction_split_sizer*   create
get_devicemulti_processor_countr=   r   r   r  r  r,   min)device_propnum_smestimated_num_splits
numel_hintrW  r	  rl   s        r^   _pick_split_sizeESIMDScheduling._codegen_mix_order_reduction.<locals>._pick_split_sizep  s    }};;G}}CCC +11%2B2B2DEK 66F#)A:  ))33E:J_Z-OPRTUJZ-Jr`   r>   c                   > TR                  TU SS9u    p[        R                  " U5      nTR                  U5      u  pAU$ )NTrW  rX  )r]  r!   loadrc  )candidate_split_sizer  r\  r`  msrV  rp   s        r^   _bench;SIMDScheduling._codegen_mix_order_reduction.<locals>._bench  sO    !%!S!S#3"& "T "1
 "&&x077<	r`   rg  Frv  r   z!# Call mix order reduction kernel)rs  z * (z + 1) * aminamax)rn  r  z = r  z : z].view(r   z).z(dim=0)z.to(r   )=r   r  get_numel_rnumelr   rC  rH  cancel_reduction_splitextract_pw_from_reductionswap_pw_red_dimensionr   r>  r  rJ   rY   rZ   r   deterministicr[   rj  'mix_order_reduction_autotune_split_sizemax_autotunecoordinate_descent_tuningr)   autotune_single_fieldr]  r   r   _split_sizeget_outputsr-  usersremoved_opsr   r=   r   removed_buffersrU  r   r   define_kernelr0  r    rQ  scheduler_nodesmark_runrk  make_commentcodegen_commentrt  inplaced_to_remover  codegen_python_sizevarr  r!  r  r  	writeline	allocatedrp  _codegen_nodesfree_buffers_in_scheduler)#rp   r	  r
  r)  rs  rW  node2_reductionsnode2_epilogueconverted_nodessubnode	convertedr&  rz  rn   rY  r\  is_split_reductionrenamebufnameusernamepartial_accumr0  r   nsplitr  r   
stride_strrk  endreduction_type2opopnamefinal_reducebuffer_dtyperV  rl   s#   ``                               @@r^   rB  +SIMDScheduling._codegen_mix_order_reductionm  sD   !33DDUKv	  &'
 	++q0+ ,0+S+S,
( 'G**,99;I++-""9-	 (
 33OO/
 -]E6J &&44<<DEE&&33 '<<J %)$V$V! %W %
! ""21"5":":"F"FG+!--/277@@B'')!,U1T++-+ T((*	  #+w~~%~**..'')!,2215::CCE ''++G4 , "(!@!@,2JJ!--}/H/H-) "A
 ((&I($X.!!&)'779 ##%a(--668FMMO : * 	
))*MN]D16--UC	6#9#99	""f&?&??" ?#s6+J+J'KKKK%%<<Z!#
2
 #,F,K,K"LC'33K"83vh/Je3zl+EcU(:,/C! '**,,m.J.JF *]#gYawc#gfXUWX^W__abhaiipqL !" 1 1+ >>5;;N$|nA 66GG  **<8 GG  **..{;- #M0 	$$&/&&(c *)s   ,AX(1X((
X7c                r   U R                   (       d   eU Vs/ s H.  o3R                  5       U R                   R                  ;  d  M,  UPM0     nnU(       d  g [        US S9R                  u  nu  pVU R                  XU5      n[        R                  SU5        U R                  [        XuXb5      5      $ s  snf )Nc                4    [        U R                  5       5      $ rX   r   ru   rT   s    r^   r   /SIMDScheduling._codegen_nodes.<locals>.<lambda>  s    c!..:J6Kr`   r   zSchedule:
 %s)
r   r-  r  r  r  r>  schedule_logdebugcodegen_node_schedulerJ   )rp   r   coalesce_analysisr   r  rl   r)  r&  s           r^   r  SIMDScheduling._codegen_nodes  s    
 ~~~"
"TmmoT^^=W=W&WDU 	 
  ,KLRR?E33E&I+];))}VO
 	

s   +B4B4c                >   U R                   (       d   eUR                  5        Vs/ s H/  nUR                  5       U R                   R                  ;  d  M-  UPM1     nn[	        U5      S:X  a  g[
        R                  R                  R                  R                  (       af  [	        U5      [	        WR                  5       5      :w  a4  U R                   (       d   e[         R                  " U R                   U5      n[        U5      nOSnU R                  X#5      $ s  snf )z;
Given a set of pre-fused nodes, generate a Triton kernel.
r   N)r   r  r-  r  r  rY   rZ   r   r[   coalesce_tiling_analysisFusedSchedulerNoder   r  )rp   r   r   r  s       r^   codegen_nodeSIMDScheduling.codegen_node  s     ~~~ (
(}}dnn&@&@@ ( 	 

 u:???!!((AA5zS!122~~%~ 33DNNEJ 9$ ? $""5<<!
s   ,DDc                :   [         R                  " [         R                  5      R                  n[	        U 5      (       d  gU Vs/ s H8  nUR                  5       (       d  M  UR                  5       R                  5       PM:     nnU H  nUR                  5       (       a  M  [        U[        R                  5      (       d  M;  UR                  5       nUU Vs/ s H8  nUR                  5       (       d  M  UR                  5       R                  5       PM:     sn-  nM     [        S U 5       5      (       d  g[        R                  R                  R!                  X5        U H,  n[        R                  R                  R!                  Xb5        M.     gs  snf s  snf )NFc              3  8   #    U  H  n[        U5      v   M     g 7frX   )r2   )r   r  s     r^   r   8SIMDScheduling.can_use_32bit_indexing.<locals>.<genexpr>P  s     FID)$//Irs  T)rY   iinfoint32r  r2   has_tensor_outputr  storage_sizer   r   MutationOutputget_mutation_buffersr  r=   r   r   	check_leq)rl   buffersint_maxr  	buf_sizesmutated_bufsr  s          r^   can_use_32bit_indexing%SIMDScheduling.can_use_32bit_indexing3  sG    ++ekk*..%e,, 
$$& ,CNN))+ 	 
 C((**z#r?P?P/Q/Q"779++,,. 4CNN$113+ 	  FIFFF 	
""52DGG&&t5 /
s   F!"FF4"FFc                   U R                  X!5        U(       dL  [        R                  " U5         [        R                  " U5       H  nUR                  5         M     SSS5        [        R                  =R                  UR                  -  sl        [        R                  =R                  UR                  -  sl        g! , (       d  f       Ni= f)zU
Process a kernel by generating code for its node schedule and updating graph state.
N)	rN  r=   rQ  rI   r  r  r   r  r  )rp   rn   r&  rb  r   s        r^   process_kernelSIMDScheduling.process_kernelZ  s     	..}E %%f-.99-HDMMO I . 	
6#9#99	""f&?&??"	 .-s   .C  
Cc                   UR                   nU R                  UUR                  UR                  UR                  5      u  p4U R                  UU/XS.5      nU H  nU R                  X&5        M     [        R                  " U5        U Hp  n[        R                  " U5         UR                  5       nSSS5        U R                  WX&5      n[        R                  SU5        Xl        [!        U5      Ul        Mr     A[#        U5      S:  a  [        U5      n	OUu  n	[        R                  " U	5         UR%                  5        H  n
U
R'                  5         M     SSS5        U V
s/ s H  n
[)        U
[*        5      (       d  M  U
PM     nn
U R-                  XR                  5        [.        R0                  R2                  (       a\  [        R4                  R6                  R9                  5         [        R4                  R6                  R;                  U	R                  U5        U	R=                  U	R                  5        [.        R0                  R2                  (       a(  [        R4                  R6                  R?                  5         [.        R@                  (       a  U	RC                  5         [.        RD                  (       a  U	RE                  US   R                  5        [        R4                  =RF                  U	RF                  -  sl#        [        R4                  =RH                  U	RH                  -  sl$        [        R4                  R6                  RJ                  (       a  [.        RL                  (       a  US   RN                  RQ                  5       nUR%                  5        H  n
U
RS                  5       nX;  a  M  U
RT                  c   eU
RT                  RW                  5       nUc  MH  [X        S   S==   S-  ss'   [        R4                  R6                  R[                  SUR\                  < S	U S
35        M     U R_                  5         g! , (       d  f       GN= f! , (       d  f       GN= fs  sn
f )z,
Generate code for nodes in kernel_features
)r*  rF  Nz+Generating kernel code with kernel_name: %sr>   r   inductorintermediate_hookszrun_intermediate_hooks(r   r   )0r&  get_tiling_and_scoresrl   r(  r  rM  rN  rD   merge_workspaces_inplacer=   rQ  r  r  r  r  r0  r    r  r  r  r   r.   r  r   cppenable_kernel_profiler   rk   write_kernel_context_guard_beginwrite_kernel_context_guardrt  write_kernel_context_guard_endnan_assertsrh  r  r  r  supports_intermediate_hooksgenerate_intermediate_hooksr  live_output_buffersr-  r   get_origin_noder   r  ri   r  )rp   rV  r&  r(  tiling_scorekernelsrn   r\  r0  final_kernelr   base_scheduler_nodes	live_outsri   origin_nodes                  r^   r  $SIMDScheduling.codegen_node_schedulek  sV    (55#99!!++--	 
 ,,H(H

 F22=I ,,W5F%%f-!002 .,,X}MKIIC[Q!,(2F   w<!&w/L%O\!!,/'779 : 0 + 
*Tj?P.QD] 	  
 	13K3KL::++GG  AACGG  ;;(($ 	  !9!9:::++GG  ??A**,!!(()?)?@	<#?#??	""l&E&EE" GG  <<22  
;;=I'779}}(yy,,,"ii779*Z()=>!C>GG((221+2B2B1ERvQO : 	&&(y .- 0/
 
s$   "Q3(Q'Q1Q1
Q	
Q.c                (    U R                   " U0 UD6/$ rX   )r  )rp   rV  kernel_argskernel_kwargss       r^   rM  $SIMDScheduling.create_kernel_choices  s'     
 	
r`   c           	     ^   U   [         R                  " 5       n0 nU H  nU[        L a!  UR                  UR	                  5       5        M-  U[
        L a  UR                  5         MH  UR                  5         UR                  UR                  5       5      nUR                  [        R                  UR                  R                  U5      R                  5       5      5        M     UR!                  UR#                  5       5        U H  nU[        L a!  UR                  UR	                  5       5        M-  U[
        L a  UR                  5         MH  [%        UR                  5        UR                  UR                  5       5      nUR'                  U5        M     S S S 5        g ! , (       d  f       g = frX   )r  rP  rF   rR  r  rG   closedecide_inplace_updater2  r  r*  r'  fromkeys_bodyindexing_from_argsr   r  keysr(   r   )rp   r&  rn   r[  all_indexingr   r   s          r^   rN  0SIMDScheduling.codegen_node_schedule_with_kernel  s>   ((*EL &++''(@(@(BC_,KKM..0!'!<!<T__=N!OJ '' JJ99*ELLN & $$\%6%6%89 &++''(@(@(BC_,KKM 6djjA!'!<!<T__=N!OJLL, &- VVs   FF
F,rb  c               d
   0 nUR                  5       n/ n	U H  n
U
R                  5       nU	R                  U
5        X-  (       d  M/  [        U5      S:X  d   eX[	        [        U5      5      '   UR                  R                  [	        [        U5      5      5        / n	M     [        U	5      S:X  d   eUR                  U UUUU[        UU5      nUb  U$ U   U(       d  U/UQ H  nUR                  5         M     U" 5       nUR                  5       n[        U5       H  nUR                  U5      nUR                  U5         U H1  nUR                  UR!                  UR#                  5       5      5        M3     UR$                  R'                  [)        5       5        SSS5        M     UR*                  R-                  5        GH@  u  nnSU S3nUR/                  UR1                  5       / 5      =n	(       d  M6  [3        S U	 5       5      n[4        R6                  " SU(       + 5         UR                  U5         U	 H  n[        UR                  5       5      S:X  aB  [        U	5      S:X  a3  [        U5      (       a#  U=R8                  UR                  5       -  sl        UR                  UR!                  UR#                  5       5      5        M     UR$                  R'                  [)        5       5        SSS5        SSS5        GMC     SSS5        [:        R<                  " U5         [?        W[@        5      (       d]  [B        RD                  RG                  URH                  RJ                  5         URM                  S5        SSS5        URM                  S	S
S9  UR*                   H  nSU S3nURM                  US
S9  M     UR                  5       n[        U5       H%  nUR                  U5      nURM                  U5        M'     [?        U[@        5      (       a  UnOURO                  5       n/ UQUPUQn[4        RP                  (       aH  URS                  5       S-  nURU                  5        SU SURW                  U5      RY                  5        3nU(       a  UsSSS5        $ U R[                  UUU5      Ul.        UsSSS5        $ ! , (       d  f       GM  = f! , (       d  f       GN= f! , (       d  f       GM:  = f! , (       d  f       GN= f! , (       d  f       GN= f! , (       d  f       g= f)z;
Helper method to codegen a single template kernel variant
r>   r   Nz<LOAD_INPUT_rf  c              3  @   #    U  H  oR                  5       v   M     g 7frX   )can_codegen_without_upcasts)r   p_ns     r^   r   :SIMDScheduling._codegen_single_template.<locals>.<genexpr>+  s      5ESc7799^   ztriton.codegen_upcast_to_fp32z<DEF_KERNEL>z	<ARGDEFS>F)strictg    eAr  )/r  r  r   r  r   iterprologue_fused_inputsr   rc  r   r  rn  rangerh  set_subgraph_bodyr   r2  r  cse
invalidater   named_input_nodesr{   r   r-  r  r   rS  #prologue_fused_inputs_preserve_zeror=   rQ  r   r   r   r'   current_originsr   originsfinalize_hookfinalize_remainingrK  r  imports_for_benchmark_kernelcodegen_kernel_benchmarkgetvaluer  r0  )rp   rn   ra  r\  r]  r^  rb  r_  template_readsprologue_groupprologuenamesresultr   partial_codenum_store_subgraphsrg  subgraph_name
input_namebuffercan_codegen_without_upcastprologue_noder\  r&  num_gbs                            r^   _codegen_single_template'SIMDScheduling._codegen_single_template  s    &("&88:&H--/E!!(+%%5zQ&@N4U+<=,,00d5k1BC!# ' >"a''' 11&(	
 M$ +<^<DMMO = "8L"("?"?"A./ & F Fq I--m< .V%@%@AR%ST !/JJ))*,7 =< 0 '-&>&>&D&D&F"
F".zl! <%?%C%COO%r& >  25 5ES5 2.  7=W9W $55mD1?$'(F(F(H$IQ$N(+N(;q(@'CM'R'R(.(R(R,9,J,J,L)*(R !. 5 5$*$?$?(5(@(@(B%&!" 2@ #JJ11*,?! E  'G# p !!&)lC00YY..}/A/A/I/IJ ..~> K**;u*E %66
".zl! <**=*G 7
 #)"?"?"A./ & F Fq I**=9 0
 ,,,' (::<MnMmMnMM&&99;cA::<=Rj66v>GGIJL  !M *)P "&!3!3HmV!TFU *)Y =<& ED 9 Vv KJ *)s   A+S=?ASAS=6.S=$S*6B:S	0S*8S=#A	T!,T>DT!"T!
S	S=
S'"S**
S:4	S==
T
T	T!!
T/c                (  ^^ SSK Jm  U4S jm/ n[        UR                  5      U/-    H[  n[	        U[        [
        45      (       a&  UR                  [        U4S jU 5       5      5        MD  UR                  T" U5      5        M]     [        U5      $ )Nr   r&   c                   > [        U T5      (       d  g [        U [        R                  5      (       a  U R                  5       n U R	                  5       =nc  g [        S U 5       5      $ )Nc              3  $   #    U  H  ov   M     g 7frX   r   r   s     r^   r   KSIMDScheduling._get_multikernel_shapes.<locals>.get_size.<locals>.<genexpr>  s     )DqDs   )r   r   BaseViewunwrap_viewmaybe_get_sizer  )r  r  r'   s     r^   get_size8SIMDScheduling._get_multikernel_shapes.<locals>.get_size~  sX    c6**#r{{++oo'**,,5)D)))r`   c              3  4   >#    U  H  nT" U5      v   M     g 7frX   r   )r   _argr  s     r^   r   9SIMDScheduling._get_multikernel_shapes.<locals>.<genexpr>  s      @CD$Cs   )r   r'   r%  inputsr   r  r   )rp   r   r  r  r'   r  s       @@r^   _get_multikernel_shapes&SIMDScheduling._get_multikernel_shapesy  sr     	 	* $v-C#e}--

5 @C @@A

8C=)	 .
 Szr`   c                H    U R                  U5      n[        S U 5       5      $ )Nc              3  F   #    U  H  n[        S  U 5       5      v   M     g7f)c              3     #    U  HE  n[        U[        R                  5      =(       a    [        U[        R                  5      (       + v   MG     g 7frX   r   r   Exprr  r   s     r^   r   FSIMDScheduling._kernel_has_dynamic_shapes.<locals>.<genexpr>.<genexpr>  s8      A 1ejj)N*Q2N.NNs   AAN)rE  )r   shapes     r^   r   <SIMDScheduling._kernel_has_dynamic_shapes.<locals>.<genexpr>  s2      

  	      s   !)r"  rE  )rp   r   shapess      r^   _kernel_has_dynamic_shapes)SIMDScheduling._kernel_has_dynamic_shapes  s.    --d3 

  
 
 	
r`   c                P   ^ U R                  U5      n[        U4S jU 5       5      $ )z[
Returns cache key for hint-based multi-graph; key is tuple of shapes with hint filled in.
c              3  N   >#    U  H  n[        U4S  jU 5       5      v   M     g7f)c              3     >#    U  HG  n[        U[        R                  5      (       a!  [        U[        R                  5      (       d  TOUv   MI     g 7frX   r'  )r   r   hints     r^   r   ASIMDScheduling._make_shape_cache_key.<locals>.<genexpr>.<genexpr>  sG       A a,,Z5==5Q5Q  s   AANr  )r   r*  r2  s     r^   r   7SIMDScheduling._make_shape_cache_key.<locals>.<genexpr>  s5      
     	    s   "%)r"  r  )rp   r   r2  r,  s     ` r^   _make_shape_cache_key$SIMDScheduling._make_shape_cache_key  s1     --d3 
  
 
 	
r`   rb  hint_overridec          
        UR                   u  nu  pxUS:X  d   e[        UR                  [        5      (       Ga#  UR                  R                  (       Ga  [        UR                  R                  5      S:  Ga  U R                  UR                  5      (       Ga  0 n	/ n
UR                  R                  R                  5        H  u  nnU" UR                  US9u  pU(       a>  U R                  UUUUUSS9n[        U[        5      (       d   eU
R                  U5        M^  Uc  Mc  U R                  UUUUUSS9nUc  SOU R                  UR                  U5      nXU'   M     U(       a  SR                  U
5      $ [        R                  " [        U	R!                  5       5      5        [#        U	5      n/ UQUPUQnU R%                  UUR&                  5        UR)                  UR&                  5        [*        R,                  =R.                  UR.                  -  sl        [*        R,                  =R0                  UR0                  -  sl        U R3                  5         gUR                  R5                  UR                  US9u  pU(       a  U R                  UUUUUSS9$ U R                  UUUUUSS9n/ UQUPUQnU R%                  UUR&                  5        UR)                  UR&                  UR                  5        [*        R,                  =R.                  UR.                  -  sl        [*        R,                  =R0                  UR0                  -  sl        U R3                  5         g)z
Codegen a triton template with multi-kernel dispatch support

If `only_gen_src_code=True` the src code will be returned instead of being
codegenned into the wrapper
r>   )r8  Tr  NFz

)r  r   r   r   _make_kernel_rendersr  r-  r{   r  r   r   r5  r  rD   r  r%  r   rE   r  r0  rt  r=   r   r  r  r  make_kernel_render)rp   r\  r]  r^  rb  r8  r  _numelr)  r  	src_codesr  r;  rn   ra  r\  shape_cache_keymulti_kernelr&  s                      r^   codegen_templateSIMDScheduling.codegen_template  sD     ,11F{{ }))+>??""777M&&;;<q@//0B0BCCGI
 ##88>>@"!3!&&m" %#<<%&&*.  =  H &h4444$$X. ( !::%&&*/ ; F %, !778J8JIV $ 06O,E AH !{{9--00gnn6F1GH.w7LMnMmMnMM  0H0HI$$\%=%=>GG##|'C'CC#GG&&,*I*II&**,*//BB""- C NF !44!""&* 5   66!""&+ 7  !R. Q- Q. Q$$]F4F4FG""6#5#5}7I7IJ''6+A+AA'**f.G.GG*..0r`   c                    [         R                  R                  R                  [         R                  R                  R                  5       5        g rX   )r=   r   rk  r  
device_opssynchronizert   s    r^   codegen_syncSIMDScheduling.codegen_sync	  s-    	&&qww'9'9'E'E'GHr`   c           
        SSK Jn  SSKJn  [	        U R
                  U5      (       d   eU Vs/ s H  oR                  5       PM     n	n0 n
[        X5       H  u  p[        US S9R                  u  nu  pU R                  XU5      nU R                  UX5      n[        UX5      nUR                  5       =(       a    [        R                  R!                  USS9n[#        UUUUUUS9X'   M     UR%                  UU UU
S	9n[&        R)                  S
[+        U5      U Vs/ s H  n[+        U5      PM     sn5        / nU GHs  n[+        U5      S:X  a  M  [+        U5      S:X  a  U
US      nU(       a  UR-                  SSU45        MI  U R                  UR.                  UR0                  S9nU R3                  UUR4                  U5        [        R6                  " U5         UR9                  5       nSSS5        UR-                  WUU45        M  U" U R
                  UUS9nU Hh  nX   nUR;                  UR.                  UR0                  U(       + U R
                  S9nU R3                  UR=                  U5      UR4                  U5        Mj     UR9                  5       nUR-                  UUU45        GMv     U$ s  snf s  snf ! , (       d  f       N= f)a<  
Generate kernel code for combo kernel partitions.

Partitions subkernel_nodes using horizontal_partition(), then generates
kernel code for each partition. Single-node partitions are generated as
regular kernels, while multi-node partitions use ComboKernel.

Returns a list of (src_code, kernel, node_group) tuples.
r>   )TritonKernel)ComboKernelc                4    [        U R                  5       5      $ rX   r  r  s    r^   r   ;SIMDScheduling.generate_combo_kernel_code.<locals>.<lambda>7	      #ann>N:Or`   r   F)rE  )r&  r(  rl   r)  r*  r+  )r   triton_schedulingcustom_algorithmnode_info_mapz1ComboKernels: %d nodes partitioned into %s groupsr   Nr*  )triton_kernel_clsenable_autotunemixed_sizes)r*  optimize_maskrQ  )r[   rH  triton_combo_kernelrI  
issubclassr  r  r  r  r  r>  r  rJ   ru   r=   choicesrG  r$  horizontal_partitionr  r  r  r   r(  r*  r  r&  rQ  r  create_triton_kernelcreate_sub_kernel)rp   subkernel_nodescustom_part_algorithmrR  rS  rb  rH  rI  r   fused_node_listsnode_schedule_mappnr   r  rl   r)  r&  r(  r*  r+  
partitionspkernel_code_list
node_group	node_inforn   r\  	subkernels                               r^   generate_combo_kernel_code)SIMDScheduling.generate_combo_kernel_code	  s   " 	)4 $**L99999HINN,I13_?IB!$U0O!P!V!VA 77fMM''uEF)-GH%%' II==E >  $ %-+!(?%! @( !55!"2+	 6 

 			? '(ZSVZ(	

 $J:!#:!#-jm<	$$++T4,DE "--!((!*!3!3 . F ''	 7 79J --f5#)#8#8#: 6 %++Xvz,JK %&*&6&6$3 +
 %B 1 5I + @ @!((!*!3!3*5o*.*:*:	 !A !I ''00;!//) % "002 ''6:(FG] %`  e J> ), 65s   J8J=K
K	c                   UR                  5       nUR                  nUR                  n[        R                  S:  =(       d    [        R                  S:H  =(       a    UnU R                  X#XE5      nU H\  u  pxn	U R                  Xq/U5      n
U R                  UR                  U
5        [        R                  SU
5        UR                  U
5        M^     U R                  5         g )Nr>   z"ComboKernels: generated kernel %s.)get_subkernel_nodesuse_custom_partition_algorR  r   combo_kernel_allow_mixed_sizesrf  r  r  snodesr  r  rt  r  )rp   combo_kernel_noder[  r\  rR  rS  rb  r\  rn   r  r0  s              r^   codegen_combo_kernel#SIMDScheduling.codegen_combo_kernel	  s    +??A 1 K K+;;;;a? 
11Q6P;P 	  ::O
 $4Ha,,X7JFSK  !2!9!9;GII:KH{+	 $4 	&&(r`       c           
       ^ ^^
 TS:H  nSU UU
4S jjnUR                  5       u  nm
[        U5      S::  a  [        T
5      S::  d  [        UT
-   5      (       a  / $ UR                  5       u  nm
U" UU(       a  UOT
UR                  U5      5      nU Vs/ s H=  n[	        T R                  UR                  UT5      UR                  UR                  S9PM?     n	nU	$ s  snf )Nr>   c                  > [        UR                  5      [        U5      :X  d   SUR                  < SU< 35       eUR                  UR                  /n[	        S [
        R                  R                  U5       5       5      (       d   e[
        R                  R                  U5       Vs/ s HF  nUR                  [        R                  R                  ;  d  M-  [        U[        5      (       d  MD  UPMH     nn[        UR                   Vs/ s H  oDR                  PM     sn5      nSS jn[        TR!                  U" U5      /U 5      SSS9/nU GH  n[        R                  R"                  R%                  UR&                  UR                  5      n	[        U	5      [        U5      :X  d   e U	R'                  S5      S-   n
U
[        U5      :X  a  M  [	        S	 XS
  5       5      (       a  M   U" US
U
 5      U" XS
 5      4n[        R                  R"                  R+                  [-        S [/        X5       5       5      5      nUR                  U;   a  US-  n[        R1                  US   5      (       a  US-  n[        R1                  US   5      (       a  US-  n[        R                  R"                  R+                  U[-        [
        R                  " UT5      5      -
  5      S:  d  GM  UR3                  [        TR!                  U" US
U
 5      U" XS
 5      /T5      UUR                  S95        GM     U$ s  snf s  snf ! [(         a     GM  f = f)z@
Compute tiling candidates by dividing up the iteration ranges.
zrw.range_vars=z ranges=c              3  N   #    U  H  n[        U[        [        45      v   M     g 7frX   )r   r"   r#   )r   r  s     r^   r   HSIMDScheduling.candidate_tilings.<locals>.tile_ranges.<locals>.<genexpr>	  s&      EC 3G 455Es   #%c                f    [         R                  R                  R                  [	        U 5      5      $ rX   r6  )r  s    r^   collapse_rangesNSIMDScheduling.candidate_tilings.<locals>.tile_ranges.<locals>.collapse_ranges	  s"    ww''00v1FGGr`   noner   )r(  ri   scorer>   c              3  *   #    U  H	  oS :H  v   M     g7fr  r   r   s     r^   r   rt  	  s     ;?a6?s   Nc              3  :   #    U  H  u  pUS :w  d  M  Uv   M     g7fr  r   )r   r  r  s      r^   r   rt  	  s      "1EST1Es   	r   r(  ry  ri   )r  r  r   r   )r  
range_varsr$  r.  r  rA  r  r  ri   r=   r   r  r   r"   r   CandidateTilingcreate_partial_tilingr   stride_hintsr   
ValueErrorr  r8   r  is_good_sizer   )is_pointwiser  rwdep_sourcesr  depswrite_namesrv  tilingsr  splittiled_groupsry  r'  r(  reduction_rangess                r^   tile_ranges5SIMDScheduling.candidate_tilings.<locals>.tile_ranges	  s#    r}}%V4S8H	&6SS4 88RYY/K $??88E     %??88EEC88177#:#::  sI. E   %"))%D)3hh)%DEKH
  44(01<  G ''**77		2==Q7|s6{222
#MM!,q0EF+ ;76?;;; ! < $F6EN3#F6N3  ((22! "14V1E" 
 88{*QJE"//Q@@QJE"//Q@@QJE GG$$..ioofFV.W XX 
 NN'#&#<#<$3F6EN$C$3F6N$C!" !0$ #(!$
Q l N[ &E: " s0   ,,MM3MM1#MM
MMr|  )r  r   r   list[CandidateTiling])	r  r  r   "pointwise_or_reduction_read_writesr~  complete_partial_tilingr(  ry  ri   )r'  r   rl   r(  r  r  pointwise_rangespartial_tilingsr(  full_tilingsr  s   `  `      @r^   candidate_tilings SIMDScheduling.candidate_tilings	  s     '!+\	 \	| .2__->** !Q&$%*$%58H%HIII .2__->**% ,2B33LA
 *	
 * 22MM5/ ll[[ * 	 	
 	
s   ACc                    / SQ[        U5      * S nSS/S[        U5       n[        / [        X15      Q[        XB5      Q5      $ )z;
Create a tiling dict from pointwise and reduction splits.
)rR   rS   rT   NrU   rV   )r  r   r  )r'  	pw_tilingreduction_tilingpw_prefixesreduction_prefixess        r^   create_tilingSIMDScheduling.create_tiling
  sT     &s9~o&78#U^,Cc2B.CDVc+)VC0B,UV
 	
r`   c                R    U R                  U(       a  UO/ U(       d  U5      $ / 5      $ rX   )r  )r'  r(  r  s      r^   r  $SIMDScheduling.create_partial_tiling*
  s0       "F&F
 	
,.
 	
r`   c                    [        UR                  5       5      nSU;   nX#-  nU[        U5      -  /nU(       a  XG4OXt4nU R                  " U6 $ )zR
Given a tiling for only pointwise or reduction dimensions, adds the missing one.
rT   )r%  r   r8   r  )	r'  r(  rl   r(  splitsr  total_numelmissing_tilingtiling_argss	            r^   r  &SIMDScheduling.complete_partial_tiling5
  s^     fmmo&f}-%f(==> )5V$>:R 	   +..r`   c           
     
   US:H  n[         [        [        [        R                  4      " 5       n[
        R                  " U5       GH   n[        U[        R                  5      (       d  M%  UR                  5       nU(       d  [        US   5      S:X  a  MP  Xt(       a  SOS   nU/n	UR                  R                  5        V
s/ s H7  n
[        U
[        5      (       d  M  [        U
R                  5      S:  d  M5  U
PM9     nn
U GH  n
/ U
R                  R!                  5       Qn[        R"                  R$                  n[&        R(                  R*                  nSn[-        U5       H(  u  nu  nnUU-  nUnUR/                  X5      (       d  M(    O   UR1                  X5      (       d  M  US-   nU(       a  USU OUUS n/ nU H  u  nn[2        R4                  " U
R6                  U5      n[9        SUR;                  [<        5      UR;                  [>        5      -   [        U5      5      n[2        R@                  " UUUU5      nUb  US   OU/nURC                  U5        M     U Vs/ s HN  n[&        R(                  R*                  R1                  U[        R"                  R$                  5      (       a  ML  UPMP     nn[        U5      S:  d  GM  U	RE                  U5        GM     U	 H{  n[9        S[        U5      [G        S5      -
  5      nUS-   n[I        USU 5      nU4[K        UUS 5      -   n URM                  U RO                  U RQ                  U U5      UU5      5        M}     GM#     [S        U[        SS9n!U!$ s  sn
f s  snf )z
Creates N-dimensional tiling candidates, attempting to simplify loads/stores
by tiling the kernel into higher dimensions.

Returns a list of tilings ranked by dimensionality.
r>   r   Nr   T)r   reverse)*r   r   r   r   r(  rG   r  r   r   rO  r  r  r#  reads_and_writesr"   r  r{   r   r   r=   r   r   r  statically_known_geqr   r?   get_subexpr_involving_symbolr   r  rB  r   r   match_mod_div_block_exprr  r   r_   r8   r  r   r  r  rc  )"r'  r&  pointwise_numelr(  r  r  r   node_rangesranges_to_tilenode_tilingsr  memory_depsall_var_rangespointwise_vars_numelr   pointwise_end_idxr  _varrl   reduction_start_idxrk   index_tilingvarr   num_dimsmatch_resultdimsdimnode_tilingnum_leading_dimsfirst_trailing_dimcollapsed_leading_dimcollapsed_splitsranked_tilingss"                                     r^   get_nd_tilingsSIMDScheduling.get_nd_tilingsJ
  s    '!+^CO<=?#**=9DdI$;$;<< //+KCA$71$< )lBN*+L  ++<<>>Cc9- 25cjj/A2E >  
 # "73::#3#3#5!6',ww{{$77++$%!*3N*C&C$(E1((+%44,   +D  77(   '8!&;# $ ##7$78'(;(<=   "",JC/LL		3E
  #H-O0LLN+ H $7#O#OsE8$L /;.F<?UGD ''-% #-.  , +77++CCCU +    |$q( ''5w #|  ,#&q#k*:]1=M*M#N %5%9"(5kBUCU6V(W%$9#;e 2 34? $  //112BLQ''  ,i :J  
 ur s   M;M;9M;%AN 4N c                  ^^^^^^^^^^^^^ TR                   (       d  SOTR                   R                  mTR                  R                  mTR                  R                  mTR                  R
                  nT Vs/ s H  oeU   PM	     snmT Vs/ s H  oeU   PM	     snm[        R                  R                  R                  n[        R                  " U" [        T5      5      U" T5      :H  UUU4S j5        [        R                  " U" [        T5      5      U" T5      :H  UUU4S j5        0 m/ n   S       SUUUUUUUUU4	S jjjn	UR                  U	" SS9U	" SS945        T(       a  UR                  U	" T4SSS9U	" SS945        TTR                  R                  5       -  n
U
 H   nUR                  U	" U4SS9U	" SS945        M"     [!        S	S
9S	:X  a@  TS:X  a:  ["        R$                  " U
S5       H  nUR                  U	" USS9U	" SS945        M!     / nU H^  u  u  pu  nn['        U R)                  X5      [+        U5      [+        U5      -   S9nU R)                  UU5      nUR                  UU45        M`     U R)                  T/T/5      nSmSm[+        TR,                  R/                  5       5      mUUU4S jn[1        UUS9 H  u  nnU R3                  TTTUR4                  5      (       d  UR4                  U:X  a  [7        UR4                  5      TS:X  a  SOS-
  nU[!        S	S
9:  aE  [8        R;                  SU[        R<                  R>                  R@                  RB                  5        M  UR4                  U4s  $ UR4                  U:X  d  M  UR4                  U4s  $    US4$ s  snf s  snf )zb
Generates a tiling, and a score of each tile according to each tile's coalesced memory accesses.
Nc                    > T ST ST  3$ Nr   r   )r&  r  	pw_rangess   r^   r   8SIMDScheduling.compute_tiling_strategy.<locals>.<lambda>
  s    ykO#4B}oFr`   c                    > T ST ST  3$ r  r   )r&  
red_rangesr(  s   r^   r   r  
  s    zl"_$5RGr`   Fc                  >	 U(       a  TOTnU(       a  TOTnU(       d  U(       a  U// 4$ / / 4$ [        U 5      X4nTR                  U5      =n(       a  U$ U(       a  TOTn/ n/ n	Sn
Sn[        Xs5       GH  u  pX;  a"  X-  n
TR                  R                  US5      nM-  U(       a  UT:X  a  TR                  nUc   eUR
                  n[        XR
                  5      nUR                  U
U-  5        U	R                  UR                  5        UR                  U5        U	R                  TR                  R                  US5      5        Sn
SnM  X-  n
UR                  U
5        U	R                  TR                  R                  US5      5        Sn
GM"     U
S:w  d  U(       a1  [        U5      S:X  a"  UR                  U
5        U	R                  U5        [        [        U5      5       HQ  n[        R                  R                  R                  UU   SS9n[        US5      n[!        U	U   U-  S-  5      U	U'   MS     X4TU'   X4$ )zE
Generate a tiling, and a tiling score, given vars to use as splits.
r>   r   rp  fallbackrg  )r  r   r  coalesced_by_varsuggested_splittiling_factorr   r   ry  r  r  r=   r   r   r   rn  r   )vars_to_useuse_split_varr  r  target_numelr   r  splitting_varsr  split_scoresprodprev_var_coalesced_scorer  v_range
var_tilingtile	remainderrg  r   all_iter_varsall_red_varsr  r  r  r  r(  scored_sub_split
tiling_vars                      r^   process_node_varsASIMDScheduling.compute_tiling_strategy.<locals>.process_node_vars
  s<    #/YJF.:?L)NB//8O$mBC&**3//s/
.:]NFLD'($ ".9
'OD/@/Q/Q/U/U10,  Q*_!2!B!BJ%111%33D (2J2J KIMM$"23 ''
(8(89MM$' ''(9(J(J(N(NqRS(TUD/0,d###$5$F$F$J$J1a$PQ; :> qy\c&kQ.>d###$<= 3v;'GG$$66vay26N1I"%l1o&9A&=">Q (
 &,$:S!))r`   T)r  )r  r  r   r]   r>   r   )ry  gffffff?gGz?c                   > SnU S   R                   R                  5        H)  n[        R                  U5      (       d  UT-  nM$  UT-  nM+     TS-  nU S   R                  U-   * U-  $ )Ng      ?r   g?)r(  r   r~  r  ry  )r_  score_factor	tile_sizeuncoalesced_penalty"bad_size_additional_tiling_penaltygood_size_tiling_penaltytotal_uncoalesceds       r^   	score_mod9SIMDScheduling.compute_tiling_strategy.<locals>.score_modx  ss    LqT[[//1	&33I>>#/2T#TL#/2J#JL	 2 #4d":qTZZ"556EEr`   r   r   zmFound optimal tiling with %s tiles but torch._inductor.config.triton.max_tiles set to %s. Consider increasing)r   FF)r  ztuple[sympy.Expr, ...]r  r   r  r   r   ztuple[list[int], list[int]])"r  r  norm_read_writesr   reduce_varsrk   r=   r   r   r   rY   _checkr8   r   r  r  r_   rA  combinationsr~  r  rt  uncoalesced_addrsr   rc  tiling_is_compatibler(  r  perf_hint_loginforZ   r   r[   r\   ) r'  r&  r  r(  r  r  r  get_hintscore_splitr  overlapping_iter_varsr  r  pw_splitpw_score	red_split	red_score	candidater  default_tilingr  cand
tiling_lenr  r  r  r  r  r  r  r  r  s     ````                  @@@@@@@@@r^   compute_tiling_strategy&SIMDScheduling.compute_tiling_strategy
  s    %44 "2266 	 *::EE(99EE"33>>(561AY6	)56AQi6
 77##55]9-.(?2KKF	

 	]:./8O3LLG	
 DF  	
 35"'!&K	*/K	*K	* K	* )	K	* K	* K	*\ 	!t4!u5	
 %#T &59	 ->>CCEE 	 'A%qd>%59 ' #q(_-A(556KQO"")+DI)u=  P RT<G8 X"89i'!!(6(mc)n4I ,,XyALNNI|45 =H **O+<>OP .3*#(  1 C C J J LM	F #)i"@D,((!?OT[[  ;;.0 !-o6JPQR
a 88!&&9"..55??	 {{L00 {{n,{{L00/ #A2 t##K 76s   =N7N<c                `   ^^ [        T[        5      (       d   e[        UU4S jU 5       5      $ )Nc              3     >#    U  HW  n[        U[        R                  5      (       d  M$  [        R	                  TR                  5       UR                  5       TS 9v   MY     g7fr  )r   r   rO  r   r-  r   r  )r   r   r(  r(  s     r^   r   6SIMDScheduling.tiling_is_compatible.<locals>.<genexpr>  sR      
 &$	 7 78	J$$!2O %  &s
   #A"8A")r   r'  r  )r'  r&  rl   r(  r(  s      ``r^   r  #SIMDScheduling.tiling_is_compatible  s4     &$'''' 
 &	
 
 	
r`   c                L    U H  nU R                  XX55      (       d  M  Us  $    g rX   )r  )r'  r&  rl   r(  r  r(  s         r^   get_first_compatible_tiling*SIMDScheduling.get_first_compatible_tiling  s+     %F''oVV % r`   c                ,    U R                  XX45      S   $ r  )r  )r'  r&  rl   r(  r  s        r^   r  SIMDScheduling.select_tiling  s$     ((/

 	r`   c                   US:H  nU R                  U/U/5      n[        R                  " U5       H  n[        UR                  [
        R                  5      (       d  M.  UR                  R                  5       S:X  d  MN  [        R                  R                  (       d  Mo  UR                  5       nUS   n	US   n
U R                  X5      nUS4s  $    [        R                  R                  R                  R                  (       a8  U(       a1  [        R                  R                  (       d  U R!                  XX45      $ U(       d  [        R                  R"                  (       a  [%        SS9S::  a  [&        R(                  [*        R,                  ::  a  [        R                  " U5       Hq  n[        R                  R"                  (       a  M$  [/        U R1                  XrU5      5      S:  d  ME  [&        R3                  [4        R6                  " S5      5          US4$    US4$ [9        5       n[:        R<                  " 5       n[        R                  " U5       Hl  nU R1                  XrU5       HS  nUR>                  U;   a  M  UR>                  b  URA                  UR>                  5        X==   URB                  -  ss'   MU     Mn     URE                  5        VVs/ s H  u  pURF                  PM     nnn[%        SS9S:  aH  U(       aA        SS	 jn[I        S[/        U5      5       H  nU" US   UU   5      nUc  M  U/U-   n  O   [/        U5      S:  a  [&        R3                  S
U5        [        R                  R                  (       a  U RK                  XU5      U-   nU RM                  XUU5      =n(       a  US4$ US4$ s  snnf )z
Heuristics to decide how to tile kernels.
Currently, we tile based on stride-1 dimensions.

Returns:
    `(tile1, tile2, reduction_numel)` s.t. `tile1 * tile2 == numel`

r>   r2  r   Nr   r  z
                                Reduction over non-contiguous dims.
                                Consider setting config.triton.tile_reductions to True.
                                r   c                :   U S   U R                  SS5      p2US   UR                  SS5      pT[        X5/5      (       d/  [        R                  R                  R                  X5-
  5      S:X  a  g [        R                  R                  R                  X5-
  5      S:  a  XE4X#4su  p#u  pE[        R                  R                  R                  X5-
  5      S:  d   e[        R                  R                  R                  X55      (       d  g U[        X55      UU S   S.nU$ )NrT   rS   r>   r   rU   )rR   rS   rT   rU   )r   r   r=   r   r   r  r  r   )tiling0r  a0a1b0b1
new_tilings          r^   convert_tiling_to_3dBSIMDScheduling.get_tiling_and_scores.<locals>.convert_tiling_to_3d)  s     !w{{3':B w{{3':B *2(33ww''11"':a?77##--bg6:*,B8&HRhrww''11"':Q>>>ww''DDRLL !")"5>	
 "!r`   zpossibly bad tiling: %s)r  r  r  r  r   r  )'r  rG   r  r   r   r   rP  rQ  r   r[   rN  r  rY   rZ   r  prefer_nd_tilingr  tile_reductionsr_   r  levelloggingWARNINGr  r  r  textwrapdedentr   collectionsr   ri   r   ry  most_commonr(  r  r  r  )r'  r&  rl   r(  r  r  r  r   r  	range_y_xrange_rr(  
seen_namescandidate_tilescandidate_tilingry  r  r  rg  new_3d_tilings                       r^   r  $SIMDScheduling.get_tiling_and_scores  sl   " '!+ **E7_4EF $**=9D$))R%6%677II002e;333 #'//"3K +AI)!nG ..yBF!4<' :  OO""))BB!MM22..o  V]]%B%B}H
H ""goo5+22=AD"MM999 5 5d? STWXX%**$OO!$ !4'' B "4''&0l
4?4G4G4I#**=9D$'$9$9$$W #((J6%**6NN#3#8#8915E5K5KK1 %X : ,;+F+F+H7
+H'  ##+H 	 7

 #q(\"."9N"0"8 1c.12 4"1%~a'8! !,&3_~%EN 3 ~"8.I ==))""=I ! 
 44/>
 
6 
 4<t##A7
s   Oc                    g rX   r   rt   s    r^   flushSIMDScheduling.flush_  r  r`   c                    gr  r   rt   s    r^   ready_to_flushSIMDScheduling.ready_to_flushb  r  r`   c           	        [        S U 5       5      (       d  [        US S9R                  u  nu  pVU R                  XU5      nU R	                  XuU5      nU R                  U[        XuU5      S9n	U R                  Xy5        [        R                  " SU5         [        R                  " U	5         U	R                  5       n
S S S 5        S S S 5        OJUS   R                  U5      u  pn[        R                  " SU5         U R                  UUUSUS9n
S S S 5        W
R                  [!        ["        R$                  5      S	5      n
U
$ ! , (       d  f       N= f! , (       d  f       NJ= f! , (       d  f       N[= f)
Nc              3  @   #    U  H  oR                  5       v   M     g 7frX   )r  )r   r   s     r^   r   ASIMDScheduling.generate_kernel_code_from_nodes.<locals>.<genexpr>h  s     2Eq==??Er  c                4    [        U R                  5       5      $ rX   r  r  s    r^   r   @SIMDScheduling.generate_kernel_code_from_nodes.<locals>.<lambda>i  rL  r`   r   rP  rK  r   Tr7  rL  )rE  r  r  r>  r  r  rJ   rN  r   rS  r=   rQ  r  get_prologue_template_epiloguer@  rT  r   r5   rU  )rp   r   rK  r8  r  rl   r)  r&  r(  rn   r\  r  templateepilogues                 r^   generate_kernel_code_from_nodes.SIMDScheduling.generate_kernel_code_from_nodese  sZ    2E222!$U0O!P!V!VA 77fMM''fEF%%+M&I & F 22=I/1AB$$V,!002 - CB
 ,18+R+R,(H 02BC00&*"/ 1  D ##C(?(?$@)L% -, CB DCs0   E/E E?E(
E	E
E%(
E6c                    [         erX   rx  )rp   r\  r&  rn   s       r^   r  SIMDScheduling.define_kernel  r}  r`   r   )rg  N)rb  zOptional[OrderedSet[str]]r   ztuple[float, str]rX   )r   z!Sequence[scheduler.SchedulerNode]r  Optional[CoalesceVarAnalysis])r   z<Union[scheduler.FusedSchedulerNode, scheduler.SchedulerNode])rl   r   r  zGIterable[Union[ir.Buffer, ir.TensorBox, ir.TorchBindObject, ir.IRNode]]r   r   )F)rn   r   r&  list[NodeScheduleEntry]rb  r   r   r   )rV  rJ   )rV  rJ   r   zlist[SIMDKernel])r   r   r   tuple[tuple[int, ...], ...])r   r   r   r   )r   r   r2  r   r   r7  )r8  r   r   Optional[str])r[  zlist[BaseSchedulerNode]r\  r   rR  r   rS  r   rb  r   r   z$list[tuple[Optional[str], Any, Any]])r   r  )r  r  r  r  r   immutable_dict[str, sympy.Expr])r(  r  r  r   r   r9  )r(  r  rl   r   r(  r   r   r9  )r   z%list[immutable_dict[str, sympy.Expr]])
r&  r6  r  r   r(  r   r  rN   r   =tuple[dict[str, sympy.Expr], Optional[dict[str, sympy.Expr]]])r&  r6  rl   r   r(  r   r(  r  )r&  r6  rl   r   r(  r   r  zlist[dict[str, sympy.Expr]])r  r5  r   r  )r  r5  r   r:  r   )FN)r8  r   )7r   r   r   r   r   r   r  r"  r  r  can_fuse_verticalr  r>  rC  rH  r]  rc  rB  r  r  r  r  r  r  rM  rN  r  r"  r-  r5  r@  rE  rf  rn  r  r   r   r  r  r  r  r  r  r  r  r   r   r   r  r  r$  r'  r1  r  r   r   r`   r^   r  r    s   
 (K'Q`6D !"^@8
%2)j RV"5N"	"
W)x <@
0
 9
(=P=2 $$
$
 
$ $T #(	@@ /@  	@
 
@"Q)f
1
	
 -T  M^'	$,

'
/2
	$
.  '+n %n 
n`I #(i 0i   $i  	i 
 i   i  
.i V)( }  }~ 

,

@T

	(

 

 
$
 
 
)	
 
 /%/ / $	/
 
)/ /( y
 
/y yv Y$.Y$ $Y$ $	Y$
 /Y$ 
GY$ Y$v 
.
 
 $	

 &
 
  .  $	
 4  
 ;?	
 9	 
	 	 
 ;?O$
 9O$ 
GO$ O$b MQ <I D"r`   r  T)frozenc                  H    \ rS rSr% S\S'   S\S'   SrS\S'   \S	 5       rS
rg)r~  i  r  r(  r   ry  Nr8  ri   c                z    [         R                  R                  R                  U SS9n U S:  =(       a    U S-  S:H  $ )z@Somewhat arbitrary heuristic used to boost scores for some sizesi    r  rp  r   )r=   r   r   r   )r   s    r^   r  CandidateTiling.is_good_size  s:     GG..q4.@Bw(AFaK(r`   r   )	r   r   r   r   r"  ri   r  r  r   r   r`   r^   r~  r~    s)    !!JD-) )r`   r~  c                  .   ^  \ rS rSrU 4S jrS rSrU =r$ )r  i  c                :   > [         TU ]  5         Xl        X l        g rX   )rg   rh   r   r   )rp   r   r   rq   s      r^   rh   CantSplit.__init__  s    	"r`   c                8    U R                    SU R                   3$ )Nz not divisible by r   r   rt   s    r^   __str__CantSplit.__str__  s    )).t~~.>??r`   rD  )r   r   r   r   rh   rE  r   r   r   s   @r^   r  r    s    #
@ @r`   r  )r   )r]   r   r   r   )r  r  r   r   )
__future__r   r  r  dataclassesr   rA  r  r  r  r  r   typingr   r   r   r   r	   r
   typing_extensionsr   r   rY   torch._loggingtorch._inductorr   torch._inductor.irr   torch._inductor.tiling_utilsr   %torch.fx.experimental.symbolic_shapesr   torch.fx.immutable_collectionsr   torch.utils._ordered_setr   torch.utils._sympy.functionsr   r   r   torch.utils._sympy.symbolr   r   r   r   _dynamo.utilsr    r   r   r   analyze_preserves_zero_maskr   	codecacher    r!   dependenciesr"   r#   r$   collections.abcr%   r'   optimize_indexingr(    runtime.coordinate_descent_tunerr)   runtime.hintsr*   runtime.runtime_utilsr+   r,   r-   r.   r/   r0   utilsr1   r2   r3   r4   r5   r6   r7   r8   r9   r:   virtualizedr;   r<   r=   block_analysisr?   commonr@   rA   rB   rC   r?  rD   rE   simd_kernel_featuresrF   rG   rH   rI   rJ   rK   rL   rM   rN   	getLoggerr   r  _logginggetArtifactLoggerr  r  
fusion_logdoprintr  r  r_   	dataclassrb   r   r   r  r  r  r$  r   r  r~  	Exceptionr  r   r`   r^   <module>rj     s   "           K K %    # 2 B G 9 / L L  & $ $ F . 6 6 ( A < , L L D D   - , / P P :  <<@ !00<H~~//*E^^--hA
 	78;
 3+ 3+ 3+lH;/ H;V;'? ;'| +;T   
"z 
"G('/*B GTz"^ z"z; d#	) 	) $	)@	 @r`   