
    N jGx                   	   S SK Jr  S SKrS SKrS SKrS SKrS SKrS SKrS SKrS SK	r	S SK
r
S SKrS SKrS SKrS SKJr  S SKJrJr  S SKJrJrJrJr  S SKrS SKJr  S SKrS SKrS SKJs  Jr  S SKJ r!  S SK"J#r#J$r$J%r%  S S	K&J'r'  S S
K(J)r)  S SK*J+r+  S SK,J-r-J.r.  S SK/J0r0  S SK1J2r2J3r3J4r4J5r5J6r6  S SK7J8r8  S SK9J:r:  S SK;J<r<  S SK=J>r>J?r?  SSK@JArAJBrBJCrC  SSKDJErE  SSKCJFrFJGrG  SSKHJIrI  SSKJJKrK  SSKJLrLJMrMJNrNJOrOJPrPJQrQJRrRJSrSJTrTJUrUJVrVJWrW  SSKXJYrY  SSKZJ[r[J\r\J]r]J^r^J_r_J`r`  SSKaJbrb  SSKcJdrd  SSKeJfrfJgrgJhrh  \(       a$  S S KJiriJjrj  S SKkrkSS!KlJmrm  SS"KCJnrn  SS#KoJprp  SS$KqJrrr  \R                  " \t5      ru\^" 5       R                  rw\x\R                  \R@                  \z\{4   r|\x\R                  \R@                  \zS%\z4   r}\\CR                  \_4   r\S&/S4   rSgS' jrShS( jrSiS) jrS* r\\z\4   r\\x\\\R2                  4   S+4   \\/\x\S+4   4   4   r  Sj           SkS, jjrSlS- jr\GR                   " S. S/5      5       r " S0 S15      r " S2 S&5      r\GR                   " S3 S4\5      5       r\GR                   " S5 S6\5      5       r\GR                   " S7 S8\5      5       r\GR                   " S9 S:\5      5       r\GR                   " S; S<\5      5       r\GR                   " S= S>\5      5       r " S? S@\5      r\GR                   " SA SB\5      5       r\GR                   " SC SD\5      5       r\GR                   " SE SF\5      5       r\GR                   " SG SH\5      5       r\GR                   " SI SJ\5      5       r\GR                   " SK SL\5      5       r " SM SN5      r\GR                   " SO SP\5      5       r\GR                   " SQ SR\5      5       r\GR                   " SS ST\5      5       r\GR                   " SU SV\5      5       r " SW SX\5      r\GR                   " SY SZ\5      5       r\GR                   " S[ S\\5      5       r\GR                   " S] S^\5      5       r\GR                   " S_ S`\5      5       r\GR                   " Sa Sb\5      5       r\zr\\\S4   r " Sc Sd\\5      r " Se Sf\5      rg)m    )annotationsN)Callable)chaincount)AnyOptionalTYPE_CHECKINGUnion)Expr)dtype)countersdynamo_timedget_debug_dir)DebugPrinterManager)MultiKernelState)	cache_dir)get_opaque_obj_repris_opaque_value_type)trace_structured)CallMethodKeyConvertIntKeyDivideByKeyresolve_unbacked_bindingsSymTypes)_get_qualified_name)
OrderedSet)SingletonInt)symbol_is_typeSymT   )async_compileconfigir)output_code_log)IRNodeReinterpretView)triton_heuristics)DeviceProperties)cache_on_selfDelayReplaceLineget_benchmark_nameget_dtype_sizeIndentedBuffer#is_codegen_graph_partition_subgraphis_using_cudagraph_partitionLineContextsympy_product	sympy_str
sympy_substriton_version_uses_attrs_dict)V   )ArgNameCodeGenDeferredLinePythonPrinterWorkspaceArgWorkspaceZeroMode)cexpr)CUSTOM_EXTERN_KERNEL_CODEGEN)	config_ofshould_unwrap_unspec_argsignature_to_meta)IteratorSequence)GraphLowering)ExternKernel)BaseSchedulerNode)FxConverterzir.CommBufferTypeWrapperLinec                :   [         R                  R                  U 5      nU R                  5       [         R                  R                  ;  nU R                  5       U R                  5       [        [         R                  R                  R                  U5      5      U4$ N)
r5   graphget_allocation_storage_sizeget_nameunaligned_buffersget_device_or_error	get_dtyper2   sizevarssimplify)nodestorage_size	alignments      p/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/torch/_inductor/codegen/wrapper.pybuffer_reuse_keyrW   d   sr    7766t<Lqww'@'@@I  " 	!''""++L9:     c                p   [         R                  R                  U 5      nU R                  5       n[	        U[
        R                  5      (       d   eU R                  5       U R                  5       [        [         R                  R                  R                  U5      5      UR                  UR                  4$ rJ   )r5   rK   rL   get_output_spec
isinstancer#   CommBufferLayoutrO   rP   r2   rQ   rR   comm_buffer_type
group_name)rS   rT   layouts      rV   comm_buffer_reuse_keyr`   r   s    7766t<L!!#Ffb112222  "!''""++L9: rX   c                   U R                  5       UR                  5       :w  a  gU R                  5       UR                  5       :w  a  g[        R                  R                  R                  [        R                  R                  U 5      5      n[        R                  R                  R                  [        R                  R                  U5      5      n[        U5      [        U5      :X  d`  [        R                  R                  R                  USU-  5      (       a/  [        R                  R                  R                  X25      (       a  gg)NFgffffff?T)
rO   rP   r5   rK   rQ   rR   rL   r2   statically_known_geqstatically_known_leq)	input_buf
output_buf
input_sizeoutput_sizes       rV   can_match_buffer_sizerh      s     $$&**H*H*JJ
 4 4 66!!**	++I6J ''""++	++J7K 	*;!77 	
--k4*;LMMGG11+JJrX   c                H   [        U [        R                  5      (       a?  U R                  5       nUR                  UR
                  UR                  UR                  S4$ / nU n[        U[        R                  [        R                  [        R                  45      (       aq  UR                  5       nUc  gUR                  U5        UR                  n[        U[        R                  [        R                  [        R                  45      (       a  Mq  [        U[        R                  5      (       d  gU H-  nUR                  UR                  5       R                  :w  d  M-    g   UR                  5       nUR                  UR
                  UR                  UR                  S4$ )z
Collapse a chain of ReinterpretView <- StorageBox
<- ReinterpretView <- StorageBox.... <- buffer wrappers if every layer
has the same offset as the innermost (base) buffer.

Returns:
    (size, stride, offset, dtype, collapsible: bool)
T)NNNNF)r[   r#   Buffer
get_layoutsizestrideoffsetr   	TensorBox
StorageBoxr&   appenddata)rr   laylayoutscurbase_lays        rV   codegen_reinterpret_view_helperrw      s(    $		""ooxxSZZD@@G
C
S2<<8J8JK
L
Lnn;0shh S2<<8J8JK
L
L c299%%, ::)0000  ~~H==(//8??HNNDPPrX   .c                R  ^ ^^^ [        5       mSS jm S   SUU4S jjjnSSU UU4S jjjnST  3nU" SU S35        T(       a9  [        R                  R                  (       a  TR                  R                  5       O[        R                  " 5       nTR                  5          U   [        R                  R                  (       ab  U(       a[  [        R                  R                  (       a<  U[        R                  R                  ;   a  [        R                  R                  U   n	OS /[        U5      -  n	[        U5      S:X  a!  U" US   U	S   5      u  pU" S	U
 3S	U 35        O[        U5      S:  d   e[        U5      [        U5      :X  d   e[        5       n[        [        X!U	5      S
 SS9 H  u  pn/ nUR                  (       a?  UR                   H/  nUS;  d  M  UR!                  SU SUR                  U    35        M1     U(       a  SR#                  U5      nOSnU" X5      u  pSU SU
 3nUU;   a  M  UR%                  U5        U" USU SU 35        M     S S S 5        S S S 5        UTR'                  5       4$ ! , (       d  f       N(= f! , (       d  f       N1= f)Nc                p    [        U [        R                  5      (       a  U $ [        R                  " U 5      $ rJ   )r[   sympyr   Integer)items    rV   _convert_to_sympy_expr@user_defined_kernel_grid_fn_code.<locals>._convert_to_sympy_expr   s&    !$

33tLt9LLrX   c                  > Tb  [        U 5      (       a  X 4$ [        U4S jU  5       5      nU(       d  UnTR                  U5      [        R                  R
                  (       a%  TR                  [        U4S jU 5       5      5      4$ S4$ )z
This function return a tuple of two values: the first one is for the real grid
which is used in the generated code; the second one is an example grid with
concreate values which is used in the autotune block to run the generated
kernels at compile time.
Nc              3  4   >#    U  H  nT" U5      v   M     g 7frJ    ).0gr}   s     rV   	<genexpr>Kuser_defined_kernel_grid_fn_code.<locals>.determine_grid.<locals>.<genexpr>   s     Cd1!44ds   c              3  Z   >#    U  H   nTR                  U[        U5      5      v   M"     g 7frJ   generate_example_arg_valuetype)r   r   wrappers     rV   r   r      s,      !-A  ::1d1gFF!-   (+)callabletuplecodegen_python_shape_tupler"   tritonautotune_at_compile_time)gridexample_grid
sympy_gridr}   r   s      rV   determine_grid8user_defined_kernel_grid_fn_code.<locals>.determine_grid   s     ?htnn:CdCC
%L..z: ==99 22 !- 
 	
 
 	
rX   c                   > TR                  U 5        T(       aV  [        R                  R                  (       a6  TTR                  ;  a%  TR
                  R                  U=(       d    U 5        g g g g rJ   )	writeliner"   r   r   kernel_autotune_nameskernel_autotune_calls)liner   nameoutputr   s     rV   r   3user_defined_kernel_grid_fn_code.<locals>.writeline   sW    66G999))33L4HDI : 7 rX   grid_wrapper_for_def z(meta):r6   r   zreturn c                2    [        U S   R                  5      $ Nr6   lenkwargsxs    rV   <lambda>2user_defined_kernel_grid_fn_code.<locals>.<lambda>  s    c!A$++.rX   Tkeyreverse)matrix_instr_nonkdimwaves_per_eukpackzmeta['z'] == z and Trueif z	: return )r|   Union[int, sympy.Expr]return
sympy.ExprrJ   )r   
TritonGridr   zOptional[TritonGrid])r   strr   Optional[str])r-   r"   r   r   r   indent
contextlibnullcontextr5   rK   autotuning_gridsr   r   sortedzipr   rq   joinaddgetvalue)r   configsgridsr   original_fxnode_namer   r   fn_namekernel_autotune_calls_indentexample_gridsr   r   seenc
guardslistkwargguards	statementr}   r   s   `  `              @@rV    user_defined_kernel_grid_fn_coder      sb    FM
 .2

*
 
>J J "$(GWIW%& v}}== 	%%,,.##% !
 
6MM22$(($(@(@@GG445IJM!FSZ/Mu:?!/a-:J!KDv&',(@Au:>!>u:W---$.LD *0EM2.*%
  
88!"  ) 
 '--ugVAHHUOCT.UV "* $\\*5F#F%3D%G"!&4&9	$#)s6()L>%JK1*- 7` FOO%%%a 76s,   %J(D8J$BJ%J
J	J
J&c                   ^^^^^^ [        5       mTR                  U R                  SS9  SSKmSSKJm  SSKJm  [        U R                  /5      mUUUUUU4S jmT" U 5        TR                  5       $ )z[
Given a triton kernel function pointer collect the transitive closure of
its dependencies
Tstripr   N)JITFunction)	constexprc           	     \  > [        S [        R                  " U R                  5       5       5      nU R                  R                  R                  S0 5      nU R                  R                  R                   GH  nUT;   a  M  X0R                  R                  ;   d  M'  U R                  R                  U   n[        UT5      (       aV  T	R                  5         T	R                  S5        T	R                  UR                  SS9  TR                  U5        T" U5        M  [        TS5      (       a  [        UTR                  R                   R"                  5      (       aW  T	R                  5         T	R                  S5        T	R                  UR                  SS9  TR                  U5        T" U5        GM>  [        U[$        [&        [(        T
45      (       a  T	R                  5         [        UT
5      (       a  SUR*                  < S	3nOU< nUR                  U5      =n(       aQ  [        U[,        5      (       a  S
UR.                   SUR0                   3nOS
U< 3nT	R                  U U SU 35        OT	R                  U SU 35        TR                  U5        GM)  X1;   d  GM1  US:w  d  GM:  [        US5      (       d  GMN  UR.                  R3                  S5      (       d  GMq  T	R                  SUR.                   SUR0                   SU 35        TR                  U5        GM     g )Nc              3  ^   #    U  H#  nUR                   S :X  d  M  UR                  v   M%     g7f)LOAD_GLOBALN)opnameargval)r   insts     rV   r   ^user_defined_triton_kernel_transitive_closure_source_code.<locals>.traverse.<locals>.<genexpr>K  s)      '
3{{m+ DKK3s   --__annotations__z@triton.jitTr   constexpr_functionz@triton.constexpr_functionztl.constexpr(): . = tl
__module__r   zfrom z import z as )r   disBytecodefn__globals__get__code__co_namesr[   newliner   splicesrcr   hasattrruntimejitConstexprFunctionintr   boolvaluer   r   __name__
startswith)
cur_kernelunqualified_loadsglobal_annotationssymbol_namesymbol
symbol_str
annotationannotation_coder   compile_wrapperr   symbols_includedtraverser   s           rV   r   Kuser_defined_triton_kernel_transitive_closure_source_code.<locals>.traverseF  s   
 ' '
Z]]3'
 

 (]]66::;LbQ%==11::K..mm777#22;?fk22#++-#--m<#**6::T*B$((5V$V%9::zNN&&88@ @ $++-#--.JK#**6::T*B$((5V$c4(CDD#++-!&)44'4V\\4DA%F
(.z
%7%;%;K%HHzH%j$77"$Z%:%:$;1Z=P=P<Q R , 13:..AO'11*mO+<C
|L (11[MZL2QR$((54#t+55 ))44X>>
 $-- 1 12(6??:K4P[}] %((5o ;rX   )
r-   r   r   r   r   triton.languager   r   r   r   )kernelr   r   r   r   r   r   s    @@@@@@rV   9user_defined_triton_kernel_transitive_closure_source_coder  6  si    
 %&O6::T2 ") "6??"34B6 B6H V##%%rX   c                  0    \ rS rSr% S\S'   S\S'   S rSrg)	SymbolicCallArgi  sympy.Symbolinnerr   
inner_exprc                ,    [        U R                  5      $ rJ   )r   r  selfs    rV   __str__SymbolicCallArg.__str__  s    4::rX   r   N)r   r   __qualname____firstlineno__r   r  __static_attributes__r   rX   rV   r  r    s    rX   r  c                  p   ^  \ rS rSrU 4S jrS
S jrSS jrSS jrSS jrSS jr	      SS jr
S	rU =r$ )MemoryPlanningStatei  c                   > [         TU ]  5         [        R                  " [        5      U l        [        R                  " [        5      U l        SU l        g Nr   )super__init__collectionsdefaultdictlist
reuse_poolcomm_buffer_reuse_pooltotal_allocated_buffer_size)r
  	__class__s    rV   r  MemoryPlanningState.__init__  sG     ##D) 	 ##D) 	# 12(rX   c                L    [        U R                  R                  US 5      5      $ rJ   )r   r  r   r
  r   s     rV   __contains__ MemoryPlanningState.__contains__  s    DOO''T233rX   c                f    U R                   U   R                  5       nUR                  (       a   eU$ rJ   )r  pop	is_reusedr
  r   r|   s      rV   r#  MemoryPlanningState.pop  s+    s#'')>>!!rX   c                f    UR                   (       a   eU R                  U   R                  U5        g rJ   )r$  r  rq   r%  s      rV   pushMemoryPlanningState.push  s&    >>!!##D)rX   c                L    [        U R                  R                  US 5      5      $ rJ   )r   r  r   r  s     rV   comm_buffer_contains(MemoryPlanningState.comm_buffer_contains  s     D//33C>??rX   c                f    U R                   U   R                  5       nUR                  (       a   eU$ rJ   )r  r#  r$  r%  s      rV   comm_buffer_pop#MemoryPlanningState.comm_buffer_pop  s-    **3/335>>!!rX   c                f    UR                   (       a   eU R                  U   R                  U5        g rJ   )r$  r  rq   r%  s      rV   comm_buffer_push$MemoryPlanningState.comm_buffer_push  s*     >>!!##C(//5rX   )r  r  r  )r   ReuseKeyr   r   )r   r3  r   FreeIfNotReusedLine)r   r3  r|   r4  r   None)r   CommBufferReuseKeyr   r   )r   r6  r   r4  )r   r6  r|   r4  r   r5  )r   r   r  r  r  r   r#  r(  r+  r.  r1  r  __classcell__r  s   @rV   r  r    sD    
24
*@
6%6-@6	6 6rX   r  c                      \ rS rSrSS jrSrg)rH   i  c                0    [        S[        U 5       35      e)Nz&FX codegen not yet supported for type )NotImplementedErrorr   r
  	converters     rV   
codegen_fxWrapperLine.codegen_fx  s    !$J4PT:,"WXXrX   r   Nr=  rG   r   FxConversionFuncr   r   r  r  r>  r  r   rX   rV   rH   rH     s    YrX   c                  H    \ rS rSr% S\S'   S\S'   SS jrSS jrSS jrS	rg
)EnterSubgraphLinei  PythonWrapperCodegenr   rD   rK   c                b    U R                   R                  U R                   R                  5        g rJ   )r   push_computed_sizescomputed_sizesr	  s    rV   __post_init__EnterSubgraphLine.__post_init__  s    (()D)DErX   c                n    U R                   R                  U R                  5        UR                  5         g rJ   )r   push_codegened_graphrK   	do_indentr
  codes     rV   codegenEnterSubgraphLine.codegen  s"    ))$**5rX   c                    UR                   $ rJ   )_generate_enter_subgraphr<  s     rV   r>  EnterSubgraphLine.codegen_fx  s    111rX   r   Nr   r5  rO  r-   r   r5  r@  	r   r   r  r  r   rI  rP  r>  r  r   rX   rV   rD  rD    s    !!F2rX   rD  c                  H    \ rS rSr% S\S'   S\S'   S
S jr\SS j5       rSrg	)ConditionalLinei  rE  r   zir.ConditionalrS   c                    [        S5      e)NzOnly supports FX codegen)r;  rN  s     rV   rP  ConditionalLine.codegen  s    !"<==rX   c                    U R                   $ rJ   )_generate_conditionalr=  s    rV   r>  ConditionalLine.codegen_fx  s    ...rX   r   NrV  r@  	r   r   r  r  r   rP  staticmethodr>  r  r   rX   rV   rY  rY    s'    !!
> / /rX   rY  c                  >    \ rS rSr% S\S'   SS jr\S	S j5       rSrg)
CommentLinei  r0   r   c                :    UR                  U R                  5        g rJ   )r   r   rN  s     rV   rP  CommentLine.codegen  s    tyy!rX   c                    U R                   $ rJ   )_generate_commentr^  s    rV   r>  CommentLine.codegen_fx  s    ***rX   r   NrV  r@  r`  r   rX   rV   rc  rc    s!    
" + +rX   rc  c                  H    \ rS rSr% S\S'   S\S'   S
S jr\SS j5       rSrg	)DynamicScalarLinei  rE  r   zir.DynamicScalarrS   c                N    U R                   R                  U R                  5        g rJ   )r   _codegen_dynamic_scalarrS   rN  s     rV   rP  DynamicScalarLine.codegen  s    ,,TYY7rX   c                    U R                   $ rJ   )_generate_dynamic_scalarr^  s    rV   r>  DynamicScalarLine.codegen_fx  s    111rX   r   NrV  r@  r`  r   rX   rV   rj  rj    s'    !!
8 2 2rX   rj  c                  >    \ rS rSr% S\S'   S	S jrS
S jrSS jrSrg)ExitSubgraphLinei  rE  r   c                V    U R                   R                  5       U R                   l        g rJ   )r   pop_computed_sizesrH  r	  s    rV   rI  ExitSubgraphLine.__post_init__  s    &*ll&E&E&G#rX   c                X    U R                   R                  5         UR                  5         g rJ   )r   pop_codegened_graphdo_unindentrN  s     rV   rP  ExitSubgraphLine.codegen  s    ((*rX   c                    UR                   $ rJ   )_generate_exit_subgraphr<  s     rV   r>  ExitSubgraphLine.codegen_fx  s    000rX   r   NrU  rV  r@  rW  r   rX   rV   rr  rr    s    !!H1rX   rr  c                  >    \ rS rSr% S\S'   S\S'   S
S jrSS jrSrg	)EnterDeviceContextManagerLinei
  r   
device_idxOptional[int]last_seen_device_guard_indexc                   [         R                  R                  (       Ga  UR                  S5        [         R                  R                  (       aj  U R
                  c;  UR                  [         R                  R                  R                  5        S35        g U R
                  U R                  :X  d   S5       eg U R
                  cH  UR                  [         R                  R                  R                  5        SU R                   S35        g UR                  SU R                   S35        g UR                  S[         R                  R                  R                  U R                  5       S35        UR                  5         UR                  [         R                  R                  R                  U R                  5      5        g )	N
z) stream_guard(stream, this->device_idx_);z4AOTInductor only supports running on one CUDA devicez device_guard(z);zdevice_guard.set_index(with :)r5   rK   cpp_wrapperr   aot_moder  
device_opscpp_aoti_stream_guardr  cpp_aoti_device_guarddevice_guardrM  
set_devicerN  s     rV   rP  %EnterDeviceContextManagerLine.codegen  sM   77NN4 ww 44<NN77--CCEFFop  <<O NO 44<NN77--CCEFnUYUdUdTeegh NN%<T__<MR#PQ NNU177#5#5#B#B4??#S"TTUVWNNNN177--88IJrX   c                    UR                   $ rJ   )&_generate_enter_device_context_managerr<  s     rV   r>  (EnterDeviceContextManagerLine.codegen_fx,  s    ???rX   r   NrV  r@  r   r   r  r  r   rP  r>  r  r   rX   rV   r~  r~  
  s    O"//K:@rX   r~  c                  (    \ rS rSrSS jrSS jrSrg)ExitDeviceContextManagerLinei0  c                d    [         R                  R                  (       d  UR                  5         g g rJ   )r5   rK   r  rx  rN  s     rV   rP  $ExitDeviceContextManagerLine.codegen1  s     ww"" #rX   c                    UR                   $ rJ   )%_generate_exit_device_context_managerr<  s     rV   r>  'ExitDeviceContextManagerLine.codegen_fx5  s    >>>rX   r   NrV  r@  )r   r   r  r  rP  r>  r  r   rX   rV   r  r  0  s    ?rX   r  c                  >    \ rS rSr% S\S'   S\S'   S
S jrSS jrSrg	)ExternKernelAllocLinei9  rE  r   ir.ExternKernelAllocrS   c                    U R                   n/ UR                  5       QUR                  5       QnU R                  R	                  U R                   U5        g rJ   )rS   codegen_argscodegen_kwargsr   $_generate_extern_kernel_alloc_helper)r
  rO  rS   argss       rV   rP  ExternKernelAllocLine.codegen>  sD    yy=""$=t':':'<=99$))TJrX   c                    UR                   $ rJ   )_generate_extern_kernel_allocr<  s     rV   r>   ExternKernelAllocLine.codegen_fxC  s    666rX   r   NrV  r@  r  r   rX   rV   r  r  9  s    !!
K
7rX   r  c                  >    \ rS rSr% S\S'   S\S'   S
S jrSS jrSrg	)ExternKernelOutLineiG  rE  r   ir.ExternKernelOutrS   c           	     ^   U R                   n/ UR                  5       QUR                  SS9QnUR                  5       n[        R
                  R                  (       a  UR                  S:X  a  SnOUR                  5       nUR                  5       =n(       a  UR                  O[        R
                  R                  nU R                  R                  UUR                  5       UR                  (       a  UR                  R                  5       OS UUU R                   R                  5       5        g )NT)skip_outztorch::inductor::_mm_plus_mmaoti_torch__mm_plus_mm_out)rS   r  r  get_kernel_namer5   rK   r  cpp_kernel_name
get_devicer   device_typer   "_generate_extern_kernel_out_helpercodegen_referenceoutput_viewget_stack_traces)r
  rO  rS   r  kernel_nameddevices          rV   rP  ExternKernelOutLine.codegenL  s    yyJ""$Jt':':D':'IJ**,GG$$(FF 7K..0K!%!22A29L9L77""$484D4DD..0$II&&(	
rX   c                    UR                   $ rJ   )_generate_extern_kernel_outr<  s     rV   r>  ExternKernelOutLine.codegen_fxb      444rX   r   NrV  r@  r  r   rX   rV   r  r  G  s    !!

,5rX   r  c                  >    \ rS rSr% S\S'   S\S'   S
S jrSS jrSrg	)FreeLineif  rE  r   %Union[BufferLike, ir.TorchBindObject]rS   c                    U R                   R                  5       [        R                  R                  ;  d   eUR                  U R                  R                  U R                   5      5        g rJ   )rS   rM   r5   rK   removed_buffersr   r   make_buffer_freerN  s     rV   rP  FreeLine.codegenk  sF    yy!!#177+B+BBBBt||44TYY?@rX   c                    UR                   $ rJ   )_generate_freer<  s     rV   r>  FreeLine.codegen_fxo      '''rX   r   NrV  r@  r  r   rX   rV   r  r  f  s    !!
//A(rX   r  c                      \ rS rSr% S\S'   S\S'   S\S'   S\S'   S\S	'   S
\S'   S\S'   S\S'   S\S'   S\S'   S\S'   S\S'   SS jrSS jrSrg)KernelCallLineis  rE  r   r   r  ztuple[Any, ...]	call_argsraw_keysraw_args	list[str]	arg_typesr   r   zdict[str, Any]triton_metazOptional[dict[str, Any]]inductor_metaztorch.devicer  
graph_namer   c                &   U R                   R                  U R                  U R                  U R                  U R
                  U R                  U R                  U R                  U R                  U R                  U R                  U R                  S9  g )N)	r   r  r  r  r  r  r  r  r   )r   _generate_kernel_call_helperr  r  r   r  r  r  r  r  r  r  r   rN  s     rV   rP  KernelCallLine.codegen  sn    11NN;;nn]]]]((,,;;!%!:!: 	2 	
rX   c                    UR                   $ rJ   )_generate_kernel_callr<  s     rV   r>  KernelCallLine.codegen_fx      ...rX   r   NrV  r@  r  r   rX   rV   r  r  s  sR    !!L++O
/rX   r  c                  r    \ rS rSr% S\S'   S\S'   S\S'   SrS\S	'   S
rS\S'   SrS\S'   SS jrSS jr	Sr
g)KernelDefinitionLinei  rE  r   r   r  kernel_bodyNr   metadataTr   gpucpp_definitionc                    U R                   R                  U R                  U R                  U R                  U R
                  U R                  S9  g N)r  r  r  )r   _define_kernel_helperr  r  r  r  r  rN  s     rV   rP  KernelDefinitionLine.codegen  sB    **]].. 	+ 	
rX   c                    UR                   $ rJ   )_generate_kernel_definitionr<  s     rV   r>  KernelDefinitionLine.codegen_fx  r  rX   r   rV  r@  )r   r   r  r  r   r  r  r  rP  r>  r  r   rX   rV   r  r    s<    !!"Hm"C$(NM(
5rX   r  c                  >    \ rS rSr% S\S'   S	S jrS
S jrSS jrSrg)MemoryPlanningLinei  rE  r   c                    U $ )zFirst pass to find reuser   r
  states     rV   planMemoryPlanningLine.plan  s    rX   c                    g)zSecond pass to output codeNr   rN  s     rV   rP  MemoryPlanningLine.codegen  s    rX   c                |   / n[         R                  " U 5       Hw  nUR                  S:X  a  M  [        XR                  5      nUR	                  UR                   SUR
                  [        R                  L a  UR                  5       OU 35        My     [        U 5      R                   SSR                  U5       S3$ )z6
Emits a string representation that fits on one line.
r   =(, r   )dataclassesfieldsr   getattrrq   r   r#   rj   rM   r   r   )r
  r  fieldvals       rV   r  MemoryPlanningLine.__str__  s      ''-EzzY&$

+CKK::,a%**		2IsST	 . t*%%&a		$'8::rX   r   Nr  r  r   r  rV  r   r   )	r   r   r  r  r   r  rP  r  r  r   rX   rV   r  r    s    !!);rX   r  c                  8    \ rS rSrS rSS jrS	S jrS	S jrSrg)
EfficientPeakEstimatei  c                   SSK JnJn  [        R                  R
                  R                  n[        [        R                  R                  R                  5       5      n[        [        R                  R                  5       5      nU" X45      nU" UUU5      u  U l        nSSKJn  U" U[        R                  [         S5      U l        g )Nr    )estimate_peak_memoryget_freeable_input_bufr6   )SegmentedTreer   )memoryr  r  r5   rK   	schedulernodesr   graph_inputskeysget_output_namesoverall_peak_memorysegmented_treer  operatorr   max)	r
  r  r  scheduler_nodesr  graph_outputsnames_to_freeable_bufspeak_by_scheduler_noder  s	            rV   r  EfficientPeakEstimate.__init__  s    I''++11!!''"6"6";";"=>"177#;#;#=>!7!V;O"<
8 "8 	2+"HLL#q
rX   c                    [         R                  R                  R                  [         R                  R	                  U5      SS9[        UR                  5       5      -  $ )Nr   fallback)r5   rK   rQ   	size_hintrL   r,   rP   r
  rS   s     rV   	_get_sizeEfficientPeakEstimate._get_size  sL    ww))GG//5 * 
4>>+,- 	-rX   c                n    U R                   R                  UR                  S-   UR                  S-
  5      $ r   )r  summarize_rangescheduler_node_indexr
  line_aline_bs      rV   peak_between"EfficientPeakEstimate.peak_between  s6    ""22''!+V-H-H1-L
 	
rX   c                    UR                   S-   UR                   :X  a  g U R                  R                  UR                   S-   UR                   S-
  U R                  UR                  5      5        g r   )r  r  update_ranger  rS   r  s      rV   update_peak_between)EfficientPeakEstimate.update_peak_between  s^    &&*f.I.II((''!+''!+NN6;;'	
rX   )r  r  N)rS   
BufferLiker   r   )r  r4  r  AllocateLine)	r   r   r  r  r  r  r  r  r  r   rX   rV   r  r    s    
&-



rX   r  c                  j    \ rS rSr% SrS\S'   SrS\S'   S rSS	 jrSS
 jr	SS jr
SS jrSS jrSrg)r"  i  z6Represents a buffer allocation during memory planning.r!  rS   Fr   comm_bufferc                   [         R                  R                  R                  c   e[         R                  R                  R                  R                  [         R                  R                  R                  5      U l        g rJ   r5   rK   r   current_noder  indexr  r	  s    rV   rI  AllocateLine.__post_init__  T    ww  --999$%GG$5$5$;$;$A$AGG**%
!rX   c                    U R                   (       a  gUR                  S-   U R                  :X  a  gU R                  R                  R                  nU R                  R                  R                  X5      nX$-   nXS:*  $ )NTr6   )r$  r  r   estimate_peakr  r  )r
  	free_linerl   r  peak_memory_in_rangenew_peak_memorys         rV   should_reuse_buffer AllocateLine.should_reuse_buffer  sl    ))A-1J1JJ"ll88LL#||99FFyW555rX   c           	     T   U R                   R                  5       [        R                  R                  ;   a  [        U R                  5      $ U R                  (       a  [        U R                   5      n[        R                  (       aX  UR                  U5      (       aB  UR                  U5      nSUl        [        U R                  UR                   U R                   SS9$ U $ [        U R                   5      n[        R                  (       Ga
  X!;   Ga  UR!                  U5      n[        R                  R"                  R%                  [        R                  R'                  U R                   5      SS9[)        U R                   R+                  5       5      -  nU R-                  X45      (       aW  SUl        U R                  R.                  R1                  X05        [        U R                  UR                   U R                   5      $ UR3                  X#5        U $ U R                   R5                  5       R6                  S:X  aj  U R                  R9                  U R                   5      nUbB  U=R:                  [=        [>        R@                  " [B        RD                  US5      5      -  sl        U $ )NTr$  r   r  cpur6   )#rS   rM   r5   rK   r  NullLiner   r$  r`   r"   allow_buffer_reuser+  r.  r$  	ReuseLinerW   r#  rQ   r  rL   r,   rP   r0  r,  r  r(  rO   r   static_shape_for_buffer_or_noner  r   	functoolsreducer  mul)r
  r  r   r-  rl   static_shapes         rV   r  AllocateLine.plan  s   99177#:#::DLL))'		2C((U-G-G-L-L!11#6	&*	# LL)..$))  K tyy)$$$		#I77##--33DII> . tyy22456D ''	88&*	#**>>yO y~~tyyII

3*99((*//58<<GG		RL'11S$$X\\<C6 1 rX   c                &   U R                   R                  5       [        R                  R                  ;  d   eU R
                  (       a  U R                  U5        g U R                  R                  U R                   5      nUR                  U5        g rJ   )
rS   rM   r5   rK   r  r$  _codegen_comm_bufferr   make_buffer_allocationr   )r
  rO  r   s      rV   rP  AllocateLine.codegen+  sb    yy!!#177+B+BBBB%%d+<<66tyyADNN4 rX   c                R   U R                   R                  5       nU R                   R                  5       nUb  UR                  c
   SU 35       eU R                   R	                  5       n[        U R                   R                  5       5      n[        U R                   R                  5       5      nU R                   R                  5       n[        U[        R                  5      (       d   eUR                  nUR                  n	U[        R                  R                  :X  ai  U SU R                   R#                  U5       SU R                   R#                  U5       SU SUR                   SU	 S[$        R&                  " SS	5       S
3n
O[)        SU 35      eUR+                  U
5        g)z*Generate allocation code for comm buffers.Nz9Comm buffer requires a valid CUDA device with index, got z = empty_strided_p2p(r  z, torch.device("cuda:z"), group_name="z", alloc_id=r   l    r   zUnsupported comm buffer type: )rS   rM   r  r(  rP   r   get_size
get_striderZ   r[   r#   r\   r]   r^   CommBufferTypeSYMM_MEMr   codegen_shape_tuplerandomrandintr;  r   )r
  rO  r   r  r   shaperm   r_   r]   r^   r   s              rV   r?  !AllocateLine._codegen_comm_buffer3  s   yy!!#%%'!fll&> 	
GxP	
> 		##%dii((*+tyy++-.**,&""5"56666!22&&
r00999&-<<33E:;2<<33F;<B' &&,ll^ 4)l +"NN1i89<  &01A0BC  	trX   c                T    U R                   (       a  UR                  $ UR                  $ rJ   )r$  _generate_comm_buffer_allocate_generate_allocater<  s     rV   r>  AllocateLine.codegen_fxR  s#    ;;;+++rX   r  N)r-  r4  rl   r   r   r   r  rV  r@  )r   r   r  r  __doc__r   r$  rI  r0  r  rP  r?  r>  r  r   rX   rV   r"  r"    s4    @
K
6%N!>,rX   r"  c                  `    \ rS rSr% S\S'   SrS\S'   SrS\S'   S rSS	 jrSS
 jr	SS jr
Srg)r4  iX  r!  rS   Fr   r$  r$  c                   [         R                  R                  R                  c   e[         R                  R                  R                  R                  [         R                  R                  R                  5      U l        g rJ   r&  r	  s    rV   rI  !FreeIfNotReusedLine.__post_init__^  r*  rX   c                f   [        U R                  R                  5       5      S:  a  U $ [        U R                  R                  [
        R                  5      (       a  U $ U R                  (       a   eU R                  R                  5       [        R                  R                  ;   a  [        U R                  5      $ [        R                  (       a_  U R                   (       a(  [#        U R                  5      nUR%                  X 5        U $ ['        U R                  5      nUR)                  X 5        U $ r  )r   rS   get_inputs_that_alias_outputr[   r_   r#   MultiOutputLayoutr$  rM   r5   rK   r  r5  r   r"   r6  r$  r`   r1  rW   r(  )r
  r  r   s      rV   r  FreeIfNotReusedLine.pland  s    tyy55781<Kdii&&(<(<==K>>!!99177#:#::DLL))$$+DII6&&s1  'tyy1

3%rX   c                   U R                   R                  5       [        R                  R                  ;  d   eU R
                  (       d  U R                  R                  U R                   5      nU R                  (       ag  U R                   R                  5       n[        U[        R                  5      (       d   eUR                  U SUR                  R                   S35        g UR                  U5        g g )Nz # z buffer free)rS   rM   r5   rK   r  r$  r   r  r$  rZ   r[   r#   r\   r   r]   r   )r
  rO  r   r_   s       rV   rP  FreeIfNotReusedLine.codegenv  s    yy!!#177+B+BBBB~~<<00;D224!&"*=*=>>>>$s6+B+B+H+H*IVWt$ rX   c                T    U R                   (       a  UR                  $ UR                  $ rJ   )r$  _generate_comm_buffer_free_generate_free_if_not_reusedr<  s     rV   r>  FreeIfNotReusedLine.codegen_fx  s#    777555rX   rP  Nr  rV  r@  )r   r   r  r  r   r$  r$  rI  r  rP  r>  r  r   rX   rV   r4  r4  X  s0    
ItK
$	%6rX   r4  c                  R    \ rS rSr% S\S'   S\S'   S\S'   SS jrSS jrSS	 jrS
rg)ReinterpretLinei  r!  rS   	reused_asz	ir.Layoutr_   c                    U $ rJ   r   r  s     rV   r  ReinterpretLine.plan  s    rX   c                T   [        U R                  [        R                  5      (       d   e[        U R                  R                  [        R
                  5      (       d   eU R                  R                  U R                  R                  5       U R                  R                  5        g rJ   )
r[   r_   r#   NonOwningLayoutviewr&   r   codegen_deferred_allocationra  rM   rN  s     rV   rP  ReinterpretLine.codegen  sp    $++r'9'9::::$++**B,>,>????00NN##%t{{'7'7	
rX   c                    UR                   $ rJ   )_generate_reinterpretr<  s     rV   r>  ReinterpretLine.codegen_fx  r  rX   r   Nr  rV  r@  )	r   r   r  r  r   r  rP  r>  r  r   rX   rV   r`  r`    s#    

/rX   r`  c                  d    \ rS rSr% S\S'   S\S'   SrS\S'   SrS\S	'   SS
 jrSS jrSS jr	Sr
g)r7  i  r!  rS   ra  Tr   
delete_oldFr$  c                |   U R                   R                  5       [        R                  R                  ;   aM  U R
                  R                  5       [        R                  R                  ;   d   e[        U R                  5      $ U R
                  R                  5       [        R                  R                  ;  d   eU $ rJ   )rS   rM   r5   rK   r  ra  r5  r   r  s     rV   r  ReuseLine.plan  s    99177#:#::>>**,0G0GGGGDLL))~~&&(0G0GGGGrX   c                x   U R                   R                  5       [        R                  R                  ;  d   eU R
                  R                  5       [        R                  R                  ;  d   eUR                  U R                  R                  U R                   U R
                  U R                  5      5        g rJ   )
rS   rM   r5   rK   r  ra  r   r   make_buffer_reuserm  rN  s     rV   rP  ReuseLine.codegen  sz    yy!!#177+B+BBBB~~&&(0G0GGGGLL**499dnndooV	
rX   c                    UR                   $ rJ   )_generate_reuser<  s     rV   r>  ReuseLine.codegen_fx  s    (((rX   r   Nr  rV  r@  )r   r   r  r  r   rm  r$  r  rP  r>  r  r   rX   rV   r7  r7    s1    
JK
)rX   r7  c                      \ rS rSrSS jrSrg)r5  i  c                    UR                   $ rJ   )_generate_nullr<  s     rV   r>  NullLine.codegen_fx  r  rX   r   Nr@  rB  r   rX   rV   r5  r5    s    (rX   r5  c                  V    \ rS rSr% SrS\S'   S\S'   S\S'   S\S	'   SS
 jrSS jrSrg)MultiOutputLinei  zM
Given a MultiOutputLayout buffer, indexes actual buffer(s) from the result.
rE  r   r   result_namearg_nameSequence[Any]indicesc                   ^ ^ UU 4S jmT" T R                   T R                  5      nUR                  T R                  R                   T R
                   SU T R                  R                   35        g )Nc                  > [        U5      S:  a  US   u  p#[        U[        5      (       a  T" U  SU S3USS  5      $ [        U[        5      (       a;  TR                  R                  U TR                  [        U5      5      nT" XASS  5      $ [        U[        5      (       a  T" U  SU S3USS  5      $ [        SU5      eU $ )Nr   []r6   z['z']znon supported index type: )
r   
issubclassr  r   r   codegen_tuple_accessr|  r   dictAssertionError)basenamer  itypeituple_accesscodegen_list_tuple_accessr
  s        rV   r  :MultiOutputLine.codegen.<locals>.codegen_list_tuple_access  s    7|a"1:eT**4z1#Q5GQRQSUUu--#'<<#D#D $"2"2CF$L 5\12;OOt,,4zA3b5I7STSU;WW()EuMMrX   r   )r}  r  r   r   declarer|  ending)r
  rO  r   r  s   `  @rV   rP  MultiOutputLine.codegen  s]    	 $ *$--F||##$T%5%5$6c%ATAT@UV	
rX   c                    UR                   $ rJ   )_generate_multi_outputr<  s     rV   r>  MultiOutputLine.codegen_fx  s    ///rX   r   NrV  r@  )	r   r   r  r  rQ  r   rP  r>  r  r   rX   rV   r{  r{    s*     "!M
00rX   r{  c                  H    \ rS rSr% S\S'   S\S'   S\S'   SS jrSS	 jrS
rg)IndexPutFallbackLinei  rE  r   ir.IndexPutFallbackrS   zlist[Optional[ir.IRNode]]r  c                   U R                   n[        R                  " UR                  5      (       d   eS UR                  S S  5       u  p4U R                   Vs/ s H0  nU(       a  UR                  5       OU R                  R                  PM2     nnU R                  R                  " UR                  5       X6U/UR                  5       Q76   g s  snf )Nc              3  @   #    U  H  oR                  5       v   M     g 7frJ   r  r   ts     rV   r   /IndexPutFallbackLine.codegen.<locals>.<genexpr>  s     Fo**,,o   r    )rS   r#   is_node_sequenceinputsr  r  r   none_str_generate_index_put_fallbackr  codegen_const_args)r
  rO  rS   r   valuesidxr  s          rV   rP  IndexPutFallbackLine.codegen  s    yy""4;;////Fdkk"1oF ||
# (+C!!#0E0EE# 	 

 	11  "A	
9=9P9P9R	

s   7Cc                    UR                   $ rJ   )r  r<  s     rV   r>  IndexPutFallbackLine.codegen_fx  s    555rX   r   NrV  r@  r  r   rX   rV   r  r    s    !!
&&
6rX   r  c                  >    \ rS rSr% S\S'   S\S'   S
S jrSS jrSrg	)ScatterFallbackLinei  rE  r   ir.ScatterFallbackrS   c           
     P   U R                   n[        R                  " UR                  5      (       d   eUR                  (       a  S UR                   5       u  p4nO$S UR                   5       u  p4UR
                  S   nUR                  5       =n(       a  UR                  O[        R                  R                  nU R                  R                  UX2R
                  S   XE/UR                  UR                  UR                  UR                  S   UR!                  5       U5        g )Nc              3  @   #    U  H  oR                  5       v   M     g 7frJ   r  r  s     rV   r   .ScatterFallbackLine.codegen.<locals>.<genexpr>   s     Jk2244kr  c              3  @   #    U  H  oR                  5       v   M     g 7frJ   r  r  s     rV   r   r    s     EA--//r  r6   r   r:  )rS   r#   r  r  src_is_tensorconstant_argsr  r   r5   rK   r  r   _generate_scatter_fallbackr  python_kernel_namer   r  )r
  rO  rS   r   r(  r   r  r  s           rV   rP  ScatterFallbackLine.codegen  s    yy""4;;////JdkkJOQsEEJQ$$Q'C!%!22A29L9L//""1%u2  ##KK!!		
rX   c                    UR                   $ rJ   )r  r<  s     rV   r>  ScatterFallbackLine.codegen_fx  s    333rX   r   NrV  r@  r  r   rX   rV   r  r    s    !!

(4rX   r  c                  H    \ rS rSr% S\S'   S\S'   S\S'   SS jrSS	 jrS
rg)SymbolicCallArgLinei  rE  r   r  argrD   rK   c                d    U R                   R                  U R                  U R                  5        g rJ   )r   "_generate_symbolic_call_arg_helperr  rK   rN  s     rV   rP  SymbolicCallArgLine.codegen  s    77$**MrX   c                    UR                   $ rJ   )_generate_symbolic_call_argr<  s     rV   r>  SymbolicCallArgLine.codegen_fx  r  rX   r   NrV  r@  r  r   rX   rV   r  r    s    !!	N5rX   r  c                  R    \ rS rSr% S\S'   S\S'   S\S'   S\S	'   SS
 jrSS jrSrg)UnbackedSymbolDefsLinei!  rE  r   r   output_namer   outputs,Optional[dict[sympy.Symbol, pytree.KeyPath]]unbacked_bindingsc                z    U R                   R                  U R                  U R                  U R                  5        g rJ   )r   )_codegen_unbacked_symbol_defs_for_outputsr  r  r  rN  s     rV   rP  UnbackedSymbolDefsLine.codegen(  s+    >>dllD,B,B	
rX   c                    UR                   $ rJ   )_generate_unbacked_symbol_defsr<  s     rV   r>  !UnbackedSymbolDefsLine.codegen_fx-  s    777rX   r   NrV  r@  r  r   rX   rV   r  r  !  s#    !!LCC

8rX   r  c            
        ^  \ rS rSr% SrSrS\S'   U 4S jr\ S       SS jj5       r	SS	 jr
SS
 jrSS jrSS jrSS jr\SS j5       rSS jr\SS j5       rSS jr\SS j5       rSS jr  SS jrSS jrSS jrSS jrSS jrSS jrSS jrSS jrSS jrSS jrSS jr S r!S  r"S! r#S" r$S# r%SS$ jr&SS% jr'SS& jr(SS' jr)SS( jr*SS) jr+SS* jr,SS+ jr-SS, jr.S- r/    SS. jr0 S             SS/ jjr1SS0 jr2SS1 jr3SS2 jr4S3 r5SS4 jr6S5 r7SS6 jr8S7 r9              SS8 jr:S9 r;SS: jr<\=R|                  SS; j5       r?SS< jr@S= rAS> rBS? rCS@ rDSSA jrE      SSB jrFSC rGSSD jrHSE rISSF.SSG jjrJSSF.SSH jjrKSSI jrLSSJ jrMSSK jrNSSL jrO  SSM jrP S   SSN jjrQSSO jrRSSP jrSSQ rTSR rUSS rVST rWSU rXSV rY   S         SSW jjrZ\ S     SSX jj5       r[   S         SSY jjr\SSZ jr]\^S[ 5       r_  SS\ jr`SSS] jjra      SS^ jrbSS_ jrcSS` jrdSa reSb rfSc rgSd rhSe riSf rjSg rkSh rlSi rmSSj jrnSk roSSSSSSSSSl. SSm jjrpSSSSSSSSnSSo.	 SSp jjrqSq rrSr rsSs rtSSt jruSSu jrv\Sv 5       rw SSw jrxSx rySSy jrzSSz jr{SS{ jr|SS| jr} S SS} jjr~SS~ jrSS jrSS jrS rSS jrS rSS jrS r        SS jr        SS jrS r    SS jrSS jrS rS rS rS rS rS rSS jrS r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r    SS jrS rS rSrU =r$ )rE  i5  z:
Generate outer wrapper in Python that calls the kernels.
Tr   supports_cachingc                  >^  [         TT ]  5         [        5       T l        0 T l        [        5       T l        [        5       T l        [        5       T l        [        5       T l	        [        5       T l
        [        5       T l        [        5       T l        [        5       T l        [        5       T l        [        5       T l        0 T l        ST l        0 T l        [        5       T l        / T l        ST l        ST l        ST l        ST l        ST l        [6        R8                  R:                  (       a  SOST l        [6        R8                  R:                  (       a  SOST l        S T l         ST l!        0 T l"        [        5       T l#        [        5       T l$        S T l%        T RM                  5         / T l'        / T l(        T RS                  5         [U        T 5      (       d  T RW                  5         T RY                  5         [6        R8                  RZ                  (       dB  [6        R8                  R\                  R_                  5        H  u  pT Ra                  X5        M     [        [b           " 5       T l2        [        [b           " 5       T l3        0 T l4        [j        Rl                  " S 5      " T Rn                  5      T l7        [j        Rp                  S
U 4S jj5       nUT l9        0 T l:        [        5       T l;        [y        5       T l=        [        5       T l>        0 T l?        [        [        R                  R                  [        R                  R                  S	9T lE        / T lF        g )Nr    #r5  z
std::move(r   Tc                   > TR                   R                  U 5        [        R                  R                  (       a  TR
                  R                  U 5        g g rJ   )importsr   r"   r   r   r   )r   r
  s    rV   add_import_once6PythonWrapperCodegen.__init__.<locals>.add_import_once  s;    LL""4(}}55**44T: 6rX   )debug_printer_leveluse_array_ref)r   r   r   r5  )Gr  r  r   _names_iterargs_to_buffersr-   r  headerprefixsuffixkernel_declarationswrapper_callkernel_autotune_defsr   subgraph_definitionsr   r   kernel_autotune_example_argskernel_autotune_tmp_arg_idxsrc_to_kernelkernel_numel_exprlinesr  declare_maybe_referencer  commentr  r5   rK   r  
move_beginmove_endr  supports_intermediate_hooksuser_defined_kernel_cacheunbacked_symbol_declsrH  launcher_fn_nameset_launcher_fn_namecodegened_graph_stackcomputed_sizes_stackwrite_headerr.   write_prefix!write_kernel_autotune_defs_headerr  constant_reprsitemswrite_constant
BufferName	allocatedfreedreusesr9  	lru_cachewrite_get_raw_streamcacher  _metas
_meta_varsr   multi_kernel_statealready_codegened_subgraphsallocated_workspacesr   r"   aot_inductor debug_intermediate_value_printerallow_stack_allocationdebug_printeradditional_files)r
  r   hashedr  r  s   `   rV   r  PythonWrapperCodegen.__init__<  s   */'  	 &'$&$&$&#1#3 *,$2$4!%3%5"$2$4!6@l" IK)01( .0HR!#
')$*+''*=*=,2 ww22;?)+/(  	& L 	" 9C $!!# &("$&!2488..0ww ! 6 6 < < >##D1 !? $J/1
+-
 57$-$7$7$=%%%
! 
	; 
	;
  /&(+5<"2"4<FL(46! 1 & 3 3 T T --DD
 !#rX   Nc                P    U (       a  Uc   eUc   e[        XU5      $ [        5       $ rJ   )SubgraphPythonWrapperCodegenrE  )is_subgraphsubgraph_nameparent_wrapperpartition_signaturess       rV   createPythonWrapperCodegen.create  s=      ,,,!---//C  $%%rX   c                    SU l         g )Ncall)r  r	  s    rV   r  )PythonWrapperCodegen.set_launcher_fn_name  s
     &rX   c                D    U R                   R                  U SU 35        g )Nz = None  # )r  r   )r
  r   r  s      rV   r  #PythonWrapperCodegen.write_constant  s    k&:;rX   c                   [         R                  R                  R                  5       nSnUb  UR                  b  SUR                   3nSn[        [        R                  R                  5      S:  a  SnO5[         R                  R                  R                  R                  (       a  SnU R                  R                  SU S[        R                   S	U S
3SS9  U R                   R                  SSS9   SSKJn  U R                   R                  SSS9  [        R*                  (       a  U R                   R-                  S5        [        R.                  R0                  (       Ga  U R                   R-                  S5        U R                   R-                  S5        U R                   R-                  S5        U R                   R-                  S5        U R                   R-                  S5        U R                   R-                  S5        U R                   R-                  S5        [        R.                  R2                  =(       d(    [4        R6                  R9                  [;        5       S5      nU R                   R-                  SU S35        SU S3nSU S3n[        R.                  R<                  n[        R.                  R>                  n	[        R.                  R@                  n
U R                   R-                  S5        U R                   R                  SU S U S!U	 S"U
 S#3	5        U R                   R-                  S$U S%35        U R                   R-                  S&5        gg! [&        [(        4 a     GNaf = f)'z>Write the header section of the generated Python wrapper code.r  Nz
# AOT ID: r   zRfrom torch._inductor.codegen.debug_utils import _print_debugging_tensor_value_infozFfrom torch._inductor.runtime.debug_utils import tracked_empty_strided
z
                aH  
                from ctypes import c_void_p, c_long, c_int
                import torch
                import math
                import random
                import os
                import tempfile
                from math import inf, nan
                from cmath import nanj
                from torch._inductor.hooks import run_intermediate_hooks
                from torch._inductor.utils import maybe_profile
                from torch._inductor.codegen.memory_planning import _align as align
                from torch import device, empty_strided
                from zq import AsyncCompile
                from torch._inductor.select_algorithm import extern_kernels
                z
            Tr   a  
                aten = torch.ops.aten
                inductor_ops = torch.ops.inductor
                _quantized = torch.ops._quantized
                assert_size_stride = torch._C._dynamo.guards.assert_size_stride
                assert_alignment = torch._C._dynamo.guards.assert_alignment
                empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
                empty_strided_cpu_pinned = torch._C._dynamo.guards._empty_strided_cpu_pinned
                empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
                empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu
                empty_strided_mtia = torch._C._dynamo.guards._empty_strided_mtia
                reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor
                alloc_from_pool = torch.ops.inductor._alloc_from_pool
                async_compile = AsyncCompile()
            )_SymmetricMemoryzs
                empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p
                zfrom torch.cuda import nvtxz import triton.profiler as protonz%import triton.profiler.language as plzCfrom triton.profiler.hooks import HookManager as _ProtonHookManagerzimport tritonzimport atexitz	import oszetriton.set_allocator(lambda size, align, stream: torch.empty(size, dtype=torch.uint8, device='cuda'))protonzos.makedirs("z", exist_ok=True)zos.path.join("z", "inductor")z", "inductor.chrome_trace")z^from torch._inductor.runtime.proton_utils import process_proton_trace as _proton_process_tracez
                def _proton_finalize_and_postprocess():
                    proton.finalize()
                    _trace_path = z
                    if os.path.exists(_trace_path):
                        _proton_process_trace(
                            _trace_path,
                            group_by_sm=z0,
                            split_invocations=z0,
                            per_cta_occupancy=z,,
                        )
                z5if not _ProtonHookManager.active_hooks: proton.start(z], backend="instrumentation", data="trace"); atexit.register(_proton_finalize_and_postprocess)pl.enable_semantic("triton"))!torch_guardsTracingContexttry_getaot_graph_namer   r"   r  r  	_inductortest_configstrack_memory_lifecycler  r   r!   r   r  torch._C._distributed_c10dr  AttributeErrorImportErrorannotate_trainingr   r   proton_profilingproton_output_dirospathr   r   proton_group_by_smproton_split_invocationsproton_per_cta_occupancy)r
  contextaot_config_commentinductor_debug_utilsr  
output_dirproton_name
trace_pathgroup_by_smsplit_invocationsper_cta_occupancys              rV   r  !PythonWrapperCodegen.write_header  s7   --..6687#9#9#E#-g.D.D-E!F!v""CCDqH#w __##00GG#l #$ % $,,- .%& '!$ ' 	 	
* 	 ! 	 	
$	 DKK 	   ##KK!!"?@==)))KK!!"DEKK!!"IJKK!!U KK!!/2KK!!/2KK!!+.KK!!G  88 BGGLL=J KK!!M*=N"OP*:,nEK)*5PQJ --::K & F F & F FKK!!p KK# $., /) *5 6//@.A B//@.A B KK!!  +} -DD
 KK!!"@A[ *	 , 		s   8 M& &M:9M:c                    g rJ   r   )r
  r  s     rV   include_extra_header)PythonWrapperCodegen.include_extra_header$      rX   c                    U R                   R                  S[        R                   S35         SSKJn  U R                   R                  SSS9  g ! [        [        4 a     g f = f)Na	  
                import torch
                from torch._dynamo.testing import rand_strided
                from torch._dynamo.utils import preserve_rng_state
                from torch._inductor.select_algorithm import AlgorithmSelectorCache
                from aH   import AsyncCompile

                async_compile = AsyncCompile()
                generate_example_value = AlgorithmSelectorCache.generate_example_value
                empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
                empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu
            r   )_cuda_getCurrentRawStreamzU
                get_raw_stream = torch._C._cuda_getCurrentRawStream
                Tr   )r  r   r!   r   torch._Cr;  r#  r"  )r
  r;  s     rV   r  6PythonWrapperCodegen.write_kernel_autotune_defs_header'  sw    !!((
 $,,- .	

	:%%,, 	 -  ^, 		s    A A#"A#c                   S[         R                   S3n[        R                  R                  (       a]  U R
                  R                  U5        U R
                  R                  [        R                  R                  R                  S5      5        [        R                  R                  (       d]  U R                  R                  USS9  U R                  R                  [        R                  R                  R                  S5      5        g g )NzU
            import triton
            import triton.language as tl
            from z+ import start_graph, end_graph
            get_raw_streamTr   )r'   r   r"   r   r   r   r   r   r5   rK   r  import_get_raw_stream_asr  r  r
  
import_strs     rV   write_triton_header_once-PythonWrapperCodegen.write_triton_header_onceC  s     $,,- .

 ==11&&--j9&&00"";;<LM ww""LL
$7LL"""";;<LM #rX   c                   [         R                  R                  R                  S5      n[        R
                  R                  (       a;  U R                  R                  U5      (       d  U R                  R                  U5        [         R                  R                  (       d=  U R                  R                  U5      (       d  U R                  R                  U5        g g g )Nr?  )r5   rK   r  r@  r"   r   r   r   containsr   r  r  )r
  import_get_raw_stream_strs     rV   write_get_raw_stream_header0PythonWrapperCodegen.write_get_raw_stream_headerU  s    $%GG$6$6$O$O%
! ==11--667PQQ**445NOww""<<(()BCC&&'@A D #rX   c                $    U R                  5         g rJ   )rH  r	  s    rV    write_get_raw_stream_header_once5PythonWrapperCodegen.write_get_raw_stream_header_once`  s    ((*rX   c                   [        U5      nXR                  ;  a  S[        U R                  5       3nX R                  U'   U R                  R	                  U SU 35        [
        R                  R                  (       a;  U R                  R	                  U SU 35        U R                  R                  U5        U R                  U   $ )Nmetar   )reprr  r   r  r   r"   r   r   r   r  r   )r
  rN  vars      rV   add_meta_once"PythonWrapperCodegen.add_meta_onced  s    Dz{{"T[[)*+C #KKKK!!SETF"34}}55**44uCv5FG##C({{4  rX   c                ~    U R                  5        Vs/ s H  oR                  U R                  5      PM     sn$ s  snf rJ   )get_graph_outputsr  r  r
  r   s     rV   get_output_refs$PythonWrapperCodegen.get_output_refsr  s?     =A<R<R<T
<Tq 1 12<T
 	
 
s   $:c                    g rJ   r   r	  s    rV   mark_output_type%PythonWrapperCodegen.mark_output_typex      rX   c                6    [         R                  R                  $ rJ   )r5   rK   r  r	  s    rV   get_graph_inputs%PythonWrapperCodegen.get_graph_inputs{  s     ww###rX   c                6    [         R                  R                  $ rJ   )r5   rK   r
  r	  s    rV   rT  &PythonWrapperCodegen.get_graph_outputs  s    ww$$$rX   c           
     6   U R                  5       R                  5        H  u  p[        U[        R                  [
        R                  45      (       a  M6  U[        R                  R                  ;  d  [        U[
        R                  5      (       a  Mu  [        UR                  5       5      S:X  a  M  U R                  UR                  5       5      nU R                  UR                  5       5      nU R                  R!                  SU SU SU S35        M     g )Nr   zassert_size_stride(r  r   )r]  r  r[   rz   r   r#   TorchBindObjectr5   rK   graph_input_namesGeneratorStater1   rC  r   rD  r  r   )r
  r   bufrl   rm   s        rV   codegen_input_size_asserts/PythonWrapperCodegen.codegen_input_size_asserts  s    ..0668ID#

B,>,>?@@ 177444
R&&9 9  S\\^,1223<<>BD44S^^5EFFKK!!$7vRvRxq"QR 9rX   c                n   U R                   R                  S5        U R                  5       R                  5        Hx  u  p[	        U[
        R                  [        R                  45      (       a  M6  SU S3nU R                   R                  U5        SU S3nU R                   R                  U5        Mz     g )Nz(# make sure graph inputs are not nan/infzassert not z.isnan().any().item()z.isinf().any().item())	r  r   r]  r  r[   rz   r   r#   rb  )r
  r   re  r   s       rV   codegen_input_nan_asserts.PythonWrapperCodegen.codegen_input_nan_asserts  s    HI..0668ID#

B,>,>?@@ &;<DKK!!$' &;<DKK!!$' 9rX   c                :    U R                   R                  S5        g )NzV

            async_compile.wait(globals())
            del async_compile
            )r  r   r	  s    rV   write_async_compile_wait-PythonWrapperCodegen.write_async_compile_wait  s    	
rX   c                    SR                  U5      n[        U5      S:X  a  US-  nU R                  R                  U S35        U R                  R                  S5        g )Nr  r6   ,z = argszargs.clear())r   r   r  r   )r
  input_nameslhss      rV   
write_argsPythonWrapperCodegen.write_args  sP    ii${q 3JCWo.n-rX   c                    [         R                  (       a  U R                  R                  S5        SnU$ U R                  R                  SU R                   S35        SnU$ )Na  
                class Runner:
                    def __init__(self, partitions):
                        self.partitions = partitions

                    def recursively_apply_fns(self, fns):
                        new_callables = []
                        for fn, c in zip(fns, self.partitions):
                            new_callables.append(fn(c))
                        self.partitions = new_callables

                    def call(self, args):
                r    z
                def z(args):
                r6   )r"   graph_partitionr  r   r  r
  prefix_indents     rV   !write_launcher_fn_call_get_indent6PythonWrapperCodegen.write_launcher_fn_call_get_indent  sm    !!KK M  KK**+ ,
 MrX   c                6    [         R                  R                  $ rJ   )r5   rK   rc  r	  s    rV   get_graph_input_names*PythonWrapperCodegen.get_graph_input_names  s    ww(((rX   c                   U R                   c   eU R                  5         U R                  5       nU R                  R	                  U5         [
        R                  R                  (       aA  U R                  R                  [        R                  R                  R                  5       5        [        R                  R                  5       n[
        R                  (       a  U R                  R                  SU S35        U R                  5       =n(       a  U R!                  U5        U R#                  5         [%        5       (       a  ['        U 5      (       a  U R)                  5         S S S 5        g ! , (       d  f       g = f)Nz0training_annotation = nvtx._device_range_start(''))r  rl  rx  r  r   r"   r   debug_sync_graphr   r5   rK   r  synchronizeget_training_phaser$  r{  rr  codegen_inputsr/   r.   "codegen_input_size_and_nan_asserts)r
  rw  phaserc  s       rV   r  !PythonWrapperCodegen.write_prefix  s	   $$000%%'>>@[[.}}--%%agg&8&8&D&D&FGGG..0E''%%FugRP %)$>$>$@@ @ 12!
 -..<TBB779) /..s   DE..
E<c                    [         R                  (       a  U R                  5         [         R                  (       a  U R	                  5         g g rJ   )r"   size_assertsrf  nan_assertsri  r	  s    rV   r  7PythonWrapperCodegen.codegen_input_size_and_nan_asserts  s1    ++-**, rX   c                    U R                  5         SU 3n[        R                  R                  (       aB  U R                  R                  U SU S35        [        R                  R                  (       a  U$ U R                  U SU S35        U$ )Nstream = get_raw_stream(r   )	rH  r"   r   r   r   r   r5   rK   r  )r
  r  r  r   s       rV   r  )PythonWrapperCodegen.write_get_raw_stream  s    ((*
|$==11&&00&*:,a8 ww""$1*Q?@rX   c                     U R                   S   $ )N)r  r	  s    rV   get_codegened_graph(PythonWrapperCodegen.get_codegened_graph   s    ))"--rX   c                :    U R                   R                  U5        g rJ   )r  rq   )r
  rK   s     rV   rL  )PythonWrapperCodegen.push_codegened_graph  s    ""))%0rX   c                6    U R                   R                  5       $ rJ   )r  r#  r	  s    rV   rw  (PythonWrapperCodegen.pop_codegened_graph  s    ))--//rX   c                P    SSK Jn  U R                  R                  U" U5      5      $ )Nr   )deepcopy)copyr  r  rq   )r
  rH  r  s      rV   rG  (PythonWrapperCodegen.push_computed_sizes	  s!    !((//0HIIrX   c                6    U R                   R                  5       $ rJ   )r  r#  r	  s    rV   rt  'PythonWrapperCodegen.pop_computed_sizes  s    ((,,..rX   c                .    [        U R                  5       $ rJ   )nextr  r	  s    rV   next_kernel_suffix'PythonWrapperCodegen.next_kernel_suffix  s    t''()*rX   c                   U R                  [        XR                  5      5        [        R                  R
                  (       a  U R                  5         U R                  R                  S[        R                  R                  R                  U5       S35        U R                  R                  5         [        U 5      (       a  U R                  5         U R                  R                  SU SU S35        Xl        g )Nr  r  r  r  r   )r   r~  r  r"   r   r   rC  r   r5   rK   r  r  rM  r.   rH  )r
  r  s     rV   codegen_device_guard_enter/PythonWrapperCodegen.codegen_device_guard_enter  s    )*6W6WX	
 ==11))+&&00**77
CDAF &&0022488002&&00$6zl!D -7)rX   c                    U R                  [        5       5        [        R                  R                  (       a  U R
                  R                  5         g g rJ   )r   r  r"   r   r   r   rx  r	  s    rV   codegen_device_guard_exit.PythonWrapperCodegen.codegen_device_guard_exit'  s6    356==11&&224 2rX   c                   U(       Ga1  [         R                  (       a  U R                  R                  SSR	                  U5      -   S-   5        U R                  R                  S5        U R                  R                  5         U R                  R                  S5        U R                  R                  5         U R                  R                  S5        U R                  R                  S5        U R                  R                  S5        U R                  R                  S	SR	                  U5      -   S-   5        g U R                  R                  S
5        g )Nzreturn_vars = (r  , )zfor var in return_vars:z!if isinstance(var, torch.Tensor):z#assert not var.isnan().any().item()z#assert not var.isinf().any().item()r    zreturn (z	return ())r"   r  r  r   r   rM  rx  )r
  output_refss     rV   generate_return$PythonWrapperCodegen.generate_return,  s   !!!!++%		+(>>F !!++,EF!!++-!!++,OP!!++-!!++,QR!!++,QR!!--a0''
TYY{5K(Ke(ST''4rX   c                    g rJ   r   r
  results     rV   generate_before_suffix+PythonWrapperCodegen.generate_before_suffix>  r[  rX   c                    [         R                  (       aO  SR                  U R                  5      [	        U R                  5      S:X  a  SOS-   nUR                  SU S35        g g )Nr  r6   ro  r  z-
                runner = Runner(partitions=[z{])
                call = runner.call
                recursively_apply_fns = runner.recursively_apply_fns
                )r"   ru  r   all_partition_namesr   r   )r
  r  all_partition_name_lists      rV   generate_after_suffix*PythonWrapperCodegen.generate_after_suffixA  se    !!&*ii0H0H&I43349r'# MM--D,E F "rX   c                    g rJ   r   r  s     rV   generate_end!PythonWrapperCodegen.generate_endO  r[  rX   c                    UR                   nUb3  U[        ;   a)  [        U   R                  nUb  U" XR                  5        g U R                  [	        X5      5        g rJ   )r  r>   pythonr   r  )r
  rS   op_namecustom_codegens       rV   generate_fallback_kernel-PythonWrapperCodegen.generate_fallback_kernelR  sR    ))7.J#J9'BIIN)t^^4,T89rX   c                    UR                  U 5        U R                  [        X5      5        [        UR                  [
        R                  5      (       a  UR                  U 5        g g rJ   )codegen_commentr   r  r[   r_   r#   Layoutcodegen_size_assertsr  s     rV   generate_extern_kernel_alloc1PythonWrapperCodegen.generate_extern_kernel_alloc\  sJ    T",T89dkk299--%%d+ .rX   c           
        [        UR                  [        R                  5      nUR	                  5       nUR                  5       nUR                  5       nU R                  n[        R                  (       a  SU;   a  SU 3nU(       a5  U R                  U R                   U SSR                  U5       SU 35        g U R                  U R                   U SU SSR                  U5       SU 35        U R                  (       aR  [        R                  (       a<  Ub8  [        S   S==   S	-  ss'   U R                  S
UR                   < SU S35        g g g g )Nview_as_complex.clone()r  r  r   r   inductorintermediate_hooksr6   zrun_intermediate_hooks()r[   r_   r#   
NoneLayoutrM   get_origin_noder  r  r"   memory_planningr   r  r   r  generate_intermediate_hooksr   r   )r
  extern_kernelr  	no_returnr  origin_noder  r  s           rV   r  9PythonWrapperCodegen._generate_extern_kernel_alloc_helperb  s;    }33R]]C	#,,.#335#335!!&7;&F  x(FNNdll^K=$))D/9J!F8TUNN<<.SQtyy>OqQWPXY 0066+$%9:a?:-k.>.>-AK=PQR , 7 1rX   c                Z    UR                  U 5        U R                  [        X5      5        g rJ   )r  r   r  r  s     rV   generate_extern_kernel_out/PythonWrapperCodegen.generate_extern_kernel_out  s$     	T"*467rX   c                2   [         R                  R                  R                  nUR	                  XAS S S5        UR                  SU(       a  UOU 35        U   U R                  U SSR                  U5       S35        S S S 5        g ! , (       d  f       g = f)Nexternzout=r  r  r   )r5   rK   wrapper_coder  set_printer_argsrq   r   r   )r
  r  outout_viewr  r  stack_tracesdebug_printer_managers           rV   r  7PythonWrapperCodegen._generate_extern_kernel_out_helper  sw     !" 4 4 B B..tT4Rdx8S9:;"NNfXQtyy&7q9: #""s   'B
Bc                "  ^  UR                   nUR                  nU(       aR  [        R                  R                  R                  U5      n[        R                  R                  R                  U5      nUR                  R                  5        S3nSR                  U 4S jU 5       5      nSR                  U 4S jU 5       5      n[        R                  T UR                  5      nSnU SUR                   S3nU SU SU SU 3n	U SU	 S	3n
U
$ )
Nz.data_ptr()r  c              3  P   >#    U  H  n[         R                  TU5      v   M     g 7frJ   rE  val_to_arg_strr   dimr
  s     rV   r   RPythonWrapperCodegen._generate_tma_descriptor_call_experimental.<locals>.<genexpr>  s$     XSWC-<<T3GGSW   #&c              3  P   >#    U  H  n[         R                  TU5      v   M     g 7frJ   r  r  s     rV   r   r    s%      
FPs //c::jr  z$triton.tools.experimental_descriptorz.create_d_tma_descriptorr  r   )dims
block_dimsr5   rK   rQ   optimization_hintoptimization_hintstensorr  r   rE  r  element_sizerank)r
  descapply_size_hintsr  r  ptrr  r  r   r  r  s   `          rV   *_generate_tma_descriptor_call_experimental?PythonWrapperCodegen._generate_tma_descriptor_call_experimental  s    yy__
77##55d;D))<<ZHJ..01=yyXSWXXYY 
FP
 

 ,::4ARARS7xx		{*:;bbB|n=QtfArX   c                    UR                   nU(       a)  [        R                  R                  R	                  U5      nSnU S3nUR
                  R                  5        SU 3nU SU S3nU$ )Nz/triton.tools.tensor_descriptor.TensorDescriptorz.from_tensorr  r  r   )block_shaper5   rK   rQ   r  r  r  )r
  r  r  r  r  r   r  r  s           rV   $_generate_tma_descriptor_call_stable9PythonWrapperCodegen._generate_tma_descriptor_call_stable  so    &&''**==kJKBx|$++//12"[MBQtfArX   c                    [        U[        R                  5      (       a  U R                  X5      $ [        U[        R                  5      (       d   eU R                  X5      $ rJ   )r[   r#   TMADescriptorExperimentalr  TMADescriptorStabler  )r
  r  r  s      rV   _generate_tma_descriptor_call2PythonWrapperCodegen._generate_tma_descriptor_call  sW    dB8899BB  dB$:$:;;;;<<TTTrX   c                    U R                  U5      nUR                   SU U R                   3nU R                  U5        g Nr   )r  r   r  r   )r
  r  r  r   s       rV   generate_tma_descriptor,PythonWrapperCodegen.generate_tma_descriptor  s:    11$7))Cvdkk]3trX   c                8    U R                  [        X5      5        g rJ   )r   r  r  s     rV   generate_scatter_fallback.PythonWrapperCodegen.generate_scatter_fallback  s    *467rX   c	                   U SSR                  [        [        U5      5       3n	UR                  S5      (       a  U	SR                  S/U-   5      -  n	OU(       a  U	S[	        U5       3-  n	U	S-  n	U R                  U	5        g )Nr  ro  zaten.scatter_reducer  r  z	, reduce=r   )r   mapr   r   rO  r   )
r
  r   r  r  r  r  r:  r   r  r   s
             rV   r  /PythonWrapperCodegen._generate_scatter_fallback  s~     %%QsxxC0@'A&BC(()>??DIIrdVm,,D)DL>22trX   c                n   / nUR                   SS  n[        U5      n[        UR                  5       He  u  pVUR                  U   b?  [	        U5      n[        U[        R                  5      (       d   eUR                  U5        MT  UR                  S 5        Mg     U R                  [        XU5      5        g )Nr    )r  iter	enumerater  r  r[   r#   r%   rq   r   r  )r
  rS   r  valid_indicesiter_valid_indicesr  _r(  s           rV   generate_index_put_fallback0PythonWrapperCodegen.generate_index_put_fallback  s    -/AB!-0dll+DA||A*/0!%3333u%t$ , 	+D@ArX   c                v    SSR                  U5       S3nX&XE/nU R                  U R                  X5      5        g )Nr  r  r  )r   r   wrap_kernel_call)r
  r  r   r  r  
accumulateindices_strr  s           rV   r  1PythonWrapperCodegen._generate_index_put_fallback  s;    $))G,-Q/3t,,V:;rX   c           
     `    U R                  U SU SSR                  U" 5       5       S35        g )Nr   r  r  r   )r   r   )r
  buf_namer  get_argsop_overloadr  r  s          rV   ,generate_fallback_kernel_with_runtime_lookupAPythonWrapperCodegen.generate_fallback_kernel_with_runtime_lookup  s2     	(3'9&:!DIIhj<Q;RRSTUrX   c                p    [        S5         U R                  U5      sS S S 5        $ ! , (       d  f       g = f)NzPythonWrapperCodegen.generate)r   	_generater
  is_inferences     rV   generatePythonWrapperCodegen.generate  s#    9:>>,/ ;::s   '
5c                0    [         R                  (       a  gg)Nr    r6   )r"   ru  r	  s    rV   get_wrapper_call_indent,PythonWrapperCodegen.get_wrapper_call_indent  s    !!rX   c              #  \   #    U R                   n Xl         Uv   X l         g ! X l         f = f7frJ   r   )r
  newolds      rV   set_writeline"PythonWrapperCodegen.set_writeline  s'     nn	! NI NSNs   ,
! ,),c                    U R                   R                  n[        R                  R                  (       a  U R
                  R                  U5        g U R                  R                  U5        g rJ   )r  kernel_defsr"   r   r   r  r   r  )r
  r)  s     rV   _write_multi_kernel_defs-PythonWrapperCodegen._write_multi_kernel_defs  sF    --99==11%%,,[9KK{+rX   c                
   [         R                  (       a  U R                  5         [        R                  " 5        nUR                  U R                  R                  5       5        [         R                  (       a  U R                  U5        [         R                  (       a  U R                  5         U R                  U5        [         R                  R                  (       a/  [         R                  R                  (       d  U R                  5         U R!                  U R                  R"                  5         U R$                   HP  n['        U[(        5      (       a  UR+                  U R                  5        M5  U R                  R#                  U5        MR     S S S 5        U R-                  5         U R/                  5       nU R1                  5         [         R                  R2                  (       aA  U R                  R#                  [4        R6                  R8                  R;                  5       5        [         R                  (       a  U R=                  5         [         R                  R>                  (       a  U RA                  5         [         R                  R                  (       a/  [         R                  R                  (       d  U RC                  5         [         R                  R                  (       a  U RE                  5         [         RF                  (       a0  [         RH                  (       d  U R                  R#                  S5        U RK                  U5        S S S 5        [M        5       nURO                  U RP                  5        UR#                  S5        URO                  U RR                  5        [4        R6                  RT                  (       aH  [4        R6                  RH                  (       a)  [4        R6                  RV                  (       a
  [M        5       nURO                  U RX                  5        U R[                  5         URO                  U R\                  5        U R_                  5       nUR                  U5         URO                  U R                  5        S S S 5        U Ra                  U5        URO                  U Rb                  5        U Re                  U5        U Rg                  U5        U Ri                  U5        URk                  5       U Rl                  Rk                  5       4$ ! , (       d  f       GN= f! , (       d  f       GN= f! , (       d  f       N= f)Nz+nvtx._device_range_end(training_annotation)r  )7r"   profile_bandwidthrC  r   	ExitStackenter_contextr  r   profiler_mark_wrapper_call#generate_profiler_mark_wrapper_callgenerate_start_graphrun_wrapper_ir_passesr   store_cubinr   !generate_reset_kernel_saved_flagsr&  r   r  r[   rH   rP  r*  rV  rY  r  r5   rK   r  r  generate_end_graphr%  generate_proton_finalize generate_save_uncompiled_kernelsgenerate_and_run_autotune_blockr$  r  r  r-   r   r  r  r  is_const_graphr  finalize_prefixr  r   r  r  r  r  add_benchmark_harnessgetvaluewithlinemapr  )r
  r  stackr   r  r  wrapper_call_indents          rV   r  PythonWrapperCodegen._generate  si   ##))+!!#u 1 1 8 8 :;0088?''))+&&|4}}((1W1W668 ##D$5$5$?$?@ JJD!$44T%6%67))33D9 ' A ))+..0K!!#}}--!!++AGG,>,>,J,J,LM'''')}}----/}}((1W1W557}}55446 ''0B0B!!++A   -] $b  !dll#dkk" 77 3 38N8N#%F 	d//0dkk""::<]]./MM$++, 0 	##F+dkk"""6*&!""6* &&($$88:
 	
} A@ $#B 0/s2   C9T:4A!T(F?T:;U(
T7	2T::
U	
Uc                |  ^ U R                   R                  S5        0 n[        R                  R                  (       af  [
        R                  R                  (       aG  [        [
        R                  R                  5       VVs0 s H  u  p#U R                  U5      U_M     nnnU R                   R                  5       S-   U R                  R                  5       -   m[        R                  [        R                  :X  aj  [         R"                  " [%        5       SSS9 nUR'                  TR)                  S5      5        UR*                  nSSS5        [        R,                  " SW5        [/        S	S
 U4S jS9   [1        TU5        gs  snnf ! , (       d  f       NI= f! [2         a  n[5        SU 35      UeSnAff = f)z
Compose self.kernel_autotune_defs and self.kernel_autotune_calls into a single block of
code and execute it to trigger Triton kernel compilation and auto-tuning
zQ
            async_compile.wait(globals())
            del async_compile
        r  z.pyF)dirr  deletezutf-8NzAuto-tuning code written to %sartifactc                     SSS.$ )N&inductor_autotune_at_compile_time_codestring)r   encodingr   r   rX   rV   r   FPythonWrapperCodegen.generate_and_run_autotune_block.<locals>.<lambda>  s    @$!rX   c                    > T $ rJ   r   )tuning_codes   rV   r   rI    s    {rX   )metadata_fn
payload_fnz%Failed to run autotuning code block: )r  r   r"   r   r   r5   rK   autotuning_inputsr  get_autotuning_input_namer   r   r$   levelloggingDEBUGtempfileNamedTemporaryFiler   writeencoder   debugr   exec	ExceptionRuntimeError)r
  scoper  vf	file_patherK  s          @rV   r9  4PythonWrapperCodegen.generate_and_run_autotune_blockh  s   
 	!!((	
 ==11agg6O6O ((A(ABBFC ..s3Q6B  
 %%..0((1134 	
   GMM1 ,,Ke**734FF		
 !!0 	 +	
	Se$? (  	S!FqcJKQRR	Ss*   F-F9F 
F
F;'F66F;c                \    SSK Jn  U" U 5      R                  U R                  5      U l        g )Nr6   )MemoryPlanner)r  rb  r  r  )r
  rb  s     rV   memory_plan PythonWrapperCodegen.memory_plan  s     2"4(--djj9
rX   c                D   U R                  5       n[        R                  R                  U5      nU R                  (       a  [        U R                  S   [        5      (       a  U R                  S   R                  R                  U;  av  U R                  R                  5         U R                  (       aK  [        U R                  S   [        5      (       a)  U R                  S   R                  R                  U;  a  Mv  [        5       /n/ n[        [        U R                  5      5       H  nU R                  U   n[        U[        5      (       a#  UR                  US   5      U R                  U'   MJ  [        U[        5      (       a  UR                  [        5       5        Mz  [        U[         5      (       d  M  UR                  UR                  5       5        M     UR                  UR                  5       5        [        U5      S:X  d   e[#        S U 5       5      ng )Nr  r   c              3  8   #    U  H  oR                   v   M     g 7frJ   )r  )r   ss     rV   r   9PythonWrapperCodegen.memory_plan_reuse.<locals>.<genexpr>  s      +
3Ga))3Gs   )rT  r5   rK   _get_output_namesr  r[   r  rS   r   r#  r  ranger   r  rD  rq   rr  sum)r
  r  	out_namesplanning_statespast_planning_statesr  r   _total_allocated_buffer_sizes           rV   memory_plan_reuse&PythonWrapperCodegen.memory_plan_reuse  s   ((*GG--g6	 JJ4::b>+=>>

2##((	9 JJNN JJ4::b>+=>>

2##((	9 /01!s4::'A::a=D$ 233 $		/"*= >

1D"344&&':'<=D"233$++O,?,?,AB ( 	##O$7$7$9:?#q(((
 (+ +
3G+
 (
$rX   c                    U(       a&  [         R                  (       a  U R                  5         g [         R                  (       a  [	        5       U l        U R                  5         g rJ   )r"   r  rc  r6  r  r,  rp  r  s     rV   r3  *PythonWrapperCodegen.run_wrapper_ir_passes  s9    F22((%:%<"""$rX   c           	       ^	 U R                   m	[        R                  U	4S j5       n[        R                  U	4S j5       n[        U[        R
                  5      (       aM  [        U[        R                  5      (       a  X#;   a  g T	R                  U SU 35        UR                  U5        g [        U[        R                  5      (       a  [        UR                  5       5       H^  u  pg[        U[        R                  5      (       d  M&  Xs;  d  M-  T	R                  U SU" U5       SU S35        UR                  U5        M`     [        UR                  5       5       H^  u  ph[        U[        R                  5      (       d  M&  X;  d  M-  T	R                  U SU" U5       SU S35        UR                  U5        M`     g [        U[        R                  5      (       a  g [        U[        R                  5      (       a  g [         R"                  R$                  R&                  (       a  g [)        S[+        U5       35      e)Nc                <   > TR                  U  SU  S35        U  S3$ )Nz_size = z.size()_sizer#  r   rO  s    rV   sizeofDPythonWrapperCodegen.codegen_input_symbol_assignment.<locals>.sizeof  s(    NNdV8D69:V5>!rX   c                <   > TR                  U  SU  S35        U  S3$ )Nz
_stride = z	.stride()_strider#  rw  s    rV   strideofFPythonWrapperCodegen.codegen_input_symbol_assignment.<locals>.strideof  s)    NNdV:dV9=>V7##rX   r   r  r  zUnknown value type: )r  r9  r  r[   rz   r   Symbolr   r   r#   ro   r  rC  rD  rb  rd  r  r  r"   ru  r  r   )
r
  r   r   
bound_varsrx  r|  r  rl   rm   rO  s
            @rV   codegen_input_symbol_assignment4PythonWrapperCodegen.codegen_input_symbol_assignment  s    {{		" 
	" 
	$ 
	$ eUZZ((eU\\22e6INNeWCv./NN5!r||,,&u~~'78	dELL11d6LNNdV3vd|nAcU!#DENN4( 9  ))9)9);<fell338PNNfXS$0@#a#HINN6*  = r1122r0011%%55$';DK=%IJJrX   c           	     F   [         [        R                     " 5       nU R                  5       nUR	                  5        VVs/ s H)  u  p4[        U[        R                  5      (       d  M&  X44PM+     snnUR	                  5        VVs/ s H)  u  p4[        U[        R                  5      (       a  M&  X44PM+     snn-   nU H  u  pgU R                  XgU5        M         SS jnU H.  u  p[        U[        R                  5      (       d  M&  U" Xq5        M0     gs  snnf s  snnf )z$Assign all symbolic shapes to localsc                x   [         R                  " U R                  5       U R                  5       /5       H}  n[	        U[
        5      (       a  [	        U[        R                  5      (       a  M9  UR                   Vs/ s H  o3U;  d  M
  UPM     nn[        U5      S:  d  Mm  [        SU SU S35      e   g s  snf )Nr   zFor z, expected z to have been codegen-ed.)r   from_iterablerC  rD  r[   r   rz   r~  free_symbolsr   r  )r   r  exprsymundefined_symbolss        rV   _verify_input_symbol_assignmentLPythonWrapperCodegen.codegen_inputs.<locals>._verify_input_symbol_assignment  s     ++U^^-=u?O?O?Q,RS!$--D%,,1O1O $(#4#4%#4C:8MC#4 " % ()A-(tfK0A/BB[\  T%s   <	B7	B7N)r   ir.TensorBoxr  OrderedSet[sympy.Symbol])	r   rz   r~  r]  r  r[   r  r#   ro   )
r
  r  r  kr\  r  r   r   r  r  s
             rV   r  #PythonWrapperCodegen.codegen_inputs  s    -/
 ,,.+113
3tqz!U\\7RFQF3
 , 2 2 4X 4Jq%,,<WVaV 4XY "KD00jI "		0	& HAeR\\22+E> 3
Xs    %D)D%D/Dc                   [        U[        R                  5      (       a  [        U[        R
                  5      (       a  XR                  ;   a  g U R                  R                  U5        [        R                  R                  R                  U   n[        X5      nU R                  [        X[        R                  5      5        g g g rJ   )r[   rz   r~  r   r   PRECOMPUTED_SIZErH  r   r5   rK   rQ   inv_precomputed_replacementsr  r   r  )r
  r  r  r  s       rV   ensure_size_computed)PythonWrapperCodegen.ensure_size_computed  s    c5<<((^CAVAV-W-W)))##C(77##@@ED!#,CNN.t!''BC .X(rX   c                    g rJ   r   r	  s    rV   r;  $PythonWrapperCodegen.finalize_prefix$  r9  rX   rR   c                   [        S5      e)Nz8codegen_cpp_sizevar is only implemented for cpp_wrapper!)rZ  r
  r   rR   s      rV   codegen_cpp_sizevar(PythonWrapperCodegen.codegen_cpp_sizevar'  s    UVVrX   c                   [        XS9$ )Nr  )pexprr  s      rV   codegen_python_sizevar+PythonWrapperCodegen.codegen_python_sizevar*  s    Q**rX   c                $    U R                  U5      $ rJ   )r  rU  s     rV   codegen_sizevar$PythonWrapperCodegen.codegen_sizevar-  s    **1--rX   c                    U SU S3$ )Nr  r  r   )r
  r  r   r(  s       rV   r  )PythonWrapperCodegen.codegen_tuple_access0  s    1UG1%%rX   c                    / [        U R                  U5      Qn[        U5      S:X  a  g[        U5      S:X  a	  SUS    S3$ SSR                  U5       S3$ )Nr   ()r6   r  r  r  r   )r  r  r   r   )r
  rJ  partss      rV   r   /PythonWrapperCodegen.codegen_python_shape_tuple3  s^    :#d1159:u:?u:?uQxj$$499U#$A&&rX   c                $    U R                  U5      $ rJ   )r   )r
  rJ  s     rV   rG  (PythonWrapperCodegen.codegen_shape_tuple;  s    ..u55rX   c                    SR                  SR                  U[        U5      [        U5      U R	                  U5      U R	                  U5      /5      5      / 4$ )Nzalloc_from_pool({})r  )formatr   r  r   r   )r
  r   rn   r   rJ  rm   s         rV   codegen_alloc_from_pool,PythonWrapperCodegen.codegen_alloc_from_pool>  s_     %++II&MJ33E:33F;

 
 
	rX   c                  ^  [        U5      u  pxpnU 4S jnUR                  5       nU=(       a    XI:H  nU(       a  X':H  =(       a    X8:H  nU
nObX!R                  R                  :H  =(       a7    X1R                  R                  :H  =(       a    XAR                  R
                  :H  nUR                  nU(       a  Ub  UU:w  a	  SU SU S3$ U $ U" XX4UU5      $ )Nc           	        > T
R                  U5      nT
R                  U5      nT
R                  U5      nSU  SU SU SU S3	n	Ub  XE:w  a	  SU	 SU S3$ U	$ )Nzreinterpret_tensor(r  r   aten.view.dtype()r   r  )r   tgt_size
tgt_stride
tgt_offset
cast_dtype
base_dtyperg  stoffr  r
  s             rV   apply_reinterpretHPythonWrapperCodegen.codegen_reinterpret_view.<locals>.apply_reinterpretl  sz     //9A00<B&&z2C(b2bTC5BD%**B)$r*Q??KrX   r  r  r   )rw   rM   r_   rl   rm   rn   r   )r
  rr   rl   rm   rn   r   r   d_sized_strided_offsetd_dtypecollapsibler  r   	collapsedsame_layoutr  s   `                rV   codegen_reinterpret_view-PythonWrapperCodegen.codegen_reinterpret_viewM  s    8 ,D1 	9([		 }}6F$6	.?V-?K J ((( 1kk0001kk000 
 J Uj%8)$r%::V VUJOOrX   c                8    U R                  U SU SU S35        g )Nz.copy_(r  r   r#  )r
  r   dstnon_blockings       rV   codegen_device_copy(PythonWrapperCodegen.codegen_device_copy  s!    #gcU"\N!<=rX   c                    UR                  5       nUR                  S5      nU R                  [        XX1R                  5      5        g r  )rM   
input_namer   r{  r  )r
  rS   r|  r}  s       rV   codegen_multi_output)PythonWrapperCodegen.codegen_multi_output  s2    mmo??1%t(LLQRrX   c           
     t   UR                    SUR                   SUR                    SUR                    3nU(       a  SUR                   SU S3nU R                  UR                   SUR                   SUR
                   SU S	35        U R                  R                  [        UR                  5      5        g )
N +  if z
 < 0 else zmax(0, min(r  z))r   z * (r   )	r(  rl   r   unbacked_offset_symbolbase_offsetbase_dim_strider  r   r   )r
  rS   clamp	index_strs       rV   codegen_dynamic_select_index1PythonWrapperCodegen.codegen_dynamic_select_index  s    zzl#dii[TZZL
4::,W	%dii[9+R@I**+3t/?/?.@DDXDXCYY]^g]hhij	
 	""&&s4+F+F'GHrX   c                  ^ ^ UU 4S jnU 4S jnTR                   nU" TR                  5      nU" TR                  5      nT R                  U SU 35        T R                  U SU 35        U" U S3U S3TR                  5      nT R                  U SU S35        T R
                  R                  [        TR                   5      5        g )	Nc           	     h  > TR                  [        R                  " S[        R                  " U TR                  5      5      5      nTR                  [        R                  " S[        R                  " U TR                  -   TR                  5      5      5      nTR                  U 5      nU SU SU 3$ )Nr   r  z >= 0 else )r  rz   MaxMinrl   )r   posnegx_condrS   r
  s       rV   clamp_indexDPythonWrapperCodegen.codegen_dynamic_slice_size.<locals>.clamp_index  s    &&uyyEIIa4K'LMC&&		!UYYq499}dii@AC ))!,FU$vhk#77rX   c                \   > US:X  a  U SU  3$ TR                  U5      nSU SU  SU SU 3$ )Nr6   z - r  r  z	 - 1) // )r  )	start_varend_varstepstep_r
  s       rV   codegen_with_stepJPythonWrapperCodegen.codegen_dynamic_slice_size.<locals>.codegen_with_step  sJ    qy!#i[11((.Ewis9+SyHHrX   z	_start = z_end = _start_endz
 = max(0, r   )unbacked_size_symbolstartendr   r  r  r   r   )r
  rS   r  r  r  r  r  	with_steps   ``      rV   codegen_dynamic_slice_size/PythonWrapperCodegen.codegen_dynamic_slice_size  s    	8	I ''DJJ'$((##iw/0#gcU+,%VnTlDIIN	#j156""&&s4+D+D'EFrX   c                8    U R                  [        X5      5        g rJ   )r   rj  r  s     rV   codegen_dynamic_scalar+PythonWrapperCodegen.codegen_dynamic_scalar  s    (45rX   c                   S UR                    5       u  n[        UR                  5      S:X  a#  U R                  UR                   SU S35        GOw[        UR                  5      S:X  aE  [        UR                  S   [        5      (       a#  U R                  UR                   SU S35        GO[        UR                  5      S:X  a  [        UR                  S   [        5      (       a  U R                  UR                   SU S35        U R                  S	UR                   S
UR                  S   R                   SUR                   SUR                  S   R                   S3	5        U R                  UR                   SUR                   SUR                  S   R                   35        O[        SUR                   35      eU R                  UR                  5        S35        g )Nc              3  @   #    U  H  oR                  5       v   M     g 7frJ   r  r  s     rV   r   ?PythonWrapperCodegen._codegen_dynamic_scalar.<locals>.<genexpr>  s     >+Q&&((+r  r   r   .item()r6   z = 1 if z.item() else 0z_undivided = zassert z_undivided % z
 == 0, f'{z_undivided} not divisible by 'z_undivided // unrecognized keypath  = None)r  r   keypathr   r  r[   r   r   divisorr  rM   )r
  rS   rr   s      rV   rl  ,PythonWrapperCodegen._codegen_dynamic_scalar  s   >$++>t||!NNdhhZs4&89!#
4<<?M(R(RNNdhhZxv^DE!#
4<<?K(P(PNNdhhZ}TF'BCNN$((=a1H1H0I Jxxj >t||A?V?V>WWXZ NN88*CzQ8O8O7PQ !#8!GHH 	$--/*'23rX   c           
       ^ ^ UU 4S jnU4S jnU4S jnTR                  / SQ5        TR                  5          TR                  SSS9  [        R                  R
                  R                  5        HT  u  pVTR                  SU 35        U" XVR                  5       UR                  5       UR                  UR                  5        MV     [        [        R                  R                  5      S	:  a^  TR                  S
5        [        R                  R                  R                  5        H!  u  pWTR                  SU 35        U" XW5        M#     [        R                  R                  R                  5        GH9  u  pV[        U[         R"                  5      (       aI  [        [        R                  R$                  R&                  R)                  US5      [*        5      (       a  Mn  [        U[,        R.                  5      (       ad  [        [        R                  R                  5      S	:X  a  TR                  S
5        TR                  SU 35        U" XVR1                  5       5        M  [        U[         R2                  5      (       a2  U" U[        R                  R$                  R5                  USS95        GMB  [        U[,        R6                  5      (       a$  U" USUR                  R8                   S35        GM  UR;                  5        Vs/ s H+  n[        R                  R$                  R5                  USS9PM-     n	nUR=                  5        Vs/ s H+  n[        R                  R$                  R?                  USS9PM-     n
nU" UU	U
URA                  5       URC                  5       5        GM<     TR                  SSRE                  [        R                  R                  RG                  5       5       S35        SSS5        TR                  / SQ5        TR                  5          TR                  SSS9  SSS5        gs  snf s  snf ! , (       d  f       NU= f! , (       d  f       g= f)z2Write out codegen for benchmarking the output codec                   > TR                  U  STR                  U5       STR                  U5       SU SU S3
5        g )Nz = rand_strided(r  
, device='	', dtype=r   )r   r   )r   rJ  rm   r  r   r   r
  s        rV   add_fake_inputFPythonWrapperCodegen.benchmark_compiled_module.<locals>.add_fake_input  sT    &(2259:"226:; <!()E7!5rX   c                2   > TR                  U  SU 35        g r  r#  )r   r  r   s     rV   add_expr_inputFPythonWrapperCodegen.benchmark_compiled_module.<locals>.add_expr_input  s    vS./rX   c                8  > Uc  TR                  U  S35        g SS Kn TR                  U  SUR                  U5      < S35        g ! [        [        UR
                  4 a7  nTR                  S[        U5       SU  S[        U5       S35         S nAg S nAff = f)	Nr  r   z = pickle.loads(r   z.raise TypeError("Failed to pickle opaque type z for variable r   z"))r   pickledumps	TypeErrorr"  PicklingErrorr   r   )r   r   r  r_  r   s       rV   add_torchbind_inputKPythonWrapperCodegen.benchmark_compiled_module.<locals>.add_torchbind_input  s    }  D6!12  D6)9&,,u:M9PPQ!RS~v/C/CD   DT%[MQ_`d_eeghklmhngooqr s   'A B"-BB)r  r  zdef get_args():zP
                from torch._dynamo.testing import rand_strided
                Tr   zglobal r   zimport pickleN*   r  ztorch.cuda.default_generators[z].graphsafe_get_state()zreturn [r  r  )r  r  z9def benchmark_compiled_module(args, times=10, repeat=10):z
                from torch._inductor.utils import print_performance
                fn = lambda: call(list(args))
                return print_performance(fn, times=times, repeat=repeat)
                )$
writelinesr   r   r5   rK   	constantsr  r   rl   rm   r  r   r   torchbind_constantsr  r[   rz   r~  rQ   backed_var_to_valr   r   r#   rb  get_real_objr   r  rd  r(  rC  rD  r  r  rP   r   r  )r
  r   r  r  r	  r   r   torchbind_objr   rJ  rm   s   ``         rV   benchmark_compiled_module.PythonWrapperCodegen.benchmark_compiled_module  s   		0	 	56]]_MM 	    !ww00668   74&!12**,ekk	  9 177../!3  1+,77+F+F+L+L+N'D $$wtf%56'<	 ,O  !ww3399;eU\\22zGG$$66::5$G8 8 eR%7%7881776671<((9$$wtf%56'.@.@.BCuzz22
 #agg..@@QS@T  r'8'899"89K9K8LLcd "'!1!1A ((::1r:J!1   "'!1!1!3!3A ((221r2B!3   #((*)I  <X x		!''2F2F2K2K2M(N'OqQRI N 	Q	
 ]]_MM
    _-q _T _s7   KQ2Q Q2Q	A9Q*Q
Q
Q
Q-c                   [         R                  (       d  gU R                  U5        UR                  / SQ5        UR	                  5          UR                  SSS[        5        S3/5        SSS5        g! , (       d  f       g= f)z<
Append a benchmark harness to generated code for debugging
N)r  r  zif __name__ == "__main__":zBfrom torch._inductor.wrapper_benchmark import compiled_module_mainzargs = get_args()zcompiled_module_main('zU', lambda times, repeat: benchmark_compiled_module(args, times=times, repeat=repeat)))r"   benchmark_harnessr  r  r   r+   r
  r   s     rV   r<  *PythonWrapperCodegen.add_benchmark_harness@	  sr     ''&&v.@A]]_X'01C1E0F Gm m		 __s   !A55
Bc                >    U R                  [        U UUUUUS95        g r  )r   r  )r
  r  r  r  r  r  s         rV   define_kernel"PythonWrapperCodegen.define_kernelV	  s*     	 !-		
rX   c                    [         R                  R                  (       a,  U(       a%  [        R                  " SSU[        R
                  S9nU(       a  U S3OSnSU U  SU 3nU$ )Nz^// z# )flagsr  r  z

r   )r"   r   r   resub	MULTILINE)r  r  r  metadata_commentbodys        rV   _format_kernel_definition.PythonWrapperCodegen._format_kernel_definitioni	  sZ     ==11h vvgtXR\\JH.6hZr?B&'}C}ErX   c                <   [         R                  R                  (       aR  U(       aK  U R                  XUS9nU R                  R                  U5        [        R                  R                  (       a  g U R                  XUS9nU R                  R                  U5        g )N)r  )
r"   r   r   r"  r  r   r5   rK   r  r  )r
  r  r  r  r  r  r!  s          rV   r  *PythonWrapperCodegen._define_kernel_helperu	  s     ==11c118 2 D %%,,T2ww""--x . 
 	4 rX   c                N    U R                   R                  UR                  5        g rJ   )r  r   r   )r
  r   subgraph_codes      rV   define_subgraph_launcher_fn0PythonWrapperCodegen.define_subgraph_launcher_fn	  s    !!(()<)<=rX   c                    SSK Jn  U$ )Nr6   )TritonKernel)r   r+  )clsr+  s     rV   _get_triton_info_kernel_cls0PythonWrapperCodegen._get_triton_info_kernel_cls	  s     	)rX   c                X  ^^/^0^1^2^3^4 SSK JnJnJn	  SSKJm/Jn
JnJnJ	n  UR                  n/ m40 m2/ m1/ nU1U44S jm0S-U/U0U2U4S jjnUR                   Vs/ s H  nUR                  PM     nnUR                   Vs/ s H"  nUR                  (       d  M  UR                  PM$     nn[        U5       GH  u  nnUU;   a  U" UT/" US9SS	9  M  UT;  a  M#  TU   nTU   c  U" UT/" US9SS
9  M<  [!        U["        R$                  5      (       a^  [!        U["        R&                  5      (       a'  SUR(                  UR*                  R-                  5       4OSu  nnnU" UU" UUUUS95        M  [!        U["        R.                  5      (       a.  U" UU" UUR1                  5       UR-                  5       S95        GM  [!        U["        R2                  5      (       aM  U" UU" UUR4                  R1                  5       UR-                  5       UR6                  R8                  S95        GMr  [!        U[:        [<        R>                  45      =(       a*    [@        RB                  RD                  RG                  US5      nU" UU" UU5      US9  GM     [I        T4S T1URJ                   Vs/ s H  n[M        U5      PM     snS9nU[N        RP                  " [@        RB                  RS                  5       5      0 T2E[T        RW                  US5      E[Y        T4T1S9/S.nU(       a  [[        U5      US'   U(       a  [[        U5      US'   []        U5      S:X  a0  UR_                  5       n/ [a        [<        Rb                  US   5      QnOS.U34S jjn 0 m3U V!s/ s H  n!/ [a        U U!5      QPM     nn!U(       a  []        U5      []        U5      :X  d   e/ n"[e        [g        Xb5      S SS9 HR  u  n!n#U"Ri                  U" U#5      / [a        [j        U!5      Q/ [a        [l        U!5      Q/ [a        [j        U!5      QS.5        MT     U	R                  U"/ [a        [n        T3Rq                  5       5      QS.n/ T3Rs                  5       Qn[u        URv                  5      /n$[]        U5      S:  aY  TRq                  5        HE  n[!        U["        R.                  ["        R2                  45      (       a  M4  U$Ri                  U5        MG     U$Ri                  [o        U5      5        U$Ry                  [o        U5      5        [[        U$5      n$U$U Rz                  ;   a  U Rz                  U$   u  n%nn&U%UU&U4$ U S[]        U Rz                  5       3n%[}        5       n'[~        R                  R                  (       a  U'R                  SU%< S35        OU'R                  SU< S35        U%US'   U R                  5       n(UR                  U(R                  5       5        U'R                  U(R                  5       5        [~        R                  R                  (       a  U'R                  S 5        U'R                  S!/ [a        Xr5      Q< S"U< S#U< S$35        [        U5      n)[~        R                  R                  (       a  U)R                  S%U S&3S%U% S&35      n)U)R                  S'S(5      n)U'R                  U)5        [@        RB                  RS                  5       n*U'R                  S)U*R                   S*35        [        R                  " URv                  5      u  n+n,[        R                  " URv                  5      n-S+U- S,U, 3n.U R                  U%U'R                  5       U.5        U%UU4U Rz                  U$'   U%UUU4$ s  snf s  snf s  snf s  sn!f )/Nr    )config_to_dict	FixedGridPrecomputedGridr6   )ConstexprArgKernelArgTypeSizeArg	TensorArgTMADescriptorArgc                J   > TR                  U5        TR                  U 5        g rJ   )rq   )r  r  arg_indices	signatures     rV   add_to_signaturePPythonWrapperCodegen.define_user_defined_triton_kernel.<locals>.add_to_signature	  s    S!s#rX   c                  > U(       aE  [        5       (       a  T" X5        UR                  T;   a  TUR                     TUR                  '   g g UR                  T;   d   eU(       a?  [        5       (       a  T" U T" UR                  S95        OT" X5        STUR                  '   g U(       a6  [        5       (       a  T" U T" UR                  S95        S TUR                  '   g T" X5        g )Nr   r6   )r4   r   )	r  r  is_constexprequals_1equals_noner3  r;  r  r   s	        rV   add_argGPythonWrapperCodegen.define_user_defined_triton_kernel.<locals>.add_arg	  s    133 %S.88v% +1*:Ichh' & xx6)))577
 )l.IJ(2*+Ichh' 577 )l.IJ*.Ichh'$S.rX   r>  T)r?  )rA  stable)experimentalNN)r   api_typer  r   )r   bufferr   )r   rG  r   rn   )r@  )
size_dtyper  argdefs)r  )r:  r  r  r   restore_valuereset_to_zeror   c                t  > [        U [        R                  5      (       al  / U R                  QnU(       d  U $ UR	                  [
        S9  U H0  nUT;   a  M  [        R                  " S[        T5       35      TU'   M2     [        U T5      $ [        U [        5      (       d   e[        R                  " U 5      $ )N)r   _launcher_s)r[   rz   r   r  sortr   r~  r   r3   r   r{   )r  symbolsr  extra_launcher_argss      rV   rename_sizes_for_launcherYPythonWrapperCodegen.define_user_defined_triton_kernel.<locals>.rename_sizes_for_launcherA
  s    dEJJ//2 1 12G"#LLSL)&"55$38<<)#.A*B)CD4+C0  ' &d,?@@!$,,,,}}T**rX   c                2    [        U S   R                  5      $ r   r   r   s    rV   r   HPythonWrapperCodegen.define_user_defined_triton_kernel.<locals>.<lambda>W
  s    3qt{{3CrX   r   )r"   r  cpppython_slow)	grid_typeprecomputed_gridsrP  r  zasync_compile.triton(z, '''r  r  zG
            @triton_heuristics.user_autotune(
                configs=z ,
                inductor_meta=z,
                triton_meta=z{,
                filename=__file__,
                custom_kernel=True,
            )
            @triton.jit
            r   r  z'''z\'\'\'z''', device_str='r~  z# Original path: r  )FFF)r  r   r   r   )Qruntime.triton_heuristicsr0  r1  r2  commonr3  r4  r5  r6  r7  r   paramsr   r?  numr  r[   r#   TMADescriptorr  r  r  rP   rj   rM   r&   rr   r_   rn   r   rz   r{   r5   rK   rQ   statically_known_equalsrA   	arg_namesr7   r(   r  get_current_device_or_throwr  fromkeysr?   r   r   setup_grid_as_argsr  sympifyr   r   rq   r  r=   r   r  r  idr   extendr  r-   r"   r   unique_user_kernel_namesr   r-  updateinductor_meta_commonr   gen_common_triton_importsr%  r  replacer   inspectgetsourcelinesgetsourcefiler  r   )5r
  r  r   r   restore_value_argsreset_to_zero_argsr   r0  r1  r2  r4  r5  r6  r7  original_nameequal_to_1_argsrB  pr_  
constexprsr  r   r  rF  r  r   r@  r   triton_signaturer  r  extra_launcher_call_argsrQ  r   rX  cfg	cache_keyr   cached_inductor_metar   triton_info_kernel_cls
kernel_srccurrent_devicer  linenosrcfiler  r3  r;  r9  r  rP  r:  s5      `                                           @@@@@@rV   !define_user_defined_triton_kernel6PythonWrapperCodegen.define_user_defined_triton_kernel	  s*   	
 	

	
 	
 )+	$&	!#%'	$"	/ "	/H &,]]3]QVV]	3%+]]E]anneaee]
E!),HCj \s3$G& +Cc{"\s3Fc2#3#344 &c2+A+ABB "3??CJJ4H4H4JK9 1Hk5
 (!$%-(3"'	  RYY//!!$#&<<>"%--/  R%7%788 !!$#&88#4#4#6"%--/#&::#4#4	  *c5==1   ''**BB  Cc!2XFs -v -)/)9)9:)9AWQZ)9:	
 *&--agg.Q.Q.ST--3
 ''
, +01C+DK(+01C+DK(u:?,5,H,H,JM'FU]]E!H)E'F$+  EGINO<s4d;<EOSZ3w<777 "#E#)CT	c "(("0"5"5Ct$4"52UD!12':UD)9':	 -55%6'PS2E2L2L2N)O'PM
 (E)<)A)A)C'D$ VYY-	w<!}}!#		23E3E'FGG$$S) ' 	[)*]+,)$	6666:6T6T73D+3 $(	   #d&D&D"E!FG(*==11%%(=dXU&KL%%(=m=Ne&TU'+m$!%!A!A!C3HHJK5OOQR==))%%&DE83~78; <,/ 0(O ,			
 OvV
==11#++d=/,CtD6QR^TJ''{;
z*<<>!!$5n6I6I5J""MN**6995	6''		2&wiq9$$&	
 6:;4V&&y1[-1IIIa 4E@ ;j Ps   ^:^^^"^'c                    U SUR                    S3nUb  USU 3-  n[        R                  " USSS9n[        XRR                  5      nUS:H  nU(       d)  U R                  [        X[        R                  5      5        U$ )Nr  numelT)
is_integeris_positiver  )	r  rz   r~  r  r  r   r  r5   rK   )r
  r  treer  sym_namer  r  is_benchmark_kernels           rV   generate_numel_expr(PythonWrapperCodegen.generate_numel_expr
  s{    !]!DKK=6!F8$Hll8$G c::.)R/"NN.t!''BC
rX   c                j    U R                  UR                   S[        UR                  5       35        g r  )r   r  r  r  )r
  r  rK   s      rV   r  7PythonWrapperCodegen._generate_symbolic_call_arg_helper
  s)     	#))Ccnn(='>?@rX   c                   UR                  5       n[        X5      nUR                  [        R                  :X  a  U R                  U5        GOBUR                  [        R                  :X  a2  U R                  U5        U R                  U R                  U5      5        OUR                  [        R                  :X  a  U R                  R                  U5      nU(       a]  [        U[        5      (       a  [        UR                  [        5      (       d   e[        R                  " UR                  U5      Ul        OUU R                  U5        U R                  U R                  U5      5        X0R                  U'   O[        UR                  5      e[         R"                  R$                  (       a  U R&                  R                  [(        R+                  U UUR,                  UR.                  [0        R2                  R4                  R7                  UR8                  5      4SS95        UR                  [        R                  :w  a/  U R&                  R                  [(        R                  X5      5        g g g )N)r6   )rJ  rm   )rM   r"  	zero_moder<   UNINITIALIZEDr   ZERO_ON_CALLmake_zero_bufferZERO_PER_GRAPHr   r   r[   rS   r;   maximumr  r"   r   r   r   rE  make_allocationr  r   r5   rK   rQ   r  r   )r
  wsr   r   priors        rV   generate_workspace_allocation2PythonWrapperCodegen.generate_workspace_allocation
  s   {{}D%<<,:::NN4 \\.;;;NN4 NN40067\\.===--11$7E!%66:JJ< <   *11%**bA
t$t44T:;26))$/ ..==11&&00$44IIHH77++55bhh?A 5 	 ||0>>>**44(99$E ? 2rX   c                v    UR                   [        R                  :w  a  U R                  [	        X5      5        g g rJ   )r  r<   r  r   r4  )r
  r  s     rV   generate_workspace_deallocation4PythonWrapperCodegen.generate_workspace_deallocation
  s,    <<,;;;NN.t89 <rX   c                $    U SU R                    3$ )Nz.zero_())r  )r
  r   s     rV   r  %PythonWrapperCodegen.make_zero_buffer
  s    x}--rX   c                H    U SSR                  U5       SU R                   3$ )Nr  r  r   )r   r  )r
  r   r  s      rV   r  %PythonWrapperCodegen.wrap_kernel_call
  s'    q9-.a}==rX   c                    U R                   R                  S5        U R                   R                  S[        R                  R                   S35        UR                  U R                   R                  5       5        g )Nz*from torch.profiler import record_functionzwith record_function('graph_z_inductor_wrapper_call'):)r  r   r5   rK   graph_idr/  r   )r
  r>  s     rV   r1  8PythonWrapperCodegen.generate_profiler_mark_wrapper_call
  sb    ##$PQ##*177+;+;*<<UV	
 	D--4467rX   c                :    U R                   R                  S5        g )Nzstart_graph())r  r   r	  s    rV   r2  )PythonWrapperCodegen.generate_start_graph   s    ##O4rX   c                `    U R                   R                  S[        R                  < S35        g )Nz
end_graph(r   )r  r   r"   profile_bandwidth_outputr	  s    rV   r6  'PythonWrapperCodegen.generate_end_graph  s'    ##j1P1P0SST$UVrX   c                    U R                   R                  [        R                  R                  R                  5       5        g)z<Synchronize GPU to ensure proton captures all kernel events.N)r  r   r5   rK   r  r  r	  s    rV   r7  -PythonWrapperCodegen.generate_proton_finalize  s)    ##AGG$6$6$B$B$DErX   c                ^    U R                   R                  S[        R                   S35        g )NU
            for kernel in globals().values():
                if isinstance(kernel, zU.CachingAutotuner):
                    kernel.cuda_kernel_saved = False
            r  r   r'   r   r	  s    rV   r5  6PythonWrapperCodegen.generate_reset_kernel_saved_flags
  s2      ''8'A'A&B C	
rX   c                ^    U R                   R                  S[        R                   S35        g)a  
Precompile and save the CUBINs of the Triton kernels that haven't
been precompiled and saved as a side effect of running the generated
JIT model (Python wrapper). This can happen when the model contains
control flow: only one pass through the control flow operators covers
the kernels that are saved, the remaining kernels are not launched,
hence not saved. The main purpose of this codegen is to compile and
save the Triton kernels outside the active control flow path for
subsequent AOTInductor code generation and compilation.
r  a  .CachingAutotuner):
                    if not kernel.cuda_kernel_saved:
                        if len(kernel.launchers) == 0:
                            kernel.precompile()
                        kernel.save_gpu_kernel(
                            stream="stream",  # use dummy stream
                            launcher=kernel.launchers[0],
                        )
            Nr  r	  s    rV   r8  5PythonWrapperCodegen.generate_save_uncompiled_kernels  s4     	  ''8'A'A&B C
	
rX   c                B    S nU Vs/ s H
  o2" U5      PM     sn$ s  snf )Nc                   [        U [        5      (       a  [        U 5      (       a  U S-   $ U $ [        U [        [        [
        [        45      (       a  [        U 5      $ [        [        R                  R                  R                  U 5      5      $ )Nr  )r[   r   r@   r   floatr   r  r  r5   rK   rQ   rR   )r  s    rV   wrap_argAPythonWrapperCodegen.prepare_triton_kernel_call.<locals>.wrap_arg-  sg    #s##*B3*G*GsYPSPC#udO!DEE3xQWW--66s;<<rX   r   )r
  r  r  r  s       rV   prepare_triton_kernel_call/PythonWrapperCodegen.prepare_triton_kernel_call,  s%    	= *33#333s   c                  ^  [        U[        5      (       GaY  [        U[        R                  5      (       a.  UR	                  5       R                  5       nT R                  U   nObT R                  R                  U5      (       a  UnT R                  U   nO0Uc   S5       eST R                   3nUnT =R                  S-  sl        Uc
   SU 35       e[        R                  R                  R                  UR                  5       5      n[        R                  R                  R                  [        R                  R                  U5      5      n[        R                  R                  R                  UR                  5       5      nUR!                  5       n	UR#                  5       n
[        R                  R                  R%                  UR'                  5       R(                  5      nSU SU SU	 SU
 SU SU S	3nT R*                  R-                  U S
U 35        [        U[        R                  5      (       a2  T R/                  USS9nUnT R*                  R-                  U S
U 35        U$ [1        U[2        R4                  5      (       d  [        U[6        5      (       a  [        U[8        5      (       a  UT R:                  ;   a  U$ Uc  gUn[        U[6        5      (       a  UR<                  nU[        R                  R                  R>                  ;   a'  [        R                  R                  R>                  U   n[9        [        R                  R                  R%                  U5      5      $ [        U[8        [@        [B        [D        45      (       a  [9        U5      $ [        U[F        5      (       a  SSRI                  U 4S jU 5       5       S3$ [K        S[M        U5       35      e)NzBV.graph.get_buffer(arg) and raw_arg can't be None at the same timetmp_arg_r6   z Failed to find a buffer for arg zgenerate_example_value(r  z, 'z', r   r   T)r  r  r5  r  c              3  Z   >#    U  H   nTR                  U[        U5      5      v   M"     g 7frJ   r   )r   ar
  s     rV   r   BPythonWrapperCodegen.generate_example_arg_value.<locals>.<genexpr>s  s(      ZVYQR!@!@DG!L!LVYr   r  zUnsupported type )'r[   torch_dtyper#   r]  
get_tensorrM   r  r   r  r5   rK   rQ   r  rC  get_allocation_sizerD  r  rP   r  rk   rn   r   r   r  r  rz   Basicr  r   r  r  r  r   r  r   r  r   r;  r   )r
  r  arg_typeraw_argr  re  rl   allocation_sizerm   r  r   rn   r   s   `            rV   r   /PythonWrapperCodegen.generate_example_arg_value8  s<   h,,'2#3#344"--/88:**3/%%))#..**3/* X* &d&F&F%GH00A50?L&Fse$LL?77##66s||~FDgg..AA++C0O WW%%889IJF^^%FMMOEWW%%778H8O8OPF-dV2fXSE7RTU[T\\^_n^oopqE&&00H:S1HI'2#3#344 :: %) ;  **44zUG5LMO%++..*S/2R2R#s##$//)J?!#//nnagg&&CCCgg&&CCCHqww''99#>??c3t455s8OT""tyy ZVY ZZ[[\]]%(9$s)&EFFrX   c                   ^  [        U[        5      (       a!  SSR                  U 4S jU 5       5      -   S-   $ [        U5      $ )Nr  r  c              3  F   >#    U  H  nTR                  U5      v   M     g 7frJ   )_grid_dim_str)r   r|   r
  s     rV   r   5PythonWrapperCodegen._grid_dim_str.<locals>.<genexpr>z  s     R\T 2 24 8 8\s   !r  )r[   r  r   r  )r
  grid_per_dims   ` rV   r  "PythonWrapperCodegen._grid_dim_strw  s?    lD))diiR\RRRUXX &&rX   )r  r   r  r  r  r  r  r   c                  U R                   R                  U Vs0 s H:  n[        U[        5      (       d  M  U[        R
                  R                  U5      _M<     sn5        U=(       d    [        R
                  R                  5       nU R                  [        U UUUUUUUU	U[        R
                  R                  U
S95        gs  snf )z
Generates kernel call code.

triton: Defines whether the backend uses Triton for codegen. Otherwise it uses the CUDA language when gpu=True,
        and C++ when gpu=False.
)r  r  r  r  r  r   r  r  r  r  r   N)r  rg  r[   r   r5   rK   try_get_bufferr`  r   r  r   )r
  r  r  r  r   r  r  r  r  r  r   r  s               rV   generate_kernel_call)PythonWrapperCodegen.generate_kernel_call  s    . 	## %$Cc3' 1QWW++C00$	
 @177>>@'#!!#'+77<<%9#	
s
   C$Cr  )	r  r   r  r  r  r  r  r  r   c       	   
     J  ^ ^^ U=(       d    [         R                  R                  5       nU(       d  UR                  S:w  a  UR                  S:X  a"  T R	                  T R                  TU5      5        g UR                  S:X  a%  T R	                  T R                  T S3U5      5        g [        SUR                   S35      eT R                  U5      nSR                  U5      n[        R                  T UR                  U
5      nU(       d$  SU S	3nT R	                  T S
T SU SU S	35        g T R                  5         [        R                  R                  (       Ga  TT R                   ;  Ga  Ub  [#        U5      [#        U5      :X  d   S5       eS mU(       aI  [         R                  R$                  (       a*  [         R                  R$                  R'                  US 5      mSUU 4S jjnUU 4S jn/ nUc)  Ub   S5       eS /[#        U5      -  nS /[#        U5      -  nO[#        U5      [#        U5      :X  d   S5       e0 n[)        [+        X%Xg5      5       GH  u  nu  nnnnS n[-        U[.        5      (       a#  S[/        U5      ;   a  UR1                  S5      u  nnS nT(       a  UT;   a  T R3                  TU   5      nU(       aQ  Un[-        U[4        5      (       d9  [7        U[8        R:                  5      (       d  [-        U[<        5      (       a  UUU'   OUS:X  a  U" XgUU5      (       a  UU   nO[-        U[4        5      (       ag  [>        R@                  " SU5      (       a  UnO6UT RB                  ;  a  T RE                  UUU5      nOT RB                  U   S   nUT4T RB                  U'   OT RE                  UUU5      n[-        U[.        5      (       a  [G        U5      (       a  US-  nURI                  Uc  UOU SU 35        GM     T RJ                  R	                  S[         R                  RL                  RO                  UR                  5       S35        T RJ                  RQ                  5         T RJ                  R	                  T SSR                  U5       SU S	35        T RJ                  RS                  5         T RJ                  R	                  [U        SUS5      5        T R                   RW                  T5        [         R                  RX                  (       a  g [         R                  RZ                  R\                  nUR_                  UTUS 5        U   T R	                  T SU SU S	35        S S S 5        T R                  5         g ! , (       d  f       N= f)Ncudar4  mpsz.generated_kernelzdevice z nyir  z	c_void_p(r   r   r  z$call_args and arg_types do not matchc                    > TR                   R                  5        V Vs/ s H  u  pUT:X  d  M  U PM     nn nU(       a  SSR                  U5       S3$ gs  snn f )zAfter all the autotune kernel calls have been written (i.e.
self.kernel_autotune_example_args is complete), returns a deletion call
for all autotune example tensors that are unnecessary after kernel_name
is called.del r  r  r  )r  r  r   )r  kntensors_to_deleter  r
  s      rV   get_autotune_deletion_callUPythonWrapperCodegen._generate_kernel_call_helper.<locals>.get_autotune_deletion_call  se     '+&G&G&N&N&P%&P
[( &P " %
 %!$)),=">!?rBB%s
   AAc                  > X   nXC;   a  g[        [        X5      5       H  u  nu  pgXR:X  d  [        U[        5      (       d  M$  SnT(       a  UT;   a  TR	                  TU   5      nUS:X  a  MO   UR                  5       n	[        U	R                  5       H  u  pX:X  d  M  U SU
 S3X4'       g   M     g! [         a     M  f = f)zWe try to infer raw_arg (i.e. raw_args[idx]) from remaining raw_args.
This is particularly useful for jagged cases, where the dimension is often
being passed in as an input.Tr  z.shape[r  F)r  r   r[   r%   rO  rk   rl   r;  )r  r  r  reused_args
target_argr  raw_keyr  triton_inputr_   r  rg  autotune_argsr
  s               rV   infer_arg_by_inputsNPythonWrapperCodegen._generate_kernel_call_helper.<locals>.infer_arg_by_inputs  s    
 &]
,-6s87N-O)A)xz'6'B'B #%L$M)A'+'E'E)'2( $r) 	!!(!3!3!5&/&<FC =IN'RUQVVW:X 7'+ '= .P, 	 / ! !!s   0-B5!B5.B55
CCzkeys are not None but args arez#call_args and raw_args do not matchr  r  z^(workspace|semaphore)r   r  r  r  z.run(z	, stream=z
<del_call>r  )0r5   rK   r`  r   r   r  rZ  r  r   rE  r  r(  rC  r"   r   r   r   r   autotuning_mappingr   r  r   r[   r   splitrO  r  r  rz   r  r  r  matchr  r   r@   rq   r   r  r  rM  rx  r*   r   r  r  r  r  )r
  r  r  r  r   r  r  r  r  r  r  r   call_args_strstream_name
stream_ptrr  r  all_argsr  r  r  r  r  r  r   r  arg_strr  r  s   ``                          @rV   r  1PythonWrapperCodegen._generate_kernel_call_helper  s    @177>>@&++/{{e#t44[)LM  %))[M9J*KYW
  #WV[[M#>?? 77	B		-0*??&,,

 $[M3JNN-qQ}oR
|1M %%' MM2224#=#== (S^s9~-M 6M !M#(B(B ! : : > >($! B H'I)II' 6C	N2 6C	N28}I6 96 K8AI(=944C7G c3''C3s8O"yy~HC.2 W%=#'#A#A%g.$L  *G%h<<"8U[[99%c?;;/6G,]':;( (
 *'2G+66 xx 93??"%D$E$EE"&"A"A7# #'"C"CC"H"K>E{=SD55c:"==c8WUGc3'',DS,I,Iy(G3;se1WI<NOY9^ &&00**77EFaH &&002&&00-uTYYx%8$9;-qQ &&224&&00 /I<X &&**;7ww"" !" 4 4 B B..y+yRVW"NNk]%i}TUVW #%%' #"s    V
V"c                :    U R                   R                  U5        g rJ   )r  rq   r
  r   s     rV   r   PythonWrapperCodegen.writelinep  s    

$rX   c                8    U H  nU R                  U5        M     g rJ   r#  )r
  r  r   s      rV   r  PythonWrapperCodegen.writeliness  s    DNN4  rX   c                L    U R                   R                  [        U5      5        g rJ   )r  rq   r0   )r
  ctxs     rV   r/  "PythonWrapperCodegen.enter_contextw  s    

+c*+rX   c                @  ^ ^	 SSK Jn  U" 5       (       a  SS Kn[        U[        5      (       a  [        UR                  R                  5      $ [        U[        R                  5      (       a  [        U5      $ [        U[        [        45      (       aB  [        R                   " S S5      5       m	[        [        U5      " U	U 4S jU 5       5      5      $ [        U[         R"                  R$                  5      (       a  ['        U5      $ [        U[(        R*                  [(        R,                  [.        45      (       a  UR1                  5       $ U" 5       (       a0  [        UWR2                  R4                  5      (       a  [        U5      $ [        U[(        R6                  5      (       a  UR1                  5       $ [9        [        U5      5      (       aE  [;        U5      u  pVUR=                  5        H"  u  pxU[>        R@                  RB                  U'   M$     U$ [        U5      $ )Nr   )has_triton_packagec                  &    \ rS rSr% S\S'   S rSrg)1PythonWrapperCodegen.val_to_arg_str.<locals>.Shimi  r   refc                    U R                   $ rJ   )r  r	  s    rV   __repr__:PythonWrapperCodegen.val_to_arg_str.<locals>.Shim.__repr__  s    88OrX   r   N)r   r   r  r  r   r  r  r   rX   rV   Shimr    s    $rX   r  c              3  \   >#    U  H!  nT" [         R                  TU5      5      v   M#     g 7frJ   r  )r   r  r  r
  s     rV   r   6PythonWrapperCodegen.val_to_arg_str.<locals>.<genexpr>  s)     VTUq1@@qIJJTUs   ),)"torch.utils._tritonr  r   r[   r   r  rS   r  rz   r   r   r  r  	dataclassrO  r   r  _ops
OpOverloadr   r#   rj   
MutableBoxr&   r  languager   rd  r   r   r  r5   rK   opaque_value_type_classes)
r
  rg  type_r  r   obj_repropaque_typesnr  r  s
   `        @rV   r  #PythonWrapperCodegen.val_to_arg_strz  s   :a""%%5::&&8OE4=))""$ $ #$ QVTUVV  5::0011&q))BIIr}}oFGG&&((!!jFOO4I4I&J&J7N2,,--&&((!$q'**%8%;"H$**,7811!4 -O7NrX   c           	     Z   UR                  5       nUR                  5       n[        UR                  5       5      n[        [        R
                  R                  U5      5      n[        UR                  5       5      nUR                  5       nU R                  UR                  5       X#XFXW5      $ rJ   )r  rP   r   rC  r5   rK   r  rD  get_is_pinnedr  rM   )r
  rG  r  r   rJ  allocation_shaperm   	is_pinneds           rV   r@  +PythonWrapperCodegen.make_buffer_allocation  s    ""$  "foo'( !<!<V!DEv((*+((*	##OOve=M
 	
rX   c                |    Sn[         R                  R                  (       d  U R                  R	                  USS9  g g )Nzi
            from torch._inductor.runtime.debug_utils import check_memory_step, track_tensor
            Tr   )r5   rK   r  r  r   rA  s     rV   "write_memory_track_allocation_once7PythonWrapperCodegen.write_memory_track_allocation_once  s4    
 ww""LL
$7 #rX   c                   Uc  UnU R                  U5      nU R                  U5      n	U R                  U5      n
[        R                  R                  R                  R
                  (       a  U SU	 SU
 SU SUR                   SU S3nOmUR                  S:X  a  U(       a  U SU	 SU
 SU S	3nOGUR                  S
;   a  U SUR                   SU	 SU
 SU S	3
nOU SU	 SU
 SUR                   SU S	3
nX:w  a  USU SU
 S	3-   nU$ )Nz = tracked_empty_strided(r  z, dtype=r  z	', name='r~  r4  z = empty_strided_cpu_pinned(r   )r4  r  xpumtiaz = empty_strided_r  z = empty_strided(r  z.as_strided()r   r  r  r"   r  r   r   )r
  r   r  r   rJ  rm   r  r  rG  codegen_allocation_shape_tuplecodegen_stride_tupler  s               rV   r  $PythonWrapperCodegen.make_allocation  s    #$"==eD)-)H)H*
&  $>>vF??!!..EE&112"'( )  !;;- (b"  [[E!i&412"'('  [[:: &)&++a12"'('  &)12"'( )!;;-yq:  @,':&;2>R=SSTUUC
rX   c                8    U R                  [        U5      5        g rJ   )r   rc  r  s     rV   make_comment!PythonWrapperCodegen.make_comment  s    {4()rX   c           	     `    U R                    U SU U R                   SU R                   SU 3	$ )Nr      )r  r  r  )r
  new_nameold_namer  s       rV   make_tensor_alias&PythonWrapperCodegen.make_tensor_alias  s6    ,,zXJt{{m2dll^STU\T]^^rX   c                (    SUR                  5        3$ )Nr  )rM   )r
  rG  s     rV   r  %PythonWrapperCodegen.make_buffer_free  s    foo'())rX   c                8    SSR                  S U 5       5       3$ )Nr  r  c              3  $   #    U  H  ov   M     g 7frJ   r   )r   r   s     rV   r   :PythonWrapperCodegen.make_free_by_names.<locals>.<genexpr>  s     >s   )r   )r
  names_to_dels     rV   make_free_by_names'PythonWrapperCodegen.make_free_by_names  s    dii>>>?@@rX   c           	     `    U R                    U SU U U R                   SU R                   S3	$ )Nr   r   reuse)r  r  r  )r
  r  r  del_lines       rV   codegen_exact_buffer_reuse/PythonWrapperCodegen.codegen_exact_buffer_reuse  s@    ../zXJxjQUQ\Q\P]]_`d`l`l_mmsttrX   c                R    Ub$  U R                  U R                   SU SU 35        g g )Nz [Provenance debug handles] r  )r   r  )r
  r  debug_handles      rV   write_provenance_debug_handle2PythonWrapperCodegen.write_provenance_debug_handle  s4    
 #NN<<. <[M<.Y $rX   c                   UR                  5       UR                  5       :X  d   eUR                  5       nUR                  5       nSnU[        R                  R	                  5       ;  a  U(       a  SU R                  U5       3nUR                  5       UR                  5       :X  a4  UR                  5       UR                  5       :X  a  U R                  XEU5      $ U R                  XR                  5       UR                  5       SU R                  R                  5      nU R                   U SU U SU R                   S3$ )N;z; r   r   r  r&  )rP   rM   r5   rK   r  r  rC  rD  r(  r  r  r   r  r  )r
  r%  r$  rm  r  r  r'  reinterpret_views           rV   rq  &PythonWrapperCodegen.make_buffer_reuse  s   }}#--/111<<><<>1773355*D11#678H<<>S\\^+0@CNNDT0T228xPP88!11d6G6G6Q6Q
 ,,z-=,>xj4<<.X^__rX   c                    U R                  [        UU R                   U SUR                  5        U R                   SU R
                   S35      5        g )Nr   r  z alias)r   r9   r  r  r  r  )r
  r   rf  s      rV   rg  0PythonWrapperCodegen.codegen_deferred_allocation  sS    <<.c$*@*@*B)CDKK=PRSWS_S_R``fg	
rX   c                  ^ UR                  5       nU[        R                  R                  ;   dM  X R                  ;   d>  [        U[        R                  [        R                  [        R                  45      (       a  g U R                  R                  U5        [        UR                  5       [        R                  [        R                  45      (       a  UR                  5       (       d  g UR                  5       n[        U[        R                   5      (       a  g [        U[        R"                  5      (       a  g [        U[        R$                  5      (       Ga?  [        UR&                  [        R(                  5      (       d*   S[+        UR&                  5       SUR&                   35       eUR&                  R,                  n[        U[        R.                  5      (       d   [+        U5      5       eUR,                  n[        U[        R0                  [        R(                  45      (       d   [+        U5      5       e[        U[        R(                  5      (       a  SU4S jjmT" U5      nU R3                  U5        U R5                  [7        XX5      5        g [        U[        R8                  5      (       a  U R5                  [;        XSS95        g U R5                  [;        X5      5        g )Nzunexpected r   c                2  > [        U [        R                  5      (       a  T" U R                  5       5      $ [        U [        R                  5      (       a  T" U R
                  5      $ [        U [        R                  5      (       d   [        U 5      5       eU $ rJ   )r[   r#   BaseViewunwrap_viewr  rr   rj   r   )targetunwrap_viewss    rV   r9  =PythonWrapperCodegen.codegen_allocation.<locals>.unwrap_views:  sm    !&"++66+F,>,>,@AA!&"--88+FKK88%fbii88F$v,F8!MrX   Tr3  )r   	ir.Buffer)rM   r5   rK   r  r  r[   r#   DonatedBufferSubgraphBufferInputBufferr   get_defining_opExternKernelAllocMultiOutputshould_allocaterZ   MutationLayoutSHOULDREMOVEr  re  rf  r&   r   rr   rp   rj   codegen_allocationr   r`  r\   r"  )r
  rG  r   r_   boxinput_bufferr9  s         @rV   rD  'PythonWrapperCodegen.codegen_allocation  s$     AGG+++~~%&2#3#3R5F5F"WXX4 &&(%%r~~6  **,,'')fb;;<<fbmm,,fb0011fkk2+=+=>> d6;;/06;;-@> ++""Cc2==11<49<188LlRYY8J8J,KLL dO L ,(:(:;;"  ,L9##L1NN?4vNOfb1122NN<$GH|D12rX   c                   UR                  5       n[        U[        R                  [        R                  45      (       a  U R                  [        X5      5        g [        UR                  5       [        R                  5      (       a  U R                  [        XSS95        g U R                  U5      (       d  g U R                  R                  U5        U R                  [        X5      5        g )NTr3  )rM   r[   r#   r>  rb  r   r  rZ   r\   r4  	can_reuser  r   )r
  rG  r   s      rV   codegen_free!PythonWrapperCodegen.codegen_freeM  s      fr~~r/A/ABCCNN8D12f,,.0C0CDD NN.tNO~~f%%

t*489rX   c                2   UR                  5       nU[        R                  R                  ;   =(       d    U[        R                  R                  ;   =(       a:    [        [        R                  R                  U   [        R                  5      (       + =(       dz    U[        R                  R                  ;   =(       dV    U[        R                  R                  ;   =(       d2    U[        R                  R                  ;   =(       d    X0R                  ;   (       + $ rJ   )rM   r5   rK   r  r  r[   graph_inputs_originalr#   r<  r  r  never_reuse_buffersr  )r
  rF  output_bufferr   s       rV   rI  PythonWrapperCodegen.can_reusea  s    $$&AGG+++ 
",,, "GG11$79I9I 
" qww(((
" qww222
" qww222
" zz!
 	
rX   c                    UR                  5       U R                  ;   =(       a.    U R                  UR                  5          UR                  5       :H  $ rJ   )rM   r  )r
  rG  reused_buffers      rV   	did_reusePythonWrapperCodegen.did_reuseq  sC     OO, KFOO-.-2H2H2JJ	
rX   c                z   [        X5      (       d   eU R                  U5        U R                  R                  UR	                  5       5        U R
                  R                  UR	                  5       5        UR	                  5       U R                  UR	                  5       '   U R                  [        XU5      5        g rJ   )	rh   rD  r  r   rM   r  r  r   r7  )r
  rF  rO  s      rV   codegen_inplace_reuse*PythonWrapperCodegen.codegen_inplace_reusey  s    $\AAAA-

|,,./=11340<0E0E0GM**,-y]CDrX   c                    [        U5      nX R                  ;   a  U$ U R                  R                  U5        U R                  U-   $ rJ   )r   r  r   r  )r
  r   r   s      rV   codegen_unbacked_symbol_decl1PythonWrapperCodegen.codegen_unbacked_symbol_decl  sA    6{---K &&**40<<$&&rX   c                    [        [        R                  R                  R                  U5      nU R                  [        XX#5      5        g rJ   )r   r5   rK   rQ   	shape_envr   r  )r
  r  r  r  s       rV   (codegen_unbacked_symbol_defs_for_outputs=PythonWrapperCodegen.codegen_unbacked_symbol_defs_for_outputs  s=     6GG&&(9
 	"4gQ	
rX   c                   ^^^^ U(       d  g UR                  5        HM  u  nmSU4S jjmUUUU4S jnU R                  U R                  U5       SU" 5        U R                   35        MO     g )Nc                  > US:X  a  U $ [        U5      S:  ai  [        US   [        5      (       aQ  [        US   [        R                  5      (       a/  T" U  SUS   R
                   SUS   R                   S3USS  5      $ [        US   [        5      (       a  T" U  SUS   R
                   S3USS  5      $ [        US   [        R                  5      (       a^  [        R                  R                  (       a   T" S	US   R                   S
U  S3USS  5      $ T" U  SUS   R                   S3USS  5      $ [        US   [        5      (       a  T" U  SUS   R                   S3USS  5      $ [        SU 35      e)Nr   r    r   r6   r   r  r   r  z	std::get<z>(r  r  z.__floordiv__(r  )r   r[   r   pytreeSequenceKeyr   r  r5   rK   r  r   r  r  )r  r  gos     rV   rc  JPythonWrapperCodegen._codegen_unbacked_symbol_defs_for_outputs.<locals>.go  s   b=K LA%"71:}=="71:v/A/ABB&'!*//!2!GAJNN3C1Ewqr{   
M::a
'8;WQR[II
F,>,>?? 77.. Ywqz~~&6ba@'!"+N  4&'!*..)9 ;WQR[I
  
K88 nWQZ5G5G4HJGTUTVKXX(+@	)JKKrX   c                   > [         R                  R                  (       a  [        T5      S:X  a`  TS   n T" TS   R	                  5       [        U [        R                  5      (       a"  [        U R                  5      S:w  a	  TSS  5      $ T5      $ [        TS   [        R                  5      (       d   eT" TTS   R                     R	                  5       TSS  5      $ T" TT5      $ )Nr6   r   )r5   rK   r  r   rM   r[   r#   rA  r  ra  rb  r  )r  rc  r  r  r  s    rV   go_outerPPythonWrapperCodegen._codegen_unbacked_symbol_defs_for_outputs.<locals>.go_outer  s    77&&
 7|q(%aj  "#AJ//1)#r~~>>3s{{CSWXCX $ABK   ")	    *'!*f6H6HIIII!''!*.."9"B"B"DgabkRRk733rX   r   )r  r   r  zpytree.KeyPath)r  r   rY  r  )r
  r  r  r  rg  rf  rc  r  s    ``   @@rV   r  >PythonWrapperCodegen._codegen_unbacked_symbol_defs_for_outputs  sg     ! ,113JAw
L<4 4. NN44Q78HJ<}Uu 4rX   c                  ^ ^^^ UU U4S jnUU U4S jn T R                  TR                  5        T R                  T R                   STR                   35        U" 5         [
        R                  n[
        R                  " TR                  5         TR                  R                  US9  S S S 5        U" 5         T R                  5         g ! , (       d  f       N&= f! T R                  5         f = f)Nc                   > [        TR                  R                  5      [        T5      :X  d   e[        TR                  R                  T5       H3  u  pTR	                  TR
                   U  SU TR                   35        M5     g r  )r   rK   r  r   r   r  r  )inner_inputouter_inputouter_inputsr
  subgraphs     rV   _codegen_subgraph_prefixSPythonWrapperCodegen.codegen_subgraph_by_inlining.<locals>._codegen_subgraph_prefix  sr    x~~223s<7HHHH,/++\-( ||n[M[M$++O-rX   c                   > [        TR                  R                  5      [        T5      :X  d   e[        TR                  R                  T5       H5  u  pTR	                  U SU R                  5        TR                   35        M7     g r  )r   rK   r
  r   r   r  r  )inner_outputouter_outputouter_outputsr
  rn  s     rV   _codegen_subgraph_suffixSPythonWrapperCodegen.codegen_subgraph_by_inlining.<locals>._codegen_subgraph_suffix  st    x~~334M8JJJJ.1,,m/* #nC(F(F(H'I$++W/rX    subgraph: )parent_graph)	rL  rK   r   r  r   r5   set_graph_handlercodegen_subgraphrw  )r
  rn  rm  rt  ro  ru  rx  s   ````   rV   codegen_subgraph_by_inlining1PythonWrapperCodegen.codegen_subgraph_by_inlining  s    			'%%hnn5NNdll^;x}}oFG$&77L$$X^^4//!- 0  5 %&$$& 54 $$&s$   A<C C,C 
CC C/c           	        UR                   nUR                  n[        UR                  5       5      UR                   Vs/ s H  oUR
                  PM     sn-   nSR                  U5      [        U5      S:X  a  SOS-   nU Vs/ s H  oR                  5       PM     n	nSR                  U	5      [        U5      S:X  a  SOS-   n
U R                  SU SU S35        UR                  5        VVs/ s H  u  pU(       d  M  UPM     nnnU(       a#  U R                  SSR                  U5       35        U R                  S	U
 S
U SU S35        U R                  SU S35        gs  snf s  snf s  snnf )z'Generate code to call a graph partitionr  r6   ro  r  	partition	_args = [r  r  r  z) = self.partitions[z](partition_args)zdel partition_argsN)input_deallocationoutput_nodesr  r  symbol_inputsr   r   r   rM   r   r  )r
  partition_idr  r  r  symbol_inputrp  r  rS   output_namesr  r   
deallocater"  s                 rV   codegen_partition_call+PythonWrapperCodegen.codegen_partition_call  sn    2DD+88-22452F2T2T9
2T,2T9
 
 ;'#k2Ba2G3RP4@ALDLA))L)C4E4JSPRS 	<.	&CD *<)A)A)C
)C%TzD)C 	 
 NNT$))L"9!:;< 	y,\N+l^SYZ	
 	|nE:;-9
 B
s   E"E'-E,>E,c                V    [        U5       Vs/ s H  nSU 3PM
     snU l        g s  snf )N
partition_)rj  r  )r
  num_partitionsr  s      rV   set_all_partition_names,PythonWrapperCodegen.set_all_partition_names1  s*    BGBW#XBW3j$6BW#X #Xs   &c           	     t   SR                  U5      [        U5      S:X  a  SOS-   nSR                  U5      [        U5      S:X  a  SOS-   nU R                  UR                  R                   SU S35        U R                  SU SUR                  R                   SUR                  R                   S	35        g )
Nr  r6   ro  r  r  r  r  z) = r  )r   r   r   rK   r   )r
  rn  rm  outer_flattened_outputsouter_output_namesouter_input_namess         rV   ,codegen_subgraph_call_with_flattened_outputsAPythonWrapperCodegen.codegen_subgraph_call_with_flattened_outputs4  s     "YY'>?./14C"
 !IIl3|$)Cr
 	(..--.i8I7J!LM 	"#4(;(;'<Ahnn>Q>Q=RRXY	
rX   c                x   SR                  U5      [        U5      S:X  a  SOS-   nU R                  UR                  R                   SU S35        [
        R                  R                  R                  5         U R                  U SUR                  R                   SUR                  R                   S	35        g )
Nr  r6   ro  r  r  r  r   r  r  )r   r   r   rK   r   r5   r   free_buffers)r
  rn  rm  outer_buffer_namer  s        rV   codegen_subgraph_call*PythonWrapperCodegen.codegen_subgraph_callF  s     IIl3|$)Cr
 	(..--.i8I7J!LM 	
&&( 	 !X^^%8%8$98>>;N;N:OvV	
rX   c                   U R                  UR                  5        U R                  S5        U R                  U R                   SUR                   35        [
        R                  nUR                  UR                  l        UR                  UR                  l        UR                  R                  U R                  ;  a  [
        R                  " UR                  5         [        R                  " SS5         UR                  R                  5       u  p4S S S 5        S S S 5        UR                  R                  nU R                  R                  U5        U R                  UW5        g g ! , (       d  f       N[= f! , (       d  f       Nd= f)Nr  rw  ru  F)rL  rK   r  r  r   r5   r  
fx_wrapperr  ry  r"   patchrP  r   r(  )r
  rn  rx  r'  r  r  s         rV   codegen_subgraph_common,PythonWrapperCodegen.codegen_subgraph_commonW  s   !!(..1"T\\N+hmm_EFww%1%=%="$0$;$;!>>d&F&FF $$X^^4\\"3U;'/~~'='='?$M < 5
 %NN//M,,00?,,]MJ G
 <; 54s$   !E<9E+E<+
E9	5E<<
F
c                J    U R                  U5        U R                  XU5        g rJ   )r  r  )r
  rn  rm  r  s       rV   'codegen_subgraph_with_flattened_outputs<PythonWrapperCodegen.codegen_subgraph_with_flattened_outputsl  s&     	$$X.99$;	
rX   c                J    U R                  U5        U R                  XU5        g rJ   )r  r  )r
  rn  rm  r  s       rV   rz  %PythonWrapperCodegen.codegen_subgrapht  s#     	$$X.""8;LMrX   c                   UR                  5       nU R                  U S[        UR                  5       35        UR                   Vs/ s H  o3R                  5       PM     nn[        R                  R                  (       aP  [        [        UR                  5      5       Vs/ s H
  oR SU S3PM     nnU R                  UR                  XF5        g U R                  UR                  XB5        g s  snf s  snf )N = [None] * r  r  )rM   r   r   r  r  r  r5   rK   r  rj  r{  rn  rz  )r
  invoke_subgraphr   re  rm  r  rt  s          rV   codegen_invoke_subgraph,PythonWrapperCodegen.codegen_invoke_subgraphz  s    '')$|C0G0G,H+IJK;J;Q;QR;QC--/;QR77(-c/2I2I.J(K(K1&!A(K   --((, !!/":":LO Ss   C/"C4c                   UR                  5       nUR                   Vs/ s H  o3R                  5       PM     nnUR                  R                  5       n[	        UR                  [
        R                  5      (       d  U S3nU R                  U S[        UR                  5       35        U R                  SU S35        U R                  [        XR                  R                  5      5        [        R                  R                  (       aP  [        [        UR                  5      5       Vs/ s H
  ob SU S3PM     nnU R!                  UR                  XG5        OU R#                  UR                  XB5        U R                  [%        U 5      5        U R                  S5        U R                  [        XR&                  R                  5      5        [        R                  R                  (       aP  [        [        UR                  5      5       Vs/ s H
  ob SU S3PM     nnU R!                  UR&                  XG5        OU R#                  UR&                  XB5        U R                  [%        U 5      5        g s  snf s  snf s  snf )Nr  r  r   r  r  r  zelse:)rM   operandsr  	predicater[   r#   ShapeAsConstantBufferr   r   r  rD  true_subgraphrK   r5   r  rj  r{  rz  rr  false_subgraph)r
  conditionalr   re  rm  r  r  rt  s           rV   codegen_conditional(PythonWrapperCodegen.codegen_conditional  s   ##%;F;O;OP;OC--/;OP));;=	+//1I1IJJ$+W-I$|C0C0C,D+EFGYKq)*(/H/H/N/NOP775:3{?R?R;S5TU5TvQqc^5TMU--))< !!+";";\P'-.w(/I/I/O/OPQ775:3{?R?R;S5TU5TvQqc^5TMU--**L !!+"<"<lQ'-.9 Q V Vs   I8-I=Jc                ,	  ^  U 4S jnUR                  5       nUR                   Vs/ s H  oUR                  5       PM     nnUR                   Vs/ s H  oUR                  5       PM     nn[	        U5      nT R                  U S[	        U5       35        U(       a   T R                  U S[	        U5       S35        [        U5       H  u  pT R                  U SU	 SU
 35        M      / [        [	        U5      5       V	s/ s H
  o SU	 S3PM     sn	QUQnU S3/n[        U5      nUS	[	        U5       nU" UR                  X5        T R                  S
US    35        T R                  S5        U(       ax  [        U5       Hh  u  pT R                  [        T UR                  R                  5      5        T R                  U SU	 SU S35        T R                  [        T 5      5        Mj     Ow[        U5       Hh  u  pT R                  [        T UR                  R                  5      5        T R                  U SU	 SU S35        T R                  [        T 5      5        Mj     T R                  S5        T R                  [        T UR                  R                  5      5        U" UR                  X5        T R                  [        T 5      5        U(       a  T R                  [        T UR                  R                  5      5        [        [	        U5      5       H"  n	T R                  U SX-    SU SU	 S35        M$     T R                  [        T 5      5        T R                  [        T UR                  R                  5      5        U" UR                  X5        T R                  [        T 5      5        T R                  SUS    35        U(       a  T R                  S5        [        [	        U5      5       H  n	T R                  SU SX-    S35        T R                  [        T UR                  R                  5      5        T R                  U SU	 SU SX-    S35        T R                  [        T 5      5        M     g	g	s  snf s  snf s  sn	f )z1while_loop is codegened as a host side while_loopc                   > [         R                  R                  (       a  TR                  XU5        gTR	                  XU5        g)z3Helper method to deduplicate subgraph codegen logicN)r5   rK   r  r{  r  )rn  rm  rt  r
  s      rV   rz  APythonWrapperCodegen.codegen_while_loop.<locals>.codegen_subgraph  s3    ww11(-X<<MrX   r  z.extend([[] for _ in range(z)])r  z] = r  _cond_resultNzshould_loop = r   zif not should_loop:z.unsqueeze(0).clone()r  zwhile should_loop:z	].append(z])z    should_loop = z%# Stack outputs after loop completionzif len(z]) > 0:z] = torch.stack(z	], dim=0))rM   carried_inputsr  additional_inputsr   r   r  rj  r  cond_subgraphrD  body_subgraphrK   rr  )r
  
while_loopstack_outputrz  r   re  outer_carried_inputsouter_additional_inputs
ckp_offsetr  inpcond_outer_inputscond_outer_outputsbody_outer_inputsbody_outer_outputscarried_inputs   `               rV   codegen_while_loop'PythonWrapperCodegen.codegen_while_loop  s]   	 ""$/9/H/H 
/H!!#/H 	  
 0:/K/K#
/K!!#/K 	  #
 -.
$|C0D,E+FGHNN&3C8L4M3NcR   45FANNdV1QCtC512 6
&+C0D,E&FG&Fas!n&FG
$
 "&l34 
 //J5I1JK$$&7	
 	(:1(='>?@,-$-.B$C 0z7O7O7U7UVW$q4>STU/56 %D
 %..B$C 0z7O7O7U7UVW$q4hGH/56 %D
 	+,(z/G/G/M/MNO$$&7	
 	'-. NN,T:3K3K3Q3QRS3345$q(8	$q2NO 6NN+D12 	(z/G/G/M/MNO$$&7	
 	'-.+,>q,A+BCD NNBC3345a/?wGH0z7O7O7U7UVWfAaS 0a7GyQ /56 6 Q 
#
  Hs   RR?Rc                     [        U SS 5      (       a  g [        U [        5      (       a  U $ [        R                  R
                  R                  U 5      nUc  U$ [        U5      $ ! [         a     g f = f)Nr  )r  r[   r   r5   rK   
_shape_env_maybe_evaluate_staticrY  )r   r  s     rV   statically_known_int_or_none1PythonWrapperCodegen.statically_known_int_or_none
  sl    	q.$// !S!!''$$;;A>C{
s8O 		s!   A% A% -A% 
A% %
A21A2c                r    / nU  H.  n[         R                  U5      nUc    g UR                  U5        M0     U$ rJ   )rE  r  rq   )lstr  r   r\  s       rV   %statically_known_list_of_ints_or_none:PythonWrapperCodegen.statically_known_list_of_ints_or_none  s<    A&CCAFC{MM#	 
 rX   c                0    [         R                  U 5      S L$ rJ   )rE  r  )r  s    rV    is_statically_known_list_of_ints5PythonWrapperCodegen.is_statically_known_list_of_ints$  s     !FFsKSWW	
rX   c                H    [         R                  U R                  5       5      $ rJ   )rE  r  rC  rG  s    rV   r8  4PythonWrapperCodegen.static_shape_for_buffer_or_none*  s    #IIOO
 	
rX   c                0    [         R                  U 5      S L$ rJ   )rE  r8  r  s    rV   !can_prove_buffer_has_static_shape6PythonWrapperCodegen.can_prove_buffer_has_static_shape0  s    #CCFKSWWWrX   c                    g rJ   r   )r
  r  node_schedules      rV   write_kernel_context_guard/PythonWrapperCodegen.write_kernel_context_guard4  s    
 	rX   c                    g)z,
Mark the beginning of kernel context guard
Nr   r	  s    rV    write_kernel_context_guard_begin5PythonWrapperCodegen.write_kernel_context_guard_begin;       	rX   c                    g)z&
Mark the end of kernel context guard
Nr   r	  s    rV   write_kernel_context_guard_end3PythonWrapperCodegen.write_kernel_context_guard_endC  r  rX   )/r  r  r  r  r  r  r  r   r  r  r  r  rH  r  r  r  r  r  r,  r  r  r  r   r  r  r   r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r   rJ   )r
  r   r  r   r  Optional[PythonWrapperCodegen]r  $Optional[ir.GraphPartitionSignature]rU  )r   r   r  r   r   r5  )r  r   )rN  TritonMetaParamsr   r   r   r  )r   z>dict[str, Union[ir.TensorBox, ir.TorchBindObject, sympy.Expr]]r   zlist[IRNode])rp  r  r   r   )r  r   r  r   r   r   r  )r  r   r   r5  )r  r  r   r5  r  r-   r   r5  )rS   zir.FallbackKernelr   r5  )rS   r  )rS   r  r   r5  )r  r   r  r   r  r   r  r  r  r   r  zOptional[OrderedSet[str]]r   r5  )F)rS   r  )rS   r  r   r5  )r  r   r  r   r  zCallable[[], Sequence[str]]r  z<Union[torch._ops.OpOverload, torch._ops.HigherOrderOperator]r  r~  r  zSequence[ir.Buffer]r   r5  )r$  Callable[..., None]r   zIterator[Callable[..., None]])r  r   )r   r   r   r  r  r  )r  r  )r   r   rR   r   r   r   )r   r   r   r   )r  r   r   r   r(  r   r   r   )rJ  zSequence[Expr]r   r   )r   ztuple[str, list[str]])r   r  r   r   )r  zUnion[bool, str])rS   zir.MultiOutput)NTN)
r  r   r  r   r  r   r  r   r  r   )r  r   r  r   r  r   )r   r   )r   z"list[list[Union[int, sympy.Expr]]])r  r   r  r   )r  r  rK   rD   r   r5  )r  r;   )r  r   )rG  r!  )NF)r  )rG  r  )r"  r  )r  r   r  r   r'  r   )r+  r  )r%  r!  r$  r!  rm  r   )r   r   rf  zir.ReinterpretViewr   r5  rG  r;  )rF  r;  rO  r;  )r  r   r  r   r  r  r   r5  )r  r   r  zir.GraphPartitionSignature)r  r   )r  r   r  z0Union[Sequence[BaseSchedulerNode], ExternKernel])r   r   r  r  rQ  r  r   r  ra  r  r  r  r  r7  r  r)   rC  rH  rK  rQ  rV  rY  r]  rT  rf  ri  rl  rr  rx  r{  r  r  r  r  rL  rw  rG  rt  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r   r   contextmanagerr&  r*  r  r9  rc  rp  r3  r  r  r  r;  r  r  r  r  r   rG  r  r  r  r  r  r  r  rl  r  r<  r  r"  r  r(  classmethodr-  r~  r  r  r  r  r  r  r1  r2  r6  r7  r5  r8  r  r   r  r  r  r   r  r/  r  r@  r  r  r  r  r  r#  r(  r,  rq  rg  rD  rJ  rI  rS  rV  rY  r]  r  r{  r  r  r  r  r  r  rz  r  r  r  r  r  r  r8  r  r  r  r  r  r7  r8  s   @rV   rE  rE  5  s    "d!_#B 
 FJ	&&$& 7& C	& &'<oBb8  "	B + +! 
 

$	G$
%S$	(
.8):6-.10J
/+7&5
5$:,:8 8 
8 37;; ;  	;
 ; ; 0; 
; (	U
8(B<
	V	V  	V .		V
 R	V  	V %	V 
	V0 ! !,S
j.S`:
 
D%(K(K (K -	(KT'?RD @D W CG +.&'6	, <P '<P 
<P|>S
IG264*pd4 #'(,

 
  	

 
 &
& FJ		'*	6C	 	 #'(,!! !  	!
 ! &!,>  TJ 2TJl*A"A+8A	A
%N:.>85WF

2
4=G~' !4
4
v !y(y(v !,%P	
 8 8 TY.`*_*Au '+ $` 
53n:(
 
E'

 
 H	

 

JJ J H	J
 
JX+'Z<< 9<BY
$
"K*
NP /B]7~     
 

 
 

 X X H rX   rE  c                    ^  \ rS rSrSr S     SU 4S jjjrSS jrSS jrS rS r	S r
SS	 jrSS
 jrSS jrSS jr  SS jrSS jrSS jrSU 4S jjr\SS j5       r\SS j5       r\S S j5       rS rSrU =r$ )!r	  iL  z
A wrapper codegen that generates code for a subgraph. For most of the
methods, we rely on the implementation in the PythonWrapperCodegen. But we
override a few functions to produce cleaner code (like avoiding writing
imports twice in the output code)
c                   > Xl         X l        X0l        [        TU ]  5         U R                  5       nUR                  U l        UR                  U l        UR                  U l        UR                  U l	        g rJ   )
r  r  r  r  r  get_root_graphr  r   r  r  )r
  r  r  r  rootr  s        rV   r  %SubgraphPythonWrapperCodegen.__init__T  sh     +,$8!""$$($=$=!%)%?%?"!//)-)G)G&rX   c                &    U R                   U l        g rJ   )r  r  r	  s    rV   r  1SubgraphPythonWrapperCodegen.set_launcher_fn_namek  s     !% 2 2rX   c                    g rJ   r   r	  s    rV   r  )SubgraphPythonWrapperCodegen.write_headerq  r9  rX   c                    g rJ   r   r  s     rV   r<  2SubgraphPythonWrapperCodegen.add_benchmark_harnesst  r9  rX   c                    g rJ   r   r  s     rV   r  6SubgraphPythonWrapperCodegen.benchmark_compiled_modulew  r9  rX   c                    g rJ   r   r	  s    rV   rl  5SubgraphPythonWrapperCodegen.write_async_compile_waitz  r9  rX   c                6    U R                   R                  5       $ rJ   )r  r  r	  s    rV   r  /SubgraphPythonWrapperCodegen.next_kernel_suffix}  s    ""5577rX   c                    g rJ   r   r  s     rV   r  2SubgraphPythonWrapperCodegen.generate_after_suffix  r[  rX   c                \    U R                   R                  SU R                   S35        SnU$ )Nz
            def z(args):
            r6   )r  r   r  rv  s     rV   rx  >SubgraphPythonWrapperCodegen.write_launcher_fn_call_get_indent  s<    &&' (	

 rX   c                    gr   r   r	  s    rV   r   4SubgraphPythonWrapperCodegen.get_wrapper_call_indent  s    rX   c                    U R                   =n(       a6  UR                  UR                   Vs0 s H  n[        U5      U_M     sn-  nU$ [        R
                  R                  nU$ s  snf rJ   )r  input_nodesr  r   r5   rK   r  )r
  r:  rg  r  s       rV   r]  -SubgraphPythonWrapperCodegen.get_graph_inputs  sm     11191**#,#:#:.#:aA	#:. F
  WW))F.s   A&c                   U R                   =n(       aL  [        UR                  R                  5       5      UR                   Vs/ s H  o"R
                  PM     sn-   nU$ [        R                  R                  nU$ s  snf rJ   )	r  r  r  r  r  r   r5   rK   rc  )r
  r:  r  namess       rV   r{  2SubgraphPythonWrapperCodegen.get_graph_input_names  su    11191..33566?6M6M:6Ml!!6M: E
  GG--E:s   A<c                |    U R                   =n(       a  UR                  nU$ [        R                  R                  nU$ rJ   )r  r  r5   rK   r
  )r
  r:  r  s      rV   rT  .SubgraphPythonWrapperCodegen.get_graph_outputs  s;    11191,,G  gg++GrX   c                   > UR                  5       nU R                  =n(       a  X#R                  ;   a  g [        TU ]  U5        g rJ   )rM   r  r  r  rD  )r
  rG  r   r:  r  s       rV   rD  /SubgraphPythonWrapperCodegen.codegen_allocation  s=     222I2@U@U8U "6*rX   c                8    U R                   R                  5         g rJ   )r  rC  r	  s    rV   rC  5SubgraphPythonWrapperCodegen.write_triton_header_once  s     	446rX   c                8    U R                   R                  5         g rJ   )r  rK  r	  s    rV   rK  =SubgraphPythonWrapperCodegen.write_get_raw_stream_header_once  s     	<<>rX   c                    U n[        U[        5      (       a#  UR                  n[        U[        5      (       a  M#  [        U[        5      (       d   eU$ rJ   )r[   r	  r  rE  )r
  r  s     rV   r  +SubgraphPythonWrapperCodegen.get_root_graph  sK    DH;<<&&D ;<< $ 45555rX   c                    g rJ   r   r	  s    rV   r9  <SubgraphPythonWrapperCodegen.generate_and_run_autotune_block  s    rX   )r   r  r  r  r  r  r  r  rJ   )r  r   r  rE  r  r  rU  r  r  r  )r   zDdict[str, Union[ir.TensorBox, ir.TorchBindObject, sympy.Expr, None]]r  r  r  )r   rE  )r   r   r  r  rQ  r  r  r  r<  r  rl  r  r  rx  r   r]  r{  rT  rD  r)   rC  rK  r  r9  r  r7  r8  s   @rV   r	  r	  L  s     FJ	HH -H C	H H.38		M	+ 7 7 ? ?   rX   r	  )rS   r!  r   r3  )rS   r!  r   r6  )rd   r!  re   r!  )NN)r   r   r   zlist[triton.Config]r   zlist[TritonGrid]r   r  r   r   r   ztuple[str, str]r  )
__future__r   r  r   r  r   r9  rk  rQ  r  r'  rH  r  rS  collections.abcr   	itertoolsr   r   typingr   r   r	   r
   rz   r   r  
torch._opstorch.utils._pytreeutils_pytreera  r   r  torch._dynamo.utilsr   r   r   #torch._inductor.codegen.debug_utilsr   $torch._inductor.codegen.multi_kernelr   %torch._inductor.runtime.runtime_utilsr   torch._library.opaque_objectr   r   torch._loggingr   %torch.fx.experimental.symbolic_shapesr   r   r   r   r   torch.fx.noder   torch.utils._ordered_setr    torch.utils._sympy.singleton_intr   torch.utils._sympy.symbolr   r   r  r!   r"   r#   	codecacher$   r%   r&   r   r'   runtime.hintsr(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   virtualizedr5   rZ  r7   r8   r9   r:   r;   r<   	cpp_utilsr=   custom_extern_kernel_codegenr>   triton_utilsr?   r@   rA   rB   rC   r   rK   rD   rE   r   rF   wrapper_fxirrG   	getLoggerr   logdoprintr  r   r  r   r   r3  r6  rj   r!  rA  rW   r`   rh   rw   r  r   r  r   r   r  r  r  r  rH   rD  rY  rc  rj  rr  r~  r  r  r  r  r  r  r  r  r"  r4  r`  r7  r5  r{  r  r  r  r  r  LinerE  r	  r   rX   rV   <module>r7     sc   "    
     	  	  $ " 6 6     $ $ & E E C A ; R +  . / 9 : ( ( ' ( ' ,       F P P 2%!-) ! u{{C565<<c;NPSST 299l*+
]OT12 >QF S> 	%UZZ
 #
%&2B1CU3PS8_1T(UU
 /3*.k&
k& k& k& ,	k&
 (k& k&\U&p   %6 %6PY Y
 2 2 2 	/k 	/ 	/ ++ + + 	2 	2 	2 1{ 1 1 "@K "@ "@J?; ? 
7K 
7 
7 5+ 5 5< 	({ 	( 	( /[ / /B 5; 5 5* ; ; ;2%
 %
P g,% g, g,T +6, +6 +6\ /( / /& )" ) )0(! (
 #0k #0 #0L 6; 6 6, 4+ 4 48 	5+ 	5 	5 8[ 8 8 
,-T,7 T,nXG#7 GrX   