
    N j(I                   8   % S SK Jr  S SKrS SKrS SKrS SKrS SKrS SKrS SKrS SK	r	S SK
r
S SKrS SKrS SKrS SKrS SKrS SKrS SKrS SKrS SKrS SKrS SKrS SKrS SKrS SKrS SKJrJrJrJrJrJrJ r   S SK!J!r!  S SKJ"r"  S SK	J#r#  S SK$J%r%J&r&J'r'J(r(J)r)J*r*J+r+J,r,J-r-J.r.J/r/J0r0J1r1  S SK2J3r3J4r4J5r5  S S	KJ6r6  S SK7r7S SK8r8S SK9J:s  J;r<  S S
K=J>r>  S SK?J@r@  S SKAJBrB  S SKCJDrD  S SKEJFrF  S SK9JGrGJHrH  SS/rIS SKJJKrKJLrLJMrMJNrN  \-(       ah  S SKJOrOJPrPJQrQ  S SKRJSrS  S SK8JTrTJUrUJVrV  S SKWJXrX  S SKYJZrZ  S SK[J\r\  S SK]J^r^  SSK_J`r`  SSKaJbrb  SSKcJdrd  SSKeJfrf  SSKgJhrhJiriJjrjJkrkJlrlJmrm  SS KnJoro  SS!KpJqrqJrrr  / S"Qrs\0" S#5      rt\R                  GSGS$ j5       rvS S%KwJxrx  S S&KyJzrz  S S'K{J|r|  S S(K}J~r~  S S)KJr  S S*KJr  S S+KJrJrJrJrJr  S S,KJrJr  S S-KJrJr  SS.KJr  SS/KJr  \R                  S0:H  r\GR*                  " \5      r\0" S15      r\\7GR4                  \7GR4                  4   r\+\1\8GR8                  \\8R                  4      r\(       a  S2O\GR>                  " S3S45      rS5S6S7\ 3S8.rS9rS9rS9rS:r\F" \8GRL                  \8GRN                  \8GRP                  \8GRR                  \8GRT                  \8GRV                  \8GRX                  \8GRZ                  \8GR\                  \8GR^                  \8GR`                  \8GRb                  \8GRd                  \8GRf                  \8GRh                  /5      rS;\S<'   S=r\\S-
  -  S :X  a  \S>:  d   S?5       eGSHS@ jrGSISA jr " SB SC\7GRt                  5      r\GRx                  " SDSE9 " SF SG5      5       rGSJGSKSH jjr   GSL         GSMSJ jjr   GSL         GSMSK jjr\R                  GSNSL j5       rGSOSM jrGSPSN jrGSQSO jrGSRSP jr      GSSSQ jrGSTSR jr    GSUSS jrGSVST jrGSWSU jr    GSXSV jrGSYSW jrSX 4     GSZSY jjr        GS[S[ jrGS\GS]S\ jjr  GS^         GS_S] jjr     GS`             GSaS^ jjrGSbS_ jrGScS` jrGSdSa jrGSeSb jrGSfSc jr\4" Sd5      r\0" SeSDSf9r\\'\%\4   \4   r " Sg Sh\,\(\\4   5      rGSgSi jr    GSgSj jr    GShSk jr    GSiSl jr      GSjSm jr      GSkSn jr GSl     GSmSo jjr      GSnSp jrGSoSq jrGSpSr jrGSqSs jrGSrSt jrGSsSu jrGStSv jrGSuSw jrGSvSx jrGSwSy jr\" / SzQ5      r    GSxS{ jrGSyS| jrGSzS} jrS SKrGS{S~ jr/ rSZ\S'   GS|S jrGS{S jr\GR                  GS}S j5       r\GR                     GS~       GSS jj5       r\r\r\rSIS.GSS jjrSIS.       GSS jjr\RD                  " S>5      GSS j5       r " S S\*5      r\GRx                   " S S5      5       r " S S5      Gr  " S SG\ 5      Gr\GR                  GSS j5       Gr " S S5      Gr " S SG\5      Gr\R                  GSGSS jj5       Gr\RD                  GSS j5       Gr\RD                  GSNS j5       GrGSS jGr GSl       GSS jjGr	      GSS jGr
GSS jGrGSS jGrSISISDS.         GSS jjGrSSIS.       GSS jjGrSIS.       GSS jjGrSIS.       GSS jjGr        GSS jGr\RD                  " SS9GSNS j5       Gr\RD                  " SS9GSNS j5       Gr\RD                  " SS9GSNS j5       Gr                  GSS jGrGSS jGr  GS                 GSS jjGrGSS jGr\1\\7GR4                  4   GrS\S'   \R                   GS         GSS jj5       Gr\R                  GSS j5       Gr\R                  GSS j5       Gr\R                  GSS j5       Gr\R                  GSS j5       GrGSS jGrGSS jGr GSS jGr!GSS jGr"GSS jGr#        GSS jGr$    GS               GSS jjGr%GSNS jGr& " S S5      Gr'        GSS jGr(        GSS jGr)GSS jGr*GSS jGr+GSS jGr,        GSS jGr-        GSS jGr.\GR                        GSS j5       Gr/ GSl     GSS jjGr0GSS jGr1GSS jGr2GSS jGr3GSS jGr4GSS jGr5GSS jGr6\GR                  GSS j5       Gr7GSS jGr8\R                  GSS j5       Gr9\R                  GSS j5       Gr:\R                  GSS j5       Gr;GSS jGr<GSS jGr=GSS jGr>GSS jGr?GSNS jGr@GSNS jGrAGSS jGrBGSwS jGrC " S S\GR                  5      GrE          GSS jGrFGSS jGrG    GSS jGrH GSl     GSS jjGrIGSS jGrJ GSl     GSS jjGrKGSS jGrL      GSS jGrM        GSS jGrNS 4           GSS jjGrOS 4           GSS jjGrPGSS jGrQGSS jGrR\GRx                   " S S5      5       GrS\GR                  GSS j5       GrTGSS jGrUGSS jGrVGSNS jGrWGSS jGrXGSS jGrY              GSS jGrZGSS jGr[GSS jGr\GSS jGr]GSS jGr^        GSS jGr_GSS jGr`        GSS jGraGSS jGrb GSl       GSS jjGrc      GSS jGrdGSS jGre      GSS jGrfGSNGS  jGrgGSGS jGrhGSGSGSGSGSGSGSGS.GriG\iGR                  5        V Vs0 s H  u  pX_M	     snn Grk\GR                  " GS	5      GrmGSGS
 jGrnGSGS jGroGSGS jGrpGSGS jGrq\R                  GSGS j5       Grr\GRx                   " GS GS5      5       Grs0 GrtGS\GS'           GSGS jGru\F" 5       GrvGS\GS'   GSGS jGrwGSlGSGS jjGrxGSGS jGry\0" GS5      Grz\0" GS5      Gr{ " GS GS\G\zG\{4   5      Gr|\3" SDGS9GSlSDSE.GSGS jjj5       Gr}GSGS jGr~ " GS  GS!\GR                  5      Gr\R                  GSGS" j5       GrGSNGS# jGrGSGS$ jGrGSGS% jGrGSGS& jGrGSGS' jGrGSGGS( jGrGSGS) jGrGSNGS* jGrGSGS+ jGrGS,GrGSGS- jGrGSGS. jGrGSGS/ jGr  GS         GSGS0 jjGrGSGS1 jGrGSGS2 jGrGSNGS3 jGrGSGS4 jGrGSGS5 jGr\GRx                  " SDSE9 " GS6 GS75      5       Gr\GS8\%4   Gr\G\G\/G\4   Gr " GS9 GS:5      GrG\" 5       GrGSGS; jGrGSGS< jGrGSGS= jGrGSGS> jGrGSGS? jGr\F" / GS@Q5      GrGSGSA jGr\"GSGSB j5       GrGSGSC jGr              GSGSD jGr      GSGSE jGr GS       GSGSF jjGrgs  snn f (      )annotationsN)Callable
Collection	GeneratorIteratorMappingMutableMapping
MutableSet)datetime)	lru_cache)StringIO)AnycastConcatenateGenericLiteral
NamedTupleOptionalProtocolTYPE_CHECKING	TypeAlias	TypeGuardTypeVarUnion)dataclass_transform	ParamSpecSelf)mock)datasheet_tops)DeviceProperties)_needs_inductor_compile)dtype_abbrs)
OrderedSet)tree_flattentree_map_only!activation_quantization_aten_passinductor_autotune_lookup_table)free_symbolsfree_unbacked_symbolsIterateExprsShapeEnv)IterableSequence
ValuesView)Path)SymBoolSymFloatSymInt)ELEMENTWISE_TYPE_PROMOTION_KIND)GraphModule)Node)ScalingType   )WorkspaceArgPythonWrapperCodegen)DepGraphLowering)BufferExternKernelIRNodeLayout	OperationReinterpretViewCompiledFxGraph)BaseSchedulerNodeSchedulerBuffer)cudampsxpumtiaTc                     [          V s/ s H*  n [        [        U 5      R                  5       (       d  M(  U PM,     nn [	        U5      S::  d   e[	        U5      S:X  a  SnU$ UR                  5       nU$ s  sn f )Nr7   r   rH   )	GPU_TYPESgetattrtorchis_availablelenpop)x
avail_gpusgpu_types      f/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/torch/_inductor/utils.pyget_gpu_typerX   j   sh    &KY'%*;*H*H*J!YJKz?aZA-vHO 4>>>3CHO Ls
   'A2A2)get_interface_for_device)detect_fake_mode)
DeviceType)	EventList)GraphTransformObserver)	ShapeProp)CeilDivCleanDivFloorDivIdentityModularIndexing)make_symbolSymT)bound_sympyValueRangesconfig)ceildivwin32_TspvTORCHINDUCTOR_XPU_KERNEL_FORMATzebinz.cubinz.hsaco.)rH   hiprJ         zOrderedSet[torch.dtype]_TMA_SUPPORTED_DTYPES@      zmust be power of 2c                *    U [         -   S-
  [         * -  $ )z/Round up to the nearest multiple of ALIGN_BYTESr7   )ALIGN_BYTES)nbytess    rW   _alignrz      s    [ 1$44    c                   [        U [        R                  [        R                  45      (       a#  [	        [        [        U R                  5      5      $ [        U [        5      =(       d"    [        R                  " U [        5      [        :H  $ )z:v can be statically proven to be a multiple of ALIGN_BYTES)
isinstancesympyAddMaxallmap_is_alignedargsaligngcdrx   )vs    rW   r   r      sT    !eii+,,3{AFF+,,aK599Q#<#KKr{   c                  4    \ rS rSrSrSrSr\SS j5       rSr	g)	r      z<Symbolically round up to the nearest multiple of ALIGN_BYTESr7   Tc                    [        U[        [        R                  45      (       a  [	        [        U5      5      $ [        U5      (       a  U$ g N)r}   intr~   Integerrz   r   )clsvalues     rW   eval
align.eval   s<    ec5==122#e*%%uL r{    N)r   
sympy.ExprreturnzOptional[sympy.Expr])
__name__
__module____qualname____firstlineno____doc__nargs
is_integerclassmethodr   __static_attributes__r   r{   rW   r   r      s!    FEJ r{   r   T)frozenc                  B    \ rS rSr% SrS\S'   S\S'   S\S'   S\S	'   S
rg)GraphPartitionMap   zH
Mapping from the partition info (e.g., input/output) to the graph info
r   idzlist[Optional[int]]input_index_mappingoutput_index_mapping	list[str]constant_namesr   Nr   r   r   r   r   __annotations__r   r   r{   rW   r   r      s$    
 	G -,-- r{   r   c           
        U " 5         [         R                  R                  5         [         R                  " [	        S5      [         R
                  SS9n[         R                  R                  SS9n[         R                  R                  SS9nUR                  5         [        S5       H  nUR                  5         U " 5         M     UR                  5         [         R                  R                  5         UR                  U5      S-  n[        S[	        X-  5      5      n[        S[	        X'-  5      5      n	[        U5       H
  nU " 5         M     [        U	5       Vs/ s H   n[         R                  R                  SS9PM"     nn[        U	5       Vs/ s H   n[         R                  R                  SS9PM"     nn[         R                  R                  [         R                  R                  R                  /S9 n
[         R                  R                  5         [        U	5       Hp  nUR                  5         XK   R                  5         [         R                  R                   R                  S	5         U " 5         S
S
S
5        X[   R                  5         Mr     [         R                  R                  5         [         R"                  " [%        XE5       VVs/ s H  u  pUR                  U5      PM     snn5      nS
S
S
5        [         R&                  " W5      R)                  5       n[*        R-                  S5        [*        R-                  W
R/                  5       R1                  SSS95        [3        U
R5                  5        Vs/ s HI  nUR6                  [8        R                  :X  d  M#  [:        R<                  " SUR>                  5      c  MG  UPMK     sn5      nU(       a#  U[@        R&                  " S U 5       5      S-  -  n[*        R-                  SU5        U$ s  snf s  snf ! , (       d  f       GN= fs  snnf ! , (       d  f       GNK= fs  snf ):  
Returns benchmark results by examining torch profiler events.
This could be more accurate as it doesn't count CPU side overhead.
However, this also requires manually excluding irrelevant event, e.g.
vectorized_elementwise_kernel which is used to fill L2 cache,
various CUDA events, etc, so could also be fragile.
    ArH   dtypedeviceTenable_timing   r7   
activitiesRunCudaModuleN
raw eventsself_device_time_totalsort_by	row_limitzfused_abs_max_\dc              3  8   #    U  H  oR                   v   M     g 7fr   device_time_total.0events     rW   	<genexpr>fp8_bench.<locals>.<genexpr>(  s     Q33        @@profiling results: %s ms)!rP   rH   synchronizeemptyr   float16Eventrecordrangezero_elapsed_timemaxprofilerprofileProfilerActivityCUDAnvtxtensorzipmeanitemlogdebugkey_averagestabler\   eventsdevice_typer[   rematchname
statistics)fnwarmuprepcachestart_event	end_event_estimate_msn_warmupn_repeatpisetimesresr   filtered_eventss                     rW   	fp8_benchr      s>    D	JJKKJu}}VLE **"""6K

  t 4I1X
  	JJ**959K 1c&./0H1c#+,-H 8_
  BGxQA5::##$#7KQ?DXO!!!!5IO			NN++00
 
  
 


 xAKKMN!!#&&7 8L! ! 	

 +.{+FG+F41Q^^A+FG

" **U

 
 
"CIIlIIann$$-EQS$TU 	
#!!Z__4  HH0%**=	 #	
	O OOQQQ	

 II(#.JO RO 87
 H
 
*	
sP   'P'P!;A8P>3P&;AP>P82P>"Q) QQ&
P50P>>
QFc                4    SSK Jn  U" [        5      " XX#5      $ )Nr   )may_distort_benchmarking_result)$torch._inductor.runtime.benchmarkingr   _do_bench_using_profiling)r   r   r   is_vetted_benchmarkingr   s        rW   do_bench_using_profilingr   0  s     " U*+DE
C r{   c           
     (   U(       d  SSK Jn  U" 5         [        5       nUR                  5       n[	        U5      nU " 5         UR                  5         [        R                  " [        S5      [        R                  US9nUR                  SS9n	UR                  SS9n
U	R                  5         [        S5       H  nUR                  5         U " 5         M     U
R                  5         UR                  5         U	R                  U
5      S-  n[        S[        X-  5      5      n[        S[        X,-  5      5      n[        U5       H
  nU " 5         M     UR                  5         [        R                  R!                  [#        [        R                  R$                  U5      /S	9 n[        U5       H  nUR                  5         U " 5         M     UR                  5         S
S
S
5        [&        R)                  S5        [&        R)                  WR+                  5       R-                  SSS95        [/        UR1                  5        Vs/ s H7  nUR2                  [#        [4        U5      :X  d  M#  UR6                  S:w  d  M5  UPM9     sn5      n[9        U5      U-  S:w  a  [;        SU[9        U5      U5      e[9        U5      U-  n[/        [=        U5       VVs/ s H  u  nnUU-  S:w  d  M  UPM     snn5      nUR?                  5         UR+                  5       n[&        R)                  S5        [&        R)                  UR-                  SS95        [A        S U 5       5      S-  U-  n[&        R)                  SU5        U$ ! , (       d  f       GN= fs  snf s  snnf )r   r   )may_ban_benchmarkingr   r   Tr   r   r7   r   Nr   r   r   r   zContext SynczWFailed to divide all profiling events into #repeat groups. #%s events: %d, #repeats: %szprofiling time breakdown)r   c              3  8   #    U  H  oR                   v   M     g 7fr   r   r   s     rW   r   ,_do_bench_using_profiling.<locals>.<genexpr>  s     A=%%%=r   r   r   )!r   r   rX   upperrY   r   rP   r   r   r   r   r   r   r   r   r   r   rO   r   r   r   r   r   r\   r   r   r[   r   rR   RuntimeError	enumerate_build_treesum)r   r   r   r   r   r   device_type_upperdevice_interfacer   r   r   r   r   r   r   r   r   r   num_event_per_groupr   actual_eventsr   s                         rW   r   r   H  s     "M.K#))+/<D  "KKJuyyME #((t(<K &&T&:I1X
    "**959K 1c&./0H1c#+,-H 8_
    "			ENN335FG
 
  
 
xAKKMD	 ! 	$$&
 IIlIIann$$-EQS$TU 	
#  GJ8I$JJ  

n, #	
O ?h&!++ 
 	
 o.9 &o6	
65&&!+ 6	
M !..0MII()IIm!!B!/0
A=A
AF
JX
UCII(#.Jc
 
$	
"	
s*    :M7?"N	%N	7N	N
"N
7
Nc                     SSK Jn   [        R                  R	                  SS5        U S L=(       a%    [        [        [        R                  SS 5      S5      $ ! [         a     g[         a  nS[        U5      ;   d   e S nAgS nAff = f)	Nr   )	roi_alignztorchvision::nmsMetatorchvisionr	  Fztorchvision::nms does not exist)torchvision.opsr	  rP   _C%_dispatch_has_kernel_for_dispatch_keyhasattrrO   opsImportErrorr   str)r	  r   s     rW   has_torchvision_roi_alignr    s|    -667I6R$ 
EII}d3[*
 	
   0CF:::s   AA 
B$	B-BBc                t   U c   [         R                  " S5      R                  $ [        U [        5      (       a  [         R                  " U 5      n U R
                  S;  aY  U R                  cL  [        U R
                  5      n[         R                  " U R
                  UR                  R                  5       S9$ U $ )Ng        )cpumeta)index)
rP   r   r   r}   r  typer  rY   Workercurrent_devicer   r  s     rW   decode_devicer    s    ~||C '''&#f%{{/)fll.B3FKK@||FKK/?/F/F/U/U/WXXMr{   c                ~    [         R                  " [        R                  U [        R
                  R                  5      $ r   )	functoolsreduceoperatormulr~   SOne)its    rW   sympy_productr%    s#    HLL"eggkk::r{   c           	         [        U 5      [        U5      :X  d   e[        R                  " [        S [	        X5       5       5      5      $ )Nc              3  .   #    U  H  u  pX-  v   M     g 7fr   r   )r   abs      rW   r   sympy_dot.<locals>.<genexpr>  s     >odaAEos   )rR   r~   expandr  r   )seq1seq2s     rW   	sympy_dotr.    s6    t9D	!!!<<>c$o>>??r{   c                b    U  Vs0 s H  n[        U5      U_M     snR                  5       $ s  snf r   )r   values)r$  rT   s     rW   uniquer1    s+     !bBqE1Hb!((**!s   ,c           
        [        U [        R                  5      (       d  [        U[        R                  5      (       a4  [        [        R                  " U 5      [        R                  " U5      5      $ [        U [
        5      (       a  [        U[
        5      (       d$   U  S[        U 5       SU S[        U5       35       e[        X5      $ )Nz: , )r}   r~   Exprr_   sympifyr   r  runtime_ceildiv)numberdenoms     rW   rj   rj     s     &%**%%E5::)F)Fu}}V,emmE.BCC fc""z%'='= ("T&\N"UG2d5k];= 6))r{   c                t   U c  g[        U 5      R                  S5      S   n0 SS_SS_SS	_S
S_SS_SS_SS	_SS_SS_SS_SS_SS_SS_SS_SS_SS _S!S"_SS#S$S%S&.EnUR                  [        UR	                  5       5       Vs0 s H  o3U_M     sn5        [        U [         5      (       a  U $ S'X!    3$ s  snf )(Nz*i8rp   r   booli1
float8e4nvfp8e4nvfloat8e5fp8e5float8e4b15fp8e4b15float8e4b15x4
fp8e4b15x4float8_e4m3fnfloat8_e5m2float8_e8m0fnuu8float4_e2m1fn_x2r   fp16bfloat16bf16float32fp32float64fp64int8i8int16i16int32i32int64i64u16u32u64)uint8uint16uint32uint64*)r  splitupdatelistr0  r}   )key	dtype_strtysr   s       rW   _type_ofrf    sW   
 {Cs#B'Ii 	G 	z	
 	 	 	w 	$ 	D 	6 	F 	6 	6  	!" 	#$ 	%& 	'( /C4 JJd3::<0101012S#&&3@a/?,@@ 2s   B5c                Z    U  Vs/ s H  n[         R                  " U5      PM     sn$ s  snf )z
Gets the shape and stride of a tensor. For non-symbolic tensors, this is
trivial. But for symbolic tensors, we need to map from SymIntNode into
sympy.Expr.
)r~   r5  lstr   s     rW   convert_shape_to_inductorrj    s%     '**cEMM!c***s    (c                p    [        U [        R                  5      (       a  U R                  R                  $ U $ )z
Convert SymInt to sympy.Expr, leave int as is.

Unlike sympy.sympify() which converts int to sympy.Integer,
this function preserves int as int and only converts SymInt to Expr.
)r}   rP   r2   nodeexprvals    rW   convert_symint_to_exprrp    s(     #u||$$xx}}Jr{   c                    SSK Jn  [        U [        5      (       a  U $ [        U [        R
                  5      (       a  [        U 5      $ UR                  R                  R                  R                  U SS9$ )zD
Like convert_shape_to_symint, but operates on a single expression.
r7   VN)hint)
virtualizedrs  r}   r   r~   r   graphsizevars	shape_envcreate_symintnode)r   rs  s     rW   convert_to_symintrz    sk      a 	

 !U]]++ F	 !!++==ad=Kr{   c                D    U  Vs/ s H  n[        U5      PM     sn$ s  snf )zn
Takes a list of shapes from Inductor and converts them into symints (or just
ints if all shapes are static).
)rz  rh  s     rW   convert_shape_to_symintr|  .  s"     +..#Qa #...s   c                N    [        S U R                  R                   5       5      $ )z%
Does this op overload have aliasing
c              3  <   #    U  H  oR                   S Lv   M     g 7fr   )
alias_infor   r(  s     rW   r   is_view.<locals>.<genexpr><  s     F1EA||4'1Es   )any_schema	argumentsops    rW   is_viewr  8  s     F1E1EFFFr{   c                    gNFr   )r   s    rW   <lambda>r  A  s    r{   c                  ^ U R                   S:w  a  g[        U R                  [        R                  R
                  5      (       d  U R                  [        R                  L d  g[        [        R                  R
                  U R                  5      nU[        R                  L d  [        U5      (       a  [        U4S jU R                   5       5      $ [        R                  R                  UR                  ;   =(       d    T" U5      $ )z
Do all uses of this op have torch.Tag.pointwise or return True for optional `is_pointwise_fn`

Uses in views ops will follow the views uses
call_functionFc              3  <   >#    U  H  n[        UT5      v   M     g 7fr   )is_pointwise_use)r   uis_pointwise_fns     rW   r   #is_pointwise_use.<locals>.<genexpr>R  s     KA#A77s   )r  r}   targetrP   _ops
OpOverloadr   getitemr   r  r   usersTag	pointwisetags)user  r  s    ` rW   r  r  ?  s     vv 3::uzz4455xGWGW9W%**''4F!!!WV__KKKK99&++-H1HHr{   	list[Any]c           	       ^^ [         R                  R                  5       m/ mSUU4S jjnTR                  " U /[	        [         R
                  X1U45      Q76 n[        U R                  R                  5      S:X  a3  [        U R                  R                  S   R                  5      S:X  a  U4nTR                  U5        [         R                  R                  0 T5      nUT4$ )Nc                `   > TR                  U 5        TR                  S[        T5       35      $ )Narg)appendplaceholderrR   )r  g
graph_argss    rW   add_tensor_arg)gen_gm_and_inputs.<locals>.add_tensor_arg]  s,    #}}s3z?"3455r{   r7   r   Tensor)r  torch.Tensorr   r5   )rP   fxGraphr  r%   r  rR   r  returnsr  r  outputr4   )r  r   kwargsr  rl  gmr  r  s         @@rW   gen_gm_and_inputsr  W  s     	A%'J6 6 ??u||^F^LD 	FNN""#q(&&q)../8;wHHTN			b!	$Bz>r{   c                t    U S:X  a  g [        U 5      nUR                  5       (       a  UR                  5         g g Nr  )rY   rQ   r   r  s     rW   r   r   o  s7    /7$$&&$$& 'r{   c                    [        U5        [        R                  " S5        [        R                  " 5       n[        U5       H  nU " U6 n[        U5        M     [        R                  " 5       nWc   eXt-
  $ )Ni9  )r   rP   manual_seedtimeperf_counterr   )modelexample_inputsr   r   t0r   resultt1s           rW   timedr  w  sk     	d				B5\'F  
			B7Nr{   c                    [         R                  " [        U5       Vs/ s H  n[        XX%5      PM     sn5      n[         R                  " U5      U-  n[        X-  S 5        UR                  5       $ s  snf )Nz.6f)rP   r   r   r  medianprintr   )	r  r  r   repeatbaseliner   r   timingstooks	            rW   print_performancer    se     ll>CFmLmue	4mLG << 5(D	T_S!#99;	 	Ms   A3c                F   ^ [        X5      " 5       m[        XU4S j5        g)zKReplace obj.method() with a new method that returns a precomputed constant.c                    > T $ r   r   )r  s   rW   r  #precompute_method.<locals>.<lambda>  s    r{   N)rO   setattr)objmethodr  s     @rW   precompute_methodr    s    S!#FC(r{   c                ,    U H  n[        X5        M     g)zFReplace methods with new methods that returns a precomputed constants.N)r  )r  methodsr  s      rW   precompute_methodsr    s    #& r{   c                8    [        X:  5      [        X:  5      -
  $ r   )r   r(  r)  s     rW   cmpr    s    qu:AE
""r{   c                    [        U [        5      (       a  U /U-  $ [        U 5      S:X  a  [        U 5      " U S   /5      U-  $ U $ )Nr7   r   )r}   r   rR   r  )rT   sizes     rW   pad_listliker    sD    !SsTz
1v{Aw!v%%Hr{   c                @    [        U 5      S:X  a  / $ SS jn[        XS9$ )Nr   c                    [        U [        5      (       a  U $ SSKJn  [        X5      (       d   eU R	                  5       $ )Nr7   )rF   )r}   r  	schedulerrF   get_name)elemrF   s     rW   	sort_functuple_sorted.<locals>.sort_func  s4    dC  K0$2222}}r{   rc  )r  rl   r   r  )rR   sorted)rT   r  s     rW   tuple_sortedr    s$    
1v{	 !##r{   PRV)	covariantc                  2    \ rS rSr\SS j5       rSS jrSrg)CachedMethodi  c                    g r   r   )r   s    rW   clear_cacheCachedMethod.clear_cache  s    ),r{   c                    g r   r   selfr   r  s      rW   __call__CachedMethod.__call__  s    r{   r   N)r   r   r   None)r   P.argsr  P.kwargsr   r  )r   r   r   r   staticmethodr  r  r   r   r{   rW   r  r    s    , ,Dr{   r  c           	        ^ U R                   nSU S3mSU 0n[        SU ST ST S3R                  5       U5        [        R                  " U 5      " X! S3   5      nS
U4S	 jjnXCl        U$ )N___cacher   z        def zC_cache_on_self(self):
            try:
                return self.zy
            except AttributeError:
                pass
            rv = fn(self)
            object.__setattr__(self, "z%", rv)
            return rv
        _cache_on_selfc                B   > [        U T5      (       a  [        U T5        g g r   r  delattrr  rc  s    rW   r  "cache_on_self.<locals>.clear_cache  s    4D# r{   r  r   r   r  r   execlstripr  wrapsr  )r   r   ctxwrapperr  rc  s        @rW   cache_on_selfr    s    ;;DtfF
C *CF  E "' (+e ,			 FH oob!#n&=">?G &Nr{   c                    [        U 5      $ )zU
Variant of cache_on_self for properties. The only difference is the type signature.
)r  )r   s    rW   cache_property_on_selfr    s     r{   c                    ^      SU 4S jjnU$ )Nc           	        >^ ST SU R                    S3mSU 0n[        ST ST ST S3R                  5       U5        [        R                  " U 5      " US	   5      nSU4S
 jjnX2l        U$ )Nr  r   r  r   z            def inner(self: Any, *args: P.args, **kwargs: P.kwargs) -> RV:
                args_kwargs = (args, tuple(sorted(kwargs.items())))

                if not hasattr(self, "z2"):
                    object.__setattr__(self, "z%", {})

                cache = self.z

                try:
                    return cache[args_kwargs]
                except KeyError:
                    pass

                rv = fn(self, *args, **kwargs)

                cache[args_kwargs] = rv
                return rv
            innerc                B   > [        U T5      (       a  [        U T5        g g r   r  r  s    rW   r  <cache_on_self_and_args.<locals>.wrapper.<locals>.clear_cache  s    tS!!c" "r{   r  r  )r   r  r  r  rc  
class_names       @rW   r  'cache_on_self_and_args.<locals>.wrapper  s     :,a}F3 Rj' (+e ,//2e 4!U #$ )	
, #CL1	# (r{   )r   FN_TYPE[P, RV]r   r  r   )r  r  s   ` rW   cache_on_self_and_argsr    s     
$$	$L Nr{   c           
        SSK Jn  [        U [        5      (       ay  [        R
                  " [        R                  U  Vs/ s H?  n[        US5      (       d  M  UR                  (       d  M)  UR                  R                  PMA     sn[        5       5      $ [        XR                  5      (       a  U R                  $ [        5       $ s  snf )Nr7   irrl  ) r  r}   rb  r  r  r   or_r  rl  originsr#   r?   )node_scheduler  rl  s      rW   aggregate_originsr  $  s     -&&LL * *D4( "-1YY "		!!) L	
 		
 
M??	3	3$$$|s   C
C
+C
c                `   [        U 5      nUS:X  ag  S nU Vs/ s HA  nUR                  S:X  d  M  SUR                  ;   d  M'  UR                  S   c  M9  U" U5      PMC     nn[        [	        U5      5      nGOUS:X  a  / nU H  nUR                  S:X  d  M  S nSnSUR                  ;   a  UR                  S   S   nO$SUR                  ;   a  UR                  S   S   nS	nU(       d  Mi  [        US
   [        5      (       a  UR                  US
   U-   5        M  UR                  US
   R                  U-   5        M     [        [	        U5      5      nO:US:X  a.  U Vs/ s H   oDR                  S:X  d  M  UR                  PM"     nnO[        eSR                  S/U-   5      $ s  snf s  snf )Noriginal_atenc                .   U R                   S   nSn[        U[        R                  R                  5      (       a  UR
                  R                  nU$ [        U[        R                  R                  5      (       a  [        UR                  5       5      nU$ )Nr  r  )
r  r}   rP   r  r  _overloadpacketr   HigherOrderOperatorr  r   )originr  rc  s      rW   get_origin_meta_str2get_fused_kernel_name.<locals>.get_origin_meta_strA  su    "KK8MC-)>)>??#33<< J M5::+I+IJJ-,,./Jr{   r  rP   r  source_fn_stackr   fwd_source_fn_stackbackwardr7   inductor_noder   fused)r  r  r  r  r#   r}   r  r  r   r   NotImplementedErrorjoin)r  descriptive_namesall_originsr  r  sources	source_fnsuffixs           rW   get_fused_kernel_namer%  :  s    $M2KO+	 &
%yyO+ (  6;;. ( O,	 ('% 	 
 G,-	g	%!FyyO+ 	$3 &,= >r BI*fkk9 &,A B2 FI'F ilC00NN9Q<&#89NN9Q<#8#86#AB "" G,-	o	-&1
&1FYY/5QKFKKk 	 
 "!88WI'((G
<
s"   F&F&F&F&!F+8F+c                  ^^ ^! [        U 5      nU Vs/ s H  o3R                  S:X  d  M  UPM     nn[        R                  " [        5      n[        R                  " [        5      nSm U(       a  [        S U 5       5      n[        U5      S:X  ac  US   R                  m [        T S5      (       d0  [        T R                  5       VV	s0 s H  u  pX_M	     n
nn	U
T l        UR                  U 4S jS9  U GHo  nS	UR                  ;   a  UR                  S	   b  UR                  S	   nSn[        U[        R                   R"                  5      (       a  [%        UR&                  5      nOB[        U[        R                   R(                  5      (       a  [%        UR+                  5       5      nU(       a  Xm   R-                  UR*                  5        S
UR                  ;   a<  UR                  S
   S   R*                  nX]   R-                  UR*                  5        GM&  UR                  R/                  S5      S:X  d  GMH  X[R*                     R-                  UR*                  5        GMr     T b  SOSnUR0                   SU SSR3                  UR5                  5       5       SSR3                  UR5                  5       5       S3nUR0                   S3/n[7        UR9                  5       5       HA  u  nnUR-                  UR0                   SU SSR3                  [7        U5      5       35        MC     T Gb  SSKJm  UR-                  UR0                   S35        [        5       n/ n[        U TR>                  5      (       Gd  SSK J!n        S)U4S jjnS*S jm!S+U!4S jjnU  GH  n	[        U	S5      (       a  U	RD                  c  M$  [        U	RD                  S5      (       a  U	RD                  RF                  b  U	RD                  RF                   H  nUR*                  U;   a  M  URI                  UR*                  5        UR                  RK                  UR*                  5      nUc  MZ  U" UUR*                  5      u  nnUR-                  UR0                   SU S U" U5       S!U S35        M     [        U	RD                  S"5      (       d  GM+  U	RD                  RL                  c  GME  U	RD                  RL                   HW  nUR                  RK                  UR*                  5      nUc  M-  U" UUR*                  5      u  nnUR-                  S#U-   5        MY     GM     U H0  nUR-                  UR0                   SURO                  S$S%9 35        M2     UR-                  UR0                   S&S'R3                  U5       35        US(R3                  U5      4$ s  snf s  sn	nf ),a  
Retrieves metadata information for a kernel.
Args:
    node_schedule (Union[Sequence[BaseSchedulerNode], ExternKernel]):
        Either a sequence of BaseSchedulerNode objects or an ExternKernel instance.
    wrapper (PythonWrapperCodegen):
        An instance of PythonWrapperCodegen, used to define the code comment format.
Returns:
    tuple[str, str]:
        A tuple containing two strings:
            - The first string represents the kernel's metadata.
            - The second string represent the kernel's detailed metadata.
r  Nc              3  8   #    U  H  oR                   v   M     g 7fr   )rv  )r   ns     rW   r   &get_kernel_metadata.<locals>.<genexpr>  s     "CNq77Nr   r7   r   )_inductor_kernel_metadata_node_to_idx_mapc                "   > TR                   U    $ r   )r*  )r(  single_graphs    rW   r  %get_kernel_metadata.<locals>.<lambda>  s    lTTUVWr{   r  r  	from_nodepartitioner_tagis_backwardzTopologically SortedUnsorted z Source Nodes: [r3  z], Original ATen: []z" Source node to ATen node mapping:z   z => r
  z Graph fragment:rr  c                R  > [        U TR                  5      (       aF  [        U R                  TR                  5      (       a!  U R                  R                  R                  nOU R                  nUc  UnOUR
                  n U R                  5       nX44$ ! [         a    S n X44$ f = fr   )r}   	TensorBoxdata
StorageBoxorigin_noder   
get_layoutr  )bufferrw_namer8  r   layoutr  s        rW   get_buffer_info,get_kernel_metadata.<locals>.get_buffer_info  s     fbll33
KK9 9 #)++"2"2">">K"("4"4K&"D&++D"#..0F |# + "!F|#"s   B B&%B&c           	     j    SSR                  U  Vs/ s H  n[        U5      PM     sn5       S3$ s  snf )N[r3  r3  )r  r  )shaperT   s     rW   stringify_shape,get_kernel_metadata.<locals>.stringify_shape  s1    499e%<ec!fe%<=>a@@%<s   0
c                   > U c  gT" U R                   5       nT" U R                  5       nU R                   nS[        U R                      U U U S3$ )Nr  ")r  strider   r"   r   )r<  shape_annotationstride_annotationdevice_annotationrB  s       rW   stringfy_layout,get_kernel_metadata.<locals>.stringfy_layout  sl    >&5fkk&B%C '6v}}'E&F!'-}}o! FLL123C2D()*;)<A?r{   read_writesreadsz   %z
 : Tensor z = PlaceHolder[target=writes%T)include_tensor_metadataz
   return ,
)r:  z2Union[ir.TensorBox, ir.Buffer, ir.TorchBindObject]r;  r  r   ztuple[str, ir.Layout | None])rA  zIterable[int]r   r  )r<  zir.Layout | Noner   r  )(r  r  collectionsdefaultdictrb  r#   rR   rv  r  r  nodesr*  sortr  r}   rP   r  r  r  r  r  r   r  getcommentr  keysr  itemsr  r  r?   ru  rs  rL  rM  addtry_get_bufferrN  format_node)"r  r  r!  r  inductor_nodesfrom_node_dictoriginal_aten_dictunique_graphsidxr(  node_to_idx_maprl  r  rc  sort_strmetadatadetailed_metadataoriginal_noderU  	all_reads
all_writesrs  r=  rJ  rr:  
input_namer<  woutput_namer   r  r,  rB  s"                                  @@@rW   get_kernel_metadatarn  q  s   $ $M2K+6W;)):Vf;NW ,,T2N$006
 L""CN"CC}")!,22L<)TUU8A,BTBT8U"V8Ufc168U"VIXFW    dii'DIIo,F,R IIo6MC-)>)>??-778M5::+I+IJJ-,,./"'..tyy9$))#))K(+00C&&tyy1YY]],->99%,,TYY7   *6)A%zH??
1XJ&6tyyATATAV7W6X Y99%7%<%<%>?@	C  $OO,,NOP &~';';'= >u  s=/diiu6N5OP	
 !?   GOO#44D!EF%/\	 "
-99&$J$UX$-$(A
 #q-00AMM4I1=='22q}}7J7J7V]]0066Y.$!aff-!"!7!7!?!>$-<VQVV-L*
F)00&/tJ<z.v677Mj\YZ\ 1 AMM844,,8]]11!"!7!7!?!>$)8)HQ"))#*;< 2- #< #D$$??#3t'7'7PT'7'U&VW #
 	  GOO#4Jsxx
?S>T!UVTYY0111I X #Ws   WWWc                   [        U 5      n [        U 5      nU (       ak  U R                  5       nUR                   HB  nU(       a  U" U5      (       a  M  XB;  d  M   UR	                  U5        U R                  U5        MD     U (       a  Mk  U$ )zJReturns the set of nodes whose values depend on those within initial_queue)rb  r#   rS   r  r[  r  )initial_queueskip_filterdominated_setrl  users        rW   dominated_nodesrt    sx    
 'M}-M
  "JJD{400(!!$'$$T*  - r{   c                Z  ^^	 SSK Jm  SUU	4S jjm	[        U5      u  p#U Vs/ s H  nT	" U5      (       d  M  UR                  PM      nn[        U 5      u  pcU Vs/ s H  nT	" U5      (       d  M  UR                  PM      nn[	        [
        R                  " / UQUQ76 5      $ s  snf s  snf )Nr7   r
  c                l  > [        U TR                  5      (       a  T" U R                  5      $ [        U TR                  5      (       a  T" U R                  5      $ [        U TR                  5      =(       a=    [        U TR
                  TR                  TR                  TR                  45      (       + $ r   )	r}   r5  r6  r7  r@   ComputedBufferInputsKernelInputBufferTemplateBuffer)r(  r  is_unrealized_nodes    rW   r{  *gather_origins.<locals>.is_unrealized_node$  s    a&&%aff--a''%aff--!RYY' 

!!!!	1
 -
 	
r{   )r(  r@   r   r:  )r  r  r$   r  r#   	itertoolschain)
r   r  kwargs_flattenr   ro  kwargs_originsargs_flattenargs_originsr  r{  s
           @@rW   gather_originsr    s     
 
" %V,N-;W^c?QRU?Vkckk^NW"4(OL+7S<C;Mc;RKCKK<LSiooE|EnEFF XSs   B#B#B(0B(c                X   ^^^^ SS jmSUU4S jjmSUU4S jjmSU4S jjmT" U 5      $ )z
Normal sympy str is very slow, this is a lot faster.  The result are
somewhat worse, as it doesn't do as much simplification.  So don't
use this for final codegen.
c                    [        U [        R                  5      =(       a1    [        U R                  5      S:H  =(       a    U R                  S   S:H  $ )N   r   r   )r}   r~   MulrR   r   )rm  s    rW   is_neg_leadsympy_str.<locals>.is_neg_leadC  s:    tUYY'VC		Na,?VDIIaLTVDV	
r{   c                v  > [        U [        R                  5      (       a  [        U R                  5      S:X  aT  T" U R                  S   5      (       a:  T" U R                  S   5       ST" U R                  S   R                  S   5       3$ SR                  [        TU R                  5      5      $ T" U 5      $ )Nr  r7   r   z - z + )r}   r~   r   rR   r   r  r   )rm  r  sympy_str_muls    rW   sympy_str_add sympy_str.<locals>.sympy_str_addH  s    dEII&& 499~"{499Q<'@'@'		!56c-		RSHYHYZ[H\:]9^__zz#mTYY"?@@ &&r{   c                   > [        U [        R                  5      (       aJ  T" U 5      (       a  ST" U R                  S   5       3$ SR	                  [        TU R                  5      5      $ T" U 5      $ )N-r7   z * )r}   r~   r  r   r  r   )rm  r  sympy_str_atoms    rW   r   sympy_str.<locals>.sympy_str_mulS  sa    dEII&&4   >$))A,7899zz#ndii"@AA!$''r{   c                  > [        U [        R                  5      (       a  U R                  $ [        U [        R                  [        R
                  45      (       a  ST" U 5       S3$ [        U [        [        [        [        45      (       aC  U R                  R                   SSR                  [        [        U R                  5      5       S3$ [!        U 5      $ )N()r3  )r}   r~   Symbolr   r   r  rc   r`   ra   rb   funcr   r  r   	sympy_strr   r  )rm  r  s    rW   r  !sympy_str.<locals>.sympy_str_atom^  s    dELL))99uyy%))455}T*+1--(HMNNii(()499SDII5N+O*PPQRRt9r{   )rm  r   r   r:  rm  r   r   r  r   )rm  r  r  r  r  s    @@@@rW   r  r  <  s.    

	' 	'	( 	( r{   c                    SSK Jn  [        R                  (       a9  [	        UR
                  SS 5      =n(       a  UR                  S:w  a  [        U 5      $ [        R                  " 5       $ )Nr7   rr  current_node
index_expr)
ru  rs  ri   compute_all_boundsrO   interpreterr  rf   rg   unknown)r  rs  fx_nodes      rW   get_bounds_index_exprr  k  sN     	!!~tDDWDNNl*5!!""$$r{   c                    U S   S:H  $ )Nr   rj  r   )prefixs    rW   prefix_is_reductionr  y  s    !9r{   c                D    U [         R                  :w  d   e[        XSSS9$ )1
Used to generate an integer-nonnegative symbol.
Tintegernonnegative)re   SIZErd   )r  rb  s     rW   sympy_index_symbol_with_prefixr  }  s'     TYY vDdCCr{   c                b    U =(       d    [         R                  =(       a    [         R                  $ r   )ri   debug_index_assertsassert_indirect_indexing)checks    rW   generate_assertr    s    /V//TV5T5TTr{   c                D    U S   S:w  d   e[         R                  " U SSS9$ )r  r   r   Tr  )r~   r  r   s    rW   sympy_index_symbolr    s)     7c>> <<d==r{   c                          SS jn[         R                  " U 5      R                  UR                  5        VVs0 s H  u  p4X2" X45      _M     snn5      $ s  snnf )z
When the passed replacement symbol v is a string, it is converted to a symbol with name v that
have the same replaced expression integer and nonnegative properties.
c                    [        U [        R                  5      (       d   e[        U[        5      (       a*  [        R                  " UU R
                  U R                  S9$ U$ )Nr  )r}   r~   r4  r  r  r   is_nonnegative)replacedreplacements     rW   	to_symbolsympy_subs.<locals>.to_symbol  sV     (EJJ////k3''<< ++$33  r{   )r  r   r  zUnion[sympy.Expr, str]r   sympy.Symbol)r~   r5  xreplacerZ  )rm  replacementsr  kr   s        rW   
sympy_subsr    sh    +A	 ==''(4(:(:(<=(<IaO	(<= =s   A
c                    [        U [        R                  5      =(       d-    [        U [        R                  5      =(       a    U R                  $ r   )r}   rP   r2   r  _has_symbolic_sizes_strides)r(  s    rW   is_symbolicr    s3    a& 1ell#E(E(Er{   c                 &    [        S U  5       5      $ )Nc              3  8   #    U  H  n[        U5      v   M     g 7fr   )r  r  s     rW   r   "any_is_symbolic.<locals>.<genexpr>  s     ,t!{1~~tr   r  )r   s    rW   any_is_symbolicr    s    ,t,,,r{   )z,aten._fused_moving_avg_obs_fq_helper.defaultz7aten._fused_moving_avg_obs_fq_helper_functional.defaultzfbgemm.dense_to_jagged.defaultz%fbgemm.jagged_to_padded_dense.defaultrun_and_save_rng_staterun_with_rng_statezaten._local_scalar_densezaten._assert_scalarc                    SSK Jn  U R                  R                   HH  n[	        U5      (       a  Us  $ UR
                  R                  S5      =nc  M7  U" U5      (       d  MF  Us  $    g )Nr   )r)   ro  )%torch.fx.experimental.symbolic_shapesr)   rv  rU  is_cudagraph_unsafe_fx_noder  rW  )r  r)   rl  ro  s       rW   %get_first_incompatible_cudagraph_noder    sW     L&t,,K99==''C49Ns9S9SK  r{   c                    [        [        [        U R                  R                  5      5      5      nUR
                  S:X  d   eU$ )z$Get the output node from an FX graphr  )nextiterreversedrv  rU  r  )r  	last_nodes     rW   output_noder    s6    T(288>>234I<<8###r{   c                    U R                   R                  SS9n[        S U 5       5      n[        U 5      R                  S   n[        U[        5      (       a  UOU4n[        S U 5       5      nX%-  $ )Nr  r  c              3     #    U  HX  n[        UR                  R                  S 5      [        R                  5      (       d  M=  UR                  S    R
                  v   MZ     g7fro  N)r}   r  rW  rP   r  r   )r   rl  s     rW   r   "get_all_devices.<locals>.<genexpr>  sC      9%DdiimmE*ELL9 	 		%%s   <A" A"r   c              3    #    U  H  n[        U[        R                  R                  5      (       d  M.  [        UR                  R                  S 5      [        R                  5      (       d  Mh  UR                  S    R                  v   M     g7fr  )r}   rP   r  r5   r  rW  r  r   )r   r  s     rW   r   r    s[      7Cc588==) 	 sxx||E*ELL9 	s   -B6B- B)rv  
find_nodesr#   r  r   r}   tuple)r  placeholder_nodesinput_devicesout_argout_argsout_devicess         rW   get_all_devicesr    s~    ++}+=.8 9%9 /M "o""1%G$We44w7*H,6 77 -K &&r{   c                    [        [        R                  R                  5       5       GH5  n U R	                  S5      (       d  M  [        R                  U    nUR
                   H  nUR	                  S5      (       d  M  [        X5      n[        U[        R                  R                  R                  R                  5      (       d  Me  UR                   Hp  n[        U[        R                  R                  R                  R                  5      (       d  MB  UR                  R                   R"                  R%                  5         Mr     M     [        R                  U 	 GM8     S[        R                  ;   aR  [        R                  S   n['        UR(                  R*                  R,                  5      ?UR(                  R*                  ?[0        R2                  " 5         g )Nz&torch._inductor.runtime.compile_tasks.triton_ztriton.runtime.driver)rb  sysmodulesrY  
startswith__dict__rO   r}   rP   	_inductorruntimetriton_heuristicsCachingAutotunercompile_resultsTritonCompileResultkernelrunmod__del__r  driveractiveutilsinstancegccollect)module_namem	attr_namer  r  r  s         rW   unload_xpu_triton_pydsr    sJ   CKK,,./%%&NOOKK$I##I.. .EOO33EEVV  #)"8"8%"!OO33EEYY 
 #MM--1199; #9 $ KK$# 0( #++-kk12""(()2JJ#JJLr{   _registered_cachesc                    [        U S5      (       a  [        U R                  5      (       d  [        U  S35      e[        R                  U 5        U $ )z\
Use this decorator to register any caches that should be cache_clear'd
with fresh_cache().
cache_clearz# does not have a cache_clear method)r  callabler  AttributeErrorr  r  r  s    rW   clear_on_fresh_cacher    sE    
 3&&hs.G.Gu$GHIIc"Jr{   c                 >    [          H  n U R                  5         M     g)z
Clear all registered caches.
N)r  r  r   s    rW   clear_cachesr  )  s     " "r{   c              #  `  #    [         R                  R                  U 5      n U[         R                  U '   Sv   Uc!  [         R                  R                  U S5        gU[         R                  U '   g! Uc!  [         R                  R                  U S5        f U[         R                  U '   f = f7f)a  Thread-safe env var set/restore using atomic C-level lookups.

We avoid mock.patch.dict(os.environ, ...) because it internally calls
os.environ.copy(), which iterates all env var keys then fetches values in
separate steps. That approach is not atomic and can race with background threads
(e.g. Triton async compilation) modifying the environment, causing KeyError,
so we use os.environ.get() for individual keys which is an atomic C-level lookup.
N)osenvironrW  rS   )rc  r   olds      rW   _set_envr  1  sy      **..
C"

3;JJNN3%!BJJsO ;JJNN3%!BJJsOs    B.A2 8B.29B++B.c              #  `  ^#    [        5         SSKJn  U" [        R                  " US95      m [        ST5         [        R                  ST5        U" [        R                  R                  TS5      5      n[        SU5         Sv   [        U [        5      (       a  [        U 5      S:X  d   S	5       e[        R                  R                  U5      (       a{  [        R                  " U5      nU R!                  U Vs0 s HH  nS
U;  d  M  U[        R                  R#                  [        R                  R                  XF5      5      _MJ     sn5        SSS5        SSS5        U(       a^  [%        5       (       a-  [&        R(                  R+                  5       (       a
  [-        5         [.        R0                  " T[%        5       U4S jS9  [        5         gs  snf ! , (       d  f       N= f! , (       d  f       N= f! [2         a    [        R5                  ST5        e f = f! [        5         f = f7f)z
Contextmanager that provides a clean tmp cachedir for pt2 caches.

Optionally, pass a dict as 'cache_entries' to get a list of filenames and sizes
generated with this cache instance.
r   )normalize_path_separator)dirTORCHINDUCTOR_CACHE_DIRzUsing inductor cache dir %stritonTRITON_CACHE_DIRNz!expected empty cache_entries dictz.lockc                .   > [         R                  STUS9$ )Nz*Failed to remove temporary cache dir at %s)exc_info)r   warning)r  pathr  inductor_cache_dirs      rW   r  fresh_cache.<locals>.<lambda>t  s    S[[@&% 6A 6r{   )ignore_errorsonerrorz(on error, temporary cache dir kept at %s)r  torch._inductor.cpp_builderr
  tempfilemkdtempr  r   r   r  r  r  r}   dictrR   existslistdirra  getsize
is_windowsrP   rJ   rQ   r  shutilrmtree	Exceptionr  )cache_entriesr  deleter
  triton_cache_dirfilesfr  s          @rW   fresh_cacher'  F  s     ND1(2B2Bs2KL'/1CDII35GH7/:  ,.>?mT22}-2W4WW2ww~~&677 "

+; <%,, */).A#*!#3 !V277??277<<@P3T#U U). @ E$ ||		 6 6 8 8&(MM" )l  	5 @? EDD  >@RS 	ss   +H.G: A	G)A9G=
GAGGG)A-G: H.G
G&	"G))
G73G: :"HH H++H.)reversec                   U R                   n[        [        U 5      5      n[        [	        X2SS95      nU(       d  [        [        U5      5      $ U$ )NTrc  r(  )__getitem__r   rR   rb  r  r  )seqr(  gettera_rsort_idxs        rW   argsortr0    sC    __F
C/C F3D9:HHX&''Or{   c          	     F  ^  SU 4S jjn[        U5       VVs/ s H>  u  pEU[        U[        R                  5      (       a  UR                  R
                  OU4PM@     nnn[        U[        R                  " U5      US9nU VVs/ s H  u  pGUPM	     nnnU$ s  snnf s  snnf )Nc                ~   > U u  p#Uu  pESU4S jjnU" X5:  5      (       a  gU" X5:  5      (       a  gX$:  a  gX$:  a  gg)Nc                R   > [        U [        5      (       a  U $ TR                  U SS9$ )NT)size_oblivious)r}   r:  evaluate_expr)rm  rx  s    rW   evaluate*argsort_sym.<locals>.cmp.<locals>.evaluate  s+    $%%**4*EEr{   r   r7   r   )rm  z%Union[bool, torch.SymInt, sympy.Expr]r   r:  r   )r(  r)  a_idxa_valb_idxb_valr6  rx  s          rW   r  argsort_sym.<locals>.cmp  sN    	F
 EM""EM""
 ==r{   r*  )r(  tuple[int, sympy.Expr]r)  r=  r   r   )	r  r}   rP   r2   rl  rm  r  r  
cmp_to_key)	rx  r,  r(  r  rb  r   exprsr   r  s	   `        rW   argsort_symr@    s    4  n$FC 
Z5<<88affkka@$ 
  5i2237IE %&fccF&M
 's   ABBc                r    U [         R                  :X  a  g[         R                  " SU S9R                  5       $ )Nrv   r   r   )rP   r^  r   element_sizerB  s    rW   get_dtype_sizerD    s-     ;;r'4466r{   c                       \ rS rSr% S\S'   Srg)LineContexti  r   contextr   Nr   r   r   r   r   r   r   r{   rW   rF  rF    s    Lr{   rF  c                  *    \ rS rSr% S\S'   S\S'   Srg)ValueWithLineMapi  r  r   zlist[tuple[int, LineContext]]line_mapr   NrH  r   r{   rW   rJ  rJ    s    J++r{   rJ  c                     \ rS rSrSrSSS jjr\R                  SS j5       rSS jr	SS jr
SS jrSS jrSS	 jrSS
 jrSS jrSS jr    S S jrS!S"S jjrS!S#S jjrS!S#S jjr S$     S%S jjrS&S jrSS jrS'S jrS(S jrSrg))IndentedBufferi     c                    / U l         Xl        g r   )_lines_indent)r  initial_indents     rW   __init__IndentedBuffer.__init__  s    GI%r{   c              #  \   #    U R                   n Xl         S v   X l         g ! X l         f = f7fr   )tabwidth)r  rV  prevs      rW   set_tabwidthIndentedBuffer.set_tabwidth  s%     }}	!$M MDMs   ,
! ,),c                   [        5       nSn/ nU R                   H  n[        U[        5      (       a  U" 5       nUc  M$  O5[        U[        5      (       a  UR                  X$R                  45        MX  Un[        U[        5      (       d   eUR                  U5        UR                  S5        USUR                  S5      -   -  nM     [        UR                  5       U5      $ )Nr7   rR  )r   rP  r}   DeferredLineBaserF  r  rG  r  writecountrJ  getvalue)r  bufr   linemaplilines         rW   getvaluewithlinemap"IndentedBuffer.getvaluewithlinemap  s    j13++B".//t<  B,,::/dC((((IIdOIIdOTZZ%%%A   88r{   c                6    U R                  5       R                  $ r   )rc  r   r  s    rW   r^  IndentedBuffer.getvalue  s    '')///r{   c                   [        5       nU R                   H  n[        U[        5      (       a  U" 5       nUc  M$  O[        U[        5      (       a  M<  Un[        U[
        5      (       d   eUR                  S5      (       a  UR                  US S 5        M  UR                  U5        UR                  S5        M     UR                  5       $ )N\r   rR  )	r   rP  r}   r[  rF  r  endswithr\  r^  )r  r_  ra  rb  s       rW   getrawvalueIndentedBuffer.getrawvalue  s    j++B".//t<  B,,dC((((}}T""		$s)$		$		$   ||~r{   c                8    U R                   R                  5         g r   )rP  clearrf  s    rW   rn  IndentedBuffer.clear  s    r{   c                ,    [        U R                  5      $ r   )r:  rP  rf  s    rW   __bool__IndentedBuffer.__bool__  s    DKK  r{   c                :    SU R                   U R                  -  -  $ )Nr2  )rQ  rV  rf  s    rW   r  IndentedBuffer.prefix  s    dllT]]233r{   c                &    U R                  S5        g )NrR  	writelinerf  s    rW   newlineIndentedBuffer.newline  s    tr{   c                   [        U[        5      (       a  U R                  R                  U5        g [        U[        5      (       a9  U R                  R                  UR                  U R                  5       5      5        g UR                  5       (       a.  U R                  R                  U R                  5        U 35        g U R                  R                  S5        g Nr  )r}   rF  rP  r  r[  with_prefixr  stripr  rb  s     rW   rw  IndentedBuffer.writeline  s    dK((KKt$.//KKt//>?ZZ\\KK$++-78KKr"r{   c                8    U H  nU R                  U5        M     g r   rv  )r  linesrb  s      rW   
writelinesIndentedBuffer.writelines"  s     DNN4  r{   c                L   ^ ^ [         R                  SUU 4S jj5       nU" 5       $ )Nc               3     >#    T=R                   T -  sl          S v   T=R                   T -  sl         g ! T=R                   T -  sl         f = f7fr   rQ  )offsetr  s   rW   r  "IndentedBuffer.indent.<locals>.ctx)  s8     LLF"L'&&s   A4 AAAr   Iterator[None])
contextlibcontextmanager)r  r  r  s   `` rW   indentIndentedBuffer.indent(  s$    		"	"	' 
#	' ur{   c                .    U =R                   U-  sl         g r   r  r  r  s     rW   	do_indentIndentedBuffer.do_indent3      r{   c                .    U =R                   U-  sl         g r   r  r  s     rW   do_unindentIndentedBuffer.do_unindent6  r  r{   c           	        [        U[        5      (       a  [        S5      nUR                   HR  n[        U[        5      (       a  M  U(       d  M#  [        U[        U5      [        UR                  5       5      -
  5      nMT     [        R                  " U5      (       a  SnUR                   HV  n[        U[        5      (       a  U R                  R                  U5        M5  [        R                  X[        U5      S  5        MX     g [        R                  " U5      nU(       a  UR                  5       nU(       d  g UR                  5       nUR!                  S5       H  nU R                  U5        M     g )Ninfr   rR  )r}   rM  floatrP  rF  minrR   r  mathisinfr  rw  r   textwrapdedentrstripr`  )r  
other_coder}  r  rb  r   s         rW   spliceIndentedBuffer.splice9  s    j.115\F"))!$44 TS5G)GHF * zz&!!"))dK00KK&&t,",,TF3FG	 * "4J'..0
#**,J%%d+q! ,r{   c                    [        U R                  S9nU R                   Vs/ s H
  o1" U5      PM     snUl        U$ s  snf N)rR  )rM  rQ  rP  )r  r  r   rb  s       rW   r   IndentedBuffer.mapS  s8    DLL9-1[[9[Td4j[9

 :s   =c                @    [        U 5       SU R                  5        S3$ )Nr  r  )r  r^  rf  s    rW   __repr__IndentedBuffer.__repr__X  s     t*Qt}}/q11r{   c                    U R                   UR                   :X  d   e[        U R                   S9nUR                  U R                  5        UR                  UR                  5        U$ r  )rQ  rM  r  rP  )r  otherr   s      rW   __add__IndentedBuffer.__add__[  sK    ||u}},,,DLL9t{{#u||$
r{   c                    XR                   ;   $ r   )rP  )r  new_lines     rW   containsIndentedBuffer.containsc  s    ;;&&r{   )rQ  rP  rV  Nr   )rR  r   r   r  )rV  r   r   r  )r   rJ  r   r  r   r  r   r:  )rb  z)Union[LineContext, DeferredLineBase, str]r   r  )r  z3Sequence[Union[LineContext, DeferredLineBase, str]]r   r  r   )r  r   r   'contextlib.AbstractContextManager[None])r  r   r   r  F)r  zUnion[IndentedBuffer, str]r}  r:  r   r  )r  zCallable[[Any], Any]r   rM  )r  r   r   rM  )r  z)Union[DeferredLineBase, LineContext, str]r   r:  )r   r   r   r   rV  rS  r  r  rX  rc  r^  rk  rn  rq  r  rx  rw  r  r  r  r  r  r   r  r  r  r   r   r{   rW   rM  rM    s    H& ! !9(0(!4#!H!	!	 EJ"4"=A"	"4
2'r{   rM  c                  6   ^  \ rS rSrSU 4S jjrSS jrSrU =r$ )FakeIndentedBufferig  c                "   > [         TU ]  5         g r   )superrS  )r  	__class__s    rW   rS  FakeIndentedBuffer.__init__h  s    r{   c                V    US:X  a  [         R                  X5      $ [        SU S35      e)Nr  zTried to call self.z on FakeIndentedBuffer. This bufferis currently used on TritonTemplateKernel to prevent actualwrites to the body without explicitly specifying the body with`TritonTemplateKernel.set_subgraph_body(name)`)object__getattribute__r   )r  r   s     rW   r  #FakeIndentedBuffer.__getattribute__k  s9    ;**466!$ (= =
 	
r{   r   r  )r   r  r   r   )r   r   r   r   rS  r  r   __classcell__r  s   @rW   r  r  g  s    
 
r{   r  c               #     #    [         R                  [         R                  p S v   Xs[         l        [         l        g ! Xs[         l        [         l        f = f7fr   )r  stdoutstderr)initial_stdoutinitial_stderrs     rW   restore_stdout_stderrr  v  s9     %(ZZN@!/
CJ
CJs    A> AAAc                  h    \ rS rSrSrSS jrSS jrSS jrSS jrSS jr	SS jr
SS	 jrSS
 jrSrg)r[  i  z.A line that can be 'unwritten' at a later timec                >    UR                  5       (       d  SnXl        g r{  )r}  rb  r~  s     rW   rS  DeferredLineBase.__init__  s    zz||D	r{   c                    [         e)zJReturns either self.line or None to indicate the line has been 'unwritten'r  rf  s    rW   r  DeferredLineBase.__call__      !!r{   c                    [         e)z3Returns a new deferred line with the same conditionr  r~  s     rW   	_new_lineDeferredLineBase._new_line  r  r{   c                @    U R                  U U R                   35      $ r   r  rb  )r  r  s     rW   r|  DeferredLineBase.with_prefix  s    ~~455r{   c                T    U R                  U R                  R                  5       5      $ r   )r  rb  r  rf  s    rW   r  DeferredLineBase.lstrip  s    ~~dii..011r{   c                >    U R                  U R                  U   5      $ r   r  )r  r  s     rW   r+  DeferredLineBase.__getitem__  s    ~~dii.//r{   c                ,    [        U R                  5      $ r   )r:  rb  rf  s    rW   rq  DeferredLineBase.__bool__  s    DIIr{   c                ,    [        U R                  5      $ r   )rR   rb  rf  s    rW   __len__DeferredLineBase.__len__  s    499~r{   )rb  N)rb  r  )r   zUnion[str, None])rb  r  r   r   )r  r  r   r   )r   r   )r  zUnion[int, slice]r   r   r  r   r   )r   r   r   r   r   rS  r  r  r|  r  r+  rq  r  r   r   r{   rW   r[  r[    s-    8
""620r{   r[  c                  D   ^  \ rS rSrSrSU 4S jjrSS jrS	S jrSrU =r	$ )
DelayReplaceLinei  z6At end of codegen call `line.replace(key, value_fn())`c                <   > [         TU ]  U5        Xl        X l        g r   )r  rS  rc  value_fn)r  rc  r  rb  r  s       rW   rS  DelayReplaceLine.__init__  s     r{   c                j    U R                   R                  U R                  U R                  5       5      $ r   )rb  replacerc  r  rf  s    rW   r  DelayReplaceLine.__call__  s#    yy  4==?;;r{   c                D    [        U R                  U R                  U5      $ r   )r  rc  r  r~  s     rW   r  DelayReplaceLine._new_line  s    $-->>r{   )rc  r  )rc  r  r  zCallable[[], str]rb  r  r  )rb  r  r   r  )
r   r   r   r   r   rS  r  r  r   r  r  s   @rW   r  r    s    @!
<? ?r{   r  c                   [        U [        R                  5      (       a  U nO[        R                  " [        5       U 5      n[        R
                  " U5      n[        R                  R                  (       aF  UR                  c   eUR                  S:  d  UR                  S:X  a  [        R                  S5        ggUR                  S:X  a  SOSnUR                  nXC:  a  [        R                  S	X4S
.S9  gg)N	   
   z6GPU arch does not support max_autotune_gemm mode usageFTrJ   rr   D   z,Not enough SMs to use max_autotune_gemm mode)min_sms	avail_sms)extra)r}   rP   r   rX   r    createversionrq   majorr   r  r  multi_processor_count)index_or_devicer   propr  r  s        rW   
is_big_gpur    s    /5<<00 lno>""6*D }}zz%%%::>TZZ2-KKPQKK5(bbG**I:%> 	 	
 r{   c                     [         R                  R                  5       (       a(  [         R                  R                  5       R                  $ [         R
                  R                  S5      R                  $ )NrH   )rP   rJ   rQ   get_device_propertiesgpu_subslice_countrH   r  r   r{   rW   get_max_num_smsr    sI    yyyy..0CCC::++F3IIIr{   c                     [         R                  R                  5       (       d  g[         R                  R                  [         R                  R	                  5       5      n U R
                  S:H  $ )zEReturns true if the device is a NVIDIA B200, otherwise returns false.Fr  )rP   rH   rQ   r  r  r  )device_propertiess    rW   
using_b200r    sM     ::""$$

889R9R9TU""b((r{   c                     [         R                  R                  5       (       a
  [        5       $ [         R                  R                  5       n [        5       U b  U -
  $ S-
  $ )zFHandle experimental carveout if set otherwise return hardware SM countr   )rP   rJ   rQ   r  r  _get_sm_carveout_experimental)carveouts    rW   get_num_smsr    sM     yy  xx557HH,@HHaHHr{   c                    SSK JnJn  Uc
  [        5       nUR	                  S5      nX -  [
        -  nU" UUUUR                  " 5       S9$ )zKBuilds and returns a WorkspaceArg for the device side TMA workspace buffer.r7   )r8   WorkspaceZeroModeF)r]  	zero_moder   
outer_name)codegen.commonr8   r  r  	from_boolTMA_DESCRIPTOR_SIZEunique_name)num_tma_descriptorsr   num_programsr8   r  r  r  s          rW   get_tma_workspace_argr    sU     @"}!++E2I-0CCD++-	 r{   c                   U R                   U;  a!  [        R                  SU R                   U5        [        U R                  R
                  5      =(       a+    U R                   U;   =(       a    [        U R                  5      $ )NzDNot using template since dtype %s is not in allowed layout dtypes %s)r   r   r   is_gpur   r  r  )r<  allowed_layout_dtypess     rW   _use_template_for_gpur    sf     ||00		RLL!	
 	v}}!!" 	&LL11	&v}}%r{   c                    U R                  5       [        R                  R                  5       R                  S5       Vs/ s H  oR	                  5       PM     sn;   $ s  snf NrQ  )r   ri   max_autotune_gemm_backendsr`  r}  backendrT   s     rW   _use_autotune_backendr  	  P    ==?!<<BBDJJ3OOa	O      Ac                    U R                  5       [        R                  R                  5       R                  S5       Vs/ s H  oR	                  5       PM     sn;   $ s  snf r  )r   ri   max_autotune_conv_backendsr`  r}  r  s     rW   _use_conv_autotune_backendr    r  r  )enable_int32enable_float8check_max_autotunec                  SSK JnJn  [        R                  [        R
                  [        R                  /nU(       a>  [        R                  [        R
                  [        R                  [        R                  /nU(       a/  UR                  [        R                  [        R                  /5        [        U R                  R                  5      =(       a    [        X5      =(       d/    U R                  R                  S:H  =(       a    U R                  U;   =(       ak    [         R"                  =(       d    [         R$                  =(       d    U(       + =(       a/    ['        S5      =(       a    U" U R                  UR(                  5      $ )Nr7   )BackendFeaturehas_backend_featurer  TRITON)r  r!  r"  rP   r   rJ  rL  rT  extendrD  rE  r  r   r  r  r   ri   max_autotunemax_autotune_gemmr  TRITON_TEMPLATES)r<  r  r  r  r!  r"  layout_dtypess          rW   use_triton_templater)    s    D]]ENNEMMBMu{{Se1153D3DEF v}}))* A)&@O ""e+M0M
	P   VF$<$<VDV@V
	P "(+
	P  ~/N/NOr{   output_layout
add_guardsc                  ^^^^^^	 SSK Jn  SSKJm  SU4S jjmSUU4S jjnSUUU	4S jjm        SUUU4S jjm        SU4S	 jjm	U" 5       =(       a$    [	        U4S
 jU 5       5      =(       a    U" U 5      $ )u.  
Return True iff *all* supplied tensors satisfy the CUDA TMA constraints
that Triton relies on today.
* https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TENSOR__MEMORY.html

A tensor is accepted when:
  * 1 ≤ rank ≤ 5 (cuTensorMapEncodeTiled)
  * dtype in _TMA_SUPPORTED_DTYPES (CUtensorMapDataType enum)
  * Base pointer 16-byte aligned
  * Exactly one contiguous ("inner") dim with stride 1
  * All "outer" dims have 16-byte aligned strides
  * Inner dim size × itemsize is a multiple of 16
  * For 1-byte dtypes (e.g. FP8), inner dim ≥ 32
r   )has_triton_tma_devicer7   rr  c                X   > TR                   R                  R                  U [        5      $ r   )rv  rw  statically_known_multiple_ofTMA_ALIGNMENT)
expr_bytesrs  s    rW   _alignedcan_use_tma.<locals>._alignedG  s     ww<<ZWWr{   c                   > U c  gU R                   nU R                  nU R                  nT" U R                  5      (       d  gT" XU5      $ )NTF)r  rF  r   r  )r<  sizesstridesr   r3  _is_tma_compatibles       rW   _is_tma_compatible_layout.can_use_tma.<locals>._is_tma_compatible_layoutJ  sG    >-- &&!%%88r{   c                   > U R                  5       nU R                  5       nU R                  5       nU R                  5       TR                  R
                  ;   a  gU R                  5       =nb  UR                  S:X  a	  T" XU5      $ T" XU5      $ )NFrJ   )get_size
get_stride	get_dtyper  rv  unaligned_buffers
get_devicer  )r  r6  r7  r   m_devicers  r8  _is_tma_compatible_xpus        rW   _is_tma_compatible_matrix.can_use_tma.<locals>._is_tma_compatible_matrixW  sw    

,,. ::<177444&H38N)%%@@!%%88r{   c                  > [        U 5      nUR                  nUS:  d  US:  a  gU[        ;  a  gT(       aK  TR                  R                  R                  U 5      nTR                  R                  R                  U5      nOjU  Vs/ s H(  nTR                  R                  R                  U5      PM*     nnU Vs/ s H(  nTR                  R                  R                  U5      PM*     nn[        U5       V	Vs/ s H4  u  pTR                  R                  R                  US5      (       d  M2  U	PM6     n
n	n[        U
5      S:w  a  gU
S   n[        U5       H  u  pX:X  a  M  T" X-  5      (       a  M    g   X[   nT" X-  5      (       d  gUS:X  a,  TR                  R                  R                  US5      (       d  ggs  snf s  snf s  snn	f )Nr7   r   Fr       T)
rR   itemsizert   rv  rw  guard_int_seqsymbolic_hintr  statically_known_equalsstatically_known_geq)r6  r7  r   rankrG  sizes_i	strides_ir   str   r  	inner_idx	inner_dimrs  r3  r,  s                rW   r8  'can_use_tma.<locals>._is_tma_compatiblee  s   
 5z>>!8tax--gg&&44U;G((66w?IBGH%Qqww''55a8%GHFMNg))77;gIN
 #9-
-ww77A> - 	 

 u:?!H	 y)EA~BM**	 * &		,-- q=!1!1!F!FyRT!U!U; IN
s   /G</G;1G0Gc                j  > US   nTR                   R                  R                  U5      nTR                   R                  R                  US5      (       d  gSnU  HT  nTR                   R                  R                  U5      nTR                   R                  R	                  Xu5      (       d  MT    g   g)Nr   r7   Fl    T)rv  rw  rI  rJ  statically_known_gt)	r6  r7  r   last_stridelast_stride_hint
MAX_UINT32r  	size_hintrs  s	           rW   rB  +can_use_tma.<locals>._is_tma_compatible_xpu  s     bk77++99+Fww778H!LL 
D((66t<Iww33IJJ 
 r{   c              3  4   >#    U  H  nT" U5      v   M     g 7fr   r   )r   r  rC  s     rW   r   can_use_tma.<locals>.<genexpr>  s     ?h)!,,h   )r2  Union[int, sympy.Expr]r   r:  )r<  Optional[Layout]r   r:  )r  r@   r   r:  )r6  Sequence[sympy.Expr]r7  zSequence[_IntLike]r   torch.dtyper   r:  )torch.utils._tritonr.  ru  rs  r   )
r+  r,  matricesr.  r9  rs  r3  r8  rC  rB  s
    `   @@@@@rW   can_use_tmarc  2  s    " :X9 99 9/#/#/ / 
	/ /b##  
	* 	 	5?h??	5%m4r{   )r,  c                    [         R                  R                  (       a  U OS n[        S U 5       5      =(       a,    [	        X#US.6=(       a    [         R                  R
                  $ )Nc              3  Z   #    U  H!  n[        UR                  5       5      S :H  v   M#     g7f)r  N)rR   r<  )r   r  s     rW   r   *use_triton_tma_template.<locals>.<genexpr>  s      5HqC

"Hs   )+r*  )ri   r  enable_template_tma_storer   rc  enable_persistent_tma_matmul)r+  r,  rb  r<  s       rW   use_triton_tma_templateri    sK     %mmEE]4F5H55 	7JO	7MM66r{   c                f    [        X US.6(       d  gSSKJn  SSKJn  U" 5       =(       a    U" 5       $ )Nr*  Fr   )%has_triton_tensor_descriptor_host_tmar7   is_datacenter_blackwell_arch)ri  ra  rk  codegen.cuda.cuda_envrm  )r+  r,  rb  rk  rm  s        rW   !use_triton_blackwell_tma_templatero    s5     #	:  IC 12U7S7UUr{   c                     X;   =(       a    X;   $ r   r   )scale_option_ascale_option_bscaling_typess      rW   use_triton_scaling_templatert    s    
 *N~/NNr{   )maxsizec                 f     [         R                  R                  S5      SL$ ! [         a     gf = f)zCheck if CuTeDSL is importable; cache the result for reuse.

Call ensure_cute_available.cache_clear() after installing CuTeDSL
in the same interpreter to retry the import.
cutlassNF	importlibutil	find_specr  r   r{   rW   ensure_cute_availabler|    s3    ~~''	2$>>     # 
00c                 f     [         R                  R                  S5      SL$ ! [         a     gf = f)zCheck if NVIDIA Universal GEMM (cutlass_api) is importable; cache the result for reuse.

Call ensure_nv_universal_gemm_available.cache_clear() after installing cutlass_api
in the same interpreter to retry the import.
cutlass_apiNFrx  r   r{   rW   "ensure_nv_universal_gemm_availabler    s3    ~~''6dBB r}  c                 f     [         R                  R                  S5      SL$ ! [         a     gf = f)a3  Check if nvMatmulHeuristics is importable; cache the result for reuse.

nvMatmulHeuristics provides performance model-based kernel selection
for NVIDIA GEMM operations.

Call ensure_nvmatmul_heuristics_available.cache_clear() after installing
nvMatmulHeuristics in the same interpreter to retry the import.
nvMatmulHeuristicsNFrx  r   r{   rW   $ensure_nvmatmul_heuristics_availabler    s4    ~~''(<=TII r}  c                   [        5       (       d  g[        S5      (       d  gSSKJn  [	        UR
                  R                  5      (       d  gU" 5       (       d  g[        R                  /n	[        X)5      (       d  g[        R                  (       d  [        R                  (       d  g[        XUS9(       d  g[        S X4 5       5      (       a  gU(       a  U(       a  gUc  gUc  Ub  gg)a  
Returns True if we can use the blackwell kernel for grouped mm.
Required conditions:
    1. CuTeDSL backend is enabled
    2. CuTeDSL is available
    3. We are on a blackwell arch
    4. The dtype is bf16
    5. Max autotune or max autotune gemm is enabled
    6. A, B, and the output are 16B aligned
    7. We are not using dynamic shapes
    8. A is 2d
    9. B is 3d
    10. Offsets are provided
    11. Bias and Scale are not provided
FCUTEDSLr7   rl  )r+  c              3  8   #    U  H  n[        U5      v   M     g 7fr   )
is_dynamicr   rT   s     rW   r   3use_blackwell_cutedsl_grouped_mm.<locals>.<genexpr>0  s     
1.Q:a==.r   T)r|  r  rn  rm  r  r   r  rP   rJ  r  ri   r%  r&  rc  r  )
mat_amat_br<  a_is_2db_is_2doffsbiasscale_resultrm  r(  s
             rW    use_blackwell_cutedsl_grouped_mmr    s    2 !"" ++C&--$$%%'))^^$M 776#;#; u6:

15.
111g|<3r{   c                r   SSK Jn  UR                  R                  R	                  X-  U-  SS9nUS::  d  U[
        R                  R                  :  a  gSSKJ	n  [        R                  R                  (       a  g[        R                  [        R                  [        R                  /n[!        X5      =(       a9    [
        R"                  =(       d    [
        R$                  =(       a    ['        S5      nU(       a;  U" 5       (       d/  [(        R+                  S	[
        R                  R,                  5        gU$ )
Nr7   rr  r   fallbackr   F)try_import_cutlassCUTLASSzFailed to import CUTLASS lib. Please check whether _inductor.config.cutlass.cutlass_dir %s is set correctly. Skipping CUTLASS backend for now.)ru  rs  rv  rw  optimization_hintri   rw  cutlass_backend_min_gemm_sizecodegen.cutlass.utilsr  rP   r  rq   r   rJ  rT  r  r%  r&  r  r   r  cutlass_dir)	r<  r  r(  r  rs  	gemm_sizer  r(  r   s	            rW   use_cutlass_templater  ?  s      221519r2JIA~V^^%Q%QQ9 }} ]]ENNEKK@Mf4 	-  <F$<$<	-!),  !##KK4 **	 Jr{   _IntLikec                :  ^
^ SSK Jm  [        5       (       d  g[        5       (       d  g[	        S5      (       d  gSSKJm
  T
R                  (       a  gU R                  R                  S:w  d  [        R                  R                  (       a  g[        R                  (       d  [        R                  (       d  gXU/nUb  UR!                  U5        [#        U4S jU 5       5      (       a  gXE/n	Ub  U	R!                  U5        [#        U
4S	 jU	 5       5      (       a  gg
)a  
Return True if we can use the NVIDIA Universal GEMM Template.

Required conditions:
    1. NVGEMM backend is enabled
    2. cutlass_api is available
    3. We are on a NVIDIA GPU
    4. Max autotune or max autotune gemm is enabled
    5. Not in AOT Inductor mode (requires runtime JIT compilation)
    6. Base pointers are 16-byte aligned
    7. Shape dimensions are not unbacked symbols

Note:
    - Shape and stride constraints are handled internally by
      cutlass_api.get_kernels() which filters incompatible kernels.
    - GroupedGemm currently only supports TN layout (column-major B).
      Any other layout will act as a noop and fall back to ATen.
    - Dynamic shapes are supported as long as they have hints
      (from example inputs).
r   )has_free_unbacked_symbolsFNVGEMMr7   rr  rH   c              3  4   >#    U  H  nT" U5      v   M     g 7fr   r   )r   dimr  s     rW   r   1use_nv_universal_gemm_template.<locals>.<genexpr>  s     
C]c$S))]r\  c              3  p   >#    U  H+  oR                  5       TR                  R                  ;   v   M-     g 7fr   )r  rv  r?  )r   trs  s     rW   r   r    s&     
O>N::<177444>Ns   36T)r  r  r|  r  r  ru  rs  aot_compilationr   r  rP   r  rq   ri   r%  r&  r  r  )r<  r  r(  r  r  r  r  r  dims_to_checktensors_to_checkrs  r  s             @@rW   use_nv_universal_gemm_templater  `  s    < P ""-// **}}V#u}}'8'86#;#;
 1IM}Q

C]
CCC ~%

O>N
OOOr{   c                    [         R                  R                  R                  5       nUS:X  a  gU R                  5       UR	                  S5       Vs/ s H  o"R                  5       PM     sn;   $ s  snf )z8Check if CUTLASS should be used for the given operation.ALLTrQ  )ri   rw  cutlass_enabled_opsr   r`  r}  )op_nameenabled_opsrT   s      rW   _use_cutlass_for_opr    sY    ..44::<Ke==?+2C2CC2HI2HQwwy2HIIIIs   A0r   c           
        SSK Jn  [        R                  R                  U-  nUR
                  R                  R                  [        R                  " [        R                  " X%U -  5      [        R                  " X%U-  5      5      5      =(       aa    UR
                  R                  (       + =(       a?    UR
                  R                  (       + =(       a    [        R                  R                  S:  $ )Nr   rr  )torch._inductor.virtualizedrs  ri   r  decompose_k_thresholdrv  rw  statically_known_truer~   AndGeaot_modecpp_wrappernum_decompose_k_splits)r  r(  r  threshold_multiplers  r  s         rW   use_decompose_k_choicer    s     ."MM??BTT 	
..IIA56A56	
 	5    	5 ###	5 MM0014
r{   c           
        [         R                  R                  nSSKJn  [        [        R                  R                  5      =(       a    UR                  R                  R                  [        R                  " [        R                  " X#U -  5      [        R                  " X#U-  5      5      5      =(       a=    UR                  R                  (       + =(       a    UR                  R                   (       + $ )z
Check if we should use the contiguous subgraph transform.
This transform makes the second matrix contiguous before the matmul.
r   rr  )ri   rocmcontiguous_thresholdr  rs  r:  rP   r  rq   rv  rw  r  r~   r  r  r  r  )r  r(  r  r  rs  s        rW   use_contiguousr    s     ";;;; . 	U]] 	$GG22II145145
	$    	$ ###
r{   c                6   [         R                  R                  n/ SQn[        U[        R
                  5      (       a  UR                  (       d  U$ US:X  a  / $ [        U [        R
                  5      (       a  U R                  (       a0  [        U[        R
                  5      (       a  UR                  (       d  SnO[        X -  X!-  5      nSn[        R                  " U5      nU Vs/ s H  nX::  d  M
  X:  d  M  UPM     nn/ / / pn	U H`  nX,-  nUS:  a  M  XS-
  -  S:X  a  US:  a  U	R                  U5        M3  US-  S:X  a  U
R                  U5        MO  UR                  U5        Mb     [         R                  S:X  a  X-   U-   $ X-   U-   nUS U $ s  snf )	N)rr   rF  ru   rs      r   r  r  rs   r7   rF  
EXHAUSTIVE)ri   r  r  r}   r~   r4  	is_numberr  divisorsr  max_autotune_gemm_search_space)r  r(  r  k_splits_limitdefault_k_splitsmax_k_splitmin_k_splitr  divisorpow_of_2_divisorsmul_of_32_divisorsrest_of_splitsdkPartbest_splitss                  rW   get_k_splitsr    s    ]]99N .!UZZ  	1		1ejj!!!++1ejj!!!++!&!&)K~~a H  G! 	&-&< 	   =?B> 3; AI!#$$Q'RZ1_%%a( !!!$ " ,,< 5FF#8>IK''=s   (	F5F<Fc                T    [         R                  R                  U 5      R                  $ r   )rP   rH   r  gcnArchNamer   s    rW   _rocm_native_device_arch_namer  	  s    ::++F3???r{   c                      SS K n SSKJnJn  SSKJn  [        R                  R                  U R                  5      nXAX#4$ ! [         a    SS jnSS jn " S S5      nS n N&f = f)	Nr   )gen_ops_librarygen_ops_preselected)CKGemmOperationc                     / $ r   r   r   r{   rW   r  *try_import_ck_lib.<locals>.gen_ops_library/	      Ir{   c                     / $ r   r   r   r{   rW   r  .try_import_ck_lib.<locals>.gen_ops_preselected2	  r  r{   c                      \ rS rSrSrg)*try_import_ck_lib.<locals>.CKGemmOperationi5	  r   N)r   r   r   r   r   r   r{   rW   r  r  5	  s    r{   r  )r   r  )ck4inductor(ck4inductor.universal_gemm.gen_instancesr  r  ck4inductor.universal_gemm.opr  r  r  dirname__file__r  )r  r  r  r  package_dirnames        rW   try_import_ck_libr  	  sh    	
	
 ''//+*>*>? -@QQ  			 	 s   ;A  A$#A$c                J   [         R                  (       d  [         R                  (       d  g[        R                  R
                  (       d  gU R                  R                  S:w  a  g[        U R                  5      n[         R                  R                   Vs0 s H  o"R                  S5      S   U_M     sn=(       d    UR                  S5      S   U0nUR                  5       [         R                  R                  -   Vs/ s H  nX2   PM	     nnU(       d  gU R                  [        R                  [        R                   [        R"                  4;  a  g[%        5       u  n    nU(       d  [&        R)                  S5        gU[         R                  l        gs  snf s  snf )NFrH   :r   z,Please pip install Composable Kernel packageT)ri   r%  r&  rP   r  rq   r   r  r  r  archr`  rY  ck_supported_archr   r   rJ  rL  r  r   r  ck_dir)r<  native_archr  requested_archsrequested_supported_archsck_package_dirnamer   s          rW   use_ck_templater  <	  s@   6#;#;==}}V# 0>K39;;3C3CD3Cawws|A)3CD #q!;IO
 !%%'&++*G*GG!GA 	G  ! %||EMM5>>5==II"3"51aBC+FKK+ E!s   FF c                    SSK Jn  [        S5      =(       a>    [        U 5      =(       a,    UR                  R
                  R                  X-  U-  SS9S:  $ )Nr7   rr  CKr   r  r   ru  rs  r  r  rv  rw  r  r<  r  r(  r  rs  s        rW   use_ck_gemm_templater  a	  sP     	d# 	KF#	KGG..quqy2.FJr{   c                    SSK Jn  [        S5      =(       a>    [        U 5      =(       a,    UR                  R
                  R                  X-  U-  SS9S:  $ )Nr7   rr  CKTILEr   r  r   r  r  s        rW   use_ck_tile_gemm_templater  k	  sP     	h' 	KF#	KGG..quqy2.FJr{   c                <    [        S5      =(       a    [        U 5      $ )Nr  )r  r  r<  s    rW   use_ck_conv_templater  u	  s    %d+G0GGr{   c                    [         R                  =(       d    [         R                  =(       a    U R                  R                  S:H  $ r  )ri   r%  r&  r   r  r  s    rW   _use_template_for_cpur  y	  s2    7v77&
--


%&r{   c                   SSK Jn  [        UR                  U5      (       d   eUR                  R                  nUR                  R
                  n[        U 5      =(       al    UR                  5       [        R                  :H  =(       aD    [        U5      S:H  =(       a/    [        U5      S:H  =(       a    US   US   :H  =(       a    US   S:H  n[        XUSS9=(       a#    UR                  R                  5       =(       d    U$ )Nr7   )rA      r  F)require_constant_mat2)r  rA   r}   r<  r  rF  r  r>  rP   rL  rR   use_cpp_gemm_templateis_contiguous)r<  mat1mat2rA   	mat1_sizemat1_stridemat1_each_batch_is_contiguouss          rW   use_cpp_bmm_templater  	  s     dkk6****
   I++$$Kf% 	"NN-	"^q 	" "	" ^y|+		"
 ^q  " !t5Q !!#D'Dr{   c                   SSK Jn  SSKJn  SSKJn	  SSKJn
  [        U 5      (       a  [        S5      (       d  g[        R                  R                  (       d  gUR                  5       [        R                  [        R                   4;   n[        R"                  [        R$                  [        R&                  [        R                  [        R                   /nU
" UUU(       a  U R(                  OS UUS9u  ppp[+        X45      (       a  g[-        X'R.                  5      (       a  UR1                  5       nU	" UR                  5       5      u  nnU" S	UUUUR                  5       UR                  5       U[3        5       U(       + US
9
nSS jnU R(                  U;   =(       aT    US L=(       aI    U" U5      =(       a:    [-        X'R4                  5      =(       a    UR7                  5       =(       d    U(       + $ )Nr7   r
  )create_micro_gemm)*get_gemm_template_output_and_compute_dtype)mm_argsCPPF)	out_dtypemat2_transposeduse_4x2_dim
micro_gemm)input_dtypeinput2_dtypeoutput_dtypenum_threadsuse_refq_group_sizec                N    U R                  5         U R                  5       S   S:H  $ )Nr   r7   )freeze_layoutr=  rT   s    rW   is_last_dim_stride12use_cpp_gemm_template.<locals>.is_last_dim_stride1	  s"    	||~b!Q&&r{   )rT   r@   r   r:  )r  r  codegen.cpp_micro_gemmr
  codegen.cpp_utilsr  kernel.mm_commonr  r  r  ri   cppweight_prepackr>  rP   r[  rP  rL  rJ  halfr   has_free_symbolsr}   BaseViewunwrap_viewparallel_num_threadsr7  is_module_buffer)r<  r  r  r  r   is_woq_int4r  r  r
  r  r  	int8_gemmr(  r  r(  r  r  r   r  r  s                       rW   r  r  	  s    9M) ((0Ee0L0L::$$ U[[%**$==I]]ENNEJJUZZXM")"+&,,'#A!T $$$!@AQROL!"			NN$^^%!(*!J'
 	% 	Cd"	C%	C t]]+	C ""$A,A(Ar{   c                 ~    [         R                  =(       d    [         R                  (       + =(       d    [        S5      $ )NATEN)ri   r%  r&  r  r   r{   rW   use_aten_gemm_kernelsr,  	  s-    7v77 '	v	&'r{   c                  b    \ rS rSr% \R
                  " S5      rS\S'   S
S jrS
S jr	SS jr
Srg	)DebugDirManageri	  r   r  prev_debug_namec                @    [        [        R                  5      U l        g r   )r  r.  counterr   rf  s    rW   rS  DebugDirManager.__init__	  s    ../r{   c                    [         R                  R                  R                  U l        U R                   SU R
                   3U l        U R                  [         R                  R                  l        g )N_tmp_)rP   _dynamori   debug_dir_rootr/  r   new_namerf  s    rW   	__enter__DebugDirManager.__enter__	  sM    $}}33BB//0dggY?.2mm+r{   c                    [         R                  " U R                  5        U R                  [        R
                  R                  l        g r   )r  r   r7  r/  rP   r5  ri   r6  )r  r   s     rW   __exit__DebugDirManager.__exit__	  s*    dmm$.2.B.B+r{   )r   r7  r/  Nr  )r   r   r   r  )r   r   r   r   r}  r]  r1  r   rS  r8  r;  r   r   r{   rW   r.  r.  	  s&    ooa G0<
Cr{   r.  c                  ^ SSK Jn  [        5       mSU4S jjn[        R                  R                  USU5         [        R                  R                  5         U " U0 UD6nS S S 5        W[        T5      4$ ! , (       d  f       N= f)Nr7   r<   c                (   > TR                  U 5        g r   )r[  codesource_codess    rW   save_output_code*run_and_get_code.<locals>.save_output_code	  s    r{   rB  r@  r  r   r  )
rv  r=   r#   r   patchr  rP   r5  resetrb  )r   r   r  r=   rB  r  rA  s         @rW   run_and_get_coderG  	  so    
 %$.LL 
		=*<>N	OT$V$ 
P 4%%% 
P	Os   'A77
Bc                   UR                  SS5      n[        U /UQ70 UD6u  pE/ nU HU  nUR                  [        R                  " SU[        R
                  5      5        U(       d  MA  U Vs/ s H  oSS PM	     nnMW     XF4$ s  snf )Nremove_quoteFz	'''.*?'''r  )rS   rG  r$  r   findallDOTALL)	r   r   r  rI  r  rA  kernelsr@  r  s	            rW   run_and_get_kernelsrN  
  s     ::ne4L+B@@@FGrzz,bii@A<29:'a|'G:G  ? ;s   -Bc                *   ^  SU 4S jjn[        U5      $ )Nc                 R   > T" 5       n U R                  5       R                  5         U $ r   )r  r  )r  r   s    rW   run_with_backward1run_fw_bw_and_get_code.<locals>.run_with_backward
  s!    

r{   )r   r   )rG  )r   rQ  s   ` rW   run_fw_bw_and_get_coderS  
  s    
 -..r{   c                t  ^^ SSK Jn  / mSU4S jjmS	U4S jjn[        R                  R	                  USU5         [        R                  R	                  UST5         [
        R                  R                  5         U " U0 UD6nSSS5        SSS5        T$ ! , (       d  f       N= f! , (       d  f       T$ = f)
zLGet the inductor-generated code, but skip any actual compilation or running.r7   r<   c                (   > TR                  U 5        g r   r  r?  s    rW   rB  "get_code.<locals>.save_output_code
  s    D!r{   c                   >  " S S5      nU R                   (       a  U R                  5       OU R                  5       u  p#T" UR                  5        U(       a  T" UR                  5        U" 5       $ )Nc                  ,    \ rS rSrSrSS jrSS jrSrg)	@get_code.<locals>.patched_compile_to_module.<locals>.DummyModulei"
  z4This is empty to replace the generated triton modulec                    g r   r   rf  s    rW   rS  Iget_code.<locals>.patched_compile_to_module.<locals>.DummyModule.__init__%
  s    r{   c                    g r   r   r  s      rW   callEget_code.<locals>.patched_compile_to_module.<locals>.DummyModule.call(
  s    r{   r   Nr  r   r   r  r   r   r  )r   r   r   r   r   rS  r^  r   r   r{   rW   DummyModulerZ  "
  s    Fr{   ra  )r  codegen_with_cpp_wrappercodegenr   )r  ra  wrapper_codekernel_coderB  s       rW   patched_compile_to_module+get_code.<locals>.patched_compile_to_module!
  s[    	 	 04/?/?D))+T\\^ 	" 	++,[../}r{   compile_to_modulerB  NrD  )r  r=   r   r   )rv  r=   r   rE  r  rP   r5  rF  )r   r   r  r=   rf  r   rB  rA  s         @@rW   get_coderi  
  s    $ L", 	

.0I	
 	

-);=MN	 	O	
  	ON	
 	
 s#   "B('BB(
B%	!B((
B7c                    [        U /UQ70 UD6nS[        U5      s=::  a  S::  d  O   S[        U5       35       eUS   $ Nr7   r  z%expected one or two code outputs got r   )ri  rR   )r   r   r  rA  s       rW   get_triton_coderl  C
  sQ    B000LL!&Q& 
/L0A/BC& ?r{   c                    [        U /UQ70 UD6u  p4S[        U5      s=::  a  S::  d  O   S[        U5       35       eUS   $ rk  )rG  rR   )r   r   r  r   rA  s        rW   run_and_get_triton_codern  M
  sU     'r;D;F;OAL!&Q& 
/L0A/BC& ?r{   c                   ^^^ SSK Jm  SSKJn  UR                  m/ mSUUU4S jjn[
        R                  R                  USU5         U " U0 UD6nS S S 5        UT4$ ! , (       d  f       WT4$ = f)Nr   r<   rD   c                 h   > T" U 0 UD6  U S   n[        UT5      (       d   eTR                  U5        g )Nr  )r}   r  )r   r  rv  r=   graph_lowerings	real_inits      rW   	fake_init-run_and_get_graph_lowering.<locals>.fake_initb
  s:    4"6"Q%////u%r{   rS  r`  )torch._inductor.graphr=   torch._inductor.output_coderE   rS  r   rE  r  )	r   r   r  rE   rs  r  r=   rq  rr  s	         @@@rW   run_and_get_graph_loweringrw  Y
  sv     4;((IO& & 
		?J		BT$V$ 
C ?"" 
C	B ?""s   		A
A/c              #     #    SSK Jn  UR                  U    n [        R                  " X5      UR                  U '   Sv   X2R                  U '   g! X2R                  U '   f = f7f)zs
Override the lowering of aten_op with override_fn.
The first argument of override_fn is the original lowering fn.
r   )loweringN)torch._inductorry  	loweringsr  partial)aten_opoverride_fnry  orig_fns       rW   override_loweringr  n
  sY      )  )G.&/&7&7&M7#&-7#g7#s   A"'A  A"AA"c                   ^ ^^ SSK Jn  UR                  mSUUU 4S jjn[        R                  R
                  R                  USU5      $ )zf
Add hook functions to be called at the beginning and end of Scheduler.__init__.
Used for unit tests.
r   )	Schedulerc                F   > T" X5        T" X5      nT(       a  T" X5        U$ r   r   )r  rU  outr  post_fnpre_fns      rW   r  (add_scheduler_init_hook.<locals>.wrapper
  s%    y i'I%
r{   rS  )r  r   rU  r   r   r   )torch._inductor.schedulerr  rS  unittestr   rE  r  )r  r  r  r  r  s   ``  @rW   add_scheduler_init_hookr  
  s>     4  G  ==%%iWEEr{   c                    [         R                  (       a  [        R                  U 5        g[        R	                  U 5        g)z
Warnings that will be actionable for PyTorch developers, but not
end users.  Allows us to easily disable them in stable releases but
keep them on for nightly builds.
N)ri   developer_warningsr   r  info)msgs    rW   developer_warningr  
  s$       Cr{   c                     [         R                  R                  S5      n U S-   [        [         R                  5      :  aV  [        [         R                  U S-      5      S:  a3  [         R                  U S-      S   S:w  a  [         R                  U S-      $ [         R                   H)  nUR                  S5      (       d  M  U[        S5      S s  $    g! [         a     NJf = f)a  
An experimental API used only when config.benchmark_kernel is true.

The benchmark name is only available at codegen time. So we can not
directly call it in benchmark_all_kernels which is run after codegen.

The function assumes the argument after --only is the benchmark name.
It works for torchbench.py/hugginface.py/timm_models.py. But for ad-hoc
scripts, this function may return None.

There are 2 flavors of --only argument we need handle:
1. --only model_name
2. --only=model_name
z--onlyr7   r   r  z--only=N)r  argvr  rR   
ValueErrorr  )rb  r  s     rW   get_benchmark_namer  
  s    	hhnnX&!Gc#((m#CHHS1W%&*q!!$+88C!G$$ xx>>)$$s9~'((    s   BC 
C"!C"c                &    [        S U  5       5      $ )Nc              3  *   #    U  H	  oS :H  v   M     g7fr7   Nr   r  s     rW   r   is_ones.<locals>.<genexpr>
       %u!Avu   r   rZ  s    rW   is_onesr  
      %u%%%r{   c                &    [        S U  5       5      $ )Nc              3  *   #    U  H	  oS :H  v   M     g7f)r   Nr   r  s     rW   r   is_zeros.<locals>.<genexpr>
  r  r  r  r  s    rW   is_zerosr  
  r  r{   c                &    [        S U  5       5      $ )Nc              3     #    U  HI  n[        U[        R                  5      (       d  M$  UR                  [        R                  " S 5      :H  v   MK     g7f)r  N)r}   rP   r  r   )r   r   s     rW   r    is_cpu_device.<locals>.<genexpr>
  s9      DdELL) 	+u||E**s
   #A*Ar  )inputss    rW   is_cpu_devicer  
  s       r{   c                    [        U [        R                  5      (       d   S5       eU R                  (       a  [        R
                  $ [        R                  $ )Nz8only support sympy.Expr as input to get_sympy_Expr_dtype)r}   r~   r4  r   rP   rV  rN  rn  s    rW   get_sympy_Expr_dtyper  
  s@    c5::&& B& ~~{{}}r{   c              /     #    U (       a.  [         R                  R                  " U0 UD6 nUv   S S S 5        g S v   g ! , (       d  f       g = f7fr   )rP   r   r   )should_profiler   r  r   s       rW   maybe_profiler  
  s;     ^^##T4V4G 54 	 54s   (A=A
AAc                 p    [         R                  R                  n U S:  a  [        R                  " 5       n U $ Nr7   )ri   r   threadsrP   get_num_threads)r  s    rW   r&  r&  
  s+    jj  G{'')Nr{   c                     SSK Jn   U " 5       nUR                  S[        R                  R
                  (       a  S5      $ S5      $ )Nr7   )get_backend_options
num_stagesr  r  )runtime.triton_helpersr  rW  rP   r  rq   )r  optionss     rW   get_backend_num_stagesr  
  s2    ;!#G;;|%--*;*;QCCCCr{   c                N   [        U [        R                  R                  R                  R
                  S:H  S9nUb  U$ SSKJnJn  [        R                  R                  5       =(       a!    [        R                  R                  5       S:  nU [        R                  [        R                  [        R                  4;   d   e[        R                  " U5      R                   R#                  S5      (       a  SSKJn  U" 5       nU [        R                  [        R                  4;   a  U(       a  U" X5      $ [        R                  R                  R                  R
                  S:X  a  U" [        R                  U5      $ U" [        R                  U5      $ U [        R                  [        R                  4;   a  U(       a  U" U 5      $ [        R                  R                  R                  R
                  S:X  a  U" [        R                  5      $ U" [        R                  5      $ )z
We don't want to throw errors in this function. First check to see if the device is in device_info.py,
then fall back to the inaccurate triton estimation.
tf32)is_tf32r   )get_max_simd_tflopsget_max_tensorcore_tflops)rv   r   
clock_rate)max_clock_rate)r   rP   backendsrH   matmulfp32_precisiontriton.testingr  r  rQ   get_device_capabilityr   rJ  rL  inspect	signature
parametersrW  torch._utils_internalr  )r   ds_topsr  r  SM80OrLaterr  sm_clocks          rW   get_device_tflopsr  
  s    u~~**11@@FJG M**))+ 

0P0P0R W 1K
 U]]ENNEMMBBBB,-88<<\JJ8!#U]]ENN33,U==>>%%44>,U]]HEE&u}}h??U]]ENN33,U33>>%%44>,U]];;&u}}55r{   c                     SSK Jn   U " 5       $ )Nr   get_dram_gbps)r  r  r  s    rW   get_gpu_dram_gbpsr    s    ,?r{   c                 x    SSK Jn   U R                  R                  R	                  S5      R                  SS5      $ )Nr   r  max_shared_mem)triton.runtimer  r  r  r  rW  r  s    rW   get_gpu_shared_memoryr  &  s.    %==44Q7;;<LaPPr{   c                     [         R                  R                  5       (       aT  [         R                  R                  5       R                  n [         R                  R                  5       R
                  nX-  $ Sn SnX-  $ )NrF  i   )rP   rH   rQ   r  	warp_sizemax_threads_per_block)r  r  s     rW   get_max_numwarpsr  ,  sg    zz  JJ446@@	 %

 @ @ B X X
 !-- 	 $ --r{   c                $    U R                  S5      $ )Nwelford)r  reduction_types    rW   is_welford_reductionr  8  s    $$Y//r{   c                4    [        U 5      (       a  gU S:X  a  gg)Nr  online_softmax_reducer  r7   )r  r  s    rW   reduction_num_outputsr  <  s    N++	2	2r{   c                 2    [         R                  " 5       S:H  $ )NLinux)platformsystemr   r{   rW   is_linuxr  E  s    ??''r{   c                 (    [         R                  S:H  $ )Nrk   )r  r  r   r{   rW   r  r  I  s    <<7""r{   c                &    [        S U  5       5      $ )Nc              3     #    U  H7  n[        U[        R                  5      =(       a    UR                  (       + v   M9     g 7fr   )r}   r~   r4  r  r  s     rW   r   #has_free_symbols.<locals>.<genexpr>N  s)     Jcz!UZZ(<_<cs   ?Ar  )itrs    rW   r#  r#  M  s    JcJJJr{   c            	        SSK Jn  U  H  n[        X!R                  UR                  UR
                  UR                  UR                  45      (       aR  [        UR                  5       =(       d    S5      (       d'  [        UR                  5       =(       d    S5      (       a    gM  [        X!R                  5      (       d  M  [        S[        U5       35      e   g)Nr7   r
  r   Tzunexpected type for is_dynamic F)r  r  r}   r5  r7  r$  rw  r>   r#  maybe_get_sizemaybe_get_strider@   	TypeErrorr  )r   r  r  s      rW   r  r  Q  s    bmmR[[":K:KRYYW
 
   0 0 2 8b99=M""$*> > > Ayy))=d1gYGHH  r{   c                      \ rS rSrSrSrSrg)Placeholderie  KERNEL_NAMEDESCRIPTIVE_NAMEr   N)r   r   r   r   r  r  r   r   r{   rW   r  r  e  s      K *r{   r  c                v   SSK Jn  [        R                  " SSS9 n[        R
                  " 5       n[        R
                  " 5       n[        U[        U5      S9R                  " U6   [        SUR                   3US9  [        UR                  US9  [        R                  " 5       n[        X5         U " UR                  5        S S S 5        [        R                  " 5       U-
  n	U" UR                  5        UR                  R                  5         UR                  5         [        S	UR                   3US9  [        UR                  US9  UR!                  5       UR!                  5       :H  n
["        R%                  S
UUR&                  U
U	5        S S S 5        g ! , (       d  f       N= f! , (       d  f       g = f)Nr7   )stable_topological_sortrl  zutf-8)modeencoding)r  	fake_modezBefore:
)filezAfter:
zZ%s, save before/after graph to %s, graph before/after are the same = %s, time elapsed = %s)pattern_matcherr  r  NamedTemporaryFileior   r^   rZ   	propagater  rv  r   nowr]   lint	recompiler^  r   r  r   )r  r  inpr  r  r&  	before_ioafter_io
start_timetime_elapsedr  s              rW   pass_execution_and_saver  o  sE    9		$	$
 
KKM	;;=R#3C#89CCSI	"(($1-bhhY'\\^
#B,N -||~
2)


#!,bhhX& H$5$5$77hFF	
+
 
 -,
 
s%   BF*2FCF*
F'	#F**
F8c                    SSK Jn  [        XR                  5      =(       a     [        U R                  UR
                  5      $ )z:
Check if input buffer is a multi-outputs template buffer
r7   r
  )r  r  r}   CppTemplateBufferr<  MultiOutputLayout	input_bufr  s     rW   is_multi_outputs_templater	    s7     i!5!56 :"..< r{   c                    SSK Jn  [        XR                  5      =(       a7    [	        U R
                  5      S:H  =(       a    [        U R
                  S   5      $ )zD
Check if input buffer is a output of multi-outputs template buffer
r7   r
  r   )r  r  r}   MultiOutputrR   r  r	  r  s     rW   #is_output_of_multi_outputs_templater    sJ      	9nn- 	;	  !Q&	;%i&6&6q&9:r{   c                   U c  gSSK Jn  [        XR                  5      =(       a:    [        XR                  5      (       + =(       a    US L =(       d    U R
                  UL =(       Gd_    [        U 5      UR                  L =(       Ga@    [        [        R                  R                  S5      =(       a;    U R
                  [        R                  R                  R                  R                  :H  =(       d    [        [        R                  R                  S5      =(       a;    U R
                  [        R                  R                  R                  R                  :H  =(       df    [        [        R                  R                  S5      =(       a;    U R
                  [        R                  R                  R                  R                  :H  $ )NFr7   r
  all_to_all_singleall_gather_into_tensorreduce_scatter_tensor)r  r  r}   _CollectiveKernel_WaitKernelop_overloadr  FallbackKernelr  rP   r  torchrecr  defaultr  r  rl  r  r  s      rW   is_collectiver    sM    | 	4--. 	3400	34Z14++r1  	T
b''' 	
 	

 		**,?@ U$$		(:(:(L(L(T(TT
 		**,DE E$$99%%<<DDE 		**,CD Y$$		(:(:(P(P(X(XX/r{   c                <    SSK Jn  [        U 5      UR                  L $ Nr7   r
  )r  r  r  r  )rl  r  s     rW   is_waitr    s    :''r{   c                    SSK Jn  [        X5      (       a  [        S U R                   5       5      $ [        U R                  5      =(       a    US L =(       d    U" U 5      $ )Nr   GroupedSchedulerNodec              3  8   #    U  H  n[        U5      v   M     g 7fr   )contains_collectiver  s     rW   r   &contains_collective.<locals>.<genexpr>  s     @<a&q))<r   )r  r  r}   r  snodesr  rl  )snode	filter_fnr  s      rW   r   r     sJ     ?%..@5<<@@@$P)t*;*Oy?OPr{   c                    SSK Jn  [        X5      (       a  [        S U R                   5       5      $ [        U R                  5      $ )Nr   r  c              3  8   #    U  H  n[        U5      v   M     g 7fr   )contains_waitr  s     rW   r    contains_wait.<locals>.<genexpr>  s     :\=##\r   )r  r  r}   r  r"  r  rl  )r#  r  s     rW   r'  r'    s4    >%..:U\\:::uzz""r{   c                    SSK Jn  [        U[        R                  R
                  5      (       a  U/n[        XR                  5      =(       a    U R                  U;   $ r  )r  r  r}   rP   r  r  r  r  r  s      rW   is_fallback_opr*    sF     "ejj++,,Td--.I43C3Cr3IIr{   c                @    X!U    R                   R                  5          $ r   )defining_opr  )buf_namename_to_bufname_to_fused_nodes      rW   buf_name_to_fused_snoder0    s!     (3??HHJKKr{   c                    gr  r   r#  s    rW   r  r        ur{   c           	         U" U 5      (       a  g UR                  U 5        U R                   H-  n[        UR                  X#5      nXa;   a  M   [	        UUUUUS9  M/     g )Ncriteria_cb)r[  unmet_dependenciesr0  r   find_recursive_deps_of_node)r#  collected_node_setr.  r/  r6  depdefining_op_for_deps          rW   r8  r8    sf     55!''5HHk
 4##	
 (r{   c                    gr  r   r2  s    rW   r  r    r3  r{   c           
        U" U 5      (       a  g UR                  U 5        U R                  5        H  nUR                   H  nUR                  c   eUR                  R	                  5       S:X  a  M2  UR                  R	                  5       U;  a  MR  X6R                  R	                  5          nXq;   a  Mu  [        UUUUUS9  M     M     g )NOUTPUTr5  )r[  get_outputsr  rl  r  find_recursive_users_of_node)r#  r9  r.  r/  r6  ors  user_ops           rW   r@  r@    s     55! GGD99(((yy!!#x/yy!!#+==(););)=>G,(""'  !r{   c                j    [         R                  R                  R                  (       a  SOSnX-
  U-
  $ )zaComputes the number of inputs to the aot fw graph which have fixed addresses (params and buffers)r  r   )rP   
_functorchri   functionalize_rng_ops)dynamo_gm_num_inputsaot_fw_gm_num_inputsnum_rng_seed_offset_inputss      rW   num_fw_fixed_argumentsrI  4  s3     $$::   69SSSr{   c                   SS jnSn/ nU R                   R                   H8  nUR                  S:X  d  M  U" U5      (       a  UR                  U5        US-  nM:     U[	        [        [        U5      5      5      :X  d   e[        U5      $ )z6
Infers which inputs are static for a backwards graph
c                    SU R                   ;  =(       a;    SU R                   ;  =(       a%    SU R                   ;  =(       a    SU R                   ;  $ )Ntangentsbwd_seedbwd_base_offsetbwd_rng_stater  r  s    rW   is_saved_tensor'count_tangents.<locals>.is_saved_tensorD  sH    aff$ .!&&(.!/.  qvv-		
r{   r   r  r7   )rT   r5   r   r:  )rv  rU  r  r  rb  r   rR   )fx_grP  	arg_countstatic_arg_idxsr(  s        rW   count_tangentsrU  ?  s    

 IOZZ44= q!!&&y1NI	  d5_)=#>????r{   c                  >    \ rS rSr% S\S'   SS jr\S	S j5       rSrg)
	BoxedBooliX  r:  r   c                    U R                   $ r   )r   rf  s    rW   rq  BoxedBool.__bool__\  s    zzr{   c                @    [        U [        5      (       a	  SU l        U $ gr  )r}   rW  r   r   s    rW   disableBoxedBool.disable_  s    c9%%CIJr{   r   Nr  )r  r   r   zUnion[BoxedBool, bool])	r   r   r   r   r   rq  r  r[  r   r   r{   rW   rW  rW  X  s     K  r{   rW  c              #     ^ ^#    SSK Jn  UR                  m   S             SU U4S jjjn[        R                  R                  USU5         S v   S S S 5        g ! , (       d  f       g = f7f)Nr7   r9   c                :   > TR                  U5        T" XX#XE5      $ r   rV  )r  kernel_namere  re  gpucpp_definitionkernel_listorig_define_kernels         rW   define_kernel.collect_defined_kernels.<locals>.define_kernelm  s'     	;'!{c
 	
r{   rd  )NTN)r  r:   r_  r  re  r  re  Optional[str]r`  r:  ra  rf  r   r   )codegen.wrapperr:   rd  r   rE  r  )rb  r:   rd  rc  s   `  @rW   collect_defined_kernelsrh  g  s     5-;; #'(,
"

 
  	

 
 &
 

 
 
		/-	P 
Q	P	Ps   AA2A!	A2!
A/+A2c                    U S-   $ )N__original__r   r  s    rW    get_cloned_parameter_buffer_namerk  ~  s    .  r{   c                    U [         ;   $ r   )rN   r  s    rW   r  r    s    Yr{   c                 :    [         R                  R                  SL$ )z,Check if we're running on ROCm/HIP platform.N)rP   r  rq   r   r{   rW   is_rocmrn    s    ==D((r{   c                0    U S:g  =(       a    [        U 5      $ )NrI   )r  r  s    rW   device_need_guardrp    s    U?-vf~-r{   c                h   U [         R                  :X  aD  [         R                  R                  5       (       a!  [         R                  R	                  5       S:  $ U [         R                  :X  a$  [         R
                  R                  5       (       a  gU [         R                  [         R                  4;   $ )N)r  r   T)rP   rJ  rH   rQ   r  rJ   rV  r:  rB  s    rW   ,needs_fallback_due_to_atomic_add_limitationsrr    sq    5::#:#:#<#<zz//1F::	%..	 UYY%;%;%=%=ejj111r{   c                   U R                   [        R                  R                  R                  [        R                  R                  R
                  4;   a  Uc  gU R                   [        R                  R                  R                  :X  a  SOSnUS U4;  =(       Gd&    U=(       a    [        U5      =(       a    [        U5      =(       d    U R                   [        R                  R                  R                  :H  =(       ap    US:H  =(       ad    U=(       a[    US:H  =(       aO    [        R                  R                  =(       a.    [        R                  R                  =(       d    [        5       S:g  =(       dJ    X:H  =(       a#    U[        R                  [        R                  4;   =(       d    [        R                   " 5       $ )NFr[  r  r  r7   )overloadpacketrP   r  atenscatter_reduce_scatter_reducescatter_r  rr  ri   r   fallback_scatter_reduce_sumdynamic_threadsr&  r:  rV  $are_deterministic_algorithms_enabled)r  r  
self_dtype	src_dtypesrc_device_typesrc_is_tensor	reduce_tys          rW   use_scatter_fallbackr    s]    	""IINN**EIINN,I,IJ	K" ++uyy~~/F/FFE 
 	tY// 	8 	8 H'H<YG		8 &&%))..*H*HH L%'LL  5(L 

66	L
 ++J/C/E/J	8 'SJ5::u{{:S,S	8 557!r{   c                   SSK JnJn  SSKJn  [        S[        U 5       S35        [        U 5       GH.  u  pE[        SUS S35        XRL a  [        S	5        M'  XQL a  [        S
5        M8  [        XS5      (       a  UR                  5       n[        U(       a  SOS S35        U(       a;  UR                  c   e[        SUR                  R                  R                   35        [        S5        UR                  R                   H  n[        U5        M     [        S5        UR                  R                   H  n[        U5        M     GM  [!        S[#        U5       35      e   g)z
An API that can be used in pdb to dump a node_schedule.
Right mainly dump the read/write dependencies but can add more as needed.
r   )DisableReductionEnableReduction)SchedulerNodezNode schedule with z nodesr2  3r  zenable reductionzdisable reductionredpwz scheduler nodeNzoriginal reduction hint zReadDep:z	WriteDep:zUnrecognized node type: )torch._inductor.codegen.simdr  r  r  r  r  rR   r  r}   is_reductionrl  r6  reduction_hintrL  rM  rN  r   r  )r  r  r  r  rb  rl  is_redr:  s           rW   dump_node_scheduler    s&   
 O7	M 236
:;}-	#al"$%%%&,,&&(FfU$/?@yy,,,01N1N0OPQ*''--c
 .+''..c
 / !9$t*FGG+ .r{   c                z    SSK Jn  U" U R                  5       [        U R                  5      -  [
        -  S:H  5      $ )Nr   )r  )r  r  storage_offsetrD  r   GPU_ALIGN_BYTES)r   r  s     rW   tensor_is_alignedr    s:     L 				 >&,,#?	??RVWW r{   c                    [        U R                  R                  5      (       d  g[        R                  =(       d    [        U 5      $ r  )r  r   r  ri   assume_aligned_inputsr  )example_inputs    rW   should_assume_input_alignedr    s5     -&&++,,''K+<]+KKr{   c                 X   [         R                  R                  R                  5       n U (       d  [        R
                  " 5       $ U R                  (       a  U R                  R                  (       d  [        R
                  " 5       $ U R                  R                  nUR                  5       $ r   )	rP   _guardsTracingContexttry_getr  nullcontextr  rx  suppress_guards)tracing_contextrx  s     rW   #maybe_get_suppress_shape_guards_ctxr    sv    
 mm22::<O%%'' $$O,E,E,O,O%%''))33I$$&&r{   c                "   [         R                  R                  R                  [        SS5         [
        R                  R                  5         SS KnSS K	nUR                  " 5       nUR                  " U5      nSSKJn  UR                  U5        UR                  nUR!                  UR"                  5        U " U0 UD6n	UR%                  5       n
UR!                  U5        UR'                  U5        S S S 5        X4$ ! , (       d  f       W	W
4$ = f)Nr   Tr   )output_code_log)r  r   rE  r  ri   rP   r5  rF  r  loggingr   StreamHandlertorch._inductor.codecacher  
addHandlerlevelsetLevelDEBUGr^  removeHandler)r   r   r  r  r  log_capture_stringchr  
prev_levelr  r   s              rW   run_and_get_cpp_coder    s     
			#	#FGT	:[[]""#56=""2&$**
  /T$V$'')  ,%%b) 
;  9! 
;	:  19s   CC==
Dc                :   [        U 5      nUb  UR                  $ U  H  n[        U[        R                  5      (       a  UR
                  R                  s  $ [        U[        R                  5      (       d  M[  UR                  5        H<  n[        U[        R                  5      (       d  M$  UR
                  R                  s  s  $    UR                  5        H<  n[        U[        R                  5      (       d  M$  UR
                  R                  s  s  $    M     g r   )	rZ   rx  r}   rP   r2   rl  r  r  rF  )r  r  inputr  rF  s        rW   shape_env_from_inputsr     s     (I """ eU\\**::''' eU\\**

dELL1199... %  ,,.fell33!;;000 )  r{   c                B   ^ ^^ [        T5      S:X  a  T $ SUU U4S jjnU$ )Nr   c                   > [        U TT5      u  pT" U 5      n[        U5      (       a  [        R                  " X5        U$ r   )copy_misaligned_inputsrR   rP   _foreach_copy_)
new_inputsold_tensorsnew_tensorsr  inputs_to_checkr  mutated_input_idxss       rW   r  )align_inputs_from_check_idxs.<locals>.runE  sD    #9);$
  J {  :
r{   )r  list[InputType]r   r   )rR   )r  r  r  r  s   ``` rW   align_inputs_from_check_idxsr  =  s(    
 ?q   Jr{   c                X   SU R                  5       ;   a  SnO;[        S [        U R                  5       U R                  5       5       5       5      S-   n[        R
                  " X4S5      R                  5       n[        R
                  " X R                  5       U R                  5       5      $ )Nr   c              3  6   #    U  H  u  pUS -
  U-  v   M     g7fr  r   )r   rA  rF  s      rW   r   )clone_preserve_strides.<locals>.<genexpr>[  s     T:Sf$:Ss   r7   r   )r  r  r   rF  rP   
as_stridedclone)rT   needed_sizer:  s      rW   clone_preserve_stridesr  U  s    AFFH} T#affh
:STTWXX 	 a6<<>FFFFHahhj99r{   c                T   / n/ nUSLnU H  nX   n[        U[        R                  5      (       d   S[        U5       35       eUR	                  5       [
        -  (       d  MW  [        U5      X'   U(       d  Mm  Xb;   d  Mt  UR                  U5        UR                  X   5        M     X44$ )z
Clones misaligned tensors which we inferred were aligned. Returns a tuple of [old_tensors], [new_tensors] for every
cloned tensor which is in `return_pair_idxs`.
Nz Expected tensors only, but got: )r}   rP   r  r  data_ptr	ALIGNMENTr  r  )r  check_inputs_idxsreturn_pair_idxsr  r  ret_pair_definedr   _inps           rW   r  r  a  s     ')K&(K (t3}$-- 	
.tDzl;	
- ==?Y&&248JMA$9""4("":=1  ##r{   c                    / nU HV  nX   n[        U[        R                  5      (       d  M(  UR                  5       [        -  S:X  d  ME  UR                  U5        MX     [        U5      [        U5      :w  a  U$ U$ )zO
We require all inputs to be aligned, so introduce a copy for any
that aren't.
r   )r}   rP   r  r  r  r  rR   )r  static_input_idxsaligned_static_input_idxsrb  r  s        rW   remove_unaligned_input_idxsr    sp     !# eU\\**0@90LQR/R%,,S1 ! $%->)??((r{   c                   SSK Jn  [        R                  " [        R                  5      R
                  nUR                  R                  R                  nUR                  R                  R                  R                  n[        R                  (       a&  UR                  R                  R                  X5        gUR                  R                  R                  X:*  5      (       a  gUR                  (       a.  UR                  R                  R                  U S:  5      (       a  gU" U 5      =(       a    U" U 5      U:*  $ )Nr7   rr  Tg@xDF)ru  rs  rP   iinforT  r   rv  rw  rX  rx  has_hintri   assume_32bit_indexing	check_leqr  r  )r   rs  int_maxrX  r  s        rW   expr_fits_within_32bitr    s    kk%++&**G  **Iww))22H##	""1. 	ww--al;; 	 7711!d(;;  A;29Q<722r{   c                6  ^^^ [         R                  R                  R                  5       nUb  UR                  b  [        UR                  5      S:X  d   e[        U 5      mUR                  c   eUR                   H  nUc  UR                  R                  S 5        M#  Sm[         R                  R                  R                  5       =n(       a  UR                  mSUU4S jjmUR                  R                  [        U4S jU 5       5      5        M     g g g )Nr   Fc                r   > Tc  [        U 5      $ T(       a  TR                  U 5      $ TR                  U 5      $ r   )r   deserialize_symexprevaluate_symexpr)r   fakify_first_callrx  s    rW   map_expr4set_tracing_context_output_strides.<locals>.map_expr  s7     ("1v((<<Q??$55a88r{   c              3  4   >#    U  H  nT" U5      v   M     g 7fr   r   )r   r   r  s     rW   r   5set_tracing_context_output_strides.<locals>.<genexpr>  s     5u!(1++ur\  )r   r   r   z,Union[float, int, SymInt, SymFloat, SymBool])
rP   r  r  r  output_stridesrR   r  r  r  r  )r  compiled_graphrG  r?  r  r  r  rx  s        @@@rW   "set_tracing_context_output_stridesr    s     mm**224Gw55A7))*a///).9	,,888#22E}&&--d3$)!--66>>@@3@(+(=(=%9 9 &&--5u55 3	  Br{   c                 4   [         R                  b  [         R                  $ [         R                  " 5       (       d  g[        R                  R                  5       (       a  g SSKJn   U [        R                  R                  S5      :  $ ! [         a     gf = f)NFr   REMOTE_CACHE_VERSIONz.pytorch/remote_cache:fx_graph_memcache_version)
ri   fx_graph_remote_cache	is_fbcoderP   _utils_internalis_fb_unit_testtorch._inductor.fb.remote_cacher  ModuleNotFoundErrorjustknobs_getval_intr  s    rW    should_use_remote_fx_graph_cacher    s    ##/+++,,..H  5#8#8#M#M8$    s   "B
 

BBc                2    [         R                  " SSU 5      $ )Nz[^a-zA-Z0-9_]r   )r   subr  s    rW   normalize_namer    s    66"C..r{   ztl.int1ztl.float8e4nvztl.float8e5ztl.float8e4b8ztl.float8e5b16ztl.uint8)ztl.boolztl.float8_e4m3fnztl.float8_e5m2ztl.float8_e4m3fnuzztl.float8_e5m2fnuzztl.float8_e8m0fnuztl.float4_e2m1fn_x2z^.*[.]c                j    [         R                  S[        U 5      5      n[        R	                  X5      $ )z"Convert torch.dtype to triton typetl.)_triton_type_rer  r  _triton_type_mappingrW  )r   triton_type_names     rW   triton_typer    s+    &**5#e*=##$4GGr{   c                    [         R                  X 5      nUR                  SS5      n[        [        U5      n[        U[        R                  5      (       d   eU$ )Nr  r  )_torch_triton_mappingrW  r  rO   rP   r}   r   )r   adjusted_type	type_namer  s       rW   triton_type_to_torchr    sM    )--e;M%%eR0Iy)Ii----r{   c                   U R                   (       + =(       a    U R                  5       UR                  5       :H  =(       a    U R                  5       UR                  5       :H  =(       a    U R                  UR                  :H  =(       a    U R                  UR                  :H  =(       ae    U R                  5       R                  5       UR                  5       R                  5       :H  =(       a!    U R                  5       UR                  5       :H  $ r   )	is_mkldnnr  rF  r   r   untyped_storager  r  r6  r   s     rW   is_same_tensorr    s    NN 	<IIK5::<'	<KKMU\\^+	< JJ%++%	< KK5<<'		<
   "++-1F1F1H1Q1Q1SS	< !U%9%9%;;r{   c                   U R                   =(       a    U R                  5       UR                  5       :H  =(       a    U R                  UR                  :H  =(       as    U R                  UR                  :H  =(       aS    [        R
                  R                  R                  U 5      [        R
                  R                  R                  U5      :H  $ r   )r  r  r   r   rP   r  mkldnnr  r  s     rW   is_same_mkldnn_tensorr     s     	PIIK5::<'	PJJ%++%	P KK5<<'	P II%%d+uyy/?/?/H/H/OOr{   c                     g)N)r  isnanlogical_notlogical_andsignbitand_leltgegteqner  xorr   r   r{   rW   boolean_opsr  "  s    r{   c                  *    \ rS rSr% S\S'   S\S'   Srg)OpDtypeRulei6  r3   type_promotion_kindOptional[torch.dtype]override_return_dtyper   NrH  r   r{   rW   r  r  6  s    8800r{   r  zdict[str, OpDtypeRule]op_dtype_propagation_rulesc                (    [        X5      [        U '   g r   )r  r  )r   r  r  s      rW   #register_op_dtype_propagation_rulesr  ?  s    
 (3(t$r{   zOrderedSet[str]op_requires_libdevice_fp64c                .    [         R                  U 5        g r   )r  r[  r  s    rW   #register_op_requires_libdevice_fp64r  L  s    ""4(r{   c                    SSK Jn  U (       d$  UR                  R                  5       R                  n U S:X  a  [
        R                  $ U S:X  a  gU S:X  a  [
        R                  $ [
        R                  $ )Nr   rr  r  rI   rJ   )	r  rs  rv  get_current_device_or_throwr  ri   cpu_backendxpu_backendcuda_backend)r   rs  s     rW   get_current_backendr  P  s_    -gg99;@@e!!!				!!!"""r{   c                    U [         R                  [         R                  4;   a=  [        R                  R
                  (       a  [        5       S:X  a  [         R                  $ U $ )z"Maybe upcast [b]float16 to float32r  )rP   r   rJ  ri   r  codegen_upcast_to_fp32r  rL  rB  s    rW   upcast_compute_typer"  _  s@     	%--00MM00!X-}}Lr{   KeyTypeValTypec                  v    \ rS rSrSrSS jrSS jrSS jrSS jrSSS jjr	SS	 jr
SS
 jrSS jrSS jrSrg)
ScopedDictin  z
A dictionary-like object that allows for scoped updates. It maintains
an original dictionary and a set of new items that can override
the original items within the scope.  The original dictionary is
unmodified.
c                    Xl         0 U l        g r   original_dict	new_items)r  r)  s     rW   rS  ScopedDict.__init__v  s    *13r{   c                \    XR                   ;   a  U R                   U   $ U R                  U   $ r   r*  r)  r  s     rW   r+  ScopedDict.__getitem__z  s,    .. >>#&&!!#&&r{   c                     X R                   U'   g r   )r*  )r  rc  r   s      rW   __setitem__ScopedDict.__setitem__  s    #sr{   c                H    XR                   ;   =(       d    XR                  ;   $ r   r-  r  s     rW   __contains__ScopedDict.__contains__  s    nn$A/A/A(AAr{   Nc                t    XR                   ;   a  U R                   U   $ U R                  R                  X5      $ r   )r*  r)  rW  )r  rc  r  s      rW   rW  ScopedDict.get  s2    .. >>#&&!!%%c33r{   c                    [        U R                  5      nU R                   H  nX R                  ;  d  M  US-  nM     U$ r  )rR   r)  r*  )r  r(  r  s      rW   r  ScopedDict.__len__  s<    ""#A***Q   r{   c              #     #    U R                    S h  vN   U R                   H  nXR                   ;  d  M  Uv   M     g  N-7fr   r(  )r  r  s     rW   __iter__ScopedDict.__iter__  s8     %%%%A***   	&s   AA  A
Ac                R    [        U R                  =(       d    U R                  5      $ r   )r:  r)  r*  rf  s    rW   rq  ScopedDict.__bool__  s    D&&8$..99r{   c                    [         er   r  r  s     rW   __delitem__ScopedDict.__delitem__  s    !!r{   r-  )r)  Mapping[KeyType, ValType])rc  r#  r   r$  )rc  r#  r   r$  r   r  )rc  r  r   r:  r   )rc  r#  r  Optional[ValType]r   rB  r  )r   zIterator[KeyType]r  )rc  r#  r   r  )r   r   r   r   r   rS  r+  r0  r3  rW  r  r:  rq  r?  r   r   r{   rW   r&  r&  n  s5    4'
$B4
:"r{   r&  )frozen_defaultc              .   ^ SU4S jjnU c  U$ U" U 5      $ )Nc                0   > [         R                  " U STS9$ )NT)kw_onlyr   )dataclasses	dataclass)r   r   s    rW   wrapir_dataclass.<locals>.wrap  s    $$S$vFFr{   )r   rl   r   rl   r   )r   r   rI  s    ` rW   ir_dataclassrK    s    G {9r{   c                     [         R                  R                  R                  5       n U b'  U R                  (       a  U R                  R
                  $ g r   )rP   r  r  r  fw_metadatabw_donated_idxs)r  s    rW   get_donated_idxsrO    s=    mm22::<O"'B'B**:::r{   c                  (    \ rS rSrSrSrSrSrSrSr	g)	TritonAttrsDescriptorVersioni  r   r7   r  r  rN  r   N)
r   r   r   r   V0_NO_TRITONV1_COMPILERV2_BACKENDSV3_BACKENDS_TUPLEV4_DICTr   r   r{   rW   rQ  rQ    s     LKK	  Gr{   rQ  c                 f   [         R                  R                  S5      c  [        R                  $ SS Kn SS Kn [        U R                  R                  S5      (       a  [        R                  $ [        U R                  R                  S5      (       a  [        R                  $ [        R                  $ )Nr  r   AttrsDescriptor)ry  rz  r{  rQ  rR  triton.backends.compilertriton.compiler.compilerr  r  compilerrT  rS  rV  )r  s    rW   #get_triton_attrs_descriptor_versionr\    s    ~~)1+888##v''):;; ,777	))+<	=	=+777 ,333r{   c                 8    [        5       [        R                  :H  $ r   )r\  rQ  rV  r   r{   rW   triton_version_uses_attrs_dictr^    s    .04P4X4XXXr{   c                    U R                  5       n[        U [        R                  R                  5      (       a  U SU R
                   3OUnX4$ )Nrp   )r   r}   rP   r  r  _overloadname)r  op_overload_packet_nameop_overload_names      rW   get_op_namesrc    sR    #%779 b%**//00 #
#1R%5%5$67$ 
 #44r{   c                |   SSK Jn  U R                  n[        U[        R
                  R                  5      (       d  gU[        R                  R                  R                  R                  [        R                  R                  R                  R                  [        R                  R                  R                  R                  4;   as  U" X R                  U R                  SS9nUbT  Uu  pEUS   nU HE  nUc  M  UR                  S   R                   [        R"                  [        R$                  4;   d  ME    g   g)z
Check if an FX node is cudagraph-unsafe based on its input arguments.

Some ops are only cudagraph-unsafe depending on their inputs (e.g., index_put
with boolean indices triggers .nonzero() during capture, but integer indices
are safe).
r   )normalize_functionFT)normalize_to_only_use_kwargsindicesro  )torch.fx.operator_schemasre  r  r}   rP   r  r  r  ru  	index_putr  
index_put__unsafe_index_putr   r  r  r   r:  r[  )r  re  r  
normalizedr   r  rg  rb  s           rW   ,_fx_node_is_input_dependent_cudagraph_unsaferm    s     =^^Ffejj3344 		  ((		!!))		((00 
 (LL'..t

 !"IAY'G?sxx'<'<JJKKA (    r{   c                   U R                   n[        U5      [        ;   a  g[        U[        R
                  R                  5      (       a3  [        R                  R                  R                  UR                  ;   a  g[        U 5      (       a  gU R                  R                  S5      =nb]  [        U[        [        45      (       d  U/OUnU H7  n[        U[        R                   5      (       d  M$  UR"                  (       d  M7    g   g)a  
Check if an FX node is cudagraph-unsafe.

This includes:
- Ops in FORBIDDEN_CUDAGRAPH_OPS (CPU sync, dynamic alloc, etc.)
- Ops with the cudagraph_unsafe tag
- Input-dependent unsafe ops (e.g., index_put with boolean indices)
- Ops with sparse tensor outputs
Tro  F)r  r  FORBIDDEN_CUDAGRAPH_OPSr}   rP   r  r  r  r  cudagraph_unsafer  rm  r  rW  rb  r  r  	is_sparse)r  r  ro  valsr   s        rW   r  r    s     ^^F 6{-- 	65::0011HHLL))V[[8 4G<< ||&&3&sT5M::uA!U\\**q{{{  r{   c                    SSK Jn  [        XR                  UR                  45      (       a  g[        XR
                  UR                  45      (       d  g[        U SS5      nUb  [        U5      (       a  gg)aH  
Returns True if the node is an op that is not cudagraphable.
This includes:
- Ops in FORBIDDEN_CUDAGRAPH_OPS (CPU sync, dynamic alloc, etc.)
- Ops with the cudagraph_unsafe tag
- index_put_ with boolean indices (triggers .nonzero() during capture)
- Control flow nodes (Conditional, WhileLoop)
- Ops with sparse tensor outputs
r7   r
  TFr  N)	r  r  r}   Conditional	WhileLoopr  r?   rO   r  )rl  r  r  s      rW   is_cudagraph_unsafe_oprv  *  sf      $677d..@AAdIt,G:7CCr{   c                 6   [         R                  R                  SS5      n [        R                  " 5       (       a^  SSKJn  U" 5       nU(       aJ  [         R                  R                  USS5      nU (       a   [         R                  R                  X0/5      OUn U $ )NLD_LIBRARY_PATHr  r   )get_runtime_pathr  lib)
r  r  rW  ri   r  libfb.py.parutilry  r  r  pathsep)r  ry  runtime_pathlib_paths       rW   get_ld_library_pathr  D  sh    ::>>+R0D5')ww||L)UCH8<2::??H#34(DKr{   c                N    SSK Jn  [        X5      =(       a    U R                  S L$ )Nr   )SubgraphPythonWrapperCodegen)torch._inductor.codegen.wrapperr  r}   partition_signatures)r  r  s     rW   #is_codegen_graph_partition_subgraphr  Q  s'    L 	79 	5((4r{   c                     [         R                  R                  R                  R                  =(       d    [
        R                  S L=(       a$    [         R                  R                  R                  $ r   )rP   r  ri   r  
cudagraphs&_unstable_customized_partition_wrapperr  graph_partitionr   r{   rW   is_using_cudagraph_partitionr  Z  sN    %%00 	F199E1 //
 
 
0
01r{   c                    SSK Jn  UR                  R                  R	                  U S5      (       a;  UR                  R                  R                  U S5      (       a  [        R                  $ [        R                  $ )Nr7   rr  l        i   )	ru  rs  rv  rw  statically_known_ltrK  rP   rT  rV  )r  rs  s     rW   dtype_from_sizer  a  sX    ww++e 
''


/
/h
?
?{{{{r{   )r  rJ   c                n    U S:X  a(  [         R                  R                  R                  5       $ SU ;   a  gg)z3
Returns True if the device supports MKL-DNN BF16.
r  rJ   TF)rP   r  r  _is_mkldnn_bf16_supportedr   s    rW   is_mkldnn_bf16_supportedr  o  3     eyy99;;	+	r{   c                n    U S:X  a(  [         R                  R                  R                  5       $ SU ;   a  gg)z3
Returns True if the device supports MKL-DNN FP16.
r  rJ   TF)rP   r  r  _is_mkldnn_fp16_supportedr  s    rW   is_mkldnn_fp16_supportedr  {  r  r{   c           
     x   U Vs/ s H  n[        [        U5      5      PM     nnU  HS  n[        U5      [        U5      :X  d   e[        U5       H'  u  pR[        X5   [        [        U5      5      5      X5'   M)     MU     / nUR	                  SR                  S [        X5       5       5      5        [        U5      [        U5      S-  -   [        U5      S-
  -   nUR	                  SU-  5        U  H3  nUR	                  SR                  S [        XC5       5       5      5        M5     SR                  U5      $ s  snf )N|c              3  4   #    U  H  u  pS X  S 3v   M     g7fr2  Nr   )r   hrl  s      rW   r   tabulate_2d.<locals>.<genexpr>  s     H3G41AaWA,3G   r  r7   r  c              3  4   #    U  H  u  pS X  S 3v   M     g7fr  r   )r   r   rl  s      rW   r   r    s     H7Gtq!Cl7Gr  rR  )rR   r  r  r   r  r  r   r  )elementsheadersr   widthsrowr   r  total_widths           rW   tabulate_2dr    s    #*+7ac#a&k7F+3x3w<'''cNDAFIs3q6{3FI #  E	LLH3w3GHHIf+Vq1S[1_EK	LL{"#SXXHs37GHHI 99U ,s   D7c              #     #    [        U R                  5       5      [        UR                  5       5      -  nU H6  nU R                  U5      nUR                  U5      nUUb  UOUUb  UOU4v   M8     g7f)a  
Zip two dictionaries together, replacing missing keys with default values.

Args:
    dict1 (dict): The first dictionary.
    dict2 (dict): The second dictionary.
    d1_default (Any): the default value for the first dictionary
    d2_default (Any): the default value for the second dictionary

Yields:
    tuple: A tuple containing the key, the value from dict1 (or d1_default if missing),
           and the value from dict2 (or d2_default if missing).
N)r#   rY  rW  )dict1dict2
d1_default
d2_defaultall_keysrc  value1value2s           rW   	zip_dictsr    sp     ( %**,'*UZZ\*BBH 33 (Fj(Fj
 	
 s   A1A3c                v           SS jn        SS jnU R                  S[        R                  R                  5      nU R	                  5       n U(       aq  U" U SS5        U" U SS5        U" U S[
        R                  R                  (       + 5        U" U SS	5        U" U S
[        R                  R                  5        U" U SS5        U R                  S[        R                  R                  5      nU R                  S[        R                  R                  5      nUS:X  a  U(       a  [        S5      eU $ )a
  
Ensures the configuration is internally consistent for standalone AOTInductor.

If `aot_inductor_mode.compile_standalone` is set to True in the provided
`config_patches` (or falls back to the global config), this function ensures
that the following configs are also enabled:
    - `aot_inductor.package_cpp_only`

Args:
    config_patches (dict[str, Any]): A dictionary of user-provided config
        overrides for AOTInductor compilation.

Returns:
    dict[str, Any]: The possibly-updated `config_patches` dictionary.
c                    U R                  U[        [        U5      5      nUc  X U'   g U(       d  X2:w  a  [        SU SU S35      eg g )NzInvalid config: =z3 when aot_inductor_mode.compile_standalone is True.)rW  rO   ri   r   config_patchesconfig_nameconfig_valuer   s       rW   patch_config2maybe_aoti_standalone_config.<locals>.patch_config  sY     "";0LM=*6;'50";-q>qr  1r{   c                    U R                  U[        [        U5      5      nX2:w  a  [        R	                  SUU5        X U'   g )NzDOverriding: %s=%s when aot_inductor_mode.compile_standalone is True.)rW  rO   ri   r   r  r  s       rW   force_patch_config8maybe_aoti_standalone_config.<locals>.force_patch_config  sB     "";0LM KKV
 '3{#r{   z$aot_inductor_mode.compile_standalonezaot_inductor.package_cpp_onlyTz aot_inductor.embed_kernel_binaryz#aot_inductor.emit_multi_arch_kernelz+aot_inductor.model_name_for_generated_files
aoti_modelzaot_inductor.link_libtorchzaot_inductor.dynamic_linkageFz"aot_inductor.cross_target_platformz$aot_inductor.package_constants_in_sowindowszconfig.aot_inductor.package_constants_in_so is not supported for windows cross-compilation. Please use config.aot_inductor.package_constants_on_disk_format = binary_blob.)r  dict[str, Any]r  r  r  r   r   r  )rW  ri   aot_inductor_modecompile_standalonecopyrP   r  rq   test_configsuse_libtorchaot_inductorcross_target_platformpackage_constants_in_sor   )r  r  r  r  r  r  s         rW   maybe_aoti_standalone_configr    sk   "	&	58	HK			
3&
358
3HK
3	
3 (++.  33
 $((*N^%DdK^%GNAu}}GXGXCX	
 	I<	
 	(,,	

 	>+I5Q*..,11
 -00.33
 	).E]
 	

 r{   c                   [         R                  R                  (       a)  [         R                  R                  S:X  a  [	        S5      e[         R                  R                  (       a0  [         R                  R
                  S:X  a  [	        S5      eSnSnX!4$ [         R                  R                  S:X  a  SnSnX!4$ U S::  a  gSn[         R                  " 5       (       + nX!4$ )	z
Decide whether we should mmap weights, and whether to store the weights with .so.

If force_mmap_weights or package_constants_on_disk_format == "binary_blob" configs are set, respect the config.

Returns tuple (use_external_weights, use_mmap_weights).
binary_blobzconfig.aot_inductor.package_constants_on_disk_format = binary_blob and config.aot_inductor.force_mmap_weights cannot both be True.r  zKwhen cross_target_platform is windows, use_mmap_weights should not be true.TFi 5w)FF)ri   r  force_mmap_weights package_constants_on_disk_formatr   r  r  )consts_sizeuse_mmap_weightsuse_external_weightss      rW   determine_aoti_mmap_flagsr    s     	..@@MQJ
 	

 --44	A]   $#55;;}L# #55m# !++--11r{   c                     SSK Jn   U R                  R                  nUc  g[	        U[
        5      (       d  [        S5      eUS:X  a  g[        R                  " SU5      (       d  [        S5      eg)zD
Validates if a model name is suitable for use in code generation.

r   rh   Tz4Invalid AOTI model name: Model name must be a stringr  z^[a-zA-Z_][a-zA-Z0-9_]*$zVInvalid AOTI model name: Model name can only contain letters, numbers, and underscores)	rz  ri   r  model_name_for_generated_filesr}   r  r  r   r   )ri   
model_names     rW   is_valid_aoti_model_namer  8  sn    
 '$$CCJj#&&OPPR 88/<<d
 	
 r{   c                <    U(       a  [        U 5      $ [        U 5      $ r   )r)   r(   )rT   unbacked_onlys     rW   get_free_symbolsr  S  s    $Q''Ar{   c                 *   0 [         R                  ES[         R                  R                  S[         R                  R	                  [
        R                  5      5      0En [        R                  " 5       (       a  [        R                  " S5      U S'   U $ )z9
Get a base environment for running Python subprocesses.

PYTHONPATHTORCH_CUSTOM_PYTHONPATHr6  
PYTHONHOME)r  r  rW  r|  r  r  r  ri   r  	sysconfigget_path)envs    rW   python_subprocess_envr  Z  so    

** 	bjjnn%rzzsxx'@
	C  %..v6LJr{   c                  .    \ rS rSr% SrS\S'   S\S'   Srg)CUDAGraphWrapperMetadataiu  z
Metadata for Customized CUDAGraphWrapper.

Currently assumes there is 1 dynamo graph and will extend to
multiple graphs in the future.
r   num_partitionspartition_indexr   Nr   r   r{   rW   r  r  u  s      r{   r  .c                  $    \ rS rSr% SrS\S'   Srg)CUDAGraphWrapperi  NzOptional[CUDAGraphWrapperType]r  r   )r   r   r   r   r  r   r   r   r{   rW   r  r    s    .2G+2r{   r  c                    U [         l        g r   )r  r  )r  s    rW   !set_customized_partition_wrappersr    s    5<*2r{   c                H  ^ U R                   R                  nU R                   R                  / UQU R                   R                  QU R                   R                  5      nU R                   R                  n[
        R                  " X45      u  p4SS jnU Vs/ s H:  nU" U5      (       a(  [        R                  R                  R                  USS9OUPM<     nnSS jmSU4S jjnU Vs/ s H
  og" U5      PM     nn[
        R                  " X45      u  pX4$ s  snf s  snf )	Nc                    [        U [        R                  R                  R                  5      =(       a3    [        U [        R                  R                  R
                  5      (       + $ r   )r}   rP   r  r  r@   GeneratorStater  s    rW   _is_tensor_ir(snode_args_kwargs.<locals>._is_tensor_ir  sH    !U__//667 

u!!00A
 =
 	
r{   F)guard_shapec                ,    [         R                  " XUS9$ )Nr   )rP   r   )r  r   r   s      rW   _tensor"snode_args_kwargs.<locals>._tensor  s    {{4V<<r{   c                   > [        U [        R                  5      (       d  U $ T" U R                  5       U R                  U R
                  5      nU$ r   )r}   rP   r  r  r   r   )r   r  r  s     rW   to_real_tensor)snode_args_kwargs.<locals>.to_real_tensor  s:    !U\\**Haffh2
r{   r  )r   r  )r   r   r   r   )rl  r  fill_non_provided_argsconstant_argsr  pytreer$   rP   r  r  ir_node_to_tensortree_unflatten)	r#  r   r  	flat_argsflat_args_pytree_specr  r(  r  r  s	           @rW   snode_args_kwargsr    s   ::D::,,*$*))*

D ZZF'-':':D>'J$I
 	 A  	,,QE,B	 	  = -66Iq"II6((JLD<%  7s   AD,Dc                    SSK Jn  U R                  nUR                  R                  (       a(  UR	                  UR                  R                  S-   5      nUR                  S5      $ )Nr7   rr  r   )primals_r  fwd_rng_staterO  rL  )ru  rs  r   rv  removeprefixr  )r:  rs  dep_names      rW   is_nonfreeable_buffersr    sN    xxH 	ww||(();<I r{   c                x    [        X S3-  5       nUR                  5       sSSS5        $ ! , (       d  f       g= f)z,Load a template file and return its content.z	.py.jinjaN)openread)r   template_dirr&  s      rW   load_templater    s+    	lvY//	0Avvx 
1	0	0s   +
9c                   U R                   n[        U[        R                  R                  [        R                  R
                  45      (       d   S[        U5       35       e[        R                  (       d  g[        [        R                  R                  R                  R                  [        R                  R                  R                  R                  /5      nX;   a  g[        [        R                  R                  R                   /5      n[        U[        R                  R
                  5      (       a  X;   $ [#        U 5      (       + $ )zLDecide whether fallback for a node. This is only used in inductor lite mode.z6Expected OpOverload or HigherOrderOperator, but found F)r  r}   rP   r  r  r  r  ri   fallback_by_defaultr#   r  ru  _assert_scalarr  lift_fresh_copyhigher_order triton_kernel_wrapper_functionalr!   )rl  r  "skip_fallback_due_to_dynamic_shapefallback_hopss       rW   should_fallback_by_defaultr
    s    [[F&&

(F(FG  O	?V~NO  %% *4IINN))11IINN**22	
*& 3 				@	@AM &%**8899&&&t,,,r{   )	z-torch.ops._c10d_functional.all_reduce.defaultz.torch.ops._c10d_functional.all_reduce_.defaultz9torch.ops._c10d_functional.all_gather_into_tensor.defaultz8torch.ops._c10d_functional.reduce_scatter_tensor.defaultz4torch.ops._c10d_functional.all_to_all_single.defaultz6torch.ops._c10d_functional_autograd.all_reduce.defaultzBtorch.ops._c10d_functional_autograd.all_gather_into_tensor.defaultzAtorch.ops._c10d_functional_autograd.reduce_scatter_tensor.defaultz=torch.ops._c10d_functional_autograd.all_to_all_single.defaultc                    U [         ;   $ )z0Check if an operation is a collective operation.)COLLECTIVE_OPS)r  s    rW   is_collective_opr    s    n$$r{   c                 p    [         R                  " 5       (       a	   SSKJn   U $ / $ ! [         a    / s $ f = f)Nr   tlx_only_cuda_options)ri   r  )torch._inductor.fb.tlx_templates.registryr  r  r  s    rW   r  r    s<    	W(( 		  	I	s   & 55c                    X-   S-
  U-  U-  $ )z(Round x up to the nearest multiple of y.r7   r   )rT   ys     rW   	_round_upr  "  s    UQY1!!r{   c                   SSK JnJn  U" US5      (       a  UR                  UR                  4$ [        U5      S:  Ga2  U" US   U S   5      (       a  U" US   S5      (       d%  U" US   S5      (       a,  U" US   U S   5      (       a  UR                  UR                  4$ U" US   U S   5      (       a  U" US   [        U S   S5      5      (       d2  U" US   U S   5      (       a6  U" US   [        U S   S5      5      (       a  UR                  UR                  4$ U" US   [        U S   S5      5      (       a6  U" US   [        U S   S5      5      (       a  UR                  UR                  4$ U[        R                  :X  a  SOSnU[        R                  :X  a  U[        R                  :X  a  [        U S   S5      [        [        XS   -  S5      S5      -  n	[        U S   S5      [        [        XS   -  S5      S5      -  n
U" X)5      (       d  U" X*5      (       a  UR                  UR                  4$ U[        R                   :X  Ga
  [        R"                  R$                  (       d  [        U S   S5      [        [        XS   -  S5      S5      -  n	[        U S   S5      [        [        XS   -  S5      S5      -  n
U" X)5      (       d  U" X*5      (       a  UR&                  UR                  4$  g	[        U S   S5      U-  U S   -  n	[        XS   -  S5      U S   -  n
U" X)5      (       d  U" X*5      (       a  UR&                  UR                  4$ g	)
z2
Core implementation for scale/swizzle inference.
r   )r6   SwizzleTyper7   r  rs   rr   rN  rF  NN)torch.nn.functionalr6   r  
TensorWise
NO_SWIZZLErR   RowWiserj   BlockWise1x128BlockWise128x128rP   rH  rD  r  BlockWise1x16SWIZZLE_32_4_4rF  r  rq   BlockWise1x32)mat_size
scale_sizescale_numel	mat_dtypescale_dtypeeq_fnr6   r  K_multiplierexpected_numel_aexpected_numel_bs              rW   _infer_scale_swizzle_implr*  '  sg    = [!%%{'='=== :!*Q-!--%
1q2I2I*Q-##jmXa[(I(I&&(>(>>> *Q-!--jmWXa[#%>??*Q-!--jmWXa[#%>??--{/E/EEE AS 9::uqM78A;4@
 @
 //1G1GGG "U%;%;;1L E***{e>Q>Q/Q$Xa[#6LA;.3Q:
 
 %Xa[#6LA;.3Q:
 
 //53W3W,,k.H.HHH e***}}  (!c:Y{2B7>    )!c:Y{2B7>   [33u8 8 #00+2L2LLL8   'x{B7,FRST&|qk'A2FRST[33u8 8 #00+2H2HHHr{   c           	         [        U R                  S   U R                  S   4[        UR                  5      UR                  5       U R                  UR                  S S9$ )aR  
Infer the scaling type and swizzle mode from matrix and scale tensor shapes/dtypes.

This function determines how scale factors are laid out relative to the matrix:
- TensorWise: Single scale for entire tensor
- RowWise: One scale per row
- BlockWise1x128/128x128: Block-scaled with float32 scales
- BlockWise1x32: MXFP8 with float8_e8m0fnu scales (swizzled on NVIDIA)
- BlockWise1x16: NVFP4 with float8_e4m3fn scales (swizzled)

Args:
    mat: The matrix tensor (FP8 or FP4)
    scale: The scale factor tensor

Returns:
    Tuple of (ScalingType, SwizzleType) or (None, None) if unrecognized
r   r7   c                
    X:H  $ r   r   r  s     rW   r  %infer_scale_swizzle.<locals>.<lambda>  s    16r{   r!  r"  r#  r$  r%  r&  )r*  rA  r  numelr   )matscales     rW   infer_scale_swizzler2  w  sO    ( %))A,		!-%KKM))KK! r{   c           	     r  ^ SSK Jm  U R                  5       nUR                  5       nU(       a
  US   US   4nU(       a&  [        R                  " [
        R                  US5      OSnSU4S jjn[        [        U5      S:  a
  US   US   4OUS   S4[        U5      UU R                  UR                  US9$ )z
Infer the scaling type and swizzle mode for IR nodes (used during graph lowering).

This is the IR-compatible version of infer_scale_swizzle, using symbolic
size comparisons via V.graph.sizevars.statically_known_equals.
r   rr  r7   c                N   > TR                   R                  R                  X5      $ )z5Compare values using symbolic equality when possible.)rv  rw  rJ  )r(  r)  rs  s     rW   symbolic_eq+infer_scale_swizzle_ir.<locals>.symbolic_eq  s    ww77==r{   r  r.  )r(  r   r)  r   r   r:  )r  rs  r<  r  r  r   r!  r*  rR   r  r   )r0  r1  	transposer!  r"  r#  r5  rs  s          @rW   infer_scale_swizzle_irr8    s     .||~H!J QK!- DN)""8<<Q?STK> %/28}/A(1+x{+QRUVGW$))KK r{   r  )ry   r   r   r   )r   r   r   r:  )   d   )r   Callable[[], Any]r   r   r   r   r   r  )r9  r:  F)
r   r;  r   r   r   r   r   r:  r   r  r  )r   z"Union[Optional[torch.device], str]r   torch.device)r$  zIterable[sympy.Expr]r   r   )r,  r_  r-  r_  r   r   )r$  zIterable[_T]r   zValuesView[_T])r7  r]  r8  r]  r   r]  )rc  r  r   r  )ri  z"Iterable[Union[int, torch.SymInt]]r   zlist[sympy.Expr])ro  Union[int, torch.SymInt]r   r]  )r   r]  r   r=  )ri  z Iterable[Union[int, sympy.Expr]]r   zlist[Union[int, torch.SymInt]])r  torch._ops.OpOverloadr   r:  )r  r5   r  z'Callable[[torch._ops.OpOverload], bool]r   r:  )r  r   r   r  r  r  r   z&tuple[GraphModule, list[torch.Tensor]])rH   )r   r  r   r  )r7   rH   )
r  Callable[..., Any]r  Sequence[Any]r   r   r   r  r   r  )r   r  r  g      ?rH   )r  r?  r  r@  r   r   r  r   r  r  r   r  r   r  )r  r   r  r  r   r  )r  r   r  r   r   r  )r(  r   r)  r   r   r   )rT   zUnion[int, Sequence[int]]r  r   r   Sequence[int])rT   ztuple[_T, ...]r   zlist[_T])r   z!Callable[Concatenate[Any, P], RV]r   zCachedMethod[P, RV])r  r  r   z*Callable[[FN_TYPE[P, RV]], FN_TYPE[P, RV]])r  0Union[Sequence[BaseSchedulerNode], ExternKernel]r   zOrderedSet[Node])r  Sequence[BaseSchedulerNode]r   z8Literal[True, 'torch', 'original_aten', 'inductor_node']r   r  )r  rB  r  r:   r   tuple[str, str]r   )rp  zIterable[torch.fx.Node]rq  zOptional[Callable[[Any], bool]]r   OrderedSet[torch.fx.Node])r   zSequence[IRNode]r  zdict[str, IRNode]r   rE  r  )r  r   r   zValueRanges[Any])r  r  r   r:  )r  re   rb  r   r   r  )r  r:  r   r:  )r   r  r   r  )rm  r   r  zdict[sympy.Expr, Any]r   r   )r(  r   r   z,TypeGuard[Union[torch.SymInt, torch.Tensor]])r   r   r   r:  )r  torch.fx.GraphModuler   zOptional[torch.fx.Node])r  rF  r   r5   )r  rF  r   zOrderedSet[torch.device]r  )r  r   r   r   )rc  r  r   r  r   r  )NNT)r"  zOptional[dict[str, Any]]r  rf  r#  r:  r   r  )r,  r@  r(  r:  r   	list[int])rx  r+   r,  z.Sequence[Union[int, torch.SymInt, sympy.Expr]]r(  r:  r   rG  )r   r`  r   r   r  r  )r  zUnion[int, torch.device]r   r:  r  )r  r   r   r<  r  Optional[int]r   r8   )r<  rA   r  zlist[torch.dtype]r   r:  )r  r  r   r:  )
r<  rA   r  r:  r  r:  r  r:  r   r:  )rb  r@   r+  r^  r,  r:  r   r:  )rb  r@   r+  rA   r,  r:  r   r:  )rq  r6   rr  r6   rs  zlist[ScalingType]r   r:  )r  r   r  r   r<  rA   r  r:  r  r:  r  Optional[Any]r  rI  r  rI  r   r:  )
r<  rA   r  r   r(  r   r  r   r   r:  r  )r<  rA   r  r  r(  r  r  r  r  r@   r  r@   r  zOptional[IRNode]r  zOptional[_IntLike]r   r:  )r  r  r   r:  r   )
r  r  r(  r  r  r  r  r   r   r:  )r  r  r(  r  r  r  r   r:  )r  r  r(  r  r  r  r   rG  )r   r  r   r  )r   zQtuple[Optional[str], Callable[[], list[Any]], Callable[[], list[Any]], type[Any]])r<  rA   r   r:  )r<  rA   r  zUnion[ReinterpretView, Buffer]r  r@   r   r:  )FTFN)r<  rA   r  r@   r  r@   r  r:  r   r:  r(  r:  r  rH  r   r:  )r   Callable[P, _T]r   r  r  r  r   ztuple[_T, list[str]])r   r?  r   ztuple[Any, list[str]])r   rJ  r   r  r  r  r   r   )r   rJ  r   r  r  r  r   r  )r   rJ  r   r  r  r  r   ztuple[Any, list[GraphLowering]])r}  r?  r~  r?  r   r  )r  r?  r  zOptional[Callable[..., Any]]r   r   )r  r  r   r  )r   rf  )rZ  r@  r   r:  )r  zSequence[torch.Tensor]r   r:  )ro  r   r   r`  )r  r:  r   r   r  r   r   zIterator[Any])r   r`  r   r  )r  r  r   r:  )r  r  r   r   )r  zIterable[Any]r   r:  )
r  r?  r  r4   r  r@  r  r  r   r  )r  z"Optional[Union[Buffer, Operation]]r   r:  )rl  z Optional[Union[Node, Operation]]r  z!Optional[torch._ops.OperatorBase]r   r:  )rl  z"Optional[Union[IRNode, Operation]]r   r:  )r#  rF   r$  z-Optional[Callable[[BaseSchedulerNode], bool]]r   r:  )r#  rF   r   r:  )rl  zOptional[Operation]r  z?Union[torch._ops.OpOverload, Collection[torch._ops.OpOverload]]r   r:  )r-  r  r.  r  r/  r  r   r   )r#  rF   r9  zMutableSet[BaseSchedulerNode]r.  zdict[str, SchedulerBuffer]r/  zdict[str, BaseSchedulerNode]r6  zCallable[[Any], bool]r   r  )rF  r   rG  r   r   r   )rR  rF  r   r   )rb  r   r   r  )r   r  r   r  )r   rf  r   r:  )r   r  r   r:  )r   r`  r   r:  )r  r>  r  rf  r|  r`  r}  r`  r~  r  r  r:  r   r:  )r  rC  r   r  )r   r  r   r:  )r  r  r   r:  )r   r  )r   rJ  r   r  r  r  r   ztuple[_T, str])r  Sequence[InputType]r   zOptional[ShapeEnv])r  Callable[[list[InputType]], _T]r  rA  r  zOrderedSet[int]r   rL  )rT   r  r   r  )r  r  r  rA  r  zOptional[OrderedSet[int]]r   z-tuple[list[torch.Tensor], list[torch.Tensor]])r  rK  r  rA  r   rA  )r   r   r   r:  )r  r@  r  rE   r   r  )r   r`  r   r  )r   r  r   r`  )r6  r  r   r  r   r:  )r   ztuple[str, ...])r   r  r  r3   r  r  r   r  )r   r  r   r  )r   rf  r   r  )r   r`  r   r`  )r   zOptional[type[Any]]r   r:  r   r   )r   zOptional[list[int]])r   rQ  )r  ztorch._ops.OperatorBaser   rD  )r  torch.fx.Noder   r:  )rl  rB   r   r:  )r  r:   r   r:  )r  r   r   r`  )r   r  r   r:  )r  zSequence[Sequence[T]]r  zSequence[T]r   r  )
r  rA  r  rA  r  ValType | Noner  rN  r   zEGenerator[tuple[KeyType, ValType | None, ValType | None], None, None])r  r  r   r  )r  r   r   ztuple[bool, bool])rT   r*   r  r:  r   zOrderedSet[sympy.Symbol])r   zdict[str, str])r  CUDAGraphWrapperTyper   r  )r#  rF   r   z tuple[list[Any], dict[str, Any]])r:  r;   r   r:  )r   r  r   r/   r   r  )rl  rM  r   r:  )r   r   )rT   r   r  r   r   r   )r!  ztuple[Any, Any]r"  ztuple[Any, ...]r#  r   r$  r`  r%  r`  r&  zCallable[[Any, Any], bool]r   #tuple[Optional[Any], Optional[Any]])r0  r  r1  r  r   rP  r  )r0  r>   r1  r>   r7  r:  r   rP  (  
__future__r   rS  r  rG  enumr  ry  r  r  r}  r  r  r   r  r  r   r  r   r  r  r  r  r  r  collections.abcr   r   r   r   r   r	   r
   r   r   r   typingr   r   r   r   r   r   r   r   r   r   r   r   r   typing_extensionsr   r   r   r   r~   rP   torch.utils._pytreer  _pytreer  $torch._inductor.analysis.device_infor   torch._inductor.runtime.hintsr    !torch.fx.passes.regional_inductorr!   torch.utils._dtype_abbrsr"   torch.utils._ordered_setr#   r$   r%   OPTIMUS_EXCLUDE_POST_GRADr  r(   r)   r*   r+   r,   r-   r.   pathlibr/   r0   r1   r2   torch._prims_commonr3   torch.fxr4   torch.fx.noder5   r  r6   r  r8   rg  r:   dependenciesr;   rv  r=   r  r>   r?   r@   rA   rB   rC   output_coderE   r  rF   rG   rN   rL   r   rX   torch._dynamo.device_interfacerY   torch._dynamo.utilsrZ   torch.autogradr[   torch.autograd.profiler_utilr\   (torch.fx.passes.graph_transform_observerr]   torch.fx.passes.shape_propr^   torch.utils._sympy.functionsr_   r`   ra   rb   rc   torch.utils._sympy.symbolrd   re   torch.utils._sympy.value_rangesrf   rg   r  ri   runtime.runtime_utilsrj   r6  _IS_WINDOWS	getLoggerr   r   rl   r  r4  	VarRangesr  r   	InputTypegetenvXPU_KERNEL_FORMATGPU_KERNEL_BIN_EXTSr  r  r1  r	  r[  rP  r\  rR  r]  rT  rV  r   rJ  rL  rN  rD  rE  float8_e4m3fnuzfloat8_e5m2fnuzrt   r   rx   rz   r   Functionr   rH  r   r   r   r   r  r  r%  r.  r1  rf  rj  rp  rz  r|  r  r  r  r   r  r  r  r  r  r  r  r  r  FN_TYPEr  r  r  r  r  r%  rn  rt  r  r  r  r  r  r  r  r  r  r  	frozensetro  r  r  r  r  r  r  r  r  r  r  r'  clear_on_fresh_inductor_cacheclear_inductor_cachesfresh_inductor_cacher0  r@  rD  rF  rJ  rM  r  r  r[  r  r  r  r  r  r  r  r  r  r)  rc  ri  ro  rt  r|  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r,  r.  rG  rN  rS  ri  rl  rn  rw  r  r  r  r  r  r  r  r  r  r&  r  r  r  r  r  r  r  r  r  r#  r  Enumr  r  r	  r  r  r  r   r'  r*  r0  r8  r@  rI  rU  rW  rh  rk  r  rn  rp  rr  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  rZ  r  compiler  r  r  r  r   r  r  r  r  r  r  r  r"  r#  r$  r&  rK  rO  rQ  r\  r^  rc  rm  r  rv  r  r  r  r  SUPPORTED_MKLDNN_DEVICESr  r  r  r  r  r  r  r  r  r  PartitionFnTyperO  r  r  r  r  r  r  r
  r  r  r  r  r*  r2  r8  )r  r   s   00rW   <module>r     sv   "        	     	  	   
              C B    $ $ ? : E 0 / ; ($ 
  >>//C$"/,5!$TT,= +	CL
   D 0 % 2 K 0  8 D  = llg%! T]UZZ'(	U5<<ell:;<	 Ebii(I7S 
  !"  	 
 2<

2 . ( {Q'A-+2B XDX XB5
LENN  d#  $"G GX #(	 
 !	
 4 #(	___ 
_ !	_
 _D  ;@
+*"*+A**#AL+	+++	"/	)/#/G @OI	I<I 
I0 *8+0' ' 	!  	
 ( %'!  	
    )'#$  cNTT"
;sAv&*
+E8WQU^ E:)++/+\C,4).4)O4) 	4)nW2CW2!W2 W2x 48*0 (G
G$5GG:,^%	DU	>2- $ $'& 
< !# I "	 " "( .27+7	7 7 	7 7v !5 $ "  49  ( 	$$	7$ 	$
 $N Q7 7*  , , ,
S' S'l
 
 @ @ @?' ?  8 J J ) )I #'   	(+<	  #  	
  
< :>RW||&6|KO|	|@ BG&,:>	 BGVV&,V:>V	V OOO %O 
	O Q	  	 Q	  	 Q  >>> > 	>
 > > >  > 
>BP " EEE E 	E
 E E E E 
EPJ CO,) , EF!)?B	 (  . 5( 5(p @ @ R R:"JH&8@F	: ""&"&==
= = 	=
  = =  = 
=@'C C"&&& & 	&$ &2:/(V		 &	2:		## &#2:#$#* ...@.. .$ IMFF)EFF*	B&&   D D '6 '6T  Q	.0(#K(*$)) * 

 
"- 
4A 
HK 
	 
F1	" -1#
*#)# 
#L( @D	Q	Q<	Q 
	Q#J
JGJ 
JLL .LDRLL *=

5
 ,
 5	

 '
 

< *=5 , 5	
 ' 
:T 2     ,!)
.2$&$!$ $ 	$
 $ $ 
$NHBL'  &2:2:*" ( %	0	: 37$$$$ 0$ 3	$<$ $$3N!3B	:&/ '#)* $%
  +?*D*D*FG*F$!*FG  **Y'H	  & 1 1 1
 68 2 7
8 1 
	 /9l O :)# # )

)
-" 01 -"` D)t   *499  4 42Y5!H"J4
1 * 		& "&!%	 
$ 
$ 
  
 	 

 K 
  
FRj&2R66 d#  $ 38$./@ 3 3 *:); &= F
"-L 
%
  "
MMM M 	M
 M &M )M`	*(B "	"" " )	" "{ Hs   $z