
    N jl                       S SK Jr  S SKrS SKrS SKrS SKrS SKrS SKrS SK	r	S SK
r
S SKrS SKrS SKrS SKrS SKrS SKrS SKrS SKJrJrJr  S SKJrJrJr  S SKJrJrJrJr  S SKJrJ r J!r!J"r"J#r#  S SK$r$S SK%r$S SK&J'r'  S SK(J)r)  S S	K*J+r+  S S
K,J-r-J.r.J/r/J0r0J1r1  S SK2J3r3  S SK4J5r5J6r6J7r7J8r8J9r9  S SK:J;r;  S SK<J=r=  \>" \	R~                  R                  SS5      5      rA\"(       a  S SKBJCrC  S SKDJErEJFrFJGrG  SSKHJIrI  SSKJJKrK  SSKLJMrM  SrN\;" \OS5      rP " S S\Q5      rR " S S5      rS " S S5      rT\#\+R                  \+R                  4   rW\R                   " S S 5      5       rY\R                   " S! S"5      5       rZ " S# S$\Z5      r[ " S% S&5      r\ " S' S(5      r] " S) S*\Z5      r^ " S+ S,\\\^5      r_ " S- S.\]\^5      r` " S/ S0\Z5      ra " S1 S2\\\a5      rb " S3 S4\]\a5      rc " S5 S6\\\Z5      rd " S7 S8\]\Z5      re " S9 S:\\\Z5      rf\R                  SFS; j5       rh    SGS< jri " S= S>5      rjSHS? jrkSIS@ jrl    SJSA jrm " SB SC5      rn " SD SE5      rog)K    )annotationsN)CallableIterableSequence)FutureProcessPoolExecutorThreadPoolExecutor)byrefc_size_tc_void_pCDLL)AnyIOOptionalTYPE_CHECKINGUnion)get_interface_for_device)rand_strided)ir)CppCodeCacheCUDACodeCache
DLLWrapperget_hashPyCodeCache)Timer)do_bench_using_profilingget_gpu_typeget_ld_library_pathis_gpupython_subprocess_env)getArtifactLogger)
OrderedSet.TORCHINDUCTOR_AUTOTUNE_POOL_INACTIVITY_TIMEOUT600)
ModuleType)ChoiceCallerPartialRenderTritonTemplateCaller   )config)benchmarker)VCUDA_VISIBLE_DEVICES
autotuningc                      \ rS rSrSrg)!NonzeroWorkspaceNotSupportedErrorI    N__name__
__module____qualname____firstlineno____static_attributes__r2       q/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/torch/_inductor/autotune_process.pyr0   r0   I       r9   r0   c                      \ rS rSrSr\SS j5       r\ S       SS jj5       r\SS j5       rSS jr	S r
SS	 jrSSS
 jjrSSS jjrSSS jjrSS jrSS jrSS jrSS jrSrg)TuningProcessM   z>
Class to launch and interact with a benchmarking subprocess.
c                   ^ ^ [         R                  S[        R                  " 5       [        R                  R                  [        5      5        U U4S jn U" 5         g! [         a     gf = f)z$
Entry point for the child process.
z3Started autotune subprocess %s. Visible devices: %sc                    >  [         R                  T5      u  pU c  g  U(       a  [        R                  R	                  U5        U " 5       n[         R                  UT5        Ma  ! [
         a  nUn S nAN)S nAff = fN)r=   recvosenvironupdate	Exceptionsend)job	extra_envresulte	read_pipe
write_pipes       r:   workloop,TuningProcess.process_main.<locals>.workloop]   so    !.!3!3I!>; 

)))4 UF ""6:6  ! Fs   -A$ $
A:.A55A:N)autotuning_logdebugrC   getpidrD   getr-   EOFError)rL   rM   rN   s   `` r:   process_mainTuningProcess.process_mainR   sQ    
 	AIIKJJNN/0	
	7	J 		s   A 
A+*A+Nc                T    [         R                  " X4U5        UR                  5         g rA   )pickledumpflush)objrM   rI   s      r:   rG   TuningProcess.sendq   s!     	S$j1r9   c                .    [         R                  " U 5      $ rA   )rX   load)rL   s    r:   rB   TuningProcess.recvx   s    {{9%%r9   c                0    Xl         U R                  5         g rA   )devicestart)selfra   s     r:   __init__TuningProcess.__init__|   s    

r9   c                   [         R                  R                  [         R                  R                  [        5      S5      n[         R
                  " 5       u  p#[         R
                  " 5       u  pE[         R                  " US5      U l        [         R                  " US5      U l        [        R                  " 5       U l        U R                  R                  U R                  [        R                  5        [        R                  US[         R                   " 5        3S[#        U5       3S[#        U5       3/n0 [%        5       ES['        5       [(        R*                  (       a  SOSS	.EnU R,                  b  [#        U R,                  5      U[.        '   [0        R2                  " UUX%4S9U l        [         R6                  " U5        [         R6                  " U5        SU l        g
)z$
Start the benchmarking subprocess.
z__autotune_main__.pywbrbz	--parent=z
--read-fd=z--write-fd=01)TORCH_WARM_POOLLD_LIBRARY_PATH3TORCHINDUCTOR_PROFILE_WITH_DO_BENCH_USING_PROFILINGN)envpass_fdsT)rC   pathjoindirname__file__pipefdopenrM   rL   	selectorsDefaultSelectorselectorregister
EVENT_READsys
executablerR   strr    r   r*   /profile_bandwidth_with_do_bench_using_profilingra   r-   
subprocessPopenprocesscloserunning)rc   entrysubproc_read_fdwrite_fdread_fdsubproc_write_fdcmdrn   s           r:   rb   TuningProcess.start   so    RWW__X68NO$&GGI!$&GGI!))Hd37D1!113t~~y/C/CD NN		}%_-./#./01

#%
  #24 EE DG
 ;;"(+DKK(8C$%!''%8

 	!
!"r9   c                `    U R                   =(       a    U R                  R                  5       SL $ )z*
True if the subprocess is still running.
N)r   r   pollrc   s    r:   aliveTuningProcess.alive   s%     ||; 1 1 3t ;;r9   c                    U R                  5       (       d  U R                  5         [        R                  XR                  US9  g)z(
Push a work item to the child process.
rI   N)r   rb   r=   rG   rM   )rc   reqrI   s      r:   putTuningProcess.put   s/     zz||JJL39Er9   c                    U R                   R                  U5      (       d"  [        SU R                  R                   35      e[
        R                  U R                  5      u  p#[        U[        5      (       a  UeU$ ! [         a    U R                  5         e [         a    U R                  5         e [         a<    [        R                  SU R                  R                  5        U R                  5         e f = f)zs
Get a response from the child process. Raises TimeoutError on timeout;
raises EOFError if the subprocess crashes.
zTimeout in autotune subprocess z.Unexpected exception in autotune subprocess %s)rx   selectTimeoutErrorr   pidr=   rB   rL   killrT   r   rF   rP   	exception
isinstance)rc   timeoutrJ   _s       r:   rS   TuningProcess.get   s    
	==''00"%DT\\EUEUDV#WXX%**4>>:IF fi((L!  	IIK 	JJL 	$$@$,,BRBR IIK	s   A#A> >A:C8c                    U R                  5       (       a   [        R                  SU R                  5        U(       a  U R	                  5         gg)z3
Signal the child process to shut down gracefully.
N)r   r=   rG   rM   waitrc   r   s     r:   shutdownTuningProcess.shutdown   s4     ::<<tT__5IIK r9   c                    U R                  5       (       a  U R                  R                  5         U R                  5         g)z%
Wait for the child process to exit.
N)r   r   r   r   r   s    r:   r   TuningProcess.wait   s(     ::<<LL

r9   c                    U R                   R                  5         U R                  R                  5         U R                  R                  5         SU l        g)z
Close resources.
FN)rx   r   rL   rM   r   r   s    r:   r   TuningProcess.close   s;     	r9   c                    U R                  5       (       aD  [        R                  SU R                  R                  5        U R                  R                  5         U R                  5         g)z&
Send a SIGKILL to the child process.
z)Sending SIGKILL to autotune subprocess %dN)r   rP   errorr   r   r   r   r   s    r:   r   TuningProcess.kill   sH     ::<<  ;   LL

r9   c                B    U R                  SS9  U R                  5         g)z(
Gracefully restarts the child process.
Tr   N)r   rb   r   s    r:   restartTuningProcess.restart   s     	4 

r9   )ra   r   rL   r   rx   rM   )rL   	IO[bytes]rM   r   returnNonerA   )r[   r   rM   r   rI   dict[str, str] | Noner   r   )rL   r   r   r   )ra   Optional[int]r   bool)r   r   rI   r   r   r   )g      ^@)r   floatr   r   )T)r   r   r   r   r   r   )r4   r5   r6   r7   __doc__staticmethodrU   rG   rB   rd   rb   r   r   rS   r   r   r   r   r   r8   r2   r9   r:   r=   r=   M   s      < LP'4I	  & &+Z<F6
r9   r=   c                  \    \ rS rSrSrS
S jr\SS j5       rS
S jrSS jr	    SS jr
Srg	)TuningProcessPooli  z
Maintains a pool of TuningProcesses to benchmark kernels in parallel
across devices. By default, we create one TuningProcess per device and
set the sub-process environment to make only that device visible.
c                V   U R                  5       n[        R                  SU5        U Vs/ s H  n[        US9PM     snU l        [
        R                  " 5       U l        U R                   H  nU R                  R                  U5        M      [        [        U5      S9U l        gs  snf )z
Start the child processes.
z$Sub-process autotune device list: %sra   max_workersN)get_device_listrP   rQ   r=   	processesqueueQueueprocess_queuer   r	   lenexecutor)rc   devicesra   ps       r:   rd   TuningProcessPool.__init__  s     &&(CWM FMMW6-v6WM9>A""1%   +s7|D Ns   B&c                    [         R                  (       d  S/$ [        5       n [        U 5      nUR	                  5       n[
        [        R                  ;   aR  [        R                  [
           R                  S5       Vs/ s H  n[        U5      PM     nn[        U5      U::  d   eU$ [        [        U5      5      $ s  snf )z4
Gather the list of devices to be used in the pool.
N,)r*   autotune_multi_devicer   r   device_countr-   rC   rD   splitintr   listrange)gpu_typedevice_interfacecountdr   s        r:   r   !TuningProcessPool.get_device_list  s    
 ++6M>3H= --/  2::-')zz2F'G'M'Mc'RS'R!s1v'RGSw<5(((NE%L!!	 Ts   >B<c                    U R                   R                  5         U R                   H  nUR                  SS9  M     U R                   H  nUR                  5         M     g)z%
Signal all child processes to exit.
Fr   N)r   r   r   r   )rc   r   s     r:   r   TuningProcessPool.shutdown3  sG     	 AJJEJ"  AFFH  r9   c                X   UR                   c   eSS/nU Vs0 s H,  o3[        R                  ;   d  M  U[        R                  U   _M.     nnU R                  R	                  5       nUR                  UR                   R                  US9   UR	                  [        R                  5      U R                  R                  U5        $ s  snf ! [         aC    [        R                  " SU S35        [        S5      s U R                  R                  U5        $ [         ai  n[        R                  " SU S35        S	[        U5      ;   a  UR                  5         [        S5      s SnAU R                  R                  U5        $ SnAff = f! U R                  R                  U5        f = f)
z
Entry point for the thread-pool helper threads: Wait for an open TuningProcess,
remove it from the queue, execute the benchmark in that subprocess, and return
the TuningProcess to the queue.
NTORCHINDUCTOR_CACHE_DIRTRITON_CACHE_DIRr   zTimed out benchmarking choice 'z['. It will be ignored. Please debug the root cause in case the choice can bring perf gains.infzFailed to benchmark choice 'cudaErrorLaunchFailure)bmreqrC   rD   r   rS   r   	benchmarkr*   +max_autotune_subproc_result_timeout_secondsr   warningswarnr   rF   r}   r   )rc   choiceenv_varsvrI   r   process_exceptions          r:   targetTuningProcessPool.target>  sz    ||'''-/AB/7Kx!

?%Q

1%x	K$$((*FLL**i@	,;;BB. ""7+7 L  	 MM1& :W W
 < ""7+  	 MM.vh 7W W (3/@+AA!<""7+	  ""7+sG   CCC /F	:F 	F	 AF#F	$F F		F F)c           	     v    [        [        XR                  R                  U R                  U5      5      5      nU$ )z.
Benchmark each choice in a separate process.
)dictzipr   mapr   )rc   choicesresultss      r:   r   TuningProcessPool.benchmarkd  s-     s7MM$5$5dkk7$KLMr9   )r   r   r   Nr   )r   zSequence[Optional[int]])r   r(   r   r   r   zlist[TritonTemplateCaller]r   z!dict[TritonTemplateCaller, float])r4   r5   r6   r7   r   rd   r   r   r   r   r   r8   r2   r9   r:   r   r     sC    E& " "(	$,L+ 
+r9   r   c                  |    \ rS rSr% S\S'   S\S'   S\S'   S\S	'   S
\S'   SrS\S'   \    SS j5       rSS jrSr	g)
TensorMetaiv  ztorch.devicera   ztorch.dtypedtypeztorch._prims_common.ShapeTypesizesztorch._prims_common.StrideTypestridesr   offsetNzOptional[str]namec           
        [        U[        5      (       a;  U Vs/ s H  o R                  U5      PM     nn[        S U 5       5      (       d   eU$ Un[        U[        R
                  5      (       a  [        R                  " SUS9nUR                  5       nUc   eUR                  5       nUc   e[        UU[        R                  R                  R                  UR                  5       5      [        R                  R                  R                  UR                  5       5      [        R                  R                  R!                  UR#                  5       R$                  5      UR'                  5       S9$ s  snf )Nc              3  B   #    U  H  n[        U[        5      v   M     g 7frA   )r   r   .0xs     r:   	<genexpr>*TensorMeta.from_irnodes.<locals>.<genexpr>  s     A&Qz!Z00&s   fake)r   layout)ra   r   r   r   r   r   )r   r   from_irnodesallr   LayoutBuffer	get_dtype
get_devicer   r,   graphsizevarsoptimization_hintsget_size
get_strideoptimization_hint
get_layoutr   get_name)clsirnodesr   rJ   noder   ra   s          r:   r  TensorMeta.from_irnodes  s$    gx((>E Fg!1!1!!4gF FA&AAAAAMdBII&&99&6D    "!!!''""55dmmoFGG$$778IJ77##55doo6G6N6NO
 	
 !Gs   E6c                    [        U R                  U R                  U R                  U R                  U R
                  S9$ )N)ra   r   
extra_size)r   r   r   ra   r   r   r   s    r:   	to_tensorTensorMeta.to_tensor  s2    JJLL;;**{{
 	
r9   r2   )r  z/Union[LayoutOrBuffer, Sequence[LayoutOrBuffer]]r   #Union[TensorMeta, list[TensorMeta]])r   torch.Tensor)
r4   r5   r6   r7   __annotations__r   classmethodr  r  r8   r2   r9   r:   r   r   v  sP    ((++KD-
E
	,
 
4
r9   r   c                      \ rS rSrSr          SS jr      SS jrSS jrSS.     SS jjrSS.     SS	 jjr	S
r
g)BenchmarkRequesti  a  
Only handle triton template benchmark for now. The extern kernel benchmark
can be done inside the same process since they usually don't cause crash.

Important: Instances of this class and subclasses have to be serializable
across process boundaries. Do not put CUDA Tensors in here!
c                $  ^ Xl         [        U[        5      (       a	  U/U l        OX l        T(       aQ  [        T[        [
        45      (       a6  [        T5      S:  a  [        U4S jT 5       5      (       d   eTS   U l        OTU l        X@l	        g )Nr)   c              3  n   >#    U  H*  nS   H   n[        TS   U5      [        X5      :H  v   M"     M,     g7f))ra   r   r   r   r   r   N)getattr)r   r   attroutput_tensor_metas      r:   r   ,BenchmarkRequest.__init__.<locals>.<genexpr>  s=      / Q .q148GA<LL Q M/s   25r   )
kernel_namer   r   input_tensor_metatupler   r   r  r!  
extra_args)rc   r#  r$  r!  r&  s      ` r:   rd   BenchmarkRequest.__init__  s     ''448I7JD"7H"*-?%"O"O%&* /    
 '9&;D# 3ED#$r9   c                   [         erA   NotImplementedErrorrc   outinput_tensorss      r:   make_run_fnBenchmarkRequest.make_run_fn  s
     "!r9   c                    g rA   r2   r   s    r:   cleanup_run_fnBenchmarkRequest.cleanup_run_fn  s    r9   Nr,  c                   [         erA   r)  rc   fnr,  r-  s       r:   do_benchBenchmarkRequest.do_bench  s
     "!r9   c                  [         R                  [        R                  5      nU(       a  [        R                  " 5       nUcp  U R
                  (       a  U R                  (       d   S5       e[        U5      S:X  d   e[        S U R
                   5       5      nU R                  R                  5       nU(       a-  [        R                  " 5       W-
  n[        R                  " 5       n U R                  " USU06nU(       a-  [        R                  " 5       W-
  n[        R                  " 5       nU R                  " U/UQUP76 nU(       a:  [        R                  " 5       W-
  n	[         R                  S[!        U 5      WWU	5        U R#                  5         U$ ! [         a#    [         R                  S5        [        S5      s $ f = f)NzJInput and output tensor meta must be populated when input_tensors is emptyr   c              3  @   #    U  H  oR                  5       v   M     g 7frA   )r  r   s     r:   r   -BenchmarkRequest.benchmark.<locals>.<genexpr>  s     !P9OA++--9Os   r,  z0Skipping op due to nonzero workspace requirementr   z6InChildProcess %s: load %f, create tensor %f, bench %f)rP   isEnabledForloggingDEBUGtimer$  r!  r   r%  r  r.  r0   infor   r7  rQ   r}   r1  )
rc   r,  r-  rQ   start_tscreate_tensor_elapser6  load_elapseresbench_elapses
             r:   r   BenchmarkRequest.benchmark  so   
 ++GMM:yy{H ;))d.E.E \E }%***!!P9O9O!PPM))335C#'99;#9 yy{H	 !!=:c:B ))+0Kyy{HmmB44499;1L  HD	$ 	
+ 1 	  RS<	 s   (F *GG)r&  r$  r#  r!  )
r#  r}   r$  r  r!  r  r&  Iterable[Any]r   r   r-  r  r,  r  r   zCallable[[], None]r   r-  r  r,  Optional[torch.Tensor]r   r   )r4   r5   r6   r7   r   rd   r.  r1  r7  r   r8   r2   r9   r:   r  r    s    %% ?% @	%
 "% 
%<"*"1="	"
 '+	" %" $	"
 
" '+,$, $, 
	, ,r9   r  c                  ^    \ rS rSrSr     S         S	S jjrSS.     S
S jjrSrg)_TestBenchmarkRequesti  z
Supports unit testing. Defined in this file instead of the test file so the
TuningProcess sub-process can unpickle these objects.
Nc                @    Xl         X l        X0l        X@l        XPl        g rA   )rJ   ra   sleepexccrash)rc   rJ   ra   rN  rO  rP  s         r:   rd   _TestBenchmarkRequest.__init__  s     

r9   r3  c                  U R                   b=  [        R                  R                  [        S 5      [        U R                   5      :X  d   eU R                  (       a   [        R                  " U R                  5        U R                  (       a  U R                  eU R                  (       a  [        R                  " S5        U R                  $ )Nr)   )ra   rC   rD   rS   r-   r}   rN  r?  rO  rP  r{   exitrJ   r+  s      r:   r   _TestBenchmarkRequest.benchmark  sx     ;;"::>>"6=T[[AQQQQ::JJtzz"88((N::HHQK{{r9   )rP  ra   rO  rJ   rN  )        NNNF)
rJ   r   ra   r   rN  zOptional[float]rO  zOptional[Exception]rP  r   rI  )r4   r5   r6   r7   r   rd   r   r8   r2   r9   r:   rL  rL    sv      $!%#'  	
 !  KO*1G	 r9   rL  c                  0    \ rS rSrSS.     SS jjrSrg)GPUDeviceBenchmarkMixini-  Nr3  c                  [        S / UQUP 5       5      n[        U5      S::  d
   SU 35       e[        S U 5       S5      n[        U5      n[        U5      S:X  a  [        [	        U5      5      nOUR                  5       nUR                  U5         [        R                  " XS9nUR                  5         S S S 5        U$ ! , (       d  f       W$ = f)Nc              3    #    U  H{  n[        U[        R                  5      (       d  M$  [        UR                  R
                  5      (       d  MJ  UR                  R                  c  Mc  UR                  R                  v   M}     g 7frA   )r   torchTensorr   ra   typeindexr   tensors     r:   r   3GPUDeviceBenchmarkMixin.do_bench.<locals>.<genexpr>4  s^      $
/&%,,/   v}}))*   ##	  FMM/s   #B"BB(Br)   zCan not mix devices c              3     #    U  HA  n[        UR                  R                  5      (       d  M)  UR                  R                  v   MC     g 7frA   )r   ra   r\  r^  s     r:   r   r`  =  s5      +F&--,,- #""+s
   (AAcudar   )
r"   r   nextr   itercurrent_devicera   r+   r   synchronize)	rc   r6  r,  r-  device_idx_setdevice_typer   
device_idxrD  s	            r:   r7   GPUDeviceBenchmarkMixin.do_bench.  s     $ $
/M/3/$
 
 >"a'P+??O)PP'+
 
 4K@~!#d>23J)88:J$$Z0''?C((* 1 
	 10 
s   %C
Cr2   rI  r4   r5   r6   r7   r7  r8   r2   r9   r:   rW  rW  -  s/    
 '+	 % $	
 
 r9   rW  c                  0    \ rS rSrSS.     SS jjrSrg)CPUDeviceBenchmarkMixiniP  Nr3  c               .    [         R                  " U5      $ rA   )r+   benchmark_cpur5  s       r:   r7   CPUDeviceBenchmarkMixin.do_benchQ  s     ((,,r9   r2   rI  rk  r2   r9   r:   rm  rm  P  s/    
 '+	- %- $	-
 
- -r9   rm  c                     ^  \ rS rSrSr       S                               S	U 4S jjjr      S
S jrS rSS jrSr	U =r
$ )TritonBenchmarkRequestiZ  z
Represents a standalone benchmark request for a Triton Template.

Important: Instances of this class have to be serializable
across process boundaries. Do not put CUDA Tensors in here!
c                   > [         TU ]  XX45        XPl        X`l        Xpl        Xl        Xl        Xl        Xl        Xl	        Xl
        Xl        Xl        g rA   )superrd   module_pathmodule_cache_key
num_stages	num_warpsnum_consumer_groupsnum_buffers_warp_specmatrix_instr_nonkdimwaves_per_eukpackworkspace_sizeworkspace_zero_fill)rc   r#  r$  r!  r&  ru  rv  rw  rx  ry  rz  r{  r|  r}  r~  r  	__class__s                   r:   rd   TritonBenchmarkRequest.__init__b  sT    $ 	9KX& 0$"#6 %:"$8!(
,#6 r9   c                  [         R                  " U R                  U R                  5      n[        R                  SU R                  U R                  5        [        X0R                  5      R                  n[        U R                  5      nU R                  bu  SSKJn  [        R                  " U R                  4[        R                   UR"                  S9nU R$                  (       a  UR'                  5         UR)                  U5      nXuU'   SUR*                  l        0 n	SS Kn
SU
R1                  U5      R2                  ;   a  SU	S'   UR"                  R4                  S:X  a  SnOPUR"                  R4                  n[7        U5      nUR9                  U R:                  R"                  R(                  5      n[=        [        X0R                  5      [        R>                  R@                  RB                  RD                  5      (       a"  [F        RH                  " U/UQUPUQ70 U	DSU0D6$ [F        RH                  " U/UQUPUQ70 U	DUS	S
.D6$ )Nz"benchmark module key: %s, path: %sr   )WORKSPACE_ARG_PLACEHOLDERr   ra   FwarmupcpustreamT)r  benchmark_run)%r   load_by_key_pathrv  ru  rP   rQ   r  r#  runr   r&  r~   torch._inductor.select_algorithmr  rZ  emptyuint8ra   r  zero_r]  __self__with_bandwidth_infoinspect	signature
parametersr\  r   get_raw_streamr!  r   	_inductorruntimetriton_heuristicsDebugAutotuner	functoolspartial)rc   r,  r-  mod
run_methodr&  r  workspace_tensorworkspace_index
warmup_argr  r  rh  r   s                 r:   r.  "TritonBenchmarkRequest.make_run_fn  s+    **4+@+@$BRBRS0!!	
 S"2"2377
$//*

 *R${{$$&kkzz 
 '' &&((../HIO*:'27
/ 
w((4???#(Jx ::??e#F**//K7D%44''..44F C))*OO##55DD
 
 $$  	
    $$  	
  " r9   c                    [         R                  " U R                  U R                  5      n[	        XR
                  5      R                  5         g rA   )r   r  rv  ru  r  r#  
precompile)rc   r  s     r:   r  !TritonBenchmarkRequest.precompile  s7    **4+@+@$BRBRS%%&113r9   c                Z    SU R                   < SU R                  < SU R                  < 3$ )Nself.kernel_name=z, self.module_path=z, self.module_cache_key=)r#  ru  rv  r   s    r:   __str__TritonBenchmarkRequest.__str__  s2    #$""$$8t'7'7&99RD<Q<Q;STTr9   )r}  r{  rv  ru  rz  ry  rw  rx  r|  r~  r  )r   r   r   r   r   NF) r#  r}   r$  r  r!  r  r&  rG  ru  r}   rv  r}   rw  r   rx  r   ry  r   rz  r   r{  r   r|  r   r}  r   r~  r   r  r   r   r   rH  r   r}   )r4   r5   r6   r7   r   rd   r.  r  r  r8   __classcell__r  s   @r:   rr  rr  Z  s    " $%%&$%(,$)!77 ?7 @	7
 "7 7 7 7 7 !7  #7 "7 7 7 &7  "!7" 
#7 7>E*E1=E	EN4U Ur9   rr  c                      \ rS rSrSrg)TritonGPUBenchmarkRequesti  r2   Nr3   r2   r9   r:   r  r    r;   r9   r  c                      \ rS rSrSrg)TritonCPUBenchmarkRequesti  r2   Nr3   r2   r9   r:   r  r    r;   r9   r  c                     ^  \ rS rSrSr  S               SU 4S jjjr      SS jrSS.   SU 4S jjjrSS jrS	 r	SS
 jr
SrU =r$ )ExternKernelBenchmarkRequesti  a  
A class to handle extern kernel benchmark requests. This allows extern kernels
(like aten::mm) to be benchmarked in a subprocess, similar to Triton kernels.

Important: Instances of this class have to be serializable across
process boundaries. Do not put CUDA Tensors in here!
Nc                ^   > [         TU ]  XX45        XPl        U=(       d    0 U l        Xpl        g rA   )rt  rd   callable_pathkwargshas_out_variant)	rc   r#  r$  r!  r&  r  r  r  r  s	           r:   rd   %ExternKernelBenchmarkRequest.__init__  s,     	9KX*l.r9   c                   U R                  5       nU R                  (       a  [        R                  " U/UQ7SU06$ [        R                  " U/UQ76 $ )Nr,  )to_callabler  r  r  )rc   r,  r-  r6  s       r:   r.  (ExternKernelBenchmarkRequest.make_run_fn  sN     $$RA-ASAA $$R8-88r9   r3  c               *  >^^ Ub  UR                  5       S:X  a  gU R                  (       d  [        T5      S:X  a  [        TU ]  " TSU06$ U R                  5       mT" T6 nUbt  [        R                  R                  R                  R                  U[        UR                  5       5      [        UR                  5       5      5        UR                  U5        [        R                   (       a  [#        UU4S j5      $ [$        R                  " TT0 5      $ )Nr   rU  r,  c                    > T " T6 $ rA   r2   )algor-  s   r:   <lambda>8ExternKernelBenchmarkRequest.benchmark.<locals>.<lambda>  s
    m8Lr9   )numelr  r   rt  r   r  rZ  _C_dynamoguardsassert_size_strider%  sizestridecopy_r*   r~   r   r+   )rc   r,  r-  out_newr  r  s     ` @r:   r   &ExternKernelBenchmarkRequest.benchmark  s     ?syy{a/3}#5#:7$m===##%DM*G  ''::U388:.cjjl0C 		'"EE/0LMM((}bAAr9   c                    g rA   r2   r   s    r:   r  'ExternKernelBenchmarkRequest.precompile  s    r9   c                    SSK Jn  [        XR                  5      nU R                  (       a!  [
        R                  " U40 U R                  D6$ U$ )Nr   )extern_kernels)r  r  r  r#  r  r  r  )rc   r  r6  s      r:   r  (ExternKernelBenchmarkRequest.to_callable  s>     	D^%5%56;;$$R74;;77	r9   c                "    SU R                    S3$ )NzExternKernelBenchmarkRequest())r  r   s    r:   r  $ExternKernelBenchmarkRequest.__str__  s    .t/A/A.B!DDr9   )r  r  r  )NT)r#  r}   r$  r  r!  r  r&  rG  r  r}   r  zOptional[dict[str, Any]]r  r   r   r   rH  )r-  r  r,  rJ  r   r  )r4   r5   r6   r7   r   rd   r.  r   r  r  r  r8   r  r  s   @r:   r  r    s     ,0 $// ?/ @	/
 "/ / )/ / 
/ /	9*	91=	9		9 KOB*B1GB B(
E Er9   r  c                      \ rS rSrSrg)ExternKernelGPUBenchmarkRequesti#  r2   Nr3   r2   r9   r:   r  r  #       	r9   r  c                      \ rS rSrSrg)ExternKernelCPUBenchmarkRequesti)  r2   Nr3   r2   r9   r:   r  r  )  r  r9   r  c                     ^  \ rS rSrSr S             SU 4S jjjrS r      SS jrSS jrS r	SS jr
SS	 jrS
rU =r$ )CUTLASSBenchmarkRequesti/  aM  
A class to handle CUDA (CUTLASS) benchmark requests. This class is for
managing the lifecycle of a CUDA kernel benchmark, including compiling
the source code, managing workspace memory, and executing the kernel.

Important: Instances of this class have to be serializable across
process boundaries. Do not put CUDA Tensors in here!
c                .  > [         TU ]  XX45        XPl        SU l        S U l        S U l        SU l        SU l        SU l        X`l	        [        U l        [        U5      U l        U R                  R                  U R                  S5      u  U l        U l        g )Nr   F so)rt  rd   source_coder~  	workspaceDLL_workspace_size_updatedhash_keysource_filerh  r   codecache_clsr   r   write)rc   r#  r$  r!  r&  r  rh  r  s          r:   rd    CUTLASSBenchmarkRequest.__init__9  s     	9KX&#$15)-',$ "&* 8 E*.*<*<*B*Bd+
't'r9   c                    [         R                  SU 5        [        R                  " U R                  S5        [         R                  SU 5        g)zk
Precompile the CUDA source code to populate the CUDACodeCache.
This may happen in a separate thread pool.
Precompiling %sr  Done precompiling %sN)rP   rQ   r   compiler  r   s    r:   r  "CUTLASSBenchmarkRequest.precompileQ  s<    
 	.5d..53T:r9   c          	       ^ U R                  5         U R                  5         [        U5      U/-    Vs/ s H  n[        UR	                  5       5      PM     nn[
        R                  SU R                  U R                  U R                  U R                  UU R                  5        U R                  R                  5       n[        UR                  5      n[        U R                  U R                  5      n[        S5      nU R                   S:  af  ["        R$                  " U R                   S-   S-  ["        R&                  UR(                  S9U l        [        U R*                  R	                  5       5      n[,        R.                  " U/UQU R                  QSPUPUP76 n	 U	" 5         U	$ s  snf ! [0         a-  n
[3        U
5      mU4S jnU R5                  5         Us Sn
A
$ Sn
A
ff = f)zW
Create a function to run the CUDA/XPU kernel with the given input and output tensors.
zqmake_run_fn: self.kernel_name=%s, self.source_file=%s, self.hash_key=%s, self.DLL=%s, args=%s, self.extra_args=%sr         r  Nc                    > [        T 5      erA   )RuntimeError)err_msgs   r:   raise_runtime_error@CUTLASSBenchmarkRequest.make_run_fn.<locals>.raise_runtime_error  s    "7++r9   )ensure_dll_loadedupdate_workspace_sizer   r   data_ptrrP   rQ   r#  r  r  r  r&  r   current_streamcuda_streamr  r~  rZ  zerosfloat64ra   r  r  r  r  r}   r1  )rc   r,  r-  r_  argsr  
stream_ptrr  workspace_ptrretrK   r  r  s               @r:   r.  #CUTLASSBenchmarkRequest.make_run_fnZ  s    	 ""$:>}:MQTPU:UV:U*+:UVMMHHOO	
 ..==?n889
TXXt'7'78
 ""[[$$q(Q.mmzzDN
 %T^^%<%<%>?M 

 __
 	

 
 
		'E 
Y WF  	'!fG, !&&	's#   #F(F- -
G$7"GG$G$c           
     
   U R                   (       a  g U R                  5         [        [        R	                  S U R
                   5       5      5      n[        US-   5       Vs/ s H  n[        S 5      PM     nnU R                  R                  5       n[        UR                  5      n[        U R                  U R                  5      n[        5       nU" / UQU R                  Q[!        U5      PS PUP76   U R                  R#                  5         UR$                  U l        [(        R+                  SU R&                  U R                  U R,                  U R.                  U R                  UU R                  5        SU l         g s  snf )Nc              3  8   #    U  H  oR                   v   M     g 7frA   )r   )r   metas     r:   r   @CUTLASSBenchmarkRequest.update_workspace_size.<locals>.<genexpr>  s     G0F))0Fs   r)   zupdate_workspace_size called: new workspace size=%d, self.kernel_name=%s, self.source_file=%s, self.hash_key=%s, self.DLL=%s, args=%s, self.extra_args=%sT)r  r  r   r   fromkeysr$  r   r   r   r  r  r  r  r#  r   r&  r
   rf  valuer~  rP   rQ   r  r  )rc   unique_input_countr   r  r  r  r  c_workspace_sizes           r:   r  -CUTLASSBenchmarkRequest.update_workspace_size  sa   ''  MMG0F0FGG
 )..@1.D(EF(E1(EF..==?n889
TXXt'7'78
#: 	
	
__	
  	
 	
 	
 	))+.44 hMMHHOO		
 (,$9 Gs   "F c                    U R                   c:  U R                  R                  U R                  S5      u  U l         U l        U l        g g )Nr  )r  r  r^   r  r  r  r   s    r:   r  )CUTLASSBenchmarkRequest.ensure_dll_loaded  s@    888<8J8J8O8O  $95DHdmT%5 r9   c                n    U R                   b!  U R                   R                  5         S U l         S U l        g rA   )r  r   r  r   s    r:   r1  &CUTLASSBenchmarkRequest.cleanup_run_fn  s(    88HHNNDHr9   c                Z    SU R                   < SU R                  < SU R                  < 3$ )Nr  z, self.source_file=z, self.hash_key=)r#  r  r  r   s    r:   r  CUTLASSBenchmarkRequest.__str__  s0    #$""$$8t'7'7&99JDMM;KLLr9   )
r  r  r  r   rh  r  r  r  r  r~  )rb  )r#  r}   r$  r  r!  r  r&  rG  r  r}   rh  r}   r   r   rH  r   r  )r4   r5   r6   r7   r   rd   r  r.  r  r  r1  r  r8   r  r  s   @r:   r  r  /  s      "

 ?
 @	

 "
 
 
 

 
0;5*51=5	5n#,JM Mr9   r  c                  j   ^  \ rS rSr            SU 4S jjrS r      SS jrS	S jrSrU =r	$ )
CppBenchmarkRequesti  c                `   > [         TU ]  XX45        XPl        [        U5      U l        S U l        g rA   )rt  rd   r  r   r  r  )rc   r#  r$  r!  r&  r  r  s         r:   rd   CppBenchmarkRequest.__init__  s.     	9KX& -6:r9   c                    [         R                  SU 5        [        R                  " U R                  SS9  [         R                  SU 5        g )Nr  r  rh  r  )rP   rQ   r   r^   r  r   s    r:   r  CppBenchmarkRequest.precompile  s<     	.5$**>3T:r9   c               h   [         R                  " U R                  SS9U l        [	        U5      U/-    Vs/ s H  o3R                  5       PM     nn[        R                  SU R                  U R                  UU R                  5        [        U R                  U R                  5      n[        S U R                   5       5      (       d   e[        R                  /[        U5      [        [	        U R                  5      5      -   -  Ul        [         R"                  " U/UQU R                  Q76 $ s  snf )Nr  r  zJmake_run_fn: self.kernel_name=%s, self.DLL=%s, args=%s, self.extra_args=%sc              3  V   #    U  H  n[        U[        R                  5      v   M!     g 7frA   )r   ctypesc_ulonglong)r   args     r:   r   2CppBenchmarkRequest.make_run_fn.<locals>.<genexpr>  s      R/3:c6#5#566/s   '))r   r^   r  r  r   r  rP   rQ   r#  r&  r  r  r  r  r   argtypesr  r  )rc   r,  r-  r_  r  r  s         r:   r.  CppBenchmarkRequest.make_run_fn  s     $$T%5%55I04]0Cse0KL0Kf!0KLXHHOO	
 TXXt'7'78
R$//RRRRR%112ID122


   

 __
 	
! Ms   D/c                "    SU R                   < 3$ )Nr  )r#  r   s    r:   r  CppBenchmarkRequest.__str__  s    #$""$%%r9   )r  r  r  )r#  r}   r$  r  r!  r  r&  rG  r  r}   r   r   rH  r  )
r4   r5   r6   r7   rd   r  r.  r  r8   r  r  s   @r:   r  r    so    ;; ?; @	;
 "; ; 
;;
*
1=
	
6& &r9   r  c                  ^   ^  \ rS rSrSr            SU 4S jjr      SS jrSrU =r$ )CuteDSLBenchmarkRequesti  z;Benchmark request for CuteDSL (CUTLASS Python DSL) kernels.c                   > [         TU ]  XX45        UR                  5       n[        R                  " U5      u  U l        U l        g rA   )rt  rd   finalize_allr   r  rv  ru  )rc   r#  r$  r!  r&  r  finalized_coder  s          r:   rd    CuteDSLBenchmarkRequest.__init__   s>     	9KX$1132=2C2CN2S/t/r9   c          	     n  ^^^	 [         R                  " U R                  U R                  5      nSSKJn  U R                   SU 3n[        X55      (       dG  [        U5       Vs/ s H   n[        [        X65      5      (       d  M  UPM"     nn[        SU SU 35      e[        X55      m	UU	U4S jnU$ s  snf )z
Create a function to run the CuteDSL kernel with the given input and output tensors.
Similar to TritonBenchmarkRequest.make_run_fn but for CuteDSL kernels.
r)   )MAIN_SUFFIXr   z-Could not find CuteDSL main kernel function 'z'. Available callables: c                 ~   > [        S5      n U R                  TR                  R                  5      nT" / TQTP7SU06$ )Nrb  r  )r   r  ra   r]  )r   r  r-  kernel_funcr,  s     r:   
run_kernel7CuteDSLBenchmarkRequest.make_run_fn.<locals>.run_kernel#  s@    7?%44SZZ5E5EFFBBsB6BBr9   )r   r  rv  ru  codegen.cutedsl.cutedsl_kernelr#  r#  hasattrdircallabler  r  )
rc   r,  r-  r  r#  main_func_namer   	availabler&  r%  s
    ``      @r:   r.  #CuteDSLBenchmarkRequest.make_run_fn  s     **4+@+@$BRBRS 	@ ,,-Q{m<s++*-c(S($hws?Q6R(IS??OOghqgrs  c2	C
  Ts   #B2B2)rv  ru  )r#  r}   r$  r  r!  r  r&  ztuple[Any, ...]r  r'   r   r   rH  )	r4   r5   r6   r7   r   rd   r.  r8   r  r  s   @r:   r  r    so    ETT ?T @	T
 $T #T 
T*1=	 r9   r  c                 Z    [        5       n [        R                  " U R                  5        U $ rA   )r   atexitry   r   )pools    r:   get_tuning_process_poolr2  +  s    D
OODMM"Kr9   c                4    [        5       R                  U 5      $ )zG
Do benchmarking in a subprocess and return the perf number (latency).
)r2  r   )r   s    r:   benchmark_in_sub_processr4  2  s     #$..w77r9   c                      \ rS rSr% SrSrS\S'   \R                  " 5       r	S\S'   Sr
S	\S
'   S r\S 5       r\S 5       rSS jrSS jrSS jrS rSS jrSS jrSS jrS r\S 5       rSrg)AutotuneProcessPooli;  zf
Singleton pool manager for running autotuning (precompilation + benchmarking)
in a separate process.
NzOptional[AutotuneProcessPool]	_instancezthreading.Lock_lockFr   _shutdown_for_inactivityc                t    U R                  5       U l        S U l        S U l        U R	                  5       U l        g rA   )
_init_pool_pool_warmup_future_warmup_start_time_init_timer_timerr   s    r:   rd   AutotuneProcessPool.__init__E  s0    151B
2604$($4$4$6r9   c                    U R                   c:  U R                     U R                   c  U " 5       U l         SSS5        U R                   $ U R                   $ ! , (       d  f       U R                   $ = f)z*Get or create the singleton pool instance.N)r7  r8  r  s    r:   get_instance AutotuneProcessPool.get_instanceK  sV     == ==($'ECM  }}s}}	  }}s   A
A-c                    [         R                  (       d   S5       eU R                  c*  U R                  5       U l        U R	                  5       U l        U R                  $ )zGet the process pool.zFTo use AutotuneProcessPool, pipeline_max_autotune_gemm must be enabled)r*   pipeline_max_autotune_gemmr<  r;  r?  r@  r   s    r:   r1  AutotuneProcessPool.poolU  sQ     00 	
T	
0 ::*DJ**,DKzzr9   c                L    [         S:  a  [        [         U R                  5      $ g )Nr   ) AUTOTUNE_POOL_INACTIVITY_TIMEOUTr   _on_inactivity_timeoutr   s    r:   r?  AutotuneProcessPool._init_timer`  s!    +a/94;V;VWWr9   c                T    U R                   b  U R                   R                  5         g g rA   )r@  record_callr   s    r:   _record_activity$AutotuneProcessPool._record_activitye  s!    ;;"KK##% #r9   c                   [         R                  S[        5        U R                     U R                  b   U R                  R                  SS9  S U l        S U l        S[        l        S S S 5        g ! , (       d  f       g = f)NzAAutotuneProcessPool shutting down due to inactivity (timeout=%ds)Fr   T)	rP   r@  rJ  r8  r<  r   r@  r6  r9  r   s    r:   rK  *AutotuneProcessPool._on_inactivity_timeouti  sb    O,	

 ZZzz%

###/!
DK
 <@8 ZZs   A A00
A>c                    [         R                  " S5      n[        SUS9n[        R                  " U R
                  5        [        R                  S5        U$ )z
Get or create the process pool.

Uses ProcessPoolExecutor with 'spawn' context for CUDA safety.
ProcessPoolExecutor is lazily initialized - workers are not spawned
until the first submit() call, making this property non-blocking.
spawnr)   )r   
mp_contextz2AutotuneProcessPool created (workers spawn lazily))mpget_contextr   r0  ry   	_shutdownrP   r@  )rc   ctxr1  s      r:   r;  AutotuneProcessPool._init_poolz  sH     nnW%"
 	'PQr9   c                   U R                   c  U R                     U R                   c  [        R                  " 5       U l        U R
                  R                  [        [        R                  R                  R                  R                  S9U l         U R                   R                  U R                  5        [        R!                  S5        SSS5        U R                   $ U R                   $ ! , (       d  f       U R                   $ = f)z
Submit a warmup job to eagerly spawn workers and initialize CUDA.

This is optional - call it early to hide spawn latency.
Returns the warmup future which can be ignored or awaited.
N)
allow_tf32zWarmup job submitted)r=  r8  r?  perf_counterr>  r1  submit_init_autotune_subprocessrZ  backendsrb  matmulr\  add_done_callback_on_warmup_completerP   r@  r   s    r:   warm_upAutotuneProcessPool.warm_up  s     &&&..2.?.?.AD+*.))*:*:1#(>>#6#6#=#=#H#H +; +D' ''99$:R:RS"''(>?  """t"""  """s   B1C++
Dc                
   SnU R                   b"  [        R                  " 5       U R                   -
  n UR                  5       n[        R                  SUU5        g! [         a  n[        R                  SU5        UeSnAff = f)z/Callback invoked when the warmup job completes.NzEAutotuneProcessPool warmup completed successfully in %.4f seconds: %sz4AutotuneProcessPool warmup failed after %.4f seconds)r>  r?  r]  rJ   rP   r@  rF   r   )rc   futurewarmup_elapsed_timerJ   rK   s        r:   rc  'AutotuneProcessPool._on_warmup_complete  s    """."&"3"3"58O8O"O	]]_FW#
  	  F# G	s   'A 
B%A==Bc                   ^  T R                   R                  " U/UQ70 UD6nT R                  b  UR                  U 4S j5        U$ )z-Submit a job to the pool and return a Future.c                $   > TR                  5       $ rA   )rO  )r   rc   s    r:   r  ,AutotuneProcessPool.submit.<locals>.<lambda>  s    t/D/D/Fr9   )r1  r^  r@  rb  )rc   r6  r  r  rg  s   `    r:   r^  AutotuneProcessPool.submit  s?    !!"6t6v6;;"$$%FGr9   c                    U R                   b!  U R                   R                  5         SU l         U R                  b!  U R                  R                  SS9  SU l        gg)zShutdown the pool on exit.NFr   )r@  quitr<  r   r   s    r:   rX  AutotuneProcessPool._shutdown  sN    ;;"KKDK::!JJU+DJ "r9   c                    U R                   bD  U R                     U R                   b!  U R                   R                  5         SU l         SSS5        gg! , (       d  f       g= f)z+Explicitly shutdown the singleton instance.Nr7  r8  rX  rC  s    r:   shutdown_instance%AutotuneProcessPool.shutdown_instance  sH     ==$==,MM++-$(CM  %s   /A
A!)r<  r@  r=  r>  )r   zTimer | Noner   )r   Future[Any])rg  ru  r   r   )r4   r5   r6   r7   r   r7  r  	threadingLockr8  r9  rd   r  rD  propertyr1  r?  rO  rK  r;  rd  rc  r^  rX  rs  r8   r2   r9   r:   r6  r6  ;  s    
 04I,3%NN,E>,%*d*7    
&@"(#(( ) )r9   r6  c                 Z    [         R                  =(       a    [        R                  (       + $ rA   )r*   rG  r6  r9  r2   r9   r:   use_pipelined_autotuningrz    s!    )) 	=#<<<r9   c                    SSK nUR                  R                  5       (       a  UR                  " SSS9  XR                  R                  R
                  l        g)z1
Warmup function run in the autotune subprocess.
r   Nr)   rb  r   T)rZ  rb  is_availabler  r`  ra  r\  )r\  rZ  s     r:   r_  r_    sC      zz  Af%,6NN)r9   c                     U R                  5       nU$ ! [         a$    [        R                  SU 5        [	        S5      s $ f = f)a
  
Run autotuning benchmarks in a subprocess.

This function is submitted to AutotuneProcessPool and runs in isolation
to prevent GPU contention with the main compilation process.

Args:
    picklable_choices: List of picklable choice information

Returns:
    timing
zFailed to benchmark choice %sr   )r   rF   rP   r   r   )benchmark_requesttimings     r:   run_autotune_in_subprocessr    sI     ",,. +	

 U|s    +AAc                      \ rS rSr% SrSrS\S'   \R                  " 5       r	SSS jjr
\SS j5       rS rSSS	 jjr\SS
 j5       rSrg)PrecompileThreadPooli  z
Thread pool for running precompilation asynchronously.

This allows the main compilation process to continue while
precompilation happens in background threads.
NzOptional[PrecompileThreadPool]r7  c                     [        US9U l        g )Nr   )r	   	_executor)rc   r   s     r:   rd   PrecompileThreadPool.__init__  s    +Dr9   c                    SSK Jn  U R                  c@  U R                     U R                  c  U " U" 5       5      U l        S S S 5        U R                  $ U R                  $ ! , (       d  f       U R                  $ = f)Nr   )get_num_workers)r  r  r7  r8  )r  r  s     r:   rD  !PrecompileThreadPool.get_instance  s]    D== ==($'(9$:CM  }}s}}  }}s    A  
A9c                B    U R                   R                  " U/UQ70 UD6$ rA   )r  r^  )rc   r6  r  r  s       r:   r^  PrecompileThreadPool.submit  s!    ~~$$R9$9&99r9   c                4    U R                   R                  US9$ )Nr   )r  r   r   s     r:   rX  PrecompileThreadPool._shutdown   s    ~~&&D&11r9   c                    U R                   bC  U R                     U R                   b   U R                   R                  SS9  S U l         S S S 5        g g ! , (       d  f       g = f)NFr   rr  rC  s    r:   rs  &PrecompileThreadPool.shutdown_instance#  sK    ==$==,MM+++7$(CM  %s   .A
A )r  )   )r   r   )r   r  )F)r   r   r   )r4   r5   r6   r7   r   r7  r  rv  rw  r8  rd   r  rD  r^  rX  rs  r8   r2   r9   r:   r  r    sX     15I-4NNEE  :2 ) )r9   r  c                  d    \ rS rSrSr0 r\SS j5       r\S	S j5       r	\      S
S j5       r
Srg)AsyncAutotuneri,  a  
Handles asynchronous autotuning of kernel choices in a separate process.

This class manages the lifecycle of autotuning:
1. Accepts precompiled choices from the main process
2. Submits benchmarking work to AutotuneProcessPool
3. Returns results via a Future

Usage:
    autotuner = AsyncAutotuner(choices)
    autotuner.start()  # Kicks off async benchmarking
    timings = autotuner.get_results()  # Blocks until complete
c                (    U R                  5       U-   $ rA   )r  )r   
inputs_keys     r:   get_choice_hashAsyncAutotuner.get_choice_hash=  s     :--r9   c                    U H  n[         R                  X25      nU[         R                  ;   a  M.  [        USS5      c   S5       e[        R                  5       R                  [        UR                  5      nU[         R                  U'   M     g)z
Start asynchronous autotuning in a subprocess.

This method:
1. Extracts picklable benchmark requests from choices
2. Submits benchmarking work to AutotuneProcessPool
3. Returns immediately (non-blocking)
r   Nzbmreq is None for choice)	r  r  choice_hash_to_futurer  r6  rD  r^  r  r   )r  r   r  r   choice_hashautotune_futures         r:   rb   AsyncAutotuner.startA  s     F(88LKnBBB67D1= *= 2>>@GG*O
 APN00= r9   c                    0 nU H;  n[         R                  XB5      n[         R                  U   R                  5       X4'   M=     U$ )z
Get autotuning results, blocking until complete.

Args:
    timeout: Maximum time to wait in seconds. None means wait forever.

Returns:
    Dict mapping ChoiceCaller to benchmark timing
)r  r  r  rJ   )r  r   r  timingsr   r  s         r:   get_resultsAsyncAutotuner.get_results]  sE     F(88LK,BB;OVVXGO  r9   r2   N)r   r&   r  r}   r   r}   )r   list[ChoiceCaller]r  r}   )r   r  r  r}   r   zdict[ChoiceCaller, float])r4   r5   r6   r7   r   r  r   r  r  rb   r  r8   r2   r9   r:   r  r  ,  sc     . . P P6 (69	" r9   r  )r   r   r   r   )r\  r   r   r   )r~  r  r   r   )p
__future__r   r0  r  dataclassesr  r=  multiprocessingrV  rC   rX   r   rv   r   r{   rv  r?  r   collections.abcr   r   r   concurrent.futuresr   r   r	   r
   r   r   r   typingr   r   r   r   r   rZ  torch._inductor.async_compiletorch._dynamo.device_interfacer   torch._dynamo.testingr   torch._inductorr   torch._inductor.codecacher   r   r   r   r   $torch._inductor.compile_worker.timerr   torch._inductor.utilsr   r   r   r   r    torch._loggingr!   torch.utils._ordered_setr"   r   rD   rS   rJ  typesr%   r  r&   r'   r(   r  r*   runtime.benchmarkingr+   virtualizedr,   r-   r4   rP   rF   r0   r=   r   r  r  LayoutOrBuffer	dataclassr   r  rL  rW  rm  rr  r  r  r  r  r  r  r  r  cacher2  r4  r6  rz  r_  r  r  r  r2   r9   r:   <module>r     s~   "       	     
    8 8 N N 2 2 : :  $ C .   7  - /
 $'JJNNCUK$       -  . "8\:		 	t tnl l^ ryy"))+, *
 *
 *
Z c c cL, D   F- -sU- sUl	 79O 		 79O 	HE#3 HEV	9		9	TM57G TMn4&13C 4&n+57G +\  8'8&8R) R)j'
>$) $)NC Cr9   