
    N j&                     P   S SK r S SKJrJr  S SKrS SKJr  S SKJr  S SK	J
r
  SSKJrJrJr  SSKJr  SS	KJrJrJrJr  SS
KJrJrJrJr  SSKJrJrJrJ r J!r!J"r"J#r#  SSK$J%r%J&r&  SSK'J(r(J)r)J*r*J+r+  \(       a  SSKJ,r,  SSKJ-r-  \ R\                  " \/5      r0\RJ                  Rb                  r1\S 5       r2\" S\2\
" S5      SS9r3\" \Rh                  S\1Rh                  Rj                  S9r6\" \Rh                  \Rn                  Rq                  5       (       a  SOSS\1Rh                  Rr                  S9r:\" \Rv                  S\1Rv                  Rj                  S9r<\Rz                  " \1Rh                  5      S!SS.S jj5       r>\Rz                  " \1Rv                  5      SSSS.S  j5       r?g)"    N)TYPE_CHECKINGUnion)counters)CKGemmTemplate)load_kernel_template   )configirlowering)MMKernelInputs)	loweringsmake_pointwisemake_reductiontransform_args)autotune_select_algorithmExternKernelChoiceSymbolicGridFnTritonTemplate)_use_cutlass_for_opuse_aten_gemm_kernelsuse_ck_gemm_templateuse_cpp_bmm_templateuse_cutlass_templateuse_nv_universal_gemm_templateuse_triton_template)opsV   )_is_static_problemis_batch_stride_largest_or_zeromm_argsuse_native_matmul)ChoiceCaller)KernelTemplatec                6    U" XS   5      U" X#S   5      -  U S4$ )NBLOCK_MBLOCK_Nr    )bmnmetacdivs        k/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/torch/_inductor/kernel/bmm.pybmm_gridr/   -   s&    O$tAI'??AFF    bmm
triton_bmmT)namegridsource"cache_codegen_enabled_for_templatezat::bmm_out)op_overloadzat::_bmm_out_dtype_xpuzat::_bmm_out_dtype_cuda	bmm_dtype)r3   r7   zat::baddbmm_outlayoutc          
      ~	  ^ ^ [        S T U4 5       5      (       Ga  T R                  5       S   S:X  d  UR                  5       S   S:X  aX  [        R                  " T S5      m [        R                  " US5      n[        R                  " [        R
                  " T U5      SS9$ S nS mU4S jnU" T 5      (       a0  [        R                  R                  R                  S	   nU" T U5      m U" U5      (       a/  [        R                  R                  R                  S   nU" X5      n[        T U5      (       a  [        [        R                     " T S5      m [        [        R                     " US5      n[        T U/0 S
SSS9u  p[        R                  R                   (       aU  T R"                  [$        R&                  [$        R(                  4;   a'  U 4S jn
U Vs/ s H  n[+        U
5      " U5      PM     nn[+        [,        R.                  5      " U6 n[1        S5      " US5      nU$ [3        T XUS9u  pnnm nSn[5        T U/US9nT R                  5       S	   n[6        S   SU SU SU SU 3==   S-  ss'   [8        R;                  SUUUUT R=                  5       UR=                  5       U5        [>        n0 nU(       a/  T RA                  5       RB                  S;   d   S5       e[D        nSU0n/ n/ n0 n[G        5       (       a   URI                  U5        UUURJ                  '   [M        USS9(       a,  Ub  UT R=                  5       :X  a  URI                  [N        5        URQ                  [        RR                  RU                  UUUUS95        [W        U5      u  nn[Y        T X5      nU(       aP  U(       aI  [[        X>UU5      (       a7  []        U5      (       a'  SSK/J0n  URc                  UUURe                  5       5        [g        UT U5      (       a'  SSK4J5n  URm                  UUURe                  5       5        [o        X>UU5      (       a&  [p        Rr                  " UUURe                  5       5        U(       a$  [u        X>UUT U5      (       a  SSK;J<n  U" UUU5        [{        UUURe                  5       U5      $ s  snf )zX
Lowering for autotuning aten.bmm with different backends (Aten, Triton, CUTLASS, etc.)
c              3   Z   #    U  H!  oR                  5       R                  S :H  v   M#     g7f)cpuN)
get_devicetype).0xs     r.   	<genexpr>tuned_bmm.<locals>.<genexpr>M   s     
>A<<>%'s   )+r   r   )axisc                     [         R                  " U 5      (       d  g[         R                  " U SS9u  p[        U[         R                  5      $ )NTF)freeze)r
   is_storage_and_layoutas_storage_and_layout
isinstanceFlexibleLayout)t_r:   s      r.   is_valid_to_require_contiguous1tuned_bmm.<locals>.is_valid_to_require_contiguousT   s=    ++A..005AIAfb&7&788r0   c                     US   S:H  =(       a    U S   S:H  =(       d    US   U S   :  =(       d)    US   S:H  =(       a    U S   S:H  =(       d    US   U S   :  $ )NrD   r   r(   )sizesstridess     r.    is_preferred_layout_as_bmm_input3tuned_bmm.<locals>.is_preferred_layout_as_bmm_inputZ   sf     q QeBi1n&PuRy8PU"+"Sb	Q(R'"+r:RUr0   c                    > UR                   S   R                  5       nUR                   S   R                  5       nT" X#5      (       d  [        R                  R                  U 5      n U $ )Nval)r,   sizestrider
   ExternKernelrequire_contiguous)rL   meta_trR   rS   rT   s       r.   may_require_contiguous)tuned_bmm.<locals>.may_require_contiguousd   sU    KK&++-Ekk%(//1G3ECCOO66q9Hr0   r   TNF)argskwargs	broadcasttype_promotion_kindconvert_input_to_boolc                 D   > [         R                  " U TR                  SS9$ )NF)use_compute_types)r   to_dtypedtype)rA   mat1s    r.   	_to_dtypetuned_bmm.<locals>._to_dtype   s    ||AtzzUKKr0   dot)r:   	out_dtyper1   )rl   aten_mm_infoz	aten.bmm_rM   zZTuned aten.bmm: batch=%s, m=%s, n=%s, k=%s, mat1_dtype=%s, mat2_dtype=%s, output_layout=%s)cudaxpuz+out_dtype is only supported for CUDA or XPUrl   check_max_autotune)kwarg_overrides)CUTLASS3xGemmTemplate)CppBmmTemplate)add_nv_universal_gemm_choices)>allget_sizeL	unsqueezesum_mulr   graphcurrent_noder_   r"   r   atenr   inductor_configtritoncodegen_upcast_to_fp32rg   torchfloat16bfloat16r   r   rk   r   r!   r   r   loginfo	get_dtypeaten_bmmr>   r?   aten_bmm_dtyper   appenduidr   bmm_templateextendchoicesget_template_configsr   r    r   r   codegen.cutlass.gemm_templaters   add_cutlass_gemm_choicesnodesr   codegen.cpp_bmm_templatert   add_choicesr   r   add_ck_gemm_choicesr   codegen.nv_universal_gemmru   r   ) rh   mat2rl   r:   rN   r]   	meta_mat1	meta_mat2r_   r`   ri   rA   mul_pointwisedot_reductionr*   r+   kr3   kernel_inputs
batch_sizeaten_handleraten_extra_kwargsr   templates_to_userr   rM   
is_nonzerobatch_stride_largest_or_zerors   rt   ru   rT   s    `                              @r.   	tuned_bmmr   H   sQ   
 
>$
>>>==?1"dmmoa&8A&=;;tR(D;;tQ'D66!%%d+!44	9	U	 *$//,,11!4I)$	:D)$//,,11!4I)$:Dt$$(r2(q1% $"'
 !!88TZZMMNNL
 >

L ;??$QN9-a0$D?&sww/6&u-mQ? #*dY#A!VT4 D #D$<9EM #J^yAaS!AaSABaGBHHd				 (0L %%8 	
9	
8 &()4"$G IKO-,=(()6e<Y$..*:: 	- NN			&&+	 	' 	
 'v.MAz#B4#V $ Aq11%%I66V]002	
 FD$//=""!	
 Fq!,,**7FM<O<O<QR4V1dDQQM%gv}E$T7M4G4G4I6RR @s   R:)alphabetar:   c                   [        X5      (       a  US:X  a  SnO[        [        R                     " X@5      nUS:X  a  SnO9[        [        R                     " U[        [        R                     " X5      5      n[        [        R
                     " Xg5      $ [        XXS9u  pppn [        XU/[        X4S9S9nUR                  5       S   n[        S   SU SU SU	 SU
 3==   S-  ss'   [        R                  S	UUU	U
UR                  5       UR                  5       U R                  5       U5	        S
n/ n/ n[        5       (       a  UR                  [         5        [#        USS9(       a  UR                  [$        5        UR'                  [(        R*                  R-                  XU5      5        [/        XUR1                  5       U5      $ )zW
Lowering for autotuning aten.mm with different backends (Aten, Triton, CUTLASS, etc.)
r   r9   )r   r   )scalarsrm   zaten.baddbmm_rM   r   zkTuned aten.baddbmm: batch_size=%s, m=%s, n=%s, k=%s, mat1_dtype=%s, mat2_dtype=%s, inp=%s, output_layout=%sbaddbmmFrp   )r"   r   r~   r{   r1   addr!   r   dictrw   r   r   r   r   r   r   aten_baddbmmr   r   r   r   r   r   r   r   )inprh   r   r   r   r:   arg1arg2r*   r+   r   r   r   r3   r   r   s                   r.   tuned_baddbmmr      s   
 $$19DTXX&t1DA:DTXX&ui.A$.MND"4.. (/t3'N$A!T #	D4e#?M
 #J^}ZL!AaS!EF!KFHHu			
 D"$G IK-6e<- NN			&&}M %TM4G4G4I6RRr0   )N)@loggingtypingr   r   r   torch._dynamo.utilsr   7torch._inductor.codegen.rocm.ck_universal_gemm_templater    torch._inductor.kernel.mm_commonr    r	   r   r
   r   rx   r   r   r   r   r   r   select_algorithmr   r   r   r   utilsr   r   r   r   r   r   r   virtualizedr   r   	mm_commonr   r    r!   r"   r#   r$   	getLogger__name__r   r~   r/   r   r1   outr   ro   is_available	dtype_outr   r   r   register_loweringr   r   r(   r0   r.   <module>r      s~    '  ( R A ; ; * P P    !  !1!yy~~ G G 		-'+	 eiiDHHLLQ#	II %		 6 6 8 8>W	""	 "	MM$$,,2B2B
 TXX[SD [S [S| T\\",-Ad 8S #8Sr0   