
    R j                     ,	   % S SK Jr  S SKrS SKrS SKJrJrJr  SSKJ	r	  S SK
JrJr  S SKJr  S SKJr  S S	KJr  S S
KJr  S SKJr  S SKJr  S SKJr  S SKrSS/r\" S5      r\" S5      r\R<                  " \5      r  S SK!J"r#  \RN                  RP                  r(S r)0 r*\+\\4   \,S'   S r-S@S\\\\4   /\\\4   4   4S jjr.\." \(R^                  5      SS.S\04S jj5       r1\." \(Rd                  5      SAS\04S jj5       r3\." \(Rh                  5      SAS\04S jj5       r5\." \(Rl                  5      SAS\04S jj5       r7\." \(Rp                  5           SBS\04S  jj5       r9 S@S!\:\0   S"\:\0   S#\:\0   S$\;S\04
S% jjr<\." \(Rz                  \(R|                  \(R~                  \(R                  \(R                  /5      SS.S\04S& jj5       rB\." \(R                  5      S\04S' j5       rDS( rE\." \(R                  \(R                  \(R                  /5      SS.S\04S) jj5       rIS* rJSS+.S\\K\K\0S,4   \K\0S,4   \K\0S,4   \K\0S,4   S-  4      4S- jjrLSS+.S\\K\K\0S,4   \K\0S,4   \K\0S,4   \K\0S,4   S-  4      4S. jjrM\." \(R                  S/S09SS.S\04S1 jj5       rO\." \(R                  S/S09S\04S2 j5       rQS3 rR\." \(R                  \(R                  \(R                  /5      SS.S\04S4 jj5       rV\." \(R                  S/S09S\04S5 j5       rX\." \(R                  S/S09S\04S6 j5       rZ0 \(R^                  \1_\(Rd                  \3_\(Rh                  \5_\(Rl                  \7_\(Rp                  \9_\(Rz                  \B_\(R|                  \B_\(R~                  \B_\(R                  \B_\(R                  \B_\(R                  \D_\(R                  \I_\(R                  \I_\(R                  \I_\(R                  \V_\(R                  \V_\(R                  \V_\(R                  \O\(R                  \Q\(R                  \X\(R                  \Z0Er*S7 r[/ S8Qr\S9 r]S: r^S\_4S; jr`S< ra " S= S5      rb " S> S?\5      rcg! \$ a+    \%" S S 5       5      (       a  \ RM                  S5        \r# GNf = f)C    )NoneTypeN)tree_maptree_flattentree_unflatten   )ModuleTracker)AnyTypeVar)Callable)Iterator)	ParamSpec)defaultdict)TorchDispatchModeprodwrapsFlopCounterModeregister_flop_formula_T_PJITFunctionc              #   \   #    U  H"  n[        [        R                  US 5      S Lv   M$     g 7fN)getattrtorchversion).0attrs     i/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/torch/utils/flop_counter.py	<genexpr>r"      s$     
]F\d75==$-T9F\s   *,)cudahipxpuz@triton not found; flop counting will not work for triton kernelsc                 \    [        U [        R                  5      (       a  U R                  $ U $ r   )
isinstancer   Tensorshape)is    r!   	get_shaper+   #   s!    !U\\""wwH    flop_registryc                 8   ^  [        T 5      S S.U 4S jj5       nU$ )N)out_valc                 B   > [        [        XU 45      u  pnT" USU0UD6$ )N	out_shape)r   r+   )r/   argskwargsr1   fs       r!   nfshape_wrapper.<locals>.nf+   s.    "*9tW6M"Ni$6)6v66r,   r   r4   r5   s   ` r!   shape_wrapperr8   *   s#    
1X 7 7 Ir,   returnc                 h   ^ ^ S[         [        [        4   S[         [        [        4   4UU 4S jjnU$ )Nflop_formular9   c                    >^  T(       d  [        T 5      m SU 4S jjn[        R                  R                  R	                  UT5        T $ )Nc                    > [        U [        R                  R                  [        45      (       d  [        SU  S[        U 5       35      eU [        ;   a  [        SU  35      eT[        U '   g )Nz|register_flop_formula(targets): expected each target to be OpOverloadPacket (i.e. torch.ops.mylib.foo), or JitFunction, got z which is of type zduplicate registrations for )	r'   r   _opsOpOverloadPacket_JITFunction
ValueErrortyper-   RuntimeError)targetr;   s    r!   register=register_flop_formula.<locals>.register_fun.<locals>.register7   sp    v

(C(C\'RSS #H$6tF|nFG G &"%A&#JKK$0M&!r,   )r9   N)r8   r   utils_pytree	tree_map_)r;   rE   get_rawtargetss   ` r!   register_fun+register_flop_formula.<locals>.register_fun3   s7    (6L	1 	%%h8r,   )r   r   r   )rK   rJ   rL   s   `` r!   r   r   1   s5    8BF#3 R8H  & r,   )r1   c                R    U u  pVUu  pxXg:w  a  [        SU SU 35      eXX-  S-  U-  $ )zCount flops for matmul.z3matmul: inner dimensions must match (k == k2), got  and    AssertionError)	a_shapeb_shaper1   r2   r3   mkk2ns	            r!   mm_floprY   H   sE    
 DAEBwRSTRUUZ[]Z^_``519q=r,   c                     [        X5      $ )zCount flops for addmm.rY   
self_shaperS   rT   r1   r3   s        r!   
addmm_flopr^   T   s     7$$r,   c                     U u  pEnUu  pxn	XG:w  a  [        SU SU 35      eXh:w  a  [        SU SU 35      eXE-  U	-  S-  U-  n
U
$ )z"Count flops for the bmm operation.z0bmm: batch dimensions must match (b == b2), got rO   z0bmm: inner dimensions must match (k == k2), got rP   rQ   )rS   rT   r1   r3   brU   rV   b2rW   rX   flops              r!   bmm_floprc   Y   ss    
 GA!IBAwOPQsRWXZW[\]]wOPQsRWXZW[\]]519q=1DKr,   c                     [        X5      $ )z&Count flops for the baddbmm operation.)rc   r\   s        r!   baddbmm_flopre   h   s    
 G%%r,   c	                     [        X5      $ )zCount flops for _scaled_mm.r[   )
rS   rT   scale_a_shapescale_b_shape
bias_shapescale_result_shape	out_dtypeuse_fast_accumr1   r3   s
             r!   _scaled_mm_floprm   o   s     7$$r,   x_shapew_shaper1   
transposedc                 |    U S   nU(       a  U OUSS nUtpgn [        U5      [        U5      -  U-  U-  U-  S-  n	U	$ )a  Count flops for convolution.

Note only multiplication is
counted. Computation for bias are ignored.
Flops for a transposed convolution are calculated as
flops = (x_shape[2:] * prod(w_shape) * batch_size).
Args:
    x_shape (list(int)): The input shape before convolution.
    w_shape (list(int)): The filter shape.
    out_shape (list(int)): The output shape after convolution.
    transposed (bool): is the convolution transposed
Returns:
    int: the number of flops
r   rP   Nr   )
rn   ro   r1   rp   
batch_size
conv_shapec_outc_infilter_sizerb   s
             r!   conv_flop_countrw      s[    ( J''Y;J 'E+ 
d;//*<uDtKaODKr,   c                    [        XXvS9$ )zCount flops for convolution.rp   )rw   )
rn   ro   _bias_stride_padding	_dilationrp   r1   r2   r3   s
             r!   	conv_flopr~      s     7YNNr,   c                 0   S nSn U
S   (       a"  [        US   5      nU[        XX(       + 5      -  nU
S   (       aY  [        US   5      nU(       a#  U[        U" U 5      U" U5      U" U5      SS9-  nU$ U[        U" U5      U" U 5      U" U5      SS9-  nU$ )Nc                 4    U S   U S   /[        U SS  5      -   $ )Nr   r   rP   )list)r)   s    r!   tconv_backward_flop.<locals>.t   s$    a%(#d59o55r,   r   r   Fry   )r+   rw   )grad_out_shapern   ro   rz   r{   r|   r}   rp   _output_padding_groupsoutput_maskr1   r   
flop_countgrad_input_shapegrad_weight_shapes                   r!   conv_backward_flopr      s    6JDL 1~$Yq\2on?OQ_``
1~%il3/!N*;QwZK\I]joppJ
  /!G*a6GK\I]joppJr,   c                     U u  p4pVUu  pxpUu  ppX7s=:X  a  U:X  a!  O  OXHs=:X  a  U:X  a  O  OXj:X  a
  X:X  a  Xj:X  d  [        S5      eSnU[        X4-  XV4X4-  Xi45      -  nU[        X4-  XY4X4-  X45      -  nU$ )zR
Count flops for self-attention.

NB: We can assume that value_shape == key_shape
z8sdpa_flop_count: query/key/value shapes are incompatibler   rR   rc   )query_shape	key_shapevalue_shaper`   hs_qd_q_b2_h2s_k_d2_b3_h3_s3d_vtotal_flopss                   r!   sdpa_flop_countr     s     !NA#"Cc$Cc?s?!/c/3:]`]gWXXK8QUC-s/@AAK8QUC-s/@AAKr,   c                    [        XU5      $ )Count flops for self-attention.r   )r   r   r   r1   r2   r3   s         r!   	sdpa_flopr   ,  s     ;;??r,   c                     SSK Jn  SSKJn  [	        XU45      (       d8  U R
                  R                  S:w  a  U R                  5       R                  5       $ U/U R                  S5      S-
  -  $ )z
If the offsets tensor is fake, then we don't know the actual lengths.
In that case, we can just assume the worst case; each batch has max length.
r   )
FakeTensor)FunctionalTensormetar   )
torch._subclasses.fake_tensorr   #torch._subclasses.functional_tensorr   r'   devicerB   difftolistsize)offsetsmax_lenr   r   s       r!   _offsets_to_lengthsr   5  s\    
 9Dg,<=>>7>>CVCVZ`C`||~$$&&9Q!+,,r,   )grad_out.c              #     #    UGb+  [        UR                  5      S:w  a  [        S5      e[        UR                  5      S:w  a  [        S5      eUb%  UR                  U R                  :w  a  [        S5      eU R                  u  pn
UR                  u  pnUR                  u  pnUc  [        S5      eUc  [        S5      eUR                  UR                  :w  a  [        S5      e[        XF5      n[        XW5      n[	        UUS	S
9 H'  u  nnSU	UU
4nSUUU4nSUUU4nUb  UOSnUUUU4v   M)     gU R                  UR                  UR                  Ub  UR                  OS4v   g7f)a'  
Given inputs to a flash_attention_(forward|backward) kernel, this will handle behavior for
NestedTensor inputs by effectively unbinding the NestedTensor and yielding the shapes for
each batch element.

In the case that this isn't a NestedTensor kernel, then it just yields the original shapes.
N   z7sdpa_flop_count: expected key.shape to be 3-dimensionalz9sdpa_flop_count: expected value.shape to be 3-dimensionalzDsdpa_flop_count: grad_out.shape must match query.shape when providedz+sdpa_flop_count: cum_seq_q must not be Nonez+sdpa_flop_count: cum_seq_k must not be NonezAsdpa_flop_count: cum_seq_q and cum_seq_k must have the same shapeTstrictr   lenr)   rR   r   zip)querykeyvaluer   	cum_seq_q	cum_seq_kmax_qmax_k_h_qr   h_kd_kh_vr   seq_q_lengthsseq_k_lengths	seq_q_len	seq_k_lennew_query_shapenew_key_shapenew_value_shapenew_grad_out_shapes                          r!   %_unpack_flash_attention_nested_shapesr   A  sp    $  syy>Q !Z[[u{{q  !\]]HNNekk$A !ghhkkiikk !NOO !NOO??ioo- !dee+I=+I=&)-t&T"Y	 #y#6OY4M #y#6O4<4Hd!=/CUUU 'U 	
++syy%++AUx~~[_
__s   E&E(c              #     #    UGb.  [        UR                  5      S:w  a  [        S5      e[        UR                  5      S:w  a  [        S5      eUb%  UR                  U R                  :w  a  [        S5      eU R                  u    pn
UR                  u    pnUR                  u    pnUc  [        S5      eUc  [        S5      eUR                  UR                  :w  a  [        S5      e[        XF5      n[        XW5      n[	        UUS	S
9 H'  u  nnSU	UU
4nSUUU4nSUUU4nUb  UOSnUUUU4v   M)     gU R                  UR                  UR                  Ub  UR                  OS4v   g7f)a+  
Given inputs to a efficient_attention_(forward|backward) kernel, this will handle behavior for
NestedTensor inputs by effectively unbinding the NestedTensor and yielding the shapes for
each batch element.

In the case that this isn't a NestedTensor kernel, then it just yields the original shapes.
N   zQ_unpack_efficient_attention_nested_shapes: expected key.shape to be 4-dimensionalzS_unpack_efficient_attention_nested_shapes: expected value.shape to be 4-dimensionalz^_unpack_efficient_attention_nested_shapes: grad_out.shape must match query.shape when providedzH_unpack_efficient_attention_nested_shapes: cu_seqlens_q must not be NonezH_unpack_efficient_attention_nested_shapes: cu_seqlens_k must not be Noneza_unpack_efficient_attention_nested_shapes: cu_seqlens_q and cu_seqlens_k must have the same shapeTr   r   r   )r   r   r   r   cu_seqlens_qcu_seqlens_kmax_seqlen_qmax_seqlen_kr   r   r   r   r   r   r   	seqlens_q	seqlens_klen_qlen_kr   r   r   r   s                          r!   )_unpack_efficient_attention_nested_shapesr   u  s    $  syy>Q !tuuu{{q  !vwwHNNekk$A   "B  C  C131313 !kll !kll!3!33  "Z [ ['C	'C		9TBLE5 #uc2OUC0M #uc2O4<4Hd!=/CUUU C 	
++syy%++AUx~~[_
__s   E)E+T)rJ   c          
      D    [        U UUUUUUS9n
[        S U
 5       5      $ )r   )r   r   r   r   r   r   r   c              3   @   #    U  H  u  pp4[        XU5      v   M     g 7fr   r   r   r   r   r   r   s        r!   r"   0_flash_attention_forward_flop.<locals>.<genexpr>  &      6;2KK 	<<6;   r   sum)r   r   r   r   r   r   r   r1   r2   r3   sizess              r!   _flash_attention_forward_flopr     s?    " 2E  6;  r,   c           
      D    [        U UUUUUUS9n
[        S U
 5       5      $ )r   )r   r   r   r   r   r   r   c              3   @   #    U  H  u  pp4[        XU5      v   M     g 7fr   r   r   s        r!   r"   4_efficient_attention_forward_flop.<locals>.<genexpr>  r   r   r   r   )r   r   r   biasr   r   r   r   r2   r3   r   s              r!   !_efficient_attention_forward_flopr     s?    " 6!!!!E  6;  r,   c                    SnUu  pVpxUu  ppUu  pnnU u  nnnnXYs=:X  a  Us=:X  a  U:X  a  O  OXjs=:X  a  Us=:X  a  U:X  a  O  OX:X  d  [        S5      eUU:X  a  X:X  a  UU:X  d  [        S5      eSnU[        XV-  Xx4XV-  X45      -  nU[        XV-  UU4XV-  UU45      -  nU[        XV-  X4XV-  UU45      -  nU[        XV-  X{4XV-  X45      -  nU[        XV-  X4XV-  X{45      -  nU$ )Nr   zFsdpa_backward_flop_count: batch/heads/dimension mismatch among tensorszJsdpa_backward_flop_count: grad_out/value/key/query shapes are incompatibler   )r   r   r   r   r   r`   r   r   r   r   r   r   r   r   r   r   r   _b4_h4_s4_d4s                        r!   sdpa_backward_flop_countr     s2   K NA#"Cc$Cc3'Cc3!s!c!)?S)?C)?szeff#:SZsczijjK 8QUC-s/@AAK 8QUC-sC/@AAK8QUC-sC/@AAK 8QUC-s/@AAK8QUC-s/@AAKr,   c                    [        XX#5      $ )z(Count flops for self-attention backward.r   )r   r   r   r   r1   r2   r3   s          r!   sdpa_backward_flopr   	  s    
 $NXXr,   c
                 F    [        UUUU UUUU	S9n[        S U 5       5      $ )N)r   r   r   r   r   r   r   r   c              3   @   #    U  H  u  pp4[        XAX#5      v   M     g 7fr   r   r   r   r   r   r   s        r!   r"   1_flash_attention_backward_flop.<locals>.<genexpr>+  &      CI?KK 	!iUUCIr   r   )r   r   r   r   out	logsumexpr   r   r   r   r2   r3   shapess                r!   _flash_attention_backward_flopr     sB    " 3	F  CI  r,   c
                 F    [        UUUU UUUU	S9n[        S U 5       5      $ )N)r   r   r   r   r   r   r   r   c              3   @   #    U  H  u  pp4[        XAX#5      v   M     g 7fr   r   r   s        r!   r"   5_efficient_attention_backward_flop.<locals>.<genexpr>L  r   r   r   )r   r   r   r   r   r   r   r   r   r   r2   r3   r   s                r!   "_efficient_attention_backward_flopr   1  sB    " 7!!!!	F  CI  r,   c                 6    [        U [        5      (       d  U 4$ U $ r   )r'   tuple)xs    r!   normalize_tupler   j  s    atHr,   ) KMBTc                     [        S[        [        [        5      S-
  [        [	        U 5      5      S-
  S-  5      5      n[        U   $ )Nr   r   rP   r   )maxminr   suffixesstr)numberindexs     r!   get_suffix_strr  s  s=     3s8}q(3s6{+;a+?A*EFGEE?r,   c                 X    [         R                  U5      nU SU-  -  S nU[         U   -   $ )Ni  z.3f)r	  r  )r  suffixr  r   s       r!   convert_num_with_suffixr  z  s2    NN6"E%c*E8E?""r,   c                     US:X  a  gX-  S $ )Nr   0%z.2% )numdenoms     r!   convert_to_percent_strr    s    zk#r,   c                 0   ^  [        T 5      U 4S j5       nU$ )Nc                 >   > [        U 5      u  pT" U6 n[        X25      $ r   )r   r   )r2   	flat_argsspecr   r4   s       r!   r5   )_pytreeify_preserve_structure.<locals>.nf  s#    &t,	mc((r,   r   r7   s   ` r!   _pytreeify_preserve_structurer    s     
1X) )
 Ir,   c                     ^  \ rS rSrSr    SS\R                  R                  \\R                  R                     -  S-  S\	S\
S\\\4   S-  SS4
U 4S	 jjjrS\	4S
 jrS\\\\\	4   4   4S jrSS jrS rS rS rSrU =r$ )r   i  a  
``FlopCounterMode`` is a context manager that counts the number of flops within its context.

It does this using a ``TorchDispatchMode``.

It also supports hierarchical output by passing a module (or list of
modules) to FlopCounterMode on construction. If you do not need hierarchical
output, you do not need to use it with a module.

Example usage

.. code-block:: python

    mod = ...
    with FlopCounterMode(mod) as flop_counter:
        mod.sum().backward()

Nmodsdepthdisplaycustom_mappingr9   c                 n  > [         TU ]  5         [        S 5      U l        X l        X0l        S U l        Uc  0 nUb  [        R                  " SSS9  0 [        EUR                  5        VVs0 s H%  u  pVU[        USS5      (       a  UO
[        U5      _M'     snnEU l	        [        5       U l        g s  snnf )Nc                       [        [        5      $ r   )r   intr  r,   r!   <lambda>*FlopCounterMode.__init__.<locals>.<lambda>  s
    +VYJZr,   z<mods argument is not needed anymore, you can stop passing itrP   )
stacklevel_get_rawF)super__init__r   flop_countsr  r   modewarningswarnr-   itemsr   r8   r   mod_tracker)selfr  r  r   r!  rV   v	__class__s          r!   r*  FlopCounterMode.__init__  s     	6ABZ6[
-1	!NMMXefg

WeWkWkWmnWmtqqwq*e44!-:JJWmn
 )? os   +,B1c                 N    [        U R                  S   R                  5       5      $ )NGlobal)r   r+  valuesr1  s    r!   get_total_flopsFlopCounterMode.get_total_flops  s!    4##H-44677r,   c                     U R                   R                  5        VVs0 s H  u  pU[        U5      _M     snn$ s  snnf )zReturn the flop counts as a dictionary of dictionaries.

The outer
dictionary is keyed by module name, and the inner dictionary is keyed by
operation name.

Returns:
    Dict[str, Dict[Any, int]]: The flop counts as a dictionary.
)r+  r/  dict)r1  rV   r2  s      r!   get_flop_countsFlopCounterMode.get_flop_counts  s7     (,'7'7'='='?@'?tq47
'?@@@s   :c                 (  ^ ^
^^ Uc  T R                   nUc  SnSS KnSUl        / SQn/ nT R                  5       m
[	        T
5      mSmU
UUU 4S jn[        T R                  R                  5       5       HB  nUS:X  a  M  UR                  S5      S	-   nXq:  a  M&  U" XgS	-
  5      nUR                  U5        MD     ST R                  ;   a'  T(       d   U H  n	S
U	S   -   U	S'   M     U" SS5      U-   n[        U5      S:X  a  / SQ/nUR                  XCSS9$ )Ni?B r   T)ModuleFLOPz% TotalFc           	        > [        T
R                  U    R                  5       5      nT	UT:  -  m	SU-  n/ nUR                  X0-   [	        UT5      [        UT5      /5        T
R                  U    R                  5        H<  u  pVUR                  US-   [        U5      -   [	        UT5      [        UT5      /5        M>     U$ )N z - )r   r+  r7  appendr  r  r/  r
  )mod_namer  r   paddingr7  rV   r2  global_flopsglobal_suffixis_global_subsumedr1  s          r!   process_mod.FlopCounterMode.get_table.<locals>.process_mod  s     d..x8??ABK+"==EkGFMM"']C&{LA 
 ((288:eOc!f,+A}=*1l;  ; Mr,   r6  .r   rC  )r6  0r  )leftrightrO  )headerscolalign)r  tabulatePRESERVE_WHITESPACEr9  r  sortedr+  keyscountextendr   )r1  r  rR  headerr7  rJ  mod	mod_depth
cur_valuesr   rG  rH  rI  s   `         @@@r!   	get_tableFlopCounterMode.get_table  s%   =JJE=E 	'+$.++-&|4"	 	, $**//12Ch		#*I $Sa-8JMM*% 3 t'''0Bq>a   !1-6Fv;!+,F  B\ ]]r,   c                     U R                   R                  5         U R                  R                  5         [	        U 5      U l        U R
                  R                  5         U $ r   )r+  clearr0  	__enter___FlopCounterModer,  r8  s    r!   r`  FlopCounterMode.__enter__  sG     ""$$T*			r,   c                    U R                   c  [        S5      eU R                   R                  " U6 nS U l         U R                  R                  5         U R                  (       a$  [        U R                  U R                  5      5        U$ )Nz<Internal error: FlopCounter.__exit__ called but mode is None)r,  rR   __exit__r0  r   printr\  r  )r1  r2   r`   s      r!   rd  FlopCounterMode.__exit__  sf    99 !_``II%	!!#<<$..,-r,   c                     XR                   ;   a[  U R                   U   nU" U0 UDSU0D6n[        U R                  R                  5       H  nU R                  U   U==   U-  ss'   M     U$ )Nr/   )r-   setr0  parentsr+  )r1  func_packetr   r2   r3   flop_count_funcr   pars           r!   _count_flopsFlopCounterMode._count_flops  sm    ,,,"00=O($F&F#FJ4++334  %k2j@2 5
r,   )r  r   r+  r-   r0  r,  )NrP   TNr   )__name__
__module____qualname____firstlineno____doc__r   nnr@  r   r$  boolr<  r	   r*  r9  r
  r=  r\  r`  rd  rm  __static_attributes____classcell__)r3  s   @r!   r   r     s    * DH 48+((//D$99D@+ + 	+
 !cNT1+
 >B+ +*8 8
Ac4S>&9!: 
A<^~ r,   c                   @    \ rS rSrSrS\SS4S jrS rS rSS	 jr	S
r
g)ra  i   Tcounterr9   Nc                     Xl         g r   ry  )r1  ry  s     r!   r*  _FlopCounterMode.__init__#  s    r,   c                    SSK nUR                  U R                  R                  5      nU    U" U6 nSSS5        UR                  U R                  R                  5      nX@R                  l        WU4$ ! , (       d  f       NG= f)a]  Execute a branch function and capture its FLOP counts without
affecting self.counter.flop_counts

Args:
    branch_fn: The branch function to execute
    operands: Arguments to pass to the branch function

Returns:
    Tuple of (result, flop_counts) where result is the branch output
    and flop_counts is a copy of the FLOP counts after execution
r   N)copyry  r+  )r1  	branch_fnoperandsr~  checkpointed_flop_countsresultr+  s          r!   $_execute_with_isolated_flop_counting5_FlopCounterMode._execute_with_isolated_flop_counting&  sg     	#'99T\\-E-E#F )F ii 8 89#; {""	 Ts   A33
Bc                    U[         R                  R                  R                  [         R                  R                  R                  1;   nU(       au  SSKJn  SSKJn  U" US   5      n[        X5      (       d1  [        US5      (       a  UR                  nOO[        X5      (       d  M1  U R                  R                  US X45      $ U[         R                  R                  R                  L GaL  Uu  ppU R                  X5      u  pU[         L a  [         $ U R                  X5      u  nnU[         L a  [         $ [#        UR%                  5       5      [#        UR%                  5       5      -  n0 nU H  nUU   nUU   n0 n[#        UR%                  5       5      [#        UR%                  5       5      -  nU H6  nUR'                  US5      nUR'                  US5      n[)        UU5      UU'   M8     UUU'   M     UR+                  5        H.  u  nnU R                  R,                  U   R/                  U5        M0     U$ [         $ )Nr   )
get_kernelr   
kernel_idxfn)r   opshigher_ordertriton_kernel_wrapper_mutation triton_kernel_wrapper_functional*torch._higher_order_ops.triton_kernel_wrapr  triton.runtime.jitr   r'   hasattrr  ry  rm  condr  NotImplementedrh  rU  getr  r/  r+  update)r1  functypesr2   r3   	is_tritonr  r   kernel_namepredtrue_branchfalse_branchr  true_outtrue_flop_counts	false_outfalse_flop_countsall_mod_keysmerged_flop_counts	outer_keytrue_func_countsfalse_func_countsmerged_func_countsall_func_keysfunc_keytrue_val	false_val
inner_dicts                               r!   _handle_higher_order_ops)_FlopCounterMode._handle_higher_order_ops:  s    UYY33RR"YY33TTV V	M6$VL%9:K ::;--"-..K	 !::
 <<,,[$MMUYY++000
 9=5D|)-)R)R*&H >)%%+/+T+T,(I( N*%% /4467#>O>T>T>V:WWL!#)	#3I#> $5i$@!%'" #$4$9$9$; <sCTCYCYC[?\ \ -H/33Ha@H 1 5 5h BI36x3K&x0 !.
 1C"9- * *<)A)A)C%	:((3:::F *D
 O!!r,   c                 d   U(       a  UO0 nU[         R                  R                  R                  R                  [         R                  R                  R
                  R                  [         R                  R                  R
                  R                  [         R                  R                  R                  R                  [         R                  R                  R                  R                  [         R                  R                  R                  R                  [         R                  R                  R                  R                  [         R                  R                  R                  R                  [         R                  R                  R                  R                  [         R                  R                  R                  R                  [         R                  R                  R                  R                  [         R                  R                  R                  R                  [         R                  R                  R                   R                  [         R                  R                  R"                  R                  [         R                  R$                  R&                  R                  1;   a  [(        $ [+        U[         R,                  R.                  5      (       a  U R1                  XX45      $ XR2                  R4                  ;  ac  U[         R                  R$                  R6                  R                  La2  U    UR8                  " U0 UD6nU[(        La  UsS S S 5        $  S S S 5        U" U0 UD6nU R2                  R;                  UR<                  XcU5      $ ! , (       d  f       N== fr   )r   r  atensym_is_contiguousdefaultis_contiguousmemory_formatis_strides_like_formatis_non_overlapping_and_denser   sym_sizestride
sym_stridestorage_offsetsym_storage_offsetnumel	sym_numeldimprimlayoutr  r'   r>   HigherOrderOperatorr  ry  r-   r   	decomposerm  _overloadpacket)r1  r  r  r2   r3   rr   s          r!   __torch_dispatch__#_FlopCounterMode.__torch_dispatch__w  s9   !r EIINN44<<IINN0088IINN00>>IINN99AAIINN??GGIINN''//IINN++33IINN))11IINN--55IINN1199IINN55==IINN((00IINN,,44IINN&&..IINN))113 3  "!dEJJ::;;00dKK ||111d%))..BWBWB_B_6_NND3F3N* *  D#F#||(()=)=s&QQ s   N!!
N/r{  )r  N)ro  rp  rq  rr  supports_higher_order_operatorsr   r*  r  r  r  rv  r  r,   r!   ra  ra     s,    &*# D #(;"z"Rr,   ra  )Fr   )NNNFN)dr  r   loggingr   torch.utils._pytreer   r   r   module_trackerr   typingr	   r
   collections.abcr   r   typing_extensionsr   collectionsr   torch.utils._python_dispatchr   mathr   	functoolsr   r-  __all__r   r   	getLoggerro  logr  r   r@   ImportErroranywarningr  r  r+   r-   r<  __annotations__r8   r   mmr$  rY   addmmr^   bmmrc   baddbmmre   
_scaled_mmrm   r   ru  rw   convolution_convolutioncudnn_convolution_slow_conv2d_forwardconvolution_overrideabler~   convolution_backwardr   r   '_scaled_dot_product_efficient_attention#_scaled_dot_product_flash_attention#_scaled_dot_product_cudnn_attentionr   r   r   r   r   _flash_attention_forwardr   _efficient_attention_forwardr   r   0_scaled_dot_product_efficient_attention_backward,_scaled_dot_product_flash_attention_backward,_scaled_dot_product_cudnn_attention_backwardr   _flash_attention_backwardr   _efficient_attention_backwardr   r   r	  r  r  r
  r  r  r   ra  r  r,   r!   <module>r     s?      F F )  $ $ ' # :   5
6T]t_!> yy~~
 !#tCH~ "XxB?O>PRZ[]_a[aRb>b5c . tww/3 	# 	  	 tzz"%# % #% txx C  ! t||$&C & %& t' % 	% (%( 	$#Y$#Y$ Cy$ 	$
 	$L (())..1155	7 8
 cg Oux O8
O t001e e 2eN& DD@@@@B C EI @WZ @C@	-" 1` eE#s(OU38_eCHouSRUXY]G]]^_1`r 4` eE#s(OU38_eCHouSRUXY]G]]^_4`n t44dC  	 D> t88$G 	 H>: MMIIIIK L ^b Yps YLY t55tD 	 E@ t994H 	 I@GGWJJ
 	HHh 	LL,	
 	OO_ 	i 	y 	I 	!!9 	y 	1 	00) 	,,i 	,,i 	99;M  	557I!" 	557I#$ 	!!#@%%'H""$B&&(J+0 $# #  
N N`yR( yRK  

]F\
]]]VWLs   =Q" "-RR