
    R jK                       % S r SSKJr  SSKrSSKrSSKJr  \(       a  SSKJr  SSK	J
r
  SSKJr  SSKJrJr  SSKrSS	KJr  S
SKJr  S/rSqS\S'   SqS\S'   \
 " S S5      5       r\S*S j5       r S+   S,S jjrS-S jrS.S jr                S/S jr                        S0S jr                      S1S jr \" S5      r!S2S jr"S3S jr#    S4                                 S5S jjr$ S6                               S7S jjr%   S8SSSSSSS.                                     S9S  jjjr&SSSSSSS.                               S:S! jjr'SSSS".                                 S;S# jjr(      S<SS$.                   S=S% jjjr)   S>SS$.             S?S& jjjr*SS$.                             S@S' jjr+\RX                  " S(\S)9  g)Az
PROTOTYPE!
Flash Attention 3 implementation.
For fp8: only supports forward pass right now.
For fp16/bf16: supports forward and backward pass.
    )annotationsN)TYPE_CHECKING)Callable)	dataclass)cache)TypeVarTupleUnpack)Library   )	_registryregister_flash_attention_fa3zCallable | None_FA3_CUDA_FWD_FA3_CUDA_BWDc                  *    \ rS rSr% S\S'   SS jrSrg)
_FA3Handle&   zLibrary | Nonelibraryc                P    S U l         [        R                  R                  S5        g )NF)r   torch_C_set_sdp_use_fa3)selfs    h/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/torch/nn/attention/_fa3.pyremove_FA3Handle.remove*   s    !!%(    )r   N)returnNone)__name__
__module____qualname____firstlineno____annotations__r   __static_attributes__ r   r   r   r   &   s    )r   r   c                H    [         R                  R                  U 5      u  pU$ N)r   cudaget_device_capability)devicemajor_s      r   _get_device_majorr-   0   s    zz//7HELr   c                |    [        U 5        [        R                  R                  S5        [	        [        5       5      $ )z
Register FA3 flash attention kernels with the PyTorch dispatcher.

Args:
    module_path: Python module path to the FA3 implementation.
T)_fa3_import_moduler   r   r   r   _fa3_register_kernelsmodule_paths    r   r   r   6   s/     {# 
HHd#+-..r   c                   [         R                  " U 5        [        [        R                  S5      (       d  [        SU  S35      e[        [        R                  R                  S5      (       d  [        SU  S35      e[        [        R                  R                  S5      (       d  [        SU  S35      e[        R                  R                  R                  q[        R                  R                  R                  q
g )Nflash_attn_3zModule 'z' does not expose FA3 kernelsfwdz%' does not expose FA3 forward kernelsbwdz&' does not expose FA3 backward kernels)	importlibimport_modulehasattrr   opsRuntimeErrorr4   r5   r   r6   r   r1   s    r   r/   r/   G   s    K(599n--Xk]2OPQQ599))511{m#HI
 	
 599))511{m#IJ
 	
 II**..MII**..Mr   c                 4   [        SSS5      n U R                  S[        S5        U R                  S[        S5        U R                  S[        S5        U R                  S[
        S5        U R                  S[        S5        U R                  S	[        S5        U $ )
NatenIMPLCUDAz"_flash_attention_forward.quantizedz-_scaled_dot_product_flash_attention.quantized_flash_attention_forward#_scaled_dot_product_flash_attention_flash_attention_backward,_scaled_dot_product_flash_attention_backward)r
   impl!_fa3_flash_attention_forward_impl4_fa3_scaled_dot_product_flash_attention_forward_impl)_fa3_flash_attention_forward_impl_default<_fa3_scaled_dot_product_flash_attention_forward_impl_default"_fa3_flash_attention_backward_impl5_fa3_scaled_dot_product_flash_attention_backward_impl)libs    r   r0   r0   X   s    
&&&
)CHH,.OQW HH7<
 HH"$Mv HH-D HH(*LfUHH6=
 Jr   c                    US:w  a  g[        S U 5       5      (       d  g[        U Vs1 s H  owR                  iM     sn5      S:w  a  gU R                  [        R
                  :X  a$  Ub  Ub  Uc  [        R                  " S[        5        Uc  U R                  5       S:w  a  g	Ub  U R                  5       S
:w  a  g[        R                  R                  5       (       d  g[        U R                  5      S:w  a  gg s  snf )N        zdropout_p must be 0c              3  8   #    U  H  oR                   v   M     g 7fr'   )is_cuda.0ts     r   	<genexpr>,_fa3_common_support_error.<locals>.<genexpr>   s     *'Qyy's   zinputs must be CUDA tensorsr   inputs must share devicezWhen using SDPA with fp8, descale tensor should always be used for accurate dequantization. Please use _scaled_dot_product_attention_quantized and provide the descale tensors.   zdense query must be 4D   zragged query must be 3DzCUDA not available	   z#FA3 requires compute capability 9.0)alllenr*   dtyper   float8_e4m3fnwarningswarnUserWarningdimr(   is_availabler-   )querytensors	dropout_p	cum_seq_q	q_descale	k_descale	v_descalerR   s           r   _fa3_common_support_errorri   t   s     C$*'***,
g&gHHg&'1,){{e)))Y.)2C+ 	
 UYY[A-'!1(::""$$#&!+4) 's   C;c           	       ^ U(       a  gUb  gUb1  UR                   [        R                  :w  a  gUR                  (       d  g[        R                  [        R
                  [        R                  4m[        U4S jXU1 5       5      (       d  ST 3$ [        XU1 Vs1 s H  oR                   iM     sn5      S:w  a  g[        U XU4UUUU	U
5      nUb	  US	:X  a  g
U$ g s  snf )Nzreturn_debug_mask must be Falsezalibi_slopes not supportedzseqused_k must be int32zseqused_k must be CUDAc              3  @   >#    U  H  oR                   T;   v   M     g 7fr'   r[   rQ   rR   supported_dtypess     r   rS   -_fa3_forward_support_error.<locals>.<genexpr>   s     H4Gqww**4G   inputs must be one of r   #all inputs must have the same dtyperU   z(query, key, value must be on same device)
r[   r   int32rO   r\   float16bfloat16rY   rZ   ri   )rb   keyvaluerd   return_debug_maskalibi_slopes	seqused_kre   rf   rg   rh   rR   errorrn   s                @r   _fa3_forward_support_errorr|      s     0+??ekk),  +++U]]ENNKHU4GHHH'(8'9::
e%010GG012a74%	UE ..= 2s   Cc
           	       ^ UR                   [        R                  :X  a   gUR                   [        R                  :w  a  g[        R                  [        R
                  4m[        U4S jXX#U1 5       5      (       d  ST 3$ [        XX#U1 V
s1 s H  oR                   iM     sn
5      S:w  a  g[        UXX#XE4UUS S S 5      nUb  U$ g s  sn
f )NzHFA3 backward does not support fp8 - use inference only (torch.no_grad())zlogsumexp dtype must be float32c              3  @   >#    U  H  oR                   T;   v   M     g 7fr'   rl   rm   s     r   rS   ._fa3_backward_support_error.<locals>.<genexpr>   s     W4Vqww**4Vrp   rq   r   rr   )	r[   r   r\   float32rt   ru   rY   rZ   ri   )grad_outrb   rv   rw   out	logsumexprd   re   window_size_leftwindow_size_rightrR   r{   rn   s               @r   _fa3_backward_support_errorr      s     {{e)))V	
 %--'0u~~6WXcRU4VWWW'(8'9::
hs3?@?GG?@AQF4%	#c5E  As   C
Tsc                 &    [        S U  5       5      $ )Nc              3  D   #    U  H  oR                  S S5      v   M     g7f)r      N)	transposerP   s     r   rS   #_transpose_dense.<locals>.<genexpr>   s     4GqQ""Gs    )tuple)rc   s    r   _transpose_denser      s    4G444r   c                V    U b%  U R                  S5      S:w  a  U R                  5       $ U $ )z2Ensure tensor is contiguous in the last dimension.r   )stride
contiguous)xs    r   _maybe_contiguousr      s&    ]qxx|q/@1<<>GaGr   c                   [         c  [        S5      e[        U 5      n[        U5      nUR                  [        R
                  :X  a:  UR                  S5      S:w  a%  UR                  S5      S:w  a  UR                  5       O
[        U5      n[        U5      n[        U5      n[        U5      n[        / UPUPUPSPSPSPUPUPUPSPSPUPUPUPSPSPSPSPSPSPUPUPUPUPUPU	b  U	OSPU
b  U
OSPSPSPSPSP[        R                  " 5       (       a  SOSPSP[        R                  R                  5       =(       d    SP76 u  nnnnUUR                  5       4$ )	z>
Run the FA3 forward pass by calling the C++ kernel directly.
NFA3 not registeredr   r   r   rM   T)r   r;   r   r[   r   r\   r   r   $are_deterministic_algorithms_enabledr   _get_sm_carveout_experimental)rb   rv   rw   cu_seq_qcu_seq_kmax_qmax_kscale	is_causalr   r   rz   r   rf   rg   rh   qkvcu_seqlens_qcu_seqlens_ksoftmax_lse	out_accumsoftmax_lse_accums                           r   _fa3_run_forwardr      sP   * /00% A#A ;;%---LL!LL! 	 u%  %X.L$X.L!),I5B #6	#6	#6 	
#6 		#6
 	#6 	#6 	#6 	#6 	#6 	#6 	#6 	#6 	#6 	#6 	#6  	!#6" 	##6$ 	%#6& 	'#6( 	)#6* 	+#6, 	-#6. 	/#60 	1#62 	3#64 -8b5#66 /:7#68 	
9#6: 	;#6< 	=#6> 	?#6@ 7799qA#6B 	C#6D 	..05AE#62Ci!2H &&(((r   c                j   [         c  [        S5      e[        U 5      nUR                  S5      S:w  a  UR	                  5       OUnUR                  S5      S:w  a  UR	                  5       OUnUR                  S5      S:w  a  UR	                  5       OUn[        U5      n[        U5      n[
        R                  " U5      n[
        R                  " U5      n[
        R                  " U5      n[        UUUUUUUUUUUS S UU	U
UUUSU[
        R                  R                  5       =(       d    S5        UUU4$ )Nr   r   r   rM   r   )	r   r;   r   r   r   r   
empty_liker   r   )r   rb   rv   rw   r   r   r   r   max_seqlen_qmax_seqlen_kr   r   r   r   deterministicdoutr   r   r   olsedqdkdvs                           r   _fa3_run_backwardr   C  s/   " /00 X&D#ll2.!3AJJrNa/SA#ll2.!3A#A
I
&C 
		!	B			!	B			!	B				


..05A-0 r2:r   r   r   r   r   rz   ry   r   c                  [        U UUUU	UUUU
UU5      nUb  [        SU 35      e[        U UUUUUUUUUUUUU
UU5      u  nn[        R                  " S[        R
                  U R                  S9n[        R                  " S[        R
                  U R                  S9n[        R                  " SU R                  U R                  S9nUUUUU4$ )Nz)FA3 flash_attention forward unsupported: )r   )r[   r*   r%   r   )	r|   r;   r   r   zerosuint64r*   emptyr[   )rb   rv   rw   re   	cum_seq_kr   r   rd   r   rx   rf   rg   rh   r   r   r   rz   ry   r   r{   r   	rng_statephilox_offset
debug_masks                           r   rE   rE   ~  s    , 'E FugNOO!HC$ DU\\JIKK%,,u||LMQekk%,,GJYz99r   c
               8    [        U UUUUUUUUU	S S S U
UUUUUS9$ )Nr   )rE   )rb   rv   rw   re   r   r   r   rd   r   rx   r   r   r   rz   ry   r   s                   r   rG   rG     sJ    & -)+!' r   )r   r   r   c                   [        U UUUUUU
UUU5
      nUb  [        SU 35      e[        R                  " 5       n[	        U UUUUUUUUU	UUUb  UOSUb  UOSU5      u  nnnUUU4$ )z0FA3 implementation of _flash_attention_backward.z*FA3 flash_attention backward unsupported: r   )r   r;   r   r   r   )r   rb   rv   rw   r   r   re   r   r   r   rd   r   r   unusedr   r   r   r{   r   r   r   r   s                         r   rI   rI     s    * (E GwOPP>>@M",8b.:JBB" r2:r   r   c	                  [        U UUUUS S S UUU5      n
U
b  [        SU
 35      e[        XU5      u  pnU R                  [        R
                  :X  a  [        R                  OU R                  n[        R                  " XS9nUR                  SS5      nUR                  S5      nUR                  S5      n[        UUUS S UUUUUU	UUUUS9u  nnnnnU R                  S5      nUR                  S5      nUUS S UUUUU4	$ )NzFA3 SDPA forward unsupported: rl   r   r   )r   r   rf   rg   rh   )r|   r;   r   r[   r   r\   ru   r   r   sizerE   )rb   rv   rw   rf   rg   rh   rd   r   rx   r   r{   r   r   r   	out_dtypeout_bhsdout_bshdmax_q_flashmax_k_flashr,   r   r   r   r   r   r   s                             r   rF   rF      s;    'E ;E7CDDu51GA!
 #(++1D1D"D%++I7H!!!Q'H&&)K&&)K3T			40AsI}j" JJqMEHHQKE
 
r   c               &    [        U UUS S S UUUUS9
$ )Nr   )rF   )rb   rv   rw   rd   r   rx   r   s          r   rH   rH   g  s0     @ r   c                   [        XX#XEU
SSS5
      nUb  [        SU 35      e[        XX#U5      u  nnnnn[        UUUUUUSSUU	U
UUUUS9u  nnn[        UUU5      u  nnnUUU4$ )zCFA3 implementation of _scaled_dot_product_flash_attention_backward.NzFA3 SDPA backward unsupported: r   )r   r;   r   rI   )r   rb   rv   rw   r   r   re   r   r   r   rd   r   philox_seedr   r   r{   
grad_out_tq_tk_tv_tout_tr   r   r   dq_outdk_outdv_outs                              r   rJ   rJ     s    & (SYdDE <UGDEE (8S($JS#u 4JBB& .b"b9FFF66!!r   FA3)register_fn)r*   ztorch.devicer   int)flash_attn_interface)r2   strr   r   )r2   r   r   r   )r   r
   )rb   torch.Tensorrc   ztuple[torch.Tensor, ...]rd   floatre   torch.Tensor | Nonerf   r   rg   r   rh   r   r   
str | None)rb   r   rv   r   rw   r   rd   r   rx   boolry   r   rz   r   re   r   rf   r   rg   r   rh   r   r   r   )r   r   rb   r   rv   r   rw   r   r   r   r   r   rd   r   re   r   r   
int | Noner   r   r   r   )rc   z
Unpack[Ts]r   ztuple[Unpack[Ts]])r   r   r   r   )NNNN)"rb   r   rv   r   rw   r   r   r   r   r   r   r   r   r   r   float | Noner   r   r   r   r   r   rz   r   r   r   rf   r   rg   r   rh   r   r   z!tuple[torch.Tensor, torch.Tensor])F) r   r   rb   r   rv   r   rw   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   z/tuple[torch.Tensor, torch.Tensor, torch.Tensor])NNN)&rb   r   rv   r   rw   r   re   r   r   r   r   r   r   r   rd   r   r   r   rx   r   rf   r   rg   r   rh   r   r   r   r   r   r   r   rz   r   ry   r   r   r   ) rb   r   rv   r   rw   r   re   r   r   r   r   r   r   r   rd   r   r   r   rx   r   r   r   r   r   r   r   rz   r   ry   r   r   r   )"r   r   rb   r   rv   r   rw   r   r   r   r   r   re   r   r   r   r   r   r   r   rd   r   r   r   r   r   r   r   r   r   r   r   r   r   )NNNrM   FF)rb   r   rv   r   rw   r   rf   r   rg   r   rh   r   rd   r   r   r   rx   r   r   r   )rM   FF)rb   r   rv   r   rw   r   rd   r   r   r   rx   r   r   r   )r   r   rb   r   rv   r   rw   r   r   r   r   r   re   r   r   r   r   r   r   r   rd   r   r   r   r   r   r   r   r   r   )-__doc__
__future__r   r7   r]   typingr   collections.abcr   dataclassesr   	functoolsr   typing_extensionsr   r	   r   torch.libraryr
    r   __all__r   r#   r   r   r-   r   r/   r0   ri   r|   r   r   r   r   r   r   rE   rG   rI   rF   rH   rJ   register_flash_attention_implr%   r   r   <module>r      sY   #     ( !  2  !  #
 "& %!% % ) ) )   .///"/"8""%" " #	"
 #" #" #" "J((	( ( 	(
 ( &( #( #( #( #( #( (V### 
# 	#
 
# # # ## !# "# #L $5H$  $%)%)%)!J)J)	J) J) "	J)
 "J) J) J) J) J) !J) "J) #J) 
J) #J) #J)  #!J)" '#J)x  888 
8 	8
 
8 8 "8 "8 8 8 8 8 8 8 8  5!8L &*%)%):: %)(,#)::::	:: :: #	::
 #:: :: :: :: :: :: #:: #:: #:: ::  !::" #::$ #%::& &'::( 
)::R %)(,##''	' ' #	'
 #' ' ' ' ' ' ' ' ' #'  &!'" 
#'t #'$(%888 
8 	8
 
8 8 #8 #8 8 8 8 8 8 8  !8" !#8$ "%8~ &*%)%)#D DD	D D #	D
 #D #D D D D DV # 	  	
   P !2"2"2" 
2" 	2"
 
2" 2" #2" #2" 2" 2" 2" 2" 2"  2"  !2"j 
 ' ';W Xr   