
    R j\1              #          S r SSKrSSKJr  SSKJrJr  SSKr\R                  " \	5      r
SS/rS\\   S-  S\\   4S	 jr\" S
S9S\S\4S j5       r " S S\5      r\R$                  R'                  S0 S9   S0S\R(                  S\R(                  S\R(                  S\R(                  S\R(                  S\S\S\S\S-  S\\   S-  S\\R(                  \R(                  \R(                  4   4S jj5       r\R0                     S0S\R(                  S\R(                  S\R(                  S\R(                  S\R(                  S\S\S\S\S-  S\\   S-  S\\R(                  \R(                  \R(                  4   4S jj5       rSSSS.S\R(                  S\R(                  S\R(                  S\R(                  S\R(                  S\S\S\S-  S\S-  S\\\4   S\R(                  \\R(                  \R(                  4   -  4S jjrS \S!\\S"4   S#\SS4S$ jr\R$                  R'                  S%0 S9  S1S&\R(                  S\R(                  S\R(                  S\R(                  S'\R(                  S(\R(                  S\R(                  S\R(                  S\S\S\S)\R(                  S\S-  S\\   S-  S\\R(                  \R(                  \R(                  4   4S* jj5       r\R0                    S1S&\R(                  S\R(                  S\R(                  S\R(                  S'\R(                  S(\R(                  S\R(                  S\R(                  S\S\S\S)\R(                  S\S-  S\\   S-  S\\R(                  \R(                  \R(                  4   4S+ jj5       rS \S&\R(                  S,\R(                  S-\R(                  S\\R(                  S-  S"4   4
S. jr\R?                  \\S/9  g)2z
Variable-length attention implementation using Flash Attention.

This module provides a high-level Python interface for variable-length attention
that calls into the optimized Flash Attention kernels.
    N)	lru_cache)Any
NamedTuplevarlen_attn
AuxRequestwindow_sizereturnc                 `    U c  SS/n [        U 5      S:w  a  [        S[        U 5       35      eU $ )N   z$window_size must have length 2, got )len
ValueError)r   s    j/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/torch/nn/attention/varlen.py_normalize_window_sizer      s=    2h
;1?K@P?QRSS       )maxsizedevice_indexc                     g)z;Cache device capability check to avoid repeated CUDA calls.F )r   s    r   _should_use_cudnnr      s     r   c                   (    \ rS rSr% SrSr\\S'   Srg)r   #   z
Request which auxiliary outputs to compute from varlen_attn.

Each field is a boolean indicating whether that auxiliary output should be computed.
Flser   N)	__name__
__module____qualname____firstlineno____doc__r   bool__annotations____static_attributes__r   r   r   r   r   #   s     Cr   ztorch_attn::_varlen_attn)mutates_argsquerykeyvaluecu_seq_qcu_seq_kmax_qmax_k	is_causalscalec
                 |   [        U	5      n	U R                  =(       a    [        U R                  R                  5      n
U
(       at  [
        R                  S5        U	S   S:w  d	  U	S   S:w  a  [        S5      e[        R                  R                  R                  U UUSUUUUSSUS	US
9nUS   US   US   pnOS[
        R                  S5        [        R                  R                  R                  U UUUUUUSUS	UU	S   U	S   S9u  pn  n[        R                  " S[        R                  U R                  S9nXU4$ )z
Private custom op for variable-length attention.

This is the internal implementation. Users should use the public varlen_attn function instead.
#Using cuDNN backend for varlen_attnr   r      TcuDNN backend does not support window attention. Please use Flash Attention backend.NT        Fr,      -Using Flash Attention backend for varlen_attn)return_debug_maskr,   window_size_leftwindow_size_rightr   dtypedevice)r   is_cudar   r;   indexloginfoRuntimeErrortorchopsaten_cudnn_attention_forward_flash_attention_forwardzerosuint64)r$   r%   r&   r'   r(   r)   r*   r+   r,   r   	use_cudnnresultoutputsoftmax_lse	rng_state_
rng_state_s                    r   _varlen_attnrO   -   sS   $ )5KG"3ELL4F4F"GI67q>R;q>R#7f  88 9 
  *0F1IvayYY@A/4yy~~/V/V#(^)!n 0W 0
,Y1  ELLJ 
**r   c
                    [        U	5      n	[        R                  " U 5      n
U R                  S5      nU R                  S5      n[        R                  R
                  (       aE  UR                  S5      S-
  n[        R                  " XU4[        R                  U R                  S9nO/[        R                  " X4[        R                  U R                  S9n[        R                  " S[        R                  U R                  S9nXU4$ )z
Fake implementation for meta tensor computation and tracing.

Based on the 3D varlen path from meta__flash_attention_forward:
- query shape: (total, num_heads, head_dim)
- logsumexp shape: (num_heads, total_q)
r   r/   r9   r8   )
r   rA   
empty_likesizeversionhipemptyfloatr;   rG   )r$   r%   r&   r'   r(   r)   r*   r+   r,   r   rJ   total_q	num_heads
batch_size	logsumexprL   s                   r   _varlen_attn_faker[   s   s    ( )5K e$F jjmG

1I}}]]1%)
KKE*%++ell
	 KK ELL
	 DU\\JIi''r   )r   r   )
return_auxr,   r   r\   c                    U	S:H  n
[         R                  R                  R                  U UUUUUUU
U[	        U	5      5
      u  pnUb  UR
                  (       a  X4$ U$ )a  
Compute variable-length attention using Flash Attention.
This function is similar to scaled_dot_product_attention but optimized for
variable-length sequences using cumulative sequence position tensors.

Args:
    query (Tensor): Query tensor; shape :math:`(T_q, H, D)`
    key (Tensor): Key tensor; shape :math:`(T_k, H, D)`
    value (Tensor): Value tensor; shape :math:`(T_k, H, D)`
    cu_seq_q (Tensor): Cumulative sequence positions for queries; shape :math:`(N+1,)`
    cu_seq_k (Tensor): Cumulative sequence positions for keys/values; shape :math:`(N+1,)`
    max_q (int): Maximum query sequence length in the batch.
    max_k (int): Maximum key/value sequence length in the batch.
    return_aux (Optional[AuxRequest]): If not None and ``return_aux.lse`` is True, also returns the logsumexp tensor.
    scale (float, optional): Scaling factor for attention scores
    window_size (tuple[int, int], optional): Window size for sliding window attention as (left, right).
        Use (-1, -1) for full attention (default), (-1, 0) for causal attention,
        or (W, 0) for causal attention with sliding window of size W.

Returns:
    output (Tensor): Output tensor from attention computation; shape :math:`(T_q, H, D)`.

    If ``return_aux`` is not None and ``return_aux.lse`` is True:
        lse (Tensor): Log-sum-exp of attention scores; shape :math:`(T_q, H)`.

Shape legend:
    - :math:`N`: Batch size
    - :math:`T_q`: Total number of query tokens in the batch (sum of all query sequence lengths)
    - :math:`T_k`: Total number of key/value tokens in the batch (sum of all key/value sequence lengths)
    - :math:`H`: Number of attention heads
    - :math:`D`: Head dimension

Example::

    >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_CUDA)
    >>> batch_size, max_seq_len, embed_dim, num_heads = 2, 512, 1024, 16
    >>> head_dim = embed_dim // num_heads
    >>> seq_lengths = []
    >>> for _ in range(batch_size):
    ...     length = torch.randint(1, max_seq_len // 64 + 1, (1,)).item() * 64
    ...     seq_lengths.append(min(length, max_seq_len))
    >>> seq_lengths = torch.tensor(seq_lengths, device="cuda")
    >>> total_tokens = seq_lengths.sum().item()
    >>>
    >>> # Create packed query, key, value tensors
    >>> query = torch.randn(
    ...     total_tokens, num_heads, head_dim, dtype=torch.float16, device="cuda"
    ... )
    >>> key = torch.randn(
    ...     total_tokens, num_heads, head_dim, dtype=torch.float16, device="cuda"
    ... )
    >>> value = torch.randn(
    ...     total_tokens, num_heads, head_dim, dtype=torch.float16, device="cuda"
    ... )
    >>>
    >>> # Build cumulative sequence tensor
    >>> cu_seq = torch.zeros(batch_size + 1, device="cuda", dtype=torch.int32)
    >>> cu_seq[1:] = seq_lengths.cumsum(0)
    >>> max_len = seq_lengths.max().item()
    >>>
    >>> # Call varlen_attn
    >>> output = varlen_attn(
    ...     query, key, value, cu_seq, cu_seq, max_len, max_len
    ... )
)r   r   )rA   rB   
torch_attnrO   listr   )r$   r%   r&   r'   r(   r)   r*   r\   r,   r   r+   outr   rM   s                 r   r   r      sj    \ w&I))&&33[KCa *..xJr   ctxinputs.rJ   c           
          Uu
  nnnnnnn	n
nnUu  pnU R                  X4XVX}X5        Xl        Xl        Xl        Xl        Xl        g N)save_for_backwardr)   r*   r+   r,   r   )ra   rb   rJ   r$   r%   r&   r'   r(   r)   r*   r+   r,   r   r`   r   rL   s                   r   _setup_contextrf      sa     	 Ci%excUIIMI!Or   z!torch_attn::_varlen_attn_backwardgrad_outr`   r   rL   c                 V   [        U5      n[        R                  " SUR                  S9nUR                  =(       a    [        UR                  R                  5      nU(       al  [        R                  S5        US   S:w  d	  US   S:w  a  [        S5      e[        R                  R                  R                  U UUUUUUUUU	SU
UUUS9u  nnnOV[        R                  S	5        [        R                  R                  R                  U UUUUUUUUU	SU
UUUUS   US   S
9u  nnnUUU4$ )Nr   )r;   r.   r   r/   r0   r1   r2   r4   )r,   r6   r7   )r   rA   rU   r;   r<   r   r=   r>   r?   r@   rB   rC   _cudnn_attention_backward_flash_attention_backward)rg   r$   r%   r&   r`   r   r'   r(   r)   r*   r+   rL   r,   r   unusedrH   dqdkdvs                      r   _varlen_attn_backwardro     sD   " )5K[[5<<0FG"3ELL4F4F"GI67q>R;q>R#7f  YY^^== > 

B$ 	@AYY^^==(^)!n# > 

B& r2:r   c                     [        U5      n[        R                  " U5      n[        R                  " U5      n[        R                  " U5      nXU4$ )z>
Fake implementation for meta tensor computation and tracing.
)r   rA   rQ   )rg   r$   r%   r&   r`   r   r'   r(   r)   r*   r+   rL   r,   r   
grad_querygrad_key
grad_values                    r   _varlen_attn_backward_fakert   \  sI    ( )5K!!%(J$H!!%(J++r   grad_lsegrad_rngc                 (   U R                   u  pEpgppU R                  nU R                  nU R                  nU R                  nU R
                  n[        R                  R                  R                  UUUUU	U
UUUUUUUU5      u  nnnUUUS S S S S S S 4
$ rd   )
saved_tensorsr)   r*   r+   r,   r   rA   rB   r^   ro   )ra   rg   ru   rv   r$   r%   r&   r'   r(   r`   r   rL   r)   r*   r+   r,   r   rl   rm   rn   s                       r   	_backwardry   y  s     BEARAR>EIIEIIEIIIE//K%%;;JBB  r2tT4tT4??r   )setup_context)FNN)NN) r   logging	functoolsr   typingr   r   rA   	getLoggerr   r>   __all__r_   intr   r    r   r   library	custom_opTensorrV   tuplerO   register_faker[   r   rf   ro   rt   ry   register_autogradr   r   r   <module>r      s     "  !,
'S	D(8 T#Y  1C D  
  3"E $(B+<<B+	B+ <<B+ ll	B+
 llB+ B+ B+ B+ 4<B+ cT!B+ 5<<u||34B+ FB+J  $(((<<((	(( <<(( ll	((
 ll(( (( (( (( 4<(( cT!(( 5<<u||34(( ((h %)#+]<<]	] <<] ll	]
 ll] ] ] T!] 4<] sCx] \\E%,,455]@" "U38_ "c "d "0 <2N $(AllA<<A 
A <<	A
 
A 
A llA llA A A A ||A 4<A cT!A 5<<u||34A OAH $$ $(,ll,<<, 
, <<	,
 
, 
, ll, ll, , , , ||, 4<, cT!, 5<<u||34, %,8@	@@05@HM@
5<<$#$@<   y  Gr   