
    Z j                     z   S SK r SSKJr  SSKJr  S\ R
                  R                  S\ R                  S\ R                  S\ R                  S	\ R                  S-  S
\S\ R                  S\ R                  \\	\ R                  4   -  S\
S\
\\	\
4   -  S\ R                  S-  S\\ R                  S4   4S jr\ R                  R                  S\ R
                  R                  S\ R                  S\ R                  S\ R                  S
\S\ R                  S\\
\
4   S\ R                  S\ R                  4S j5       rg)    N   )PagedAttentionCache)!lazy_import_paged_flash_attentionmoduleqkvattention_maskcachecu_seq_lens_qcu_seq_lens_kmax_seqlen_qmax_seqlen_kblock_tablereturnc                 B   [        U R                  R                  5      u  p[        U SS5      (       d  SOU R                  S-
  S4nUS:X  a  SOSn[        U[        5      (       a  X   nX   n	U
Gc  UR                  UUU R                  US	   US
   S9u  p#SU;   a  SUR                  S5      0O0 nU" UR                  SS5      R                  S5      R                  5       UR                  5       UR                  5       UR                  [        R                  5      UR                  [        R                  5      R!                  5       UU	4U R"                  SUS.UD6n[        U[$        5      (       a  US   nUS4$ SU;   a  SUS   0O0 n['        XX#XWXU
4	0 UD6nUS4$ )a   Performs the forward pass of attention with paged key-value cache. This function handles the cache updates and
performs the attention computation. For decode-only batches (when block_table is provided), uses
`flash_attn_with_kvcache` for fused attention + cache update. Otherwise uses `flash_attn_varlen_func`.
See the [paged attention guide](https://huggingface.co/docs/transformers/en/paged_attention) for more details.

Args:
    q: (1, nheads, total_q, headdim), where total_q = total number of query tokens in the batch.
    k: (1, nheads_k, total_k, headdim), where total_k = total number of key tokens in the batch.
    v: (1, nheads_k, total_k, headdim), where total_k = total number of key tokens in the batch.
    cu_seq_lens_q: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths
       of the sequences in the batch, used to index into q.
    cu_seq_lens_k: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths
       of the sequences in the batch, used to index into kv.
    max_seqlen_q: int. Maximum query sequence length in the batch.
    max_seqlen_k: int. Maximum key sequence length in the batch.
    block_table: (num_groups, batch_size, max_blocks_per_seq), dtype int32. Block table for paged KV cache.
        If provided, uses flash_attn_with_kvcache for fused attention + cache update. For each request, the block
        table is a vector of size (max_blocks_per_seq,) with indices indicating the physical location of the cache
        to read from and write to. The kernel, using the cache_seqlens for that request, knows how much cache to
        read and dispatches the read using the block table. Same for the write. If a request has fewer than
        max_blocks_per_seq blocks, the block table is padded with -1s to indicate that the block is not allocated.
sliding_windowF)r      r   full_attentionsliding_attentionN
read_indexwrite_index)
key_statesvalue_states	layer_idxr   r   s_auxr   T)softmax_scalecausalwindow_size)r   config_attn_implementationgetattrr   
isinstancedictupdater   get	transposesqueeze
contiguoustotorchint32clonescalingtuple_paged_decode_forward)r   r   r   r	   r
   r   r   r   r   r   r   kwargsflash_attn_varlen_funcflash_attn_with_kvcacher   
layer_typecustom_kwargsattn_outputflash_kwargss                      v/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/integrations/flash_paged.pypaged_attention_forwardr:      s   J 7X**73
 &-V5Eu%M%MXTZTiTilmTmopSqN%3x%?!EXJ-&&%1#/ ||&&l+}-  
 ;BV:K&**W"56QS,KK1%%a(335LLNLLNU[[)U[[)//1
 !..&
 
 k5))%a.K 	 6=5F1B+qU>\g
kw
     r   c	                 6   UR                   U R                     u  pUR                  U   R                  SUR                  UR
                  UR                  5      nUR                  U   R                  SUR                  UR
                  UR                  5      nUR                  SSSS5      R                  5       nUR                  SSSS5      R                  5       nUR                  SSSS5      R                  5       nUR                  S5      nUSUS-    USU -
  S-
  R                  [        R                  5      nX   XR                  U5      '   U" S	UUUUUUU R                  SUS.	U	D6n[!        U["        5      (       a  US   nUR%                  S5      $ )
zaDecode fast path using flash_attn_with_kvcache. Disabled because FA3 has issue with tracing this.r   r   r   r      NT)	r   k_cachev_cacher   r	   cache_seqlensr   r   r     )layer_index_to_group_indicesr   	key_cacheview
block_sizenum_key_value_headshead_dimvalue_cachepermuter*   sizer+   r,   r-   get_block_table_keyr/   r$   r0   r)   )r   r   r   r	   r   r   r   r4   r   r8   	group_idxlayer_idx_in_groupr>   r?   
batch_sizer@   r7   s                    r9   r1   r1   [   s    %*$F$FvGWGW$X!Ioo0166r5;K;KUMfMfhmhvhvwG 2388
Ee77G 	
		!Q1((*A			!Q1((*A			!Q1((*A J"1zA~6{
9SSVWW[[\a\g\ghMGRG]L**+BCD) 


#nn" K +u%%!!nq!!r;   )r,   generation.continuous_batchingr   modeling_flash_attention_utilsr   nnModuleTensorr%   strintr0   r:   compilerdisabler1   rA   r;   r9   <module>rX      s    @ NQHHOOQ||Q ||Q ||	Q
 LL4'Q Q <<Q <<$sELL'8"99Q Q S#X&Q $Q 5<<Qh ."HHOO."||." ||." ||	."
 ." <<." #s(O." ." \\." ."r;   