
    Z jo                        S SK r S SKJrJrJr  S SKJr  S SKrSSKJ	r	  SSK
Jr  SSKJr  SSKJrJr  S	S
KJrJrJrJr  S	SKJrJrJrJr  S\	S\\\\      \\   4   4S jr\" 5        " S S5      5       r  " S S5      r!g)    N)floorgcdsqrt)Any   )PreTrainedConfig)ContinuousBatchingConfig)is_flash_attention_requested)attach_tracertraced   )BlockManagerCacheAllocatorFullAttentionCacheAllocatorSlidingAttentionCacheAllocator)RequestStateRequestStatusget_device_and_memory_breakdownloggerconfigreturnc                 6   [        U SS5      nUc6  [        U SS5      b  SOSn[        U R                  5       Vs/ s H  o2PM     nn0 n[        U5       H  u  pVUR	                  U/ 5      U/-   XF'   M     [        UR                  5        Vs/ s H  n[        U5      PM     sn6 n/ n	UR                  5        H7  u  pg[        S[        U5      U5       H  nU	R                  XuXX-    5        M     M9     U	 V
s/ s H
  oU
S      PM     nn
X4$ s  snf s  snf s  sn
f )a|  
Group layers depending on the attention mix, according to VLLM's hybrid allocator rules:
    - Layers in each group need to have the same type of attention
    - All groups have the same number of layers

For a model with the following layer types: ["sliding", "full", "full", "sliding", "full", "full", "full", "full"]
We would get four groups: [0, 3], [1, 2], [4,5] and [6,7].
layer_typesNsliding_windowsliding_attentionfull_attentionr   )
getattrrangenum_hidden_layers	enumerategetr   valueslenitemsappend)r   r   	attn_type_layer_countsi
layer_typeindices
group_sizelayer_groupslggroup_typess               ڂ/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/generation/continuous_batching/cache.pygroup_layers_by_attn_typer1      s/    &-6K+26;KT+R+^'dt	*/0H0H*IJ*IQy*IJ L";/#/#3#3J#Cqc#I  0 <3F3F3HI3Hs7|3HIJJ L+113
q#g,
3AAN ;< 4  4 1=="r!u%K=$$# K J >s   DD6Dc                      \ rS rSrSr\R                  S4S\S\S\R                  \
-  S\R                  S\S-  S	S4S
 jjrS\S\S	\4S jr\S\S\
S\S	\S-  4S j5       r\S\
S	S4S j5       rS	\4S jr\S\
S\S\S\\\      S-  S\\\      S	S4S j5       rS\
S\S\S\R,                  S	S4
S jr\S\S\S	\\
\4   4S j5       r\S\R,                  S\R,                  S\S\\R,                     S\\R,                     S	\\R,                  \R,                  4   4S j5       rS\S	\
4S  jrS\
S!\\   S	\4S" jrS#\S$\S	S4S% jr S&\\   S'\\   S	S4S( jr!S)\
S*\\
   S	\\\   \\   4   4S+ jr"S.S, jr#S-r$g)/PagedAttentionCache=   u  
Manages the cache for a paged attention mechanism, inspired by VLLM's hybrid allocator. The cache relies on making
groups of layers to reduce the complexity of cache management and fragmentation.

The cache uses a three-level hierarchy:
- Pages: The smallest unit of cache, a page has a size of [num_heads, head_size], which is the space needed to
    store the key or value states for one token and one layer. For a model with only full-attention layers, to store
    the KV cache of one token, we need `2 * num_layers` pages: key and values each take `num_layers` pages.
    Pages are grouped into blocks:
- Blocks: A block is a collection of `block_size` pages, serving as the allocation unit to reduce management
    complexity and fragmentation. Cache is allocated and freed block by block, not page by page. One block is
    allocated to one layer group, which only has one attention type, like full-attention or sliding-attention.
    If all layers in the model have the same attention type, then all layers will be in the same group. There is
    more than one group if and only if the model has a mixed attention types, like layers with full-attention and
    layers with sliding-attention.
- Cache tensors: The physical supports for the cache. There are as many cache tensors as there are layer in a
    layer group, and the shape of the cache tensor is `[num_blocks * block_size, num_heads, head_size]`.

Grouping layers into groups is useful because when we allocate one block to a group N, the block allocated is the
    same for all layers in group N, equivalently it is allocated across all cache tensors. This allows us to
    efficiently allocate and free blocks, and to efficiently read and write key and value states.

For instance, imagine we have 8 blocks of cache and a model with two layer groups: a full-attention group with 3
layers and a sliding-attention group with 3 layers. At creation time, the physical cache tensors look like this:

cache_tensor_0: □ □ □ □ □ □ □ □
cache_tensor_1: □ □ □ □ □ □ □ □
cache_tensor_2: □ □ □ □ □ □ □ □

where □ means the blocks is not allocated to any layer group yet. We have 3 cache tensors because there are
3 layers per group.
We allocate 1 block to each group, after allocation, the cache tensors look like this:

cache_tensor_0: ✖ ◉ □ □ □ □ □ □
cache_tensor_1: ✖ ◉ □ □ □ □ □ □
cache_tensor_2: ✖ ◉ □ □ □ □ □ □

where ✖ means the block is allocated to the full-attention group, and ◉ means the block is allocated to the
sliding-attention group.
Now, if we continue to generate, and the sliding window has been reached, we only need to allocate a new block
for the full-attention group, and the cache tensors look like this:

cache_tensor_0: ✖ ◉ ✖ □ □ □ □ □
cache_tensor_1: ✖ ◉ ✖ □ □ □ □ □
cache_tensor_2: ✖ ◉ ✖ □ □ □ □ □

And after further generation, when we need a new block allocated:

cache_tensor_0: ✖ ◉ ✖ ✖ □ □ □ □
cache_tensor_1: ✖ ◉ ✖ ✖ □ □ □ □
cache_tensor_2: ✖ ◉ ✖ ✖ □ □ □ □

This would not have been possible if all layers were in the same group: we would have had to allocate a new block
for the sliding-attention group, although it is not needed.
Nr   continuous_batching_configdevicedtypetp_sizer   c                     Xl         X@l        X0l        [        USS5      nUb  UOUR                  U l        [        USS5      nUb  UOUR                  UR                  -  U l        UR                  U l        U R                  S::  a  [        SU R                   35      e[        U5      u  p[        US   5      n
[        U5      U l        0 U l        0 U l        [        U5       HM  u  pX   S:X  a  UR                   OSn[        U5       H#  u  pX4U R                  U'   XR                  U'   M%     MO     Ub5  US:  a/  U R
                  U-  S:w  a  [        SU R
                   S	U S
35      eU R                  U R
                  -  n[#        U R                   5      (       a  SnOSU	;   a  SnOSnUR                  U R                  -  nSUR                  SUR$                  -  -   4nSU-  UR                  U-   SU-  -   4n['        UUU R                  U
UU/US9nUR(                  c  UR+                  SS9  UR-                  UR.                  UR0                  UR(                  U R                  S9u  nnUU l        UU l        U R.                  U R                  -  U l        [4        R6                  " SU R.                  < SU R                  < SU< SU R0                  < SU< 3
5        UR8                  nUc  SnUU l        / U l        / U l        US-   U R                  -  U R
                  U R                  4U l        U R>                  S   S-
  U l         U R@                  S-
  U l!        [E        U
5       H  n[F        RH                  " U R>                  U R                  U R                  S9n[F        RH                  " U R>                  U R                  U R                  S9n[F        RJ                  RM                  U5        [F        RJ                  RM                  U5        U R:                  RO                  U5        U R<                  RO                  U5        M     [4        R6                  " SU R>                  < SU R:                  S   RP                  < SU R:                  S   RS                  5       < 35        URT                  U l*        / U l+        SU l,        SU l-        SU l.        [        U	5       H  u  nnUS:X  a4  [_        XR                  U RT                  S9nU =RX                  S-  sl,        OqUS:X  a]  [a        XR                  UR                   U R@                  U RB                  5      nU =RZ                  S-  sl-        URb                  U l.        O[        SU 35      eU RV                  RO                  U5        M     U RT                  =(       a    U	S/:H  U l2        [g        UU R                  5      U l4        SU l5        SU l6        g)a~  Initialize a paged attention cache for efficient memory usage. Also turns in prefix sharing if the model has
only full attention layers.

Args:
    config: Model configuration
    continuous_batching_config: Continuous batching configuration containing cache parameters
    device: Device for the cache tensors
    dtype: Data type of the cache
    tp_size: Tensor parallelism size
num_key_value_headsNhead_dimr   z%Block size must be positive, but got r   r   zNumber of key value heads z+ must be divisible by tensor parallel size .   )r5   	page_size
num_groupsr,   activation_peaksnum_attention_masksT)has_logit_processors)
num_blocksmax_batch_tokensmax_memory_percentcache_dtypez7PagedAttentionCache initialized with self.num_blocks = z, self.block_size = z, page_size = z, self.max_batch_tokens = z num_attention_masks = )r7   r6   zself.cache_shape = z self.key_cache[0].shape = z self.key_cache[0].numel() = r   )allow_block_sharingzInvalid group type: )7r   r7   r6   r   num_attention_headsr:   hidden_sizer;   
block_size
ValueErrorr1   r#   r?   sliding_windowslayer_index_to_group_indicesr    r   r
   
vocab_sizePagedAttentionMemoryHandlerrE   resolve_max_memory_percent%infer_num_blocks_and_max_batch_tokensrC   rD   	num_pagesr   infomax_blocks_per_request	key_cachevalue_cachecache_shapesentinel_indextrash_indexr   torchempty_dynamomark_static_addressr%   shapenumelrG   group_cache_managersnum_full_attention_groupsnum_sliding_attention_groups%max_sliding_window_blocks_per_requestr   r   _max_blocks_per_requestuse_prefix_sharingr   _block_manager_total_prefix_length_block_table_key)selfr   r5   r6   r7   r8   kv_headsr;   r-   r/   r,   r)   groupr   jlayerr>   rA   q_bytes_per_tokenlm_head_peakattention_peakmemory_handlerrC   rD   rT   r'   new_layer_key_cachenew_layer_value_cache
group_typecms                                 r0   __init__PagedAttentionCache.__init__w   s   $ 
 6#8$?4<4HfNhNh 6:t4)1)=X6CUCUY_YsYsCs 5????aDT__DUVWW %>f$E!a)
l+!,.)!,/HA6AnH[6[V22abN%e,<=611%8.<$$U+ - 0 7Q;'''1Q6 01I1I0JJuv}u~~  A  MMD$<$<<	'44"# K/"#"# #66FV%6%6!66

 	M!22Q]B

 5'A!*N; 3
 &88@&AAW[A\'5'['[1<<7HH9LL

	 (\ (
$
$ % 04??:FDOO3GG\$//I]]l`i_m n($$((@*=)AC	
 "<!R!R!)%&"
 '=# .0/1 (!^t>@X@XZ^ZgZgh"..q1A5..2z"A"'++d.>.>djjY]YdYd"e$)KK0@0@

[_[f[f$g!MM--.ABMM--.CDNN!!"56##$9: # 	*t''++GT^^A->-D-D,HHf$..YZJ[JaJaJcIghi $>#Q#Q :<!)*&,-)562&{3MAz--0OOY]YqYqr..!3.223(=(=t?R?RTXTdTd 11Q61=?=W=W: #7
|!DEE%%,,R0 4 #'":":"`{O_N`?`*:tG)*! !%    num_requested_blocksallocated_blocksc                     XR                   -  nU R                  (       a4  [        U R                  U-
  S5      nU[	        XA5      U R                  -  -  nX0R                  5       :*  $ )a  Returns a boolean indicating if the allocation of (num_requested_blocks) blocks will be successful. The
number of newly allocated blocks needed is predicted by the following rules:
- for full attention groups: since there is no sliding window for full attention layers, one requested block is
    always equivalent to one newly allocated block for EACH full attention group
- for sliding window groups: because of the sliding window, the number of blocks allocated to a request is
    capped. Using the number of already (allocated_blocks) we can compute the number of new blocks to actually
    allocate to the request, which can be lower than the number of requested blocks. That number is the same for
    all sliding window groups, as only one sliding window size is supported.
r   )ra   rb   maxrc   minget_num_free_blocks)ri   ry   rz   needed_blocksblocks_lefts        r0   will_allocation_be_successful1PagedAttentionCache.will_allocation_be_successful  s`     -/M/MM,,dHHK[[]^_KSCdFgFgggM 8 8 :::rx   n_blocks
request_idc                     U R                  X5      (       d  gSnU R                   H>  nUR                  XU R                  5      nUc  [	        SU SU 35      e[        XF5      nM@     U$ )zAllocate cache blocks across all layer groups for a given request. Actual allocation is done by the cache
managers, and this method only returns the maximum number of blocks actually allocated across all managers.Nr   zFailed to allocate z blocks for request )r   r`   allocate_blocksrf   rK   r|   )ri   r   r   rz   max_allocatedru   num_allocated_blockss          r0   r   #PagedAttentionCache.allocate_blocks-  sz    
 11(MM++B#%#5#5hDL_L_#` #+ #6xj@TU_T`!abbDM	 ,
 rx   c                 `    U R                    H  nUR                  XR                  5        M      g)zFree all allocated cache blocks for a given request across all layer groups. Actual deallocation is done
by the cache managers.N)r`   free_blocksrf   )ri   r   ru   s      r0   r   PagedAttentionCache.free_blocks=  s&     ++BNN:':':; ,rx   c                 .    U R                   R                  $ )zHGet the current number of unallocated blocks available for new requests.)rf   num_free_blocks)ri   s    r0   r~   'PagedAttentionCache.get_num_free_blocksD  s    ""222rx   past_lengthquery_length
read_indexwrite_indexc                    [        U R                  U5       H&  u  pgUR                  UR                  XU5      5        M(     UbA  [        U R                  U5       H&  u  phUR                  UR	                  XU5      5        M(     gg)a5  Retrieve physical cache indices for reading KV states in the cache across all layer groups. This method
coordinates with all cache managers to build the complete set of read indices needed for attention computation.
When read_index is None, the batch has no cache reads and we only compute the write indices.
N)zipr`   extendget_write_indicesget_read_indices)	ri   r   r   r   r   r   ru   write_indicesread_indicess	            r0   extend_read_and_write_indices1PagedAttentionCache.extend_read_and_write_indicesH  s~     "%T%>%>!LB  !5!5j|!\] "M !$'(A(A:$N ##B$7$7
Q]$^_ %O "rx   block_tablec                 j    [        U R                  5       H  u  pVUR                  XX4U   5        M     g )N)r    r`   fill_block_table)ri   r   r   r   r   r)   ru   s          r0   r   $PagedAttentionCache.fill_block_table]  s0     t889EA
ST~V :rx   c                     0 nU R                   S:  a  X-   US'   U R                  S:  a(  U[        XR                  R                  S-
  5      -   US'   U$ )zRetrieve the key sequence length for the given request_id across all layer types. Returns a dictionary of
layer types to their corresponding key sequence lengths.r   r   r   r   )ra   rb   r}   r   r   )ri   r   r   	seqlens_ks       r0   get_seqlens_k!PagedAttentionCache.get_seqlens_kc  s^     	))A-*5*DI&',,q0-9C[[MgMgjkMk<l-lI)*rx   
key_statesvalue_states	layer_idxc                    U R                   U   u  pgXF   nXV   n	U R                  U   n
U R                  U   nUR                  SS5      R	                  S5      nUR                  SS5      R	                  S5      nUR                  5       S:X  a'  U
R                  SX5        UR                  SX5        X4$ U R                  U   nUS:X  aW  U
R                  SX5        UR                  SX5        [        R                  " U
SU5      n[        R                  " USU5      nX4$ XR                  :H  R                  S5      R                  S5      n[        R                  " U
SU5      nUR                  X5        [        R                  " USU5      nUR                  X5        U
R                  SX5        UR                  SX5        X4$ )a  Update the cache with new key-value states for a specific layer, and retrieves the relevant KV states from
the cache for attention computation. The behavior differs based on the layer's attention type:

- Full attention: New KV states are written to cache, then complete sequence is read from cache
- Sliding window: Old KV is read from cache along with extra spaces for the new KV, then new KV is written to
    cache. This is because new KV might overwrite the old KV, so we need to read the old KV first.

When the layer's read index is empty, the batch has no cache reads (all requests are non-chunked prefills): we
only write to the cache and return the input KV states directly, skipping the index_select read-back.

Returns the complete KV states (cached + new) for attention computation.
r   r=   r   )rM   rU   rV   	transposesqueezer_   index_copy_rL   rZ   index_selectrX   	unsqueezemasked_scatter_)ri   r   r   r   r   r   	group_idxlayer_idx_in_grouplayer_read_indexlayer_write_indexk_cachev_cacher   key_states_with_cachevalue_states_with_cachemasks                   r0   updatePagedAttentionCache.updateo  s   , )-(I(I)(T%	%0'2..!34""#56))!Q/77:
#--a3;;A> !!#q(#4A#4C++ --i8Q#4A#4C$)$6$6wCS$T!&+&8&8!EU&V#" %== %(;(;;FFrJTTUWXD$)$6$6wCS$T!!11$C&+&8&8!EU&V##33DG#4A#4C %==rx   flash_attn_with_kvcache_fnc                 8   U R                   c  [        R                  " U5      R                  R	                  5       nSU;   a  SU l         U R                   $ SU;   a  SU l         U R                   $ [        S[        R                  " U5       35      eU R                   $ )zA function to get the name of the block table key for the given flash_attn_with_kvcache_fn. The function's
signature is only inspected once. This is necessary because different version of flash have different names for
the block table key.r   
page_tablezOflash_attn_with_kvcache_fn does not have a block_table or page_table argument: )rh   inspect	signature
parameterskeysrK   )ri   r   kwarg_namess      r0   get_block_table_key'PagedAttentionCache.get_block_table_key  s       (!++,FGRRWWYK+(5% $$$ ,(4%
 $$$ !efmfwfw  yS  gT  fU  V  $$$rx   
prompt_idsc                 n   Sn/ n[        [        U5      U R                  -  5       H  nX%U R                  -  US-   U R                  -   nU R                  R	                  X6SS9nU R                  R
                  R                  U5      nUb.  UR                  U5        U R                  R                  U5        M    O   U(       aC  [        R                  " SU S[        U5       S35        U R                  S   nXHR                  U'   [        U5      U R                  -  n	U =R                  U	-  sl        U	$ )a  Searches for a prefix match in the cache for the given (prompts_ids). If one is found, we reference the
matching blocks in the (request_id), increase the reference count of the blocks and return the number of blocks
that match. If no prefix match is found, we return 0.Nr   r   )group_idzFound prefix match for request z with z blocks)r   r#   rJ   rf   compute_hash_hash_to_idr!   r%   increase_ref_countr   debugr`   r   rg   )
ri   r   r   current_hashrz   btokensblock_idru   prefix_lengths
             r0   search_prefix_match'PagedAttentionCache.search_prefix_match  s    s:$//9:ADOO 3q1u6OPF..;;L[\;]L**66::<HH# ''1##66x@ ; LL::,fSQaMbLccjkl**1-B)9NN:&,-?!!]2!rx   statenum_complete_blocksc                 ,   US:X  d  UR                   [        R                  :X  a  gU R                   H_  nUR                  (       d  M  U R
                  R                  UUR                  UR                     UR                  UR                  -   S9  Ma     g)a  Marks the blocks allocated to a request (state) as complete if they are shareable and they have been computed
in the forward pass. A complete block is a block where the KV cache has been fully computed: if the block has
enough space to hold the cache for N tokens, the block is marked as complete when the cache data is present for
the N tokens. If block sharing is off, this is a no-op.r   N)r   rz   r   )statusr   FINISHEDr`   uses_block_sharingrf   !mark_shareable_blocks_as_completer   r   initial_tokensgenerated_tokens)ri   r   r   ru   s       r0   r   5PagedAttentionCache.mark_shareable_blocks_as_complete  s     !#u||}7M7M'M++B$$$##EE(;%'^^E4D4D%E % 4 4u7M7M M F  ,rx   list_source_blockslist_forked_blocksc                    [         R                  " XR                  [         R                  S9n[         R                  " X R                  [         R                  S9n[	        U R
                  U R                  5       Hu  u  pVUR                  SU R                  U R                  U R                  5      nUR                  SU R                  U R                  U R                  5      nXS   XT'   Xc   Xd'   Mw     g)z;Copy the cache from the source blocks to the forked blocks.)r6   r7   r   N)rZ   tensorr6   int32r   rU   rV   viewrJ   r:   r;   )ri   r   r   source_blocksforked_blocksrU   rV   s          r0   
copy_cachePagedAttentionCache.copy_cache  s    %7SXS^S^_%7SXS^S^_&)$..$:J:J&K"I!r4??D<T<TVZVcVcdI%**2t@X@XZ^ZgZghK'0'?I$)4)CK&	 'Lrx   source_request_iddestination_request_idsc                     / / pCU R                    HC  nUR                  XU R                  5      u  pgUR                  U5        UR                  U5        ME     X44$ )zhFork the cache of a request (state) into the one of a list of requests with the given (dst_request_ids).)r`   fork_blocksrf   r   )ri   r   r   r   destination_blocksru   
src_blocks
dst_blockss           r0   fork_request PagedAttentionCache.fork_request  s^     -/)++B%'^^4E`d`s`s%t"J  ,%%j1 , 00rx   c                     [        5       nU R                   H,  nUR                  UR                  R	                  5       5        M.     U H  nU R                  U5        M     g)zFree all blocks allocated to requests across all cache managers. This preserves prefix hashes in the block
manager (blocks become initialized rather than uninitialized if they were complete), allowing prefix sharing
to work across generation sessions.N)setr`   r   r   r   r   )ri   all_request_idsru   r   s       r0   free_all_requests%PagedAttentionCache.free_all_requests  sN     %++B""2>>#6#6#89 ,)JZ( *rx   )rf   rh   rg   rG   rJ   rW   r   r6   r7   r`   r;   rU   rM   rD   rT   rc   rC   ra   r?   r:   rR   rb   rX   rL   rY   re   rV   )r   N)%__name__
__module____qualname____firstlineno____doc__rZ   float16r   r	   r6   strr7   intrv   boolr   r   r   r   r~   listr   Tensorr   dictr   tupler   r   r   r   r   r   r   r   r   __static_attributes__ rx   r0   r3   r3   =   s   6z #]]"b% b% %=b% s"	b%
 {{b% tb% 
b%H;# ;Y\ ;ae ;$   PS X[^bXb   <c <d < <3S 3 `` ` 	`
 cOd*` $s)_` 
` `(WW,/W?BWQVQ]Q]W	W 	 	C 	DcN 	 	 ;>LL;> ll;> 	;>
 &;> %,,';> 
u||U\\)	*;> ;>z%c %c % c tCy S 4| Z] bf "DT#Y DDQTI DZ^ D	1c 	1DQTI 	1Z_`deh`ikopskt`tZu 	1)rx   r3   c                      \ rS rSrSr\R                  r\R                  r	Sr
SrS\S\S\S\S	\\\\4      S
\SS4S jr\SS\S\4S jj5       rS\\\4   S\R(                  S\\\\\4   4S jr\S\S\S\S\4S j5       rS\\\4   S\S\S-  S\S-  S\R(                  S\\\4   4S jrSSS\R0                  4S\S-  S\S-  S\S\R(                  S\\\4   4
S jjrS\S\S\R(                  S\4S jrSrg) rO   i  u  Determines the optimal number of pages (N) and max batch tokens (M) for the paged attention cache, given
available GPU memory. The relation between N and number of blocks is: num_blocks = N // block_size.

The memory footprint is a polynomial in N and M, where each term maps to a tensor allocated in
``ContinuousBatchingIOs._setup_static_tensors`` or ``PagedAttentionCache.__init__``:

    memory(N, M)  =  coeff_n · N  +  coeff_m · M  +  coeff_nm · N·M  +  coeff_mm · M²

See ``_equation_coefficients`` for the breakdown.  All three solving modes (auto, fixed-N, fixed-M) reduce to
solving this equation, which is at most quadratic in one variable.
i   i   r5   r>   r?   r,   r@   rA   r   Nc                 4   UR                   U l         X l        X0l        X@l        XPl        X`l        UR                  U l        U R                  c  UR                  U l        UR                  (       a  SOSU l	        UR                  (       a  SU l        gSU l        g)u>  Initialize the memory handler. `activation_peaks` is a list of `(Δcn, Δcm)` pairs giving the activation memory
contributions proportional to N (pages) and M (batch tokens) for each peak. Memory must satisfy the constraint
at every peak, so we solve each polynomial independently and take the most restrictive result.Nr=   r   )rJ   r>   r?   r,   r@   rA   rT   fallback_max_blocks_per_requestreturn_logprobsnum_output_rowsuse_async_batchingio_multiplier)ri   r5   r>   r?   r,   r@   rA   s          r0   rv   $PagedAttentionMemoryHandler.__init__  s     5??"$$ 0#6 &@&W&W#&&.*D*d*dD'$>$N$NqTU"<"O"OQUVrx   rE   c                 V    [        5       u  pp4U[        XC5      -
  n[        XP-  5      nU$ )z^Calculate available GPU memory for cache allocation, accounting for already allocated tensors.)r   r|   r   )rE   r'   totalreserved	allocatedavailable_memorys         r0   get_available_memory0PagedAttentionMemoryHandler.get_available_memory7  s7     )H(I%( 3y#;;/DErx   peakrF   c                    U R                   R                  nU R                  R                  nUR                  nU R                  nUu  pxSU R                  -  U R
                  -  U-  X`R                  -  S-  -   Xt-  -   n	X-  US-  U-  -   X`R                  -  U-  -   X`R                  -  U R                  -  U-  -   X`R                  -  S-  -   X`R                  -  S-  -   n
X`R                  -  U-  nX`R                  -  U-  nXX4$ )u{  Returns `(coeff_n, coeff_m, coeff_nm, coeff_mm)` for the memory polynomial of a single activation peak.
`peak = (Δcn, Δcm)` is the peak-specific activation contribution; the rest of the coefficients are shared
across peaks. Each addend is annotated with the tensor it corresponds to in
`ContinuousBatchingIOs._setup_static_tensors` (or the forward pass, for activation terms).
r=         )
_input_dtypeitemsize_activation_dtyper
  r,   r>   r?   r  rT   rA   )ri   r  rF   r)   ackdelta_ndelta_mcoeff_ncoeff_mcoeff_nmcoeff_mms                r0   _equation_coefficients2PagedAttentionMemoryHandler._equation_coefficientsA  s;    &&""++   $..014//!A%&k 	 K!eai&&&*+ //!))*,-..
 //!A%& //!A%& 	 ///!3///!333rx   r  r   r  c                     U S:X  a  U* U-  $ US-  SU -  U-  -
  nUS:  a  [        SU S35      eU* [        U5      -   SU -  -  nUS:  a  [        SU S35      eU$ )uQ   Largest positive root of a·x² + b·x + c = 0. Falls back to linear when a == 0.r   r=      z!No real solution (discriminant = )zNo positive solution (root = )rK   r   )r  r   r  discriminantroots        r0   _solve_quadratic,PagedAttentionMemoryHandler._solve_quadraticg  s     626M!ta!eai'!@aPQQT,''AE2!8<TF!DEErx   	availablerC   rD   c                 `   U R                  X5      u  pgpUc  Uc|  Sn
U R                  X-  XS-  -  -   XgU
-  -   U* 5      n[        X-  5      nX@R                  :  a  U R                  nSnO,[	        [        U5      U R                  -  U R                  5      nUcG  Un[        X'U-  -
  XS-  -  -
  XhU-  -   -  5      n[	        XR                  -  U R                  5      nX44$ UcI  X0R                  -  nU R                  XX-  -   Xm-  U-
  5      n[	        [        U5      U R                  5      nX44$ )zSolve for `(num_blocks, max_batch_tokens)` against one activation peak's memory polynomial. Clamps to upper
bounds. Either input may be None; whichever is None is solved for.Ng{Gz?r=   )r#  r*  r   _upper_bound_max_batch_tokensr}   r   rJ   _upper_bound_num_blocks)ri   r  r,  rC   rD   rF   cnru   cnmcmmmrR   MNs                 r0   _solve_for_peak+PagedAttentionMemoryHandler._solve_for_peakt  sH     66tI"2":A--cgd
.BBaKR[Q[\I"9=1"D"DD#'#E#E !
 y!1T__!DdFbFbc
 Ay61CQ$J>2a<PQIY//94;W;WXJ ++ %__,A%%c<)9KLA"58T-O-OP++rx   g?c                 X   U R                  U5      n[        R                  " SU 35        [        S5      n[        S5      nU R                   H.  nU R                  XXU5      u  p[        Xi5      n[        Xz5      nM0     Xgp!U R                  XU5      nX:  a  [        SU SU 35      eX4$ )ui  Solve for the missing variable(s) in the memory polynomial (see ``_equation_coefficients``). There is one
polynomial per activation peak; we solve each independently and take the most restrictive (smallest) result.
When both `N` and `M` are unknown, assumes `M = m·N` (m = 0.01, i.e. one batch fills ~1 % of the cache) and
solves the resulting quadratic in N.
zCache memory: infzMemory footprint z is more than available memory )	r  r   rS   floatr@   r6  r}   compute_memory_footprintMemoryError)ri   rC   rD   rE   rF   r,  acc_num_blocksacc_max_batch_tokensr  r   m_batch_tokensmemory_footprints               r0   rQ   APagedAttentionMemoryHandler.infer_num_blocks_and_max_batch_tokens  s     --.@A	nYK01u$U|))D'+';';DZkv'w$H :N#&';#L  *
 (6$88Wbc' 12B1CCbclbmnoo++rx   c                     XR                   -  nUnSnU R                   H;  nU R                  Xs5      u  ppX-  X-  -   X-  U-  -   X-  U-  -   n[        Xl5      nM=     U$ )zaEvaluate the memory polynomial at concrete (N, M) values, taking the max across activation peaks.r   )rJ   r@   r#  r|   )ri   rC   rD   rF   r5  r4  max_memory_footprintr  r0  ru   r1  r2  r@  s                r0   r;  4PagedAttentionMemoryHandler.compute_memory_footprint  sv    ( ))D#::4MBC!v1<sw{J#&';#N  * $#rx   )	r@   rJ   r,   r
  rT   rA   r?   r  r>   )g      ?)r   r   r   r   r   rZ   bfloat16r  r   r  r.  r/  r	   r   r   r  rv   staticmethodr:  r  r7   r#  r*  r6  r   rQ   r;  r  r  rx   r0   rO   rO     s   
 ;;L$(!"W$<W W 	W
 W uS#X/W !W 
W4        #4#s(O#427++#4	sCc!	"#4L 
E 
e 
 
% 
 
#,CHo#, #, $J	#,
 *#, [[#, 
sCx#,N "&'+$'#(==,$J, *, "	,
 [[, 
sCx,:
$3 
$# 
$\a\g\g 
$lo 
$rx   rO   )"r   mathr   r   r   typingr   rZ   configuration_utilsr   generation.configuration_utilsr	   utils.genericr
   utils.metricsr   r   cache_managerr   r   r   r   requestsr   r   r   r   r  r   r   r   r1   r3   rO   r  rx   r0   <module>rO     s     ! !   3 F 9 2 t t Z Z%&6 %5d3iRVWZR[A[;\ %B I) I) I)Zu$ u$rx   