
    Z jp                         S SK JrJr  S SKJr  S SKJr  S SKJr  S SK	J
r
  S SKrSSKJr  \
" S	5      rS
\\   S\\\\4      4S jr " S S5      r " S S5      r " S S\5      r " S S\5      r " S S\5      rg)    )ABCabstractmethod)deque)Iterator)ceil)TypeVarN   )loggerTxsreturnc              #   Z   #    [        U 5      S-
  nU S S S2    H  nX4v   US-  nM     g 7f)Nr	   )len)r   indexxs      ڊ/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/generation/continuous_batching/cache_manager.pyreverse_enumerater      s4     GaKE"Xh
 s   )+c                   ^    \ rS rSrSrS\S\S-  S\SS4S jrS\4S	 jr\	S\
4S
 j5       rSrg)Block#   a  A class to represent a block managed by the block manager. We say that a block is complete when the physical KV
cache it points to is fully computed. A block can have a parent, which is the block that came before in the
sequence. Once a block is complete, it is given a hash, which takes into account the tokens ids of the block, the
layer (group_id) it belong to and its parent's hash (if there is a parent).id_	parent_idNgroup_idr   c                 D    Xl         X l        X0l        S U l        SU l        g )Nr	   idr   r   hash	ref_count)selfr   r   r   s       r   __init__Block.__init__)   s    %.% $	    c                     SU R                    SU R                   SU R                   SU R                   SU R                   S3$ )Nz	Block(id=z, parent_id=z, group_id=z, hash=z, ref_count=)r   r    s    r   __repr__Block.__repr__0   s_    477)</?{4==/Y`aeajaj`kkwx|  yG  yG  xH  HI  J  	Jr#   c                     U R                   S L$ )N)r   r&   s    r   is_completeBlock.is_complete3   s    yy$$r#   )r   r   r   r   r   )__name__
__module____qualname____firstlineno____doc__intr!   strr'   propertyboolr*   __static_attributes__ r#   r   r   r   #   sV    S
 C  C$J  #  $  J# J %T % %r#   r   c                   t   \ rS rSrSrS\S\SS4S jr\S\4S j5       rS	\S\	4S
 jr
S	\S\S-  S\	S\S\\   S-  4
S jrS\\   S\S\	S\S\\\\      S-  \\   \\   4   4
S jrS\SS4S jrS\SS4S jrS\\   S\	SS4S jrS\SS4S jrS\S\\   S\\   SS4S jrS\S-  S\\   S\S\4S jrSrg) BlockManager8   a  A class to manage the number of free blocks and block re-use. When a block becomes in use, a flag is passed to
determine if the block is shareable or not. If it is, then a Block object is created and kept track of internally.
It can have the following states:
  - in use: one or more requests references this block, thus it cannot be written over. The number of requests
    referencing this block is stored as ref_count in the Block object.
  - un-initialized: the block points to a space in the KV cache tensor that contains no data yet. Those blocks can
    be given as free blocks to new requests without any overhead.
  - initialized: the block is complete and was used by one or more request that are finished. It contains KV cache
    data and its hash is stored in the hash table. If a new request needs a block with the same hash, we increase
    the ref_count of the block and remove it from the list of initialized blocks, because it is now in use.
    Still, the block can be freed if no un-initialized blocks are left. In that case, we remove its hash from the
    hash table.
If the block is not shareable, we just use the block manager as a FIFO structure where blocks are either free or in
use. Sharability is determined by the type of cache allocator: blocks created for full attention layers are
shareable, while blocks created for sliding window attention layers are not.
There is no structure to keep track of the blocks in use: if a block is neither un-initialized nor initialized,
it is in use.

num_blocks
block_sizer   Nc                 x    Xl         X l        [        [        U5      5      U l        0 U l        0 U l        0 U l        g)z^Initializes the block manager with a given number of blocks (num_blocks) of size (block_size).N)r:   r;   r   range_uninit_block_ids_init_block_ids_hash_to_id_id_to_block)r    r:   r;   s      r   r!   BlockManager.__init__L   s6    $$!&uZ'8!902+-.0r#   c                 X    [        U R                  5      [        U R                  5      -   $ )zfReturns the number of free blocks left. Both initialized and uninitialized blocks are considered free.)r   r>   r?   r&   s    r   num_free_blocksBlockManager.num_free_blocksU   s%     4))*S1E1E-FFFr#   n_blocksc                    [        U R                  5      U:  a  gU[        U R                  5      -
  n[        U R                  5      U:  a  g[        U5       Ho  nU R                  R	                  5       S   nU R
                  U   nU R                  R                  UR                  5        U R                  R                  U5        Mq     g)zChecks if there are enough free blocks to allocate the requested number of blocks (n_blocks). If there are
not enough uninitialized blocks, we uninitialize the required number of initialized blocks.TFr   )
r   r>   r?   r=   popitemrA   r@   popr   append)r    rF   block_to_uninitialize_id_to_uninitializeblocks         r   has_enough_free_blocks#BlockManager.has_enough_free_blocksZ   s     t%%&(2 (3t/E/E+F Ft##$'<<,-A!%!5!5!=!=!?!B%%&89E  ,""))*<= . r#   last_block_id	shareabler   c                     U R                  U5      (       d  g[        U5       Vs/ s H  oPR                  R                  5       PM     nnU(       a%  U H  n[	        XrU5      nXR
                  U'   UnM!     U$ s  snf )al  Returns a list of (n_blocks) free block and mark them as no longuer free in the internal data structures.
If the (shareable) flag is set to True, a Block object is created to keep track of the block, with the
(last_block_id) to indicate the last block id in the sequence, also named the parent block. If the manager
cannot find enough free blocks, it returns None.N)rO   r=   r>   popleftr   rA   )	r    rF   rQ   rR   r   rL   allocated_block_idsblock_idrN   s	            r   get_free_blocksBlockManager.get_free_blocksm   sx     **844INxYA55==?Y/hx@.3!!(+ ( 0
 #" Zs   #A8parent_blocks	num_forksc                 L   / nU(       a[  U HU  nU R                   U   nUR                  (       a2  UR                  UR                  5        U=R                  U-  sl        MU    O   [        U5      [        U5      -
  nUS:X  a!  [        U5       V	s/ s H  oSS PM	     sn	/ / 4$ / n
/ n/ nU(       a  US   OSn[        U5       HW  n	U R                  XX45      nUc  S/ / 4s  $ U
R                  X^-   5        UR                  X* S 5        UR                  U5        MY     XU4$ s  sn	f )u  Fork a given list of (parent_blocks) as many times as (num_forks). If the blocks are (shareable), we use
reference on the blocks that are complete. Otherwise, we allocate new blocks and keep track of their indices to
later copy the physical cache. For instance, when forking 4 blocks for 2 children:

Parent blocks: [0, 1, 2, 3], with all blocks being complete except the last one (block 3).

----------------------------------------- IF BLOCKS ARE NOT SHAREABLE -----------------------------------------

Forked blocks lists: [[5, 6, 7, 8], [9, 10, 11, 12]]
Copy source:          [0, 1, 2, 3,   0,  1,  2,  3]
                       ↓  ↓  ↓  ↓    ↓   ↓   ↓   ↓
Copy destination:     [5, 6, 7, 8,   9, 10, 11, 12]  → 8 blocks are newly allocated and copied

----------------------------------------- IF BLOCKS ARE SHAREABLE ---------------------------------------------

Forked blocks lists: [[0, 1, 2, 5], [0, 1, 2, 6]]
Copy source:          [         3,            3]     (block 3 is not complete so it's copied, not referenced)
                                ↓             ↓
Copy destination:     [         5,            6]     → only 2 blocks are newly allocated and copied
r   Nr   )	rA   r*   rJ   r   r   r   r=   rW   extend)r    rY   rZ   rR   r   forked_by_referencerV   rN   blocks_to_copyrL   forked_blocks_listscopy_srccopy_dstr   rU   s                  r   fork_blocksBlockManager.fork_blocks   s7   0 !)))(3$$'..uxx8OOy0O * ]+c2E.FFQ49)4DE4Dq*4DEr2MM ! 0C'+	y!A"&"6"6~R["f"*R|#&&':'PQOOM/*:;<OO/0 " #h66! Fs   D!rV   c                     U R                   U   nU=R                  S-  sl        UR                  S:X  a  U R                  R                  U5        gg)z4Increases the reference count of a given (block_id).r	   N)rA   r   r?   rI   r    rV   rN   s      r   increase_ref_countBlockManager.increase_ref_count   sE    !!(+1??a  $$X.  r#   c                    U R                   U   nU=R                  S-  sl        UR                  S:X  aX  UR                  (       a  SU R                  U'   gU R                   R	                  U5        U R
                  R                  U5        gg)zDecreases the reference count of a given (block_id). If the reference count reaches 0, the block is no longer
in use, and becomes initialized (if it was complete) or uninitialized (if it was incomplete).r	   r   N)rA   r   r*   r?   rI   r>   rJ   re   s      r   decrease_ref_countBlockManager.decrease_ref_count   st     !!(+1??a  15$$X.!!%%h/&&--h7  r#   blocksc                 ~    U(       a  U H  nU R                  U5        M     gU R                  R                  U5        g)zMarks a list of (blocks) as free. If the blocks were not (shareable), we simply add them to the uninitialized
blocks queue. Otherwise, their new state depends on whether they are complete.N)ri   r>   r\   )r    rk   rR   rV   s       r   free_blocksBlockManager.free_blocks   s5     "''1 # ""))&1r#   c                     U R                   R                  U5      nUR                  S:  a  [        SU SUR                  < 35      eU R                  R                  U5        g)zYMarks a block as uninitialized. Raises an error if the block has more than one reference.r	   Block z0 has more than one reference: block.ref_count = N)rA   rI   r   RuntimeErrorr>   rJ   re   s      r   uninitialize_unshared_block(BlockManager.uninitialize_unshared_block   s[     !!%%h/??Qz1bPUP_P_Ocdee%%h/r#   num_complete_blocksallocated_blocks
prompt_idsc                    Sn/ n[        U5       HE  u  pgU R                  U   nUR                  (       a  UR                  n  OUR	                  Xh45        MG     Sn	U(       Ga  UR                  5       u  phU	b  Xl        Sn	US:X  a  gUS-  nX6U R                  -  US-   U R                  -   n
U R                  XJUR                  5      Ul        U R                  R                  UR                  5      nUb  XR                  :X  a%  [        R                  " SUR                   S35        O[        R                  " SU SUR                   35        XU'   Un	U R!                  U5        U R#                  UR                  5        O`[        R                  " SUR                   S	UR                   S
UR                   35        UR                  U R                  UR                  '   UR                  nU(       a  GM  gg)zAmong the list of (allocated_blocks), mark (num_complete_blocks) incomplete blocks as now complete. The list
of (prompt_ids) is used to compute the hash of the new block.Nr   r	   rp   z& was marked as complete more than oncezFound existing block z for block zAdding new block z (group z) with hash )r   rA   r*   r   rJ   rI   r   r;   compute_hashr   r@   getr   r
   warningdebugrf   rr   )r    rt   ru   rv   parent_hashincomplete_blocksirV   rN   new_parent_idtokensexisting_block_ids               r   !mark_shareable_blocks_as_complete.BlockManager.mark_shareable_blocks_as_complete   s    57,-=>KA%%h/E  #jj$$aZ0 ? (,,.HA ("/ $ #a'  1$DOO 3q1u6OPF**;OEJ $ 0 0 4 4UZZ @ ,$0NNVEHH:5[#\]LL#89J8K;W\W_W_V`!ab*;Q'$5M++,=>44UXX> 0
(5>>BRR^_d_i_i^jkl/4xx  ,  **KI  r#   r|   r   c                 0    [        U[        U5      U45      $ )zComputes the hash of a block identified by the (tokens) it contains, its (parent_hash) and the layer
(group_id) it belong to. If the block has no parent, the parent hash is None.)r   tuple)r    r|   r   r   s       r   rx   BlockManager.compute_hash  s     [%-:;;r#   )r@   rA   r?   r>   r;   r:   )r,   r-   r.   r/   r0   r1   r!   r3   rD   r4   rO   listrW   r   rb   rf   ri   rm   rr   r   rx   r5   r6   r#   r   r8   r8   8   s   &13 1C 1D 1 G G Gs t &##,/$J#CG#SV#	cT	#&67!#Y673667CG67SV67	tDI%tCy$s);	<67p/3 /4 /
83 
84 
82$s) 2 2 20C 0D 05%#&5%:>s)5%QUVYQZ5%	5%n<d
 <DI <QT <Y\ <r#   r8   c                   N   \ rS rSr% Sr\\S'   \\\	\   4   \S'   \
\S'   \S\S\S\S	\S
-  4S j5       rS\S\S	S
4S jr\S\S\S\S	\	\   4S j5       r\S\S\S\S	\	\   4S j5       r\S\S\S\S\R$                  S	S
4
S j5       rS\S\	\   S\S	\\	\   \	\   4   4S jrSrg
)CacheAllocatori  zAbstract base class for cache managers. Cache managers keep track of per-request cache allocations, determine
when a new physical block needs to be allocated and compute physical indices for reading or writing to the cache._indexblock_tableuses_block_sharingrF   
request_idblock_managerr   Nc                     g)zAllocates (n_blocks) for a given (request_id) using the (block_manager). Returns the num of blocks allocated
if successful and None otherwise.Nr6   )r    rF   r   r   s       r   allocate_blocksCacheAllocator.allocate_blocks"      r#   c                     XR                   ;   a5  U R                   R                  U5      nUR                  X0R                  S9  g[        R
                  " SU R                   SU 35        g)zJFrees all blocks associated with a (request_id) using the (block_manager).)rR   zCacheAllocator z7 attempted to free blocks for non-existent request_id: N)r   rI   rm   r   r
   rz   r   )r    r   r   blocks_to_frees       r   rm   CacheAllocator.free_blocks'  s^    )))!--11*=N%%n@W@W%XNN!$++.efpeqrr#   past_lengthquery_lengthc                     g)zUReturns the physical indices of where to read request_id's cache in the cache tensor.Nr6   r    r   r   r   s       r   get_read_indicesCacheAllocator.get_read_indices1  r   r#   c                     g)zVReturns the physical indices of where to write request_id's cache in the cache tensor.Nr6   r   s       r   get_write_indices CacheAllocator.get_write_indices5  r   r#   c                     g)KFills the block table for a given request_id, past_length and query_length.Nr6   r    r   r   r   r   s        r   fill_block_tableCacheAllocator.fill_block_table9  r   r#   parent_request_idchildren_request_idsc                 d   XR                   ;  a  [        SU 35      eU R                   U   nUR                  U[        U5      U R                  U R
                  S9u  pVnUc  [        SU 35      e[        X%5       H0  u  pXR                   ;   a  [        SU 35      eXR                   U'   M2     Xg4$ )aa  Forks the cache blocks of a (parent_request_id) to a list of (children_request_ids). To manage the blocks,
the (block_manager) is used. When forking, the child's block are either shared with the parent, or they need to
be copied from the parent. Hence we return two lists of blocks that need to be copied: one for the source and
one for the destination.!No block table found for request )rY   rZ   rR   r   z"Failed to fork blocks for request z'Block table already exists for request )r   
ValueErrorrb   r   r   r   zip)
r    r   r   r   rY   list_forked_blocksr`   ra   children_request_idforked_blockss
             r   rb   CacheAllocator.fork_blocks?  s     $4$44@AR@STUU (():;1>1J1J'./--[[	 2K 2
.h %ABSATUVV 366J2_."&6&66 #JK^J_!`aa4A01 3` !!r#   r6   )r,   r-   r.   r/   r0   r1   __annotations__dictr2   r   r4   r   r8   r   rm   r   r   torchTensorr   r   rb   r5   r6   r#   r   r   r     sw   y Kc49n%%- - -\ -^adh^h - -c , 4  d3 dS dPS dX\]`Xa d d eC ec eQT eY]^aYb e e ZZ,/Z?BZQVQ]Q]Z	Z Z
"!$"<@I"Vb"	tCy$s)#	$"r#   r   c            
           \ rS rSrSrS\S\S\SS4S jrS	\S
\S\	S\S-  4S jr
S
\S\S\S\\   4S jrS
\S\S\S\\   4S jrS
\S\S\S\R                  SS4
S jrSrg)FullAttentionCacheAllocatori^  z3Cache manager for a group of full attention layers.r   r;   allow_block_sharingr   Nc                 6    Xl         X0l        X l        0 U l        g)zInitializes the cache manager for a group of full attention layers.
Args:
    - index: the index of the associated layer group
    - block_size: the size of the blocks in the cache
N)r   r   r;   r   )r    r   r;   r   s       r   r!   $FullAttentionCacheAllocator.__init__a  s     "5$r#   rF   r   r   c                     U R                   R                  U/ 5      nU(       a  US   nOX@R                   U'   SnUR                  XU R                  U R                  5      nUc  gUR                  U5        U$ )zAllocate (n_blocks) for a given (request_id) using the (block_manager). Returns the number of blocks
allocated if successful and None otherwise. For group of full attention layers, we always allocate the number of
requested blocks.r   N)r   ry   rW   r   r   r\   )r    rF   r   r   r   rQ   ru   s          r   r   +FullAttentionCacheAllocator.allocate_blocksl  s{    
 &&**:r:'OM+6Z( M(88RVRiRikokvkvw#+,r#   r   r   c                    U R                   R                  U5      nUc  [        SU 35      eX#-   nXPR                  -  nXPR                  -  n/ n[	        U5       H;  n	XI   U R                  -  n
UR                  [	        XU R                  -   5      5        M=     U(       a.  XF   U R                  -  n
UR                  [	        XU-   5      5        U$ )zReturns the physical indices of where to read request_id's cache. For a group of full attention layers, we
first write the new cache to the cache tensor and then read the entire cache from the beginning to the end.r   r   ry   r   r;   r=   r\   )r    r   r   r   r   total_lengthnum_full_blocks	remainderphysical_indicesbstarts              r   r   ,FullAttentionCacheAllocator.get_read_indices~  s     &&**:6@MNN"1&//9 ??2	'ANT__4E##E%1H$IJ ( 04??BE##E%1B$CDr#   c                    U R                   R                  U5      nUc  [        SU 35      eX R                  -  nX R                  -  nX#-   nUS-
  U R                  -  n/ n	[	        XXS-   5       Hb  n
XJ   U R                  -  nX:X  a  UOSnX:X  a  US-
  U R                  -  S-   OU R                  nU	R                  [	        X-   X-   5      5        Md     U	$ )zReturns the physical indices for writing to the cache. For a group of full attention layers, we write the new
cache as a continuation of the existing cache for the same request.r   r	   r   r   )r    r   r   r   r   start_blockstart_offsetend_pos	end_blockr   r   block_startlocal_start	local_ends                 r   r   -FullAttentionCacheAllocator.get_write_indices  s     &&**:6@MNN!__4"__4,q[T__4	{M2A%.4??:K*+*:,K?@~17!;SWSbSbI##E+*C[E\$]^ 3  r#   r   c                     U R                   R                  U5      nUc  [        SU 35      eX#-   nX`R                  -   S-
  U R                  -  n[        R
                  " USU UR                  UR                  S9USU& g)r   Nr   r	   )devicedtype)r   ry   r   r;   r   tensorr   r   )r    r   r   r   r   request_blocksr   num_blocks_neededs           r   r   ,FullAttentionCacheAllocator.fill_block_table  s     ))--j9!@MNN"1)OO;a?DOOS*/,,--.{7I7IQ\QbQb+
&&'r#   )r   r;   r   r   )r,   r-   r.   r/   r0   r1   r4   r!   r2   r8   r   r   r   r   r   r   r   r5   r6   r#   r   r   r   ^  s    =	c 	s 	 	RV 	  \ ^adh^h $ 3  S  PS  X\]`Xa  * C  c  QT  Y]^aYb  *

,/
?B
QVQ]Q]
	
r#   r   c                       \ rS rSrSrS\S\S\S\S\SS	4S
 jrS\S\S\S\S	-  4S jr	S\S\S\S\
\   4S jrS\S\S\S\
\   4S jrS\S\S\S\R                  SS	4
S jrSrg	)SlidingAttentionCacheAllocatori  z2Cache manager for sliding window attention layers.r   r;   sliding_windowsentinel_indextrash_indexr   Nc                     Xl         SU l        X l        X0l        X@l        XPl        [        U R                  U R                  -  5      U l        0 U l        g)a  Initializes the cache manager for a group of sliding window attention layers. ``sentinel_index`` and
``trash_index`` are valid cache positions in the padding zone, used instead of -1 in read and write indices
respectively so that index_select/index_copy_ never receive negative values.
FN)	r   r   r;   r   r   r   r   _max_blocks_per_requestr   )r    r   r;   r   r   r   s         r   r!   'SlidingAttentionCacheAllocator.__init__  sL     "'$,,&'+D,?,?$//,Q'R$r#   rF   r   r   c                 ^   X R                   ;  a  / U R                   U'   [        U R                   U   5      nX@R                  :X  a  g[        XA-   U R                  5      nXT-
  nUR	                  USU R
                  U R                  5      nUc  gU R                   U   R                  U5        U$ )a  Allocate (n_blocks) for a given (request_id) using the (block_manager). Returns the number of blocks
allocated otherwise. For group of sliding window attention layers, we only allocate up to the point where we can
fit an entire sliding window in the cache tensor.r   N)r   r   r   minrW   r   r   r\   )r    rF   r   r   already_allocatedafter_allocationactual_n_blocksru   s           r   r   .SlidingAttentionCacheAllocator.allocate_blocks  s     ---+-DZ( 0 0 <= < <<0;T=Y=YZ*>(88T4#:#:DKK
 #$++,<=r#   r   r   c                    U R                   R                  U5      nUc  [        SU 35      eX R                  :  a  SOX R                  -  n[	        X R                  S-
  5      n/ n[        XUU-   5       HR  nXR                  -  nXR                  -  n	XR                  -  n
XI   U R                  -  U
-   nUR                  U5        MT     XpR                  /U-  -   $ )a  Returns the physical indices of where to read request_id's cache in the cache tensor.
For a group of sliding window attention layers, we read from the cache tensor before writing on it, because the
new cache can overwrite the old one. To form the cache + new key / values states, we read the at most
sliding_window - 1 cache page and then manually add the new key / values states after. Hence the sentinel
indices which indicate where to store the new key or values indices.r   r   r	   )	r   ry   r   r   r   r=   r;   rJ   r   )r    r   r   r   r   start_indexcache_lengthr   r~   	block_idxblock_offsetphysical_indexs               r   r   /SlidingAttentionCacheAllocator.get_read_indices  s     &&**:6@MNN&)<)<<a+PcPcBc;(;(;a(?@{,$>?A$$$A__,I.L(3dooETN##N3 @  #6#6"7,"FFFr#   c                    U R                   R                  U5      nUc  [        SU 35      eX R                  -  n[	        X0R                  5      nX6-
  n/ n[        XUU-   5       HR  n	XR                  -  n	XR                  -  n
XR                  -  nXJ   U R                  -  U-   nUR                  U5        MT     US:  a  U R                  /U-  U-   nU$ )a2  Returns the physical indices of where to write request_id's cache in the cache tensor. For a group of
sliding window attention layers, we write the new cache in rolling-buffer kind of way: if we reach the end of
the allocated physical cache, we start writing from the beginning of the physical cache again.r   r   )	r   ry   r   r   r   r=   r;   rJ   r   )r    r   r   r   r   r   r   padding_lengthr   r~   r   r   r   s                r   r   0SlidingAttentionCacheAllocator.get_write_indices  s    
 &&**:6@MNN!$7$77<)<)<=%4{,$>?A$$$A__,I.L(3dooETN##N3 @ A $ 0 01NBEUUr#   r   c                     [        S5      e)Nz:Sliding window attention layers do not support block table)NotImplementedErrorr   s        r   r   /SlidingAttentionCacheAllocator.fill_block_table  s     ""^__r#   )r   r   r;   r   r   r   r   r   )r,   r-   r.   r/   r0   r1   r!   r2   r8   r   r   r   r   r   r   r   r5   r6   r#   r   r   r     s    <&);>PSbe	   \ ^adh^h ,G3 GS GPS GX\]`Xa G. C  c  QT  Y]^aYb  2``,/`?B`QVQ]Q]`	`r#   r   )abcr   r   collectionsr   collections.abcr   mathr   typingr   r   requestsr
   r   r   r   r1   r   r   r8   r   r   r   r6   r#   r   <module>r      s    $  $     CL$q' huS!V}&= % %*_< _<DA"S A"HV
. V
v\`^ \`r#   