
    Z j@                     x    S r SSKJr  SSKJr  SSKrSSKJr  SSKJ	r	  SS	K
JrJrJrJr  SS
KJr   " S S5      rg)u  Centralized offloading logic for continuous batching.

Handles two offloading strategies when the GPU KV cache is full:
  1. CPU offloading: copy the KV cache to a pre-allocated pinned CPU buffer, preserving exact request state.
  2. Soft reset: discard the KV cache and re-prefill from scratch when the request is re-scheduled. This incurs no data
    transfer overhead, but we need to re-run prefill over all intial + generated tokens (so more compute overhead).

The CPU swap pool is a static set of pinned tensors allocated once at init (like vLLM/SGLang). Blocks are tracked
with a simple free set — no dynamic allocation or deallocation of tensors ever happens at runtime.
    )deque)nullcontextN   )is_psutil_available   )PagedAttentionCache)FutureRequestStateRequestStateRequestStatuslogger)	Schedulerc                      \ rS rSrSrS\S\S\S-  S\S\R                  R                  S-  S	S4S
 jrS\S-  S\S	\4S jrS rSS jrS\\   S	S4S jrS\S	S4S jrSS jrSS jrS\S\S	\4S jrS\S	\\\   \\   4   4S jrSrg)OffloadingManager$   u#  Manages request offloading and restoration for continuous batching.

Owns a static CPU swap pool (pre-allocated pinned tensors mirroring the GPU cache layout), performs GPU↔CPU block
copies, decides between CPU offloading and soft reset, and ensures cleanup on cancellation/failure/reset.
cache	schedulercpu_offload_space_gibNsafety_thresholdcompute_streamreturnc           	         Xl         X l        XPl        / U l        / U l        / U l        / U l        [        5       U l        0 U l	        0 U l
        U R                  X45      U l        US L=(       a    US:  nU R                  S:X  a#  U(       a  [        R                  " SUS S35        g U R                  UR                  UR                   UR"                  4nUR$                   Hs  nU R                  R'                  [(        R*                  " XqR,                  SS95        U R                  R'                  [(        R*                  " XqR,                  SS95        Mu     SUR                  UR                   UR"                  4n	[/        UR$                  UR0                  5       HU  u  pU R
                  R'                  U
R2                  " U	6 5        U R                  R'                  UR2                  " U	6 5        MW     [        [5        U R                  5      5      U l        [(        R*                  " U R                  [(        R6                  SS9U l        [(        R*                  " U R                  [(        R6                  UR:                  S9U l        U R                  S   nS	UR?                  5       -  URA                  5       -  [C        UR$                  5      -  n[        RD                  " S
U R                   SUS-  S S35        g )Nr   cpu_offload_space=.1fz8 GiB is too small for even one block. No CPU offloading.T)dtype
pin_memory)r   device   zCPU swap pool initialized: z	 blocks (   @.2fz GiB pinned))#r   r   _compute_stream_cpu_key_cache_cpu_value_cache_gpu_key_views_gpu_value_viewsr   _free_cpu_blocks_request_id_to_cpu_blocks!_request_id_to_group_block_counts_compute_num_cpu_blocks_num_cpu_blocksr   warning
block_sizenum_key_value_headshead_dim	key_cacheappendtorchemptyr   zipvalue_cacheviewrangeint32_cpu_ids_scratchr   _gpu_ids_scratchnumelelement_sizeleninfo)selfr   r   r   r   r   offloading_enabledcpu_cache_shape_block_shapek_cachev_cachecache_tensorsize_in_bytess                 ڏ/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/generation/continuous_batching/offloading_manager.py__init__OffloadingManager.__init__+   s    
"- 35462446,1G?A&GI.  $;;<Qd2$>\CX[\C\1$!()>s(C D) )   //1A1A5C\C\^c^l^lmA&&u{{?++bf'gh!!((_KKdh)ij !
 5++U-F-FW #EOOU5F5F GG&&w||['AB!!(({)CD !H
 !&eD,@,@&A B !&D,@,@`d e %D,@,@\a\h\h i **1-L..00<3L3L3NNQTUZUdUdQee)$*>*>)?yZaIbcfHggst	
    c                    Ub  [        US-  5      OSn[        5       (       a,  SSKnUR                  5       R                  n[        XR-  5      nOSnUb:  Ub7  X6:  a1  US-  n[
        R                  " SUS SUS SWS-  S S	US S
3	5        UnOIUb  [
        R                  " S5        O/Ub!  Un[
        R                  " SUS-  S S
35        O[        S5      eS[        U R                  R                  5      -  U R                  R                  -  U R                  R                  -  U R                  R                  -  U R                  R                  R                  -  nUS:X  a  [!        S5      eX8-  $ )z?Returns the number of blocks that can fit in the CPU swap pool.Nr   r   r   r   z GiB exceeds z.0%z of total RAM (z GiB). Clamping to z GiB.u{   psutil is not available — cpu_offload_space_safety_threshold cannot be enforced. Install psutil to enable the safety cap.z1Auto-sizing CPU swap pool from safety threshold: r    ztcpu_offload_space=None requires psutil to auto-size the CPU swap pool. Install psutil or pass an explicit GiB value.r   z9The number of bytes per block is 0. This is not possible.)intr   psutilvirtual_memory	availabler   r+   ImportErrorr<   r   r/   r,   r-   r.   r   itemsize
ValueError)	r>   r   r   offload_bytesrM   	total_ram	max_bytesclamped_gibbytes_per_blocks	            rG   r)   )OffloadingManager._compute_num_cpu_blocksf   s    CXBc1W=>im   --/99II89II $)>('73()>s(C=QabePf g!W-c22EkRUEVV[] !*&NN;
 "%MNNNy\cOdehNiinop &  $**&&'(jj##$ jj,,- jj!!	"
 jj''( 	 aXYY//rJ   c                     U R                   b)  [        R                  R                  U R                   5      $ [	        5       $ )zdReturns a context manager that runs enclosed ops on the compute stream, or a no-op when none is set.)r!   r1   cudastreamr   r>   s    rG   _stream_ctxOffloadingManager._stream_ctx   s1    :>:N:N:Zuzz  !5!56m`k`mmrJ   c           
         U R                   nUR                  5       u  p#[        R                  " SU S[	        UR
                  5       S[	        UR                  5       S35        U R                  X#5      nU(       a  SUl        UR                  [        R                  :X  a  UR                  SS Ul        [        R                  Ul	        Un[        R                  " SU S[	        U R                   5       S	35        O?UR#                  5       n[        R$                  Ul	        [        R                  " S
U S35        UR'                  U5        UR)                  U5        SUl        g)zOffload one active request to make room in the GPU cache. Tries CPU offloading first; if the pool is full,
falls back to the legacy soft reset.zOffloading request  with z initial tokens and  generated tokens.r   NzOffloaded request z	 to CPU: z free blocks remaining.zSoft reset request .T)r   pop_request_to_evictr   r=   r<   initial_tokensgenerated_tokens_offload_to_cpuallocated_blocks_statusr   DECODINGtokens_to_processremaining_prefill_tokensPENDINGdebugr&   !create_equivalent_initial_requestFINISHEDfinish_requestadd_waiting_requestblock_new_requests)r>   r   
request_idstateoffloaded_to_cpu	new_states         rG   offload_one_request%OffloadingManager.offload_one_request   s6    NN	%::<
!*VC8L8L4M3NNb5))*++=?	
  //
B%&E" }} 6 66161H1H1K. *11EMILL-j\3tG\G\C]B^^uvw??AI)22EMLL.zl!<=  ,%%i0'+	$rJ   requests_in_batchc                    U R                   n/ n/ nU GH  nUR                  nUR                  (       d  M#  U R                  R	                  UR
                  5      nU R                  R	                  UR
                  5      nUR                  U5        Sn	[        U5       HW  u  pUR                  U
   R                  R                  UR
                  / 5      nUR                  USU 5        [        X5      n	MY     SUl        Xl        UR                  (       a,  U=R                  UR                   UR"                  -  -  sl        [$        R&                  " SUR
                   S[)        UR*                  5       S[)        UR,                  5       S35        GM     U(       d  gU R.                  S[)        U5       nU R0                  S[)        U5       nUR3                  [4        R6                  " U[4        R8                  S95        U R;                  5          UR3                  [4        R6                  " U[4        R8                  S95        [=        U R>                  U R@                  5       H  u  nnUU   R3                  X   5        M     [=        U RB                  U RD                  5       H  u  nnUU   R3                  UU   5        M     SSS5        U RF                  R                  U5        g! , (       d  f       N*= f)	zRestore KV caches from CPU for any CPU-offloaded requests in the scheduled batch. Indices are accumulated
per group across all requests, then copied in one batched operation per layer.r   NFzRestored CPU-offloaded request r`   z prefill tokens and ra   r   )$r   rt   is_cpu_offloadedr'   poprs   r(   extend	enumerategroup_cache_managersblock_tablegetmaxrg   allow_block_sharingcomplete_blocksposition_offsetr,   r   rm   r<   rd   re   r8   r9   copy_r1   	as_tensorr7   r]   r3   r"   r$   r#   r%   r&   )r>   ry   r   all_cpu_indicesall_gpu_indicesfuture_statert   cpu_indicesgroup_countsmax_allocated_blocks	group_idxn
gpu_blockscpu_idsgpu_idscpu_kgpu_kcpu_vgpu_vs                      rG   restore_scheduled_requests,OffloadingManager.restore_scheduled_requests   sy    

%'%'-L &&E)) 88<<U=M=MNKAAEEeFVFVWL"";/ $%  ), 7	"77	BNNRRSXScScegh
&&z"1~6'*+?'C$ !8
 &+E"%9"((,,0E0EIYIY0YY,LL1%2B2B1C6#eNbNbJcId e511233EG/ .:  ''(>#o*>?''(>#o*>?eoooU[[IJMM%///MN #D$7$79L9L Mug$$U^4 !N #D$9$94;P;P Qug$$U7^4 !R	   	$$_5  s   B4K**
K8rt   c                 l    UR                   (       a#  U R                  UR                  5        SUl         gg)z=Free CPU blocks for a single request (e.g., on cancellation).FN)r|   _return_cpu_blocksrs   r>   rt   s     rG   free_request_cpu_cache(OffloadingManager.free_request_cpu_cache   s,    !!##E$4$45%*E" "rJ   c                 |    U R                   R                  R                  5        H  nU R                  U5        M     g)zPFree all CPU-offloaded caches in the waiting queue (e.g., on fail_all or reset).N)r   waiting_requestsvaluesr   r   s     rG   free_all_waiting_cpu_caches-OffloadingManager.free_all_waiting_cpu_caches   s-    ^^44;;=E''. >rJ   c                     U R                  5         U R                  R                  5         U R                  R                  5         [	        [        U R                  5      5      U l        g)z8Reset CPU offloading state for a new generation session.N)r   r'   clearr(   r   r6   r*   r&   r\   s    rG   resetOffloadingManager.reset   sJ    ((*&&,,...446 %eD,@,@&A BrJ   rs   c                 .   / n/ nU R                   R                   HJ  nUR                  R                  U/ 5      nUR	                  U5        UR                  [        U5      5        ML     [        U5      nUS:X  d  [        U R                  5      U:  a  g[        U5       Vs/ s H  oR                  R                  5       PM     n	nU R                  SU n
U R                  SU nU
R                  [        R                  " U	[        R                  S95        U R!                  5          UR                  [        R                  " U[        R                  S95        [#        U R$                  U R&                  5       H  u  pX   R                  X   5        M     [#        U R(                  U R*                  5       H  u  pX   R                  X   5        M     SSS5        XR,                  U'   X@R.                  U'   SUl        gs  snf ! , (       d  f       N7= f)zzCopy a request's KV cache blocks from GPU to the static CPU swap pool. Returns True on success, False if
the pool is full.r   FNr{   T)r   r   r   r   r~   r0   r<   r&   r6   popleftr8   r9   r   r1   r   r7   r]   r3   r"   r$   r#   r%   r'   r(   r|   )r>   rs   rt   gpu_indicesgroup_block_countscmblockstotal_gpu_blocksrA   r   r   r   cpu_key_cachegpu_key_viewcpu_value_cachegpu_value_views                   rG   rf   !OffloadingManager._offload_to_cpu  s   
 **11B^^''
B7Fv&%%c&k2 2 {+q C(=(=$>AQ$Q AFFV@WX@W1,,446@WX ''(9)9:''(9)9:eookEFMM%//+U[[IJ/243F3FH[H[/\+&,,\-BC 0] 47t7L7LdNcNc3d/(..~/FG 4e   6A&&z2=O..z:!%+ Y  s   !#H&B/H
Hc                     U R                   R                  U5      nU R                  R                  U5      nU R                  R	                  U5        X#4$ )z<Return CPU blocks to the free pool without copying anything.)r'   r}   r(   r&   r~   )r>   rs   r   r   s       rG   r   $OffloadingManager._return_cpu_blocks.  sK    0044Z@==AA*M$$W-$$rJ   )r!   r8   r"   r#   r&   r9   r$   r%   r*   r'   r(   r   r   )r   N)__name__
__module____qualname____firstlineno____doc__r   r   floatr1   rZ   StreamrH   rL   r)   r]   rw   listr	   r   r
   r   r   r   strboolrf   tupler   __static_attributes__ rJ   rG   r   r   $   s    9
"9
 9
  %t|	9

  9
 

))D09
 
9
v30UT\ 30]b 30gj 30jn,B16DAS<T 16Y] 16f+L +T +/
C'# 'l 't 'R%S %U49d3i;O5P %rJ   r   )r   collectionsr   
contextlibr   r1   utilsr   r   r   requestsr	   r
   r   r   r   r   r   r   rJ   rG   <module>r      s0   	  "  ( & M M  O% O%rJ   