
    Z jX                         S SK r S SKJrJr  S SKJr  SSKJrJr  SSK	J
r
  SSKJrJrJrJr   " S	 S
\5      r\" 5        " S S\5      5       r\" 5        " S S\5      5       r\\S.rg)    N)ABCabstractmethod)deque   )attach_tracertraced   )PagedAttentionCache)FutureRequestStateRequestStateRequestStatusloggerc                   $   \ rS rSrSrS\4S jrS!S jr\S\	4S	 j5       r
\S
\S\S\\\   S-  \\\4   4S j5       r\S\4S j5       r\S\SS4S j5       rS\\\	4   4S jr\S\S\\   4S j5       r\S\4S j5       r\S\\	   4S j5       r\S\S\4S j5       r\S\	S\S\4S j5       rS\	S\\   S\\   4S jrS\	S\\   S
\S\\   SS4
S jr S"S\\	   S
\S\S\\   S\S\\\   \\\\4   4S jjrS\\	   4S jr S\\   SS4S jr!S r"g)#	Scheduler   a1  
Abstract base class for scheduling requests in the continuous batch processor. Schedulers manage the lifecycle of
requests from when they are added to the waiting queue to when they are scheduled for processing. Different
schedulers implement different strategies for prioritizing and batching requests.
cachec                 L   Xl         [        R                  " 5       U l        U R                   R                  (       a  S OU R                   R
                  R                  U l        U R                   R                  U R                   R                  -  U l
        U R                  5         g )N)r   	threadingLock_cancellation_locknum_full_attention_groupsconfigsliding_windowread_cache_limitmax_blocks_per_request
block_sizemax_decode_fast_path_lengthreset)selfr   s     چ/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/generation/continuous_batching/scheduler.py__init__Scheduler.__init__   si    
"+.."2(,

(L(LRVR\R\RcRcRrRr+/::+L+LtzzOdOd+d(

    returnNc                 x    0 U l         0 U l        [        5       U l        [	        5       U l        / U l        SU l        g)z0Reset scheduler state for a new generation loop.FN)active_requestswaiting_requestsr   waiting_requests_orderset_requests_to_cancel_requests_to_forkblock_new_requestsr   s    r    r   Scheduler.reset'   s6    8:9;27'#-0U 57"'r#   statec                 ~    XR                   UR                  '   U R                  R                  UR                  5        g)z#Adds a request to the waiting list.N)r'   
request_idr(   append)r   r/   s     r    add_waiting_requestScheduler.add_waiting_request0   s2     38e../##**5+;+;<r#   token_budgetcache_budgetc                     g)a_  Schedules requests for the next batch based on available token and cache budgets. This method selects which
requests should be processed in the current batch, considering the budgets and the scheduler's prioritization
rules. The token_budget is the maximum number of tokens that can be processed in a batch, and the cache_budget
is the maximum number of KV cache entries that can be read in a batch.
Returns the list of scheduled requests in their "FutureRequestState" form, a boolean indicating if the decode
fast path can be used, the total number of query tokens and the maximum number of kv tokens read.N )r   r5   r6   s      r    schedule_batchScheduler.schedule_batch6   s    r#   c                 v    [        [        U R                  5      =(       d    [        U R                  5      5      $ )z3Checks if there are requests ready to be processed.)boollenr&   r'   r-   s    r    has_pending_requestsScheduler.has_pending_requestsA   s*     C,,-KT5J5J1KLLr#   r1   c                 r    U R                   R                  U5        U R                  R                  US5        g)zCompletes processing of a request and frees its allocated cache blocks. This method is called
when a request has finished generation or encountered an error.
N)r   free_blocksr&   popr   r1   s     r    finish_requestScheduler.finish_requestF   s,    
 	

z*  T2r#   c                     U R                   (       a  U R                  R                  5       $ [        [	        U R                  5      5      nXR                  R                  U5      4$ )zRemove and return an active request chosen as the eviction victim for cache-pressure offload or soft reset.
Picks the newest active request when `block_new_requests` is set, else the oldest.)r,   r&   popitemnextiterrB   rC   s     r    pop_request_to_evictScheduler.pop_request_to_evictN   sQ     ""''//11$t3345
//33J???r#   c                 V    XR                   ;   a  U R                   U   R                  $ / $ )z,Gets generated tokens for an active request.)r&   generated_tokensrC   s     r    !get_active_request_static_outputs+Scheduler.get_active_request_static_outputsV   s,     ---''
3DDD	r#   c                     U R                      U R                  R                  U5        SSS5        g! , (       d  f       g= f)z!Marks a request for cancellation.N)r   r*   addrC   s     r    set_request_cancellation"Scheduler.set_request_cancellation]   s-     $$$$((4 %$$s	   2
A c                    / nU R                      U R                   H  nU R                  R                  US5      nU R                  R                  US5      nU=(       d    UnUb  UR                  U5        X R                  ;   a  U R                  R                  U5        U R                  R                  U5        M     [        5       U l        SSS5        U$ ! , (       d  f       U$ = f)z=Remove all cancelled requests from active and waiting queues.N)r   r*   r&   rB   r'   r2   r(   remover   rA   r)   )r   cancelled_statesr1   state_astate_wr/   s         r    clear_cancelled_requests"Scheduler.clear_cancelled_requestsc   s     $$"66
..22:tD//33JE*7$$++E2!<!<<//66zB

&&z2 7 (+uD$ %   %$  s   B?C
C'c                 r    XR                   ;   =(       d#    XR                  ;  =(       a    XR                  ;  $ )z2Checks if a request has been cancelled or removed.)r*   r&   r'   rC   s     r    request_is_cancelledScheduler.request_is_cancelledu   s4     555 
222^zI^I^7^	
r#   len_next_tokensc                 n   UR                  5       nUR                  U R                  R                  -  U-
  nXB:  d  UR                  S:X  aj  X$-
  S-   U R                  R                  -  S-   nU R                  R	                  XQR
                  UR                  5      nUc  gU=R                  U-  sl        g)a  Allocate additional cache blocks for a request if the currently allocated blocks are insufficient to
accommodate the next tokens. It calculates how many blocks are needed based on the request's current
cache occupancy and the number of tokens to be processed. The allocation itself is done by the CacheAllocator
objects. Returns a boolean indicating if the allocation was successful or not.
r   r	   FT)current_lenallocated_blocksr   r   allocate_blocksr1   )r   r/   r^   r`   	occupancyblocks_needed	allocateds          r    _allocate_blocks_if_needed$Scheduler._allocate_blocks_if_needed|   s     '')**TZZ-B-BB[P	&%*@*@A*E-9A=$**BWBWW[\\M

22=BRBRTYTjTjkI ""i/"r#   "request_ids_to_remove_from_waitingc                    U R                   R                  (       Ga$  UR                  [        R                  :X  Ga  UR
                  (       d  U R                   R                  UR                  UR                  5      nUS:  a  XR                  UR                  '   UR                  UR                  5        [        R                  Ul        U=R                  X0R                   R                  -  -  sl        [        U[        UR                  5      S-
  5      nUR                  US Ul        U=R                   U-  sl        UR                  [        R"                  :X  a  UR$                  nU$ UR                  nU$ )zPrepares a request for processing in the current batch. If prefix sharing is enabled, and the request was
pending, this is where we look for a prefix match and split the request if found.r   r	   N)r   use_prefix_sharingstatusr   PENDINGis_cpu_offloadedsearch_prefix_matchr1   remaining_prefill_tokensr&   rQ   
PREFILLINGra   r   minr=   position_offsetDECODINGtokens_to_process)r   r/   rh   prefill_lengthrequest_tokenss        r    _infer_request_tokensScheduler._infer_request_tokens   s*    ::(((U\\]=R=R-R[`[q[q!ZZ;;E<L<LeNlNlmN!9>$$U%5%56266u7G7GH,77&&.JJ<Q<Q*QQ&!$^S9W9W5X[\5\!]161O1OP^P_1`.%%7% <<=111"44N  #;;Nr#   rv   c                    UR                   S:  a;  U[        U5      S-
  :  a)  [        U5      S-
  nU R                  R                  U5        [        U5      U::  a  UR                  [
        R                  :X  a3  XR                  UR                  '   UR                  UR                  5        UR                  [
        R                  ::  a.  UR                  Ul        / Ul        [
        R                  Ul        ggUR                  [
        R                  :X  aH  XR                  UR                  '   [
        R                  Ul        UR                  UR                  5        X#S Ul        USU Ul        g)a:  Schedules a request for the current batch, updating the request's status according to the token budget left.
After a request is scheduled, it is part of the next batch unless there is an error.
If the request has children (for parallel decoding), it ensures at least one token remains before the request is
forked.r   r	   N)num_childrenr=   r+   r2   rk   r   rl   r&   r1   rQ   rp   ro   rt   rs   )r   r/   rv   r5   rh   s        r    _schedule_requestScheduler._schedule_request   s0    !lc.6IA6M&M~.2L""))%0 ~,.||}4449>$$U%5%56266u7G7GH||}777*/*H*H'13.  -55 8 ||}4449>$$U%5%56,77266u7G7GH-;M-JE*&4]l&CE#r#   
candidatessafety_marginc           	         / nSnU R                   R                  S:  nXPR                   R                  -  n	X#pU GH  nU R                   R                  5       nX:  nU(       aF  U(       a?  UR                  [
        R                  :w  a!  [        R                  " SU< SU	< 35          GOCU R                  X5      n[        [        U5      U5      nUS:H  =(       a    UR                  U R                  :  nUR                  5       nU R                  b  [        UU R                  5      nU(       a  U(       d	  UU:  a  GM  U R!                  UU5      nU(       dL  SnUS:X  aA  UR"                  U R$                  ;   a'  [        R                  " SUR"                   S	35          GOLGMi  U R'                  XX$5        [        UR(                  5      nUUS:H  =(       a    UR                  U R                  :  -  nUU-  nUU-  nU R                   R*                  (       aF  UR                  5       U R                   R,                  -  nUU-   nUU R                   R,                  -  nOSnUR.                  (       + nUR1                  [3        UUUU5      5        UR"                  nU R$                  R5                  US5      SLnU(       a  UR7                  U5        US:X  d  US::  d  GM  U(       a  GM    O   X-
  nX-
  nXgUUU4$ )
a  Schedules candidate requests for the current batch.

This method contains the common logic shared by all schedulers: it checks token and cache budgets, allocates
cache blocks if needed, updates request states, and tracks which waiting requests should be removed from the
waiting queue.
Fr   zJOutside safety margin, breaking out of scheduling loop. num_free_blocks = z safety_margins = r	   NTzBreaking mid-loop for request z because the cache is full)r   r   
num_blocksget_num_free_blocksrk   r   rs   r   inforw   rq   r=   rr   r   r`   r   rf   r1   r'   r{   rt   allow_block_sharingr   ro   r2   r   rB   rQ   )r   r}   r5   r6   rh   r~   scheduled_requestsone_allocation_faileddecode_fast_pathsafety_marginsoriginal_token_budgetoriginal_cache_budgetr/   num_free_blocksoutside_safety_marginrv   request_lenis_decode_eligibleread_cache_neededallocation_successfultokens_in_current_blocktokens_after_forwardcomplete_blockshas_new_tokenreq_idwas_waitingnum_q_tokensmax_kv_reads                               r    _process_candidatesScheduler._process_candidates   s      %::<<q@&)>)>>7C4E"jj<<>O$3$D!$);P]PfPf@faNbbudrcvw  "77bNc.1<@K
 "-!1!ne6K6KdNnNn6n % 1 1 3$$0$'(94;P;P$Q!$);PaAa %)$C$CE;$W! )(,% #a'E,<,<@U@U,UKK"@AQAQ@RRl mn ""5,ke556K q 0 mU5J5JTMmMm5mm K'L--L zz--*/*;*;*=

@U@U*U''>'L$"6$**:O:O"O"# !& > >>M%%&8`k&lm %%F//33FDAMK266v> q \Q%6?O?OM  P -;+:!:JLZeeer#   c                    [        5       n[        5       nU R                   H6  nU R                  U   nUR                  (       a  UOUR	                  U5        M8     / nU(       d  U(       a^  U(       a  UR	                  UR                  5       5        U(       a  UR	                  UR                  5       5        U(       a  MU  U(       a  M^  U$ )zReturns waiting requests in priority order. Since CPU-offloaded requests are cheaper to restore than fresh
requests, they get priority, but we interleave them with fresh request to not saturate new batches with only
offloaded requests.)r   r(   r'   rm   r2   popleft)r   	offloadedfreshr   r/   ordereds         r    _get_waiting_candidates!Scheduler._get_waiting_candidates.  s     */	%*W11F))&1E00YeCCEJ 2 ')5y0023u}}/	 i55
 r#   c                 r    [        U R                   Vs/ s H  o"U;  d  M
  UPM     sn5      U l        gs  snf )z8Removes processed requests from the waiting queue order.N)r   r(   )r   rh   r   s      r    _cleanup_waiting_queue Scheduler._cleanup_waiting_queue?  s2    &+"&"="=r"=OqAqV"=r'
#rs   	44)
r   r*   r+   r&   r,   r   r   r   r'   r(   )r$   N)        )#__name__
__module____qualname____firstlineno____doc__r
   r!   r   r   r   r3   r   inttuplelistr   r<   r9   r>   strrD   rJ   rN   rR   rY   r\   rf   r)   rw   r{   floatr   r   r   __static_attributes__r8   r#   r    r   r      s   1 ( = = =
 mm/2m	t&'$.c3>	?m m Md M M 3 3 3 3@eC,=&> @ C DI   53 5 5
  $|*<    " 
s 
t 
 
  s W[  $< ]`ad]e jnorjs 2$D$D S	$D 	$D
 -0H$D 
$DX  #^f&^f ^f 	^f
 -0H^f ^f 
t&'tS#=	>^f@l); "
S 
VZ 
r#   r   c                   v   ^  \ rS rSrSrSS\S\4U 4S jjjr\S\	S\	S\
\\   S	-  \\	\	4   4S
 j5       rSrU =r$ )FIFOScheduleriG  aC  This scheduler processes requests in the order they arrive, meaning decoding requests has priority over
prefilling requests. Additionally, it includes a safety margin mechanism to prevent cache exhaustion. By default,
when 80% of the cache is full, new requests will not be scheduled to prioritize decoding active requests.r   r~   c                 0   > [         TU ]  U5        X l        g)aC  Initializes the FIFO scheduler. The safety margin is the percentage of free blocks under which we stop
scheduling new prefill requests, so safety_margin = 0.1 means that when there is less than 10% of free blocks,
or equivalently when more than 90% of blocks are already allocated, we stop scheduling new prefill requests.
N)superr!   r~   )r   r   r~   	__class__s      r    r!   FIFOScheduler.__init__M  s    
 	*r#   r5   r6   r$   Nc                     / n/ nU R                   R                  5        He  nUR                  [        R                  :X  a  UR                  U5        M4  UR                  [        R                  :X  d  MT  UR                  U5        Mg     U R                  (       d  UR                  U R                  5       5        X4-   n[        5       nU R                  UUUUU R                  S9u  ppnU R                  U5        U(       d  U	(       a  S U
SS4$ XX4$ )Nr~   r   )r&   valuesrk   r   rs   r2   rp   r,   extendr   r)   r   r~   r   r   r5   r6   priority_statessecond_priority_statesr/   r}   rh   r   r   r   r   r   s                r    r9   FIFOScheduler.schedule_batchU  s    /157))002E||}555&&u-!9!99&--e4	 3 &&"))$*F*F*HI$=
-0U*$$2"00 %  	_3CS^ 	##$FG "&;)1a//!\NNr#   r   )g?)r   r   r   r   r   r
   r   r!   r   r   r   r   r   r<   r9   r   __classcell__)r   s   @r    r   r   G  sl    q+1 +% + + #O#O/2#O	t&'$.c3>	?#O #Or#   r   c                   R    \ rS rSrSr\S\S\S\\\	   S-  \
\\4   4S j5       rSrg)	PrefillFirstScheduleri~  zScheduler that prioritizes split prefill requests over decoding requests. This scheduler ensures that split
prefill requests (which are continuations of partially processed prompts) are completed before processing new
decoding requests.r5   r6   r$   Nc                    / n/ nU R                   R                  5        He  nUR                  [        R                  :X  a  UR                  U5        M4  UR                  [        R                  :X  d  MT  UR                  U5        Mg     U R                  (       d  UR                  U R                  5       5        X4-   n[        5       nU R                  UUUUSS9u  ppnU R                  U5        U(       d  U	(       a  S U
SS4$ XX4$ )Nr   r   r   )r&   r   rk   r   rp   r2   rs   r,   r   r   r)   r   r   r   s                r    r9   $PrefillFirstScheduler.schedule_batch  s     /157))002E||}777&&u-!7!77&--e4 3 &&"))$*F*F*HI$=
-0U*$$2! %  	_3CS^ 	##$FG "&;)1a//!\NNr#   r8   )r   r   r   r   r   r   r   r   r   r   r<   r9   r   r8   r#   r    r   r   ~  sQ     $O$O/2$O	t&'$.c3>	?$O $Or#   r   )fifoprefill_first)r   abcr   r   collectionsr   utils.metricsr   r   r   r
   requestsr   r   r   r   r   r   r   SCHEDULER_MAPPINGr8   r#   r    <module>r      s}     #  2 & M Ml
 l
`	 1OI 1O 1Ol *OI *O *O\ * r#   