
    Z j@#                     r   S SK Jr  S SKJr  S SKJrJr  S SKJr  S SK	r	S SK
Jr  S SKJr  SS	KJrJrJrJr   " S
 S5      r\ " S S5      5       rS\S\4S jrS\S\S\S\4S jrS'S\S\S\S\4S jjrS\S\S\S\4S jr S(S\	R6                  S\\   S\\   S\SS4
S  jjrS!\S"\S#\S$\S%\S\\   4S& jrg))    )OrderedDict)	dataclass)ceillog2)AnyN)PretrainedConfig)ContinuousBatchingConfig   )FutureRequestStateRequestStateRequestStatusloggerc                       \ rS rSrSrS\SS4S jrSS jrS\\S	4   S\	R                  R                  S-  4S
 jrSS\SS4S jjrS\\S	4   S\	R                  R                  SS4S jrSrg)CudaGraphBuffer   z>A fixed-size dict for CUDA graphs with LRU eviction when full.max_sizereturnNc                 V    US::  a  [        SU 35      eXl        [        5       U l        g )Nr   z#max_size must be positive, but got )
ValueErrorr   r   _storage)selfr   s     ڂ/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/generation/continuous_batching/utils.py__init__CudaGraphBuffer.__init__   s*    q=B8*MNN LWM    c                 T    U R                   nSU l         U R                  SS9  Xl         g )Nr
   T)silent)r   plan_for_new_graph)r   original_max_sizes     r   __del__CudaGraphBuffer.__del__$   s)     MMt,)r   key.c                 x    U R                   R                  U5      nUb  U R                   R                  U5        U$ N)r   getmove_to_endr   r"   graphs      r   	get_graphCudaGraphBuffer.get_graph*   s3    !!#&MM%%c*r   r   c                 .   [        U R                  5      U R                  :  ar  U R                  R                  SS9u  p#U(       d  [        R
                  " SU< 35        UR                  5         [        U R                  5      U R                  :  a  Mq  g g )NF)lastz!Evicting graph for evicted_key = )lenr   r   popitemr   inforeset)r   r   evicted_keyevicted_graphs       r   r   "CudaGraphBuffer.plan_for_new_graph0   sl    $-- DMM1)-)>)>E)>)J&K@+1ABC!	 $-- DMM1r   r(   c                 @    U R                  5         X R                  U'   g r$   )r   r   r'   s      r   	set_graphCudaGraphBuffer.set_graph7   s    !"cr   )r   r   )r   N)F)__name__
__module____qualname____firstlineno____doc__intr   r    tupletorchcuda	CUDAGraphr)   boolr   r5   __static_attributes__ r   r   r   r      s    HZ Z Z*U38_ 1E1E1L " "$ "#U38_ #UZZ5I5I #d #r   r   c                   @    \ rS rSr% SrSr\\S'   Sr\\S'   S	S jr	Sr
g)
WorkloadHints=   zRA tiny dataclass containing hints to help choose good continuous batching defaultsr   max_prompt_lengthmax_generated_lengthNc                     U R                   (       ai  U R                  (       aW  UR                  cI  U R                   U R                  -   n[        [	        X!R
                  -  5      5      S-   nX3S-  -   Ul        gggg)z*Resolves the config using the given hints.Nr
      )rG   rH   max_blocks_per_requestr<   r   
block_size)r   	cb_configmax_sequence_lengthblocks_per_requests       r   resolve_using_hints!WorkloadHints.resolve_using_hintsE   sw     !!d&?&?//7&*&<&<t?X?X&X#%(.ADXDX.X)Y%Z]^%^"3E^_I_3`	0 8 '@!r   rC   )rM   r	   r   N)r7   r8   r9   r:   r;   rG   r<   __annotations__rH   rP   rB   rC   r   r   rE   rE   =   s!    \s !#!ar   rE   configr   c                      U R                   S;   $ )z:Checks if attention mask is needed for the given (config).)zpaged|eagerz
paged|sdpa)_attn_implementation)rS   s    r   attn_mask_is_neededrV   O   s    &&*GGGr   sizeinterval_size	max_valuec                 X    US::  a  U$ U S:  a  [        X-  5      U-  OUn[        X25      $ )zQReturn the smallest multiple of (interval_size) >= (size), capped at (max_value).r   )r   min)rW   rX   rY   paddeds       r   pad_to_intervalr]   T   s5    ;?!8T$&'-7Fv!!r   value	min_valuec                     [        U [        SU5      5      n S[        [        [        U 5      5      5      -  n[	        X15      $ )zReturn the smallest power of 2 >= (value), capped at (max_value). If a minimum value is provided, the value is at
least padded to that value.r
   rJ   )maxr<   r   r   r[   )r^   rY   r_   r\   s       r   pad_to_pow2rb   \   s:     s1i()E#d4;'((Fv!!r   x	divide_byalign_toc                 V    [        [        X-  5      5      n X-  (       a	  XX-  -
  -  n U $ r$   )r<   r   )rc   rd   re   s      r   aligned_dividerg   d   s,    D A|	&&Hr   attention_maskcumulative_seqlens_qcumulative_seqlens_ksliding_windowc                 *   [         R                  " U R                  5      R                  n[	        [        U5      S-
  5       H  nXS-      X   -
  nX%S-      X%   -
  nXg:  a  US:  a  Xv-
  S-   nOSn[        X   XS-      5      n	[        X%   X%S-      5      n
[         R                  " U SX4   R                  UU R                  U R                  S9n[         R                  " XS9nUS:  a  Xv-
  U-
  nU[         R                  " XS9-  nXSX4'   M     g)u~  Builds an attention mask inplace using the cumulative seqlens of the query and key. If given a sliding window, it
will also apply a sliding window mask on top. The attention mask is not boolean, it uses zeroes and -inf (or its
equivalent) so it's more of an attention score bias tensor.
The attention mask is a block-diagonal matrix, with each block an attention mask for a single query-key pair.
Each of those block is built from a causal mask and, if there is a sliding window, a sliding window mask.

An example is represented below, with seqlen_k = 8, seqlen_q = 4 and sliding_window = 6:

CAUSAL MASK:

       █ █ █ █ █ ░ ░ ░
       █ █ █ █ █ █ ░ ░
       █ █ █ █ █ █ █ ░
       █ █ █ █ █ █ █ █

SLIDING WINDOW MASK:
     ┌──────────────────────── seqlen_k - seqlen_q - sliding_window = 8 - 4 - 6 = -2 offset to the left
   <─┴─>
 ░ █ | █ █ █ █ █ █ █ █
 ░ ░ | █ █ █ █ █ █ █ █
 ░ ░ | ░ █ █ █ █ █ █ █
 ░ ░ | ░ ░ █ █ █ █ █ █

ATTENTION MASK (sum of causal and sliding window masks):

       █ █ █ █ █ ░ ░ ░
       █ █ █ █ █ █ ░ ░
       ░ █ █ █ █ █ █ ░
       ░ ░ █ █ █ █ █ █

Another example with seqlen_k = 5, seqlen_q = 3 and sliding_window = 2:

CAUSAL MASK:

       █ █ █ ░ ░
       █ █ █ █ ░
       █ █ █ █ █

SLIDING WINDOW MASK:
     ┌──────────────────────── seqlen_k - seqlen_q - sliding_window = 5 - 3 - 2 = 0 offset to the left
    <┴>
     | ░ █ █ █ █
     | ░ ░ █ █ █
     | ░ ░ ░ █ █

ATTENTION MASK (sum of causal and sliding window masks):

       ░ █ █ ░ ░
       ░ ░ █ █ ░
       ░ ░ ░ █ █

r
   .)dtypedevice)diagonalN)r>   finform   r[   ranger-   slicefullshapern   triutril)rh   ri   rj   rk   r_   iseqlen_qseqlen_kcausal_diagonalquery_range	key_range	minus_infmaskedsliding_diagonals                 r   build_attention_maskr   k   s-   t N00155I3+,q01'A.1E1HH'A.1E1HH8q=&1A5OO035Ia%5PQ.13GA3NO	JJ367== &&!((	
	 I@A'2^CejjFFF6<sK23- 2r   numstatusnum_query_tokensnum_cache_tokenscachec           
      |   [        U 5       Vs/ s H  nSUR                   SU S3PM     nnX#-   n[        XtR                  -  5      n/ n	U Hg  n
[	        U
S/U-  SS9nXl        S/U-  Ul        X;l        UR                  XR                  S5      nUc  U	s  $ U	R                  [        USSUS95        Mi     U	$ s  snf )	zQAn utility function to create a list of FutureRequestStates for the warmup of CB.	__warmup____r   r
   )
request_idinitial_tokensmax_new_tokensT)has_new_tokencomplete_blocksquery_length)rq   namer   rL   r   _statustokens_to_processposition_offsetallocate_blocksr   appendr   )r   r   r   r   r   rw   request_idstotal_tokensblocks_neededfuture_statesreq_idstate	allocateds                r   create_warmup_future_statesr      s     =B#JGJqYv{{m1QCr2JKG#6L(8(889MMs\?Qbcd#$#(8"8 0))-9I9I1M	  uD!Zjk	
  # Hs   B9)r   )r
   )collectionsr   dataclassesr   mathr   r   typingr   r>    transformers.configuration_utilsr   +transformers.generation.configuration_utilsr	   requestsr   r   r   r   r   rE   rA   rV   r<   r]   rb   rg   Tensorlistr   r   rC   r   r   <module>r      sQ   $ !    = P M M# #D a a a"H 0 HT H
"# "c "c "c ""s "s "s "3 "c c S S  	Q=LLQ=s)Q= s)Q= 	Q=
 
Q=h	  	
  

r   