
    Z j                        S SK Jr  S SKJr  S SKJr  S SKJr  S SKrS SK	J
r
  SSKJr  SS	KJr  S
SKJr  S
SKJr  S
SKJrJrJr  S
SKJrJrJrJrJr  \ " S S5      5       r " S S5      r " S S5      r " S S5      rg)    )nullcontext)	dataclass)partial)AnyN)PretrainedConfig   )get_available_devices)traced   )PagedAttentionCache)%ContinuousBatchingLogitsProcessorList)TMP_TOKEN_IDFutureRequestStatelogger)CudaGraphBufferaligned_divideattn_mask_is_neededbuild_attention_maskpad_to_pow2c                      \ rS rSr% Sr\R                  \S'   \R                  \\	\R                  4   -  S-  \S'   \R                  \S'   \R                  \S'   \R                  \\	\R                  4   -  \S'   \
\S	'   \
\\	\
4   -  \S
'   \\R                     \S'   \\R                     \S'   \R                  \S'   \\S'   \R                  S-  \S'   \R                  \S'   Sr\\S'   S\\	\4   4S jrSrg)PagedAttentionArgs   a  Dataclass containing the keyword arguments for a forward pass using paged attention.

Attributes:
    input_ids: Input token IDs tensor of shape `(1, total_query_tokens)`.
    attention_mask: Attention mask tensor or dictionary mapping layer types to masks. Can be `None` if the
        attention implementation doesn't require explicit masks.
    position_ids: Position IDs tensor of shape `(1, total_query_tokens)`.
    cu_seq_lens_q: Cumulative sequence lengths for queries, used for variable-length batching.
    cu_seq_lens_k: Cumulative sequence lengths for keys/values. Can be a tensor or dictionary mapping layer
        types (e.g., "full_attention", "sliding_attention") to tensors for hybrid models.
    max_seqlen_q: Maximum query sequence length in the batch.
    max_seqlen_k: Maximum key/value sequence length. Can be an int or dictionary for hybrid models.
    write_index: List of tensors indicating where to write new KV states in the cache, one per attention group.
    read_index: List of tensors indicating which cache positions to read from, one per attention group.
    logits_indices: Tensor indicating which positions in the output should be used for next-token prediction.
    cache: The [`PagedAttentionCache`] instance managing the KV cache.
    block_table: Block table for paged KV cache. If provided, uses `flash_attn_with_kvcache` for fused attention +
        cache update. More information in src/transformers/integrations/flash_paged.py
    logits_processor_args: List of tensors containing the arguments for the logits processors, one per request.
    use_cache: Whether to use caching (always `False` in continuous batching as the cache is managed externally).
	input_idsNattention_maskposition_idscu_seq_lens_qcu_seq_lens_kmax_seqlen_qmax_seqlen_kwrite_index
read_indexlogits_indicescacheblock_tablelogits_processor_argsF	use_cachereturnc                 <   U R                   U R                  U R                  U R                  U R                  U R
                  U R                  U R                  U R                  U R                  U R                  U R                  U R                  U R                  S.$ )Nr   r   r   r   r   r   r   r    r!   r"   r#   r$   r%   r&   r)   selfs    ڊ/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/generation/continuous_batching/input_outputs.pyasdictPagedAttentionArgs.asdictF   s    "11 --!//!// -- --++//"11ZZ++%)%?%?
 	
     )__name__
__module____qualname____firstlineno____doc__torchTensor__annotations__dictstrintlistr   r&   boolr   r-   __static_attributes__r0   r/   r,   r   r      s    , ||LL4U\\(9#::TAA,,<<<<$sELL'8"999S#X&&ell##U\\""LL $$ <<'It
S#X 
r/   r   c                   T   \ rS rSr% SrSr\\S'    S0S\S\	S\
R                  S\
R                  S	\S
\S\S\SS4S jjr\" SS9S\SS4S j5       r S0SS S\
R$                  R&                  S\SS4S jjr\\
R*                  " 5       S0S\SS4S jj5       5       rS1S jrS\\
R2                  \\\
R2                  4   4   4S jrS\
R2                  S\
R2                  S\
R2                  SS4S jrS1S jrS\\\    \\   \\!   S-  4   4S  jr"\S!\\    S"\S#\S$\S%\SS4S& j5       r#S0S'\S\\\$4   4S( jjr%S\\
R2                  \
R2                  \
R2                  4   4S) jr&S\\S*4   4S+ jr'S\
R$                  RP                  S-  4S, jr)S-\
R$                  RP                  SS4S. jr*S/r+g)2ContinuousBatchingIOsY   zA class to hold inputs and outputs for a continuous batching forward pass, using static tensors as storage. The
class is meant to be self-contained, so once a set of inputs have been created, the class can be used to update the
batch alone.
   static_inputsr#   configdevicemodel_dtype
max_graphsreturn_logprobslogit_processoruse_cuda_graph_varlenr'   Nc	                    Xl         X0l        X l        X@l        Xl        [        USS5      c  SOUR                  U l        X`l        SU l        SU l	        SU l
        [        UR                  5       V	s/ s H  n	SPM     sn	U l        [        UR                  5       V	s/ s H  n	SPM     sn	U l        SU l        / U l        0 U l        [%        U5      U l        UR(                  U l        U R-                  US9  U R/                  SS9  UR0                  S	:X  a-  [2        R4                  R7                  U R                  S
9U l        gSU l        gs  sn	f s  sn	f )a  Initialize the continuous batching I/O manager. Args:
- cache: The [`PagedAttentionCache`] instance managing the KV cache. Meant to be unique.
- config: The model's pretrained configuration.
- device: The device to allocate tensors on. If the device is CPU, then the memory is pinned.
- model_dtype: The data type for model computations.
- max_graphs: Maximum number of CUDA graphs to cache. Uses LRU eviction when full.
- return_logprobs: Whether to return log probabilities along with the token IDs.
- logit_processor: The [`ContinuousBatchingLogitsProcessorList`] object used to process the logits.
- use_cuda_graph_varlen: Whether CUDA graphs are enabled for the varlen (prefill) path.
sliding_windowNr   r   F)rI   T
full_resetcudarE   )r#   rE   rD   rF   rJ   getattrrL   rH   num_q_tokensmax_kv_readtrue_batch_sizerange
num_groupstrue_read_sizestrue_write_sizesuse_block_tablerequests_in_batchreq_id_to_new_token_positionr   graphstrash_index_trash_index_setup_static_tensors_reset_static_tensorstyper6   rO   Streamcompute_stream
r+   r#   rD   rE   rF   rG   rH   rI   rJ   _s
             r,   __init__ContinuousBatchingIOs.__init__a   s4   , 
&%:"#*63CT#J#RaX^XmXm. +01A1A+BC+Ba+BC,1%2B2B,C D,Cq,C D$;=<>)'6z'B!--""?"C""d"3GM{{V\G\ejj//t{{/Cbf  D Ds   2E
ET)
standalonec                    U R                   R                  nU R                   R                  nU R                   R                  U R                   R                  -  nU R
                  R                  S:H  =(       a    [        [        5       5      S:  nU R                  UR                  -   n[        US-   SS5      n[        R                  " Xg4[        R                  U R
                  US9U l        [        R                  " UR                  S4[        R                  U R
                  S9U l        UR#                  U R                   5        U R                  SSU24   U l        U R                  SSU24   U l        U R                  SSUS-   24   U l        U R                  S	SU24   U l        U R                  S
SUS-   24   nU R                  SSUS-   24   n	U R                  SSU24   U l        0 U l        U R                   R0                  (       a  XR.                  S'   U R                   R2                  (       a  XR.                  S'   U R4                  (       a  SOSn
[        R                  " XS-   4[        R                  U R
                  US9U l        U R6                  R9                  5         SU l        SU l        [>        RA                  U R.                  RC                  5       S5      U l"        [G        U RH                  5      (       ag  0 U l%        U R.                  RC                  5        HA  n[        R                  " SSX4U-   4U RL                  U R
                  US9U RJ                  U'   MC     OSU l%        U R                   RN                  S:  a  UOSn[        R                  " XU R                   RN                  4[        R                  U R
                  US9U l(        [        R                  " X#4[        RR                  U R
                  US9U l*        [        R                  " X$U-   4[        RR                  U R
                  US9U l+        g)a  Allocates static tensors for generation inputs and outputs. This is called only once at init time, to avoid
repeated allocations and enable CUDA graphs. All tensors are allocated with maximum possible sizes.
The allocated tensors are:

- `_bulk_input_tensor`: Storage for all the small inputs: `input_ids`, `position_ids`, `cumulative_seqlens_q`,
  `logits_indices`, `cumulative_seqlens_k`, `carry_over_ids`.
- `attention_mask`: Optional attention masks (only for eager/SDPA implementations)
- `write_index` and `read_index` storage: Cache indexing tensors for each attention group
- `output_ids`: Storage for generated token IDs and maybe log probabilities if return_logprobs is True
cpur       )dtyperE   
pin_memoryrl   rE   r   N   r            full_attentionsliding_attention)sizerl   rE   rm   ),r#   rV   max_batch_tokens
num_blocks
block_sizerE   ra   lenr	   rC   tensors_requiredr   r6   emptyint32_bulk_input_tensorlogits_processors_defaultsfill_defaultsr   r   cumulative_seqlens_qr"   carry_over_idscumulative_seqlens_knum_full_attention_groupsnum_sliding_attention_groupsrH   
output_idszero_total_seqlen_qr   r9   fromkeyskeysr   r   rD   r   rF   max_blocks_per_requestr$   int64write_index_storageread_index_storage)r+   rI   rV   rv   	num_pagesrm   
bulk_linesbulk_columns#full_attention_cumulative_seqlens_k&sliding_attention_cumulative_seqlens_knum_output_rows
layer_typens                r,   r_   +ContinuousBatchingIOs._setup_static_tensors   s    ZZ**
::66JJ))DJJ,A,AA	[[%%.S37L7N3ORS3S
 ''/*J*JJ
%&6&:ArB"'++&ekk$++Zd#
 +0++--q1T[[+
' 	%%d&E&EF004E5E4E1EF 33A7H8H7H4HI$($;$;A?UAQTUAU?U<U$V!"55a9J:J9J6JK.2.E.EaI_K[^_K_I_F_.`+151H1HLbN^abNbLbIb1c."55a9J:J9J6JK >@!:://:]%%&67::22=c%%&9:  $33!++235;;t{{gq
 	 MM$*C*C*H*H*JAN t{{++"$D"77<<>
27++Q 0>N2NO**;;)	3##J/ ? #'D **;;a?JQ ;;$**"C"CD++;;!	
 $);;*%++dkk^h$
  #(++%556ekkRVR]R]jt#
r/   otherstreamnon_blockingc                 .   U R                   Ul         U R                  Ul        U R                  Ul        U R                  S S  Ul        U R                  S S  Ul        U R
                  Ul        U R                  Ul        U R                  Ul        [        U R                  R                  5       5      Ul	        Ub  [        R                  R                  U5      O	[        5       nU   UR                  R!                  U R                  US9  U R
                  (       a%  UR"                  R!                  U R"                  US9  OXUR$                  R!                  U R$                  US9  U R                  S:  a$  UR&                  R!                  U R&                  US9  U R(                  bX  UR(                  bK  U R(                  R+                  5        H-  nUR(                  U   R!                  U R(                  U   US9  M/     S S S 5        g ! , (       d  f       g = f)Nr   r   )rR   rS   rT   rW   rX   rY   r   r   r9   r   itemsr6   rO   r   r   r}   copy_r$   r   r   r   r   )r+   r   r   r   maybe_streamr   s         r,   _transfer_inputs&ContinuousBatchingIOs._transfer_inputs   s    ".. ,, $ 4 4 $ 4 4Q 7!%!6!6q!9 $ 4 4#22!..!$"3"3"9"9";<4:4Fuzz((0KM$$**4+B+BQ]*^##!!''(8(8|'T ))//0H0HWc/d##a',,2243J3JYe2f"".53G3G3S"&"5"5":":"<J((4::4;N;Nz;Ziu:v #= \\s   %DH
HrN   c                 J   U(       a  U R                   R                  S5      OU R                  nU(       a  U R                  R                  S5      OU R                  nU R
                  SU R                  2SUS-   24   R                  5         U(       a#  U R                  U R
                  U R                  S& SU l	        U R                  SU R                  5         U R                  SS2SU24   R                  5         U R                   Hx  nSU R                  U'   U R                  c  M!  U R                  U   SS2SS2SU2SX#-   24   R                  [         R"                  " U R$                  5      R&                  5        Mz     U(       a  U R(                  SS2SU24   R                  S5        U R                   SS2SU24   R                  U R*                  5        U R                  SS2SX#-   24   R                  U R*                  5        gU R,                  (       a%  U R(                  SS2SU24   R                  S5        gU R                   SS2SU24   R                  U R*                  5        U R                  SS2SX#-   24   R                  U R*                  5        g)a  Reset static tensors for the next batch. For efficiency, this only resets the portions of tensors that were
actually used in the previous batch, using the attributes num_q_tokens and max_kv_read. If a (full_reset)
is requested, the entire tensor storage is reset.
Nr   r   )r   ru   rR   r   rS   r}   rC   r   r~   r   r"   r   r   r   r   fill_r6   finforF   minr$   r^   rY   )r+   rN   q_lenkv_lenr   s        r,   r`   +ContinuousBatchingIOs._reset_static_tensors  sJ    6@((--b1TEVEV5?((--b1TEUEU 	 4$"4"4 4k	k ABHHJ<@<[<[D##D$6$6$89 	FU#))+6E6	"((* 33J,-Dj)"".##J/1fuf>N>N0NOUUKK 0 0155 4 QY'--b1$$QY/55d6G6GH##A'7'7$78>>t?P?PQ!!QY'--b1 $$QY/55d6G6GH##A'7'7$78>>t?P?PQr/   c                     U R                  SS9  / U l        0 U l        U R                  b  U R                  R	                  5         gg)z4Reset all relevant states for a new generation loop.TrM   N)r`   rZ   r[   rc   synchronizer*   s    r,   resetContinuousBatchingIOs.reset0  sF    ""d"3!#,.)*++- +r/   c                 2    U R                   U R                  4$ )z:Get the cumulative sequence lengths for the current batch.)r   r   r*   s    r,   get_cumulative_seqlens,ContinuousBatchingIOs.get_cumulative_seqlens9  s    (($*C*CCCr/   r   r   prev_output_idsc                     g Nr0   )r+   r   r   r   s       r,   carry_over_tokens'ContinuousBatchingIOs.carry_over_tokens=  s     	r/   c                 T    U R                   b  U R                   R                  5         g g r   )rc   r   r*   s    r,   retrieve_device_outputs-ContinuousBatchingIOs.retrieve_device_outputsB  s%    *++- +r/   c                 P   U R                   nU R                  SS [        U R                   5      24   R                  5       nU R                  (       aP  U R                  SS [        U R                   5      24   R                  [        R                  S9R                  5       nOS nXU4$ )Nr   r   rl   )rZ   r   ry   tolistrH   viewr6   float32)r+   rZ   
new_tokenslogprobss       r,   prepare_batch_update*ContinuousBatchingIOs.prepare_batch_updateF  s     22__Q(E#d.D.D*E(E%EFMMO
q*GC0F0F,G*G'GHMMTYTaTaMbiikH H h66r/   rZ   logits_processorsuse_decode_fast_pathrR   rS   c           	         U(       d  [        S5      eU=(       a    U R                  R                  5       S:  U l        X@l        U R                  (       a  SOUU l        [        U5      U l        U R                  5         [        U R                  R                  5       Vs/ s H  nSPM     snU l        [        U R                  R                  5       Vs/ s H  nSPM     snU l        / U l        0 U l        / n/ nS/n	/ n
U R                   R#                  5        Vs0 s H  oS/_M     nn[        U R                  R                  5       Vs/ s H  n/ PM     nnU R
                  S:X  a  SO/[        U R                  R                  5       Vs/ s H  n/ PM     snn[%        U5       GH   u  nnUR&                  nUR(                  nUR*                  nU R                  R-                  UU5      nU=R(                  U-  sl        UR/                  UR0                  5        UR/                  [        UUU-   5      5        U	R3                  U	S   U-   5        [5        U R6                  U5      U l        UR9                  5        HG  u  nnX   R3                  X   S   U-   5        [5        U R:                  U   U5      U R:                  U'   MI     U R                  (       a:  U R                  R=                  UR>                  UUU R                  SS2U4   5        O(U R                  RA                  UR>                  UUX5        URB                  (       a?  U
R3                  U	S   S-
  5        [D        /Ul        U
S   U R                  UR>                  '   U R                  R3                  U5        GM     URG                  UU RH                  U RJ                  S S9  [M        [N        RP                  [N        RR                  U RT                  S9nU" U5      U RV                  S[        U5      & U" U5      U RX                  S[        U5      & U" U	5      U RZ                  S[        U	5      & U" U
5      U R\                  S[        U
5      & U	S   U l/        UR9                  5        Ha  u  nnU" U5      U R                   U   S[        U5      & U R`                  c  M6  [c        U R`                  U   U	UUS:X  a  U Rd                  OSS	9  Mc     U R                  (       d  [M        [N        RP                  [N        Rf                  U RT                  S9n[%        U5       H@  u  nnU" U5      U Rh                  US[        U5      24'   [        U5      U R                  U'   MB     UbP  [%        U5       H@  u  nnU" U5      U Rj                  US[        U5      24'   [        U5      U R                  U'   MB     gggs  snf s  snf s  snf s  snf s  snf )
at  Prepare tensors and metadata for the next model forward pass, using the given requests as data. This method:

1. Resets the static tensors from the previous batch
2. Iterates through requests to accumulate input_ids, position_ids, and sequence lengths
3. Extends read/write indices for cache management
4. Builds attention masks if needed (for eager/SDPA implementations)
5. Converts accumulated lists to tensors and copies them to static storage

This method also modifies the `position_offset` attribute of each request to track progress and adds a
temporary token at the end of the requests for which there will a new token.
zNo requests in batchr   Nr   r   )rZ   arg_storagern   rt   )r   r   r   rL   )6
ValueErrorr$   numelrY   rR   rS   ry   rT   r`   rU   r#   rV   rW   rX   rZ   r[   r   r   	enumeratestateposition_offsetquery_lengthget_seqlens_kextendtokens_to_processappendmaxr   r   r   fill_block_table
request_idextend_read_and_write_indiceshas_new_tokenr   prepare_tensor_argsr}   rC   r   r6   tensorr|   rE   r   r   r   r"   r   r   r   rL   r   r   r   )r+   rZ   r   r   rR   rS   re   r   r   r   r"   r   r   r    r!   ifuture_stater   past_lengthr   	seqlens_klayer_type_seqlen_k	to_tensorlayer_type_seqlens_kto_index_tensorgroup_write_indicesgroup_read_indicess                              r,   prepare_batch_tensors+ContinuousBatchingIOs.prepare_batch_tensorsQ  s}   * !344  4T8H8H8N8N8PST8T( $ 4 41+"#45""$ ,11F1F+GH+Ga+GH,1$**2G2G,H I,Hq,H I!#,.) 	 !sBFB[B[B`B`BbcBbJQCBbc#()>)>#?@#?ar#?@!--2TU4::K`K`Ea8bEaEa8b
  )):;OA| &&E//K'44L

00lKI !!\1! U445k;3M NO ''(<R(@<(OP #D$5$5| DD 4=??3D/
/$0778L8XY[8\_r8rs03D4E4Ej4QSf0g!!*- 4E
 ##

++E,<,<k<Y]YiYijkmnjnYop

88$$k<
 ))%%&:2&>&BC+7.'FTUWFX11%2B2BC"")),7G  <L 	--///0B0B0DE 	. 	
 ELLDKKP	 ,5Y+?'Y(1:<1H-C-.AJK_A`!!"=C(<$=>5>~5N1c.12226 1E0J0J0L,J,QZ[oQpD%%j12MC8L4MN"".$#'#6#6z#B)=)=:DH[:[4#6#6ab	 1M ##%ell%++dkkZO*3K*@&&JYZmJn((,Fc2E.F,F)FG+./B+C%%a( +A %-6z-B)A)L[\nLoD++A/H5G1H/H,HI.12D.ED((+ .C & $e  I I  d@8bs   W W%W*W/W4use_paddingc                 n   U R                   nU R                  U R                   -   nU(       a  U R                   OU R                  n[        U R                  SU R                  S5      U R                  SU R                  S5      U R                  SUS-    U R                  U R                  SU U R                  U R                  S2SU24   0 0 U R                  c  SO0 / / U R                  U R                  (       a  U R                  SS2SU24   OSSS9nU(       a  U R                   UR"                  U R                  S-   S& U R                  (       dt  U R$                  (       ac  X l        U R&                  R)                  5        Vs0 s H2  nU[+        U R&                  U   U R                  R,                  S5      _M4     snU l        U R                  (       a  SOU R                  Ul        [/        U R                  R0                  5       H  nU(       a  UOU R2                  U   nUR4                  R7                  U R8                  USU24   5        U R                  S:X  a  Sn	OU(       a  UOU R:                  U   n	UR<                  R7                  U R>                  USU	24   5        M     U R@                  RC                  5        H  u  pjU
SUS-    URD                  U'   U(       a+  XR                     URD                  U   U R                  S-   S& U R                  (       a  SOU R&                  U   UR&                  U'   U R                  c  M  U(       a  UOX   nU R                  U   SSU2SU24   UR                  U'   M     [G        U R@                  R)                  5       5      S:X  as  URD                  RI                  5       S   Ul"        UR&                  RI                  5       S   Ul        U R                  b"  UR                  RI                  5       S   Ul        URK                  5       $ s  snf )zGet model keyword arguments for the current batch, eventually padding the query dimension and KV dimensions
if use_padding is True. The padding is only useful if we want static shapes, like when using cuda graphs.Nr   r   F)r   r   r   r   r"   r%   r   r   r   r!   r    r#   r$   r&   i   .)&rR   rS   rT   r   r   	unsqueezer   r   r   r"   r}   rC   r   r#   rY   r$   r   r   rJ   r   r   r   r   rU   rV   rX   r    r   r   rW   r!   r   r   r   r   ry   popitemr-   )r+   r   q_sizekv_size
batch_sizekwargsr   r   write_index_sizeread_index_sizer   k_lens               r,   get_model_kwargs&ContinuousBatchingIOs.get_model_kwargs  s    """"T%6%66*5T&&4;O;O
 $nnWf-77:**7F3==a@334Dj1nE**..w7"&"9"9$:L:L:NPWQWPW:W"X#'#6#6#>4B**<@<P<P((KZK8VZ
$ ?C?R?RF  !5!5!9!;< ''D,F,F$*! '+&7&7&<&<&>%&>
 D,=,=j,I4::K_K_ae ff&>%! $(#7#7aT=N=N tzz,,-A)4v$:O:OPQ:R%%d&>&>qBSCSBS?S&TU1$"#-8'd>R>RST>U$$T%<%<Q@P@P=P%QR . &*%>%>%D%D%F!J/89I:>/JF  ,OXYmYmOn$$Z01E1E1I1KL373G3GaTM^M^_iMjF
+"".#.I4I484G4G
4STWY`Z`Y`bhchbhTh4i%%j1 &G t((--/0A5#)#7#7#?#?#A!#DF "("5"5"="="?"BF"".(.(=(=(E(E(G(J%}}K%s   9P2c                 H    U R                   U R                  U R                  4$ )a  Returns the tensors used inside the generation step that are not inputs to the model forward pass. In
synchronous batching, there is no carry over, so the only tensor that will be used is output_ids, but we still
return 3 tensors to have the same interface as when using async batching.)r   r   r*   s    r,   get_cb_kwargs#ContinuousBatchingIOs.get_cb_kwargs  s     ""DOOT__DDr/   .c                     U R                   (       a  U R                  4$ U R                  U R                  /U R                  R	                  5       Q7$ r   )rY   rR   rS   r   valuesr*   s    r,   _get_graph_key$ContinuousBatchingIOs._get_graph_key  sE    %%''!!4#3#3Qd6G6G6N6N6PQQr/   c                     U R                  5       nU R                  R                  U5      nUc4  U R                  R                  5         [        R
                  " SU< 35        U$ )NzCreating graph for key = )r   r\   	get_graphplan_for_new_graphr   info)r+   keygraphs      r,   r   ContinuousBatchingIOs.get_graph#  sR    !!#%%c*=KK**,KK4cX67r/   r   c                     U R                  5       nU R                  R                  X!5        [        R                  " SU< 35        g )NzSetting graph for key = )r   r\   	set_graphr   r   )r+   r   r   s      r,   r   ContinuousBatchingIOs.set_graph,  s6    !!#c)/12r/   )"r}   r^   r   r$   r#   r   rc   rD   r   r   rE   r\   r   r"   r~   rS   r   r   rF   rR   r   r   r   r[   rZ   rH   rL   r   rT   rW   rX   rY   rJ   r   Fr'   N),r1   r2   r3   r4   r5   rC   r;   r8   r   r   r6   rE   rl   r=   r   rf   r
   r_   rO   rb   r   no_gradr`   r   tupler7   r9   r:   r   r   r   r<   r   floatr   r   r   r   r   r   	CUDAGraphr   r   r>   r0   r/   r,   r@   r@   Y   s   
 M3 ',,g",g !,g 	,g
 [[,g ,g ,g ?,g  $,g 
,g\ tT
5Z T
_c T
 T
p _dw,w6;jj6G6GwW[w	w> 
]]_&R &R &R  &RP.DellDellAR<S.S(T D7<||V[VbVb	
.	7eD1C,Dd3iQUV[Q\_cQc,c&d 	7 {F 23{F A{F #	{F
 {F {F 
{F {FzED ET#s(^ ENEuU\\5<<%MN ERc3h R5:://$6 3uzz33 3 3r/   r@   c                       \ rS rSr SS\S\S\R                  S\R                  S\	S\
S\S	\
S
S4S jjrSS jrS\R                  R                  S
S4S jrS\R                  R                  S-  S
S4S jrSrg)HostDeviceIOPairi2  r#   rD   rE   rF   rG   rH   rI   rJ   r'   Nc	           
      D   [        UU[        R                  " S5      UUUUU5      U l        [        UUUUUUUU5      U l        [        R
                  R                  5       (       a  [        R
                  R                  5       OS U l        [        R
                  R                  5       (       a  [        R
                  R                  5       OS U l	        [        R
                  R                  5       (       a$  [        R
                  R                  5       U l
        g S U l
        g )Nrj   )r@   r6   rE   host_io	device_iorO   is_availableEventh2d_overcompute_overd2h_over)	r+   r#   rD   rE   rF   rG   rH   rI   rJ   s	            r,   rf   HostDeviceIOPair.__init__3  s     -LL!	
 /!	
 /4jj.E.E.G.G

((*T27**2I2I2K2KEJJ,,.QU.3jj.E.E.G.G

((*Tr/   c                     U R                   R                  5         U R                  R                  5         U R                  U R                  U R
                  4 H  nUc  M  UR                  5         M     g r   )r	  r   r
  r  r  r  r   )r+   events     r,   r   HostDeviceIOPair.resetX  sT    mmT%6%6FE !!# Gr/   r   c                 N    U R                   R                  U R                  USS9  g )NT)r   r   )r	  r   r
  )r+   r   s     r,   transfer_inputs_h2d$HostDeviceIOPair.transfer_inputs_h2d_  s     %%dnnVRV%Wr/   c                    Ub  [         R                  R                  U5      O	[        5       nU   U R                  R
                  R                  U R                  R
                  SS9  S S S 5        g ! , (       d  f       g = f)NTr   )r6   rO   r   r   r	  r   r   r
  )r+   r   r   s      r,   transfer_outputs_d2h%HostDeviceIOPair.transfer_outputs_d2hb  sT    4:4Fuzz((0KMLL##))$..*C*CRV)W \\s   9A11
A?)r  r  r
  r  r	  r   r  )r1   r2   r3   r4   r   r   r6   rE   rl   r;   r=   r   rf   r   rO   rb   r  r  r>   r0   r/   r,   r  r  2  s     ',#R"#R !#R 	#R
 [[#R #R #R ?#R  $#R 
#RJ$X%***;*; X XX5::+<+<t+C X Xr/   r  c                      \ rS rSrSr S&S\S\S\R                  S\R                  S\
S\S	\S
\SS4S jjrS\\R                  \\\R                  4   4   4S jrS\\   S\S\S\
S\
SS4S jrS\R                  4S jrS&S\S\\\4   4S jjrS\\R                  \R                  \R                  4   4S jrS\R                  S\R                  S\R                  SS4S jr\S\R                  4S j5       rS\R:                  R<                  S-  4S jrS\R:                  R<                  SS4S  jr \S\4S! j5       r!S'S" jr"S\\\   \\
   \\#   S-  4   4S# jr$S'S$ jr%S%r&g)(ContinuousBatchingAsyncIOsih  uh  A class to handle the inputs and outputs for the asynchronous API. It uses two IO pairs to avoid race conditions
between the two batches, which means twice as more VRAM is used for static input tensors and CUDA graph. If your GPU
is large enough or you want to generate long sequences, this is a good trade-off to make.

Asynchronous batching works by creating two pairs of host - device inputs and ouputs:

                                inputs
                  ┌──────────┐ ────────► ┌────────────┐
IO pair object:   │ Host IOs │           │ Device IOs │       (for a CUDA sytem, Host = CPU and Device = GPU)
                  └──────────┘ ◄──────── └────────────┘
                                outputs

Each pair is separate from the other. This means that each pairs has its own CUDA graphs set, because CUDA graphs
need to have static adresses for input tensors. To have a unique set of CUDA graph, we would need to copy the input
tensors to a third device-side buffer. This could limit the memory cost of CUDA graphs but would slow down the
forward pass.
But the CUDA streams orchestrating the transfer from host to device (H2D) and device to host (D2H) are the same for
both pairs. Same for the compute stream.
The order of steps in async batching looks like this (for 3 batches of compute):

     │ ┌────┬────┐                  ┌────┬────┐     ┌────┬────┐       ┌────┐          ┌────┐
CPU  │ │PR 0│PR 1│                  │UP 0│PR 2│     │UP 1│PR 3│       │UP 2│          │UP 3│
     │ └────┼───┬┴──┐               └────┴────┼───┐ └────┴────┼───┐   └────┘          └────┘
H2D  │      │0->│1->│               ¦         │2->│ ¦         │3->│   ¦               ¦
     │      └───┼───┴───────────┬─────────────┴─┬─┼───────────┴───┼───────────────┐   ¦
GPU  │          │   COMPUTE 0   │   COMPUTE 1   │█│   COMPUTE 2   │   COMPUTE 3   │   ¦
     │          └───────────────┼───┬───────────┼─┴─┬─────────────┼───┬───────────┼───┤
D2H  │                          │0<-│           │1<-│             │2<-│           │3<-│
     │                          └───┘           └───┘             └───┘           └───┘

with: - CPU: actions happening on the CPU (host-side)
      - GPU: actions happening on the GPU (device-side)
      - H2D: host to device transfer
      - D2H: device to host transfer
and:
      - PR N: preparation of batch N
      - ->N: host to device transfer of batch N
      - COMPUTE N: compute step for batch N
      - <-N: device to host transfer of batch N
      - UP N: update of batch N

You can see that the GPU is almost always busy, except where the █ is.
Proper ordering of steps is ensured through the use of CUDA events and streams.
r#   rD   rE   rF   rG   rH   rI   rJ   r'   Nc	                    [         R                  R                  5       (       d+  [        S[         R                  R                  5       < 35      eSU l        [        S5       V	s/ s H  n	[        UUUUUUUU5      PM     sn	U l        [         R                  R                  US9U l	        [         R                  R                  US9U l
        [         R                  R                  US9U l        S U R                  S   R                  l        S U R                  S   R                  l        S U R                  S   R                  l        S U R                  S   R                  l        UR                  U l        g s  sn	f )Nz>Async batching requires CUDA, but torch.cuda.is_available() = r   ro   rP   r   )r6   rO   r  RuntimeErrorcurrent_pairrU   r  io_pairsrb   
h2d_stream
d2h_streamrc   r	  r
  rv   rd   s
             r,   rf   #ContinuousBatchingAsyncIOs.__init__  sC    zz&&((!`EJJD[D[D]Cabcc 1X
  %	 
  **++6+:**++6+:#jj//v/>26a  /48a""126a  /48a""1 % 6 6/
s   #E7c                 d    U R                   U R                     R                  R                  5       $ r   )r  r  r	  r   r*   s    r,   r   1ContinuousBatchingAsyncIOs.get_cumulative_seqlens  s&    }}T../77NNPPr/   rZ   r   r   rR   rS   c                     U R                   U R                     nUR                  R                  XX4U5        UR                  R                  R                  U R                  5       5        g r   )r  r  r	  r   r   r   infer_carry_over_ids)r+   rZ   r   r   rR   rS   io_pairs          r,   r   0ContinuousBatchingAsyncIOs.prepare_batch_tensors  sV     -- 1 12--2FVa	
 	&&,,T-F-F-HIr/   c                    U R                   U R                     R                  R                  nU R                   SU R                  -
     R                  R                  n[	        U R
                  5       Vs/ s H  nSPM     nn[        UR                  5       5       H  u  pVUR                  U5      nUc  M  XTU'   M!     [        R                  " U[        R                  S9$ s  snf )a  Infers the ids of the tokens to carry over from batch N to batch N+1. In asynchronous batching mode, we can
schedule a request for batch N+1 without knowing the token predicted for that request in batch N. For that
reason, we might need to carry over tokens just predicted in batch N before launching the forwar pass of batch
N+1. This method computes the ids of the tokens to carry over.r   r   r   )r  r  r	  r[   rU   rv   r   r   getr6   r   r|   )r+   !next_req_id_to_new_token_position!prev_req_id_to_new_token_positionre   r   r   req_idnew_token_positions           r,   r&  /ContinuousBatchingAsyncIOs.infer_carry_over_ids  s    
 -1MM$:K:K,L,T,T,q,q),0MM!d>O>O:O,P,X,X,u,u)&+D,A,A&BC&B"&BC ##D#I#I#KLIA!B!F!Fv!N!-5612 M ||N%++>> Ds   5C"r   c                 0   U R                   U R                     nUR                  U R                  5        U R                  R	                  UR
                  5        U R                  R                  UR
                  5        UR                  R                  US9$ )N)r   )
r  r  r  r   record_eventr  rc   
wait_eventr
  r   )r+   r   r'  s      r,   r   +ContinuousBatchingAsyncIOs.get_model_kwargs  st    -- 1 12##DOO4$$W%5%56&&w'7'78  11k1JJr/   c                     U R                   U R                     nU R                   SU R                  -
     nUR                  R                  UR                  R                  UR                  R                  4$ )a  Returns the tensors used inside the generation step that are not inputs to the model forward pass. Those
tensors could be retrieved using this object, but it would trigger a recompile if using torch.compile. They are:
- output_ids: the output ids of the current batch
- prev_output_ids: the output ids of the previous batch, required to carry over outputs tokens of the previous
    batch to the input tokens of the next batch.
- carry_over_ids: a mask representing how to carry over tokens.
r   )r  r  r
  r   r   )r+   r  previous_pairs      r,   r   (ContinuousBatchingAsyncIOs.get_cb_kwargs  sh     }}T%6%67a$*;*;&;<""11##..""--
 	
r/   r   r   r   c                     USU4   nUS:g  R                  5       nUSUR                  S5       nUSUR                  S5       nXE-  US   SU-
  -  -   US'   g)zAs explained in the infer_carry_over_ids method, we might need to carry over tokens just predicted in batch N
before launching the forwar pass of batch N+1. This method performs the carry over, and is recorded in CUDA
graphs if they are enabled.r   r   Nr   )r;   ru   )r+   r   r   r   carried_over_idscarried_over_masks         r,   r   ,ContinuousBatchingAsyncIOs.carry_over_tokens  sv     +1n+<=+r1668+,?innQ.?@-.A	q0AB';ilaRcNc>dd	!r/   c                 \    U R                   U R                     R                  R                  $ r   )r  r  r
  r   r*   s    r,   r   %ContinuousBatchingAsyncIOs.output_ids  s%     }}T../99DDDr/   c                 d    U R                   U R                     R                  R                  5       $ r   )r  r  r
  r   r*   s    r,   r   $ContinuousBatchingAsyncIOs.get_graph  s&    }}T../99CCEEr/   r   c                 h    U R                   U R                     R                  R                  U5        g r   )r  r  r
  r   )r+   r   s     r,   r   $ContinuousBatchingAsyncIOs.set_graph  s%    d''(22<<UCr/   c                 \    U R                   U R                     R                  R                  $ r   )r  r  r	  rY   r*   s    r,   rY   *ContinuousBatchingAsyncIOs.use_block_table  s#    }}T../77GGGr/   c                 r   U R                   U R                     nU R                  R                  UR                  5        U R
                  R                  UR                  5        UR                  U R
                  5        U R
                  R                  UR                  5        SU R                  -
  U l        g )Nr   )	r  r  rc   r1  r  r!  r2  r  r  r+   r'  s     r,   r   2ContinuousBatchingAsyncIOs.retrieve_device_outputs  s    -- 1 12(()=)=>""7#7#78$$T__5$$W%5%56 1 11r/   c                     U R                   U R                     nUR                  R                  5         UR                  R                  5       $ r   )r  r  r  r   r	  r   rD  s     r,   r   /ContinuousBatchingAsyncIOs.prepare_batch_update)  s;    -- 1 12$$&3355r/   c                     SU l         U R                   H  nUR                  5         M     U R                  R	                  5         U R
                  R	                  5         U R                  R	                  5         g)zWReset all state for a new generation session. Used in persistent mode between sessions.r   N)r  r  r   r   r   r!  rc   rD  s     r,   r    ContinuousBatchingAsyncIOs.reset.  sS    }}GMMO %##%##%'')r/   )rc   r  r!  r   r  rv   r   r  )'r1   r2   r3   r4   r5   r   r   r6   rE   rl   r;   r=   r   rf   r  r7   r9   r:   r   r<   r   r   r&  r   r   r   r   propertyr   rO   r  r   r   rY   r   r  r   r   r>   r0   r/   r,   r  r  h  s*   +l ','7"'7 !'7 	'7
 [['7 '7 '7 ?'7  $'7 
'7TQellDellAR<S.S(T QJ 23J AJ #	J
 J J 
J?ell ?(KD KT#s(^ K
uU\\5<<%MN 
 ee7<||eV[VbVbe	e  EELL E EF5:://$6 FDuzz33 D D H H H	26eD1C,Dd3iQUV[Q\_cQc,c&d 6
*r/   r  ) 
contextlibr   dataclassesr   	functoolsr   typingr   r6    transformers.configuration_utilsr   utilsr	   utils.metricsr
   r#   r   cb_logits_processorsr   requestsr   r   r   r   r   r   r   r   r   r@   r  r  r0   r/   r,   <module>rT     sq    # !    = * # & G > > j j 6
 6
 6
rV3 V3r3X 3XlM* M*r/   