
    Z j(                        S SK Jr  S SKrS SKJr  SSKJr  SSKJ	r	J
r
  SSKJr  SSKJr  SSKJrJr  SS	KJr  SS
KJr  SSKJrJrJrJr  SSKJr  SSKJr  SSKJ r   SSK!J"r"  SSK#J$r$J%r%J&r&J'r'J(r(J)r)J*r*J+r+  SSK,J-r-J.r.  SSK/J0r0  \Rb                  " \25      r3\" SS9\ " S S\5      5       5       r4 " S S\)5      r5 " S S\*5      r6 " S S\'5      r7 " S S \$5      r8 " S! S"\%5      r9\" S#S9\ " S$ S%\5      5       5       r:\ " S& S'\(\:5      5       r; " S( S)\Rx                  5      r=\" S*S9 " S+ S,\&\5      5       r> " S- S.\Rx                  5      r?\ " S/ S0\(5      5       r@\" S1S9 " S2 S3\:\05      5       rA/ S4QrBg)5    )	dataclassN   )initialization)CacheDynamicCache)GenerationMixin)create_causal_mask)BaseModelOutputWithPastCausalLMOutputWithPast)PreTrainedModel)Unpack)ModelOutputauto_docstringcan_return_tuplelogging)merge_with_config_defaults)is_torchdynamo_compiling)capture_outputs   )	AutoModel)LlamaAttentionLlamaDecoderLayerLlamaForCausalLMLlamaMLP
LlamaModelLlamaRMSNormLlamaRotaryEmbeddingTransformersKwargs   )	CsmConfigCsmDepthDecoderConfig)CsmGenerationMixinz:
    Base class for the model autoregressive outputs.
    )custom_introc                      \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Sr
\S-  \S'   Sr\\R                  S4   S-  \S'   Sr\\R                  S4   S-  \S	'   Sr\R                  S-  \S
'   Sr\R                  S-  \S'   Sr\S-  \S'   Sr\\R                  S4   S-  \S'   Sr\\R                  S4   S-  \S'   Sr\R                  S-  \S'   Srg)CsmOutputWithPast1   a=	  
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
    Language modeling loss (for next-token prediction).
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
    Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
    It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

    Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
    `past_key_values` input) to speed up sequential decoding.
depth_decoder_loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
    Language modeling loss (for next-token prediction) of the depth decoder model.
depth_decoder_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
    Prediction scores of the depth decoder (scores for each vocabulary token before SoftMax).
depth_decoder_past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
    It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).
depth_decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
    Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
    one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

    Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
depth_decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
    Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
    sequence_length)`.
backbone_loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
    Language modeling loss (for next-token prediction) of the backbone model.
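
Note: when `labels` are provided, `loss` is the sum of `backbone_loss` and `depth_decoder_loss`.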
Nlosslogitspast_key_values.hidden_states
attentionsdepth_decoder_lossdepth_decoder_logitsdepth_decoder_past_key_valuesdepth_decoder_hidden_statesdepth_decoder_attentionsbackbone_loss )__name__
__module____qualname____firstlineno____doc__r'   torchFloatTensor__annotations__r(   r)   r   r*   tupler+   r,   r-   r.   r/   r0   r1   __static_attributes__r2       t/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/csm/modular_csm.pyr%   r%   1   s   8 &*D%

d
")'+FE$+$(OUT\(:>M5**C/047>7;Je'',-4;37))D0759%++d2926!54<6HLu'8'8#'=!>!ELEIeE$5$5s$:;dBI.2M5$$t+2r=   r%   c                       \ rS rSrSrg)
CsmRMSNormb   r2   Nr3   r4   r5   r6   r<   r2   r=   r>   r@   r@   b       r=   r@   c                       \ rS rSrSrg)CsmRotaryEmbeddingf   r2   NrB   r2   r=   r>   rE   rE   f   rC   r=   rE   c                       \ rS rSrSrg)CsmMLPj   r2   NrB   r2   r=   r>   rH   rH   j   rC   r=   rH   c                       \ rS rSrSrg)CsmAttentionn   r2   NrB   r2   r=   r>   rK   rK   n   rC   r=   rK   c                       \ rS rSrSrg)CsmDecoderLayerr   r2   NrB   r2   r=   r>   rN   rN   r   rC   r=   rN   z[
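
# The `pass`-bodied classes above follow the modular-transformers convention: the
# converter that generates `modeling_csm.py` from this file copies the corresponding
# Llama implementations and only renames the class prefix. A minimal sketch of what
# that means in practice (illustrative sizes, not CSM config values):
#
#     norm = CsmRMSNorm(2048)               # same math as LlamaRMSNorm
#     y = norm(torch.randn(1, 8, 2048))     # -> (1, 8, 2048), RMS-normalized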


@auto_docstring(
    custom_intro="""
    The bare Csm Model outputting raw hidden-states without any specific head on top.
    """
)
class CsmPreTrainedModel(PreTrainedModel):
    config: CsmConfig
    base_model_prefix = "model"
    input_modalities = ("audio", "text")
    supports_gradient_checkpointing = True
    _no_split_modules = ["CsmDecoderLayer"]
    _skip_keys_device_placement = ["past_key_values"]
    _supports_flash_attn = True
    _supports_sdpa = True
    _can_compile_fullgraph = True
    _supports_attention_backend = True
    _can_record_outputs = {
        "hidden_states": CsmDecoderLayer,
        "attentions": CsmAttention,
    }

    @torch.no_grad()
    def _init_weights(self, module):
        super()._init_weights(module)
        if isinstance(module, CsmCodebooksHead):
            num_codebooks = module.num_codebooks
            for i in range(num_codebooks - 1):
                init.normal_(module.weight[i], mean=0.0, std=self.config.initializer_range)
        elif isinstance(module, CsmBackboneModelEmbeddings):
            init.copy_(
                module.audio_tokens_offsets,
                torch.arange(self.config.num_codebooks) * self.config.vocab_size,
            )


class CsmDepthDecoderModel(LlamaModel):
    config: CsmDepthDecoderConfig

    def __init__(self, config):
        super().__init__(config)
        self.embed_tokens = nn.Embedding(config.num_codebooks * config.vocab_size, config.backbone_hidden_size)
        self.inputs_embeds_projector = nn.Linear(config.backbone_hidden_size, config.hidden_size, bias=False)

    @merge_with_config_defaults
    @capture_outputs
    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        backbone_last_hidden_state: torch.FloatTensor | None = None,
        attention_mask: torch.Tensor | None = None,
        position_ids: torch.LongTensor | None = None,
        past_key_values: Cache | None = None,
        inputs_embeds: torch.FloatTensor | None = None,
        use_cache: bool | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple | BaseModelOutputWithPast:
        r"""
        backbone_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, backbone_hidden_size)`, *optional*):
            The last hidden state of the backbone model. Such input is required when the first codebook token (the one
            generated by the backbone model) is provided in the `input_ids` argument.
        """
        if position_ids is not None and not is_torchdynamo_compiling():
            logger.warning_once(
                "Custom `position_ids` were provided but will be ignored. CSM depth decoder automatically determines "
                "position_ids and as it requires them to be identical across the batch, the provided position_ids "
                "will be ignored."
            )

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds.")

        if use_cache and past_key_values is None:
            past_key_values = DynamicCache(config=self.config)

        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
        inputs_seq_length = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
        device = input_ids.device if input_ids is not None else inputs_embeds.device
        cache_position = torch.arange(past_seen_tokens, past_seen_tokens + inputs_seq_length, device=device)

        if inputs_embeds is None:
            codebook_idxs = torch.clamp(cache_position - 1, min=0)
            offset = codebook_idxs * self.config.vocab_size
            inputs_embeds = self.embed_tokens(input_ids + offset)

            input_ids_are_first_codebook = cache_position[0] == 0
            if backbone_last_hidden_state is not None:
                inputs_embeds[:, 0] = backbone_last_hidden_state
            elif not is_torchdynamo_compiling() and input_ids_are_first_codebook:
                logger.warning(
                    "When the first codebook token is provided, `backbone_last_hidden_state` should also be provided "
                    "for correct inference."
                )

        inputs_embeds = self.inputs_embeds_projector(inputs_embeds)

        causal_mask = create_causal_mask(
            config=self.config,
            input_embeds=inputs_embeds,
            attention_mask=attention_mask,
            cache_position=cache_position,
            past_key_values=past_key_values,
        )

        hidden_states = inputs_embeds

        position_ids = cache_position.unsqueeze(0)
        position_embeddings = self.rotary_emb(hidden_states, position_ids)

        for decoder_layer in self.layers[: self.config.num_hidden_layers]:
            hidden_states = decoder_layer(
                hidden_states,
                attention_mask=causal_mask,
                position_ids=position_ids,
                past_key_values=past_key_values,
                cache_position=cache_position,
                position_embeddings=position_embeddings,
                **kwargs,
            )

        hidden_states = self.norm(hidden_states)

        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values if use_cache else None,
        )


class CsmCodebooksHead(nn.Module):
    def __init__(self, hidden_size, num_codebooks, vocab_size):
        super().__init__()
        self.num_codebooks = num_codebooks
        self.weight = nn.Parameter(torch.empty(self.num_codebooks - 1, hidden_size, vocab_size))

    def forward(self, hidden_states, cache_position=None):
        codebook_indices = cache_position - 1
        codebook_weight = self.weight[codebook_indices]

        hidden_states = [
            nn.functional.linear(hidden_states[:, codebook_idx, :], codebook_weight[codebook_idx].T)
            for codebook_idx in range(hidden_states.shape[1])
        ]
        hidden_states = torch.stack(hidden_states, dim=1)

        return hidden_states


@auto_docstring(
    custom_intro="""
    The CsmDepthDecoder Model transformer, with a [`CsmCodebooksHead`] on top,
    which can be seen as a position-specific language modeling head, allowing the use of a different linear layer for
    each codebook (e.g. position 0 is the first codebook and uses the first codebook head, etc.)
    """
)
class CsmDepthDecoderForCausalLM(LlamaForCausalLM, GenerationMixin):
    _tied_weights_keys = None
    _tp_plan = None
    _pp_plan = None

    def __init__(self, config):
        super().__init__(config)
        del self.lm_head
        self.codebooks_head = CsmCodebooksHead(config.hidden_size, config.num_codebooks, config.vocab_size)
        self.model = CsmDepthDecoderModel(config)

    def prepare_inputs_for_generation(
        self,
        input_ids: torch.LongTensor,
        next_sequence_length: int | None = None,
        past_key_values: Cache | None = None,
        attention_mask: torch.Tensor | None = None,
        inputs_embeds: torch.FloatTensor | None = None,
        is_first_iteration: bool | None = False,
        **kwargs,
    ):
        model_inputs = super().prepare_inputs_for_generation(
            input_ids, next_sequence_length, past_key_values, attention_mask, inputs_embeds, **kwargs
        )

        if not is_first_iteration:
            model_inputs.pop("backbone_last_hidden_state", None)
            model_inputs.pop("position_ids", None)

        return model_inputs

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        backbone_last_hidden_state: torch.FloatTensor | None = None,
        attention_mask: torch.Tensor | None = None,
        position_ids: torch.LongTensor | None = None,
        past_key_values: Cache | None = None,
        inputs_embeds: torch.FloatTensor | None = None,
        labels: torch.LongTensor | None = None,
        use_cache: bool | None = None,
        logits_to_keep: int | torch.Tensor = 0,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple | CausalLMOutputWithPast:
        r"""
        backbone_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, backbone_hidden_size)`, *optional*):
            The last hidden state of the backbone model. Such input is required when the first codebook token (the one
            generated by the backbone model) is provided in the `input_ids` argument.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        """
        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
        seq_length = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
        device = input_ids.device if input_ids is not None else inputs_embeds.device
        cache_position = torch.arange(seq_length, device=device) + past_seen_tokens

        outputs = self.model(
            input_ids=input_ids,
            backbone_last_hidden_state=backbone_last_hidden_state,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            **kwargs,
        )

        hidden_states = outputs[0]

        if isinstance(logits_to_keep, int):
            if logits_to_keep == 0:
                # skip idx 0 logits, as they correspond to the token generated by the backbone model
                slice_indices = slice(1, None)
            else:
                slice_indices = slice(-logits_to_keep, None)
        else:
            slice_indices = logits_to_keep

        logits = self.codebooks_head(hidden_states[:, slice_indices, :], cache_position[slice_indices])
        logits = logits.contiguous()

        loss = None
        if labels is not None:
            shift_labels = labels[..., 1:].contiguous()
            loss = self.loss_function(
                logits=logits, labels=None, vocab_size=self.config.vocab_size, shift_labels=shift_labels, **kwargs
            )

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


class CsmBackboneModelEmbeddings(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.embed_audio_tokens = nn.Embedding(config.num_codebooks * config.vocab_size, config.hidden_size)
        self.register_buffer(
            "audio_tokens_offsets", torch.arange(config.num_codebooks) * config.vocab_size, persistent=False
        )

    def forward(self, input_ids):
        input_embeds = self.embed_audio_tokens(input_ids + self.audio_tokens_offsets)
        input_embeds = input_embeds.sum(dim=2)
        return input_embeds


class CsmBackboneModel(LlamaModel):
    def __init__(self, config):
        super().__init__(config)
        self.embed_tokens = CsmBackboneModelEmbeddings(config)

    @merge_with_config_defaults
    @capture_outputs
    @auto_docstring
    def forward(self, **super_kwargs):
        r"""
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length, num_codebooks) or (batch_size, sequence_length)`):
            1. (batch_size, sequence_length): corresponds to the input sequence prepared with the processor from the
            text prompt. Such input requires `input_values` to be provided so that audio can be encoded in codebook
            tokens and then merged with the text tokens.

            2. (batch_size, sequence_length, num_codebooks): codebook tokens generated during the autoregressive
            decoding. Such input is not meant to be used by end users.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        """
        return super().forward(**super_kwargs)


@auto_docstring(
    custom_intro="""
    The Csm model consists of two llama-like auto-regressive transformer models: a backbone model that predicts the
    first codebook token and a depth decoder that predicts the other codebook tokens.
    """
)
class CsmForConditionalGeneration(CsmPreTrainedModel, CsmGenerationMixin):
    _tied_weights_keys = {
        "backbone_model.embed_tokens.embed_audio_tokens.weight": "depth_decoder.model.embed_tokens.weight"
    }

    def __init__(self, config):
        super().__init__(config)
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
        self.embed_text_tokens = nn.Embedding(config.text_vocab_size, config.hidden_size)
        self.backbone_model = CsmBackboneModel._from_config(config)
        self.depth_decoder = CsmDepthDecoderForCausalLM._from_config(config.depth_decoder_config)
        self.codec_model = AutoModel.from_config(config.codec_config)
        self.post_init()

    def get_input_embeddings(self):
        return self.backbone_model.embed_tokens

    def set_input_embeddings(self, value):
        self.backbone_model.embed_tokens = value

    @classmethod
    def from_pretrained(cls, *args, **kwargs):
        output_loading_info = kwargs.get("output_loading_info", False)
        if output_loading_info:
            model, loading_info = super().from_pretrained(*args, **kwargs)
        else:
            model = super().from_pretrained(*args, **kwargs)

        # the `depth_decoder_*` attributes of the model generation config are forwarded
        # to the depth decoder's own generation config
        prefix = "depth_decoder_"
        prefix_len = len(prefix)
        depth_decoder_attrs = {
            attr[prefix_len:]: value
            for attr, value in vars(model.generation_config).items()
            if attr.startswith(prefix)
        }
        vars(model.depth_decoder.generation_config).update({"_from_model_config": False, **depth_decoder_attrs})
        for attr in depth_decoder_attrs:
            delattr(model.generation_config, prefix + attr)

        if output_loading_info:
            return model, loading_info
        return model

    def save_pretrained(self, *args, **kwargs):
        # inversely to `from_pretrained`, the depth decoder's generation config is saved
        # back under `depth_decoder_*` attributes of the model generation config
        prefix = "depth_decoder_"
        depth_decoder_attrs = self.depth_decoder.generation_config.to_diff_dict()
        depth_decoder_attrs.pop("transformers_version", None)
        for attr, value in depth_decoder_attrs.items():
            setattr(self.generation_config, prefix + attr, value)

        super().save_pretrained(*args, **kwargs)

    def _merge_input_ids_with_input_values(
        self,
        input_ids: torch.Tensor | None = None,
        input_values: torch.Tensor | None = None,
        input_values_cutoffs: torch.Tensor | None = None,
        labels: torch.Tensor | None = None,
    ) -> dict[str, torch.Tensor | None]:
        """
        Merges the input_ids and input_values to produce a single inputs_embeds tensor:
        1 - Infers the codec model on the input_values to retrieve codebook tokens.
        2 - Embeds codebook tokens and places them at the correct positions in the inputs_embeds tensor.
        3 - If labels are provided, expands them to match codebook dimensions and positions the target codebook
        tokens in the inputs_embeds tensor.

        Args:
            input_ids (`torch.Tensor` of shape `(batch_size, sequence_length)`):
                The input ids to embed.
            input_values (`torch.Tensor` of shape `(batch_size, channels, audio_sequence_length)`):
                The audio input values to embed.
            input_values_cutoffs (`torch.Tensor` of shape `(batch_size, max_num_audio)`):
                The cutoffs of the audio input values relative to its batch index, padded with -1 when no audio.
        """
        inputs_embeds = self.embed_text_tokens(input_ids)

        if input_values is not None:
            # infer input_values_mask
            input_values_cutoffs = nn.functional.pad(input_values_cutoffs, (1, 0))
            audio_lengths = input_values_cutoffs[input_values_cutoffs >= 0].diff()
            audio_lengths = audio_lengths[audio_lengths > 0]
            input_values_mask = torch.arange(input_values_cutoffs.max(), device=input_values.device).expand(
                len(audio_lengths), -1
            )
            input_values_mask = input_values_mask < audio_lengths.unsqueeze(1)

            with torch.no_grad():
                audio_tokens_list = []
                for batch_input_values, batch_input_values_cutoffs in zip(input_values, input_values_cutoffs):
                    batch_input_values_cutoffs = batch_input_values_cutoffs[batch_input_values_cutoffs >= 0]
                    for i in range(batch_input_values_cutoffs.shape[0] - 1):
                        start_idx = batch_input_values_cutoffs[i]
                        end_idx = batch_input_values_cutoffs[i + 1]
                        audio_batch = batch_input_values[..., start_idx:end_idx]
                        codec_outputs = self.codec_model.encode(audio_batch.unsqueeze(0))
                        codebook_ids = codec_outputs.audio_codes.transpose(1, -1)
                        audio_tokens_list.append(codebook_ids[0])

                max_audio_frames = max(el.shape[0] for el in audio_tokens_list)
                batched_audio_token_ids = torch.stack(
                    [nn.functional.pad(el, (0, 0, 0, max_audio_frames - el.shape[0])) for el in audio_tokens_list]
                )
                audio_codes_mask = self.codec_model.get_audio_codes_mask(input_values_mask)

            audio_token_id = self.config.audio_token_id
            audio_token_mask = input_ids == audio_token_id

            audio_embeds = self.backbone_model.embed_tokens(batched_audio_token_ids)
            inputs_embeds[audio_token_mask] = audio_embeds[audio_codes_mask]

            # same for the audio eos token
            audio_eos_frame_ids = (
                torch.ones((1, 1, self.config.num_codebooks), device=input_ids.device, dtype=torch.long)
                * self.config.codebook_eos_token_id
            )
            audio_eos_embeds = self.backbone_model.embed_tokens(audio_eos_frame_ids).squeeze(1)

            audio_eos_token_mask = input_ids == self.config.audio_eos_token_id
            inputs_embeds[audio_eos_token_mask] = audio_eos_embeds.repeat(audio_eos_token_mask.sum(), 1)

            # if labels are provided, expand them to the codebook dimension and place the audio frame labels
            if labels is not None:
                labels_expanded = labels.unsqueeze(-1).repeat(1, 1, self.config.num_codebooks)
                labels_expanded[audio_token_mask] = batched_audio_token_ids[audio_codes_mask]
                labels_expanded[audio_eos_token_mask] = audio_eos_frame_ids
                # frames labeled -101 are used by the backbone model only: mask them out for the depth decoder
                depth_decoder_ignore_frames_idxs = (labels == -101).nonzero(as_tuple=True)
                labels_expanded[depth_decoder_ignore_frames_idxs[0], depth_decoder_ignore_frames_idxs[1], 1:] = -100
                labels = labels_expanded

        return {"inputs_embeds": inputs_embeds, "labels": labels}

    def prepare_inputs_for_generation(
        self,
        input_ids: torch.LongTensor,
        past_key_values: Cache | None = None,
        attention_mask: torch.Tensor | None = None,
        inputs_embeds: torch.FloatTensor | None = None,
        **kwargs,
    ):
        model_inputs = super().prepare_inputs_for_generation(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            **kwargs,
        )

        if input_ids is not None and input_ids.ndim == 2 and model_inputs.get("inputs_embeds") is None:
            merged_inputs = self._merge_input_ids_with_input_values(
                input_ids=input_ids,
                input_values=kwargs.get("input_values"),
                input_values_cutoffs=kwargs.get("input_values_cutoffs"),
                labels=kwargs.get("labels"),
            )
            model_inputs.update(
                {"inputs_embeds": merged_inputs["inputs_embeds"], "labels": merged_inputs["labels"], "input_ids": None}
            )

        return model_inputs

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        input_values: torch.Tensor | None = None,
        input_values_cutoffs: torch.Tensor | None = None,
        attention_mask: torch.Tensor | None = None,
        position_ids: torch.LongTensor | None = None,
        past_key_values: Cache | None = None,
        inputs_embeds: torch.FloatTensor | None = None,
        labels: torch.LongTensor | None = None,
        use_cache: bool | None = None,
        logits_to_keep: int | torch.Tensor = 0,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple | CsmOutputWithPast:
        r"""
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length, num_codebooks) or (batch_size, sequence_length)`):
            1. (batch_size, sequence_length): corresponds to the input sequence prepared with the processor from the
            text prompt. Such input requires `input_values` to be provided so that audio can be encoded in codebook
            tokens and then merged with the text tokens.

            2. (batch_size, sequence_length, num_codebooks): codebook tokens generated during the autoregressive
            decoding. Such input is not meant to be used by end users.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        input_values_cutoffs (`torch.Tensor` of shape `(batch_size, max_num_audio)`, *optional*):
            Specify the end positions of audio segments within each batch entry, relative to the concatenated audio
            input. If a batch entry has fewer segments than the maximum, it is padded with -1. For example, in a batch
            of 2 sequences where the first contains 2 audio segments of length l1, and the second contains 1 audio
            segment of length l2, the input_values_cutoffs would be: [[l1, 2 * l1], [l2, -1]].
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in
            `[config.audio_token_id, -100, -101]`. Requires targeted `input_values` to be provided as audio tokens
            will be inferred from it using the `codec_model`.
            - `config.audio_token_id` indicates an audio frame (considering sequence-length elements as frames)
            - `-100` will be ignored in the loss computation
            - `-101` indicates the audio frame will be used only for the backbone model (using the first codebook
              token as labels)

            Such labels can be prepared using `output_labels=True` when calling [`CsmProcessor`].
        logits_to_keep (`int` or `torch.Tensor`, *optional*):
            Kept for compatibility. Does not support any value other than:
            1. `0`, which is equivalent to keeping all logits, used in the training regime
            2. `1`, which is equivalent to keeping only the last logit, used in the generation regime

        Example:

        ```python
        >>> import torch
        >>> from transformers import CsmForConditionalGeneration, AutoProcessor
        >>> from datasets import load_dataset, Audio

        >>> model_id = "sesame/csm-1b"
        >>> torch_device = "cuda" if torch.cuda.is_available() else "cpu"

        >>> processor = AutoProcessor.from_pretrained(model_id)

        >>> ds = load_dataset("hf-internal-testing/dailytalk-dummy", split="train")
        >>> # ensure the audio is 24kHz
        >>> ds = ds.cast_column("audio", Audio(sampling_rate=24000))

        >>> conversation = []
        >>> # prepare a conversation with text and corresponding audio
        >>> for text, audio, speaker_id in zip(ds[:4]["text"], ds[:4]["audio"], ds[:4]["speaker_id"]):
        ...     conversation.append(
        ...         {
        ...             "role": f"{speaker_id}",
        ...             "content": [{"type": "text", "text": text}, {"type": "audio", "path": audio["array"]}],
        ...         }
        ...     )

        >>> inputs = processor.apply_chat_template(
        ...     conversation,
        ...     tokenize=True,
        ...     return_dict=True,
        ...     output_labels=True,
        ... ).to(torch_device)

        >>> model = CsmForConditionalGeneration.from_pretrained(model_id, device_map=torch_device)
        >>> output = model(**inputs)
        >>> output.loss.backward()
        ```"""
        if input_ids is not None and input_ids.ndim == 2:
            merged_inputs = self._merge_input_ids_with_input_values(
                input_ids, input_values, input_values_cutoffs, labels
            )
            inputs_embeds = merged_inputs["inputs_embeds"]
            labels = merged_inputs["labels"]
            input_ids = None

        backbone_outputs = self.backbone_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            **kwargs,
        )

        backbone_hidden_states = backbone_outputs[0]
        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        backbone_logits = self.lm_head(backbone_hidden_states[:, slice_indices, :])

        loss = None
        backbone_loss = None
        depth_decoder_loss = None
        depth_decoder_outputs = None
        if labels is not None:
            # select the frame-level labels (first codebook) for the backbone model
            backbone_labels = labels[:, :, 0]
            backbone_loss = self.loss_function(
                logits=backbone_logits, labels=backbone_labels, vocab_size=self.config.vocab_size, **kwargs
            )

            # for the depth decoder, train only on frames whose labels are not uniformly -100 past the first codebook
            train_mask = ~(labels[:, :, 1:] == -100).all(dim=-1)
            depth_decoder_input_ids = labels[train_mask][..., : self.config.num_codebooks - 1]
            # add a placeholder in position 0 that will be replaced by the backbone last hidden state
            depth_decoder_input_ids = nn.functional.pad(depth_decoder_input_ids, (1, 0), value=0)

            train_idxs = train_mask.nonzero(as_tuple=True)
            backbone_last_hidden_states = backbone_hidden_states[train_idxs[0], train_idxs[1] - 1, :]
            depth_decoder_labels = labels[train_mask]

            depth_decoder_outputs = self.depth_decoder(
                input_ids=depth_decoder_input_ids,
                backbone_last_hidden_state=backbone_last_hidden_states,
                use_cache=use_cache,
                return_dict=True,
                labels=depth_decoder_labels,
                **kwargs,
            )
            depth_decoder_loss = depth_decoder_outputs.loss
            loss = backbone_loss + depth_decoder_loss

        return CsmOutputWithPast(
            loss=loss,
            backbone_loss=backbone_loss,
            depth_decoder_loss=depth_decoder_loss,
            logits=backbone_logits,
            past_key_values=backbone_outputs.past_key_values,
            hidden_states=backbone_outputs.hidden_states,
            attentions=backbone_outputs.attentions,
            depth_decoder_logits=depth_decoder_outputs.logits if depth_decoder_outputs is not None else None,
            depth_decoder_past_key_values=depth_decoder_outputs.past_key_values
            if depth_decoder_outputs is not None
            else None,
            depth_decoder_hidden_states=depth_decoder_outputs.hidden_states
            if depth_decoder_outputs is not None
            else None,
            depth_decoder_attentions=depth_decoder_outputs.attentions if depth_decoder_outputs is not None else None,
        )


__all__ = [
    "CsmPreTrainedModel",
    "CsmForConditionalGeneration",
    "CsmDepthDecoderModel",
    "CsmBackboneModel",
    "CsmDepthDecoderForCausalLM",
]
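
# A minimal end-to-end usage sketch (mirrors the docstring example above; the text
# prompt is prefixed with a `[speaker_id]` tag as expected by `CsmProcessor`):
#
#     from transformers import AutoProcessor, CsmForConditionalGeneration
#
#     processor = AutoProcessor.from_pretrained("sesame/csm-1b")
#     model = CsmForConditionalGeneration.from_pretrained("sesame/csm-1b")
#     inputs = processor("[0]Hello from CSM.", add_special_tokens=True)
#     audio = model.generate(**inputs, output_audio=True)
#     processor.save_audio(audio, "output.wav")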