
    Z jm                        S SK Jr  S SKJr  S SKJr  S SKrS SKJr  SSK	J
r  SSKJr  SSKJrJr  SS	KJr  SS
KJrJrJr  SSKJr  SSKJr  SSKJrJr  SSKJrJ r   SSK!J"r"J#r#  SSK$J%r%  SSK&J'r'J(r(J)r)J*r*J+r+  SSK,J-r-J.r.  SSK/J0r0  SSK1J2r2  SSK3J4r4  SSK5J6r6J7r7  SSK8J9r9  \+Rt                  " \;5      r<\)" SS9\ " S S\'5      5       5       r=\" S5       " S S \R|                  5      5       r? " S! S"\R|                  5      r@ " S# S$\R|                  5      rAS% rB\" S&5      SKS' j5       rCS(\R                  S)\ES*\R                  4S+ jrF SLS,\R|                  S-\R                  S.\R                  S/\R                  S0\R                  S-  S1\GS2\GS3\%\(   4S4 jjrH\" \C5       " S5 S6\R|                  5      5       rI " S7 S8\5      rJ\)" S9S9\) " S: S;\#5      5       5       rK\) " S< S=\K5      5       rL " S> S?\R|                  5      rM\)" S@S9 " SA SB\K\5      5       rN " SC SD\R|                  5      rO\) " SE SF\K5      5       rP\)" SGS9 " SH SI\K\95      5       rQ/ SJQrRg)M    )Callable)	dataclass)OptionalN   )initialization)ACT2FN)CacheDynamicCache)GenerationMixin)use_kernel_forward_from_hubuse_kernel_func_from_hubuse_kernelized_func)create_causal_mask)GradientCheckpointingLayer)BaseModelOutputWithPastCausalLMOutputWithPast)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)ModelOutputTransformersKwargsauto_docstringcan_return_tuplelogging)maybe_autocastmerge_with_config_defaults)is_torchdynamo_compiling)capture_outputs   )	AutoModel   )	CsmConfigCsmDepthDecoderConfig)CsmGenerationMixinz:
    Base class for the model autoregressive outputs.
    )custom_introc                      \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Sr
\S-  \S'   Sr\\R                  S4   S-  \S'   Sr\\R                  S4   S-  \S	'   Sr\R                  S-  \S
'   Sr\R                  S-  \S'   Sr\S-  \S'   Sr\\R                  S4   S-  \S'   Sr\\R                  S4   S-  \S'   Sr\R                  S-  \S'   Srg)CsmOutputWithPast3   a=	  
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
    Language modeling loss (for next-token prediction).
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
    Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
    It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

    Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
    `past_key_values` input) to speed up sequential decoding.
depth_decoder_loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
    Language modeling loss (for next-token prediction) of the depth decoder model.
depth_decoder_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
    Prediction scores of the depth decoder (scores for each vocabulary token before SoftMax).
depth_decoder_past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
    It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).
depth_decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
    Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
    one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

    Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
depth_decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
    Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
    sequence_length)`.
backbone_loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
    Language modeling loss (for next-token prediction) of the backbone model.
Nlosslogitspast_key_values.hidden_states
attentionsdepth_decoder_lossdepth_decoder_logitsdepth_decoder_past_key_valuesdepth_decoder_hidden_statesdepth_decoder_attentionsbackbone_loss )__name__
__module____qualname____firstlineno____doc__r+   torchFloatTensor__annotations__r,   r-   r	   r.   tupler/   r0   r1   r2   r3   r4   r5   __static_attributes__r6       u/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/csm/modeling_csm.pyr)   r)   3   s   8 &*D%

d
")'+FE$+$(OUT\(:>M5**C/047>7;Je'',-4;37))D0759%++d2926!54<6HLu'8'8#'=!>!ELEIeE$5$5s$:;dBI.2M5$$t+2rA   r)   RMSNormc                   x   ^  \ rS rSrS
S\SS4U 4S jjjrS\R                  S\R                  4S jrS r	S	r
U =r$ )
CsmRMSNormc   epsreturnNc                    > [         TU ]  5         [        R                  " [        R
                  " U5      5      U l        X l        g)z)
CsmRMSNorm is equivalent to T5LayerNorm
N)super__init__nn	Parameterr<   onesweightvariance_epsilon)selfhidden_sizerG   	__class__s      rB   rK   CsmRMSNorm.__init__e   s/     	ll5::k#:; #rA   r.   c                    UR                   nUR                  [        R                  5      nUR	                  S5      R                  SSS9nU[        R                  " X0R                  -   5      -  nU R                  UR                  U5      -  $ )Nr!   T)keepdim)	dtypetor<   float32powmeanrsqrtrP   rO   )rQ   r.   input_dtypevariances       rB   forwardCsmRMSNorm.forwardm   sw    #))%((7 $$Q',,R,>%H?T?T4T(UU{{]--k:::rA   c                 ^    [        U R                  R                  5       SU R                   3$ )Nz, eps=)r?   rO   shaperP   rQ   s    rB   
extra_reprCsmRMSNorm.extra_reprt   s*    ))*+6$2G2G1HIIrA   )rP   rO   )gư>)r7   r8   r9   r:   floatrK   r<   Tensorr`   re   r@   __classcell__rS   s   @rB   rE   rE   c   sB    $ $$ $ $;U\\ ;ell ;J JrA   rE   c                      ^  \ rS rSr% \R
                  \S'   SS\4U 4S jjjr\	   SS\S-  S\
S   S\S-  S	\S
\4   4S jj5       r\R                  " 5       \S 5       5       rSrU =r$ )CsmRotaryEmbeddingx   inv_freqNconfigc                   > [         TU ]  5         UR                  U l        UR                  U l        Xl        U R
                  R                  S   U l        U R                  nU R                  S:w  a  [        U R                     nU" U R
                  U5      u  o@l
        U R                  SUSS9  U R                  SUR                  5       SS9  g )N	rope_typedefaultrn   F
persistentoriginal_inv_freq)rJ   rK   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenro   rope_parametersrq   compute_default_rope_parametersr   attention_scalingregister_bufferclone)rQ   ro   devicerope_init_fnrn   rS   s        rB   rK   CsmRotaryEmbedding.__init__{   s    "("@"@$*$B$B!44[A!%!E!E>>Y&.t~~>L+7V+L((ZeD0(..2BuUrA   r~   ztorch.deviceseq_lenrH   ztorch.Tensorc           	         U R                   S   n[        U SS5      =(       d    U R                  U R                  -  nSnSU[        R
                  " SUS[        R                  S9R                  U[        R                  S9U-  -  -  nXe4$ )	aH  
Computes the inverse frequencies according to the original RoPE implementation
Args:
    config ([`~transformers.PreTrainedConfig`]):
        The model configuration.
    device (`torch.device`):
        The device to use for initialization of the inverse frequencies.
    seq_len (`int`, *optional*):
        The current sequence length. Unused for this type of RoPE.
Returns:
    Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
    post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).

rope_thetahead_dimNg      ?r   r!   rX   r~   rX   )	ry   getattrrR   num_attention_headsr<   arangeint64rY   rg   )ro   r~   r   basedimattention_factorrn   s          rB   rz   2CsmRotaryEmbedding.compute_default_rope_parameters   s    & %%l3fj$/c63E3EIcIc3c U\\!S!5;;?BB&X]XcXcBdgjjk
 ))rA   c                 L   U R                   S S S 2S 4   R                  5       R                  UR                  S   SS5      R	                  UR
                  5      nUS S 2S S S 24   R                  5       n[        UR
                  R                  [        5      (       a0  UR
                  R                  S:w  a  UR
                  R                  OSn[        USS9   UR                  5       UR                  5       -  R                  SS5      n[        R                  " Xf4SS	9nUR                  5       U R                  -  nUR                  5       U R                  -  n	S S S 5        WR	                  UR                   S
9W	R	                  UR                   S
94$ ! , (       d  f       N@= f)Nr   rV   r#   mpscpuF)device_typeenabledr!   r   r   )rn   rg   expandrc   rY   r~   
isinstancetypestrr   	transposer<   catcosr{   sinrX   )
rQ   xposition_idsinv_freq_expandedposition_ids_expandedr   freqsembr   r   s
             rB   r`   CsmRotaryEmbedding.forward   sN    !MM$4-8>>@GGHZHZ[\H]_acdehhijiqiqr ,QaZ 8 > > @'1!((--'E'E!((--[`J`ahhmmfkUC&,,.1F1L1L1NNYYZ[]^_E))UN3C'')d444C'')d444C	 D vvAGGv$cff177f&;;; DCs   BF
F#)r{   ro   rw   rx   rq   NNNN)r7   r8   r9   r:   r<   rh   r>   r$   rK   staticmethodr   intr?   rg   rz   no_gradr   r`   r@   ri   rj   s   @rB   rl   rl   x   s    llVy V V  #'+/"*D *(* t* 
~u$	%	* *: ]]_<  <rA   rl   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )CsmMLP   c                   > [         TU ]  5         Xl        UR                  U l        UR                  U l        [
        R                  " U R                  U R                  UR                  S9U l        [
        R                  " U R                  U R                  UR                  S9U l	        [
        R                  " U R                  U R                  UR                  S9U l
        [        UR                     U l        g )Nbias)rJ   rK   ro   rR   intermediate_sizerL   Linearmlp_bias	gate_projup_proj	down_projr   
hidden_actact_fnrQ   ro   rS   s     rB   rK   CsmMLP.__init__   s    !--!'!9!94#3#3T5K5KRXRaRabyy!1!143I3IPVP_P_`4#9#94;K;KRXRaRabV../rA   c                     U R                  U R                  U R                  U5      5      U R                  U5      -  5      nU$ r   )r   r   r   r   )rQ   r   r   s      rB   r`   CsmMLP.forward   s6    NN4;;t~~a/@#ADLLQRO#ST	rA   )r   ro   r   r   rR   r   r   r7   r8   r9   r:   rK   r`   r@   ri   rj   s   @rB   r   r      s    0 rA   r   c                     U SSU R                   S   S-  24   nU SU R                   S   S-  S24   n[        R                  " U* U4SS9$ )z*Rotates half the hidden dims of the input..NrV   r!   r   )rc   r<   r   )r   x1x2s      rB   rotate_halfr      sZ    	
3"!''"+"""	#B	
3q ""	#B99rc2YB''rA   rotary_pos_embc                     UR                  U5      nUR                  U5      nX-  [        U 5      U-  -   nX-  [        U5      U-  -   nXV4$ )aI  Applies Rotary Position Embedding to the query and key tensors.

Args:
    q (`torch.Tensor`): The query tensor.
    k (`torch.Tensor`): The key tensor.
    cos (`torch.Tensor`): The cosine part of the rotary embedding.
    sin (`torch.Tensor`): The sine part of the rotary embedding.
    unsqueeze_dim (`int`, *optional*, defaults to 1):
        The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
        sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
        that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
        k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
        cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
        the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
Returns:
    `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
)	unsqueezer   )qkr   r   unsqueeze_dimq_embedk_embeds          rB   apply_rotary_pos_embr      sS    & --
&C
--
&Cw;q>C/0Gw;q>C/0GrA   r.   n_reprH   c                     U R                   u  p#pEUS:X  a  U $ U SS2SS2SSS2SS24   R                  X#XU5      n U R                  X#U-  XE5      $ )z
This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
r#   N)rc   r   reshape)r.   r   batchnum_key_value_headsslenr   s         rB   	repeat_kvr      s_    
 2?1D1D.Ez!!Qa"23::5W\dlmM  e(CTTTrA   modulequerykeyvalueattention_maskscalingdropoutkwargsc                    [        X R                  5      n[        X0R                  5      n	[        R                  " XR	                  SS5      5      U-  n
Ub  X-   n
[
        R                  R                  U
S[        R                  S9R                  UR                  5      n
[
        R                  R                  XU R                  S9n
[        R                  " X5      nUR	                  SS5      R                  5       nX4$ )Nr!   r   rV   )r   rX   )ptrainingr#   )r   num_key_value_groupsr<   matmulr   rL   
functionalsoftmaxrZ   rY   rX   r   r   
contiguous)r   r   r   r   r   r   r   r   
key_statesvalue_statesattn_weightsattn_outputs               rB   eager_attention_forwardr      s     3 ; ;<JU$?$?@L<<';';Aq'ABWLL!#4==((2U]](SVVW\WbWbcL==((6??([L,,|:K''1-88:K$$rA   c                     ^  \ rS rSrSrS\S\4U 4S jjr   SS\R                  S\
\R                  \R                  4   S-  S	\R                  S-  S
\S-  S\\   S\
\R                  \R                  4   4S jjrSrU =r$ )CsmAttentioni  z=Multi-headed attention from 'Attention Is All You Need' paperro   	layer_idxc                 P  > [         TU ]  5         Xl        X l        [	        USUR
                  UR                  -  5      U l        UR                  UR                  -  U l	        U R                  S-  U l
        UR                  U l        SU l        [        R                  " UR
                  UR                  U R                  -  UR                  S9U l        [        R                  " UR
                  UR                  U R                  -  UR                  S9U l        [        R                  " UR
                  UR                  U R                  -  UR                  S9U l        [        R                  " UR                  U R                  -  UR
                  UR                  S9U l        g )Nr   g      Tr   )rJ   rK   ro   r   r   rR   r   r   r   r   r   attention_dropout	is_causalrL   r   attention_biasq_projk_projv_projo_projrQ   ro   r   rS   s      rB   rK   CsmAttention.__init__  sI   "
F4F4F&JdJd4de$*$>$>&B\B\$\!}}d*!'!9!9ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii&&68J8JQWQfQf
rA   Nr.   position_embeddingsr   r-   r   rH   c                    UR                   S S n/ UQSPU R                  P7nU R                  U5      R                  U5      R	                  SS5      nU R                  U5      R                  U5      R	                  SS5      n	U R                  U5      R                  U5      R	                  SS5      n
Uu  p[        XX5      u  pUb  UR                  XU R                  5      u  p[        R                  " U R                  R                  [        5      nU" U UU	U
U4U R                  (       d  SOU R                   U R"                  S.UD6u  pUR$                  " / UQSP76 R'                  5       nU R)                  U5      nX4$ )NrV   r#   r!           )r   r   )rc   r   r   viewr   r   r   r   updater   r   get_interfacero   _attn_implementationr   r   r   r   r   r   r   )rQ   r.   r   r   r-   r   input_shapehidden_shapequery_statesr   r   r   r   attention_interfacer   r   s                   rB   r`   CsmAttention.forward*  s~    $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST&#7RU#[ &'6'='=jX\XfXf'g$J(?(M(MKK,,.E)
 %8	%
  $}}C$2H2HLL	%
 	%
! "));;;;FFHkk+.((rA   )r   ro   r   r   r   r   r   r   r   r   r   r   )r7   r8   r9   r:   r;   r$   r   rK   r<   rh   r?   r	   r   r   r`   r@   ri   rj   s   @rB   r   r     s    G
y 
S 
4 IM.2(,&)||&) #5<<#=>E&) t+	&)
 &) +,&) 
u||U\\)	*&) &)rA   r   c                     ^  \ rS rSrS\S\4U 4S jjr     SS\R                  S\R                  S-  S\R                  S-  S	\
S-  S
\S-  S\\R                  \R                  4   S-  S\\   S\R                  4S jjrSrU =r$ )CsmDecoderLayeriS  ro   r   c                   > [         TU ]  5         UR                  U l        [        XS9U l        [        U5      U l        [        UR                  UR                  S9U l	        [        UR                  UR                  S9U l
        g )N)ro   r   rG   )rJ   rK   rR   r   	self_attnr   mlprE   rms_norm_epsinput_layernormpost_attention_layernormr   s      rB   rK   CsmDecoderLayer.__init__T  si    !--%VI&>)&*<*<&BUBUV(263E3E6K^K^(_%rA   Nr.   r   r   r-   	use_cacher   r   rH   c           
          UnU R                  U5      nU R                  " SUUUUUUS.UD6u  pX-   nUnU R                  U5      nU R                  U5      nX-   nU$ )N)r.   r   r   r-   r  r   r6   )r  r  r  r  )
rQ   r.   r   r   r-   r  r   r   residual_s
             rB   r`   CsmDecoderLayer.forward^  s     !,,];>> 
')%+ 3
 
 !0 !55mD/ 0rA   )rR   r  r  r  r  )NNNFN)r7   r8   r9   r:   r$   r   rK   r<   rh   
LongTensorr	   boolr?   r   r   r`   r@   ri   rj   s   @rB   r   r   S  s    `y `S ` /304(,!&HL|| t+ &&-	
  $; #5<<#=>E +, 
 rA   r   z[
    The bare Csm Model outputting raw hidden-states without any specific head on top.
    c                      ^  \ rS rSr% \\S'   SrSrSrS/r	S/r
SrSrSrSr\\S.r\R&                  " 5       U 4S	 j5       rS
rU =r$ )CsmPreTrainedModeli~  ro   model)audiotextTr   r-   )r.   r/   c                   > [         TU ]  U5        [        U[        5      (       aV  UR                  n[        US-
  5       H7  n[        R                  " UR                  SU R                  R                  S9  M9     g [        U[        5      (       aa  [        R                  " UR                  [        R                  " U R                  R                  5      U R                  R                   -  5        g g )Nr#   r   )r\   std)rJ   _init_weightsr   CsmCodebooksHeadnum_codebooksrangeinitnormal_rO   ro   initializer_rangeCsmBackboneModelEmbeddingscopy_audio_tokens_offsetsr<   r   
vocab_size)rQ   r   r  irS   s       rB   r   CsmPreTrainedModel._init_weights  s    f%f.//"00M=1,-V]]$++:W:WX . :;;JJv22ELLAZAZ4[^b^i^i^t^t4tu <rA   r6   )r7   r8   r9   r:   r$   r>   base_model_prefixinput_modalitiessupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_can_compile_fullgraph_supports_attention_backendr   r   _can_record_outputsr<   r   r  r@   ri   rj   s   @rB   r  r  ~  sr     (&*#*+#4"5N ""&("
 ]]_v vrA   r  c                   ,  ^  \ rS rSr% \\S'   U 4S jr\\\	       SS\
R                  S-  S\
R                  S-  S\
R                  S-  S\
R                  S-  S	\S-  S
\
R                  S-  S\S-  S\\   S\\-  4S jj5       5       5       rSrU =r$ )CsmDepthDecoderModeli  ro   c           	      j  > [         TU ]  U5        UR                  U l        UR                  U l        [
        R                  " UR                  UR                  -  UR                  5      U l	        [
        R                  " [        UR                  5       Vs/ s H  n[        X5      PM     sn5      U l        [        UR                   UR"                  S9U l        ['        US9U l        SU l        [
        R,                  " UR                  UR                   SS9U l        U R1                  5         g s  snf )Nr  ro   Fr   )rJ   rK   pad_token_idpadding_idxr   rL   	Embeddingr  backbone_hidden_sizeembed_tokens
ModuleListr  num_hidden_layersr   layersrE   rR   r  normrl   
rotary_embgradient_checkpointingr   inputs_embeds_projector	post_initr   s      rB   rK   CsmDepthDecoderModel.__init__  s     !.. ++LL&*>*>ARAR*RU[UpUpqmmAFvG_G_A`aA`I_V/A`a
 v11v7J7JK	,F;&+#')yy1L1LfN`N`gl'm$ 	 bs   D0N	input_idsbackbone_last_hidden_stater   r   r-   inputs_embedsr  r   rH   c           
         Ub&  [        5       (       d  [        R                  S5        SnUSL USL-  (       a  [        S5      eU(       a  Uc  [	        U R
                  S9nUb  UR                  5       OSn	Ub  UR                  S   OUR                  S   n
Ub  UR                  OUR                  n[        R                  " XU
-   US9nUcx  [        R                  " US-
  SS9nXR                  -  nU R                  X-   5      nUS   S:H  nUb	  X&SS2S4'   O+[        5       (       d  U(       a  [        R                  S	5        U R                  U5      n[!        U R
                  UUUUS
9nUnUR#                  S5      nU R%                  UUS9nU R&                  SU R
                  R(                    H  nU" U4UUUUUS.UD6nM     U R+                  U5      n[-        UU(       a  US9$ SS9$ )a*  
backbone_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, backbone_hidden_size)`, *optional*):
    The last hidden state of the backbone model. Such input is required when the first codebook token (the one generated by the backbone model)
    is provided in the `input_ids` argument.
NzCustom `position_ids` were provided but will be ignored. CSM depth decoder automatically determines position_ids and as it requires them to be identical across the batch, the provided position_ids will be ignored.z;You must specify exactly one of input_ids or inputs_embeds.r0  r   r#   r~   )minzvWhen the first codebook token is provided, `backbone_last_hidden_state` should also be provided for correct inference.ro   rA  r   r-   r   r   )r   r   r-   r  r   last_hidden_stater-   )r   loggerwarning_once
ValueErrorr
   ro   get_seq_lengthrc   r~   r<   r   clampr   r5  warningr<  r   r   r:  r8  r7  r9  r   )rQ   r?  r@  r   r   r-   rA  r  r   past_seen_tokensinputs_seq_lengthr~   codebook_idxsoffsetinput_ids_are_first_codebookcausal_maskr.   r   decoder_layers                      rB   r`   CsmDepthDecoderModel.forward  s    & #,D,F,Fw  L-t";<Z[[0*$++>O?N?Z?99;`a6C6OM//2U^UdUdefUg)6)B%%	HXHX||$4IZ6Zcij !KKq(8a@M"__4F --i.@AM+7?a+?()5&@ad#/116RNN Q 44]C(;;')+%
 & $--a0"oom,oW![[)H4;;+H+HIM)*) /#$7 M J 		-0&+/8O
 	
>B
 	
rA   )r5  r;  r<  r8  r9  r2  r:  r   )NNNNNNN)r7   r8   r9   r:   r%   r>   rK   r   r    r   r<   r  r=   rh   r	   r  r   r   r?   r   r`   r@   ri   rj   s   @rB   r.  r.    s    !!    .2?C.204(,26!%N
##d*N
 %*$5$5$<N
 t+	N

 &&-N
 N
 ((4/N
 $;N
 +,N
 
(	(N
    N
rA   r.  c                   2   ^  \ rS rSrU 4S jrSS jrSrU =r$ )r  i
  c                    > [         TU ]  5         X l        [        R                  " [
        R                  " U R                  S-
  X5      5      U l        g )Nr#   )rJ   rK   r  rL   rM   r<   emptyrO   )rQ   rR   r  r   rS   s       rB   rK   CsmCodebooksHead.__init__  s:    *ll5;;t/A/AA/E{#_`rA   c           
         US-
  nU R                   U   n[        UR                  S   5       Vs/ s H9  n[        R                  R                  US S 2US S 24   X4   R                  5      PM;     nn[        R                  " USS9nU$ s  snf )Nr#   r   r   )	rO   r  rc   rL   r   linearTr<   stack)rQ   r.   codebook_indicescodebook_weightcodebook_idxs        rB   r`   CsmCodebooksHead.forward  s    +a/++&67 !&o&;&;A&> ?
 ? MM  q,/A!BODaDcDcd ? 	 
 Mq9
s   A B)r  rO   r   r   rj   s   @rB   r  r  
  s    a
 rA   r  a$  
    The CsmDepthDecoder Model transformer, with a [`CsmCodebooksHead`] on top,
    which can be seen a position-specific language modeling head, allowing to use a different linear layer for each codebook
    (e.g. position 0 is the first codebook and uses the first codebook head, etc.)
    c                     ^  \ rS rSrSrSrSrU 4S jr\\	         SS\
R                  S-  S\
R                  S-  S\
R                  S-  S\
R                  S-  S\S-  S	\
R                  S-  S
\
R                  S-  S\S-  S\\
R                  -  S\\   S\\-  4S jj5       5       r     SS\
R                  S\S-  S\S-  S\
R                  S-  S	\
R                  S-  S\S-  4U 4S jjjrSrU =r$ )CsmDepthDecoderForCausalLMi  Nc                    > [         TU ]  U5        [        U5      U l        UR                  U l        [        UR                  UR                  UR                  5      U l        U R                  5         g r   )
rJ   rK   r.  r  r   r  rR   r  codebooks_headr=  r   s     rB   rK   #CsmDepthDecoderForCausalLM.__init__*  sY     )&1
 ++.v/A/A6CWCWY_YjYjk 	rA   r?  r@  r   r   r-   rA  labelsr  logits_to_keepr   rH   c
                    Ub  UR                  5       OSnUb  UR                  S   OUR                  S   nUb  UR                  OUR                  n[        R                  " XS9U-   nU R
                  " S	UUUUUUUS.U
D6nUS   n[        U	[        5      (       a!  U	S:X  a  [        SS5      nO[        U	* S5      nOU	nU R                  USS2USS24   UU   5      nUR                  5       nSnUbB  USSS24   R                  5       nU R                  " S	USU R                  R                  US.U
D6n[        UUUR                  UR                   UR"                  S9$ )
a  
backbone_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, backbone_hidden_size)`, *optional*):
    The last hidden state of the backbone model. Such input is required when the first codebook token (the one generated by the backbone model)
    is provided in the `input_ids` argument.
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
Nr   r#   rC  )r?  r@  r   r   r-   rA  r  .)r,   rh  r   shift_labels)r+   r,   r-   r.   r/   r6   )rL  rc   r~   r<   r   r  r   r   slicerf  r   loss_functionro   r   r   r-   r.   r/   )rQ   r?  r@  r   r   r-   rA  rh  r  ri  r   rO  r   r~   r_  outputsr.   slice_indicesr,   r+   rk  s                        rB   r`   "CsmDepthDecoderForCausalLM.forward3  s   0 @O?Z?99;`a,9,E-%%a(9??[\K])6)B%%	HXHX <<?BRR** 	
'A)%+'	
 	
  
nc**" %a %~ot <*M$$]1mQ3F%GIYZgIhi""$!#qr'?557L%% dt{{7M7M\hlrD &#33!//))
 	
rA   next_sequence_lengthis_first_iterationc                    > [         T	U ]  " XX4U40 UD6nU(       d  UR                  S5        UR                  S5        U$ )Nr@  r   )rJ   prepare_inputs_for_generationpop)
rQ   r?  rq  r-   r   rA  rr  r   model_inputsrS   s
            rB   rt  8CsmDepthDecoderForCausalLM.prepare_inputs_for_generationx  sM     w<_m
_e
 "9: 	(rA   )rf  r  r   )	NNNNNNNNr   )NNNNF)r7   r8   r9   r:   _tied_weights_keys_tp_plan_pp_planrK   r   r   r<   r  r=   rh   r	   r  r   r   r   r?   r   r`   rt  r@   ri   rj   s   @rB   rd  rd    s    HH  .2?C.204(,26*.!%-.A
##d*A
 %*$5$5$<A
 t+	A

 &&-A
 A
 ((4/A
   4'A
 $;A
 ell*A
 +,A
 
'	'A
  A
L ,0(,2626*/## "Dj 	
 ((4/ ((4/ !4K rA   rd  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )r  i  c                   > [         TU ]  5         [        R                  " UR                  UR
                  -  UR                  5      U l        U R                  S[        R                  " UR                  5      UR
                  -  SS9  g )Nr  Frs   )rJ   rK   rL   r3  r  codebook_sizerR   embed_audio_tokensr|   r<   r   r   s     rB   rK   #CsmBackboneModelEmbeddings.__init__  sn    "$,,0D0DvG[G[0[^d^p^p"q"ELL1E1E$FI]I]$]jo 	 	
rA   c                 ^    U R                  XR                  -   5      nUR                  SS9nU$ )Nr!   r   )r~  r  sum)rQ   r?  rA  s      rB   r`   "CsmBackboneModelEmbeddings.forward  s4    //	<U<U0UV%))a)0rA   )r~  r   rj   s   @rB   r  r    s    
 rA   r  c                      ^  \ rS rSrU 4S jr\\\      SS\R                  S-  S\R                  S-  S\R                  S-  S\S-  S\R                  S-  S	\S-  S
\\   S\4S jj5       5       5       rSrU =r$ )CsmBackboneModeli  c           	        > [         TU ]  U5        UR                  U l        UR                  U l        [        U5      U l        [        R                  " [        UR                  5       Vs/ s H  n[        X5      PM     sn5      U l        [        UR                  UR                  S9U l        [#        US9U l        SU l        U R)                  5         g s  snf )Nr  r0  F)rJ   rK   r1  r2  r   r  r5  rL   r6  r  r7  r   r8  rE   rR   r  r9  rl   r:  r;  r=  r   s      rB   rK   CsmBackboneModel.__init__  s     !.. ++6v>mmAFvG_G_A`aA`I_V/A`a
 v11v7J7JK	,F;&+# 	 bs   *CNr?  r   r   r-   rA  r  r   rH   c           
      >   USL USL-  (       a  [        S5      eUc  U R                  U5      nU(       a  Uc  [        U R                  S9nUcU  Ub  UR	                  5       OSn[
        R                  " UR                  S   UR                  S9U-   nUR                  S5      n[        U R                  UUUUS9n	Un
U R                  XS9nU R                  SU R                  R                    H  nU" U
4U	UUUUS	.UD6n
M     U R                  U
5      n
[        U
US
9$ )a  
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length, num_codebooks) or (batch_size, sequence_length)`):
    1. (batch_size, sequence_length): corresponds to the input sequence prepared with the processor from the text prompt. Such input
    requires `input_values` to be provided so that audio can be encoded in codebook tokens and then merged with the text tokens.

    2. (batch_size, sequence_length, num_codebooks): codebook tokens generated during the autoregressive decoding. Such input is not meant to be used by end users.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are input IDs?](../glossary#input-ids)
Nz:You must specify exactly one of input_ids or inputs_embedsr0  r   r#   rC  rE  rF  )r   r   r   r-   r  rG  )rK  r5  r
   ro   rL  r<   r   rc   r~   r   r   r:  r8  r7  r9  r   )rQ   r?  r   r   r-   rA  r  r   rO  rT  r.   r   rU  s                rB   r`   CsmBackboneModel.forward  sF   2 -t";<YZZ *.*;*;I*FM0*$++>OCRC^==?de <<(;(;A(>}G[G[\_ooL'11!4L(;;')+%
 &"oomoW![[)H4;;+H+HIM)*$7) /# M J 		-0&++
 	
rA   )r5  r;  r8  r9  r2  r:  r   )NNNNNN)r7   r8   r9   r:   rK   r   r    r   r<   r  rh   r	   r=   r  r   r   r   r`   r@   ri   rj   s   @rB   r  r    s       .2.204(,26!%>
##d*>
 t+>
 &&-	>

 >
 ((4/>
 $;>
 +,>
 
!>
    >
rA   r  z
    The Csm model consists of two llama-like auto-regressive transformer models: a backbone model that predicts the first codebook token and a depth decoder that predicts the other codebook tokens.
    c                     ^  \ rS rSrSS0rU 4S jrS rS r\U 4S j5       r	U 4S jr
    SS
\R                  S	-  S\R                  S	-  S\R                  S	-  S\R                  S	-  S\R                  S	-  4
S jjr    SS
\R                  S\S	-  S\S	-  S\R                  S	-  S\R"                  S	-  4
U 4S jjjr\\          SS
\R                  S	-  S\R                  S	-  S\R                  S	-  S\R                  S	-  S\R                  S	-  S\S	-  S\R"                  S	-  S\R                  S	-  S\S	-  S\\R                  -  S\\   S\\-  4S jj5       5       rSrU =r$ )CsmForConditionalGenerationi  z5backbone_model.embed_tokens.embed_audio_tokens.weightz'depth_decoder.model.embed_tokens.weightc                   > [         TU ]  U5        UR                  U l        [        R                  " UR
                  UR                  SS9U l        [        R                  " UR                  UR
                  5      U l	        [        R                  U5      U l        [        R                  UR                  5      U l        [         R"                  " UR$                  5      U l        U R)                  5         g )NFr   )rJ   rK   r   rL   r   rR   lm_headr3  text_vocab_sizeembed_text_tokensr  _from_configbackbone_modelrd  depth_decoder_configdepth_decoderr"   from_configcodec_configcodec_modelr=  r   s     rB   rK   $CsmForConditionalGeneration.__init__  s      ++yy!3!3V5F5FUS!#f.D.DfFXFX!Y.;;FC7DDVE`E`a$001D1DErA   c                 .    U R                   R                  $ r   r  r5  rd   s    rB   get_input_embeddings0CsmForConditionalGeneration.get_input_embeddings  s    ""///rA   c                 $    XR                   l        g r   r  )rQ   r   s     rB   set_input_embeddings0CsmForConditionalGeneration.set_input_embeddings	  s    +0(rA   c                    > UR                  SS5      (       a  [        T
U ]  " U0 UD6u  p4O[        T
U ]  " U0 UD6nSn[        U5      n[	        UR
                  5      R                  5        VVs0 s H"  u  pxUR                  U5      (       d  M  XvS  U_M$     n	nn[	        UR                  R
                  5      R                  SS0U	E5        U	 H  n[        UR
                  XW-   5        M     SU;   a  UW4$ U$ s  snnf )Noutput_loading_infoFdepth_decoder__from_model_config)getrJ   from_pretrainedlenvarsgeneration_configitems
startswithr  r   delattr)clsargsr   r  loading_infoprefix
prefix_lenattrr   depth_decoder_attrsrS   s             rB   r  +CsmForConditionalGeneration.from_pretrained  s   ::+U33"''"94"J6"JE<G+T<V<E "[
  $E$;$;<BBD
Dv& %Du$D 	 
 	U  223::<PRW;o[n;op (DE++V]; ( !F*,&&L
s   /C:	C:c                    > SnU R                   R                  R                  5       nUR                  SS 5        UR	                  5        H  u  pV[        U R                  X5-   U5        M      [        TU ]  " U0 UD6  g )Nr  transformers_version)r  r  to_diff_dictru  r  setattrrJ   save_pretrained)rQ   r  r   r  r  r  r   rS   s          rB   r  +CsmForConditionalGeneration.save_pretrained'  sq    !"00BBOOQ 6=.446KDD**FM5A 7 	00rA   Nr?  input_valuesinput_values_cutoffsrh  rH   c                    U R                  U5      nUGbN  [        R                  R                  US5      nX3S:     R	                  5       nXfS:     n[
        R                  " UR                  5       UR                  S9R                  [        U5      S5      nXvR                  S5      :  n[
        R                  " 5          / n[        X#5       H  u  pXS:     n
[        U
R                  S   S-
  5       Hp  nX   nXS-      nU	SX24   nU R                   R#                  UR                  S5      5      nUR$                  R'                  SS5      nUR)                  US   5        Mr     M     [        S U 5       5      n[
        R*                  " U Vs/ s H7  n[        R                  R                  USSSUUR                  S   -
  45      PM9     sn5      nU R                   R-                  U5      nSSS5        U R.                  R0                  nUU:H  nU R2                  R5                  W5      nUW   UU'   [
        R6                  " SSU R.                  R8                  4UR                  [
        R:                  S	9U R.                  R<                  -  nU R2                  R5                  U5      R?                  S5      nXR.                  R@                  :H  nURC                  URE                  5       S5      UU'   Ubg  UR                  S5      RC                  SSU R.                  R8                  5      nUU   UU'   UUU'   US
:H  RG                  SS9nSUUS   US   SS24'   UnXTS.$ s  snf ! , (       d  f       GN= f)a8  
Merges the input_ids and input_values to produce a single inputs_embeds tensor:
1 - Infers the codec model on the input_values to retrieve codebook token.
2 - Embeds codebook tokens and places them at the correct positions in the inputs_embeds tensor.
3 - If labels are provided, expands them to match codebook dimensions and position the target codebook tokens in the inputs_embeds tensor.

Args:
    input_ids (`torch.Tensor` of shape `(batch_size, sequence_length)`):
        The input ids to embed.
    input_values (`torch.Tensor` of shape `(batch_size, channels, audio_sequence_length)`):
        The audio input values to embed.
    input_values_cutoffs (`torch.Tensor` of shape `(batch_size, max_num_audio)`):
        The cutoffs of the audio input values relative to its batch index, padded with -1 when no audio.
Nr#   r   r   rC  rV   r#   .c              3   >   #    U  H  oR                   S    v   M     g7f)r   N)rc   ).0els     rB   	<genexpr>QCsmForConditionalGeneration._merge_input_ids_with_input_values.<locals>.<genexpr>a  s     &O=Nrxx{=Ns   r   iTas_tuple)rA  rh  )$r  rL   r   paddiffr<   r   maxr~   r   r  r   r   zipr  rc   r  encodeaudio_codesr   appendr^  get_audio_codes_maskro   audio_token_idr  r5  rN   r  longcodebook_eos_token_idsqueezeaudio_eos_token_idrepeatr  nonzero)rQ   r?  r  r  rh  rA  audio_lengthsinput_values_maskaudio_tokens_listbatch_input_valuesbatch_input_values_cutoffsr!  	start_idxend_idxaudio_batchcodec_outputscodebook_idsmax_audio_framesr  batched_audio_token_idsaudio_codes_maskr  audio_token_maskaudio_embedsaudio_eos_frame_idsaudio_eos_embedsaudio_eos_token_masklabels_expanded depth_decoder_ignore_frames_idxss                                rB   "_merge_input_ids_with_input_values>CsmForConditionalGeneration._merge_input_ids_with_input_values1  su   * ..y9##%==#4#45I6#R 01JKPPRM)!*;<M %-A-E-E-GP\PcPc d k kM"B! !24K4KA4N N
 $&!FI,FmB&1KjkLk1l."#=#C#CA#F#JK$>$A	"<U"C&8i>O9O&P(,(8(8(?(?@U@UVW@X(Y'4'@'@'J'J1b'Q)00aA L Gn $'&O=N&O#O */++`qr`qZ\R]]&&rAq!5EQR5S+TU`qr+' $(#3#3#H#HIZ#[ ! !$ "[[77N(N:..;;<STL.:;K.LM*+ 

Aq$++";";<YEUEU]b]g]gh++334    $22??@ST\\]^_#,0N0N#N 2B2I2IJ^JbJbJdfg2hM./ !"("2"22"6"="=aDKKD]D]"^4KL\4] 018K 454:dN3K3KUY3K3Z0pt @ CEefgEhjkjl lm(!.AA= s !s    CM->M(
"M-(M--
M<rq  r-   r   rA  c           	      2  > [         T	U ]  " S	UUUUUS.UD6nUb|  UR                  S:X  al  UR                  S5      cZ  U R	                  UUR                  S5      UR                  S5      UR                  S5      S9nUR                  US   US   S S.5        U$ )
N)r?  rq  r-   r   rA  r!   rA  r  r  rh  )r?  r  r  rh  )rA  rh  r?  r6   )rJ   rt  ndimr  r  r   )
rQ   r?  rq  r-   r   rA  r   rv  merged_inputsrS   s
            rB   rt  9CsmForConditionalGeneration.prepare_inputs_for_generation  s     w< 
!5+)'
 
  Y^^q%8\=M=Mo=^=f CC##ZZ7%+ZZ0F%Gzz(+	 D M "/"@MZbLcrvw rA   r   r  ri  r   c                    Ub.  UR                   S:X  a  U R                  XXH5      nUS   nUS   nSnU R                  " SUUUUUU	S.UD6nUS   n[        U
[        5      (       a  [        U
* S5      OU
nU R                  USS2USS24   5      nSnSnSnSnUb  USS2SS2S4   nU R                  " SUUU R                  R                  S.UD6nUSS2SS2SS24   S	:H  R                  S
S9) nUU   SSU R                  R                  S-
  24   n[        R                  R                  USSS9nUR                  SS9nUUS   US   S-
  SS24   nUU   nU R                   " SUUU	SUS.UD6nUR"                  nUU-   n[%        UUUUUR&                  UR(                  UR*                  Ub  UR,                  OSUb  UR&                  OSUb  UR(                  OSUb  UR*                  S9$ SS9$ )a`  
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length, num_codebooks) or (batch_size, sequence_length)`):
    1. (batch_size, sequence_length): corresponds to the input sequence prepared with the processor from the text prompt. Such input
    requires `input_values` to be provided so that audio can be encoded in codebook tokens and then merged with the text tokens.

    2. (batch_size, sequence_length, num_codebooks): codebook tokens generated during the autoregressive decoding. Such input is not meant to be used by end users.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are input IDs?](../glossary#input-ids)
input_values_cutoffs (`torch.Tensor` of shape `(batch_size, max_num_audio)`, *optional*):
    Specify the end positions of audio segments within each batch entry, relative to the concatenated audio input.
    If a batch entry has fewer segments than the maximum, it is padded with -1. For example, in a batch of 2 sequences
    where the first contains 2 audio segments of length l1, and the second contains 1 audio segment of length l2,
    the input_values_cutoffs would be: [[l1, 2 * l1], [l2, -1]].
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should be in `[config.audio_token_id, -100, -101]`.
    Requires targeted `input_values` to be provided as audio tokens will be inferred from it using the `codec_model`.
    - `config.audio_token_id` indicates an audio frames (considering sequence length elements as frames)
    - `-100` will be ignored in the loss computation
    - `-101` indicates the audio frame will be used only for the backbone model (using the first codebook token as labels)

    Such labels can be prepared using `output_labels=True` when calling [`CsmProcessor`].
logits_to_keep (`int` or `torch.Tensor`, *optional*):
    Kept for compatibility. Does not support another value than:
    1. `0`, which is equivalent to keeping all logits, used in the training regime
    2. `1`, which is equivalent to keeping only the last logit, used in the generation regime

Example:

```python
>>> import torch
>>> from transformers import CsmForConditionalGeneration, AutoProcessor
>>> from datasets import load_dataset, Audio

>>> model_id = "sesame/csm-1b"
>>> torch_device = "cuda" if torch.cuda.is_available() else "cpu"

>>> processor = AutoProcessor.from_pretrained(model_id)

>>> ds = load_dataset("hf-internal-testing/dailytalk-dummy", split="train")
>>> # ensure the audio is 24kHz
>>> ds = ds.cast_column("audio", Audio(sampling_rate=24000))

>>> conversation = []
>>> # prepare a conversation with text and corresponding audio
>>> for text, audio, speaker_id in zip(ds[:4]["text"], ds[:4]["audio"], ds[:4]["speaker_id"]):
...     conversation.append(
...         {
...             "role": f"{speaker_id}",
...             "content": [{"type": "text", "text": text}, {"type": "audio", "path": audio["array"]}],
...         }
...     )

>>> inputs = processor.apply_chat_template(
...     conversation,
...     tokenize=True,
...     return_dict=True,
...     output_labels=True,
... ).to(torch_device)

>>> model = CsmForConditionalGeneration.from_pretrained(model_id, device_map=torch_device)
>>> output = model(**inputs)
>>> output.loss.backward()
```Nr!   rA  rh  )r?  r   r   r-   rA  r  r   )r,   rh  r   r#   r  rV   r   .r  )r   Tr  )r?  r@  r  return_dictrh  )r+   r5   r0   r,   r-   r.   r/   r1   r2   r3   r4   r6   )r  r  r  r   r   rl  r  rm  ro   r   allr  rL   r   r  r  r  r+   r)   r-   r.   r/   r,   )rQ   r?  r  r   r  r   r-   rA  rh  r  ri  r   r  backbone_outputsbackbone_hidden_statesro  backbone_logitsr+   r5   r0   depth_decoder_outputsbackbone_labels
train_maskdepth_decoder_input_ids
train_idxsbackbone_last_hidden_statesdepth_decoder_labelss                              rB   r`   #CsmForConditionalGeneration.forward  s   d  Y^^q%8 CC)=M */:M"8,FI.. 
)%+'
 
 "2!!48B>SV8W8W~ot4]k,,'=aPQ>Q'RS! $$Q1WoO .. &4;;KaKaekM "!Q(+t388R8@@J&,Z&8>]@Y@Y\]@]>]9]&^#&(mm&7&78OQW_`&7&a##++T+:J*@APZ[\P]`aPacdAd*e'#)*#5 $($6$6 %1+F# +% %! "7!;!; #55D '1",<<*88'22AVAb!6!=!=hl$0 +@*O*O$0 )>(K(KI^Ij%:%E%E
 	
 qu
 	
rA   )r  r  r  r  r  r   )NNNN)
NNNNNNNNNr   )r7   r8   r9   r:   rx  rK   r  r  classmethodr  r  r<   rh   r  r  r   r	   r=   rt  r   r   r  r   r   r?   r)   r`   r@   ri   rj   s   @rB   r  r    sI    	@Aj01  41 *.,048&*PB<<$&PB llT)PB $llT1	PB
 t#PB 
	PBj ,0(,2626## "Dj 	
 ((4/ ((4/ >  .2,0.24804(,26*.!%-.Y
##d*Y
 llT)Y
 t+	Y

 $llT1Y
 &&-Y
 Y
 ((4/Y
   4'Y
 $;Y
 ell*Y
 +,Y
 
"	"Y
  Y
rA   r  )r  r  r.  rd  r  )r#   )r   )Scollections.abcr   dataclassesr   typingr   r<   torch.nnrL    r   r  activationsr   cache_utilsr	   r
   
generationr   integrationsr   r   r   masking_utilsr   modeling_layersr   modeling_outputsr   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   utils.genericr   r   utils.import_utilsr   utils.output_capturingr    autor"   configuration_csmr$   r%   generation_csmr&   
get_loggerr7   rI  r)   ModulerE   rl   r   r   r   rh   r   r   rg   r   r   r   r  r.  r  rd  r  r  r  __all__r6   rA   rB   <module>r     s  * % !    & ! . ) f f / 9 O K F & _ _ G : 5  ? . 
		H	% 
 '3 '3 '3T Y'J J (J(>< ><BRYY  ( *+ ,2	UU\\ 	U# 	U%,, 	U& %II%<<% 
% <<	%
 LL4'% % % '(%2 )*@)299 @) +@)F(0 (V 
 v v v< d
- d
 d
Nryy ( g!3_ ggT  Q
) Q
 Q
h 
F
"46H F

F
R
rA   