
    Z j                        S SK Jr  S SKJr  S SKrS SKJr  S SKJr  SSK	J
r
  SSKJrJrJr  SSKJr  SS	KJr  SS
KJrJr  SSKJr  SSKJr  SSKJrJrJrJrJr  SSK J!r!  SSK"J#r#J$r$  SSK%J&r&  SSK'J(r(J)r)J*r*J+r+  SSK,J-r-  SSK.J/r/J0r0  SSK1J2r2J3r3J4r4  SSK5J6r6J7r7J8r8  SSK9J:r:J;r;  \+Rx                  " \=5      r>\)" SS9\ " S S\5      5       5       r?\\)" SS9 " S S\5      5       5       r@ " S  S!\R                  5      rB " S" S#\R                  5      rC " S$ S%\35      rD " S& S'\25      rE " S( S)\65      rF " S* S+\5      rG\) " S, S-\$5      5       rH " S. S/\H5      rI " S0 S1\75      rJ " S2 S3\:5      rK\)" S4S9 " S5 S6\H\5      5       rL/ S7QrMg)8    )Callable)	dataclassN)strict   )ACT2FN)CacheDynamicCacheEncoderDecoderCache)PreTrainedConfig)GenerationMixin)create_bidirectional_maskcreate_causal_mask)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPast)BaseModelOutputWithPastAndCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutput)RopeParameters)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuplelogging)merge_with_config_defaults)OutputRecordercapture_outputs   )GlmAttentionGlmRotaryEmbeddingapply_rotary_pos_emb)LlamaDecoderLayer
LlamaModeleager_attention_forward)WhisperModelshift_tokens_rightzUsefulSensors/moonshine-tiny)
checkpointc                     ^  \ rS rSr% SrSrS/rSSSSS	.rS
r\	\
S'   Sr\	\
S'   Sr\	\
S'   Sr\	\
S'   Sr\	\
S'   Sr\	\
S'   Sr\	\
S'   Sr\	S-  \
S'   Sr\	S-  \
S'   Sr\	S-  \
S'   Sr\\
S'   Sr\\
S'   Sr\	\
S'   Sr\\
S'   Sr\	\
S'   S r\\
S!'   Sr\\-  S-  \
S"'   S r \\
S#'   S$r!\\
S%'   S&r"\\	-  \
S''   Sr#\	S-  \
S('   S)r$\	\%\	   -  S-  \
S*'   Sr&\	S-  \
S+'   S r'\\
S,'   U 4S- jr(S.r)U =r*$ )/MoonshineConfig2   a	  
encoder_num_key_value_heads (`int`, *optional*):
    This is the number of key_value heads that should be used to implement Grouped Query Attention. If
    `encoder_num_key_value_heads=encoder_num_attention_heads`, the model will use Multi Head Attention (MHA), if
    `encoder_num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
    converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
    by meanpooling all the original heads within that group. For more details, check out [this
    paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
    `num_attention_heads`.
decoder_num_key_value_heads (`int`, *optional*):
    This is the number of key_value heads that should be used to implement Grouped Query Attention. If
    `decoder_num_key_value_heads=decoder_num_attention_heads`, the model will use Multi Head Attention (MHA), if
    `decoder_num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
    converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
    by meanpooling all the original heads within that group. For more details, check out [this
    paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
    `decoder_num_attention_heads`.
pad_head_dim_to_multiple_of (`int`, *optional*):
    Pad head dimension in encoder and decoder to the next multiple of this value. Necessary for using certain
    optimized attention implementations.
encoder_hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
    The non-linear activation function (function or string) in the encoder.
decoder_hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
    The non-linear activation function (function or string) in the decoder.

Example:

```python
>>> from transformers import MoonshineModel, MoonshineConfig

>>> # Initializing a Moonshine style configuration
>>> configuration = MoonshineConfig().from_pretrained("UsefulSensors/moonshine-tiny")

>>> # Initializing a model from the configuration
>>> model = MoonshineModel(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
```	moonshinepast_key_valuesdecoder_num_key_value_headsdecoder_num_attention_headsdecoder_num_hidden_layersdecoder_hidden_act)num_key_value_headsnum_attention_headsnum_hidden_layers
hidden_acti   
vocab_sizei   hidden_sizei  intermediate_size   encoder_num_hidden_layers   encoder_num_attention_headsNencoder_num_key_value_headspad_head_dim_to_multiple_ofgeluencoder_hidden_actsilui   max_position_embeddingsg{Gz?initializer_range   decoder_start_token_idT	use_cacherope_parametersis_encoder_decoderFattention_bias        attention_dropoutbos_token_idr!   eos_token_idpad_token_idtie_word_embeddingsc                    > U R                   c  U R                  U l         U R                  c  U R                  U l        UR	                  SS5        [
        TU ]  " S0 UD6  g )Npartial_rotary_factorg? )r?   r>   r0   r1   
setdefaultsuper__post_init__)selfkwargs	__class__s     ڀ/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/moonshine/modular_moonshine.pyrW   MoonshineConfig.__post_init__   sX    ++3/3/O/OD,++3/3/O/OD,137''    )r0   r?   )+__name__
__module____qualname____firstlineno____doc__
model_typekeys_to_ignore_at_inferenceattribute_mapr8   int__annotations__r9   r:   r<   r2   r>   r1   r?   r0   r@   rB   strr3   rD   rE   floatrG   rH   boolrI   r   dictrJ   rK   rM   rN   rO   listrP   rQ   rW   __static_attributes____classcell__rZ   s   @r[   r,   r,   2   sg   &P J#4"5<<8*	M JK!s!%&s&%&s&'(('((.2t2.2t2.2t2$$$$#&S&#u#"#C#It48O^d*T18## ND %(us{( L#* +,L#S	/D(,#L#*# $$( (r]   r,   z
    Extends [~modeling_outputs.BaseModelOutput] to include the output attention mask since sequence length is not preserved in the model's forward.
    )custom_introc                   >    \ rS rSr% Sr\R                  S-  \S'   Srg)MoonshineEncoderModelOutput   Nattention_maskrT   )	r^   r_   r`   ra   rt   torchTensorrg   rm   rT   r]   r[   rr   rr      s     +/NELL4'.r]   rr   c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )MoonshineEncoderMLP   c                 
  > [         TU ]  5         Xl        [        U   U l        [
        R                  " UR                  UR                  5      U l	        [
        R                  " UR                  UR                  5      U l
        g NrV   __init__configr   activation_fnnnLinearr9   r:   fc1fc2rX   r~   r7   rZ   s      r[   r}   MoonshineEncoderMLP.__init__   s\    #J/99V//1I1IJ99V55v7I7IJr]   hidden_statesreturnc                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ r{   )r   r   r   )rX   r   s     r[   forwardMoonshineEncoderMLP.forward   s4    /**=9/r]   r   r~   r   r   
r^   r_   r`   ra   r}   ru   rv   r   rm   rn   ro   s   @r[   rx   rx      s)    KU\\ ell  r]   rx   c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )MoonshineDecoderMLP   c                   > [         TU ]  5         Xl        [        U   U l        [
        R                  " UR                  UR                  S-  5      U l	        [
        R                  " UR                  UR                  5      U l
        g )Nr!   r|   r   s      r[   r}   MoonshineDecoderMLP.__init__   sa    #J/99V//1I1IA1MN99V55v7I7IJr]   r   r   c                     U R                  U5      nUR                  SSS9u  pU R                  U5      U-  nU R                  U5      nU$ )Nr!   )dim)r   chunkr   r   )rX   r   gates      r[   r   MoonshineDecoderMLP.forward   sQ    /+11!1<**40=@/r]   r   r   ro   s   @r[   r   r      s)    KU\\ ell  r]   r   c                       \ rS rSrSrg)MoonshineRotaryEmbedding   rT   N)r^   r_   r`   ra   rm   rT   r]   r[   r   r      s    r]   r   c                   X  ^  \ rS rSrS\S\S\S\S\4
U 4S jjr    SS	\R                  S
\
\R                  \R                  4   S-  S\R                  S-  S\S-  S\R                  S-  S\\   S\
\R                  \R                  S-  \
\R                     S-  4   4S jjrSrU =r$ )MoonshineAttention   r~   	layer_idx	is_causalr5   r4   c                 f  > UR                  XES.5        [        TU ]	  X5        X0l        [	        USUR
                  UR                  -  5      U l        U R                  R                  bA  U R                  R                  nX`R                  U-   S-
  U-  -  nXpR                  -
  U l
        g SU l
        g )N)r5   r4   head_dimrF   r   )updaterV   r}   r   getattrr9   r5   r   r~   r@   head_dim_padding)	rX   r~   r   r   r5   r4   target_multipletarget_head_dimrZ   s	           r[   r}   MoonshineAttention.__init__   s     	.Ano+"
F4F4F&JdJd4de ;;22>"kkEEO---/2QTU2UZi1ijO$3mm$CD!$%D!r]   Nr   position_embeddingsrt   r/   key_value_statesrY   r   c                    UR                   S S u  pxU R                  U5      R                  XxU R                  R                  U R
                  5      R                  SS5      n	US Ln
Ub^  UR                  R                  U R                  5      nU
(       a&  SUR                  U R                  '   UR                  nOUR                  nUb  UOUnU
(       aU  U(       aN  W(       aG  UR                  U R                     R                  nUR                  U R                     R                  nOU R                  U5      R                  USU R                  R                  U R
                  5      R                  SS5      nU R!                  U5      R                  USU R                  R                  U R
                  5      R                  SS5      nU
(       a!  Ub  UR#                  XU R                  5      u  pU
(       d5  Uu  nn[%        XUU5      u  pUb  UR#                  XU R                  5      u  p[&        R(                  " U R                  R*                  [,        5      nU R.                  =(       a    US L =(       a    US:  nU R0                  S:  a  [2        R4                  R6                  R9                  U	SU R0                  45      n	[2        R4                  R6                  R9                  USU R0                  45      n[2        R4                  R6                  R9                  USU R0                  45      nU" U U	UUU4U R:                  (       d  SOU R<                  U R>                  US.UD6u  nnU R0                  S:  a  USS U R0                  * 24   nURA                  XxS5      RC                  5       nU RE                  U5      nUU4$ )	Nr   rF   r!   Tr   rL   )dropoutscalingr   .)#shapeq_projviewr~   r4   r   	transpose
is_updatedgetr   cross_attention_cacheself_attention_cachelayerskeysvaluesk_projv_projr   r$   r   get_interface_attn_implementationr'   r   r   ru   r   
functionalpadtrainingrM   r   reshape
contiguouso_proj)rX   r   r   rt   r/   r   rY   bszq_lenquery_statesis_cross_attentionr   current_states
key_statesvalue_statescossinattention_interfacer   attn_outputattn_weightss                        r[   r   MoonshineAttention.forward   sK    #(("-
 KK&++C8W8WY]YfYfgqqrsuvw 	 .T9&(3377GJ!=A**4>>:"1"G"G"1"F"F .>-I)}/j(//?DDJ*11$..AHHL N+c2t{{>>N1a  N+c2t{{>>N1a 
 "o&A+:+A+A*\`\j\j+k(
!*HC';LVY[^'_$L*+:+A+A*\`\j\j+k(
(?(M(MKK,,.E)
 NNK~'=K%!)	  1$ 88..22<!TEZEZA[\L,,00aAVAV=WXJ 88..22<!TEZEZA[\L$7
%
  $}}C$2H2HLL
%
 
%
!\   1$%c+Cd.C.C-C+C&CDK!))#b9DDFkk+.L((r]   )r   r   r   )NNNN)r^   r_   r`   ra   r,   rf   rj   r}   ru   rv   tupler   r   r   r   rm   rn   ro   s   @r[   r   r      s    && & 	&
 !& !&0 IM.2(,04O)||O) #5<<#=>EO) t+	O)
 O)  ,,-O) -.O) 
u||U\\D0%2E2LL	MO) O)r]   r   c                   4   ^  \ rS rSrS\S\4U 4S jjrSrU =r$ )MoonshineEncoderLayeri  r~   r   c                 4  > [         TU ]  X5        [        UUSUR                  UR                  S9U l        [        XR                  5      U l        [        R                  " UR                  SS9U l        [        R                  " UR                  SS9U l        g )NFr~   r   r   r5   r4   bias)rV   r}   r   r>   r?   	self_attnrx   rB   mlpr   	LayerNormr9   input_layernormpost_attention_layernormrX   r~   r   rZ   s      r[   r}   MoonshineEncoderLayer.__init__   s}    ++ & B B & B B
 'v/H/HI!||F,>,>UK(*V5G5Ge(T%r]   )r   r   r   r   )	r^   r_   r`   ra   r,   rf   r}   rm   rn   ro   s   @r[   r   r     s    U U3 U Ur]   r   c                     ^  \ rS rSrSS\S\S-  4U 4S jjjr         SS\R                  S\R                  S-  S\R                  S-  S	\R                  S-  S
\R                  S-  S\R                  S-  S\
S-  S\S-  S\\R                  \R                  4   S-  S\\R                  \R                  4   S-  S\\   S\\R                  \\R                  \R                  4   S-  4   4S jjrSrU =r$ )MoonshineDecoderLayeri0  Nr~   r   c                   > [         TU ]  5         UR                  U l        [        UUSUR                  UR
                  S9U l        [        UUSUR                  UR
                  S9U l        [        XR                  5      U l
        [        R                  " UR                  SS9U l        [        R                  " UR                  SS9U l        [        R                  " UR                  SS9U l        g )NTr   Fr   )rV   r}   r9   r   r5   r4   r   encoder_attnr   r7   r   r   r   r   r   final_layernormr   s      r[   r}   MoonshineDecoderLayer.__init__1  s    !--+ & : : & : :
 / & : : & : :
 'v/@/@A!||F,>,>UK(*V5G5Ge(T%!||F,>,>UKr]   r   rt   encoder_hidden_statesencoder_attention_maskposition_idsencoder_position_idsr/   rH   r   encoder_position_embeddingsrY   r   c           
         UnU R                  U5      nU R                  " SUUUUUU	S.UD6u  pX-   nUb,  UnU R                  U5      nU R                  UUUUUS9u  pX-   nUnU R	                  U5      nU R                  U5      nX-   nU$ )N)r   rt   r   r/   rH   r   )r   r   rt   r/   rH   rT   )r   r   r   r   r   r   )rX   r   rt   r   r   r   r   r/   rH   r   r   rY   residual_s                 r[   r   MoonshineDecoderLayer.forwardI  s     !,,];>> 
')%+ 3
 
 !0 ,$H 99-HM#00+!65 /#  1  M %4M ,,];/ 0r]   )r   r   r9   r   r   r   r   r{   )	NNNNNNFNN)r^   r_   r`   ra   r,   rf   r}   ru   rv   
LongTensorr   rj   r   r   r   FloatTensorr   rm   rn   ro   s   @r[   r   r   0  s]   L L3: L L6 /3596:048<(,!&HLPT,||, t+,  %||d2	,
 !&t 3, &&-, $..5, , $;, #5<<#=>E, &+5<<+E%F%M, +,, 
u  %(9(95;L;L(L"MPT"TT	U, ,r]   r   c                   f    \ rS rSr% \\S'   SrSrSrSr	SS/r
SrSrSrS	\R                  4S
 jrSrg)MoonshinePreTrainedModelix  r~   modelinput_valuesaudioTr   r   input_lengthsc                 ~    [        US-
  S-  S-   5      n[        US-
  S-  S-   5      n[        US-
  S-  S-   5      nU$ )z8
Computes the output length of the convolutional layers
   @   rF      r   r!   )rf   )rX   r   output_conv1_lengthoutput_conv2_lengthoutput_conv3_lengths        r[    _get_feat_extract_output_lengths9MoonshinePreTrainedModel._get_feat_extract_output_lengths  sZ     "=3#6""<q"@A!#6#:a"?!"CD!#6#:a"?!"CD""r]   rT   N)r^   r_   r`   ra   r,   rg   base_model_prefixmain_input_nameinput_modalitiessupports_gradient_checkpointing_no_split_modules_supports_flash_attn_supports_sdpa_can_compile_fullgraphru   r   r   rm   rT   r]   r[   r   r   x  sN    $O&*#02IJN!#e>N>N #r]   r   c                      ^  \ rS rSrSrSr\\S.rS\	4U 4S jjr
S\R                  4S jrS	\R                  4S
 jr\\ SS\R$                  S\R&                  S-  S\\   S\\-  4S jj5       5       rSrU =r$ )MoonshineEncoderi  z
Transformer encoder consisting of *config.num_hidden_layers* layers. Each layer is a [`MoonshineEncoderLayer`]

Args:
    config: MoonshineConfig
r   )
attentionsr   r~   c           	      L  > [         TU ]  U5        Xl        UR                  n[        R
                  " SUSSSS9U l        [        R
                  " USU-  SSS	9U l        [        R
                  " SU-  USSS	9U l        [        R                  " SUS
S9U l
        [        R                  " [        UR                  5       Vs/ s H  n[        X5      PM     sn5      U l        [        R                   " USS9U l        [%        US9U l        SU l        U R+                  5         g s  snf )NrF   r   r   F)kernel_sizestrider   r!   r   r   )r	  r
  gh㈵>)
num_groupsnum_channelsepsr   r~   )rV   r}   r~   r9   r   Conv1dconv1conv2conv3	GroupNorm	groupnorm
ModuleListranger<   r   r   r   
layer_normr   
rotary_embgradient_checkpointing	post_init)rX   r~   	embed_dimidxrZ   s       r[   r}   MoonshineEncoder.__init__  s     &&	YYq)ReT
YYy!i-QqQ
YYq9}iQqQ
PTUmm;@AaAa;bc;bC"6/;bc
 ,,yu=2&A&+# ds    D!r   c                     U R                   $ r{   r  rX   s    r[   get_input_embeddings%MoonshineEncoder.get_input_embeddings  s    zzr]   valuec                     Xl         g r{   r  )rX   r#  s     r[   set_input_embeddings%MoonshineEncoder.set_input_embeddings  s    
r]   Nrt   rY   c                 R   UR                  S5      n[        R                  R                  U R	                  U5      5      nU R                  U5      n[        R                  R                  U R                  U5      5      n[        R                  R                  U R                  U5      5      nUR                  SSS5      nSnUb3  U R                  UR                  S   5      nSnUSSSU24   SSU24   nUn[        U R                  UUUS9n[        R                  " SUR                  S   UR                   S	9R                  S5      nU R#                  XHS
9n	U R$                   H  n
U
" U4UUU	S.UD6nM     U R'                  U5      n[)        UUb  UR+                  5       S9$ SS9$ )a  
Args:
    input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
        Float values of the raw speech waveform. Raw speech waveform can be
        obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]`, a
        `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
        the soundfile library (`pip install soundfile`). To prepare the array into
        `input_values`, the [`AutoFeatureExtractor`] should be used for padding
        and conversion into a tensor of type `torch.FloatTensor`.
    attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
        Mask to avoid performing attention on padding indices in `input_values`. Mask values selected in `[0, 1]`:
        - 1 for tokens that are **not masked**,
        - 0 for tokens that are **masked**.
        [What are attention masks?](../glossary#attention-mask)
rF   r   r!   Nr   i  .r~   inputs_embedsrt   r   devicer   )rt   r   r   )last_hidden_statert   )	unsqueezer   r   tanhr  r  rA   r  r  permuter   r   r   r~   ru   aranger+  r  r   r  rr   rf   )rX   r   rt   rY   r   output_attention_maskmask_lendownsample_strider   r   encoder_layers              r[   r   MoonshineEncoder.forward  s   . $--a0**4::l+CD}5**4::m+DE**4::m+DE%--aA6 !%%<<^=Q=QRT=UVH *+C1D3D1D,DEc9H9nUN$2!2;;')"/	
 ||A}':':1'=mFZFZ[eefgh"oomoW![[M)-)$7	
 M ) 6*+:O:[0446
 	
ae
 	
r]   )	r~   r  r  r  r  r  r  r   r  r{   )r^   r_   r`   ra   rb   r   r   r   _can_record_outputsr,   r}   r   Moduler!  r%  r   r    ru   r   rv   r   r   r   r   r   rm   rn   ro   s   @r[   r  r    s     %O(.
 $bii "))    /3<
''<
 t+<
 +,	<

 
(	(<
   <
r]   r  c                   f  ^  \ rS rSrSr\" \SSS9\\" \SSS9S.rS\	4U 4S	 jjr
\\        SS\R                  S
-  S\R                  S
-  S\R                  S
-  S\S
-  S\R"                  S
-  S\S
-  S\R"                  S
-  S\R                  S
-  S\\   S\\-  4S jj5       5       rSrU =r$ )MoonshineDecoderi  	input_idsrF   r   )index
layer_namer   )r  r   cross_attentionsr~   c           	        > [         TU ]  U5        [        R                  " UR                  SS9U l        [        R                  " [        UR                  5       Vs/ s H  n[        X5      PM     sn5      U l
        g s  snf NFr   )rV   r}   r   r   r9   normr  r  r6   r   r   )rX   r~   r  rZ   s      r[   r}   MoonshineDecoder.__init__   s_     LL!3!3%@	mmSXY_YqYqSr$sSrC%:6%GSr$st$ss   A>Nrt   r   r/   r)  rH   r   r   rY   r   c	           
         USL USL-  (       a  [        S5      eUc  U R                  U5      nU(       a1  Uc.  [        [        U R                  S9[        U R                  S95      nUcU  Ub  UR                  5       OSn
[        R                  " UR                  S   UR                  S9U
-   nUR                  S5      n[        U R                  UUUUS9n[        U R                  UUUS9nUnU R                  XS	9nU R                   H  nU" UUU4UUUUUS
.U	D6nM     U R                  U5      n[!        UU(       a  US9$ SS9$ )a\  
encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
    Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
    of the decoder.
encoder_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
    Mask to avoid performing attention on padding indices in `encoder_hidden_states`. Mask values selected in `[0, 1]`:
    - 1 for tokens that are **not masked**,
    - 0 for tokens that are **masked**.
    [What are attention masks?](../glossary#attention-mask)
Nz:You must specify exactly one of input_ids or inputs_embedsr  r   rF   r*  )r~   r)  rt   r/   r   r(  r,  )r   r   r/   rH   r   )r-  r/   )
ValueErrorembed_tokensr
   r	   r~   get_seq_lengthru   r1  r   r+  r.  r   r   r  r   rA  r   )rX   r;  rt   r   r/   r)  rH   r   r   rY   past_seen_tokenscausal_maskr   r   decoder_layers                  r[   r   MoonshineDecoder.forward  s{   0 -t";<YZZ  --i8M01,dkk2RT`hlhshsTtuOCRC^==?de <<(;(;A(>}G[G[\_ooL'11!4L(;;')+%
 ";;;'1"7	"
 &"oomoW![[M)%
 (>) /#$7
 
M ) 		-08+/8O
 	
>B
 	
r]   )r   rA  )NNNNNNNN)r^   r_   r`   ra   r   r   r   r   r7  r,   r}   r   r    ru   r   rv   r   r   rj   r   r   r   r   r   rm   rn   ro   s   @r[   r:  r:    s2   !O$%7q[Y.*+=QSabu u
   .2.204(,26!%:>6:G
##d*G
 t+G
 &&-	G

 G
 ((4/G
 $;G
  %0047G
 !&t 3G
 +,G
 
(	(G
   G
r]   r:  c                   Z   \ rS rSrS r\\         SS\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\
\
\R                        S-  S	\S-  S
\
\R                     S-  S\
\R                     S-  S\S-  S\\   S\4S jj5       5       rSrg)MoonshineModeliQ  c                     [        S5      e)NzNot needed for Moonshine)AttributeErrorr   s    r[   _mask_input_features#MoonshineModel._mask_input_featuresR  s    788r]   Nr   rt   decoder_input_idsdecoder_attention_maskencoder_outputsr/   decoder_inputs_embedsdecoder_position_idsrH   rY   r   c
                 P   Uc  U R                   " U4SU0U
D6nU R                  " SUUUR                  UR                  UUUU	S.U
D6n[	        UR                  UR
                  UR                  UR                  UR                  UR                  UR                  UR                  S9$ )a:  
input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
    Float values of the raw speech waveform. Raw speech waveform can be
    obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]`, a
    `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
    the soundfile library (`pip install soundfile`). To prepare the array into
    `input_values`, the [`AutoFeatureExtractor`] should be used for padding
    and conversion into a tensor of type `torch.FloatTensor`.
decoder_position_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`):
    Indices of positions of each input sequence tokens in the position embeddings.
    Used to calculate the position embeddings up to `config.decoder_config.max_position_embeddings`

Example:

```python
>>> import torch
>>> from transformers import AutoFeatureExtractor, MoonshineModel
>>> from datasets import load_dataset

>>> model = MoonshineModel.from_pretrained("UsefulSensors/moonshine-tiny")
>>> feature_extractor = AutoFeatureExtractor.from_pretrained("UsefulSensors/moonshine-tiny")
>>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
>>> inputs = feature_extractor(ds[0]["audio"]["array"], return_tensors="pt")
>>> input_values = inputs.input_values
>>> decoder_input_ids = torch.tensor([[1, 1]]) * model.config.decoder_start_token_id
>>> last_hidden_state = model(input_values, decoder_input_ids=decoder_input_ids).last_hidden_state
>>> list(last_hidden_state.shape)
[1, 2, 288]
```
rt   )r;  rt   r   r   r/   r)  r   rH   )r-  r/   decoder_hidden_statesdecoder_attentionsr>  encoder_last_hidden_stater   encoder_attentionsrT   )	encoderdecoderr-  rt   r   r/   r   r  r>  )rX   r   rt   rQ  rR  rS  r/   rT  rU  rH   rY   decoder_outputss               r[   r   MoonshineModel.forwardU  s    Z "/3||L/rYg/rkq/rOEI\\ 
F
'1"1"C"C#2#A#A+/-
F
 
F
 "-??+;;"1"?"?.99,==&5&G&G"1"?"?.99	
 		
r]   rT   )	NNNNNNNNN)r^   r_   r`   ra   rO  r   r   ru   r   r   r   r
   rj   r   r   r   r   rm   rT   r]   r[   rL  rL  Q  s)   9  262659:>BF6:AE?C!%C
''$.C
 ((4/C
 !++d2	C

 !& 0 04 7C
 uU%6%6784?C
 -t3C
  %U%6%67$>C
 $E$4$45<C
 $;C
 +,C
 
C
  C
r]   rL  zj
    The Moonshine Model with a language modeling head. Can be used for automatic speech recognition.
    c                     ^  \ rS rSrSS0rS\4U 4S jjrS rS rS\	R                  4S	 jr\\          SS\R                  S
-  S\R                   S
-  S\R                   S
-  S\R                   S
-  S\\\R                        S
-  S\S
-  S\\R                     S
-  S\\R                      S
-  S\S
-  S\R                   S
-  S\\   S\4S jj5       5       rSrU =r$ )!MoonshineForConditionalGenerationi  zproj_out.weightz!model.decoder.embed_tokens.weightr~   c                    > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  UR                  SS9U l        U R                  5         g r@  )
rV   r}   rL  r   r   r   r9   r8   proj_outr  )rX   r~   rZ   s     r[   r}   *MoonshineForConditionalGeneration.__init__  sH     #F+
		&"4"4f6G6GeT 	r]   c                     U R                   $ r{   rb  r   s    r[   get_output_embeddings7MoonshineForConditionalGeneration.get_output_embeddings  s    }}r]   c                     Xl         g r{   re  )rX   new_embeddingss     r[   set_output_embeddings7MoonshineForConditionalGeneration.set_output_embeddings  s    &r]   r   c                 6    U R                   R                  5       $ r{   )r   r!  r   s    r[   r!  6MoonshineForConditionalGeneration.get_input_embeddings  s    zz..00r]   Nr   rt   rQ  rR  rS  r/   rT  rU  rH   labelsrY   c                    U
b:  Uc7  Uc4  [        XR                  R                  U R                  R                  5      nU R                  " U4UUUUUUUU	S.UD6nU R                  UR                  5      nSnU
b$  U R                  XU R                  R                  S9n[        UUUR                  UR                  UR                  UR                  UR                  UR                  UR                   S9	$ )ah  
input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
    Float values of the raw speech waveform. Raw speech waveform can be
    obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]`, a
    `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
    the soundfile library (`pip install soundfile`). To prepare the array into
    `input_values`, the [`AutoFeatureExtractor`] should be used for padding
    and conversion into a tensor of type `torch.FloatTensor`.
decoder_position_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`):
    Indices of positions of each input sequence tokens in the position embeddings.
    Used to calculate the position embeddings up to `config.decoder_config.max_position_embeddings`

Example:

```python
>>> import torch
>>> from transformers import AutoProcessor, MoonshineForConditionalGeneration
>>> from datasets import load_dataset

>>> processor = AutoProcessor.from_pretrained("UsefulSensors/moonshine-tiny")
>>> model = MoonshineForConditionalGeneration.from_pretrained("UsefulSensors/moonshine-tiny")

>>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")

>>> inputs = processor(ds[0]["audio"]["array"], return_tensors="pt")
>>> input_values = inputs.input_values

>>> generated_ids = model.generate(input_values, max_new_tokens=100)

>>> transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
>>> transcription
'Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel.'
```N)rt   rQ  rS  rR  r/   rT  rU  rH   )logitsrn  r8   )	lossrp  r/   rW  rX  r>  rY  r   rZ  )r)   r~   rP   rG   r   rb  r-  loss_functionr8   r   r/   rW  rX  r>  rY  r   rZ  )rX   r   rt   rQ  rR  rS  r/   rT  rU  rH   rn  rY   outputsrp  rq  s                  r[   r   )MoonshineForConditionalGeneration.forward  s   d  (-B-J$6KK44dkk6X6X%! '+jj'
)/+#9+"7!5'
 '
 w889%%Vt{{OeOe%fD#33")"?"?&99$55&-&G&G")"?"?&99

 
	
r]   )r   rb  )
NNNNNNNNNN)r^   r_   r`   ra   _tied_weights_keysr,   r}   rf  rj  r   r8  r!  r   r   ru   r   r   r   r
   rj   r   r   r   r   rm   rn   ro   s   @r[   r`  r`    sr    ,-PQ '1bii 1  262659:>BF6:AE?C!%*.R
''$.R
 ((4/R
 !++d2	R

 !& 0 04 7R
 uU%6%6784?R
 -t3R
  %U%6%67$>R
 $E$4$45<R
 $;R
   4'R
 +,R
 
R
  R
r]   r`  )r,   rL  r   r`  )Ncollections.abcr   dataclassesr   ru   torch.nnr   huggingface_hub.dataclassesr   activationsr   cache_utilsr   r	   r
   configuration_utilsr   
generationr   masking_utilsr   r   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   r   modeling_rope_utilsr   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   utils.genericr   utils.output_capturingr   r    glm.modeling_glmr"   r#   r$   llama.modeling_llamar%   r&   r'   whisper.modeling_whisperr(   r)   
get_loggerr^   loggerr,   rr   r8  rx   r   r   r   r   r   r   r  r:  rL  r`  __all__rT   r]   r[   <module>r     s   % !   . ! C C 3 ) J B 9  2 F & R R 7 E U U Y Y G 
		H	% 9:S(& S(  ;S(l 
// / /")) "))  	1 	e) e)PU- U"E6 EP # # #0d
/ d
NV
z V
rI
\ I
X 
h
(@/ h

h
Vr]   