
    Z j                         S SK Jr  S SKJr  S SKJr  S SKrS SKJr  SSK	J
r
  SSKJrJrJr  SSKJr  SS	KJr  SS
KJrJr  SSKJr  SSKJr  SSKJrJrJrJrJr  SSK J!r!J"r"  SSK#J$r$J%r%  SSK&J'r'  SSK(J)r)J*r*J+r+  SSK,J-r-J.r.  SSK/J0r0J1r1  SSK2J3r3  \\*" SS9 " S S\5      5       5       r4 " S S\Rj                  5      r6 " S S\Rj                  5      r7 " S S\Rj                  5      r8S \Rr                  S!\:S"\Rr                  4S# jr; SES$\Rj                  S%\Rr                  S&\Rr                  S'\Rr                  S(\Rr                  S-  S)\<S*\<S+\'\)   4S, jjr=S- r>SFS. jr?\" \?5       " S/ S0\Rj                  5      5       r@ " S1 S2\5      rA " S3 S4\5      rB\* " S5 S6\%5      5       rC " S7 S8\C5      rD\* " S9 S:\C5      5       rE\* " S; S<\C5      5       rFS=\Rr                  S>\:S?\:4S@ jrG\*" SAS9 " SB SC\C\5      5       rH/ SDQrIg)G    )Callable)	dataclass)OptionalN   )ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)use_kernelized_func)create_bidirectional_maskcreate_causal_mask)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPast)BaseModelOutputWithPastAndCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutput)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuple)maybe_autocastmerge_with_config_defaults)OutputRecordercapture_outputs   )MoonshineConfigz
    Extends [~modeling_outputs.BaseModelOutput] to include the output attention mask since sequence length is not preserved in the model's forward.
    )custom_introc                   >    \ rS rSr% Sr\R                  S-  \S'   Srg)MoonshineEncoderModelOutput3   Nattention_mask )	__name__
__module____qualname____firstlineno__r(   torchTensor__annotations____static_attributes__r)       ځ/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/moonshine/modeling_moonshine.pyr&   r&   3   s     +/NELL4'.r2   r&   c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )MoonshineEncoderMLP=   c                 
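

# The encoder's convolutional front-end shortens the sequence, so the encoder
# returns the downsampled attention mask alongside its hidden states; the model
# later reuses it as the cross-attention mask over the encoder outputs.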
  > [         TU ]  5         Xl        [        U   U l        [
        R                  " UR                  UR                  5      U l	        [
        R                  " UR                  UR                  5      U l
        g Nsuper__init__configr   activation_fnnnLinearhidden_sizeintermediate_sizefc1fc2selfr<   
hidden_act	__class__s      r3   r;   MoonshineEncoderMLP.__init__>   s\    #J/99V//1I1IJ99V55v7I7IJr2   hidden_statesreturnc                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ r8   )rB   r=   rC   )rE   rI   s     r3   forwardMoonshineEncoderMLP.forwardE   s4    /**=9/r2   r=   r<   rB   rC   
r*   r+   r,   r-   r;   r.   r/   rL   r1   __classcell__rG   s   @r3   r5   r5   =   s)    KU\\ ell  r2   r5   c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )MoonshineDecoderMLPL   c                   > [         TU ]  5         Xl        [        U   U l        [
        R                  " UR                  UR                  S-  5      U l	        [
        R                  " UR                  UR                  5      U l
        g )N   r9   rD   s      r3   r;   MoonshineDecoderMLP.__init__M   sa    #J/99V//1I1IA1MN99V55v7I7IJr2   rI   rJ   c                     U R                  U5      nUR                  SSS9u  pU R                  U5      U-  nU R                  U5      nU$ )NrV   dim)rB   chunkr=   rC   )rE   rI   gates      r3   rL   MoonshineDecoderMLP.forwardT   sQ    /+11!1<**40=@/r2   rN   rO   rQ   s   @r3   rS   rS   L   s)    KU\\ ell  r2   rS   c                      ^  \ rS rSr% \R
                  \S'   SS\4U 4S jjjr\	   SS\S-  S\
S   S\S-  S	\S
\4   4S jj5       r\R                  " 5       \S 5       5       rSrU =r$ )MoonshineRotaryEmbedding\   inv_freqNr<   c                   > [         TU ]  5         UR                  U l        UR                  U l        Xl        U R
                  R                  S   U l        U R                  nU R                  S:w  a  [        U R                     nU" U R
                  U5      u  o@l
        U R                  SUSS9  U R                  SUR                  5       SS9  g )N	rope_typedefaultrb   F)
persistentoriginal_inv_freq)r:   r;   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenr<   rope_parametersrd   compute_default_rope_parametersr   attention_scalingregister_bufferclone)rE   r<   devicerope_init_fnrb   rG   s        r3   r;   !MoonshineRotaryEmbedding.__init___   s    "("@"@$*$B$B!44[A!%!E!E>>Y&.t~~>L+7V+L((ZeD0(..2BuUr2   rp   ztorch.deviceseq_lenrJ   ztorch.Tensorc           	      j   U R                   S   nU R                   R                  SS5      n[        U SS5      =(       d    U R                  U R                  -  n[        XT-  5      nSnSU[        R                  " SUS[        R                  S9R                  U[        R                  S	9U-  -  -  nX4$ )
aH  
        Computes the inverse frequencies according to the original RoPE implementation
        Args:
            config ([`~transformers.PreTrainedConfig`]):
                The model configuration.
            device (`torch.device`):
                The device to use for initialization of the inverse frequencies.
            seq_len (`int`, *optional*):
                The current sequence length. Unused for this type of RoPE.
        Returns:
            Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
            post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
        """
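        # With Moonshine's defaults (e.g. the tiny checkpoint: head_dim = 288 / 8 = 36
        # and partial_rotary_factor = 0.9), only dim = int(36 * 0.9) = 32 channels per
        # head receive rotary frequencies; the remaining channels are left unrotated.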
        base = config.rope_parameters["rope_theta"]
        partial_rotary_factor = config.rope_parameters.get("partial_rotary_factor", 1.0)
        head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads
        dim = int(head_dim * partial_rotary_factor)

        attention_factor = 1.0  # Unused in this type of RoPE

        # Compute the inverse frequencies
        inv_freq = 1.0 / (
            base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim)
        )
        return inv_freq, attention_factor

    @torch.no_grad()
    @dynamic_rope_update
    def forward(self, x, position_ids):
        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
        position_ids_expanded = position_ids[:, None, :].float()

        device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
        with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
            emb = torch.cat((freqs, freqs), dim=-1)
            cos = emb.cos() * self.attention_scaling
            sin = emb.sin() * self.attention_scaling

        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)


def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    """
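    # Grouped-query attention: each key/value head serves
    # num_attention_heads // num_key_value_heads query heads, so the KV heads are
    # materialized n_rep times for the plain matmul path below.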
    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: torch.Tensor | None,
    scaling: float,
    dropout: float = 0.0,
    **kwargs: Unpack[TransformersKwargs],
):
    key_states = repeat_kv(key, module.num_key_value_groups)
    value_states = repeat_kv(value, module.num_key_value_groups)

    attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
    if attention_mask is not None:
        attn_weights = attn_weights + attention_mask

    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
    attn_output = torch.matmul(attn_weights, value_states)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights


def rotate_half(x):
    """Rotates half the hidden dims of the input."""
    x1 = x[..., 0::2]
    x2 = x[..., 1::2]
    return torch.stack((-x2, x1), dim=-1).flatten(-2)


def apply_rotary_pos_emb(q, k, cos, sin, unsqueeze_dim=1):
    """Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    """
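    # rotate_half above interleaves pairs: for x = [x0, x1, x2, x3] it returns
    # [-x1, x0, -x3, x2], matching the repeat_interleave'd cos/sin layout built below.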
    cos = cos.unsqueeze(unsqueeze_dim)
    sin = sin.unsqueeze(unsqueeze_dim)

    # Interleave them instead of usual shape
    cos = cos[..., : cos.shape[-1] // 2].repeat_interleave(2, dim=-1)
    sin = sin[..., : sin.shape[-1] // 2].repeat_interleave(2, dim=-1)

    # Keep half or full tensor for later concatenation
    rotary_dim = cos.shape[-1]
    q_rot, q_pass = q[..., :rotary_dim], q[..., rotary_dim:]
    k_rot, k_pass = k[..., :rotary_dim], k[..., rotary_dim:]

    # Apply rotary embeddings on the first half or full tensor
    q_embed = (q_rot * cos) + (rotate_half(q_rot) * sin)
    k_embed = (k_rot * cos) + (rotate_half(k_rot) * sin)

    # Concatenate back to full shape
    q_embed = torch.cat([q_embed, q_pass], dim=-1)
    k_embed = torch.cat([k_embed, k_pass], dim=-1)
    return q_embed, k_embed


@use_kernelized_func(apply_rotary_pos_emb)
class MoonshineAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(
        self,
        config: MoonshineConfig,
        layer_idx: int,
        is_causal: bool,
        num_attention_heads: int,
        num_key_value_heads: int,
    ):
        super().__init__()
        config.update({"num_attention_heads": num_attention_heads, "num_key_value_heads": num_key_value_heads})
        self.config = config
        self.layer_idx = layer_idx
        self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
        self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
        self.scaling = self.head_dim**-0.5
        self.attention_dropout = config.attention_dropout
        self.is_causal = is_causal
        self.q_proj = nn.Linear(
            config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias
        )
        self.k_proj = nn.Linear(
            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
        )
        self.v_proj = nn.Linear(
            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
        )
        self.o_proj = nn.Linear(config.num_attention_heads * self.head_dim, config.hidden_size, bias=False)

        # Pad head dimension to the next specified multiple
        if self.config.pad_head_dim_to_multiple_of is not None:
            target_multiple = self.config.pad_head_dim_to_multiple_of
            target_head_dim = target_multiple * ((self.head_dim + target_multiple - 1) // target_multiple)
            self.head_dim_padding = target_head_dim - self.head_dim
        else:
            self.head_dim_padding = 0

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: tuple[torch.Tensor, torch.Tensor] | None = None,
        attention_mask: torch.Tensor | None = None,
        past_key_values: Cache | None = None,
        key_value_states: torch.Tensor | None = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> tuple[torch.Tensor, torch.Tensor | None, tuple[torch.Tensor] | None]:
        bsz, q_len = hidden_states.shape[:-1]

        query_states = (
            self.q_proj(hidden_states).view(bsz, q_len, self.config.num_attention_heads, self.head_dim).transpose(1, 2)
        )

        is_cross_attention = key_value_states is not None
        if past_key_values is not None:
            is_updated = past_key_values.is_updated.get(self.layer_idx)
            if is_cross_attention:
                # After the first generated id, we can subsequently re-use all key/value states from the cache
                past_key_values.is_updated[self.layer_idx] = True
                past_key_values = past_key_values.cross_attention_cache
            else:
                past_key_values = past_key_values.self_attention_cache

        # use key_value_states if cross attention
        current_states = key_value_states if key_value_states is not None else hidden_states
        if is_cross_attention and past_key_values and is_updated:
            # reuse k, v, cross attentions
            key_states = past_key_values.layers[self.layer_idx].keys
            value_states = past_key_values.layers[self.layer_idx].values
        else:
            key_states = (
                self.k_proj(current_states)
                .view(bsz, -1, self.config.num_key_value_heads, self.head_dim)
                .transpose(1, 2)
            )
            value_states = (
                self.v_proj(current_states)
                .view(bsz, -1, self.config.num_key_value_heads, self.head_dim)
                .transpose(1, 2)
            )
            if is_cross_attention and past_key_values is not None:
                key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx)

        if not is_cross_attention:
            cos, sin = position_embeddings
            query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
            if past_key_values is not None:
                key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx)

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        is_causal = self.is_causal and attention_mask is None and q_len > 1

        if self.head_dim_padding > 0:
            query_states = torch.nn.functional.pad(query_states, (0, self.head_dim_padding))
            key_states = torch.nn.functional.pad(key_states, (0, self.head_dim_padding))
            value_states = torch.nn.functional.pad(value_states, (0, self.head_dim_padding))

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=0.0 if not self.training else self.attention_dropout,
            scaling=self.scaling,
            is_causal=is_causal,
            **kwargs,
        )

        if self.head_dim_padding > 0:
            attn_output = attn_output[..., : -self.head_dim_padding]

        attn_output = attn_output.reshape(bsz, q_len, -1).contiguous()
        attn_output = self.o_proj(attn_output)
        return attn_output, attn_weights


class MoonshineEncoderLayer(GradientCheckpointingLayer):
    def __init__(self, config: MoonshineConfig, layer_idx: int):
        super().__init__()
        self.hidden_size = config.hidden_size

        self.self_attn = MoonshineAttention(
            config=config,
            layer_idx=layer_idx,
            is_causal=False,
            num_attention_heads=config.encoder_num_attention_heads,
            num_key_value_heads=config.encoder_num_key_value_heads,
        )

        self.mlp = MoonshineEncoderMLP(config, config.encoder_hidden_act)
        self.input_layernorm = nn.LayerNorm(config.hidden_size, bias=False)
        self.post_attention_layernorm = nn.LayerNorm(config.hidden_size, bias=False)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor | None = None,
        position_ids: torch.LongTensor | None = None,
        past_key_values: Cache | None = None,
        use_cache: bool | None = False,
        position_embeddings: tuple[torch.Tensor, torch.Tensor] | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> torch.Tensor:
        residual = hidden_states
        hidden_states = self.input_layernorm(hidden_states)
        hidden_states, _ = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            use_cache=use_cache,
            position_embeddings=position_embeddings,
            **kwargs,
        )
        hidden_states = residual + hidden_states

        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states

        return hidden_states


class MoonshineDecoderLayer(GradientCheckpointingLayer):
    def __init__(self, config: MoonshineConfig, layer_idx: int | None = None):
        super().__init__()
        self.hidden_size = config.hidden_size

        self.self_attn = MoonshineAttention(
            config=config,
            layer_idx=layer_idx,
            is_causal=True,
            num_attention_heads=config.decoder_num_attention_heads,
            num_key_value_heads=config.decoder_num_key_value_heads,
        )
        self.encoder_attn = MoonshineAttention(
            config=config,
            layer_idx=layer_idx,
            is_causal=False,
            num_attention_heads=config.decoder_num_attention_heads,
            num_key_value_heads=config.decoder_num_key_value_heads,
        )

        self.mlp = MoonshineDecoderMLP(config, config.decoder_hidden_act)
        self.input_layernorm = nn.LayerNorm(config.hidden_size, bias=False)
        self.post_attention_layernorm = nn.LayerNorm(config.hidden_size, bias=False)
        self.final_layernorm = nn.LayerNorm(config.hidden_size, bias=False)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor | None = None,
        encoder_hidden_states: torch.Tensor | None = None,
        encoder_attention_mask: torch.Tensor | None = None,
        position_ids: torch.LongTensor | None = None,
        encoder_position_ids: torch.LongTensor | None = None,
        past_key_values: Cache | None = None,
        use_cache: bool | None = False,
        position_embeddings: tuple[torch.Tensor, torch.Tensor] | None = None,
        encoder_position_embeddings: tuple[torch.Tensor, torch.Tensor] | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple[torch.Tensor, tuple[torch.FloatTensor, torch.FloatTensor] | None]:
        residual = hidden_states
        hidden_states = self.input_layernorm(hidden_states)

        # Self Attention
        hidden_states, _ = self.self_attn(
            hidden_states=hidden_states,
            position_embeddings=position_embeddings,
            attention_mask=attention_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            **kwargs,
        )
        hidden_states = residual + hidden_states

        # Cross-Attention Block
        if encoder_hidden_states is not None:
            residual = hidden_states
            hidden_states = self.post_attention_layernorm(hidden_states)
            hidden_states, _ = self.encoder_attn(
                hidden_states=hidden_states,
                key_value_states=encoder_hidden_states,
                attention_mask=encoder_attention_mask,
                past_key_values=past_key_values,
                use_cache=use_cache,
            )
            hidden_states = residual + hidden_states

        # Fully Connected
        residual = hidden_states
        hidden_states = self.final_layernorm(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states

        return hidden_states


@auto_docstring
class MoonshinePreTrainedModel(PreTrainedModel):
    config: MoonshineConfig
    base_model_prefix = "model"
    main_input_name = "input_values"
    input_modalities = "audio"
    supports_gradient_checkpointing = True
    _no_split_modules = ["MoonshineEncoderLayer", "MoonshineDecoderLayer"]
    _supports_flash_attn = True
    _supports_sdpa = True
    _can_compile_fullgraph = True

    def _get_feat_extract_output_lengths(self, input_lengths: torch.LongTensor):
        """
        Computes the output length of the convolutional layers
        """
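        # For example, one second of 16kHz audio (16000 samples) yields
        # int((16000 - 127) / 64 + 1) = 249 frames after conv1, then 81 after conv2
        # and 40 after conv3 (kernel/stride pairs 127/64, 7/3, 3/2).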
        output_conv1_length = int((input_lengths - 127) / 64 + 1)
        output_conv2_length = int((output_conv1_length - 7) / 3 + 1)
        output_conv3_length = int((output_conv2_length - 3) / 2 + 1)

        return output_conv3_length


class MoonshineEncoder(MoonshinePreTrainedModel):
    """
    Transformer encoder consisting of *config.num_hidden_layers* layers. Each layer is a [`MoonshineEncoderLayer`]

    Args:
        config: MoonshineConfig
    """

    main_input_name = "input_values"
    _can_record_outputs = {
        "attentions": MoonshineAttention,
        "hidden_states": MoonshineEncoderLayer,
    }

    def __init__(self, config: MoonshineConfig):
        super().__init__(config)
        self.config = config
        embed_dim = config.hidden_size

        self.conv1 = nn.Conv1d(1, embed_dim, kernel_size=127, stride=64, bias=False)
        self.conv2 = nn.Conv1d(embed_dim, 2 * embed_dim, kernel_size=7, stride=3)
        self.conv3 = nn.Conv1d(2 * embed_dim, embed_dim, kernel_size=3, stride=2)
        self.groupnorm = nn.GroupNorm(num_groups=1, num_channels=embed_dim, eps=1e-5)

        self.layers = nn.ModuleList(
            [MoonshineEncoderLayer(config, idx) for idx in range(config.encoder_num_hidden_layers)]
        )
        self.layer_norm = nn.LayerNorm(embed_dim, bias=False)
        self.rotary_emb = MoonshineRotaryEmbedding(config=config)

        self.gradient_checkpointing = False
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.conv1

    def set_input_embeddings(self, value: nn.Module):
        self.conv1 = value

    @merge_with_config_defaults
    @capture_outputs
    def forward(
        self,
        input_values: torch.FloatTensor,
        attention_mask: torch.Tensor | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> MoonshineEncoderModelOutput:
        r"""
        Args:
            input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
                Float values of the raw speech waveform. Raw speech waveform can be
                obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]`, a
                `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
                the soundfile library (`pip install soundfile`). To prepare the array into
                `input_values`, the [`AutoFeatureExtractor`] should be used for padding
                and conversion into a tensor of type `torch.FloatTensor`.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding indices in `input_values`. Mask values selected in `[0, 1]`:
                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.
                [What are attention masks?](../glossary#attention-mask)
        """
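        # The raw-audio mask is downsampled by the conv stack's combined stride
        # (64 * 3 * 2 = 384) below so it lines up with the extracted frames.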
        input_values = input_values.unsqueeze(1)
        hidden_states = nn.functional.tanh(self.conv1(input_values))
        hidden_states = self.groupnorm(hidden_states)
        hidden_states = nn.functional.gelu(self.conv2(hidden_states))
        hidden_states = nn.functional.gelu(self.conv3(hidden_states))
        hidden_states = hidden_states.permute(0, 2, 1)

        output_attention_mask = None
        if attention_mask is not None:
            mask_len = self._get_feat_extract_output_lengths(attention_mask.shape[-1])
            downsample_stride = 64 * 3 * 2  # conv strides
            attention_mask = attention_mask[..., ::downsample_stride][..., :mask_len]
            output_attention_mask = attention_mask

        attention_mask = create_bidirectional_mask(
            config=self.config,
            input_embeds=hidden_states,
            attention_mask=attention_mask,
        )

        position_ids = torch.arange(0, hidden_states.shape[1], device=hidden_states.device).unsqueeze(0)
        position_embeddings = self.rotary_emb(hidden_states, position_ids)

        for encoder_layer in self.layers:
            hidden_states = encoder_layer(
                hidden_states,
                attention_mask=attention_mask,
                position_embeddings=position_embeddings,
                **kwargs,
            )

        hidden_states = self.layer_norm(hidden_states)
        return MoonshineEncoderModelOutput(
            last_hidden_state=hidden_states,
            attention_mask=output_attention_mask,
        )


@auto_docstring
class MoonshineDecoder(MoonshinePreTrainedModel):
    main_input_name = "input_ids"

    _can_record_outputs = {
        "attentions": OutputRecorder(MoonshineAttention, index=1, layer_name="self_attn"),
        "hidden_states": MoonshineDecoderLayer,
        "cross_attentions": OutputRecorder(MoonshineAttention, index=1, layer_name="encoder_attn"),
    }

    def __init__(self, config: MoonshineConfig):
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size

        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
        self.layers = nn.ModuleList(
            [MoonshineDecoderLayer(config, idx) for idx in range(config.num_hidden_layers)]
        )
        self.norm = nn.LayerNorm(config.hidden_size, bias=False)
        self.rotary_emb = MoonshineRotaryEmbedding(config=config)
        self.gradient_checkpointing = False
        self.post_init()

    @merge_with_config_defaults
    @capture_outputs
    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        attention_mask: torch.Tensor | None = None,
        position_ids: torch.LongTensor | None = None,
        past_key_values: Cache | None = None,
        inputs_embeds: torch.FloatTensor | None = None,
        use_cache: bool | None = None,
        encoder_hidden_states: torch.FloatTensor | None = None,
        encoder_attention_mask: torch.Tensor | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> BaseModelOutputWithPastAndCrossAttentions:
        r"""
        encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
            of the decoder.
        encoder_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding indices in `encoder_hidden_states`. Mask values selected in `[0, 1]`:
            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
            [What are attention masks?](../glossary#attention-mask)
        """
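        # Generation keeps two caches: a self-attention cache that grows with the
        # decoded ids and a cross-attention cache filled once from the encoder.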
        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        if use_cache and past_key_values is None:
            past_key_values = EncoderDecoderCache(DynamicCache(config=self.config), DynamicCache(config=self.config))

        if position_ids is None:
            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
            position_ids = torch.arange(inputs_embeds.shape[1], device=inputs_embeds.device) + past_seen_tokens
            position_ids = position_ids.unsqueeze(0)

        causal_mask = create_causal_mask(
            config=self.config,
            input_embeds=inputs_embeds,
            attention_mask=attention_mask,
            past_key_values=past_key_values,
            position_ids=position_ids,
        )

        encoder_attention_mask = create_bidirectional_mask(
            config=self.config,
            input_embeds=encoder_hidden_states,
            attention_mask=encoder_attention_mask,
        )

        hidden_states = inputs_embeds
        position_embeddings = self.rotary_emb(hidden_states, position_ids)

        for decoder_layer in self.layers:
            hidden_states = decoder_layer(
                hidden_states,
                causal_mask,
                encoder_hidden_states,
                encoder_attention_mask=encoder_attention_mask,
                past_key_values=past_key_values,
                use_cache=use_cache,
                position_embeddings=position_embeddings,
                **kwargs,
            )

        hidden_states = self.norm(hidden_states)
        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values if use_cache else None,
        )


@auto_docstring
class MoonshineModel(MoonshinePreTrainedModel):
    def __init__(self, config: MoonshineConfig):
        super().__init__(config)
        self.encoder = MoonshineEncoder(config)
        self.decoder = MoonshineDecoder(config)
        self.post_init()

    def get_input_embeddings(self):
        return self.decoder.embed_tokens

    def set_input_embeddings(self, value):
        self.decoder.embed_tokens = value

    def freeze_encoder(self):
        """
        Calling this function will disable the gradient computation for the Moonshine encoder so that its parameters will
        not be updated during training.
        """
        self.encoder._freeze_parameters()

    def _mask_input_features(
        self,
        input_features: torch.FloatTensor,
        attention_mask: torch.LongTensor | None = None,
    ):
        """
        Masks extracted features along time axis and/or along feature axis according to
        [SpecAugment](https://huggingface.co/papers/1904.08779).
        """
        raise AttributeError("Not needed for Moonshine")

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_values: torch.FloatTensor | None = None,
        attention_mask: torch.LongTensor | None = None,
        decoder_input_ids: torch.LongTensor | None = None,
        decoder_attention_mask: torch.LongTensor | None = None,
        encoder_outputs: tuple[torch.FloatTensor] | None = None,
        past_key_values: EncoderDecoderCache | None = None,
        decoder_inputs_embeds: tuple[torch.FloatTensor] | None = None,
        decoder_position_ids: tuple[torch.LongTensor] | None = None,
        use_cache: bool | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> Seq2SeqModelOutput:
        r"""
        input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
            Float values of the raw speech waveform. Raw speech waveform can be
            obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]`, a
            `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
            the soundfile library (`pip install soundfile`). To prepare the array into
            `input_values`, the [`AutoFeatureExtractor`] should be used for padding
            and conversion into a tensor of type `torch.FloatTensor`.
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`):
            Indices of positions of each input sequence tokens in the position embeddings.
            Used to calculate the position embeddings up to `config.decoder_config.max_position_embeddings`

        Example:

        ```python
        >>> import torch
        >>> from transformers import AutoFeatureExtractor, MoonshineModel
        >>> from datasets import load_dataset

        >>> model = MoonshineModel.from_pretrained("UsefulSensors/moonshine-tiny")
        >>> feature_extractor = AutoFeatureExtractor.from_pretrained("UsefulSensors/moonshine-tiny")
        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
        >>> inputs = feature_extractor(ds[0]["audio"]["array"], return_tensors="pt")
        >>> input_values = inputs.input_values
        >>> decoder_input_ids = torch.tensor([[1, 1]]) * model.config.decoder_start_token_id
        >>> last_hidden_state = model(input_values, decoder_input_ids=decoder_input_ids).last_hidden_state
        >>> list(last_hidden_state.shape)
        [1, 2, 288]
        ```
        """
        if encoder_outputs is None:
            encoder_outputs = self.encoder(input_values, attention_mask=attention_mask, **kwargs)

        decoder_outputs = self.decoder(
            input_ids=decoder_input_ids,
            attention_mask=decoder_attention_mask,
            encoder_hidden_states=encoder_outputs.last_hidden_state,
            encoder_attention_mask=encoder_outputs.attention_mask,
            past_key_values=past_key_values,
            inputs_embeds=decoder_inputs_embeds,
            position_ids=decoder_position_ids,
            use_cache=use_cache,
            **kwargs,
        )

        return Seq2SeqModelOutput(
            last_hidden_state=decoder_outputs.last_hidden_state,
            past_key_values=decoder_outputs.past_key_values,
            decoder_hidden_states=decoder_outputs.hidden_states,
            decoder_attentions=decoder_outputs.attentions,
            cross_attentions=decoder_outputs.cross_attentions,
            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
            encoder_hidden_states=encoder_outputs.hidden_states,
            encoder_attentions=encoder_outputs.attentions,
        )


def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int):
    """
    Shift input ids one token to the right.
    """
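    # e.g. labels [[5, 6, -100]] with decoder_start_token_id=1 become [[1, 5, 6]];
    # any remaining -100 padding markers are replaced with pad_token_id.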
    shifted_input_ids = input_ids.new_zeros(input_ids.shape)
    shifted_input_ids[:, 1:] = input_ids[:, :-1].clone()
    shifted_input_ids[:, 0] = decoder_start_token_id

    if pad_token_id is None:
        raise ValueError("self.model.config.pad_token_id has to be defined.")
    # replace possible -100 values in labels by `pad_token_id`
    shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)

    return shifted_input_ids


@auto_docstring(
    custom_intro="""
    The Moonshine Model with a language modeling head. Can be used for automatic speech recognition.
    """
)
class MoonshineForConditionalGeneration(MoonshinePreTrainedModel, GenerationMixin):
    _tied_weights_keys = {"proj_out.weight": "model.decoder.embed_tokens.weight"}

    def __init__(self, config: MoonshineConfig):
        super().__init__(config)
        self.model = MoonshineModel(config)
        self.proj_out = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self):
        return self.proj_out

    def set_output_embeddings(self, new_embeddings):
        self.proj_out = new_embeddings

    def get_input_embeddings(self) -> nn.Module:
        return self.model.get_input_embeddings()

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_values: torch.FloatTensor | None = None,
        attention_mask: torch.LongTensor | None = None,
        decoder_input_ids: torch.LongTensor | None = None,
        decoder_attention_mask: torch.LongTensor | None = None,
        encoder_outputs: tuple[torch.FloatTensor] | None = None,
        past_key_values: EncoderDecoderCache | None = None,
        decoder_inputs_embeds: tuple[torch.FloatTensor] | None = None,
        decoder_position_ids: tuple[torch.LongTensor] | None = None,
        use_cache: bool | None = None,
        labels: torch.LongTensor | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> Seq2SeqLMOutput:
        r"""
        input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
            Float values of the raw speech waveform. Raw speech waveform can be
            obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]`, a
            `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
            the soundfile library (`pip install soundfile`). To prepare the array into
            `input_values`, the [`AutoFeatureExtractor`] should be used for padding
            and conversion into a tensor of type `torch.FloatTensor`.
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`):
            Indices of positions of each input sequence tokens in the position embeddings.
            Used to calculate the position embeddings up to `config.decoder_config.max_position_embeddings`

        Example:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, MoonshineForConditionalGeneration
        >>> from datasets import load_dataset

        >>> processor = AutoProcessor.from_pretrained("UsefulSensors/moonshine-tiny")
        >>> model = MoonshineForConditionalGeneration.from_pretrained("UsefulSensors/moonshine-tiny")

        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")

        >>> inputs = processor(ds[0]["audio"]["array"], return_tensors="pt")
        >>> input_values = inputs.input_values

        >>> generated_ids = model.generate(input_values, max_new_tokens=100)

        >>> transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        >>> transcription
        'Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel.'
        ```
        """
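        # When only `labels` are provided, the decoder inputs are derived from them
        # via shift_tokens_right (teacher forcing) before running the model.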
        if labels is not None:
            if decoder_input_ids is None and decoder_inputs_embeds is None:
                decoder_input_ids = shift_tokens_right(
                    labels, self.config.pad_token_id, self.config.decoder_start_token_id
                )

        outputs: Seq2SeqModelOutput = self.model(
            input_values,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            encoder_outputs=encoder_outputs,
            past_key_values=past_key_values,
            decoder_inputs_embeds=decoder_inputs_embeds,
            decoder_position_ids=decoder_position_ids,
            use_cache=use_cache,
            **kwargs,
        )
        logits = self.proj_out(outputs.last_hidden_state)

        loss = None
        if labels is not None:
            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size)

        return Seq2SeqLMOutput(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            decoder_hidden_states=outputs.decoder_hidden_states,
            decoder_attentions=outputs.decoder_attentions,
            cross_attentions=outputs.cross_attentions,
            encoder_last_hidden_state=outputs.encoder_last_hidden_state,
            encoder_hidden_states=outputs.encoder_hidden_states,
            encoder_attentions=outputs.encoder_attentions,
        )


__all__ = ["MoonshineModel", "MoonshinePreTrainedModel", "MoonshineForConditionalGeneration"]