
    Z jŉ                        S SK r S SKJr  S SKJr  S SKrS SKJr  SSKJr	  SSK
Jr  SSKJrJr  SS	KJr  SS
KJrJr  SSKJrJr  SSKJr  SSKJrJrJrJr  SSKJrJ r   SSK!J"r"  SSK#J$r$J%r%  \\" SS9 " S S\5      5       5       r& " S S\RN                  5      r( " S S\RN                  5      r) " S S\RN                  5      r*S r+\" S5      S=S j5       r,S\RZ                  S \.S!\RZ                  4S" jr/ S>S#\RN                  S$\RZ                  S%\RZ                  S&\RZ                  S'\RZ                  S-  S(\0S)\0S*\\   4S+ jjr1\" \,5       " S, S-\RN                  5      5       r2 " S. S/\RN                  5      r3 " S0 S1\5      r4\ " S2 S3\5      5       r5\" S4S9 " S5 S6\55      5       r6\ " S7 S8\5      5       r7\" S9S9 " S: S;\55      5       r8/ S<Qr9g)?    N)Callable)	dataclass)nn   )initialization)ACT2FN)use_kernel_func_from_hubuse_kernelized_func)GradientCheckpointingLayer)BaseModelOutputCausalLMOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)ModelOutputTransformersKwargsauto_docstringcan_return_tuple)maybe_autocastmerge_with_config_defaults)capture_outputs   )ParakeetCTCConfigParakeetEncoderConfigz
    Extends [~modeling_outputs.BaseModelOutput] to include the output attention mask since sequence length is not preserved in the model's forward.
    )custom_introc                   >    \ rS rSr% Sr\R                  S-  \S'   Srg)ParakeetEncoderModelOutput)   Nattention_mask )	__name__
__module____qualname____firstlineno__r   torchTensor__annotations____static_attributes__r        /root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/parakeet/modeling_parakeet.pyr   r   )   s     +/NELL4'.r)   r   c                      ^  \ rS rSr% Sr\R                  \S'   S	S\4U 4S jjjr	\R                  " 5       S\R                  4S j5       rSrU =r$ )
$ParakeetEncoderRelPositionalEncoding3   z*Relative positional encoding for Parakeet.inv_freqconfigc           	      &  > [         TU ]  5         UR                  U l        SnSU[        R                  " SUR
                  S[        R                  S9R                  U[        R                  S9UR
                  -  -  -  nU R                  SUSS	9  g )
N     @      ?r      dtype)devicer5   r.   F)
persistent)
super__init__max_position_embeddingsr%   arangehidden_sizeint64tofloatregister_buffer)selfr/   r6   baser.   	__class__s        r*   r9   -ParakeetEncoderRelPositionalEncoding.__init__8   s    '-'E'E$Q 2 2AU[[ILLTZbgbmbmLn$$%
 	ZeDr)   hidden_statesc                    UR                   S   nX R                  :  a  [        SU SU R                   S35      e[        R                  " US-
  U* SUR
                  S9nU R                  S S S 2S 4   R                  5       R                  UR                   S   SS5      R                  UR
                  5      nUS S S S 24   R                  5       n[        UR
                  R                  [        5      (       a0  UR
                  R                  S:w  a  UR
                  R                  OS	n[        US
S9   UR                  5       UR                  5       -  R                  SS5      nUR                  5       nUR!                  5       n	[        R"                  " X/SS9n
U
R$                  " / U
R                   S S QSP76 n
S S S 5        W
R                  UR&                  S9$ ! , (       d  f       N'= f)Nr   zSequence Length: z= has to be less or equal than config.max_position_embeddings .r6   r   mpscpuF)device_typeenabledr3   dimr4   )shaper:   
ValueErrorr%   r;   r6   r.   r?   expandr>   
isinstancetypestrr   	transposesincosstackreshaper5   )rA   rE   
seq_lengthposition_idsinv_freq_expandedposition_ids_expandedrL   freqsrX   rY   	pos_embeds              r*   forward,ParakeetEncoderRelPositionalEncoding.forwardF   s   "((+
444#J< 02262N2N1OqR 
 ||JNZKML`L`aMM$4-(..0778K8KA8NPRTUVYYZgZnZno 	 !-T4] ; A A C -..33S99m>R>R>W>W[`>`   %% 	
 UC&,,.1F1L1L1NNYYZ[]^_E))+C))+CSJB7I!))D9??3B+?DDI D ||-"5"5|66 DCs   6B	G  
G.)r:   N)r!   r"   r#   r$   __doc__r%   r&   r'   r   r9   no_gradrb   r(   __classcell__rC   s   @r*   r,   r,   3   sJ    4llE4 E E ]]_7U\\ 7 7r)   r,   c                   6   ^  \ rS rSrS\4U 4S jjrS rSrU =r$ )ParakeetEncoderFeedForwarde   r/   c                 X  > [         TU ]  5         [        R                  " UR                  UR
                  UR                  S9U l        [        UR                     U l
        [        R                  " UR
                  UR                  UR                  S9U l        UR                  U l        g )Nbias)r8   r9   r   Linearr<   intermediate_sizeattention_biaslinear1r   
hidden_act
activationlinear2activation_dropoutrA   r/   rC   s     r*   r9   #ParakeetEncoderFeedForward.__init__f   s|    yy!3!3V5M5MTZTiTij !2!23yy!9!96;M;MTZTiTij"(";";r)   c                     U R                  U R                  U5      5      n[        R                  R	                  XR
                  U R                  S9nU R                  U5      nU$ )Nptraining)rt   rr   r   
functionaldropoutrv   r|   ru   )rA   rE   s     r*   rb   "ParakeetEncoderFeedForward.forwardm   sS    ](CD--m?V?Vaeanan-o]3r)   )rt   rv   rr   ru   	r!   r"   r#   r$   r   r9   rb   r(   rg   rh   s   @r*   rj   rj   e   s    <4 < r)   rj   c                   >   ^  \ rS rSrSS\4U 4S jjjrSS jrSrU =r$ ) ParakeetEncoderConvolutionModulet   r/   c           
      ,  > [         TU ]  5         UR                  nUc&  UR                  n[        [        USS5         U l        O#US   n[        UR                  SS5         U l        US-
  S-  U l        [        R                  " USU-  SSSUR                  S	9U l        [        R                  " UUUSU R                  UUR                  S
9U l        [        R                  " U5      U l        [        R                  " X3SSSUR                  S	9U l        g)z
Args:
    config (ParakeetEncoderConfig): Configuration for the model.
    module_config (dict): Configuration for the module (e.g., encoder or decoder).
Nrs   silukernel_sizert   r   r3   r   )r   stridepaddingrn   )r   r   groupsrn   )r8   r9   r<   conv_kernel_sizer   getattrrt   getr   r   Conv1dconvolution_biaspointwise_conv1depthwise_convBatchNorm1dnormpointwise_conv2)rA   r/   module_configchannelsr   rC   s        r*   r9   )ParakeetEncoderConvolutionModule.__init__u   s    	%%  11K$WV\6%JKDO'6K$]%6%6|V%LMDO#aA-!yya(l!QVMdMd 
 !iiLL((
 NN8,	!yyAaI`I` 
r)   c                    UR                  SS5      nU R                  U5      n[        R                  R	                  USS9nUb`  UR
                  [        R                  :X  a  [        R                  " U) SS9nO[        R                  " US:H  ) SS9nUR                  US5      nU R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nUR                  SS5      $ )a!  
Compute convolution module.

Args:
    hidden_states (`torch.Tensor` of shape `(batch, time, channels)`): Input tensor.
    attention_mask (`torch.Tensor` of shape `(batch, 1, time, time)`): Attention mask.

Returns:
    `torch.Tensor`: Output tensor of shape `(batch, time, channels)`.

r   r3   rN           )rW   r   r   r}   glur5   r%   boolallmasked_fillr   r   rt   r   )rA   rE   r   all_masked_rowss       r*   rb   (ParakeetEncoderConvolutionModule.forward   s     &//15 ,,];))-Q)? %##uzz1"'))^O"C"'))n.C,D!"L)55osKM ++M:		-06,,];&&q!,,r)   )rt   r   r   r   r   r   rd   r   rh   s   @r*   r   r   t   s      
4  
  
D"- "-r)   r   c                     U SSU R                   S   S-  24   nU SU R                   S   S-  S24   n[        R                  " U* U4SS9$ )z*Rotates half the hidden dims of the input..NrH   r3   rN   )rQ   r%   cat)xx1x2s      r*   rotate_halfr      sZ    	
3"!''"+"""	#B	
3q ""	#B99rc2YB''r)   rotary_pos_embc                     UR                  U5      nUR                  U5      nX-  [        U 5      U-  -   nX-  [        U5      U-  -   nXV4$ )aI  Applies Rotary Position Embedding to the query and key tensors.

Args:
    q (`torch.Tensor`): The query tensor.
    k (`torch.Tensor`): The key tensor.
    cos (`torch.Tensor`): The cosine part of the rotary embedding.
    sin (`torch.Tensor`): The sine part of the rotary embedding.
    unsqueeze_dim (`int`, *optional*, defaults to 1):
        The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
        sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
        that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
        k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
        cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
        the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
Returns:
    `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
)	unsqueezer   )qkrY   rX   unsqueeze_dimq_embedk_embeds          r*   apply_rotary_pos_embr      sS    & --
&C
--
&Cw;q>C/0Gw;q>C/0Gr)   rE   n_repreturnc                     U R                   u  p#pEUS:X  a  U $ U SS2SS2SSS2SS24   R                  X#XU5      n U R                  X#U-  XE5      $ )z
This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
r   N)rQ   rS   r[   )rE   r   batchnum_key_value_headsslenhead_dims         r*   	repeat_kvr      s_    
 2?1D1D.Ez!!Qa"23::5W\dlmM  e(CTTTr)   modulequerykeyvaluer   scalingr~   kwargsc                    [        X R                  5      n[        X0R                  5      n	[        R                  " XR	                  SS5      5      U-  n
Ub  X-   n
[
        R                  R                  U
S[        R                  S9R                  UR                  5      n
[
        R                  R                  XU R                  S9n
[        R                  " X5      nUR	                  SS5      R                  5       nX4$ )Nr3   r   rH   rO   r5   rz   r   )r   num_key_value_groupsr%   matmulrW   r   r}   softmaxfloat32r>   r5   r~   r|   
contiguous)r   r   r   r   r   r   r~   r   
key_statesvalue_statesattn_weightsattn_outputs               r*   eager_attention_forwardr      s     3 ; ;<JU$?$?@L<<';';Aq'ABWLL!#4==((2U]](SVVW\WbWbcL==((6??([L,,|:K''1-88:K$$r)   c                      ^  \ rS rSrSrS\S\4U 4S jjr SS\R                  S\R                  S-  S	\R                  S-  S
\
\   S\\R                  \R                  4   4
S jjrS rSrU =r$ )ParakeetEncoderAttentioni  ztMulti-head attention with relative positional encoding. See section 3.3 of https://huggingface.co/papers/1901.02860.r/   	layer_idxc                   > [         TU ]  5         Xl        X l        [	        USUR
                  UR                  -  5      U l        UR                  UR                  -  U l	        U R                  S-  U l
        UR                  U l        SU l        [        R                  " UR
                  UR                  U R                  -  UR                  S9U l        [        R                  " UR
                  UR                  U R                  -  UR                  S9U l        [        R                  " UR
                  UR                  U R                  -  UR                  S9U l        [        R                  " UR                  U R                  -  UR
                  UR                  S9U l        [        R                  " UR
                  UR                  U R                  -  SS9U l        [        R*                  " [,        R.                  " UR                  U R                  5      5      U l        [        R*                  " [,        R.                  " UR                  U R                  5      5      U l        g )Nr   g      Frm   )r8   r9   r/   r   r   r<   num_attention_headsr   r   r   r   attention_dropout	is_causalr   ro   rq   q_projk_projv_projo_projrelative_k_proj	Parameterr%   zerosbias_ubias_vrA   r/   r   rC   s      r*   r9   !ParakeetEncoderAttention.__init__  s   "
F4F4F&JdJd4de$*$>$>&B\B\$\!}}d*!'!9!9ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii&&68J8JQWQfQf
  "yy););V=W=WZ^ZgZg=gnstll5;;v/I/I4==#YZll5;;v/I/I4==#YZr)   NrE   position_embeddingsr   r   r   c           
         UR                   S S nUu  pgXgSU R                  4nU R                  U5      R                  U5      R	                  SS5      n	U R                  U5      R                  U5      R	                  SS5      n
U R                  U5      R                  U5      R	                  SS5      n[        R                  " U R                  R                  [        5      nXR                  R                  SU R                  R                  SU R                  5      -   nXR                  R                  SU R                  R                  SU R                  5      -   nU R                  U5      nUR                  USU R                  R                  U R                  5      nXR!                  SSSS5      -  nU R#                  U5      nUSS U24   nUU R$                  -  nUb)  UR'                  UR)                  5       [+        S5      5      nU" U 4UU
UUU R,                  (       d  SOU R.                  U R$                  S	.UD6u  nnUR0                  " / UQSP76 R3                  5       nU R5                  U5      nUU4$ )
NrH   r   r3   r   r   .z-infr   )r   r   r   r   r~   r   )rQ   r   r   viewrW   r   r   r   get_interfacer/   _attn_implementationr   r   r   r   r   permute
_rel_shiftr   masked_fill_logical_notr?   r|   r   r[   r   r   )rA   rE   r   r   r   input_shape
batch_sizer\   hidden_shapequery_statesr   r   attention_interfacequery_states_with_bias_uquery_states_with_bias_vrelative_key_states	matrix_bdr   r   s                      r*   rb    ParakeetEncoderAttention.forward#  s]    $))#2.!,
"DMMB{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST(?(M(MKK,,.E)
 $0++2B2Bt{{..4==3
 $
  $0++2B2Bt{{..4==3
 $
  #223FG166z2t{{GfGfhlhuhuv -/J/J1aQRTU/VV	OOI.	c;J;./	,	% "..~/I/I/KUSY][I %8	%
*$#}}C$2H2HLL	%
 	%
!\ "));;;;FFHkk+.L((r)   c                     UR                   u  p#pE[        R                  R                  USS9nUR	                  X#SU5      nUSS2SS2SS24   R	                  X#XE5      nU$ )ztRelative position shift for Shaw et al. style attention. See appendix B of https://huggingface.co/papers/1901.02860.)r   r   )padrH   Nr   )rQ   r   r}   r   r   )rA   attention_scoresr   	num_headsquery_lengthposition_lengths         r*   r   #ParakeetEncoderAttention._rel_shift\  si    ?O?U?U<
|==,,-=6,J+00LY+Aq!"H5:::R^pr)   )r   r   r   r/   r   r   r   r   r   r   r   r   r   r   rd   )r!   r"   r#   r$   re   r   intr9   r%   r&   r   r   tuplerb   r   r(   rg   rh   s   @r*   r   r     s    ~[4 [ [B /3	7)||7) #\\D07) t+	7)
 +,7) 
u||U\\)	*7)r   r)   r   c                      ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  4S jr
SS\R                  S\R                  4S	 jjrS
rU =r$ ) ParakeetEncoderSubsamplingConv2Die  r/   c                    > [         TU ]  5         UR                  U l        UR                  U l        UR                  U l        U R                  S-
  S-  U l        [        [        R                  " UR                  5      5      U l        [        R                  " 5       U l        U R                   R#                  [        R$                  " SU R                  U R                  U R
                  U R                  S95        U R                   R#                  [        R&                  " 5       5        [)        U R                  S-
  5       H  nU R                   R#                  [        R$                  " U R                  U R                  U R                  U R
                  U R                  U R                  S95        U R                   R#                  [        R$                  " U R                  U R                  SS95        U R                   R#                  [        R&                  " 5       5        M     UR*                  U R
                  U R                  -  -  n[        R,                  " UR                  U-  UR.                  SS9U l        g )Nr   r3   )r   r   r   )r   r   r   r   r   Trm   )r8   r9   subsampling_conv_kernel_sizer   subsampling_conv_strider   subsampling_conv_channelsr   r   r   mathlog2subsampling_factor
num_layersr   
ModuleListlayersappendConv2dReLUrangenum_mel_binsro   r<   linear)rA   r/   i
out_lengthrC   s       r*   r9   )ParakeetEncoderSubsamplingConv2D.__init__f  s   !>>4488((1,2dii(A(ABC mmoIIaD4D4DT[[bfbnbno	
 	2779%t*+AKK		MMMM $ 0 0;; LL==	 KKryySTUVKKrwwy) ," ((T[[$//-IJ
ii @ @: MvOaOahlmr)   input_lengths
conv_layerc                     [        US5      (       aR  UR                  S:w  aB  UR                  nUR                  S   nUR                  S   nXS   -   US   -   U-
  U-  S-   nU$ U$ )Nr   )r   r   r   r   )hasattrr   r   r   )rA   r  r  r   r   r   output_lengthss          r*   _get_output_length3ParakeetEncoderSubsamplingConv2D._get_output_length  sy    :x((Z->->&-H ((G$003K&&q)F+aj871:ESX^^abbN!!r)   input_featuresr   c                     UR                  S5      nUb  UR                  S5      OS nU R                   H  nU" U5      n[        U[        R
                  5      (       d  M,  Uc  M1  U R                  XE5      nUR                  S   n[        R                  " XbR                  S9US S 2S 4   :  nX7S S 2S S S 2S 4   -  nM     UR                  SS5      R                  UR                  S   UR                  S   S5      nU R                  U5      nU$ )Nr   rH   r3   rI   r   )r   sumr  rT   r   r  r  rQ   r%   r;   r6   rW   r[   r  )rA   r  r   rE   current_lengthslayercurrent_seq_lengthchannel_masks           r*   rb   (ParakeetEncoderSubsamplingConv2D.forward  s   &0034B4N.,,R0TX[[E!-0M %++0J"&"9"9/"Q%2%8%8%;"LL!3<Q<QRUdefhlelUmm  aq$.>!?? ! &//15==m>Q>QRS>TVcViVijkVlnpqM2r)   )r   r   r  r  r   r   r   rd   )r!   r"   r#   r$   r   r9   r%   r&   r   r  r  rb   r(   rg   rh   s   @r*   r   r   e  sN    !n4 !nF	 	")) 	ell ELL  r)   r   c                      ^  \ rS rSrSS\S\S-  4U 4S jjjr  SS\R                  S\R                  S-  S\R                  S-  S	\	\
   S
\R                  4
S jjrSrU =r$ )ParakeetEncoderBlocki  Nr/   r   c                 "  > [         TU ]  5         SU l        [        U5      U l        [        X5      U l        [        U5      U l        [        U5      U l	        [        R                  " UR                  5      U l        [        R                  " UR                  5      U l        [        R                  " UR                  5      U l        [        R                  " UR                  5      U l        [        R                  " UR                  5      U l        g NF)r8   r9   gradient_checkpointingrj   feed_forward1r   	self_attnr   convfeed_forward2r   	LayerNormr<   norm_feed_forward1norm_self_att	norm_convnorm_feed_forward2norm_outr   s      r*   r9   ParakeetEncoderBlock.__init__  s    &+#7?1&D4V<	7?"$,,v/A/A"B\\&*<*<=f&8&89"$,,v/A/A"BV%7%78r)   rE   r   r   r   r   c                 l   UnU R                  U R                  U5      5      nUSU-  -   nU R                  U5      nU R                  " SUUUS.UD6u  pxX-   nU R	                  U R                  U5      US9n	X-   nU R                  U R                  U5      5      n
USU
-  -   nU R                  U5      nU$ )Ng      ?)rE   r   r   )r   r    )	r   r%  r&  r!  r"  r'  r#  r(  r)  )rA   rE   r   r   r   residualnormalized_hidden_statesr   _conv_output
ff2_outputs              r*   rb   ParakeetEncoderBlock.forward  s     !**4+B+B=+QR 3#66#'#5#5m#D  
2) 3
 	
 &3ii} =ni]%3''(?(?(NO
%j(88m4r)   )
r"  r   r#  r  r'  r%  r(  r)  r&  r!  rd   NN)r!   r"   r#   r$   r   r   r9   r%   r&   r   r   rb   r(   rg   rh   s   @r*   r  r    s    94 9t 9 9$ /337	|| t+ #\\D0	
 +, 
 r)   r  c                      ^  \ rS rSr% \\S'   SrSrSrSr	S/r
SrSrSrSrSrSr\\S	.r\R*                  " 5       U 4S
 j5       rS\R.                  4S jrSS\R.                  S\S-  4S jjrSrU =r$ )ParakeetPreTrainedModeli  r/   modelr  audioTr  F)rE   
attentionsc           	        > [         TU ]  U5        [        U R                  S5      (       a  U R                  R                  nO%[        U R                  R                  5       SS5      n[        U[        5      (       aA  [        R                  " UR                  SUS9  [        R                  " UR                  SUS9  g [        U[        5      (       ax  SS[        R                  " SU R                  R                   S[        R"                  S	9U R                  R                   -  -  -  n[        R$                  " UR&                  U5        g g )
Ninitializer_rangeg{Gz?r   )meanstdr2   r1   r   r3   r4   )r8   _init_weightsr  r/   r9  r   get_text_configrT   r   initnormal_r   r   r,   r%   r;   r<   r=   copy_r.   )rA   r   r;  r.   rC   s       r*   r<  %ParakeetPreTrainedModel._init_weights  s    f%4;; 344++//C $++5579LdSCf677LLSc:LLSc: DEEELLDKK,C,CQekkZ]a]h]h]t]ttuH JJv1	 Fr)   r  c                 "   [        U R                  [        5      (       a  U R                  R                  OU R                  nUR                  nUR
                  n[        [        R                  " UR                  5      5      nUS-
  S-  S-  nXc-
  nUn[        U5       HQ  n	[        R                  " UR                  [        R                  S9U-   U5      S-   n[        R                  " U5      nMS     UR                  [        R                  S9$ )Nr   r3   r4   r2   )rT   r/   r   encoder_configr   r   r   r   r   r   r  r%   divr>   r?   floor)
rA   r  rC  r   r   r   all_paddingsadd_padlengthsr.  s
             r*   _get_subsampling_output_length6ParakeetPreTrainedModel._get_subsampling_output_length  s    7A$++O`7a7a33gkgrgr$AA77>#D#DEF
#aA-1,z"Aii


 = GPSVVGkk'*G # zz		z**r)   Nr   target_lengthc                     U R                  UR                  S5      5      nUb  UOUR                  5       n[        R                  " XAR
                  S9USS2S4   :  nU$ )z
Convert the input attention mask to its subsampled form. `target_length` sets the desired output length, useful
when the attention mask length differs from `sum(-1).max()` (i.e., when the longest sequence in the batch is padded)
rH   NrI   )rI  r  maxr%   r;   r6   )rA   r   rK  r  
max_lengths        r*   _get_output_attention_mask2ParakeetPreTrainedModel._get_output_attention_mask  sa    
 <<^=O=OPR=ST&3&?]^EWEWEY
j9N9NOR`abdhahRiir)   r    rd   )r!   r"   r#   r$   r   r'   base_model_prefixmain_input_nameinput_modalitiessupports_gradient_checkpointing_no_split_modules_supports_flat_attention_mask_supports_sdpa_supports_flex_attn_supports_flash_attn_can_compile_fullgraph_supports_attention_backendr  r   _can_record_outputsr%   rf   r<  r&   rI  r   rO  r(   rg   rh   s   @r*   r4  r4    s    &O&*#/0$(!N !!"&-.
 ]]_2 2&+ELL +"	 	VY\`V` 	 	r)   r4  z{
    The Parakeet Encoder model, based on the [Fast Conformer architecture](https://huggingface.co/papers/2305.05084).
    c                      ^  \ rS rSr% \\S'   SrS\4U 4S jjr\\	\
\  SS\R                  S\R                  S-  S\S	\\   S
\4
S jj5       5       5       5       rSrU =r$ )ParakeetEncoderi   r/   encoderc           	        > [         TU ]  U5        Xl        SU l        UR                  U l        UR
                  U l        UR                  U l        UR                  (       a   [        R                  " UR                  5      OSU l        [        U5      U l        [        U5      U l        [         R"                  " [%        UR&                  5       Vs/ s H  n[)        X5      PM     sn5      U l        U R-                  5         g s  snf )NFr2   )r8   r9   r/   r  r~   dropout_positions	layerdropscale_inputr   sqrtr<   input_scaler   subsamplingr,   encode_positionsr   r  r  num_hidden_layersr  r  	post_initr   s      r*   r9   ParakeetEncoder.__init__)  s     &+#~~!'!9!9))<B<N<N499V%7%78TW;FC DV LmmFKFLdLdFefFe!&4Fef
 	 gs   DNr  r   output_attention_maskr   r   c                     U R                  X5      nXPR                  -  nU R                  U5      n[        R                  R                  XPR
                  U R                  S9n[        R                  R                  X`R                  U R                  S9nUbp  U R                  X%R                  S   S9nUR                  S5      R                  SUR                  S   S5      nX"R                  SS5      -  nUR                  S5      nU R                   HR  nSn	U R                  (       a'  [        R                  " / 5      n
XR                   :  a  Sn	U	(       a  MF  U" U4UUS	.UD6nMT     [#        UUb  U(       a  WR%                  5       S
9$ SS
9$ )a  
output_attention_mask (`bool`, *optional*, defaults to `True`):
    Whether to return the output attention mask. Only effective when `attention_mask` is provided.

Example:

```python
>>> from transformers import AutoProcessor, ParakeetEncoder
>>> from datasets import load_dataset, Audio

>>> model_id = "nvidia/parakeet-ctc-1.1b"
>>> processor = AutoProcessor.from_pretrained(model_id)
>>> encoder = ParakeetEncoder.from_pretrained(model_id)

>>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
>>> ds = ds.cast_column("audio", Audio(sampling_rate=processor.feature_extractor.sampling_rate))

>>> inputs = processor(ds[0]["audio"]["array"])
>>> encoder_outputs = encoder(**inputs)

>>> print(encoder_outputs.last_hidden_state.shape)
```
rz   Nr   rK  rH   r3   FT)r   r   )last_hidden_stater   )rf  re  rg  r   r}   r~   r|   ra  rO  rQ   r   rS   rW   r  r%   randrb  r   r   )rA   r  r   rk  r   rE   r   output_maskencoder_layerto_dropdropout_probabilitys              r*   rb   ParakeetEncoder.forward<  s   F ((H%(8(88"33MB--m||VZVcVc-d mm33#9#9DMM 4 
 %99.XkXklmXn9oK(2215<<RATATUVAWY[\N+.F.Fq!.LLN+55a8N![[MG}}&+jjn#&7"G7 -!!#1(;! 	! )  *+0>0JOd;??,
 	
jn
 	
r)   )	r/   r~   ra  rg  r  re  rb  r  rf  )NT)r!   r"   r#   r$   r   r'   rQ  r9   r   r   r   r   r%   r&   r   r   r   r   rb   r(   rg   rh   s   @r*   r^  r^     s     "!!4 &  /3&*	A
A
 t+A
  $	A

 +,A
 
A
     A
r)   r^  c                       \ rS rSr% Sr\R                  \S'   Sr\	\R                     S-  \S'   Sr\	\	\R                        S-  \S'   Sr\	\	\R                        S-  \S'   Srg)	ParakeetGenerateOutputi  a,  
Outputs of Parakeet models.

Args:
    sequences (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
        The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter
        if all batches finished early due to the `eos_token_id`.
    logits (`tuple(torch.FloatTensor)` *optional*, returned when `output_logits=True`):
        Unprocessed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax)
        at each generation step. Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for
        each generated token), with each tensor of shape `(batch_size, config.vocab_size)`.
    attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True`):
        Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
        `torch.FloatTensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`.
    hidden_states (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_hidden_states=True`):
        Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
        `torch.FloatTensor` of shape `(batch_size, generated_length, hidden_size)`.
	sequencesNlogitsr7  rE   r    )r!   r"   r#   r$   re   r%   
LongTensorr'   rx  r   FloatTensorr7  rE   r(   r    r)   r*   rv  rv    sm    & .2FE%##$t+29=JeE--./$6=<@M5u0012T9@r)   rv  zS
    Parakeet Encoder with a Connectionist Temporal Classification (CTC) head.
    c                   `  ^  \ rS rSr% \\S'   S\4U 4S jjr\\  SS\	R                  S\	R                  S-  S\	R                  S-  S\\   S	\4
S
 jj5       5       r\	R                  " 5         SS\	R                  S\	R                  S-  S\S\\   S	\\	R$                  -  4
S jj5       rSrU =r$ )ParakeetForCTCi  r/   c                    > [         TU ]  U5        [        UR                  5      U l        [
        R                  " UR                  R                  UR                  SS9U l	        U R                  5         g )Nr   r   )r8   r9   r^  rC  r_  r   r   r<   
vocab_sizectc_headri  rw   s     r*   r9   ParakeetForCTC.__init__  sS     &v'<'<=		&"7"7"C"CVEVEVdefr)   Nr  r   labelsr   r   c                    U R                   " SUUS.UD6nUR                  nU R                  UR                  SS5      5      R                  SS5      nSnUGbN  Ub  UO"[        R
                  " U[        R                  S9nU R                  UR                  S5      5      n	X0R                  R                  :g  n
U
R                  S5      nUR                  U
5      n[        R                  R                  US[        R                  S9R                  SS5      n[        R                   R"                  R%                  S	S
9   [        R                  R'                  UUU	UU R                  R                  U R                  R(                  U R                  R*                  S9nSSS5        [-        UUUR.                  UR0                  S9$ ! , (       d  f       N.= f)aV  
Example:

```python
>>> from transformers import AutoProcessor, ParakeetForCTC
>>> from datasets import load_dataset, Audio

>>> model_id = "nvidia/parakeet-ctc-1.1b"
>>> processor = AutoProcessor.from_pretrained(model_id)
>>> model = ParakeetForCTC.from_pretrained(model_id)

>>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
>>> ds = ds.cast_column("audio", Audio(sampling_rate=processor.feature_extractor.sampling_rate))

>>> inputs = processor(ds[0]["audio"]["array"], text=ds[0]["text"])
>>> outputs = model(**inputs)

>>> print(outputs.loss)
```r  r   r   r3   Nr4   rH   r   r   F)rM   )blank	reductionzero_infinity)lossrx  rE   r7  r    )r_  rn  r  rW   r%   	ones_likelongrI  r  r/   pad_token_idmasked_selectr   r}   log_softmaxr   backendscudnnflagsctc_lossctc_loss_reductionctc_zero_infinityr   rE   r7  )rA   r  r   r  r   encoder_outputsrE   rx  r  r  labels_masktarget_lengthsflattened_targets	log_probss                 r*   rb   ParakeetForCTC.forward  s   : ,, 
))
 
 (99}66q!<=GG1M #1"<%//R`hmhrhrBs  !??@R@RSU@VWM !KK$<$<<K(__R0N & 4 4[ A 11&b1V``abdefI%%++E+:}}--%!"++22"kk<<"&++"?"? .  ; )77&11	
 	
 ;:s   ?A G
Greturn_dict_in_generatec                 >   SUS'   U R                   " S	UUS.UD6nUR                  R                  SS9nUb5  U R                  X&R                  S   S9nU R
                  R                  Xb) '   U(       a*  [        UUR                  UR                  UR                  S9$ U$ )
a  
Example:

```python
>>> from transformers import AutoProcessor, ParakeetForCTC
>>> from datasets import load_dataset, Audio

>>> model_id = "nvidia/parakeet-ctc-1.1b"
>>> processor = AutoProcessor.from_pretrained(model_id)
>>> model = ParakeetForCTC.from_pretrained(model_id)

>>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
>>> ds = ds.cast_column("audio", Audio(sampling_rate=processor.feature_extractor.sampling_rate))

>>> inputs = processor(ds[0]["audio"]["array"], text=ds[0]["text"])
>>> predicted_ids = model.generate(**inputs)
>>> transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)

>>> print(transcription)
```
Treturn_dictr  rH   rN   r   rm  )rw  rx  r7  rE   r    )
rb   rx  argmaxrO  rQ   r/   r  rv  r7  rE   )rA   r  r   r  r   outputsrw  s          r*   generateParakeetForCTC.generate  s    : !%}"&,, #
))#
 #
 NN))b)1	 %!<<^[j[jkl[m<nN)-)A)AIo&")#~~"--%33	  r)   )r  r_  r2  r  )r!   r"   r#   r$   r   r'   r9   r   r   r%   r&   r   r   r   rb   rf   r   rv  ry  r  r(   rg   rh   s   @r*   r|  r|    s     0   /3&*	E
E
 t+E
 t#	E

 +,E
 
E
  E
N ]]_ /3(-	33 t+3 "&	3
 +,3 
 %"2"2	23 3r)   r|  )r|  r^  r4  )r   )r   ):r   collections.abcr   dataclassesr   r%   r    r   r>  activationsr   integrationsr	   r
   modeling_layersr   modeling_outputsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   utils.genericr   r   utils.output_capturingr   configuration_parakeetr   r   r   Moduler,   rj   r   r   r   r&   r   r   r?   r   r   r   r  r4  r^  rv  r|  __all__r    r)   r*   <module>r     s8  *  $ !   & ! I 9 ? F & V V G 5 L 
/ / //7299 /7d E-ryy E-P( *+ ,2	UU\\ 	U# 	U%,, 	U& %II%<<% 
% <<	%
 LL4'% % % '(%2 )*_ ryy _  +_ DBryy BJ,5 ,^ Co C CL 
\
- \

\
~ A[ A A4 
H, H
HV Kr)   