
    Z j"l                     v   S r SSKrSSKJr  SSKJr  SSKrSSKJr  SSKJ	r
  SSKJr  SS	KJr  SS
KJrJr  SSKJrJr  SSKJr  SSKJrJrJrJr  SSKJrJr  SSKJ r   SSK!J"r"  SSK#J$r$J%r%  SSK&J'r'J(r(  \\" SS9 " S S\5      5       5       r) " S S\RT                  5      r+ " S S\RT                  5      r, " S S\"5      r- " S S \$5      r. " S! S"\RT                  5      r/ " S# S$\5      r0\ " S% S&\5      5       r1\" S'S9 " S( S)\15      5       r2\ " S* S+\5      5       r3\" S,S9 " S- S.\15      5       r4/ S/Qr5g)0zPyTorch Parakeet model.    N)Callable)	dataclass)nn   )initialization)ACT2FN)GradientCheckpointingLayer)BaseModelOutputCausalLMOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)ModelOutputTransformersKwargsauto_docstringcan_return_tuple)maybe_autocastmerge_with_config_defaults)capture_outputs   )%FastSpeech2ConformerConvolutionModule)LlamaAttentioneager_attention_forward   )ParakeetCTCConfigParakeetEncoderConfigz
    Extends [~modeling_outputs.BaseModelOutput] to include the output attention mask since sequence length is not preserved in the model's forward.
    )custom_introc                   >    \ rS rSr% Sr\R                  S-  \S'   Srg)ParakeetEncoderModelOutput%   Nattention_mask )	__name__
__module____qualname____firstlineno__r!   torchTensor__annotations____static_attributes__r"       ~/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/parakeet/modular_parakeet.pyr   r   %   s     +/NELL4'.r+   r   c                      ^  \ rS rSr% Sr\R                  \S'   S	S\4U 4S jjjr	\R                  " 5       S\R                  4S j5       rSrU =r$ )
$ParakeetEncoderRelPositionalEncoding/   z*Relative positional encoding for Parakeet.inv_freqconfigc           	      &  > [         TU ]  5         UR                  U l        SnSU[        R                  " SUR
                  S[        R                  S9R                  U[        R                  S9UR
                  -  -  -  nU R                  SUSS	9  g )
N     @      ?r   r   dtype)devicer6   r0   F)
persistent)
super__init__max_position_embeddingsr'   arangehidden_sizeint64tofloatregister_buffer)selfr1   r7   baser0   	__class__s        r,   r:   -ParakeetEncoderRelPositionalEncoding.__init__4   s    '-'E'E$Q 2 2AU[[ILLTZbgbmbmLn$$%
 	ZeDr+   hidden_statesc                    UR                   S   nX R                  :  a  [        SU SU R                   S35      e[        R                  " US-
  U* SUR
                  S9nU R                  S S S 2S 4   R                  5       R                  UR                   S   SS5      R                  UR
                  5      nUS S S S 24   R                  5       n[        UR
                  R                  [        5      (       a0  UR
                  R                  S:w  a  UR
                  R                  OS	n[        US
S9   UR                  5       UR                  5       -  R                  SS5      nUR                  5       nUR!                  5       n	[        R"                  " X/SS9n
U
R$                  " / U
R                   S S QSP76 n
S S S 5        W
R                  UR&                  S9$ ! , (       d  f       N'= f)Nr   zSequence Length: z= has to be less or equal than config.max_position_embeddings .r7   r   mpscpuF)device_typeenabledr   dimr5   )shaper;   
ValueErrorr'   r<   r7   r0   r@   expandr?   
isinstancetypestrr   	transposesincosstackreshaper6   )rB   rF   
seq_lengthposition_idsinv_freq_expandedposition_ids_expandedrM   freqsrY   rZ   	pos_embeds              r,   forward,ParakeetEncoderRelPositionalEncoding.forwardB   s   "((+
444#J< 02262N2N1OqR 
 ||JNZKML`L`aMM$4-(..0778K8KA8NPRTUVYYZgZnZno 	 !-T4] ; A A C -..33S99m>R>R>W>W[`>`   %% 	
 UC&,,.1F1L1L1NNYYZ[]^_E))+C))+CSJB7I!))D9??3B+?DDI D ||-"5"5|66 DCs   6B	G  
G.)r;   N)r#   r$   r%   r&   __doc__r'   r(   r)   r   r:   no_gradrc   r*   __classcell__rD   s   @r,   r.   r.   /   sJ    4llE4 E E ]]_7U\\ 7 7r+   r.   c                   6   ^  \ rS rSrS\4U 4S jjrS rSrU =r$ )ParakeetEncoderFeedForwarda   r1   c                 X  > [         TU ]  5         [        R                  " UR                  UR
                  UR                  S9U l        [        UR                     U l
        [        R                  " UR
                  UR                  UR                  S9U l        UR                  U l        g )Nbias)r9   r:   r   Linearr=   intermediate_sizeattention_biaslinear1r   
hidden_act
activationlinear2activation_dropoutrB   r1   rD   s     r,   r:   #ParakeetEncoderFeedForward.__init__b   s|    yy!3!3V5M5MTZTiTij !2!23yy!9!96;M;MTZTiTij"(";";r+   c                     U R                  U R                  U5      5      n[        R                  R	                  XR
                  U R                  S9nU R                  U5      nU$ )Nptraining)ru   rs   r   
functionaldropoutrw   r}   rv   )rB   rF   s     r,   rc   "ParakeetEncoderFeedForward.forwardi   sS    ](CD--m?V?Vaeanan-o]3r+   )ru   rw   rs   rv   )	r#   r$   r%   r&   r   r:   rc   r*   rh   ri   s   @r,   rk   rk   a   s    <4 < r+   rk   c                   4   ^  \ rS rSrSS\4U 4S jjjrSrU =r$ ) ParakeetEncoderConvolutionModulep   r1   c                 $   > [         TU ]  X5        g re   )r9   r:   )rB   r1   module_configrD   s      r,   r:   )ParakeetEncoderConvolutionModule.__init__q   s    /r+   r"   re   )r#   r$   r%   r&   r   r:   r*   rh   ri   s   @r,   r   r   p   s    04 0 0r+   r   c                      ^  \ rS rSrSrS\S\4U 4S jjr SS\R                  S\R                  S-  S	\R                  S-  S
\
\   S\\R                  \R                  4   4
S jjrS rSrU =r$ )ParakeetEncoderAttentionu   ztMulti-head attention with relative positional encoding. See section 3.3 of https://huggingface.co/papers/1901.02860.r1   	layer_idxc                   > [         TU ]  XS9  SU l        [        R                  " UR
                  UR                  U R                  -  SS9U l        [        R                  " [        R                  " UR                  U R                  5      5      U l        [        R                  " [        R                  " UR                  U R                  5      5      U l        g )N)r   Frn   )r9   r:   	is_causalr   rp   r=   num_attention_headshead_dimrelative_k_proj	Parameterr'   zerosbias_ubias_vrB   r1   r   rD   s      r,   r:   !ParakeetEncoderAttention.__init__x   s    5!yy););V=W=WZ^ZgZg=gnstll5;;v/I/I4==#YZll5;;v/I/I4==#YZr+   NrF   position_embeddingsr!   kwargsreturnc           
         UR                   S S nUu  pgXgSU R                  4nU R                  U5      R                  U5      R	                  SS5      n	U R                  U5      R                  U5      R	                  SS5      n
U R                  U5      R                  U5      R	                  SS5      n[        R                  " U R                  R                  [        5      nXR                  R                  SU R                  R                  SU R                  5      -   nXR                  R                  SU R                  R                  SU R                  5      -   nU R                  U5      nUR                  USU R                  R                  U R                  5      nXR!                  SSSS5      -  nU R#                  U5      nUSS U24   nUU R$                  -  nUb)  UR'                  UR)                  5       [+        S5      5      nU" U 4UU
UUU R,                  (       d  SOU R.                  U R$                  S	.UD6u  nnUR0                  " / UQSP76 R3                  5       nU R5                  U5      nUU4$ )
NrI   r   r   r   r   .z-inf        )querykeyvaluer!   r   scaling)rR   r   q_projviewrX   k_projv_projr   get_interfacer1   _attn_implementationr   r   r   r   r   permute
_rel_shiftr   masked_fill_logical_notr@   r}   attention_dropoutr\   
contiguouso_proj)rB   rF   r   r!   r   input_shape
batch_sizer]   hidden_shapequery_states
key_statesvalue_statesattention_interfacequery_states_with_bias_uquery_states_with_bias_vrelative_key_states	matrix_bdattn_outputattn_weightss                      r,   rc    ParakeetEncoderAttention.forward   s]    $))#2.!,
"DMMB{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST(?(M(MKK,,.E)
 $0++2B2Bt{{..4==3
 $
  $0++2B2Bt{{..4==3
 $
  #223FG166z2t{{GfGfhlhuhuv -/J/J1aQRTU/VV	OOI.	c;J;./	,	% "..~/I/I/KUSY][I %8	%
*$#}}C$2H2HLL	%
 	%
!\ "));;;;FFHkk+.L((r+   c                     UR                   u  p#pE[        R                  R                  USS9nUR	                  X#SU5      nUSS2SS2SS24   R	                  X#XE5      nU$ )ztRelative position shift for Shaw et al. style attention. See appendix B of https://huggingface.co/papers/1901.02860.)r   r   )padrI   Nr   )rR   r   r~   r   r   )rB   attention_scoresr   	num_headsquery_lengthposition_lengths         r,   r   #ParakeetEncoderAttention._rel_shift   si    ?O?U?U<
|==,,-=6,J+00LY+Aq!"H5:::R^pr+   )r   r   r   r   re   )r#   r$   r%   r&   rf   r   intr:   r'   r(   r   r   tuplerc   r   r*   rh   ri   s   @r,   r   r   u   s    ~[4 [ [ /3	7)||7) #\\D07) t+	7)
 +,7) 
u||U\\)	*7)r   r+   r   c                      ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  4S jr
SS\R                  S\R                  4S	 jjrS
rU =r$ ) ParakeetEncoderSubsamplingConv2D   r1   c                    > [         TU ]  5         UR                  U l        UR                  U l        UR                  U l        U R                  S-
  S-  U l        [        [        R                  " UR                  5      5      U l        [        R                  " 5       U l        U R                   R#                  [        R$                  " SU R                  U R                  U R
                  U R                  S95        U R                   R#                  [        R&                  " 5       5        [)        U R                  S-
  5       H  nU R                   R#                  [        R$                  " U R                  U R                  U R                  U R
                  U R                  U R                  S95        U R                   R#                  [        R$                  " U R                  U R                  SS95        U R                   R#                  [        R&                  " 5       5        M     UR*                  U R
                  U R                  -  -  n[        R,                  " UR                  U-  UR.                  SS9U l        g )Nr   r   )kernel_sizestridepadding)r   r   r   groupsr   Trn   )r9   r:   subsampling_conv_kernel_sizer   subsampling_conv_strider   subsampling_conv_channelschannelsr   r   mathlog2subsampling_factor
num_layersr   
ModuleListlayersappendConv2dReLUrangenum_mel_binsrp   r=   linear)rB   r1   i
out_lengthrD   s       r,   r:   )ParakeetEncoderSubsamplingConv2D.__init__   s   !>>4488((1,2dii(A(ABC mmoIIaD4D4DT[[bfbnbno	
 	2779%t*+AKK		MMMM $ 0 0;; LL==	 KKryySTUVKKrwwy) ," ((T[[$//-IJ
ii @ @: MvOaOahlmr+   input_lengths
conv_layerc                     [        US5      (       aR  UR                  S:w  aB  UR                  nUR                  S   nUR                  S   nXS   -   US   -   U-
  U-  S-   nU$ U$ )Nr   )r   r   r   r   )hasattrr   r   r   )rB   r   r   r   r   r   output_lengthss          r,   _get_output_length3ParakeetEncoderSubsamplingConv2D._get_output_length   sy    :x((Z->->&-H ((G$003K&&q)F+aj871:ESX^^abbN!!r+   input_featuresr!   c                     UR                  S5      nUb  UR                  S5      OS nU R                   H  nU" U5      n[        U[        R
                  5      (       d  M,  Uc  M1  U R                  XE5      nUR                  S   n[        R                  " XbR                  S9US S 2S 4   :  nX7S S 2S S S 2S 4   -  nM     UR                  SS5      R                  UR                  S   UR                  S   S5      nU R                  U5      nU$ )Nr   rI   r   rJ   r   )	unsqueezesumr   rU   r   r   r   rR   r'   r<   r7   rX   r\   r   )rB   r   r!   rF   current_lengthslayercurrent_seq_lengthchannel_masks           r,   rc   (ParakeetEncoderSubsamplingConv2D.forward   s   &0034B4N.,,R0TX[[E!-0M %++0J"&"9"9/"Q%2%8%8%;"LL!3<Q<QRUdefhlelUmm  aq$.>!?? ! &//15==m>Q>QRS>TVcViVijkVlnpqM2r+   )r   r   r   r   r   r   r   re   )r#   r$   r%   r&   r   r:   r'   r(   r   r   r   rc   r*   rh   ri   s   @r,   r   r      sN    !n4 !nF	 	")) 	ell ELL  r+   r   c                      ^  \ rS rSrSS\S\S-  4U 4S jjjr  SS\R                  S\R                  S-  S\R                  S-  S	\	\
   S
\R                  4
S jjrSrU =r$ )ParakeetEncoderBlocki	  Nr1   r   c                 "  > [         TU ]  5         SU l        [        U5      U l        [        X5      U l        [        U5      U l        [        U5      U l	        [        R                  " UR                  5      U l        [        R                  " UR                  5      U l        [        R                  " UR                  5      U l        [        R                  " UR                  5      U l        [        R                  " UR                  5      U l        g NF)r9   r:   gradient_checkpointingrk   feed_forward1r   	self_attnr   convfeed_forward2r   	LayerNormr=   norm_feed_forward1norm_self_att	norm_convnorm_feed_forward2norm_outr   s      r,   r:   ParakeetEncoderBlock.__init__
  s    &+#7?1&D4V<	7?"$,,v/A/A"B\\&*<*<=f&8&89"$,,v/A/A"BV%7%78r+   rF   r!   r   r   r   c                 l   UnU R                  U R                  U5      5      nUSU-  -   nU R                  U5      nU R                  " SUUUS.UD6u  pxX-   nU R	                  U R                  U5      US9n	X-   nU R                  U R                  U5      5      n
USU
-  -   nU R                  U5      nU$ )Ng      ?)rF   r!   r   )r!   r"   )	r   r   r   r   r   r   r   r   r   )rB   rF   r!   r   r   residualnormalized_hidden_statesr   _conv_output
ff2_outputs              r,   rc   ParakeetEncoderBlock.forward  s     !**4+B+B=+QR 3#66#'#5#5m#D  
2) 3
 	
 &3ii} =ni]%3''(?(?(NO
%j(88m4r+   )
r   r   r   r   r   r   r   r   r   r   re   NN)r#   r$   r%   r&   r   r   r:   r'   r(   r   r   rc   r*   rh   ri   s   @r,   r   r   	  s    94 9t 9 9$ /337	|| t+ #\\D0	
 +, 
 r+   r   c                      ^  \ rS rSr% \\S'   SrSrSrSr	S/r
SrSrSrSrSrSr\\S	.r\R*                  " 5       U 4S
 j5       rS\R.                  4S jrSS\R.                  S\S-  4S jjrSrU =r$ )ParakeetPreTrainedModeli8  r1   modelr   audioTr   F)rF   
attentionsc           	        > [         TU ]  U5        [        U R                  S5      (       a  U R                  R                  nO%[        U R                  R                  5       SS5      n[        U[        5      (       aA  [        R                  " UR                  SUS9  [        R                  " UR                  SUS9  g [        U[        5      (       ax  SS[        R                  " SU R                  R                   S[        R"                  S	9U R                  R                   -  -  -  n[        R$                  " UR&                  U5        g g )
Ninitializer_rangeg{Gz?r   )meanstdr4   r3   r   r   r5   )r9   _init_weightsr   r1   r  getattrget_text_configrU   r   initnormal_r   r   r.   r'   r<   r=   r>   copy_r0   )rB   moduler  r0   rD   s       r,   r  %ParakeetPreTrainedModel._init_weightsN  s    f%4;; 344++//C $++5579LdSCf677LLSc:LLSc: DEEELLDKK,C,CQekkZ]a]h]h]t]ttuH JJv1	 Fr+   r   c                 "   [        U R                  [        5      (       a  U R                  R                  OU R                  nUR                  nUR
                  n[        [        R                  " UR                  5      5      nUS-
  S-  S-  nXc-
  nUn[        U5       HQ  n	[        R                  " UR                  [        R                  S9U-   U5      S-   n[        R                  " U5      nMS     UR                  [        R                  S9$ )Nr   r   r5   r4   )rU   r1   r   encoder_configr   r   r   r   r   r   r   r'   divr?   r@   floor)
rB   r   r  r   r   r   all_paddingsadd_padlengthsr  s
             r,   _get_subsampling_output_length6ParakeetPreTrainedModel._get_subsampling_output_lengthb  s    7A$++O`7a7a33gkgrgr$AA77>#D#DEF
#aA-1,z"Aii


 = GPSVVGkk'*G # zz		z**r+   Nr!   target_lengthc                     U R                  UR                  S5      5      nUb  UOUR                  5       n[        R                  " XAR
                  S9USS2S4   :  nU$ )z
Convert the input attention mask to its subsampled form. `target_length` sets the desired output length, useful
when the attention mask length differs from `sum(-1).max()` (i.e., when the longest sequence in the batch is padded)
rI   NrJ   )r  r   maxr'   r<   r7   )rB   r!   r!  r   
max_lengths        r,   _get_output_attention_mask2ParakeetPreTrainedModel._get_output_attention_masks  sa    
 <<^=O=OPR=ST&3&?]^EWEWEY
j9N9NOR`abdhahRiir+   r"   re   )r#   r$   r%   r&   r   r)   base_model_prefixmain_input_nameinput_modalitiessupports_gradient_checkpointing_no_split_modules_supports_flat_attention_mask_supports_sdpa_supports_flex_attn_supports_flash_attn_can_compile_fullgraph_supports_attention_backendr   r   _can_record_outputsr'   rg   r  r(   r  r   r%  r*   rh   ri   s   @r,   r  r  8  s    &O&*#/0$(!N !!"&-.
 ]]_2 2&+ELL +"	 	VY\`V` 	 	r+   r  z{
    The Parakeet Encoder model, based on the [Fast Conformer architecture](https://huggingface.co/papers/2305.05084).
    c                      ^  \ rS rSr% \\S'   SrS\4U 4S jjr\\	\
\  SS\R                  S\R                  S-  S\S	\\   S
\4
S jj5       5       5       5       rSrU =r$ )ParakeetEncoderi  r1   encoderc           	        > [         TU ]  U5        Xl        SU l        UR                  U l        UR
                  U l        UR                  U l        UR                  (       a   [        R                  " UR                  5      OSU l        [        U5      U l        [        U5      U l        [         R"                  " [%        UR&                  5       Vs/ s H  n[)        X5      PM     sn5      U l        U R-                  5         g s  snf )NFr4   )r9   r:   r1   r   r   dropout_positions	layerdropscale_inputr   sqrtr=   input_scaler   subsamplingr.   encode_positionsr   r   r   num_hidden_layersr   r   	post_initr   s      r,   r:   ParakeetEncoder.__init__  s     &+#~~!'!9!9))<B<N<N499V%7%78TW;FC DV LmmFKFLdLdFefFe!&4Fef
 	 gs   DNr   r!   output_attention_maskr   r   c                     U R                  X5      nXPR                  -  nU R                  U5      n[        R                  R                  XPR
                  U R                  S9n[        R                  R                  X`R                  U R                  S9nUbp  U R                  X%R                  S   S9nUR                  S5      R                  SUR                  S   S5      nX"R                  SS5      -  nUR                  S5      nU R                   HR  nSn	U R                  (       a'  [        R                  " / 5      n
XR                   :  a  Sn	U	(       a  MF  U" U4UUS	.UD6nMT     [#        UUb  U(       a  WR%                  5       S
9$ SS
9$ )a  
output_attention_mask (`bool`, *optional*, defaults to `True`):
    Whether to return the output attention mask. Only effective when `attention_mask` is provided.

Example:

```python
>>> from transformers import AutoProcessor, ParakeetEncoder
>>> from datasets import load_dataset, Audio

>>> model_id = "nvidia/parakeet-ctc-1.1b"
>>> processor = AutoProcessor.from_pretrained(model_id)
>>> encoder = ParakeetEncoder.from_pretrained(model_id)

>>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
>>> ds = ds.cast_column("audio", Audio(sampling_rate=processor.feature_extractor.sampling_rate))

>>> inputs = processor(ds[0]["audio"]["array"])
>>> encoder_outputs = encoder(**inputs)

>>> print(encoder_outputs.last_hidden_state.shape)
```
r{   Nr   r!  rI   r   FT)r!   r   )last_hidden_stater!   )r<  r;  r=  r   r~   r   r}   r7  r%  rR   r   rT   rX   r   r'   randr8  r   r   )rB   r   r!   rA  r   rF   r   output_maskencoder_layerto_dropdropout_probabilitys              r,   rc   ParakeetEncoder.forward  s   F ((H%(8(88"33MB--m||VZVcVc-d mm33#9#9DMM 4 
 %99.XkXklmXn9oK(2215<<RATATUVAWY[\N+.F.Fq!.LLN+55a8N![[MG}}&+jjn#&7"G7 -!!#1(;! 	! )  *+0>0JOd;??,
 	
jn
 	
r+   )	r1   r   r7  r=  r   r;  r8  r   r<  )NT)r#   r$   r%   r&   r   r)   r'  r:   r   r   r   r   r'   r(   boolr   r   r
   rc   r*   rh   ri   s   @r,   r4  r4    s     "!!4 &  /3&*	A
A
 t+A
  $	A

 +,A
 
A
     A
r+   r4  c                       \ rS rSr% Sr\R                  \S'   Sr\	\R                     S-  \S'   Sr\	\	\R                        S-  \S'   Sr\	\	\R                        S-  \S'   Srg)	ParakeetGenerateOutputi  a,  
Outputs of Parakeet models.

Args:
    sequences (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
        The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter
        if all batches finished early due to the `eos_token_id`.
    logits (`tuple(torch.FloatTensor)` *optional*, returned when `output_logits=True`):
        Unprocessed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax)
        at each generation step. Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for
        each generated token), with each tensor of shape `(batch_size, config.vocab_size)`.
    attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True`):
        Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
        `torch.FloatTensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`.
    hidden_states (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_hidden_states=True`):
        Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
        `torch.FloatTensor` of shape `(batch_size, generated_length, hidden_size)`.
	sequencesNlogitsr  rF   r"   )r#   r$   r%   r&   rf   r'   
LongTensorr)   rO  r   FloatTensorr  rF   r*   r"   r+   r,   rM  rM    sm    & .2FE%##$t+29=JeE--./$6=<@M5u0012T9@r+   rM  zS
    Parakeet Encoder with a Connectionist Temporal Classification (CTC) head.
    c                   `  ^  \ rS rSr% \\S'   S\4U 4S jjr\\  SS\	R                  S\	R                  S-  S\	R                  S-  S\\   S	\4
S
 jj5       5       r\	R                  " 5         SS\	R                  S\	R                  S-  S\S\\   S	\\	R$                  -  4
S jj5       rSrU =r$ )ParakeetForCTCi  r1   c                    > [         TU ]  U5        [        UR                  5      U l        [
        R                  " UR                  R                  UR                  SS9U l	        U R                  5         g )Nr   r   )r9   r:   r4  r  r5  r   Conv1dr=   
vocab_sizectc_headr?  rx   s     r,   r:   ParakeetForCTC.__init__  sS     &v'<'<=		&"7"7"C"CVEVEVdefr+   Nr   r!   labelsr   r   c                    U R                   " SUUS.UD6nUR                  nU R                  UR                  SS5      5      R                  SS5      nSnUGbN  Ub  UO"[        R
                  " U[        R                  S9nU R                  UR                  S5      5      n	X0R                  R                  :g  n
U
R                  S5      nUR                  U
5      n[        R                  R                  US[        R                  S9R                  SS5      n[        R                   R"                  R%                  S	S
9   [        R                  R'                  UUU	UU R                  R                  U R                  R(                  U R                  R*                  S9nSSS5        [-        UUUR.                  UR0                  S9$ ! , (       d  f       N.= f)aV  
Example:

```python
>>> from transformers import AutoProcessor, ParakeetForCTC
>>> from datasets import load_dataset, Audio

>>> model_id = "nvidia/parakeet-ctc-1.1b"
>>> processor = AutoProcessor.from_pretrained(model_id)
>>> model = ParakeetForCTC.from_pretrained(model_id)

>>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
>>> ds = ds.cast_column("audio", Audio(sampling_rate=processor.feature_extractor.sampling_rate))

>>> inputs = processor(ds[0]["audio"]["array"], text=ds[0]["text"])
>>> outputs = model(**inputs)

>>> print(outputs.loss)
```r   r!   r   r   Nr5   rI   )rP   r6   r   F)rN   )blank	reductionzero_infinity)lossrO  rF   r  r"   )r5  rD  rW  rX   r'   	ones_likelongr  r   r1   pad_token_idmasked_selectr   r~   log_softmaxfloat32backendscudnnflagsctc_lossctc_loss_reductionctc_zero_infinityr   rF   r  )rB   r   r!   rY  r   encoder_outputsrF   rO  r_  r   labels_masktarget_lengthsflattened_targets	log_probss                 r,   rc   ParakeetForCTC.forward  s   : ,, 
))
 
 (99}66q!<=GG1M #1"<%//R`hmhrhrBs  !??@R@RSU@VWM !KK$<$<<K(__R0N & 4 4[ A 11&b1V``abdefI%%++E+:}}--%!"++22"kk<<"&++"?"? .  ; )77&11	
 	
 ;:s   ?A G
Greturn_dict_in_generatec                 >   SUS'   U R                   " S	UUS.UD6nUR                  R                  SS9nUb5  U R                  X&R                  S   S9nU R
                  R                  Xb) '   U(       a*  [        UUR                  UR                  UR                  S9$ U$ )
a  
Example:

```python
>>> from transformers import AutoProcessor, ParakeetForCTC
>>> from datasets import load_dataset, Audio

>>> model_id = "nvidia/parakeet-ctc-1.1b"
>>> processor = AutoProcessor.from_pretrained(model_id)
>>> model = ParakeetForCTC.from_pretrained(model_id)

>>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
>>> ds = ds.cast_column("audio", Audio(sampling_rate=processor.feature_extractor.sampling_rate))

>>> inputs = processor(ds[0]["audio"]["array"], text=ds[0]["text"])
>>> predicted_ids = model.generate(**inputs)
>>> transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)

>>> print(transcription)
```
Treturn_dictr[  rI   rO   r   rC  )rN  rO  r  rF   r"   )
rc   rO  argmaxr%  rR   r1   rb  rM  r  rF   )rB   r   r!   rr  r   outputsrN  s          r,   generateParakeetForCTC.generateW  s    : !%}"&,, #
))#
 #
 NN))b)1	 %!<<^[j[jkl[m<nN)-)A)AIo&")#~~"--%33	  r+   )rW  r5  r  r   )r#   r$   r%   r&   r   r)   r:   r   r   r'   r(   r   r   r   rc   rg   rK  rM  rP  rw  r*   rh   ri   s   @r,   rS  rS    s     0   /3&*	E
E
 t+E
 t#	E

 +,E
 
E
  E
N ]]_ /3(-	33 t+3 "&	3
 +,3 
 %"2"2	23 3r+   rS  )rS  r4  r  )6rf   r   collections.abcr   dataclassesr   r'   r    r   r  activationsr   modeling_layersr	   modeling_outputsr
   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   utils.genericr   r   utils.output_capturingr   4fastspeech2_conformer.modeling_fastspeech2_conformerr   llama.modeling_llamar   r   configuration_parakeetr   r   r   Moduler.   rk   r   r   r   r   r  r4  rM  rS  __all__r"   r+   r,   <module>r     sd     $ !   & ! 9 ? F & V V G 5 h J L 
/ / //7299 /7d 0'L 0
L ~ L ^Bryy BJ,5 ,^ Co C CL 
\
- \

\
~ A[ A A4 
H, H
HV Kr+   