
    Z j                    6   S r SSKrSSKrSSKrSSKJr  SSKJrJrJ	r	  SSK
Jr  SSKJr  SSKJrJrJr  SS	KJr  SS
KJr  SSKJr  SSKJrJr  SSKJr  SSKJrJ r J!r!J"r"J#r#  SSK$J%r%J&r&  SSK'J(r(J)r)  SSK*J+r+J,r,  \)RZ                  " \.5      r/Sr0S\Rb                  S\2S\24S jr3 S}S\Rb                  S\2S\Rb                  S-  4S jjr4  S~S\5\2\24   S\6S\2S\Rn                  S-  S\2S\Rp                  4S  jjr9 " S! S"\5      r: " S# S$\5      r; " S% S&\5      r< " S' S(\Rz                  5      r> " S) S*\Rz                  5      r? " S+ S,\Rz                  5      r@ " S- S.\R
                  Rz                  5      rA " S/ S0\Rz                  5      rB " S1 S2\Rz                  5      rC " S3 S4\Rz                  5      rD " S5 S6\Rz                  5      rE " S7 S8\Rz                  5      rF " S9 S:\Rz                  5      rG " S; S<\Rz                  5      rH " S= S>\Rz                  \%5      rI " S? S@\Rz                  \%5      rJ " SA SB\Rz                  \%5      rK " SC SD\Rz                  5      rL " SE SF\Rz                  5      rM " SG SH\5      rN " SI SJ\5      rO\( " SK SL\&5      5       rP " SM SN\P5      rQ " SO SP\P5      rR " SQ SR\P5      rS " SS ST\P5      rT " SU SV\P5      rU " SW SX\P5      rV " SY SZ\P5      rW " S[ S\\P5      rX " S] S^\Rz                  5      rY " S_ S`\Rz                  5      rZ\(" SaSb9 " Sc Sd\P5      5       r[\(" SeSb9 " Sf Sg\P\5      5       r\        SSh\PS\R                  Si\R                  S-  S\Rn                  S-  Sj\6Sk\6Sl\6Sm\Rz                  S-  Sn\^So\^S\R                  \5\R                  \R                  4   -  4Sp jjr_\(" SqSb9 " Sr Ss\P5      5       r`\(" StSb9 " Su Sv\P5      5       ra " Sw Sx\Rz                  5      rb\(" SySb9 " Sz S{\&5      5       rc/ S|Qrdg)zPyTorch SpeechT5 model.    N)nn)BCEWithLogitsLossCrossEntropyLossL1Loss   )initialization)ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)is_deepspeed_zero3_enabled)is_fsdp_managed_module)create_bidirectional_maskcreate_causal_mask)GradientCheckpointingLayer)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutputSeq2SeqSpectrogramOutput)EmbeddingAccessMixinPreTrainedModel)auto_docstringlogging   )SpeechT5ConfigSpeechT5HifiGanConfig	input_idspad_token_iddecoder_start_token_idc                     U R                  U R                  5      nU SS2SS24   R                  5       USS2SS24'   X#SS2S4'   Uc  [        S5      eUR	                  US:H  U5        U$ )z)
Shift input ids one token to the right.
Nr   r   z1self.model.config.pad_token_id has to be defined.i)	new_zerosshapeclone
ValueErrormasked_fill_)r   r    r!   shifted_input_idss       /root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/speecht5/modeling_speecht5.pyshift_tokens_rightr+   4   sz     "++IOO<(CRC0668ae4adLMM""#4#<lK    input_valuesreduction_factorattention_maskc                     US:  a!  U SS2US-
  SU24   n Ub  USS2US-
  SU24   nU R                  U R                  5      nU SS2SS24   R                  5       USS2SS24'   UR                  US:H  S5        X24$ )zo
Shift input spectrograms one timestep to the right. Also applies the reduction factor to the sequence length.
r   Nr#         Y        )r$   r%   r&   r(   )r-   r.   r/   shifted_input_valuess       r*   shift_spectrograms_rightr4   D   s     !#A'7!';'O?O'O$OP%+A/?!/C/WGW/W,WXN'11,2D2DE".q#2#v"6"<"<">AB %%&:f&DcJ//r,   r%   	mask_probmask_length	min_masksreturnc           	        ^^^^^ U u  nmTS:  a  [        S5      eTT:  a  [        ST ST S35      e[        R                  R                  S5      R	                  5       mUUUUU4S jnUb-  UR                  5       R                  S5      R                  5       O[        U5       Vs/ s H  nTPM     snn[        R                  " UT4[        S	9n	/ n
U" T5      nUS
:X  a  U	$ U H  nU" U5      n[        R                  R                  [        R                  " UTS-
  -
  5      USS9n[        U5      S
:X  a  TS-
  nOUS
   n[        R                  " U[        R                  " X-
  [        R                   S	9U-  /5      nU
R#                  U5        M     [        R$                  " U
5      n
[        R&                  " U
SS2SS2S4   X[T45      n
U
R)                  X[T-  5      n
[        R                  " T5      SSSS24   n[        R&                  " UX[T45      R)                  X[T-  5      nU
U-   n
U
R+                  5       TS-
  :  a  TS-
  XTS-
  :  '   [        R,                  " XSS5        U	$ s  snf )a2  
Computes random mask spans for a given shape. Used to implement [SpecAugment: A Simple Data Augmentation Method for
ASR](https://huggingface.co/papers/1904.08779). Note that this method is not optimized to run on TPU and should be run on
CPU as part of the preprocessing during training.

Args:
    shape: The shape for which to compute masks. This should be of a tuple of size 2 where
           the first element is the batch size and the second element is the length of the axis to span.
    mask_prob:  The percentage of the whole axis (between 0 and 1) which will be masked. The number of
                independently generated mask spans of length `mask_length` is computed by
                `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
                actual percentage will be smaller.
    mask_length: size of the mask
    min_masks: minimum number of masked spans
    attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
                    each batch dimension.
r   z&`mask_length` has to be bigger than 0.zO`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: z and `sequence_length`: `c                    > [        TU -  T-  T-   5      n[        UT5      nUT-  T:  a  TT-  nU TS-
  -
  U:  a  [        U TS-
  -
  S5      nU$ )z;Given input length, compute how many spans should be maskedr   r   )intmax)input_lengthnum_masked_spanepsilonr6   r5   r7   sequence_lengths     r*   compute_num_masked_span6_compute_mask_indices.<locals>.compute_num_masked_span   so    i,6DwNOoy9 [(?:-<O ;?+o=!,+/"BAFOr,   Nr#   dtyper   F)replace)r'   nprandomranditemdetachsumtolistrangezerosboolchoicearangelenconcatenateonesint32appendarraybroadcast_toreshaper=   put_along_axis)r%   r5   r6   r/   r7   
batch_sizerB   _input_lengthsspec_aug_maskspec_aug_mask_idxsmax_num_masked_spanr>   r?   spec_aug_mask_idxdummy_mask_idxoffsetsr@   rA   s    `` `            @@r*   _compute_mask_indicesre   Z   s   0 #(JQABB_$]^i]j&&7q:
 	
 iinnQ$$&G $ % 	##B'..0',Z'89'8!o'89  HHj/:$GM1/Ba%1,? II,,IIlkAo67RW - 
  !Q& -q0N.q1NNN(;(MUWU]U] ^ao op
 	!!"34/ &2 "45 1a:&+(V ,33JVa@ab ii$T4]3Goog
'UV^^+5G ,g5 /A"55GVYZGZ!0CCD mB?w :s   (I0c                   2   ^  \ rS rSrSU 4S jjrS rSrU =r$ )SpeechT5NoLayerNormConvLayer   c                 b  > [         TU ]  5         US:  a  UR                  US-
     OSU l        UR                  U   U l        [
        R                  " U R                  U R                  UR                  U   UR                  U   UR                  S9U l
        [        UR                     U l        g )Nr   r   kernel_sizestridebias)super__init__conv_dimin_conv_dimout_conv_dimr   Conv1dconv_kernelconv_stride	conv_biasconvr	   feat_extract_activation
activationselfconfiglayer_id	__class__s      r*   ro   %SpeechT5NoLayerNormConvLayer.__init__   s    <DqL6??8a<8a"OOH5II**84%%h/!!
	 !!?!?@r,   c                 J    U R                  U5      nU R                  U5      nU$ N)rw   ry   r{   hidden_statess     r*   forward$SpeechT5NoLayerNormConvLayer.forward   s$    		-06r,   )ry   rw   rq   rr   r   __name__
__module____qualname____firstlineno__ro   r   __static_attributes____classcell__r~   s   @r*   rg   rg      s    A r,   rg   c                   2   ^  \ rS rSrSU 4S jjrS rSrU =r$ )SpeechT5LayerNormConvLayer   c                   > [         TU ]  5         US:  a  UR                  US-
     OSU l        UR                  U   U l        [
        R                  " U R                  U R                  UR                  U   UR                  U   UR                  S9U l
        [
        R                  " U R                  SS9U l        [        UR                     U l        g )Nr   r   rj   T)elementwise_affine)rn   ro   rp   rq   rr   r   rs   rt   ru   rv   rw   	LayerNorm
layer_normr	   rx   ry   rz   s      r*   ro   #SpeechT5LayerNormConvLayer.__init__   s    <DqL6??8a<8a"OOH5II**84%%h/!!
	 ,,t'8'8TR !?!?@r,   c                     U R                  U5      nUR                  SS5      nU R                  U5      nUR                  SS5      nU R                  U5      nU$ )Nr#   )rw   	transposer   ry   r   s     r*   r   "SpeechT5LayerNormConvLayer.forward   sV    		-0%//B76%//B76r,   ry   rw   rq   r   rr   r   r   r   s   @r*   r   r      s    A r,   r   c                   2   ^  \ rS rSrSU 4S jjrS rSrU =r$ )SpeechT5GroupNormConvLayeri  c                   > [         TU ]  5         US:  a  UR                  US-
     OSU l        UR                  U   U l        [
        R                  " U R                  U R                  UR                  U   UR                  U   UR                  S9U l
        [        UR                     U l        [
        R                  " U R                  U R                  SS9U l        g )Nr   r   rj   T)
num_groupsnum_channelsaffine)rn   ro   rp   rq   rr   r   rs   rt   ru   rv   rw   r	   rx   ry   	GroupNormr   rz   s      r*   ro   #SpeechT5GroupNormConvLayer.__init__  s    <DqL6??8a<8a"OOH5II**84%%h/!!
	 !!?!?@,,$2C2CRVRcRclpqr,   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ r   )rw   r   ry   r   s     r*   r   "SpeechT5GroupNormConvLayer.forward  s2    		-066r,   r   r   r   r   s   @r*   r   r     s    r  r,   r   c            	         ^  \ rS rSrSrSS\S\S\S-  4U 4S jjjrSS\S\S\S-  4S	 jjr\SS\S\S\S-  4S
 jj5       r	\
R                  " 5       SS\
R                  S\4S jj5       r SS\
R                  S\S\S-  4S jjrSrU =r$ )%SpeechT5SinusoidalPositionalEmbeddingi  zDThis module produces sinusoidal positional embeddings of any length.Nnum_positionsembedding_dimpadding_idxc                    > [         TU ]  5         SU l        Xl        X l        X0l        U R                  XR                  -   X#5        g N   )rn   ro   offsetr   r   r   make_weights)r{   r   r   r   r~   s       r*   ro   .SpeechT5SinusoidalPositionalEmbedding.__init__   s>    **&-++5}Rr,   num_embeddingsc                     U R                  XU5      n[        U S5      (       a8  UR                  U R                  R                  U R                  R
                  S9nU R                  SUSS9  g )NweightsrE   deviceF
persistent)get_embeddinghasattrtor   rE   r   register_buffer)r{   r   r   r   emb_weightss        r*   r   2SpeechT5SinusoidalPositionalEmbedding.make_weights(  s\    ((T4##%..t||/A/A$,,J]J].^KYFr,   c                    US-  n[         R                  " S5      US-
  -  n[        R                  " [        R                  " U[        R
                  S9R                  5       U* -  5      n[        R                  " U [        R
                  S9R                  5       R                  S5      UR                  S5      -  n[        R                  " [        R                  " U5      [        R                  " U5      /SS9R                  U S5      nUS-  S:X  a,  [        R                  " U[        R                  " U S5      /SS9nUb  SXBSS24'   UR                  [        R                  " 5       5      $ )	z
Build sinusoidal embeddings. This matches the implementation in tensor2tensor, but differs slightly from the
description in Section 3.5 of "Attention Is All You Need".
r   i'  r   rD   r   dimr#   N)mathlogtorchexprR   int64float	unsqueezecatsincosviewrO   r   get_default_dtype)r   r   r   half_dimembs        r*   r   3SpeechT5SinusoidalPositionalEmbedding.get_embedding0  s    !A%hhuoA.iiXU[[AGGISDPQll>=CCEOOPQRUXUbUbcdUeeii338a@EEnVXY1!))S%++na"@AqIC""#CQvve--/00r,   r   past_key_values_lengthc                    UR                  5       u  p4U R                  XR                  U5      R                  UR                  5      nU R                  S-   U-   nX`R
                  R                  S5      :  a3  U R                  X`R                  -   U R                  U R                  5        U R
                  R                  SUR                  S5      5      R                  X4S5      R                  5       $ )Nr   r   r#   )size"create_position_ids_from_input_idsr   r   r   r   r   r   r   index_selectr   rK   )r{   r   r   bszseq_lenposition_idsmax_poss          r*   r   -SpeechT5SinusoidalPositionalEmbedding.forwardB  s     ~~'>>yJZJZ\rsvv

 ""Q&0\\&&q))g3T5G5GIYIYZ||((L,=,=b,ABGGVXY``bbr,   c                     UR                  U5      R                  5       n[        R                  " USS9R	                  U5      U-   U-  nUR                  5       U-   $ )z
Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding
symbols are ignored. This is modified from fairseq's `utils.make_positions`.

Args:
    x: torch.Tensor x:
Returns: torch.Tensor
r   r   )ner<   r   cumsumtype_aslong)r{   r   r   r   maskincremental_indicess         r*   r   HSpeechT5SinusoidalPositionalEmbedding.create_position_ids_from_input_idsQ  sW     ||K(,,.$||Da8@@FI__cgg"'')K77r,   )r   r   r   r   r   r   )r   r   r   r   __doc__r<   ro   r   staticmethodr   r   no_gradTensorr   r   r   r   r   s   @r*   r   r     s    NSc S# SCRVJ S SG3 Gs GQTW[Q[ G 1c 1# 1CRVJ 1 1" ]]_c cs c c _`88478QTW[Q[8 8r,   r   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )SpeechT5PositionalConvEmbeddingic  c                   > [         TU ]  5         [        R                  " UR                  UR                  UR
                  UR
                  S-  UR                  S9U l        [        R                  R                  n[        [        R                  R                  S5      (       a$  [        R                  R                  R                  n[        5       (       Ga%  SS KnUR                  R                  U R                  R                   SS9   U" U R                  SSS9U l        S S S 5        [        U R                  S5      (       aU  U R                  R                  R                   R"                  nU R                  R                  R                   R$                  nO,U R                  R&                  nU R                  R(                  nUR                  R+                  X5        UR                  R+                  X5        OU" U R                  SSS9U l        [-        UR
                  5      U l        [0        UR2                     U l        g ! , (       d  f       GN,= f)	Nr   )rk   paddinggroupsweight_normr   )modifier_rankweight)namer   parametrizations)rn   ro   r   rs   hidden_sizenum_conv_pos_embeddingsnum_conv_pos_embedding_groupsrw   utilsr   r   r   r   	deepspeedzeroGatheredParametersr   	original0	original1weight_gweight_vregister_external_parameterSpeechT5SamePadLayerr   r	   rx   ry   )r{   r|   r   r   r   r   r~   s         r*   ro   (SpeechT5PositionalConvEmbedding.__init__d  s   II6622a777
	 hh**288,,m<<((33??K%''224993C3CST2U'		aH	 Vtyy"4559955<<FF9955<<FF99--99--NN66tFNN66tF#DIIH!DDI+F,J,JK !?!?@ VUs   I
Ic                     UR                  SS5      nU R                  U5      nU R                  U5      nU R                  U5      nUR                  SS5      nU$ Nr   r   )r   rw   r   ry   r   s     r*   r   'SpeechT5PositionalConvEmbedding.forward  sV    %//15		-0]36%//15r,   )ry   rw   r   r   r   s   @r*   r   r   c  s    AB r,   r   c                   6   ^  \ rS rSrSrSU 4S jjrS rSrU =r$ ) SpeechT5ScaledPositionalEncodingi  uS   
Scaled positional encoding, see §3.2 in https://huggingface.co/papers/1809.08895
c           	        > [         R                  " X25      n[         R                  " SU5      R                  S5      n[         R                  " [         R                  " SUS[         R
                  S9R                  5       [        R                  " S5      U-  * -  5      n[         R                  " UR                  5       U-  5      US S 2SS S24'   [         R                  " UR                  5       U-  5      US S 2SS S24'   UR                  S5      n[        TU ]1  5         U R                  SUSS9  [        R                  " US	9U l        X l        X0l        [        R&                  " [         R(                  " S
5      5      U l        g )Nr   r   r   rD        @peFr   p      ?)r   rO   rR   r   r   r   r   r   r   r   r   rn   ro   r   r   Dropoutdropoutr   max_len	Parametertensoralpha)r{   r
  r   r  r  positiondiv_termr~   s          r*   ro   )SpeechT5ScaledPositionalEncoding.__init__  s!   [[&<<7+55a899U\\!S!5;;GMMOTXT\T\]dTehkTkRllmii 08 ;<1add7ii 08 ;<1add7\\!_T2%8zzG,\\%,,s"34
r,   c                     XR                   U R                  S S 2S UR                  S5      24   -  -   nU R                  U5      nU$ )Nr   )r  r  r   r
  )r{   r   s     r*   r   (SpeechT5ScaledPositionalEncoding.forward  s@    JJMchhqkM)9!:::ll3
r,   )r  r   r
  r  )i  )	r   r   r   r   r   ro   r   r   r   r   s   @r*   r  r    s    5 r,   r  c                   2   ^  \ rS rSrSU 4S jjrS rSrU =r$ )"SpeechT5RelativePositionalEncodingi  c                    > [         TU ]  5         Xl        X l        [        R
                  R                  SU-  U5      U l        g r   )rn   ro   r   
max_lengthr   r   	Embeddingpe_k)r{   r   r  r~   s      r*   ro   +SpeechT5RelativePositionalEncoding.__init__  s4    $HH&&q:~s;	r,   c                    UR                   S   n[        R                  " SU5      R                  UR                  [        R
                  S9nUS S 2S 4   US S S 24   -
  n[        R                  " X0R                  * :  U R                  * U5      n[        R                  " X0R                  :  U R                  S-
  U5      nX0R                  -   nU R                  U5      $ )Nr   r   r   rE   )	r%   r   rR   r   r   r   wherer  r  )r{   r   r   pos_seqs       r*   r   *SpeechT5RelativePositionalEncoding.forward  s    %%a(,,q'*--]5I5IQVQ[Q[-\!T'"WT1W%55++g(884??:JGT++g8$//A:MwWOO+yy!!r,   )r   r  r  )i  r   r   s   @r*   r  r    s    <	" 	"r,   r  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )r   i  c                 R   > [         TU ]  5         US-  S:X  a  SU l        g SU l        g )Nr   r   r   )rn   ro   num_pad_remove)r{   r   r~   s     r*   ro   SpeechT5SamePadLayer.__init__  s)    #:Q#>!#Car,   c                 X    U R                   S:  a  US S 2S S 2S U R                   * 24   nU$ Nr   r"  r   s     r*   r   SpeechT5SamePadLayer.forward  s6    ")!Q0F43F3F2F0F*FGMr,   r&  r   r   s   @r*   r   r     s    K r,   r   c                   8   ^  \ rS rSrSrU 4S jrS rS rSrU =r	$ )SpeechT5FeatureEncoderi  z.Construct the features from raw audio waveformc           	        > [         TU ]  5         UR                  S:X  a@  [        USS9/[	        UR
                  S-
  5       Vs/ s H  n[        XS-   S9PM     sn-   nOVUR                  S:X  a-  [	        UR
                  5       Vs/ s H  n[        XS9PM     nnO[        SUR                   S35      e[        R                  " U5      U l        SU l        S	U l        g s  snf s  snf )
Ngroupr   )r}   r   layerz`config.feat_extract_norm` is z), but has to be one of ['group', 'layer']FT)rn   ro   feat_extract_normr   rN   num_feat_extract_layersrg   r   r'   r   
ModuleListconv_layersgradient_checkpointing_requires_grad)r{   r|   ir0  r~   s       r*   ro   SpeechT5FeatureEncoder.__init__  s    ##w.5fqIJNSTZTrTruvTvNwNNw,V!eDNwN K %%0HMfNlNlHmHm1*6>Hm  K 01I1I0JJst  ==5&+#"Ns   C C%c                 N    U R                  5        H
  nSUl        M     SU l        g )NF)
parametersrequires_gradr2  )r{   params     r*   _freeze_parameters)SpeechT5FeatureEncoder._freeze_parameters  s#    __&E"'E '#r,   c                     US S 2S 4   nU R                   (       a  U R                  (       a  SUl        U R                   H  nU" U5      nM     U$ NT)r2  trainingr7  r0  )r{   r-   r   
conv_layers       r*   r   SpeechT5FeatureEncoder.forward  sK    $QW- 4==*.M'**J&}5M + r,   )r2  r0  r1  )
r   r   r   r   r   ro   r9  r   r   r   r   s   @r*   r)  r)    s    8#&$

 
r,   r)  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )SpeechT5FeatureProjectioni  c                 4  > [         TU ]  5         [        R                  " UR                  S   UR
                  S9U l        [        R                  " UR                  S   UR                  5      U l	        [        R                  " UR                  5      U l        g )Nr#   eps)rn   ro   r   r   rp   layer_norm_epsr   Linearr   
projectionr	  feat_proj_dropoutr
  r{   r|   r~   s     r*   ro   "SpeechT5FeatureProjection.__init__  sf    ,,vr':@U@UV))FOOB$79K9KLzz&":":;r,   c                 n    U R                  U5      nU R                  U5      nU R                  U5      nX4$ r   )r   rG  r
  )r{   r   norm_hidden_statess      r*   r   !SpeechT5FeatureProjection.forward  s7    !__];(:;]300r,   )r
  r   rG  r   r   s   @r*   rA  rA    s    <1 1r,   rA  c                   H  ^  \ rS rSrU 4S jrS r  SS\R                  S\R                  S-  S\R                  S-  4S jjr
S	\S\R                  4S
 jrS\R                  \-  4S jr  SS\R                  S\R                  S-  S\R                  S-  4S jjrSrU =r$ )SpeechT5SpeechEncoderPreneti  c                   > [         TU ]  5         Xl        [        U5      U l        [        U5      U l        UR                  S:  d  UR                  S:  aG  [        R                  " [        R                  " UR                  5      R                  5       5      U l        [!        U5      U l        [%        UR&                  UR(                  -   S-   UR                  UR(                  5      U l        g )Nr2   r   )rn   ro   r|   r)  feature_encoderrA  feature_projectionmask_time_probmask_feature_probr   r  r   r   r   uniform_masked_spec_embedr   pos_conv_embedr   max_speech_positionsr    pos_sinusoidal_embedrI  s     r*   ro   $SpeechT5SpeechEncoderPrenet.__init__  s    5f=";F"C   3&&*B*BS*H%'\\%,,v?Q?Q2R2[2[2]%^D"=fE$I''&*=*==A%
!r,   c                 8    U R                   R                  5         g r   )rQ  r9  r{   s    r*   freeze_feature_encoder2SpeechT5SpeechEncoderPrenet.freeze_feature_encoder  s    //1r,   Nr-   r/   mask_time_indicesc                    U R                  U5      nUR                  SS5      nUb  U R                  UR                  S   U5      nU R	                  U5      u  pTU R                  XSUS9nU R                  U5      nXV-   nUb   UR                  S5      R                  5       nO;[        R                  " UR                  S S [        R                  UR                  S9nU R                  U5      nXX-   nXR4$ )Nr   r   )r_  r/   r   )rQ  r   "_get_feature_vector_attention_maskr%   rR  _mask_hidden_statesrW  r   r   r   rO   r   rY  )	r{   r-   r/   r_  extract_featuresr   positional_conv_embeddingpadding_mask positional_sinusoidal_embeddingss	            r*   r   #SpeechT5SpeechEncoderPrenet.forward  s     //=+55a;%!DD &&q)N
 +/*A*ABR*S'00~ 1 
 %)$7$7$F!%A%),,Q/446L ;;}':':2A'>ejjYfYmYmnL+/+D+D\+R(%H,,r,   feature_vector_lengthc                    UR                  SS9S S 2S4   nU R                  U5      R                  [        R                  5      nUR
                  S   n[        R                  " XQ4UR                  UR                  S9nSU[        R                  " UR
                  S   UR                  S9US-
  4'   UR                  S/5      R                  S5      R                  S/5      R                  5       nU$ )Nr#   r   r   r   r   r   )r    _get_feat_extract_output_lengthsr   r   r   r%   rO   rE   r   rR   fliprP   )r{   rh  r/   non_padded_lengthsoutput_lengthsr\   s         r*   ra  >SpeechT5SpeechEncoderPrenet._get_feature_vector_attention_mask9  s     ,22r2:1b5A>>?QRUUV[V`V`a#))!,
/~7K7KTbTiTi
 uv^%9%9!%<^EZEZ[]kno]opq',,bT299"=BBB4HMMOr,   r^   c                     S n[        U R                  R                  U R                  R                  5       H  u  p4U" XU5      nM     U$ )z8
Computes the output length of the convolutional layers
c                 8    [         R                  " X-
  USS9S-   $ )Nfloor)rounding_moder   )r   div)r>   rk   rl   s      r*   _conv_out_lengthVSpeechT5SpeechEncoderPrenet._get_feat_extract_output_lengths.<locals>._conv_out_lengthN  s      99\7wWZ[[[r,   )zipr|   rt   ru   )r{   r^   ru  rk   rl   s        r*   rk  <SpeechT5SpeechEncoderPrenet._get_feat_extract_output_lengthsI  sG    
	\
 $'t{{'>'>@W@W#XK,]PM $Y r,   r   c                    [        U R                  SS5      (       d  U$ UR                  5       u  pEnUb(  U R                  R	                  UR
                  5      X'   OU R                  R                  S:  a  U R                  (       a  [        XE4U R                  R                  U R                  R                  UU R                  R                  S9n[        R                  " X!R                  [        R                  S9nU R                  R	                  UR
                  5      X'   U R                  R                  S:  a  U R                  (       a  [        XF4U R                  R                  U R                  R                   U R                  R"                  S9n[        R                  " XqR                  [        R                  S9nUSS2S4   R%                  SUS5      nSX'   U$ )	z
Masks extracted features along time axis and/or along feature axis according to
[SpecAugment](https://huggingface.co/papers/1904.08779).
apply_spec_augmentTNr   )r5   r6   r/   r7   r  )r5   r6   r7   r#   )getattrr|   r   rV  r   rE   rS  r=  re   mask_time_lengthmask_time_min_masksr   r  r   rP   rT  mask_feature_lengthmask_feature_min_masksexpand)r{   r   r_  r/   r\   rA   r   mask_feature_indicess           r*   rb  /SpeechT5SpeechEncoderPrenet._mask_hidden_statesY  s    t{{$8$??   4A3E3E3G0
[(/3/E/E/H/HI\I\/]M,[[''!+ 5-++44 KK88-++99! !&->G[G[chcmcm n/3/E/E/H/HI\I\/]M,;;((1,#8)++77 KK;;++<<	$  $)<<0DMaMainisis#t #74#@#G#GO]_#` 23M/r,   )r|   rQ  rR  rV  rW  rY  NN)r   r   r   r   ro   r]  r   r   
LongTensorFloatTensorr   r<   ra  rk  rb  r   r   r   s   @r*   rO  rO    s    
"2 376:	 -ll - ((4/ - !,,t3	 -F ]b]m]m  e>N>NQT>T & 7;26	,((, !,,t3, ((4/	, ,r,   rO  c                   t   ^  \ rS rSrU 4S jrS r S	S\R                  S\R                  S-  4S jjrSr	U =r
$ )
SpeechT5SpeechDecoderPreneti  c           	      f  > [         TU ]  5         Xl        [        R                  " [        UR                  5       Vs/ s H@  n[        R                  " US:X  a  UR                  OUR                  UR                  5      PMB     sn5      U l
        [        R                  " UR                  UR                  5      U l        [        UR                  UR                  UR                  5      U l        [        R                  " UR"                  UR                  -   UR                  5      U l        g s  snf r%  )rn   ro   r|   r   r/  rN   speech_decoder_prenet_layersrF  num_mel_binsspeech_decoder_prenet_unitslayersr   final_layerr  positional_dropoutrX  encode_positionsspeaker_embedding_dimspeaker_embeds_layerr{   r|   r3  r~   s      r*   ro   $SpeechT5SpeechDecoderPrenet.__init__  s    mm vBBC
 DA	 		+,6F''v7Y7Y66 D
 99V%G%GI[I[\ @%%''!

 %'IIf.J.JVM_M_._agasas$t!s   AD.c                     [         R                  " US   US9nUR                  S5      R                  UR	                  S5      SS5      n[         R
                  " US:H  US5      S-  SU-
  -  $ )Nr   r  r   )r   	bernoullir   repeatr   r  )r{   inputs_embedsr  r   	all_maskss        r*   _consistent_dropout/SpeechT5SpeechDecoderPrenet._consistent_dropout  sd    }Q/15NN1%,,]-?-?-BAqI	{{9>=!<q@AEJJr,   Nr-   speaker_embeddingsc                 6   UnU R                    HM  n[        R                  R                  U" U5      5      nU R	                  X0R
                  R                  5      nMO     U R                  U5      nU R                  U5      nUb  [        R                  R                  U5      nUR                  S5      R                  SUR                  S5      S5      n[        R                  " X2/SS9n[        R                  R                  U R                  U5      5      nU$ )Nr   r#   r   )r  r   
functionalrelur  r|   speech_decoder_prenet_dropoutr  r  	normalizer   r  r   r   r   r  )r{   r-   r  r  r,  s        r*   r   #SpeechT5SpeechDecoderPrenet.forward  s     %[[EMM..u]/CDM 44]KKDmDmnM ! ((7--m<)!#!8!89K!L!3!=!=a!@!G!GML^L^_`Lace!f!II}&IrRMMM..t/H/H/WXMr,   )r|   r  r  r  r  r   )r   r   r   r   ro   r  r   r   r   r   r   r   s   @r*   r  r    s=    u,K 37ll "LL4/ r,   r  c                   2   ^  \ rS rSrSU 4S jjrS rSrU =r$ )SpeechT5BatchNormConvLayeri  c           	        > [         TU ]  5         US:X  a  UR                  nOUR                  nX!R                  S-
  :X  a  UR                  nOUR                  n[
        R                  " UUUR                  SUR                  S-
  S-  SS9U l        [
        R                  " U5      U l
        X!R                  S-
  :  a  [
        R                  " 5       U l        OS U l        [
        R                  " UR                  5      U l        g )Nr   r   r   F)rk   rl   r   rm   )rn   ro   r  speech_decoder_postnet_unitsspeech_decoder_postnet_layersr   rs   speech_decoder_postnet_kernelrw   BatchNorm1d
batch_normTanhry   r	  speech_decoder_postnet_dropoutr
  )r{   r|   r}   rq   rr   r~   s        r*   ro   #SpeechT5BatchNormConvLayer.__init__  s    q= --K ==K;;a??!..L!>>LII<<99A=!C
	 ..6::Q>> ggiDO"DOzz&"G"GHr,   c                     U R                  U5      nU R                  U5      nU R                  b  U R                  U5      nU R                  U5      nU$ r   )rw   r  ry   r
  r   s     r*   r   "SpeechT5BatchNormConvLayer.forward  sJ    		-06??& OOM:M]3r,   )ry   r  rw   r
  r   r   r   s   @r*   r  r    s    I< r,   r  c                   l   ^  \ rS rSrU 4S jrS\R                  4S jrS\R                  4S jrSr	U =r
$ )SpeechT5SpeechDecoderPostneti  c           	        > [         TU ]  5         Xl        [        R                  " UR
                  UR                  UR                  -  5      U l        [        R                  " UR
                  UR                  5      U l	        [        R                  " [        UR                  5       Vs/ s H  n[        X5      PM     sn5      U l        g s  snf r   )rn   ro   r|   r   rF  r   r  r.   feat_outprob_outr/  rN   r  r  r  r  s      r*   ro   %SpeechT5SpeechDecoderPostnet.__init__  s    		&"4"4f6I6IFLcLc6cd		&"4"4f6M6MNmm<A&BfBf<gh<gq'2<gh
hs   *Cr   c                    U R                  U5      R                  UR                  S5      SU R                  R                  5      nU R                  U5      nU R                  U5      R                  UR                  S5      S5      nX#U4$ )Nr   r#   )r  r   r   r|   r  postnetr  )r{   r   outputs_before_postnetoutputs_after_postnetlogitss        r*   r   $SpeechT5SpeechDecoderPostnet.forward  s{    !%}!=!B!B=CUCUVWCXZ\^b^i^i^v^v!w $-C D}-22=3E3Ea3H"M%fDDr,   c                     UR                  SS5      nU R                   H  nU" U5      nM     XR                  SS5      -   $ r   )r   r  )r{   r   layer_outputr,  s       r*   r  $SpeechT5SpeechDecoderPostnet.postnet  sB    $..q!4[[E .L !55a;;;r,   )r|   r  r  r  )r   r   r   r   ro   r   r   r   r  r   r   r   s   @r*   r  r    s/    	
EU\\ E<U\\ < <r,   r  c                   J   ^  \ rS rSrU 4S jrS\R                  4S jrSrU =r	$ )SpeechT5TextEncoderPreneti  c                   > [         TU ]  5         Xl        [        R                  " UR
                  UR                  UR                  5      U l        [        UR                  UR                  UR                  5      U l        g r   )rn   ro   r|   r   r  
vocab_sizer   r    embed_tokensr  r  max_text_positionsr  rI  s     r*   ro   "SpeechT5TextEncoderPrenet.__init__  sc    LL):):F<N<NPVPcPcd @%%%%!
r,   r   c                 J    U R                  U5      nU R                  U5      nU$ r   )r  r  )r{   r   r  s      r*   r   !SpeechT5TextEncoderPrenet.forward  s(    )))4--m<r,   )r|   r  r  )
r   r   r   r   ro   r   r   r   r   r   r   s   @r*   r  r    s    
  r,   r  c                   z   ^  \ rS rSrU 4S jr  S	S\R                  S\R                  S-  S\S-  4S jjr	Sr
U =r$ )
SpeechT5TextDecoderPreneti  c                   > [         TU ]  5         Xl        [        R                  " UR
                  5      U l        UR                  (       a   [        R                  " UR                  5      OSU l        [        R                  " UR                  UR                  UR                  5      U l        [!        UR"                  UR                  -   S-   UR                  UR                  5      U l        g )Nr  r   )rn   ro   r|   r   r	  r  r
  scale_embeddingr   sqrtr   embed_scaler  r  r    r  r   r  embed_positionsrI  s     r*   ro   "SpeechT5TextDecoderPrenet.__init__  s    zz&";";<<B<R<R499V%7%78X[LL):):F<N<NPVPcPcdD%%(;(;;a? 
r,   Nr   r/   past_key_valuesc                 "   Ub&  UR                  5       nUR                  SUS   5      nO[        S5      eUc  SOUR                  5       nU R	                  X5      nU R                  U5      U R                  -  nXv-  nU R                  U5      nXr4$ )Nr#   z'You have to specify `decoder_input_ids`r   )r   r   r'   get_seq_lengthr  r  r  r
  )r{   r   r/   r  input_shaper   	positionsr  s           r*   r   !SpeechT5TextDecoderPrenet.forward  s      #..*K!r;r?;IFGG&5&=?CaCaCc((K	)))4t7G7GG"]3,,r,   )r|   r
  r  r  r  r  )r   r   r   r   ro   r   r   r  r
   r   r   r   r   s   @r*   r  r    sI    
" 37(,	-<<- ((4/- 	- -r,   r  c                   V   ^  \ rS rSrU 4S jrS\R                  4S jrS rS r	Sr
U =r$ )SpeechT5TextDecoderPostneti3  c                    > [         TU ]  5         Xl        [        R                  " UR
                  UR                  SS9U l        g )NFrm   )rn   ro   r|   r   rF  r   r  lm_headrI  s     r*   ro   #SpeechT5TextDecoderPostnet.__init__4  s3    yy!3!3V5F5FUSr,   r   c                 $    U R                  U5      $ r   r  r   s     r*   r   "SpeechT5TextDecoderPostnet.forward9  s    ||M**r,   c                     U R                   $ r   r  r\  s    r*   get_output_embeddings0SpeechT5TextDecoderPostnet.get_output_embeddings<  s     ||r,   c                     Xl         g r   r  r{   new_embeddingss     r*   set_output_embeddings0SpeechT5TextDecoderPostnet.set_output_embeddingsA  s    %r,   )r|   r  )r   r   r   r   ro   r   r   r   r  r  r   r   r   s   @r*   r  r  3  s(    T
+U\\ +
& &r,   r  c                   H  ^  \ rS rSrSr    SS\S\S\S-  S\S-  S\S-  S	\S-  4U 4S
 jjjr     SS\	R                  S\	R                  S-  S\S-  S\	R                  S-  S\	R                  S-  S\S\\	R                  \	R                  S-  \S-  4   4S jjrSrU =r$ )SpeechT5AttentioniE  z
Multi-headed attention from 'Attention Is All You Need' paper with relative position bias (see
https://aclanthology.org/N18-2074.pdf)
N	embed_dim	num_headsr
  
is_decoderrm   	layer_idxc                   > [         TU ]  5         Xl        X l        X0l        X-  U l        U R
                  U-  U R                  :w  a  [        SU R                   SU S35      eU R
                  S-  U l        X@l        X`l	        [        R                  " XUS9U l        [        R                  " XUS9U l        [        R                  " XUS9U l        [        R                  " XUS9U l        g )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      r  )rn   ro   r  r  r
  head_dimr'   scalingr  r  r   rF  k_projv_projq_projout_proj)r{   r  r  r
  r  rm   r  r~   s          r*   ro   SpeechT5Attention.__init__K  s     	""!.MMI%$..8MdnnM]$YKr3  }}d*$"ii	4@ii	4@ii	4@		)TBr,   r   key_value_statesr  r/   position_biasoutput_attentionsr8   c                 J   USLnUR                  5       u  pnU R                  U5      U R                  -  nSnUb]  [        U[        5      (       aF  UR
                  R                  U R                  5      nU(       a  UR                  nOUR                  nOUnU(       a  UOUnU(       aQ  UbN  U(       aG  WR                  U R                     R                  nUR                  U R                     R                  nOU R                  U5      nU R                  U5      nUR                  U	SU R                   U R"                  5      R%                  SS5      nUR                  U	SU R                   U R"                  5      R%                  SS5      nUbU  WR'                  UUU R                  5      u  nnU(       a.  [        U[        5      (       a  SUR
                  U R                  '   XR                   -  SU R"                  4nUR                  XU R                   U R"                  5      R%                  SS5      nUR(                  " U6 nUR(                  " U6 nUR(                  " U6 nUR                  S5      n[*        R,                  " UUR%                  SS5      5      nUR                  5       XR                   -  U
U4:w  a.  [/        SXR                   -  U
U4 SUR                  5        35      eUb  UR1                  5       R                  XR                   -  SU R"                  5      R%                  S	S5      n[*        R2                  " UUR%                  S
S5      5      nUR%                  S	S5      R                  XR                   -  UR                  S	5      UR                  S5      5      nUU-  nUbz  UR                  5       U	SU
U4:w  a#  [/        SU	SU
U4 SUR                  5        35      eUR                  XR                   U
U5      U-   nUR                  XR                   -  U
U5      n[4        R6                  R9                  USS9nU(       a=  UR                  XR                   U
U5      nUR                  XR                   -  U
U5      nOSn[4        R6                  R;                  UU R:                  U R<                  S9n[*        R,                  " UU5      nUR                  5       XR                   -  XR"                  4:w  a5  [/        SXR                   XR"                  4 SUR                  5        35      eUR                  XR                   XR"                  5      nUR%                  SS5      nUR)                  XU R>                  5      nU RA                  U5      nUU4$ )z#Input shape: Batch x Time x ChannelNFr#   r   r   Tz$Attention weights should be of size z	, but is r   r   z!Attention mask should be of size r   )r  r=  z `attn_output` should be of size )!r   r  r  
isinstancer   
is_updatedgetr  cross_attention_cacheself_attention_cacher  keysvaluesr  r  r   r  r  r   updaterZ   r   bmmr'   
contiguousmatmulr   r  softmaxr
  r=  r  r  )r{   r   r  r  r/   r  r  kwargsis_cross_attentionr   tgt_lenr]   query_statesr  curr_past_key_valuescurrent_states
key_statesvalue_states
proj_shapesrc_lenattn_weights	reshape_qrel_pos_biasattn_weights_reshaped
attn_probsattn_outputs                             r*   r   SpeechT5Attention.forwardh  s    .T9',,.a {{=1DLL@
&/+>??,77;;DNNK
%+:+P+P(+:+O+O('6$-?)]/"=*-44T^^DIIJ/66t~~FMML^4J;;~6L#b$..$--PZZ[\^_`J',,S"dnndmmT^^_`bcdL*+?+F+FzS_aeaoao+p(
L%*_FY*Z*ZAEO..t~~>NN*B>
#((t~~t}}U__`acde#++Z8''4
#++Z8//!$yyz/C/CAq/IJ3#7'"JJ6nn8LgW^7_6` a %%'(*  $$//166s^^7KRQUQ^Q^_iijkmnoI <<	=3J3J2r3RSL'11!Q7<<nn$m&8&8&;]=O=OPQ=RL L(L%""$a'(BB 7a'8R7SS\]k]p]p]r\st  (,,S..'7SVddL',,S>>-A7GTL}},,\r,B
 %1$5$5c>>7T[$\!055cNN6JGU\]L$(!]]**<4<<RVR_R_*`
ii
L9#"6!OO2CR_R_3`2a b$$&') 
 "&&sNNG]]S!++Aq1 "))#GmmK0111r,   )r
  r  r  r  r  r  r  r  r  r  r  )r2   FTN)NNNNF)r   r   r   r   r   r<   r   rP   ro   r   r   r
   tupler   r   r   r   s   @r*   r  r  E  s    !$"' !%CC C 	C
 4KC TkC $;C C@ 15(,.2-1"'r2||r2  ,,-r2 	r2
 t+r2 ||d*r2  r2 
u||U\\D0%$,>	?r2 r2r,   r  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )SpeechT5FeedForwardi  c                   > [         TU ]  5         [        R                  " UR                  5      U l        [        R                  " UR                  U5      U l        [        UR                  [        5      (       a  [        UR                     U l        OUR                  U l        [        R                  " X!R                  5      U l        [        R                  " UR                  5      U l        g r   )rn   ro   r   r	  activation_dropoutintermediate_dropoutrF  r   intermediate_denser  
hidden_actstrr	   intermediate_act_fnoutput_densehidden_dropoutoutput_dropout)r{   r|   intermediate_sizer~   s      r*   ro   SpeechT5FeedForward.__init__  s    $&JJv/H/H$I!"$))F,>,>@Q"Rf''--'-f.?.?'@D$'-'8'8D$II&79K9KL jj)>)>?r,   c                     U R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nU R	                  U5      nU$ r   )r  r  r  r  r!  r   s     r*   r   SpeechT5FeedForward.forward  sX    //>00?11-@))-8++M:r,   )r  r  r  r  r!  r   r   s   @r*   r  r    s    @ r,   r  c            	          ^  \ rS rSrS\4U 4S jjr   SS\R                  S\R                  S-  S\R                  S-  S\4S	 jjr	S
r
U =r$ )SpeechT5EncoderLayeri  r|   c                   > [         TU ]  5         [        UR                  UR                  UR
                  SS9U l        [        R                  " UR                  5      U l
        [        R                  " UR                  UR                  S9U l        [        XR                  5      U l        [        R                  " UR                  UR                  S9U l        g )NF)r  r  r
  r  rC  )rn   ro   r  r   encoder_attention_headsattention_dropout	attentionr   r	  r   r
  r   rE  r   r  encoder_ffn_dimfeed_forwardfinal_layer_normrI  s     r*   ro   SpeechT5EncoderLayer.__init__  s    *((44,,	
 zz&"7"78,,v'9'9v?T?TU/8N8NO "V-?-?VEZEZ [r,   Nr   r/   r  r  c                     UnU R                  UUUUS9u  pU R                  U5      nXQ-   nU R                  U5      nXR                  U5      -   nU R	                  U5      nU4nU(       a  Xv4-  nU$ )ay  
Args:
    hidden_states (`torch.FloatTensor`):
        input to the layer of shape `(batch, seq_len, hidden_size)`
    attention_mask (`torch.FloatTensor`):
        attention mask of size `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very
        large negative values.
    position_bias (`torch.FloatTensor`):
        relative position embeddings of size `(seq_len, seq_len, hidden_size // encoder_attention_heads)`
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
)r   r/   r  r  )r+  r
  r   r-  r.  )r{   r   r/   r  r  residualr  outputss           r*   r   SpeechT5EncoderLayer.forward  s    ( !&*nn')'/	 '5 '
# ]3 06%(9(9-(HH--m< "&Gr,   )r+  r
  r-  r.  r   )NNF)r   r   r   r   r   ro   r   r   rP   r   r   r   r   s   @r*   r'  r'    sb    \~ \  /3-1"'(||( t+( ||d*	(
  ( (r,   r'  c                      ^  \ rS rSrSS\4U 4S jjjr      SS\R                  S\R                  S-  S\R                  S-  S\R                  S-  S	\S-  S
\	S-  S\	S-  4S jjr
SrU =r$ )SpeechT5DecoderLayeri.  Nr|   c                 t  > [         TU ]  5         [        UR                  UR                  UR
                  SUS9U l        [        R                  " UR                  5      U l
        [        R                  " UR                  UR                  S9U l        [        UR                  UR                  UR
                  SUS9U l        [        R                  " UR                  UR                  S9U l        [!        XR"                  5      U l        [        R                  " UR                  UR                  S9U l        g )NT)r  r  r
  r  r  rC  )r
  r  r  )rn   ro   r  r   decoder_attention_headsr*  	self_attnr   r	  r   r
  r   rE  self_attn_layer_normencoder_attnencoder_attn_layer_normr  decoder_ffn_dimr-  r.  )r{   r|   r  r~   s      r*   ro   SpeechT5DecoderLayer.__init__/  s    *((44,,
 zz&"7"78$&LL1C1CI^I^$_!-**,,
 (*||F4F4FFLaLa'b$/8N8NO "V-?-?VEZEZ [r,   r   r/   encoder_hidden_statesencoder_attention_maskr  r  	use_cachec                 j   Un	U R                  UUUUS9u  pU R                  U5      nX-   nU R                  U5      nSnUb=  Un	U R                  UUUUUS9u  pU R                  U5      nX-   nU R	                  U5      nXR                  U5      -   nU R                  U5      nU4nU(       a  XU4-  nU$ )ab  
Args:
    hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, hidden_size)`
    attention_mask (`torch.FloatTensor`): attention mask of size
        `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
    encoder_hidden_states (`torch.FloatTensor`):
        cross attention input to the layer of shape `(batch, seq_len, hidden_size)`
    encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size
        `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
    past_key_values (`Cache`): cached past key and value projection states
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
)r   r  r/   r  N)r   r  r/   r  r  )r8  r
  r9  r:  r;  r-  r.  )r{   r   r/   r>  r?  r  r  r@  r  r1  self_attn_weightscross_attn_weightsr2  s                r*   r   SpeechT5DecoderLayer.forwardG  s    2 ! ,0>>'+)/	 ,: ,
( ]3 011-@ " ,$H040A0A+!65 /"3 1B 1-M !LL7M$4M 88GM &(9(9-(HH--m< "+=>>Gr,   )r
  r:  r;  r-  r.  r8  r9  r   )NNNNFT)r   r   r   r   r   ro   r   r   r
   rP   r   r   r   r   s   @r*   r5  r5  .  s    \~ \ \6 /3596:(,).!%?||? t+?  %||d2	?
 !&t 3? ?  $;? $;? ?r,   r5  c                   z    \ rS rSr% \\S'   SrSrSrSr	\
R                  " 5       S\R                  4S j5       rS	rg
)SpeechT5PreTrainedModeli  r|   speecht5r-   audioTmodulec           
         U R                   R                  n[        U[        5      (       a  [        R
                  " UR                  R                  SS[        R                  " SUR                  R                  S   UR                  R                  -  -  5      -  S9  [        R                  " UR                  R                  S5        GO[        U[        5      (       Gak  [        R                  " UR                   5        UR"                  UR$                  pC[&        R(                  " XC5      n[&        R*                  " SU5      R-                  S5      n[&        R.                  " [&        R*                  " SUS[&        R0                  S9R3                  5       [        R4                  " S5      U-  * -  5      n[&        R6                  " UR3                  5       U-  5      USS2SSS24'   [&        R8                  " UR3                  5       U-  5      USS2SSS24'   UR-                  S5      n[        R:                  " UR<                  U5        GO[        U[>        5      (       a  [        R                  " SUR@                  RB                  -  5      n[        RD                  " UR@                  R                  U* US9  [        RD                  " UR@                  R                  U* US9  GOh[        U[F        RH                  5      (       aO  [        R
                  " UR                  S	US9  UR                  b   [        RJ                  " UR                  5        GO[        U[F        RL                  [F        RN                  [F        RP                  45      (       a  [        RJ                  " UR                  5        [        R                  " UR                  5        [S        US
S5      b`  [        RJ                  " URT                  5        [        R                  " URV                  5        [        RJ                  " URX                  5        GO[        U[F        RZ                  5      (       a  [        R\                  " UR                  5        UR                  b^  [        R                  " UR^                  UR                  UR                  S   -  -  5      n[        RD                  " UR                  U* US9  GO`[        U[F        R`                  5      (       aw  [        R
                  " UR                  S	US9  URb                  bI  [S        UR                  SS5      (       d-  [        RJ                  " UR                  URb                     5        O[        U[d        5      (       a`  URg                  URh                  URj                  -   URl                  URb                  5      n	[        R:                  " URn                  U	5        OU[        U[p        5      (       a@  [        RJ                  " URr                  5        [        R                  " URt                  5        [w        US5      (       a!  [        RD                  " URx                  5        gg)zInitialize the weightsr   r   r   )meanstdrD   r  N)abr2   running_mean_is_hf_initializedFrV  )=r|   initializer_ranger  r   initnormal_rw   r   r   r  rk   in_channels	constant_rm   r  ones_r  r   r  r   rO   rR   r   r   r   r   r   r   r   copy_r  rA  rG  in_featuresrU  r   rF  zeros_r   r   r  r{  rO  running_varnum_batches_trackedrs   kaiming_normal_r   r  r   r   r   r   r   r   r   SpeechT5HifiGanrK  scaler   rV  )
r{   rI  rL  r   r  r  r  r  kr   s
             r*   _init_weights%SpeechT5PreTrainedModel._init_weights  s=    kk++f=>>LL""		!v{{'>'>q'AFKKD[D['["\]]
 NN6;;++Q/ @AAJJv||$!::v~~W*B||Aw/99!<Hyyaau{{!K!Q!Q!SX\X`X`ahXiloXoVp!pqH))HNN$4x$?@Bq!$Q$wK))HNN$4x$?@Bq!$Q$wKaBJJvyy"% 9::		!f//;;;<AMM&++22qbA>MM&++00QB!<		**LLSc:{{&FKK(r||R^^ LMMKK$JJv}}%v~t4@F//0

6--.F667		**  /{{&IIfmmv/A/AFDVDVWXDY/YZ[fkkaR15--LLSc:!!-gfmmMach6i6iFMM&*<*<=> EFF ..$$v}}4f6J6JFL^L^K JJv~~{300KK$JJv||$6.//MM&223 0r,    N)r   r   r   r   r   __annotations__base_model_prefixmain_input_nameinput_modalitiessupports_gradient_checkpointingr   r   r   Moduler`  r   rb  r,   r*   rF  rF    sB    "$O&*#
]]_74BII 74 74r,   rF  c                      ^  \ rS rSrSrS\4U 4S jjr    SS\R                  S\R                  S-  S\
S-  S	\
S-  S
\
S-  S\\-  4S jjrSrU =r$ )SpeechT5Encoderi  zm
Transformer encoder consisting of *config.encoder_layers* layers. Each layer is a [`SpeechT5EncoderLayer`].
r|   c                   > [         TU ]  U5        [        R                  " UR                  UR
                  S9U l        [        R                  " UR                  5      U l	        UR                  U l        [        R                  " [        UR                  5       Vs/ s H  n[        U5      PM     sn5      U l        [#        UR                  UR$                  -  UR&                  5      U l        SU l        U R-                  5         g s  snf )NrC  F)rn   ro   r   r   r   rE  r   r	  r   r
  encoder_layerdrop	layerdropr/  rN   encoder_layersr'  r  r  r)  encoder_max_relative_positionr  r1  	post_init)r{   r|   r]   r~   s      r*   ro   SpeechT5Encoder.__init__  s     ,,v'9'9v?T?TUzz&"7"7811mm5QWQfQfKg$hKga%9&%AKg$hiA&"@"@@&BfBf 
 ',# 	 %is   DNr   r/   r  output_hidden_statesreturn_dictr8   c                 ,   Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  n[	        U R                   UUS9nU R                  U5      nU R                  U5      nU R                  U5      n[        5       =(       d    [        U 5      nU(       a  SOSn	U(       a  SOSn
[        U R                  5       H  u  pU(       a  X4-   n	SnU R                  (       a$  [        R                  " / 5      nXR                  :  nU(       a  U(       a  U" UUUUS9nUS   nU(       a  SnU(       d  Mv  U
WS   4-   n
M     U(       a  X4-   n	U(       d  [!        S	 XU
4 5       5      $ [#        UU	U
S
9$ )a  
Args:
    hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, feature_size)`):
        Features extracted from the speech or text input by the encoder prenet.
    attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
        Mask to avoid performing convolution and attention on padding token indices. Mask values selected in
        `[0, 1]`:

        - 1 for tokens that are **not masked**,
        - 0 for tokens that are **masked**.

        [What are attention masks?](../glossary#attention-mask)
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
    output_hidden_states (`bool`, *optional*):
        Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
        for more detail.
    return_dict (`bool`, *optional*):
        Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
N)r|   r  r/   rb  F)r/   r  r  r   r  r   c              3   .   #    U  H  oc  M  Uv   M     g 7fr   rb  .0vs     r*   	<genexpr>*SpeechT5Encoder.forward.<locals>.<genexpr>4  s     m$[q$[s   	last_hidden_stater   
attentions)r|   r  rr  rs  r   r   r
  r  r   r   	enumerater  r=  r   rI   rm  r  r   )r{   r   r/   r  rr  rs  r  r  synced_gpusall_hidden_statesall_self_attentionsidxencoder_layerskip_the_layerdropout_probabilitylayer_outputss                   r*   r   SpeechT5Encoder.forward  s   < 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++BYBY2;;')
 6]3,,];02R6LT6R"6BD$5b4"+DKK"8C#$58H$H! #N}}&+jjn#!4~~!E![ -!#1"/&7	! !.a 0 ,  &9]1=M<O&O#1 #94   14D Dm]GZ$[mmm++*
 	
r,   )r
  r  r1  r   rm  r  NNNNr   r   r   r   r   r   ro   r   r  r   rP   r  r   r   r   r   r   s   @r*   rj  rj    s    ~ ( /3)-,0#'X
((X
 t+X
  $;	X

 #TkX
 D[X
 
	 X
 X
r,   rj  c                      ^  \ rS rSrSrS\4U 4S jjr    SS\R                  S\R                  S-  S\
S-  S	\
S-  S
\
S-  S\\-  4S jjrSrU =r$ )SpeechT5EncoderWithSpeechPreneti=  z
Wrapper around SpeechT5Encoder that applies SpeechT5SpeechEncoderPrenet to convert the audio waveform data to
hidden features.
r|   c                    > [         TU ]  U5        [        U5      U l        [	        U5      U l        U R                  5         g r   )rn   ro   rO  prenetrj  wrapped_encoderrp  rI  s     r*   ro   (SpeechT5EncoderWithSpeechPrenet.__init__C  5     1&9.v6 	r,   Nr-   r/   r  rr  rs  r8   c                 R    U R                  X5      u  prU R                  UUUUUS9nU$ N)r   r/   r  rr  rs  r  r  	r{   r-   r/   r  rr  rs  r  r   r2  s	            r*   r   'SpeechT5EncoderWithSpeechPrenet.forwardK  s@     )-L(Q%&&')/!5# ' 
 r,   r  r  r  r   s   @r*   r  r  =  s    
~  /3)-,0#''' t+  $;	
 #Tk D[ 
	  r,   r  c                      ^  \ rS rSrSrS\4U 4S jjrS rS r    SS\	R                  S	\	R                  S-  S
\S-  S\S-  S\S-  S\\-  4S jjrSrU =r$ )SpeechT5EncoderWithTextPrenetia  zt
Wrapper around SpeechT5Encoder that applies SpeechT5TextEncoderPrenet to convert the input_ids to hidden features.
r|   c                    > [         TU ]  U5        [        U5      U l        [	        U5      U l        U R                  5         g r   )rn   ro   r  r  rj  r  rp  rI  s     r*   ro   &SpeechT5EncoderWithTextPrenet.__init__f  5     /7.v6 	r,   c                 6    U R                   R                  5       $ r   r  get_input_embeddingsr\  s    r*   r  2SpeechT5EncoderWithTextPrenet.get_input_embeddingsn      {{//11r,   c                 :    U R                   R                  U5        g r   r  set_input_embeddingsr{   values     r*   r  2SpeechT5EncoderWithTextPrenet.set_input_embeddingsq      ((/r,   Nr-   r/   r  rr  rs  r8   c                 N    U R                  U5      nU R                  UUUUUS9nU$ r  r  r  s	            r*   r   %SpeechT5EncoderWithTextPrenet.forwardt  s=     L1&&')/!5# ' 
 r,   r  r  )r   r   r   r   r   r   ro   r  r  r   r  r   rP   r  r   r   r   r   r   s   @r*   r  r  a  s    ~ 20 /3)-,0#''' t+  $;	
 #Tk D[ 
	  r,   r  c                      ^  \ rS rSrSrS\4U 4S jjr    SS\R                  S\R                  S-  S\
S-  S	\
S-  S
\
S-  S\\-  4S jjrSrU =r$ )SpeechT5EncoderWithoutPreneti  
This wrapper class is a helper class to correctly load pretrained checkpoints when used in combination with
[`SpeechT5Model`].
r|   c                 d   > [         TU ]  U5        [        U5      U l        U R	                  5         g r   )rn   ro   rj  r  rp  rI  s     r*   ro   %SpeechT5EncoderWithoutPrenet.__init__  )     .v6 	r,   Nr-   r/   r  rr  rs  r8   c                 (    U R                  UUUUUS9$ r  r  )r{   r-   r/   r  rr  rs  r  s          r*   r   $SpeechT5EncoderWithoutPrenet.forward  s+     ##&)/!5# $ 
 	
r,   r  r  r  r   s   @r*   r  r    s    
~  /3)-,0#'
''
 t+
  $;	

 #Tk
 D[
 
	 
 
r,   r  c                     ^  \ rS rSrSrS\4U 4S jjr         SS\R                  S-  S\R                  S-  S\R                  S-  S	\R                  S-  S
\
S-  S\S-  S\S-  S\S-  S\S-  S\\-  4S jjrSrU =r$ )SpeechT5Decoderi  zl
Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`SpeechT5DecoderLayer`]
r|   c           
      
  > [         TU ]  U5        UR                  U l        [        R
                  " [        UR                  5       Vs/ s H  n[        XS9PM     sn5      U l	        SU l
        U R                  5         g s  snf )N)r  F)rn   ro   decoder_layerdroprm  r   r/  rN   decoder_layersr5  r  r1  rp  r  s      r*   ro   SpeechT5Decoder.__init__  sl     11mmX]^d^s^sXt$uXtST%9&%NXt$uv&+# 	 %vs   	B Nr   r/   r>  r?  r  r@  r  rr  rs  r8   c
                 `   Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nU	b  U	OU R                   R                  n	U R
                  (       a/  U R                  (       a  U(       a  [        R                  S5        SnU(       a1  Uc.  [        [        U R                   S9[        U R                   S95      n[        U R                   UUUS9nUb  Ub  [        U R                   UUUS9n[        5       =(       d    [        U 5      nU(       a  SOSnU(       a  SOSnU(       a  Ub  SOSn[        U R                   5       H  u  nnU(       a  X4-   nSnU R                  (       a%  ["        R$                  " / 5      nUU R&                  :  nU(       a	  U(       d  MZ  U" UUUUUUUS9nUS	   nU(       d  Mt  UUS
   4-   nUc  M  UUS   4-   nM     U(       a  X4-   nU	(       d  [)        S XXU4 5       5      $ [+        UUUUUS9$ )a)  
Args:
    hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, feature_size)`):
        Features extracted from the speech or text input by the decoder prenet.
    attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
        Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

        - 1 for tokens that are **not masked**,
        - 0 for tokens that are **masked**.

        [What are attention masks?](../glossary#attention-mask)
    encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
        Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
        of the decoder.
    encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
        Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
        selected in `[0, 1]`:

        - 1 for tokens that are **not masked**,
        - 0 for tokens that are **masked**.

        [What are attention masks?](../glossary#attention-mask)
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
        cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.

        If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
        that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
        all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
    inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
        Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
        This is useful if you want more control over how to convert `input_ids` indices into associated vectors
        than the model's internal embedding lookup matrix.
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
    output_hidden_states (`bool`, *optional*):
        Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
        for more detail.
    return_dict (`bool`, *optional*):
        Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
NzZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...F)r|   )r|   r  r/   r  )r|   r  r/   r>  rb  )r?  r  r  r@  r   r   r   c              3   0   #    U  H  nUc  M  Uv   M     g 7fr   rb  rv  s     r*   ry  *SpeechT5Decoder.forward.<locals>.<genexpr>:  s      wA ws   	)r|  r  r   r}  cross_attentions)r|   r  rr  r@  rs  r1  r=  loggerwarning_oncer   r   r   r   r   r   r~  r  r   rI   rm  r  r   )r{   r   r/   r>  r?  r  r@  r  rr  rs  r  r  r  r  all_cross_attentionsr  decoder_layerr  r  r  s                       r*   r   SpeechT5Decoder.forward  sH   r 2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	%0%<k$++BYBY&&4==##p "	01,dkk2RT`hlhshsTtuO+;;')+	
 !,1G1S%>{{+5&;	&" 12R6LT6R #7BD$5b4&7<Q<]rdh"+DKK"8C#$58H$H! #N}}&+jjn#!4t~~!Ek)%'= /"3#M *!,M  &9]1=M<O&O#(4+?=QRCSBU+U(5 #98   14D D ':Kbvw   9+++*1
 	
r,   )r1  rm  r  	NNNNNNNNNr   r   r   r   r   r   ro   r   r  r  r
   rP   r  r   r   r   r   r   s   @r*   r  r    s    	~ 	 3726:>:>(,!%)-,0#'M
((4/M
 ((4/M
  %0047	M

 !& 0 04 7M
 M
 $;M
  $;M
 #TkM
 D[M
 
:	:M
 M
r,   r  c                   (  ^  \ rS rSrSrS\4U 4S jjr          SS\R                  S-  S\R                  S-  S\R                  S-  S	\R                  S-  S
\R                  S-  S\S-  S\S-  S\S-  S\S-  S\S-  S\\-  4S jjrSrU =r$ )SpeechT5DecoderWithSpeechPrenetiI  z|
Wrapper around SpeechT5Decoder that applies SpeechT5SpeechDecoderPrenet to convert log-mel filterbanks to hidden
features.
r|   c                    > [         TU ]  U5        [        U5      U l        [	        U5      U l        U R                  5         g r   )rn   ro   r  r  r  wrapped_decoderrp  rI  s     r*   ro   (SpeechT5DecoderWithSpeechPrenet.__init__O  r  r,   Nr-   r/   r>  r?  r  r  r@  r  rr  rs  r8   c                 V    U R                  X5      nU R                  UUUUUUUU	U
S9	nU$ N)	r   r/   r>  r?  r  r@  r  rr  rs  r  r  )r{   r-   r/   r>  r?  r  r  r@  r  rr  rs  r  decoder_hidden_statesr2  s                 r*   r   'SpeechT5DecoderWithSpeechPrenet.forwardW  sJ     !%L M&&/)"7#9+/!5# ' 

 r,   r  )
NNNNNNNNNN)r   r   r   r   r   r   ro   r   r  r  r   r
   rP   r  r   r   r   r   r   s   @r*   r  r  I  s    
~  2626:>:>26(,!%)-,0#'''$. ((4/  %0047	
 !& 0 04 7 "LL4/  $;  $; #Tk D[ 
:	: r,   r  c                     ^  \ rS rSrSrS\4U 4S jjrS rS r         SS\	R                  S-  S	\	R                  S-  S
\	R                  S-  S\	R                  S-  S\S-  S\S-  S\S-  S\S-  S\S-  S\\-  4S jjrSrU =r$ )SpeechT5DecoderWithTextPrenetiv  zs
Wrapper around SpeechT5Decoder that applies SpeechT5TextDecoderPrenet to convert input tokens to hidden features.
r|   c                    > [         TU ]  U5        [        U5      U l        [	        U5      U l        U R                  5         g r   )rn   ro   r  r  r  r  rp  rI  s     r*   ro   &SpeechT5DecoderWithTextPrenet.__init__{  r  r,   c                 6    U R                   R                  5       $ r   r  r\  s    r*   r  2SpeechT5DecoderWithTextPrenet.get_input_embeddings  r  r,   c                 :    U R                   R                  U5        g r   r  r  s     r*   r  2SpeechT5DecoderWithTextPrenet.set_input_embeddings  r  r,   Nr-   r/   r>  r?  r  r@  r  rr  rs  r8   c
                 \    U R                  XU5      u  pU R                  UUUUUUUUU	S9	nU$ r  r  )r{   r-   r/   r>  r?  r  r@  r  rr  rs  r  r  r2  s                r*   r   %SpeechT5DecoderWithTextPrenet.forward  sP     15LZi0j-&&/)"7#9+/!5# ' 

 r,   r  r  )r   r   r   r   r   r   ro   r  r  r   r  r  r
   rP   r  r   r   r   r   r   s   @r*   r  r  v  s    ~ 20
 2626:>:>(,!%)-,0#'''$. ((4/  %0047	
 !& 0 04 7  $;  $; #Tk D[ 
:	: r,   r  c                     ^  \ rS rSrSrS\4U 4S jjr         SS\R                  S-  S\R                  S-  S\R                  S-  S	\R                  S-  S
\
S-  S\S-  S\S-  S\S-  S\S-  S\\-  4S jjrSrU =r$ )SpeechT5DecoderWithoutPreneti  r  r|   c                 d   > [         TU ]  U5        [        U5      U l        U R	                  5         g r   )rn   ro   r  r  rp  rI  s     r*   ro   %SpeechT5DecoderWithoutPrenet.__init__  r  r,   Nr-   r/   r>  r?  r  r@  r  rr  rs  r8   c
                 4    U R                  UUUUUUUUU	S9	nU$ r  r  )r{   r-   r/   r>  r?  r  r@  r  rr  rs  r  r2  s               r*   r   $SpeechT5DecoderWithoutPrenet.forward  s;     &&&)"7#9+/!5# ' 

 r,   r  r  r  r   s   @r*   r  r    s    
~  2626:>:>(,!%)-,0#'''$. ((4/  %0047	
 !& 0 04 7  $;  $; #Tk D[ 
:	: r,   r  c                      ^  \ rS rSrSrS\4U 4S jjrS\R                  S\R                  S\R                  S\R                  4S	 jrS
 r\S 5       rSrU =r$ )$SpeechT5GuidedMultiheadAttentionLossi  z
Guided attention loss from the paper [Efficiently Trainable Text-to-Speech System Based on Deep Convolutional
Networks with Guided Attention](https://huggingface.co/papers/1710.08969), adapted for multi-head attention.
r|   c                 f   > [         TU ]  5         UR                  U l        UR                  U l        g r   )rn   ro   guided_attention_loss_sigmasigmaguided_attention_loss_scaler^  rI  s     r*   ro   -SpeechT5GuidedMultiheadAttentionLoss.__init__  s(    77
77
r,   r}  input_masksoutput_masksr8   c                 D   U R                  X#UR                  5      nUR                  S5      UR                  S5      -  nUR                  UR                  5      R                  S5      nXA-  n[        R
                  " UR                  U5      5      nU R                  U-  $ )a  
Compute the attention loss.

Args:
    attentions (`torch.FloatTensor` of shape `(batch_size, layers * heads, output_sequence_length, input_sequence_length)`):
        Batch of multi-head attention weights
    input_masks (`torch.BoolTensor` of shape `(batch_size, input_sequence_length)`):
        Input attention mask as booleans.
    output_masks (`torch.BoolTensor` of shape `(batch_size, output_sequence_length)`):
        Target attention mask as booleans.

Returns:
    `torch.Tensor` with the loss value
r#   r   r   )_make_guided_attention_masksr   r   r   r   rK  masked_selectr^  )r{   r}  r  r  guided_attn_masksmaskslosseslosss           r*   r   ,SpeechT5GuidedMultiheadAttentionLoss.forward  s    " !==kYcYjYjk&&r*[-B-B2-FF**+55a8"/zz&..u56zzD  r,   c                 j   UR                  S5      nUR                  S5      n[        R                  " [        U5      UR                  S   UR                  S   4US9n[        [        XE5      5       H.  u  nu  pU R                  XU R                  U5      XgS U	2S U24'   M0     UR                  S5      $ )Nr#   r   rj  )
rL   r   rO   rS   r%   r~  rw  _make_guided_attention_maskr  r   )
r{   r  r  r   r^   rn  r  r  ilenolens
             r*   r  ASpeechT5GuidedMultiheadAttentionLoss._make_guided_attention_masks  s    #+%))"-!KK[)9<;M;Ma;PR]RcRcdeRf(gpvw!*3}+M!NC$373S3STX`d`j`jlr3s5D5%4%/0 "O !**1--r,   c                    [         R                  " [         R                  " XS9[         R                  " XS9SS9u  pEUR                  5       U-  nUR                  5       U -  nS[         R                  " XE-
  S-  * SUS-  -  -  5      -
  $ )Nrj  xy)indexingr  r   )r   meshgridrR   r   r   )r>   output_lengthr  r   grid_ygrid_xs         r*   r  @SpeechT5GuidedMultiheadAttentionLoss._make_guided_attention_mask  sz    LL5LL6

 -/,.UYY&/a!78ANKLLLr,   )r^  r  )r   r   r   r   r   r   ro   r   r  
BoolTensorr   r   r  r   r  r   r   r   s   @r*   r  r    sj    
8~ 8
!++!:?:J:J!Z_ZjZj!	!2	. M Mr,   r  c                      ^  \ rS rSrSrS\4U 4S jjr SS\R                  S\R                  S\R                  S	\R                  S
\R                  S\R                  S-  S\R                  4S jjrSrU =r$ )SpeechT5SpectrogramLossi
  z3
Loss computation used by SpeechT5ForTextToSpeech.
r|   c                 .  > [         TU ]  5         UR                  U l        UR                  U l        UR                  U l        [        5       U l        [        [        R                  " S5      S9U l
        U R                  (       a  [        U5      U l        g g )Ng      @)
pos_weight)rn   ro   use_guided_attention_lossguided_attention_loss_num_headsr.   r   l1_criterionr   r   r  bce_criterionr  attn_criterionrI  s     r*   ro    SpeechT5SpectrogramLoss.__init__  ss    )/)I)I&/5/U/U, & 7 7"H.%,,s:KL))"Fv"ND *r,   Nr/   r  r  r  labelsr  r8   c           	      V   US:g  nUR                  U5      nUR                  U5      nUR                  U5      nU R                  X55      U R                  X%5      -   nUS S 2S S 2S4   n	[        R                  " U	) S-  [        R                  " U	R                  S5      S5      R                  U	R                  5      /SS9n
U
S S 2SS 24   R                  U	5      n
UR                  U	5      nU R                  XJ5      nX-   nU R                  (       a  [        R                  " U Vs/ s H  oS S 2S U R                  24   PM     snSS9nUS:H  nUS S 2S S 2S4   nU R                  S:  a#  US S 2U R                  S-
  S U R                  24   nU R                  XU5      nUU-  nU$ s  snf )Nr1   r   r  r   r   )r  r  r   r   rU   r   r   r   r  r  r  r.   r  )r{   r/   r  r  r  r  r  re  l1_lossr  stop_labelsbce_lossr  xattnr  r  	attn_losss                     r*   r   SpeechT5SpectrogramLoss.forward  s    ' %%l3!7!E!El!S 5 C CL Q ##$9BTEVEVWmEvv Q1W%ii%#uzz%**Q-/K/N/Nu||/\ ]cde!!QR%(66u=%%e, %%f: ! ))99TdeTdq#IT%I%I#I IJTdeklmD(A-K'1a0L$$q(+At/D/Dq/H/aDLaLa/a,ab++D|LIID fs   %F&)r  r  r  r  r.   r  r   )r   r   r   r   r   r   ro   r   r  r  r   r   r   r   r   s   @r*   r  r  
  s    
O~ 
O& 6:)(() !& 1 1)  %00	)
 !!) !!)  ++d2) 
) )r,   r  zv
    The bare SpeechT5 Encoder-Decoder Model outputting raw hidden-states without any specific pre- or post-nets.
    custom_introc                     ^  \ rS rSr  SS\S\R                  S-  S\R                  S-  4U 4S jjjrS rS r	S	 r
\           SS
\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\\\R                         S-  S\S-  S\S-  S\R                   S-  S\S-  S\S-  S\S-  S\\R                      \-  4S jj5       rSrU =r$ )SpeechT5ModeliG  Nr|   encoderdecoderc                    > [         TU ]  U5        Xl        Uc  [        U5      OUU l        Uc  [        U5      OUU l        U R                  5         g)z
encoder (`PreTrainedModel`, *optional*):
    The encoder model to use.
decoder (`PreTrainedModel`, *optional*):
    The decoder model to use.
N)rn   ro   r|   r  r  r  r  rp  )r{   r|   r  r  r~   s       r*   ro   SpeechT5Model.__init__M  sK     	 ?F3F;T[?F3F;T[ 	r,   c                     [        U R                  [        5      (       a  U R                  R                  5       $ [        U R                  [
        5      (       a  U R                  R                  5       $ [        er   )r  r  r  r  r  r  NotImplementedErrorr\  s    r*   r  "SpeechT5Model.get_input_embeddingsa  sR    dll$ABB<<4466dll$ABB<<4466!!r,   c                     [        U R                  [        5      (       a  U R                  R                  U5        [        U R                  [
        5      (       a  U R                  R                  U5        g g r   )r  r  r  r  r  r  r  s     r*   r  "SpeechT5Model.set_input_embeddingsh  sP    dll$ABBLL--e4dll$ABBLL--e4 Cr,   c                     [        U R                  [        5      (       a%  U R                  R                  R	                  5         ggz
Calling this function will disable the gradient computation for the feature encoder so that its parameter will
not be updated during training.
N)r  r  r  r  r]  r\  s    r*   r]  $SpeechT5Model.freeze_feature_encodern  s2    
 dll$CDDLL668 Er,   r-   r/   decoder_input_valuesdecoder_attention_maskencoder_outputsr  r@  r  r  rr  rs  r8   c                    U	b  U	OU R                   R                  n	U
b  U
OU R                   R                  n
Ub  UOU R                   R                  nUb  UOU R                   R                  nUc  U R                  UUU	U
US9nORU(       aK  [        U[        5      (       d6  [        US   [        U5      S:  a  US   OS[        U5      S:  a  US   OSS9nUbV  [        U R
                  [        5      (       a7  U R
                  R                  R                  US   R                  S   U5      nOUn[        U R                  [        5      (       a  SU0nO0 nU R                  " S
UUUS   UUUU	U
US.	UD6nU(       d  X-   $ [        UR                   UR"                  UR$                  UR&                  UR(                  UR                   UR$                  UR&                  S	9$ )a  
input_values (`torch.Tensor` of shape `(batch_size, sequence_length)`):
    Depending on which encoder is being used, the `input_values` are either: float values of the input raw
    speech waveform, or indices of input sequence tokens in the vocabulary, or hidden states.
decoder_input_values (`torch.Tensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Depending on which decoder is being used, the `decoder_input_values` are either: float values of log-mel
    filterbank features extracted from the raw speech waveform, or indices of decoder input sequence tokens in
    the vocabulary, or hidden states.
decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Default behavior: generate a tensor that ignores pad tokens in `decoder_input_values`. Causal mask will
    also be used by default.

    If you want to change padding behavior, you should read [`SpeechT5Decoder._prepare_decoder_attention_mask`]
    and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more
    information on the default strategy.
speaker_embeddings (`torch.FloatTensor` of shape `(batch_size, config.speaker_embedding_dim)`, *optional*):
    Tensor containing the speaker embeddings.
N)r-   r/   r  rr  rs  r   r   r   r{  r  )	r-   r/   r>  r?  r  r@  r  rr  rs  )r|  r  r  decoder_attentionsr  encoder_last_hidden_stater>  encoder_attentionsrb  )r|   r  rr  r@  rs  r  r  r   rS   r  r  ra  r%   r  r  r   r|  r  r   r}  r  )r{   r-   r/   r   r!  r"  r  r@  r  r  rr  rs  r  r?  decoder_argsdecoder_outputss                   r*   r   SpeechT5Model.forwardv  s   D 2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	%0%<k$++BYBY ""ll)-"3%9' + O O_!M!M-"1!"4474H14Loa0RV14_1E1I?1-tO %*T\\Cb*c*c%)\\%8%8%[%["((+^&" &4"dll$CDD02DELL,, 
-1"1!"4#9+/!5#
 
 "44!-??+;;"1"?"?.99,==&5&G&G"1"?"?.99	
 		
r,   )r|   r  r  r  NNNNNNNNNNN)r   r   r   r   r   r   rh  ro   r  r  r]  r   r   r   r  r  r  r
   rP   r   r   r   r   r   s   @r*   r  r  G  sz    %)$(	 T! T!	 ("59  -12648:>BF(,!%7;)-,0#'_
llT)_
 ((4/_
 $llT1	_

 !& 0 04 7_
 uU%6%6784?_
 _
 $;_
 "--4_
  $;_
 #Tk_
 D[_
 
u  	!$6	6_
 _
r,   r  zB
    SpeechT5 Model with a speech encoder and a text decoder.
    c                   t  ^  \ rS rSrSS0rS\4U 4S jjrS rS rS r	\
           SS
\R                  S	-  S\R                  S	-  S\R                  S	-  S\R                  S	-  S\\\R                        S	-  S\S	-  S\S	-  S\S	-  S\S	-  S\S	-  S\R                  S	-  S\\-  4S jj5       rSrU =r$ )SpeechT5ForSpeechToTexti  z#text_decoder_postnet.lm_head.weightz+speecht5.decoder.prenet.embed_tokens.weightr|   c                    > [         TU ]  U5        UR                  c  [        SU R                   S35      e[        U5      n[        U5      n[        XU5      U l        [        U5      U l
        U R                  5         g )NYou are trying to instantiate a    with a configuration that does not define the vocabulary size of the language model head. Please instantiate the model as follows: `SpeechT5ForSpeechToText.from_pretrained(..., vocab_size=vocab_size)`. or define `vocab_size` of your model's configuration.)rn   ro   r  r'   r~   r  r  r  rG  r  text_decoder_postnetrp  )r{   r|   speech_encodertext_decoderr~   s       r*   ro    SpeechT5ForSpeechToText.__init__  s}     $00@ A/ /  9@4V<%flK$>v$F! 	r,   c                 T    U R                  5       R                  R                  5         gr  get_encoderr  r]  r\  s    r*   r]  .SpeechT5ForSpeechToText.freeze_feature_encoder      
 	!!88:r,   c                 6    U R                   R                  5       $ r   )r/  r  r\  s    r*   r  -SpeechT5ForSpeechToText.get_output_embeddings  s    ((>>@@r,   c                 :    U R                   R                  U5        g r   )r/  r  r  s     r*   r  -SpeechT5ForSpeechToText.set_output_embeddings  s    !!77Gr,   Nr-   r/   decoder_input_idsr!  r"  r  r@  r  rr  rs  r  r8   c                 ~   U
b  U
OU R                   R                  n
Ub7  Uc4  [        XR                   R                  U R                   R                  5      nU R                  UUUUUUUUU	SS9
nU R                  US   5      nSnUbF  [        5       nU" UR                  SU R                   R                  5      UR                  S5      5      nU
(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  UR                  UR                  UR                  UR                   UR"                  S9	$ )a:  
input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
    Float values of input raw speech waveform. Values can be obtained by loading a *.flac* or *.wav* audio file
    into an array of type `list[float]`, a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library
    (`pip install torchcodec`) or the soundfile library (`pip install soundfile`).
    To prepare the array into `input_values`, the [`SpeechT5Processor`] should be used for padding
    and conversion into a tensor of type `torch.FloatTensor`. See [`SpeechT5Processor.__call__`] for details.
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Indices of decoder input sequence tokens in the vocabulary.

    Indices can be obtained using [`SpeechT5Tokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are decoder input IDs?](../glossary#decoder-input-ids)

    SpeechT5 uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If
    `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
    `past_key_values`).
decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Default behavior: generate a tensor that ignores pad tokens in `decoder_input_values`. Causal mask will
    also be used by default.

    If you want to change padding behavior, you should read [`SpeechT5Decoder._prepare_decoder_attention_mask`]
    and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more
    information on the default strategy.
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the language modeling loss. Indices should either be in `[0, ..., config.vocab_size]`
    or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored (masked), the loss is
    only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

    Label indices can be obtained using [`SpeechT5Tokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

Example:

```python
>>> from transformers import SpeechT5Processor, SpeechT5ForSpeechToText
>>> from datasets import load_dataset

>>> dataset = load_dataset(
...     "hf-internal-testing/librispeech_asr_demo", "clean", split="validation"
... )  # doctest: +IGNORE_RESULT
>>> dataset = dataset.sort("id")
>>> sampling_rate = dataset.features["audio"].sampling_rate

>>> processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_asr")
>>> model = SpeechT5ForSpeechToText.from_pretrained("microsoft/speecht5_asr")

>>> # audio file is decoded on the fly
>>> inputs = processor(audio=dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt")
>>> predicted_ids = model.generate(**inputs, max_length=100)

>>> # transcribe speech
>>> transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
>>> transcription[0]
'mister quilter is the apostle of the middle classes and we are glad to welcome his gospel'
```

```python
>>> inputs["labels"] = processor(text_target=dataset[0]["text"], return_tensors="pt").input_ids

>>> # compute loss
>>> loss = model(**inputs).loss
>>> round(loss.item(), 2)
19.68
```
NT)
r-   r/   r   r!  r"  r  r@  r  rr  rs  r   r#   r   )	r  r  r  r  r$  r  r%  r>  r&  )r|   rs  r+   r    r!   rG  r/  r   r   r  r   r  r  r$  r  r%  r>  r&  )r{   r-   r/   r<  r!  r"  r  r@  r  rr  rs  r  r  r2  r  r  loss_fctoutputs                     r*   r   SpeechT5ForSpeechToText.forward  sO   f &1%<k$++BYBY ($6KK44dkk6X6X%! --%)!2#9++/!5   
 **71:6')HFKKDKK,B,BCV[[QS_UDY,F)-)9TGf$EvE#33")"?"?&99$55&-&G&G")"?"?&99

 
	
r,   )rG  r/  r*  )r   r   r   r   _tied_weights_keysr   ro   r]  r  r  r   r   r  r  r  r
   rP   r   r   r   r   r   s   @r*   r,  r,    sO    @Ano~ (;AH  262659:>BF(,!%)-,0#'*.|
''$.|
 ((4/|
 !++d2	|

 !& 0 04 7|
 uU%6%6784?|
 |
 $;|
  $;|
 #Tk|
 D[|
   4'|
 
	 |
 |
r,   r,  modelr  	thresholdminlenratiomaxlenratiovocoderoutput_cross_attentionsreturn_output_lengthsc
                    Uc  [        S5      eUc*  SXR                  R                  :H  R                  5       -
  n
OUn
UR	                  S5      nU R
                  R                  UU
SS9nUR                  n[        U R
                  R                  [        5      (       a@  U R
                  R                  R                  R                  US   R                  S   U
5      n
[        UR	                  S5      U-  U R                  R                  -  5      n[        UR	                  S5      U-  U R                  R                  -  5      nUR                  USU R                  R                  5      n/ n/ nS nSn0 n US-  nU R
                  R                   R                  UU5      nU R
                  R                   R#                  US S 2SS 24   S UU
USUSS9nU(       a.  UR%                  [&        R(                  " UR*                  SS95        UR                  R-                  S5      nUR.                  nU R0                  R3                  U5      nUR5                  XR                  R                  U R                  R                  5      nUR%                  U5        US S 2SS S 24   R5                  USU R                  R                  5      n[&        R(                  " UU4SS9n[&        R6                  " U R0                  R9                  U5      5      nUU:  a  GM  UU:  a@  [&        R:                  " USS9U:  n[&        R<                  " U5      S   R?                  5       nO[A        [C        U5      5      nU Vs/ s H  nUU;  d  M  UPM     nn[C        U5      S:  ad  [&        RD                  " U5      nURG                  SS5      RI                  SS	5      nU R0                  RK                  U5      nU H  n UU    UU '   M     [C        U5      U:  a  OGM  [A        [C        U5      5       Vs/ s H  nUU   PM
     nnU	(       d  US:X  a  US   O1[&        RL                  RN                  RP                  RS                  USS
9nUb	  U" U5      n!OUn!U(       a_  [&        R(                  " US	S9nUS:  a@  UR4                  " U[        UR	                  S5      U-  5      /UR	                  5       SS  Q76 nU!U4n!U!$ / n"[A        U5       H&  nU"R%                  UU   R	                  S5      5        M(     Uc7  [&        RL                  RN                  RP                  RS                  USS
9nUU"4n!Oy/ n#[&        RL                  RN                  RP                  RS                  USS
9nU" U5      n#U" Vs/ s H,  n[        U#R	                  S5      [U        U"5      -  5      U-  PM.     n$nU#U$4n!U(       a\  [&        R(                  " US	S9nUR4                  " U[        UR	                  S5      U-  5      /UR	                  5       SS  Q76 n/ U!QUP7n!U!$ s  snf s  snf s  snf )Na  `speaker_embeddings` must be specified. For example, you can use a speaker embeddings by following
                    the code snippet provided in this link:
                    https://huggingface.co/datasets/Matthijs/cmu-arctic-xvectors
                    r   r   T)r-   r/   rs  r#   )r   r/   r>  r?  r  r@  r  rs  r   r   )batch_first)+r'   r|   r    r<   r   rG  r  r|  r  r  r  ra  r%   r.   r$   r  r  r  rW   r   r   r  squeezer  speech_decoder_postnetr  r   sigmoidr  rL   r  rM   rN   rS   stackr   flattenr  r   r   rnnpad_sequencer=   )%rB  r-   r  r/   rC  rD  rE  rF  rG  rH  r?  r   encoder_outr%  maxlenminlenoutput_sequencespectrogramr  r  r  result_spectrogramr  decoder_outlast_decoder_outputspectrumnew_spectrogramprobmeet_thresholdsmeet_indexesr3  spectrograms
meet_indexr2  spectrogram_lengths	waveformswaveform_lengthss%                                        r*   _generate_speechre    s    !
 	
 !"lll6O6O&O%T%T%V!V!/


A
C..((!- ) K !, = = %..((*IJJ!&!7!7!>!>!a!aN  #%;"
 *//2[@5<<C`C``aF*//2[@5<<C`C``aF 099#q%,,B[B[\OKO
C
q !& 6 6 = =oOa bnn,,<</237";#9+5 = 	
 ###EIIk.J.JPQ$RS);;CCAF%55 //889LM==ll&C&CU\\E^E^_8$ #1b!8,11#q%,,:S:ST))_o$FAN}}U99BBCVWX< V|"'))Db"9Y"F${{?;A>EEG$SY/'3S|!q@R7RA|LS< 1$${{;7+55a;CCAqI$;;CCLQ".J5A*5M&z2 #/%&#-i j 49=O9P3QR3Qa&q)3QLR ),l1ouxx~~7I7I7V7VWcqu7V7vk*G!G"$yy)9qAQw#3#8#8-2215;<$?O?T?T?VWYWZ?[$   01G* N% !sA&&|A';';A'>? ? 88>>--::<UY:ZL#%89GI 88>>--::<UY:ZL-I_rs_rZ[INN1$5<O8P$P QTU U_rs "23G"$yy)9qA/44S)..q1C78 ;K;P;P;RSUSV;W  32!12GNW T S4  ts   7
YY)Y%3YzB
    SpeechT5 Model with a text encoder and a speech decoder.
    c                     ^  \ rS rSrSrSrS\4U 4S jjr\S\	4S j5       r
\             SS\R                  S-  S	\R                  S-  S
\R                  S-  S\R                  S-  S\\\R                        S-  S\S-  S\	S-  S\	S-  S\	S-  S\	S-  S\R                  S-  S\R                  S-  S\R"                  S-  S\\-  4S jj5       r\R(                  " 5               S S\R                  S	\R                  S-  S\R                  S-  S\S\S\S\R.                  S-  S\	S\	S\R                  \\R                  \R                  4   -  4S jj5       r\R(                  " 5               S S\R                  S\R                  S-  S	\R                  S-  S\S\S\S\R.                  S-  S\	S\	S\R                  \\R                  \R                  4   -  4S jj5       rSrU =r$ )!SpeechT5ForTextToSpeechi	  )textr   r|   c                    > [         TU ]  U5        UR                  c  [        SU R                   S35      e[        U5      n[        U5      n[        XU5      U l        [        U5      U l
        U R                  5         g )Nr.  a    with a configuration that does not define the vocabulary size of the language model head. Please instantiate the model as follows: `SpeechT5ForTextToSpeech.from_pretrained(..., vocab_size=vocab_size)`. or define `vocab_size` of your model's configuration.)rn   ro   r  r'   r~   r  r  r  rG  r  rM  rp  )r{   r|   text_encoderspeech_decoderr~   s       r*   ro    SpeechT5ForTextToSpeech.__init__	  s}     $00@ A/ /  5V<8@%fNK&B6&J# 	r,   r8   c                     gr<  rb  )clss    r*   can_generate$SpeechT5ForTextToSpeech.can_generate.	  s    
 r,   Nr/   r   r!  r"  r  r@  r  rr  rs  r  r  r
  c                 j   U
b  U
OU R                   R                  n
UbB  Uc"  [        XR                   R                  U5      u  p4U R                   R                  (       a  SnU R                  UUUUUUUUUU	SS9nU R                  US   5      u  nnnSnUb,  [        U R                   5      nU" UUUUUUR                  5      nU
(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  UR                  UR                  UR                  UR                  UR                  S9	$ )a"
  
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
    Indices of input sequence tokens in the vocabulary.

    Indices can be obtained using [`SpeechT5Tokenizer`]. See [`~PreTrainedTokenizer.encode`] and
    [`~PreTrainedTokenizer.__call__`] for details.

    [What are input IDs?](../glossary#input-ids)
decoder_input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_mel_bins)`):
    Float values of input mel spectrogram.

    SpeechT5 uses an all-zero spectrum as the starting token for `decoder_input_values` generation. If
    `past_key_values` is used, optionally only the last `decoder_input_values` have to be input (see
    `past_key_values`).
decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Default behavior: generate a tensor that ignores pad tokens in `decoder_input_values`. Causal mask will
    also be used by default.

    If you want to change padding behavior, you should read [`SpeechT5Decoder._prepare_decoder_attention_mask`]
    and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more
    information on the default strategy.
speaker_embeddings (`torch.FloatTensor` of shape `(batch_size, config.speaker_embedding_dim)`, *optional*):
    Tensor containing the speaker embeddings.
labels (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_mel_bins)`, *optional*):
    Float values of target mel spectrogram. Timesteps set to `-100.0` are ignored (masked) for the loss
    computation. Spectrograms can be obtained using [`SpeechT5Processor`]. See [`SpeechT5Processor.__call__`]
    for details.
stop_labels (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
    Binary tensor indicating the position of the stop token in the sequence.

Example:

```python
>>> from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan, set_seed
>>> import torch

>>> processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
>>> model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
>>> vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

>>> inputs = processor(text="Hello, my dog is cute", return_tensors="pt")
>>> speaker_embeddings = torch.zeros((1, 512))  # or load xvectors from a file

>>> set_seed(555)  # make deterministic

>>> # generate speech
>>> speech = model.generate(inputs["input_ids"], speaker_embeddings=speaker_embeddings, vocoder=vocoder)
>>> speech.shape
torch.Size([15872])
```
NTr-   r/   r   r!  r"  r  r@  r  r  rr  rs  r   r   	r  rW  r  r  r$  r  r%  r>  r&  )r|   rs  r4   r.   r  rG  rM  r  r  r   r  r  r$  r%  r>  r&  )r{   r   r/   r   r!  r"  r  r@  r  rr  rs  r  r  r
  r  r2  r  r  r  r  	criterionr?  s                         r*   r   SpeechT5ForTextToSpeech.forward5	  so   J &1%<k$++BYBY#+?WKK88:P@<$ {{44$(!--")!5#9++1/!5   
 AE@[@[\cde\f@g= 5v/<I&%((D +-;F)-)9TGf$EvE'-#33")"?"?&99$55&-&G&G")"?"?&99

 
	
r,   rC  rD  rE  rF  rG  rH  c
                     UbY  UR                  S5      nUR                  S5      U:w  a3  UR                  S5      S:X  a  UR                  US5      nO[        S5      e[        U UUUUUUUUU	5
      $ )a  
Converts a sequence of input tokens into a sequence of mel spectrograms, which are subsequently turned into a
speech waveform using a vocoder.

Args:
    input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
        Indices of input sequence tokens in the vocabulary.

        Indices can be obtained using [`SpeechT5Tokenizer`]. See [`~PreTrainedTokenizer.encode`] and
        [`~PreTrainedTokenizer.__call__`] for details.

        [What are input IDs?](../glossary#input-ids)
    attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
        Attention mask from the tokenizer, required for batched inference to signal to the model where to
        ignore padded tokens from the input_ids.
    speaker_embeddings (`torch.FloatTensor` of shape `(batch_size, config.speaker_embedding_dim)`, *optional*):
        Tensor containing the speaker embeddings.
    threshold (`float`, *optional*, defaults to 0.5):
        The generated sequence ends when the predicted stop token probability exceeds this value.
    minlenratio (`float`, *optional*, defaults to 0.0):
        Used to calculate the minimum required length for the output sequence.
    maxlenratio (`float`, *optional*, defaults to 20.0):
        Used to calculate the maximum allowed length for the output sequence.
    vocoder (`nn.Module`, *optional*):
        The vocoder that converts the mel spectrogram into a speech waveform. If `None`, the output is the mel
        spectrogram.
    output_cross_attentions (`bool`, *optional*, defaults to `False`):
        Whether or not to return the attentions tensors of the decoder's cross-attention layers.
    return_output_lengths (`bool`, *optional*, defaults to `False`):
        Whether or not to return the concrete spectrogram/waveform lengths.

Returns:
    `tuple(torch.FloatTensor)` comprising various elements depending on the inputs:
    - when `return_output_lengths` is False
        - **spectrogram** (*optional*, returned when no `vocoder` is provided) `torch.FloatTensor` of shape
        `(output_sequence_length, config.num_mel_bins)` -- The predicted log-mel spectrogram.
        - **waveform** (*optional*, returned when a `vocoder` is provided) `torch.FloatTensor` of shape
        `(num_frames,)` -- The predicted speech waveform.
        - **cross_attentions** (*optional*, returned when `output_cross_attentions` is `True`)
        `torch.FloatTensor` of shape `(config.decoder_layers, config.decoder_attention_heads,
        output_sequence_length, input_sequence_length)` -- The outputs of the decoder's cross-attention layers.
    - when `return_output_lengths` is True
        - **spectrograms** (*optional*, returned when no `vocoder` is provided) `torch.FloatTensor` of shape
        `(batch_size, output_sequence_length, config.num_mel_bins)` -- The predicted log-mel spectrograms that
        are padded to the maximum length.
        - **spectrogram_lengths** (*optional*, returned when no `vocoder` is provided) `list[Int]` -- A list of
        all the concrete lengths for each spectrogram.
        - **waveforms** (*optional*, returned when a `vocoder` is provided) `torch.FloatTensor` of shape
        `(batch_size, num_frames)` -- The predicted speech waveforms that are padded to the maximum length.
        - **waveform_lengths** (*optional*, returned when a `vocoder` is provided) `list[Int]` -- A list of all
        the concrete lengths for each waveform.
        - **cross_attentions** (*optional*, returned when `output_cross_attentions` is `True`)
        `torch.FloatTensor` of shape `(batch_size, config.decoder_layers, config.decoder_attention_heads,
        output_sequence_length, input_sequence_length)` -- The outputs of the decoder's cross-attention layers.
r   r   zUThe first dimension of speaker_embeddings must be either 1 or the same as batch_size.r   r  r'   re  )r{   r   r/   r  rC  rD  rE  rF  rG  rH  r  r\   s               r*   generate SpeechT5ForTextToSpeech.generate	  s    J )"*J!&&q)Z7%**1-2);)B)B:q)Q&$o   #!
 	
r,   c
                     UbY  UR                  S5      n
UR                  S5      U
:w  a3  UR                  S5      S:X  a  UR                  U
S5      nO[        S5      e[        U UUUUUUUUU	5
      $ )aW  
Converts a sequence of input tokens into a sequence of mel spectrograms, which are subsequently turned into a
speech waveform using a vocoder.

Args:
    input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
        Indices of input sequence tokens in the vocabulary.

        Indices can be obtained using [`SpeechT5Tokenizer`]. See [`~PreTrainedTokenizer.encode`] and
        [`~PreTrainedTokenizer.__call__`] for details.

        [What are input IDs?](../glossary#input-ids)
    speaker_embeddings (`torch.FloatTensor` of shape `(batch_size, config.speaker_embedding_dim)`, *optional*):
        Tensor containing the speaker embeddings.
    attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
        Mask to avoid performing convolution and attention on padding token indices. Mask values selected in
        `[0, 1]`:

        - 1 for tokens that are **not masked**,
        - 0 for tokens that are **masked**.

        [What are attention masks?](../glossary#attention-mask)
    threshold (`float`, *optional*, defaults to 0.5):
        The generated sequence ends when the predicted stop token probability exceeds this value.
    minlenratio (`float`, *optional*, defaults to 0.0):
        Used to calculate the minimum required length for the output sequence.
    maxlenratio (`float`, *optional*, defaults to 20.0):
        Used to calculate the maximum allowed length for the output sequence.
    vocoder (`nn.Module`, *optional*, defaults to `None`):
        The vocoder that converts the mel spectrogram into a speech waveform. If `None`, the output is the mel
        spectrogram.
    output_cross_attentions (`bool`, *optional*, defaults to `False`):
        Whether or not to return the attentions tensors of the decoder's cross-attention layers.
    return_output_lengths (`bool`, *optional*, defaults to `False`):
        Whether or not to return the concrete spectrogram/waveform lengths.

Returns:
    `tuple(torch.FloatTensor)` comprising various elements depending on the inputs:
    - when `return_output_lengths` is False
        - **spectrogram** (*optional*, returned when no `vocoder` is provided) `torch.FloatTensor` of shape
        `(output_sequence_length, config.num_mel_bins)` -- The predicted log-mel spectrogram.
        - **waveform** (*optional*, returned when a `vocoder` is provided) `torch.FloatTensor` of shape
        `(num_frames,)` -- The predicted speech waveform.
        - **cross_attentions** (*optional*, returned when `output_cross_attentions` is `True`)
        `torch.FloatTensor` of shape `(config.decoder_layers, config.decoder_attention_heads,
        output_sequence_length, input_sequence_length)` -- The outputs of the decoder's cross-attention layers.
    - when `return_output_lengths` is True
        - **spectrograms** (*optional*, returned when no `vocoder` is provided) `torch.FloatTensor` of shape
        `(batch_size, output_sequence_length, config.num_mel_bins)` -- The predicted log-mel spectrograms that
        are padded to the maximum length.
        - **spectrogram_lengths** (*optional*, returned when no `vocoder` is provided) `list[Int]` -- A list of
        all the concrete lengths for each spectrogram.
        - **waveforms** (*optional*, returned when a `vocoder` is provided) `torch.FloatTensor` of shape
        `(batch_size, num_frames)` -- The predicted speech waveforms that are padded to the maximum length.
        - **waveform_lengths** (*optional*, returned when a `vocoder` is provided) `list[Int]` -- A list of all
        the concrete lengths for each waveform.
        - **cross_attentions** (*optional*, returned when `output_cross_attentions` is `True`)
        `torch.FloatTensor` of shape `(batch_size, config.decoder_layers, config.decoder_attention_heads,
        output_sequence_length, input_sequence_length)` -- The outputs of the decoder's cross-attention layers.
r   r   zUThe first dimension of speaker_embeddings must be either 1 or the same as batch size.rw  )r{   r   r  r/   rC  rD  rE  rF  rG  rH  r\   s              r*   generate_speech'SpeechT5ForTextToSpeech.generate_speech
  s    R )"*J!&&q)Z7%**1-2);)B)B:q)Q&$o   #!
 	
r,   rM  rG  NNNNNNNNNNNNNNNg      ?r2   g      4@NFF)r   r   r   r   rf  re  r   ro   classmethodrP   ro  r   r   r  r  r  r
   r   r   r   r   r   r   rh  rx  r{  r   r   r   s   @r*   rg  rg  	  sZ    !!O~ ( T    .2269=:>BF(,!%)-,0#'7;+/+/x
##d*x
 ((4/x
 $//$6	x

 !& 0 04 7x
 uU%6%6784?x
 x
 $;x
  $;x
 #Tkx
 D[x
 "--4x
 !!D(x
 \\D(x
  
)	)!x
 x
t ]]_ 377; !$((-&+Y
##Y
 ((4/Y
 "--4	Y

 Y
 Y
 Y
 T!Y
 "&Y
  $Y
 
		U5#4#4e6G6G#GH	HY
 Y
v ]]_ 8<26 !$((-&+]
##]
 "--4]
 ((4/	]

 ]
 ]
 ]
 T!]
 "&]
  $]
 
		U5#4#4e6G6G#GH	H]
 ]
r,   rg  zD
    SpeechT5 Model with a speech encoder and a speech decoder.
    c                     ^  \ rS rSrS\4U 4S jjrS r\             SS\R                  S-  S\R                  S-  S\R                  S-  S	\R                  S-  S
\\\R                        S-  S\S-  S\S-  S\S-  S\S-  S\S-  S\R                  S-  S\R                  S-  S\R                  S-  S\\-  4S jj5       r\R"                  " 5               SS\R                  S\R                  S-  S\R                  S-  S\S\S\S\R(                  S-  S\S\S\R                  4S jj5       rSrU =r$ )SpeechT5ForSpeechToSpeechim
  r|   c                    > [         TU ]  U5        [        U5      n[        U5      n[	        XU5      U l        [        U5      U l        U R                  5         g r   )	rn   ro   r  r  r  rG  r  rM  rp  )r{   r|   r0  rk  r~   s       r*   ro   "SpeechT5ForSpeechToSpeech.__init__s
  sK     8@8@%fnM&B6&J# 	r,   c                 T    U R                  5       R                  R                  5         gr  r4  r\  s    r*   r]  0SpeechT5ForSpeechToSpeech.freeze_feature_encoder
  r7  r,   Nr-   r/   r   r!  r"  r  r@  r  rr  rs  r  r  r
  r8   c                    U
b  U
OU R                   R                  n
Ub%  Uc"  [        XR                   R                  U5      u  p4U R	                  UUUUUUUUUU	SS9nU R                  US   5      u  nnnSnU
(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  UR                  UR                  UR                  UR                  UR                  S9	$ )a5  
input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
    Float values of input raw speech waveform. Values can be obtained by loading a *.flac* or *.wav* audio file
    into an array of type `list[float]`, a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library
    (`pip install torchcodec`) or the soundfile library (`pip install soundfile`).
    To prepare the array into `input_values`, the [`SpeechT5Processor`] should be used for padding and conversion into
    a tensor of type `torch.FloatTensor`. See [`SpeechT5Processor.__call__`] for details.
decoder_input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_mel_bins)`):
    Float values of input mel spectrogram.

    SpeechT5 uses an all-zero spectrum as the starting token for `decoder_input_values` generation. If
    `past_key_values` is used, optionally only the last `decoder_input_values` have to be input (see
    `past_key_values`).
decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Default behavior: generate a tensor that ignores pad tokens in `decoder_input_values`. Causal mask will
    also be used by default.

    If you want to change padding behavior, you should read [`SpeechT5Decoder._prepare_decoder_attention_mask`]
    and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more
    information on the default strategy.
speaker_embeddings (`torch.FloatTensor` of shape `(batch_size, config.speaker_embedding_dim)`, *optional*):
    Tensor containing the speaker embeddings.
labels (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_mel_bins)`, *optional*):
    Float values of target mel spectrogram. Spectrograms can be obtained using [`SpeechT5Processor`]. See
    [`SpeechT5Processor.__call__`] for details.
stop_labels (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
    Binary tensor indicating the position of the stop token in the sequence.

Example:

```python
>>> from transformers import SpeechT5Processor, SpeechT5ForSpeechToSpeech, SpeechT5HifiGan, set_seed
>>> from datasets import load_dataset
>>> import torch

>>> dataset = load_dataset(
...     "hf-internal-testing/librispeech_asr_demo", "clean", split="validation"
... )  # doctest: +IGNORE_RESULT
>>> dataset = dataset.sort("id")
>>> sampling_rate = dataset.features["audio"].sampling_rate

>>> processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_vc")
>>> model = SpeechT5ForSpeechToSpeech.from_pretrained("microsoft/speecht5_vc")
>>> vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

>>> # audio file is decoded on the fly
>>> inputs = processor(audio=dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt")

>>> speaker_embeddings = torch.zeros((1, 512))  # or load xvectors from a file

>>> set_seed(555)  # make deterministic

>>> # generate speech
>>> speech = model.generate_speech(inputs["input_values"], speaker_embeddings, vocoder=vocoder)
>>> speech.shape
torch.Size([77824])
```
NTrr  r   r   rs  )r|   rs  r4   r.   rG  rM  r   r  r  r$  r  r%  r>  r&  )r{   r-   r/   r   r!  r"  r  r@  r  rr  rs  r  r  r
  r  r2  r]   rW  r  r  r?  s                        r*   r   !SpeechT5ForSpeechToSpeech.forward
  s   X &1%<k$++BYBY#+?WKK88:P@<$ --%)!5#9++1/!5   
 "&!<!<WQZ!H;!^gabk1F)-)9TGf$EvE'##33")"?"?&99$55&-&G&G")"?"?&99

 
	
r,   rC  rD  rE  rF  rG  rH  c
                 n    Uc  [         R                  " SUR                  S9n[        U UUUUUUUUU	5
      $ )ao  
Converts a raw speech waveform into a sequence of mel spectrograms, which are subsequently turned back into a
speech waveform using a vocoder.

Args:
    input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
        Float values of input raw speech waveform.

        Values can be obtained by loading a *.flac* or *.wav* audio file into an array of type `list[float]`,
        a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library (`pip install torchcodec`)
        or the soundfile library (`pip install soundfile`).
        To prepare the array into `input_values`, the [`SpeechT5Processor`] should be used for padding and
        conversion into a tensor of type `torch.FloatTensor`. See [`SpeechT5Processor.__call__`] for details.
    speaker_embeddings (`torch.FloatTensor` of shape `(batch_size, config.speaker_embedding_dim)`, *optional*):
        Tensor containing the speaker embeddings.
    attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
        Mask to avoid performing convolution and attention on padding token indices. Mask values selected in
        `[0, 1]`:

        - 1 for tokens that are **not masked**,
        - 0 for tokens that are **masked**.

        [What are attention masks?](../glossary#attention-mask)
    threshold (`float`, *optional*, defaults to 0.5):
        The generated sequence ends when the predicted stop token probability exceeds this value.
    minlenratio (`float`, *optional*, defaults to 0.0):
        Used to calculate the minimum required length for the output sequence.
    maxlenratio (`float`, *optional*, defaults to 20.0):
        Used to calculate the maximum allowed length for the output sequence.
    vocoder (`nn.Module`, *optional*, defaults to `None`):
        The vocoder that converts the mel spectrogram into a speech waveform. If `None`, the output is the mel
        spectrogram.
    output_cross_attentions (`bool`, *optional*, defaults to `False`):
        Whether or not to return the attentions tensors of the decoder's cross-attention layers.
    return_output_lengths (`bool`, *optional*, defaults to `False`):
        Whether or not to return the concrete spectrogram/waveform lengths.

Returns:
    `tuple(torch.FloatTensor)` comprising various elements depending on the inputs:
    - when `return_output_lengths` is False
        - **spectrogram** (*optional*, returned when no `vocoder` is provided) `torch.FloatTensor` of shape
        `(output_sequence_length, config.num_mel_bins)` -- The predicted log-mel spectrogram.
        - **waveform** (*optional*, returned when a `vocoder` is provided) `torch.FloatTensor` of shape
        `(num_frames,)` -- The predicted speech waveform.
        - **cross_attentions** (*optional*, returned when `output_cross_attentions` is `True`)
        `torch.FloatTensor` of shape `(config.decoder_layers, config.decoder_attention_heads,
        output_sequence_length, input_sequence_length)` -- The outputs of the decoder's cross-attention layers.
    - when `return_output_lengths` is True
        - **spectrograms** (*optional*, returned when no `vocoder` is provided) `torch.FloatTensor` of shape
        `(batch_size, output_sequence_length, config.num_mel_bins)` -- The predicted log-mel spectrograms that
        are padded to the maximum length.
        - **spectrogram_lengths** (*optional*, returned when no `vocoder` is provided) `list[Int]` -- A list of
        all the concrete lengths for each spectrogram.
        - **waveforms** (*optional*, returned when a `vocoder` is provided) `torch.FloatTensor` of shape
        `(batch_size, num_frames)` -- The predicted speech waveforms that are padded to the maximum length.
        - **waveform_lengths** (*optional*, returned when a `vocoder` is provided) `list[Int]` -- A list of all
        the concrete lengths for each waveform.
        - **cross_attentions** (*optional*, returned when `output_cross_attentions` is `True`)
        `torch.FloatTensor` of shape `(batch_size, config.decoder_layers, config.decoder_attention_heads,
        output_sequence_length, input_sequence_length)` -- The outputs of the decoder's cross-attention layers.
)r   i   rj  )r   rO   r   re  )
r{   r-   r  r/   rC  rD  rE  rF  rG  rH  s
             r*   r{  )SpeechT5ForSpeechToSpeech.generate_speech
  sM    T %!&Xl>Q>Q!R#!
 	
r,   r}  r~  r  )r   r   r   r   r   ro   r]  r   r   r  r  r  r
   rP   r   r   r   r   r   r   rh  r{  r   r   r   s   @r*   r  r  m
  s3   
~ 
;  26269=:>BF(,!%)-,0#'7;+/+/s
''$.s
 ((4/s
 $//$6	s

 !& 0 04 7s
 uU%6%6784?s
 s
 $;s
  $;s
 #Tks
 D[s
 "--4s
 !!D(s
 \\D(s
  
)	)!s
 s
j ]]_ 8<26 !$((-&+W
''W
 "--4W
 ((4/	W

 W
 W
 W
 T!W
 "&W
  $W
 
		W
 W
r,   r  c                   H   ^  \ rS rSrSU 4S jjrS	S jrS rS rS rSr	U =r
$ )
HifiGanResidualBlockiW  c                   > [         TU ]  5         X@l        [        R                  " [        [        U5      5       Vs/ s H0  n[        R                  " UUUSX5   U R                  X#U   5      S9PM2     sn5      U l	        [        R                  " [        [        U5      5       Vs/ s H,  n[        R                  " UUUSSU R                  US5      S9PM.     sn5      U l
        g s  snf s  snf )Nr   )rl   dilationr   )rn   ro   leaky_relu_sloper   r/  rN   rS   rs   get_paddingconvs1convs2)r{   channelsrk   r  r  r3  r]   r~   s          r*   ro   HifiGanResidualBlock.__init__X  s     0mm s8}-
 .A 		%[ ,,[1+F .

 mm s8}-
 .A 		 ,,[!< .



s   7C%%3C*c                     X-  U-
  S-  $ r   rb  )r{   rk   r  s      r*   r   HifiGanResidualBlock.get_paddingw  s    &1a77r,   c                 >   [         R                  R                  n[        [         R                  R                  S5      (       a$  [         R                  R                  R                  nU R
                   H  nU" U5        M     U R                   H  nU" U5        M     g Nr   )r   r   r   r   r   r  r  r{   r   r,  s      r*   apply_weight_norm&HifiGanResidualBlock.apply_weight_normz  si    hh**288,,m<<((33??K[[E ![[E !r,   c                     U R                    H"  n[        R                  R                  U5        M$     U R                   H"  n[        R                  R                  U5        M$     g r   )r  r   r   remove_weight_normr  r{   r,  s     r*   r  'HifiGanResidualBlock.remove_weight_norm  sB    [[EHH''. ![[EHH''. !r,   c                 (   [        U R                  U R                  5       Hm  u  p#Un[        R                  R                  XR                  5      nU" U5      n[        R                  R                  XR                  5      nU" U5      nX-   nMo     U$ r   )rw  r  r  r   r  
leaky_relur  )r{   r   conv1conv2r1  s        r*   r   HifiGanResidualBlock.forward  sz    T[[9LE$HMM44]DYDYZM!-0MMM44]DYDYZM!-0M)4M : r,   )r  r  r  )r   )r   r      g?)r   )r   r   r   r   ro   r  r  r  r   r   r   r   s   @r*   r  r  W  s!    
>8/ r,   r  z
    HiFi-GAN vocoder.
    c                      ^  \ rS rSr% \\S'   SrS\4U 4S jjrU 4S jrS r	S r
\" SS	9S\R                  S
\R                  4S j5       rSrU =r$ )r]  i  r|   rW  c                   > [         TU ]  U5        [        UR                  5      U l        [        UR
                  5      U l        [        R                  " UR                  UR                  SSSS9U l        [        R                  " 5       U l        [        [        UR
                  UR                   5      5       Ha  u  nu  p4U R                  R#                  [        R$                  " UR                  SU-  -  UR                  SUS-   -  -  UUXC-
  S-  S95        Mc     [        R                  " 5       U l        [)        [        U R                  5      5       Hp  nUR                  SUS-   -  -  n[        UR                  UR*                  5       H4  u  pFU R&                  R#                  [-        XTXaR.                  5      5        M6     Mr     [        R                  " WSSSSS9U l        U R3                  S[4        R6                  " UR                  5      5        U R3                  S[4        R8                  " UR                  5      5        U R;                  5         g )N   r   r   )rk   rl   r   r   rK  r^  )rn   ro   rS   resblock_kernel_sizesnum_kernelsupsample_ratesnum_upsamplesr   rs   model_in_dimupsample_initial_channelconv_prer/  	upsamplerr~  rw  upsample_kernel_sizesrW   ConvTranspose1d	resblocksrN   resblock_dilation_sizesr  r  	conv_postr   r   rO   rU   rp  )r{   r|   r3  upsample_raterk   r  r  r~   s          r*   ro   SpeechT5HifiGan.__init__  s    v;;< !6!67		++
 /8V=R=RTZTpTp9q/r+A+NN!!""331=33a!eE +((8Q> 0s s4>>*+A661Q<HH),V-I-I6KiKi)j%%%&:8RZ\s\s&tu *k ,
 8QAaQRSVU[[1D1D%EFWejj1D1D&EF 	r,   c                    > [         TU ]  U5        [        U[        5      (       aA  [        R
                  " UR                  5        [        R                  " UR                  5        g g r   )	rn   r`  r  r]  rR  rY  rK  rV  r^  )r{   rI  r~   s     r*   r`  SpeechT5HifiGan._init_weights  sB    f%fo..KK$JJv||$ /r,   c                    [         R                  R                  n[        [         R                  R                  S5      (       a$  [         R                  R                  R                  nU" U R
                  5        U R                   H  nU" U5        M     U R                   H  nUR                  5         M     U" U R                  5        g r  )
r   r   r   r   r   r  r  r  r  r  r  s      r*   r  !SpeechT5HifiGan.apply_weight_norm  s    hh**288,,m<<((33??KDMM"^^E $^^E##% $DNN#r,   c                 R   [         R                  R                  U R                  5        U R                   H"  n[         R                  R                  U5        M$     U R
                   H  nUR                  5         M     [         R                  R                  U R                  5        g r   )r   r   r  r  r  r  r  r  s     r*   r  "SpeechT5HifiGan.remove_weight_norm  sh    
##DMM2^^EHH''. $^^E$$& $
##DNN3r,   a  
        Converts a log-mel spectrogram into a speech waveform. Passing a batch of log-mel spectrograms returns a batch
        of speech waveforms. Passing a single, un-batched log-mel spectrogram returns a single, un-batched speech
        waveform.
        r  r8   c                    U R                   R                  (       a  XR                  -
  U R                  -  nUR	                  5       S:H  nU(       d  UR                  S5      nUR                  SS5      nU R                  U5      n[        U R                  5       H  n[        R                  R                  X@R                   R                  5      nU R                  U   " U5      nU R                  XPR                   -     " U5      n[        SU R                   5       H)  nX`R                  XPR                   -  U-      " U5      -  nM+     X`R                   -  nM     [        R                  R                  U5      nU R#                  U5      n[$        R&                  " U5      nU(       d2  UR)                  S5      R                  SS5      R+                  S5      nU$ UR)                  S5      nU$ )a  
spectrogram (`torch.FloatTensor`):
    Tensor containing the log-mel spectrograms. Can be batched and of shape `(batch_size, sequence_length,
    config.model_in_dim)`, or un-batched and of shape `(sequence_length, config.model_in_dim)`.

Returns:
    `torch.FloatTensor`: Tensor containing the speech waveform. If the input spectrogram is batched, will be of
    shape `(batch_size, num_frames,)`. If un-batched, will be of shape `(num_frames,)`.
r   r   r   r   r#   )r|   normalize_beforerK  r^  r   r   r   r  rN   r  r   r  r  r  r  r  r  r  r   tanhrL  r   )	r{   rW  r  
is_batchedr   r3  	res_statejwaveforms	            r*   r   SpeechT5HifiGan.forward  s   " ;;''&2djj@K __&!+
%//2K#--a3m4t))*AMM44]KKD`D`aM NN1-m<Mq+;+;';<]KI1d../^^A0@0@,@1,DEmTT	 0%(8(88M + 00?}5

=1$,,Q/99!Q?DDRHH
  %,,Q/Hr,   )r  r  r  r  r  r  )r   r   r   r   r   rc  re  ro   r`  r  r  r   r   r  r   r   r   r   s   @r*   r]  r]    se     "!#O$4 $L%
$4 (5#4#4 (5CTCT ((r,   r]  )r,  r  rg  r  rF  r]  )r   Nr%  r  )er   r   numpyrG   r   r   torch.nnr   r   r    r   rR  activationsr	   cache_utilsr
   r   r   
generationr   integrations.deepspeedr   integrations.fsdpr   masking_utilsr   r   modeling_layersr   modeling_outputsr   r   r   r   r   modeling_utilsr   r   r   r   r   configuration_speecht5r   r   
get_loggerr   r  _HIDDEN_STATES_START_POSITIONr   r<   r+   r4   r  r   r  ndarrayre   rg   r   r   rh  r   r   r  r  r   r)  rA  rO  r  r  r  r  r  r  r  r  r'  r5  rF  rj  r  r  r  r  r  r  r  r  r  r  r,  r  rP   re  rg  r  r  r]  __all__rb  r,   r*   <module>r     s        @ @ & ! C C ) @ 7 J 9  D , I 
		H	% !" %,, c [^ " bf0,,0250KP<<Z^K^04 /3tc?tt t $$t+	t
 t ZZtp#= ,!; 8!; 2B8BII B8L*bii *Zryy 2" "(299 %RYY %R1		 1D")) DN1")) 1h% %P<299 <2		+? ""-		+? "-J&,@ &$U2		 U2p")) 065 6rX5 Xv ?4o ?4 ?4Dn
- n
b!&= !H&$; &R
#: 
>]
- ]
@*&= *Z.$; .b%#: %P8M299 8Mv:bii :z 
J
+ J

J
Z 
a
5 a

a
N 48.2 $$)"'L"L##L ))D0L $$t+	L
 L L L YYL "L  L u00%2C2CCDDL^ 
T
5 T

T
n
 
b
 7 b

b
J;299 ;| 
so s
slr,   