
    Z j                        S r SSKrSSKJr  SSKJr  SSKrSSKrSSKJ	r	  SSK
Jr  SSKJr  SS	KJr  SS
KJr  SSKJr  SSKJr  SSKJrJr  SSKJr  SSKJrJrJr  SSK J!r!  \RD                  " \#5      r$\" SS9\ " S S\5      5       5       r%\" SS9\ " S S\5      5       5       r&\RN                  RP                  S 5       r)     SAS jr*S r+ " S S\R                  RX                  5      r- " S S\	RX                  5      r. " S  S!\	RX                  5      r/ " S" S#\	RX                  5      r0 " S$ S%\	RX                  5      r1 " S& S'\	RX                  5      r2 " S( S)\	RX                  5      r3 " S* S+\	RX                  5      r4 " S, S-\	RX                  5      r5 " S. S/\	RX                  5      r6 " S0 S1\	RX                  5      r7 " S2 S3\	RX                  5      r8 " S4 S5\	RX                  5      r9 " S6 S7\5      r: " S8 S9\	RX                  5      r; " S: S;\	RX                  5      r<\ " S< S=\5      5       r=\" S>S9 " S? S@\=5      5       r>S@S=/r?g)BzPyTorch VITS model.    N)	dataclass)Any)nn   )initialization)ACT2FN)is_deepspeed_zero3_enabled)is_fsdp_managed_module)create_bidirectional_mask)GradientCheckpointingLayer)BaseModelOutputModelOutput)PreTrainedModel)auto_docstringloggingtorch_compilable_check   )
VitsConfigz`
    Describes the outputs for the VITS model, with potential hidden states and attentions.
    )custom_introc                       \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Sr
\\R                     S-  \S'   Sr\\R                     S-  \S'   Sr\\R                     S-  \S'   S	rg)
VitsModelOutput'   a  
waveform (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
    The final audio waveform predicted by the model.
sequence_lengths (`torch.FloatTensor` of shape `(batch_size,)`):
    The length in samples of each element in the `waveform` batch.
spectrogram (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_bins)`):
    The log-mel spectrogram predicted at the output of the flow model. This spectrogram is passed to the Hi-Fi
    GAN decoder model to obtain the final audio waveform.
Nwaveformsequence_lengthsspectrogramhidden_states
attentions )__name__
__module____qualname____firstlineno____doc__r   torchFloatTensor__annotations__r   r   tupler   r   __static_attributes__r       w/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/vits/modeling_vits.pyr   r   '   s     *.He$&-15e''$.537Ku(()D0759M5**+d2926Je''(4/6r)   r   zm
    Describes the outputs for the VITS text encoder model, with potential hidden states and attentions.
    c                       \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Sr
\R                  S-  \S'   Sr\\R                     S-  \S'   Sr\\R                     S-  \S'   S	rg)
VitsTextEncoderOutput?   aq  
prior_means (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
    The predicted mean values of the prior distribution for the latent text variables.
prior_log_variances (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
    The predicted log-variance values of the prior distribution for the latent text variables.
Nlast_hidden_stateprior_meansprior_log_variancesr   r   r   )r   r    r!   r"   r#   r.   r$   r%   r&   r/   r0   r   r'   r   r(   r   r)   r*   r,   r,   ?   s~     37u((4/6,0K""T)048**T1859M5**+d2926Je''(4/6r)   r,   c                     X-   n[         R                  " US S 2S U2S S 24   5      n[         R                  " US S 2US 2S S 24   5      nXE-  nU$ N)r$   tanhsigmoid)input_ainput_bnum_channelsin_actt_acts_actactss          r*   fused_add_tanh_sigmoid_multiplyr<   T   sP    FJJva,123EMM&LM1!456E=DKr)   c	                    X* :  X:*  -  n	U	) n
[         R                  " U 5      n[         R                  " U 5      n[        R                  " [        R                  " SU-
  5      S-
  5      n[
        R                  R                  USS9nXS'   XS'   X
   X'   SX'   [        X	   XSS24   X)SS24   X9SS24   UUUUUS9	u  X'   X'   X4$ )	ap	  
This transformation represents a monotonically increasing piecewise rational quadratic function. Outside of the
`tail_bound`, the transform behaves as an identity function.

Args:
    inputs (`torch.FloatTensor` of shape `(batch_size, channels, seq_len)`:
        Second half of the hidden-states input to the Vits convolutional flow module.
    unnormalized_widths (`torch.FloatTensor` of shape `(batch_size, channels, seq_len, duration_predictor_flow_bins)`):
        First `duration_predictor_flow_bins` of the hidden-states from the output of the convolution projection
        layer in the convolutional flow module
    unnormalized_heights (`torch.FloatTensor` of shape `(batch_size, channels, seq_len, duration_predictor_flow_bins)`):
        Second `duration_predictor_flow_bins` of the hidden-states from the output of the convolution projection
        layer in the convolutional flow module
    unnormalized_derivatives (`torch.FloatTensor` of shape `(batch_size, channels, seq_len, duration_predictor_flow_bins)`):
        Third `duration_predictor_flow_bins` of the hidden-states from the output of the convolution projection
        layer in the convolutional flow module
    reverse (`bool`, *optional*, defaults to `False`):
        Whether the model is being run in reverse mode.
    tail_bound (`float`, *optional* defaults to 5):
        Upper and lower limit bound for the rational quadratic function. Outside of this `tail_bound`, the
        transform behaves as an identity function.
    min_bin_width (`float`, *optional*, defaults to 1e-3):
        Minimum bin value across the width dimension for the piecewise rational quadratic function.
    min_bin_height (`float`, *optional*, defaults to 1e-3):
        Minimum bin value across the height dimension for the piecewise rational quadratic function.
    min_derivative (`float`, *optional*, defaults to 1e-3):
        Minimum bin value across the derivatives for the piecewise rational quadratic function.
Returns:
    outputs (`torch.FloatTensor` of shape `(batch_size, channels, seq_len)`:
        Hidden-states as transformed by the piecewise rational quadratic function with the `tail_bound` limits
        applied.
    log_abs_det (`torch.FloatTensor` of shape `(batch_size, channels, seq_len)`:
        Logarithm of the absolute value of the determinants corresponding to the `outputs` with the `tail_bound`
        limits applied.
r   )r   r   )pad.r   .        N)	inputsunnormalized_widthsunnormalized_heightsunnormalized_derivativesreverse
tail_boundmin_bin_widthmin_bin_heightmin_derivative)	r$   
zeros_likenplogexpr   
functionalr>   _rational_quadratic_spline)rC   rD   rE   rF   rG   rH   rI   rJ   rK   inside_interval_maskoutside_interval_maskoutputslog_abs_detconstants                 r*   (_unconstrained_rational_quadratic_splinerW   ]   s    \ #k1f6JK11v&G""6*KvvbffQ/0145H!}}001Iv0V'/V$(0W%%+%BG"),K&Ga+/a0GH12IJ!9PQ:Q!R#%%
HDG!;#D r)   c	                    Un	U* n
[        U R                  5       U
:  U R                  5       U	:*  -  SU
 SU	 S35        UR                  S   nXk-  S:  a  [	        SU SU 35      eX{-  S:  a  [	        SU SU 35      e[
        R                  R                  USS	9nUS
Xk-  -
  U-  -   n[        R                  " USS	9n[
        R                  R                  USSSS9nX-
  U-  U
-   nXS'   XS'   USS
S24   USSS24   -
  nU[
        R                  R                  U5      -   n[
        R                  R                  USS	9nUS
X{-  -
  U-  -   n[        R                  " USS	9n[
        R                  R                  USSSS9nX-
  U-  U
-   nU
US'   U	US'   USS
S24   USSS24   -
  nU(       a  UOUnUS==   S-  ss'   [        R                  " U S   U:  SS	9S
-
  nUS   nUR                  SU5      S   nUR                  SU5      S   nUR                  SU5      S   nX-  nUR                  SU5      S   nUR                  SU5      S   nUSS
S24   R                  SU5      S   nUR                  SU5      S   nUU-   SU-  -
  nU(       d  U U-
  U-  nUS
U-
  -  nUUUR                  S5      -  UU-  -   -  nUUU-  -   nUUU-  -   n UR                  S5      UUR                  S5      -  SU-  U-  -   US
U-
  R                  S5      -  -   -  n![        R                  " U!5      S[        R                  " U5      -  -
  n"U U"4$ U U-
  n#U#U-  n$UUU-
  -  U$-   n%UU-  U$-
  n&U* U#-  n'U&R                  S5      SU%-  U'-  -
  n([        [        R                   " U(S:  5      SU( 35        SU'-  U&* [        R"                  " U(5      -
  -  n)U)U-  U-   n U)S
U)-
  -  nUUU-  -   nUR                  S5      UU)R                  S5      -  SU-  U-  -   US
U)-
  R                  S5      -  -   -  n![        R                  " U!5      S[        R                  " U5      -  -
  n"U U"* 4$ )a  
This transformation represents a monotonically increasing piecewise rational quadratic function. Unlike the
function `_unconstrained_rational_quadratic_spline`, the function behaves the same across the `tail_bound`.

Args:
    inputs (`torch.FloatTensor` of shape `(batch_size, channels, seq_len)`:
        Second half of the hidden-states input to the Vits convolutional flow module.
    unnormalized_widths (`torch.FloatTensor` of shape `(batch_size, channels, seq_len, duration_predictor_flow_bins)`):
        First `duration_predictor_flow_bins` of the hidden-states from the output of the convolution projection
        layer in the convolutional flow module
    unnormalized_heights (`torch.FloatTensor` of shape `(batch_size, channels, seq_len, duration_predictor_flow_bins)`):
        Second `duration_predictor_flow_bins` of the hidden-states from the output of the convolution projection
        layer in the convolutional flow module
    unnormalized_derivatives (`torch.FloatTensor` of shape `(batch_size, channels, seq_len, duration_predictor_flow_bins)`):
        Third `duration_predictor_flow_bins` of the hidden-states from the output of the convolution projection
        layer in the convolutional flow module
    reverse (`bool`):
        Whether the model is being run in reverse mode.
    tail_bound (`float`):
        Upper and lower limit bound for the rational quadratic function. Outside of this `tail_bound`, the
        transform behaves as an identity function.
    min_bin_width (`float`):
        Minimum bin value across the width dimension for the piecewise rational quadratic function.
    min_bin_height (`float`):
        Minimum bin value across the height dimension for the piecewise rational quadratic function.
    min_derivative (`float`):
        Minimum bin value across the derivatives for the piecewise rational quadratic function.
Returns:
    outputs (`torch.FloatTensor` of shape `(batch_size, channels, seq_len)`:
        Hidden-states as transformed by the piecewise rational quadratic function.
    log_abs_det (`torch.FloatTensor` of shape `(batch_size, channels, seq_len)`:
        Logarithm of the absolute value of the determinants corresponding to the `outputs`.
zInputs are outside the range [z, ]rA         ?zMinimal bin width z" too large for the number of bins zMinimal bin height dimr   )r   r   rV   rB   )r>   modevaluer?   r@   .Ngư>).N      r   z!Discriminant has negative values )r   minmaxshape
ValueErrorr   rP   softmaxr$   cumsumr>   softplussumgatherpowrN   allsqrt)*rC   rD   rE   rF   rG   rH   rI   rJ   rK   upper_boundlower_boundnum_binswidths	cumwidthsderivativesheights
cumheightsbin_locationsbin_idxinput_cumwidthsinput_bin_widthsinput_cumheightsdeltainput_deltainput_derivativesinput_derivatives_plus_oneinput_heightsintermediate1thetatheta_one_minus_theta	numeratordenominatorrT   derivative_numeratorrU   intermediate2intermediate3abcdiscriminantroots*                                             r*   rQ   rQ      sN   X K+K		$)DE
(R}AF #((,H#%-m_<^_g^hijj 3&.~.>>`ai`jkll]]""#6B"?Fa-"::fDDFV,I!!)jPS!TI*i7+EI#f$gsABw)C"H"55F 2==#9#9:R#SSKmm##$8b#AGN$= =HHGg2.J"":6
RU"VJ+z9KGJ$Jv%Jwab!JsCRCx$88G")JyM'd"iiy)]:CaGGi G&&r73F;O}}R1&9!((W5f=E,,r7+F3K#**2w7?!,S!"W!5!<!<R!I&!QNN2w/7M%(BBQ_TM/)-== %U 3![599Q<%?BSVkBk%kl	!M4I$II"Y%<<*q1&15+o 5561u9//!"445 

 ii 45EIIk<R8RR## !11%5[+<<=M--=L=(uuQx!a%!)+IIla'(/~>	

 A1"uzz,778))O; $D 1!M4I$II*q1&!4+o 5561t8.."334 

 ii 45EIIk<R8RR$$r)   c                   D   ^  \ rS rSrS\S\4U 4S jjrSS jrS rSr	U =r
$ )	VitsWaveNeti1  config
num_layersc           	        > [         TU ]  5         UR                  U l        X l        [        R
                  R                  5       U l        [        R
                  R                  5       U l        [
        R                  " UR                  5      U l        [        [
        R                  R                  S5      (       a%  [
        R                  R                  R                  nO[
        R                  R                  nUR                   S:w  aG  [        R
                  R#                  UR                   SUR                  -  U-  S5      nU" USS9U l        ['        U5       H  nUR(                  U-  nUR*                  U-  U-
  S-  n[        R
                  R#                  UR                  SUR                  -  UR*                  UUS9nU" USS9nU R                  R-                  U5        XRS-
  :  a  SUR                  -  n	OUR                  n	[        R
                  R#                  UR                  U	S5      n
U" U
SS9n
U R                  R-                  U
5        M     g )Nweight_normr   r_   r   weight)name)in_channelsout_channelskernel_sizedilationpadding)super__init__hidden_sizer   r$   r   
ModuleList	in_layersres_skip_layersDropoutwavenet_dropoutdropouthasattrutilsparametrizationsr   speaker_embedding_sizeConv1d
cond_layerrangewavenet_dilation_ratewavenet_kernel_sizeappend)selfr   r   r   r   ir   r   in_layerres_skip_channelsres_skip_layer	__class__s              r*   r   VitsWaveNet.__init__2  s   !--$,,.$xx224zz&"8"89288,,m<<((33??K((..K((A-)F)FFL^L^H^akHkmnoJ)*8DDOz"A33Q6H11H<xGAMGxx"..!3!33"66! ' H #8(;HNN!!(+ >!$%(:(:$:!$*$6$6!"XX__V-?-?ARTUVN(hGN  ''7+ #r)   c                    [         R                  " U5      n[         R                  " U R                  /5      nUb  U R	                  U5      n[        U R                  5       H  nU R                  U   " U5      nUb0  US-  U R                  -  nUS S 2XSU R                  -  -   2S S 24   n	O[         R                  " U5      n	[        XyUS   5      n
U R                  U
5      n
U R                  U   " U
5      nX`R                  S-
  :  a;  US S 2S U R                  2S S 24   nX-   U-  nXKS S 2U R                  S 2S S 24   -   nM  XK-   nM     XB-  $ )Nr_   r   r   )r$   rL   	IntTensorr   r   r   r   r   r<   r   r   )r   rC   padding_maskglobal_conditioningrT   num_channels_tensorr   r   cond_offsetglobal_statesr;   res_skip_actsres_actss                r*   forwardVitsWaveNet.forward[  sV   ""6*#oot/?/?.@A*"&//2E"Ft'A NN1-f5M".!ed&6&66 3A{STW[WgWgSgEg7gij4j k % 0 0 ?2=QdefQghD<<%D 003D9M??Q&&(,>d.>.>,>)AB +|;!!T5E5E5G2J$KK!1% (( %%r)   c                 z   U R                   S:w  a3  [        R                  R                  R	                  U R
                  5        U R                   H,  n[        R                  R                  R	                  U5        M.     U R                   H,  n[        R                  R                  R	                  U5        M.     g )Nr   )r   r$   r   r   remove_weight_normr   r   r   r   layers     r*   r   VitsWaveNet.remove_weight_normx  st    &&!+HHNN--doo>^^EHHNN--e4 $))EHHNN--e4 *r)   )r   r   r   r   r   r   r2   )r   r    r!   r"   r   intr   r   r   r(   __classcell__r   s   @r*   r   r   1  s&    '8z '8s '8R&:5 5r)   r   c                   :   ^  \ rS rSrS\4U 4S jjrSS jrSrU =r$ )VitsPosteriorEncoderi  r   c                 >  > [         TU ]  5         UR                  U l        [        R
                  " UR                  UR                  S5      U l        [        XR                  S9U l        [        R
                  " UR                  U R                  S-  S5      U l        g )Nr   r   r_   )r   r   	flow_sizer   r   r   spectrogram_binsr   conv_prer   $posterior_encoder_num_wavenet_layerswavenet	conv_projr   r   r   s     r*   r   VitsPosteriorEncoder.__init__  ss    ",,		&"9"96;M;MqQ"66a6ab6#5#5t7H7H17LaPr)   c                 &   U R                  U5      U-  nU R                  XU5      nU R                  U5      U-  n[        R                  " X@R
                  SS9u  pVU[        R                  " U5      [        R                  " U5      -  -   U-  nXuU4$ )Nr   r[   )r   r   r   r$   splitr   
randn_likerO   )r   rC   r   r   statsmean
log_stddevsampleds           r*   r   VitsPosteriorEncoder.forward  s    v&5f4GHv&5 ;;u.?.?QG%**40599Z3HHHLXj((r)   )r   r   r   r   r2   	r   r    r!   r"   r   r   r   r(   r   r   s   @r*   r   r     s    Qz Q) )r)   r   c                   H   ^  \ rS rSrSU 4S jjrS	S jrS rS rS rSr	U =r
$ )
HifiGanResidualBlocki  c                   > [         TU ]  5         X@l        [        R                  " [        [        U5      5       Vs/ s H0  n[        R                  " UUUSX5   U R                  X#U   5      S9PM2     sn5      U l	        [        R                  " [        [        U5      5       Vs/ s H,  n[        R                  " UUUSSU R                  US5      S9PM.     sn5      U l
        g s  snf s  snf )Nr   )strider   r   )r   r   leaky_relu_sloper   r   r   lenr   get_paddingconvs1convs2)r   channelsr   r   r   r   _r   s          r*   r   HifiGanResidualBlock.__init__  s     0mm s8}-
 .A 		%[ ,,[1+F .

 mm s8}-
 .A 		 ,,[!< .



s   7C%%3C*c                     X-  U-
  S-  $ )Nr_   r   )r   r   r   s      r*   r    HifiGanResidualBlock.get_padding  s    &1a77r)   c                 >   [         R                  R                  n[        [         R                  R                  S5      (       a$  [         R                  R                  R                  nU R
                   H  nU" U5        M     U R                   H  nU" U5        M     g Nr   )r   r   r   r   r   r   r   r   r   r   s      r*   apply_weight_norm&HifiGanResidualBlock.apply_weight_norm  si    hh**288,,m<<((33??K[[E ![[E !r)   c                     U R                    H"  n[        R                  R                  U5        M$     U R                   H"  n[        R                  R                  U5        M$     g r2   )r   r   r   r   r   r   s     r*   r   'HifiGanResidualBlock.remove_weight_norm  sB    [[EHH''. ![[EHH''. !r)   c                 (   [        U R                  U R                  5       Hm  u  p#Un[        R                  R                  XR                  5      nU" U5      n[        R                  R                  XR                  5      nU" U5      nX-   nMo     U$ r2   )zipr   r   r   rP   
leaky_relur   )r   r   conv1conv2residuals        r*   r   HifiGanResidualBlock.forward  sz    T[[9LE$HMM44]DYDYZM!-0MMM44]DYDYZM!-0M)4M : r)   )r   r   r   )r   )r   r      g?r   )r   r    r!   r"   r   r   r   r   r   r(   r   r   s   @r*   r   r     s!    
>8/ r)   r   c                      ^  \ rS rSrS\4U 4S jjrS rS r SS\R                  S\R                  S-  S	\R                  4S
 jjr
SrU =r$ )VitsHifiGani  r   c                 `  > [         TU ]  5         Xl        [        UR                  5      U l        [        UR                  5      U l        [        R                  " UR                  UR                  SSSS9U l        [        R                  " 5       U l        [        [!        UR                  UR"                  5      5       Ha  u  nu  p4U R                  R%                  [        R&                  " UR                  SU-  -  UR                  SUS-   -  -  UUXC-
  S-  S95        Mc     [        R                  " 5       U l        [+        [        U R                  5      5       Hp  nUR                  SUS-   -  -  n[!        UR                  UR,                  5       H4  u  pFU R(                  R%                  [/        XTXaR0                  5      5        M6     Mr     [        R                  " WSSSSSS9U l        UR4                  S:w  a2  [        R                  " UR4                  UR                  S5      U l        g g )	N   r   r   )r   r   r   r_   F)r   r   r   biasr   )r   r   r   r   resblock_kernel_sizesnum_kernelsupsample_ratesnum_upsamplesr   r   r   upsample_initial_channelr   r   	upsampler	enumerater   upsample_kernel_sizesr   ConvTranspose1d	resblocksr   resblock_dilation_sizesr   r   	conv_postr   cond)r   r   r   upsample_rater   r   r   r   s          r*   r   VitsHifiGan.__init__  s   v;;< !6!67		++
 /8V=R=RTZTpTp9q/r+A+NN!!""331=33a!eE +((8Q> 0s s4>>*+A661Q<HH),V-I-I6KiKi)j%%%&:8RZ\s\s&tu *k ,
 8QAaQRY^_((A-		&"?"?A`A`bcdDI .r)   c                 N   [         R                  R                  n[        [         R                  R                  S5      (       a$  [         R                  R                  R                  nU R
                   H  nU" U5        M     U R                   H  nUR                  5         M     g r   )r   r   r   r   r   r  r  r   r   s      r*   r   VitsHifiGan.apply_weight_norm  sm    hh**288,,m<<((33??K^^E $^^E##% $r)   c                     U R                    H"  n[        R                  R                  U5        M$     U R                   H  nUR                  5         M     g r2   )r  r   r   r   r  r   s     r*   r   VitsHifiGan.remove_weight_norm  s<    ^^EHH''. $^^E$$& $r)   Nr   r   returnc                    U R                  U5      nUb  X0R                  U5      -   n[        U R                  5       H  n[        R
                  R                  X0R                  R                  5      nU R                  U   " U5      nU R                  X@R                  -     " U5      n[        SU R                  5       H)  nXPR                  X@R                  -  U-      " U5      -  nM+     XPR                  -  nM     [        R
                  R                  U5      nU R                  U5      n[        R                  " U5      nU$ )a  
Converts a spectrogram into a speech waveform.

Args:
    spectrogram (`torch.FloatTensor` of shape `(batch_size, config.spectrogram_bins, sequence_length)`):
        Tensor containing the spectrograms.
    global_conditioning (`torch.FloatTensor` of shape `(batch_size, config.speaker_embedding_size, 1)`, *optional*):
        Tensor containing speaker embeddings, for multispeaker models.

Returns:
    `torch.FloatTensor`: Tensor of shape shape `(batch_size, 1, num_frames)` containing the speech waveform.
r   )r   r
  r   r  r   rP   r   r   r   r  r  r   r	  r$   r3   )r   r   r   r   r   	res_statejr   s           r*   r   VitsHifiGan.forward  s    k2*)II6I,JJMt))*AMM44]KKD`D`aM NN1-m<Mq+;+;';<]KI1d../^^A0@0@,@1,DEmTT	 0%(8(88M + 00?}5::m,r)   )r
  r   r	  r   r   r  r  r  r2   )r   r    r!   r"   r   r   r   r   r$   r%   r   r(   r   r   s   @r*   r   r     s]    "ez "eH&' _c  ,, CHCTCTW[C[ 			   r)   r   c                   :   ^  \ rS rSrS\4U 4S jjrSS jrSrU =r$ )VitsResidualCouplingLayeri*  r   c                 >  > [         TU ]  5         UR                  S-  U l        [        R
                  " U R                  UR                  S5      U l        [        XR                  S9U l
        [        R
                  " UR                  U R                  S5      U l        g )Nr_   r   r   )r   r   r   half_channelsr   r   r   r   r    prior_encoder_num_wavenet_layersr   r	  r   s     r*   r   "VitsResidualCouplingLayer.__init__+  ss    #--2		$"4"4f6H6H!L"66]6]^6#5#5t7I7I1Mr)   c                    [         R                  " XR                  /S-  SS9u  pVU R                  U5      U-  nU R	                  XrU5      nU R                  U5      U-  n[         R                  " U5      n	U(       dP  X[         R                  " U	5      -  U-  -   n[         R                  " XV/SS9n
[         R                  " U	SS/5      nX4$ Xh-
  [         R                  " U	* 5      -  U-  n[         R                  " XV/SS9n
U
S 4$ )Nr_   r   r[   )
r$   r   r  r   r   r	  rL   rO   catrh   )r   rC   r   r   rG   
first_halfsecond_halfr   r   r   rT   log_determinants               r*   r   !VitsResidualCouplingLayer.forward3  s    "'++f7I7I6JQ6NTU"V
j1L@]BUV~~m,|;%%d+
uyy/D!D|!SSKii 9qAG#ii
QF;O++&-J;1GG,VKii 9qAGD= r)   )r	  r   r  r   NFr   r   s   @r*   r  r  *  s    Nz N! !r)   r  c                   :   ^  \ rS rSrS\4U 4S jjrSS jrSrU =r$ )VitsResidualCouplingBlockiE  r   c                    > [         TU ]  5         [        R                  " 5       U l        [        UR                  5       H'  nU R                  R                  [        U5      5        M)     g r2   )	r   r   r   r   flowsr   prior_encoder_num_flowsr   r  r   r   r   r   s      r*   r   "VitsResidualCouplingBlock.__init__F  sH    ]]_
v556AJJ7?@ 7r)   c                     U(       d8  U R                    H&  nU" XU5      u  p[        R                  " US/5      nM(     U$ [        U R                   5       H%  n[        R                  " US/5      nU" XUSS9u  pM'     U$ )Nr   TrG   )r&  r$   flipreversed)r   rC   r   r   rG   flowr   s          r*   r   !VitsResidualCouplingBlock.forwardL  s}    

 7JK	FQC0 #  !,FQC0 7JTXY	 - r)   )r&  r"  r   r   s   @r*   r$  r$  E  s    Az A	 	r)   r$  c                   >   ^  \ rS rSrSS\4U 4S jjjrSS jrSrU =r$ )VitsDilatedDepthSeparableConviX  r   c                 N  > [         TU ]  5         UR                  nUR                  nUR                  U l        [        R                  " U5      U l        [        R                  " 5       U l
        [        R                  " 5       U l        [        R                  " 5       U l        [        R                  " 5       U l        [        U R
                  5       H  nX5-  nX6-  U-
  S-  nU R                  R                  [        R                   " UUUUUUS95        U R                  R                  [        R                   " XDS5      5        U R                  R                  [        R"                  " U5      5        U R                  R                  [        R"                  " U5      5        M     g )Nr_   )r   r   r   groupsr   r   r   )r   r   duration_predictor_kernel_sizer   depth_separable_num_layersr   r   r   r   r   convs_dilatedconvs_pointwisenorms_1norms_2r   r   r   	LayerNorm)	r   r   dropout_rater   r   r   r   r   r   s	           r*   r   &VitsDilatedDepthSeparableConv.__init__Y  s,   ;;%% ;;zz,/]]_!}}}}}}t'A"~H"-8Q>G%%		 (!) +#%#	   ''		(a(HILLX 67LLX 67 (r)   c                 "   Ub  X-   n[        U R                  5       H  nU R                  U   " X-  5      nU R                  U   " UR	                  SS5      5      R	                  SS5      n[
        R                  R                  U5      nU R                  U   " U5      nU R                  U   " UR	                  SS5      5      R	                  SS5      n[
        R                  R                  U5      nU R                  U5      nX-   nM     X-  $ Nr   rA   )r   r   r6  r8  	transposer   rP   gelur7  r9  r   )r   rC   r   r   r   r   s         r*   r   %VitsDilatedDepthSeparableConv.forwardu  s    *1Ft'A ..q1&2GHM LLOM,C,CAr,JKUUVWY[\MMM..}=M 003MBM LLOM,C,CAr,JKUUVWY[\MMM..}=M LL7M+F ( $$r)   )r6  r7  r   r8  r9  r   )rB   r2   r   r   s   @r*   r1  r1  X  s    8z 8 88% %r)   r1  c                   :   ^  \ rS rSrS\4U 4S jjrSS jrSrU =r$ )VitsConvFlowi  r   c                   > [         TU ]  5         UR                  U l        UR                  S-  U l        UR                  U l        UR                  U l	        [        R                  " U R
                  U R                  S5      U l        [        U5      U l        [        R                  " U R                  U R
                  U R                  S-  S-
  -  S5      U l        g )Nr_   r   r   )r   r   r   filter_channelsdepth_separable_channelsr  duration_predictor_flow_binsro   duration_predictor_tail_boundrH   r   r   r   r1  conv_ddsr   r   s     r*   r   VitsConvFlow.__init__  s    %11#<<A;; >>		$"4"4d6J6JAN5f=4#7#79K9Kt}}_`O`cdOd9eghir)   c           	         [         R                  " XR                  /S-  SS9u  pVU R                  U5      nU R	                  XrU5      nU R                  U5      U-  nUR                  u  pn
UR                  XSU
5      R                  SSSS5      nUSS U R                  24   [        R                  " U R                  5      -  nUSU R                  SU R                  -  24   [        R                  " U R                  5      -  nUSSU R                  -  S 24   n[        UUUUUU R                  S9u  pn[         R                  " XV/SS9U-  nU(       d  [         R                   " X-  SS/5      nUU4$ US 4$ )	Nr_   r   r[   rA   r   r   .)rG   rH   )r$   r   r  r   rI  r   rc   reshapepermutero   mathrl   rE  rW   rH   r  rh   )r   rC   r   r   rG   r  r  r   
batch_sizer   lengthrD   rE   rF   rU   rT   r   s                    r*   r   VitsConvFlow.forward  s}   "'++f7I7I6JQ6NTU"V
j1mCVW}5D'1'7'7$
f%--jBOWWXY[\^_abc+C4==,@ADIIdNbNbDcc,S$--!dmmBS2S-STW[W`W`aeauauWvv#0a$--6G6I1I#J #K $$
  ))Z51=L#ii(BQFKOO++D= r)   )rI  r   r   rE  r  ro   rH   r"  r   r   s   @r*   rC  rC    s    	jz 	j! !r)   rC  c                   :   ^  \ rS rSrS\4U 4S jjrSS jrSrU =r$ )VitsElementwiseAffinei  r   c                 ,  > [         TU ]  5         UR                  U l        [        R
                  " [        R                  " U R                  S5      5      U l        [        R
                  " [        R                  " U R                  S5      5      U l	        g Nr   )
r   r   rF  r   r   	Parameterr$   zeros	translate	log_scaler   s     r*   r   VitsElementwiseAffine.__init__  sY    77ekk$--&CDekk$--&CDr)   c                 8   U(       d]  U R                   [        R                  " U R                  5      U-  -   nXR-  n[        R                  " U R                  U-  SS/5      nXV4$ XR                   -
  [        R                  " U R                  * 5      -  U-  nUS 4$ Nr   r_   )rX  r$   rO   rY  rh   )r   rC   r   r   rG   rT   r   s          r*   r   VitsElementwiseAffine.forward  s    nnuyy'@6'IIG,G#ii(E1vNO++.%))T^^O2LL|[GD= r)   )r   rY  rX  r"  r   r   s   @r*   rS  rS    s    Ez E! !r)   rS  c                   2   ^  \ rS rSrU 4S jrSS jrSrU =r$ )VitsStochasticDurationPredictori  c                   > [         TU ]  5         UR                  nUR                  n[        R
                  " X3S5      U l        [        R
                  " X3S5      U l        [        UUR                  S9U l
        US:w  a  [        R
                  " X#S5      U l        [        R                  " 5       U l        U R                  R                  [        U5      5        [!        UR"                  5       H'  nU R                  R                  [%        U5      5        M)     [        R
                  " SUS5      U l        [        R
                  " X3S5      U l        [        UUR                  S9U l        [        R                  " 5       U l        U R,                  R                  [        U5      5        [!        UR"                  5       H'  nU R,                  R                  [%        U5      5        M)     g )Nr   )r;  r   )r   r   r   r   r   r   r   r   r1  duration_predictor_dropoutrI  r
  r   r&  r   rS  r   duration_predictor_num_flowsrC  post_conv_prepost_conv_projpost_conv_dds
post_flows)r   r   	embed_dimrE  r   r   s        r*   r   (VitsStochasticDurationPredictor.__init__  sb   11	 ,,		/AF?QG5::

 >		)a@DI]]_


/78v::;AJJl623 <  YYq/1= ii!L:::

 --/4V<=v::;AOO""<#78 <r)   c                    [         R                  " U5      nU R                  U5      nUb)  [         R                  " U5      nXR                  U5      -   nU R	                  X5      nU R                  U5      U-  nU(       Gd  U R                  U5      nU R                  Xr5      nU R                  U5      U-  n[         R                  " UR                  S5      SUR                  S5      5      R                  UR                  UR                  S9U-  nSn	Un
U R                   H*  nU" XX-   S9u  p[         R                  " U
S/5      n
X-  n	M,     [         R                   " U
SS/SS9u  pU	[         R"                  " [$        R&                  R)                  U5      [$        R&                  R)                  U* 5      -   U-  SS/5      -  n	[         R"                  " S[*        R,                  " S[*        R.                  -  5      US-  -   -  U-  SS/5      U	-
  nU[         R0                  " U5      -
  U-  n[         R,                  " [         R2                  " US5      5      U-  n[         R"                  " U* SS/5      n[         R4                  " X/SS9nU R6                   H*  nU" UX!S9u  nn[         R                  " US/5      nUU-  nM,     [         R"                  " S	[*        R,                  " S[*        R.                  -  5      US-  -   -  U-  SS/5      U-
  nUU-   $ [9        [;        U R6                  5      5      nUS S
 US   /-   n[         R                  " UR                  S5      SUR                  S5      5      R                  UR                  UR                  S9U-  nU H&  n[         R                  " US/5      nU" UX!SS9u  nnM(     [         R                   " USS/SS9u  nnU$ )Nr   r_   )devicedtype)r   r   r[         gh㈵>g      ?rA   T)r   rG   )r$   detachr   r
  rI  r   rc  re  rd  randnsizetorj  rk  rf  r,  r   rh   r   rP   
logsigmoidrN  rN   pir4   	clamp_minr  r&  listr-  )r   rC   r   r   	durationsrG   noise_scaler   random_posteriorlog_determinant_posterior_sumlatents_posteriorr.  r   r  r  logqlog_determinant_sumlatentsnllr&  r   log_durations                         r*   r   'VitsStochasticDurationPredictor.forward  s   f%v&*"',,/B"Cii(;<<Fv4',6 ..y9M ..}KM //>MM INN1-q)..2CDGGv}}djdpdpGq  -.) 059%I_62! %*JJ/@1#$F!-@- ( ',kk2CaVQR&S#J)UYY))*58P8PR\Q\8]]ammpqstou. ) 		$$((1tww;"7;KQ;N"OPS__bcefagh/0 
 $emmJ&??<OJ5??:t#DETJ"'))ZK!Q"@ii 9qAG

+/+b(**Wqc2#6# #
 ))C488AK#8GQJ#GH<WZ[]^Y_`cvvC:$**-.E#2J%),E FKKNAv{{1~>AA^d^j^jAk  **Wqc2!'<]ab
  $kk'Aq6qAOL!r)   )	r
  rI  r   r   r&  re  rc  rd  rf  )NNFrZ   r   r    r!   r"   r   r   r(   r   r   s   @r*   r_  r_    s    9@@  @ r)   r_  c                   2   ^  \ rS rSrU 4S jrSS jrSrU =r$ )VitsDurationPredictori'  c                 p  > [         TU ]  5         UR                  nUR                  n[        R
                  " UR                  5      U l        [        R                  " UR                  X2US-  S9U l
        [        R                  " X1R                  S9U l        [        R                  " X3X"S-  S9U l        [        R                  " X1R                  S9U l        [        R                  " USS5      U l        UR"                  S:w  a2  [        R                  " UR"                  UR                  S5      U l        g g )Nr_   )r   epsr   r   )r   r   r4  "duration_predictor_filter_channelsr   r   ra  r   r   r   conv_1r:  layer_norm_epsnorm_1conv_2norm_2projr   r
  )r   r   r   rE  r   s       r*   r   VitsDurationPredictor.__init__(  s    ;; CCzz&"C"CDii 2 2OZeijZjkll?8M8MNii+fgWghll?8M8MNIIoq!4	((A-		&"?"?ASASUVWDI .r)   c                 `   [         R                  " U5      nUb)  [         R                  " U5      nXR                  U5      -   nU R                  X-  5      n[         R                  " U5      nU R                  UR                  SS5      5      R                  SS5      nU R                  U5      nU R                  X-  5      n[         R                  " U5      nU R                  UR                  SS5      5      R                  SS5      nU R                  U5      nU R                  X-  5      nX-  $ r>  )r$   rn  r
  r  relur  r?  r   r  r  r  )r   rC   r   r   s       r*   r   VitsDurationPredictor.forward7  s    f%*"',,/B"Cii(;<<FV23F#V--a45??2Ff%V23F#V--a45??2Ff%601$$r)   )r
  r  r  r   r  r  r  r2   r  r   s   @r*   r  r  '  s    X% %r)   r  c                     ^  \ rS rSrSrS\4U 4S jjrS\R                  S\	S\	4S jr
   SS
\R                  S\R                  S	-  S\R                  S	-  S\S\\R                  \R                  S	-  4   4
S jjrS rS rS rSrU =r$ )VitsAttentioniL  z?Multi-headed attention with relative positional representation.r   c                   > [         TU ]  5         UR                  U l        UR                  U l        UR                  U l        UR                  U l        U R                  U R
                  -  U l	        U R                  S-  U l
        U R                  U R
                  -  U R                  :w  a&  [        SU R                   SU R
                   S35      e[        R                  " U R                  U R                  UR                  S9U l        [        R                  " U R                  U R                  UR                  S9U l        [        R                  " U R                  U R                  UR                  S9U l        [        R                  " U R                  U R                  UR                  S9U l        U R                  (       a  [        R&                  " [(        R*                  " SU R                  S-  S-   U R                  5      U R                  -  5      U l        [        R&                  " [(        R*                  " SU R                  S-  S-   U R                  5      U R                  -  5      U l        g g )Nrl  zIhidden_size must be divisible by num_attention_heads (got `hidden_size`: z and `num_attention_heads`: z).)r   r   r_   )r   r   r   rg  num_attention_heads	num_headsattention_dropoutr   window_sizehead_dimscalingrd   r   Linearuse_biask_projv_projq_projout_projrV  r$   ro  	emb_rel_k	emb_rel_vr   s     r*   r   VitsAttention.__init__O  s   ++33//!--$..8}}d*MMDNN*t~~=[\`\j\j[k.t~~.>bB 
 iiV__UiiV__UiiV__U		$..$..vW\\%++a9I9IA9MPQ9QSWS`S`*adhdpdp*pqDN\\%++a9I9IA9MPQ9QSWS`S`*adhdpdp*pqDN r)   tensorseq_lenbszc                     UR                  X2U R                  U R                  5      R                  SS5      R	                  5       $ r\  )viewr  r  r?  
contiguous)r   r  r  r  s       r*   _shapeVitsAttention._shapeh  s5    {{3GQQRSUVWbbddr)   Nr   key_value_statesattention_maskoutput_attentionsr  c                    UR                  5       u  pVnU R                  U5      U R                  -  nU R                  U R	                  U5      SU5      n	U R                  U R                  U5      SU5      n
XPR                  -  SU R                  4nU R                  XU5      R                  " U6 nU	R                  " U6 n	U
R                  " U6 n
U	R                  S5      n[        R                  " XR                  SS5      5      nUR                  5       XPR                  -  Xl4:w  a-  [        SXPR                  -  Xl4 SUR                  5        35      eU R                  bX  U R                  U R                  U5      n[        R                   " XR                  SS5      5      nU R#                  U5      nUU-  nUbv  UR                  5       USXl4:w  a"  [        SUSXl4 SUR                  5        35      eUR                  XPR                  Xl5      U-   nUR                  XPR                  -  Xl5      n[$        R&                  R)                  USS	9nU(       a;  UR                  XPR                  Xl5      nUR                  XPR                  -  Xl5      nOSn[$        R&                  R+                  XR*                  U R,                  S
9n[        R                  " UU
5      nUR                  5       XPR                  -  X`R                  4:w  a5  [        SXPR                  X`R                  4 SUR                  5        35      eU R                  bI  U R                  U R.                  U5      nU R1                  U5      n[        R                   " UU5      nUU-  nUR                  XPR                  X`R                  5      nUR                  SS5      nUR3                  XVU R4                  5      nU R7                  U5      nUU4$ )z#Input shape: Batch x Time x ChannelrA   r   r_   z$Attention weights should be of size z	, but is Nrm  z!Attention mask should be of size r[   )ptrainingz `attn_output` should be of size )rp  r  r  r  r  r  r  r  r  r$   bmmr?  rd   r  _get_relative_embeddingsr  matmul'_relative_position_to_absolute_positionr   rP   re   r   r  r  '_absolute_position_to_relative_positionrL  rg  r  )r   r   r  r  r  r  tgt_lenr   query_states
key_statesvalue_states
proj_shapesrc_lenattn_weightskey_relative_embeddingsrelative_logitsrel_pos_biasattn_weights_reshaped
attn_probsattn_outputvalue_relative_embeddingsrelative_weightss                         r*   r   VitsAttention.forwardk  s    (,,.a {{=1DLL@ [[]!;RE
{{4;;}#=r3GNN*B>
{{<#>CCZP__j1
#((*5//!$yy/C/CAq/IJ3#7"JJ6nn8Lg7_6` a %%'(* 
 '&*&C&CDNNT[&\##ll<9Z9Z[]_a9bcOGGXLL(L%""$a(BB 7a8R7SS\]k]p]p]r\st  (,,S..'SVddL',,S>>-A7TL}},,\r,B
 %1$5$5c>>7$\!055cNN6JG]L$(!]]**<<<RVR_R_*`
ii
L9#"6!OO2CR_R_3`2a b$$&') 
 '(,(E(EdnnV](^%#KKJW <<(8:STL<'K!&&sNNG]]S!++Aq1 "))#GmmK0111r)   c           	          [        X R                  S-   -
  S5      nUS:  a%  [        R                  R	                  USSX3SS/5      n[        U R                  S-   U-
  S5      nUSU-  -   S-
  nUS S 2XE24   $ )Nr   r   r_   )rb   r  r   rP   r>   )r   relative_embeddingsrP  
pad_lengthslice_start_positionslice_end_positions         r*   r  &VitsAttention._get_relative_embeddings  s    #3#3a#78!<
>"$--"3"34G!QPZhiklIm"n"D$4$4q$8F#BAF1AJ>B"1&:&M#MNNr)   c                 H   UR                  5       u  p#n[        R                  R                  U/ SQ5      nUR	                  X#S-  U-  /5      n[        R                  R                  USUS-
  SS/5      nUR	                  X#S-   SU-  S-
  /5      nUS S 2S U2US-
  S 24   nU$ )N)r   r   r   r   r   r   r_   r   r   rp  r   rP   r>   r  r   xbatch_headsrP  r   x_flatx_finals          r*   r  5VitsAttention._relative_position_to_absolute_position  s    !"Q MMa!34 qj6&9:;""6Avz1a+@A ++{QJF
QGH!WfWfqjl23r)   c           	      >   UR                  5       u  p#n[        R                  R                  USUS-
  SSSS/5      nUR	                  X#SU-  S-
  -  /5      n[        R                  R                  XSSSS/5      nUR	                  X#SU-  /5      S S 2S S 2SS 24   nU$ )Nr   r   r_   r  r  s          r*   r  5VitsAttention._absolute_position_to_relative_position  s    !"Q MMa!VaZAq!!<=F
Q&?@A ""6Aq!+<=++{AJ?@AqrJr)   )r   r  r  rg  r  r  r  r  r  r  r  r  )NNF)r   r    r!   r"   r#   r   r   r$   Tensorr   r  boolr'   r   r  r  r  r(   r   r   s   @r*   r  r  L  s    Irz r2eU\\ eC ec e 15.2"'V2||V2  ,,-V2 t+	V2
  V2 
u||U\\D00	1V2pO
 
r)   r  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )VitsFeedForwardi  c                 t  > [         TU ]  5         [        R                  " UR                  UR
                  UR                  5      U l        [        R                  " UR
                  UR                  UR                  5      U l        [        R                  " UR                  5      U l        [        UR                  [        5      (       a  [        UR                     U l        OUR                  U l        UR                  S:  a.  UR                  S-
  S-  nUR                  S-  nX#SSSS/U l        g S U l        g )Nr   r_   r   )r   r   r   r   r   ffn_dimffn_kernel_sizer  r  r   activation_dropoutr   
isinstance
hidden_actstrr   act_fnr   )r   r   pad_left	pad_rightr   s       r*   r   VitsFeedForward.__init__  s    ii 2 2FNNFDZDZ[ii0B0BFDZDZ[zz&";";<f''-- !2!23DK ++DK!!A%..2q8H..!3I$Aq!<DLDLr)   c                    UR                  SSS5      nUR                  SSS5      nX-  nU R                  b)  [        R                  R	                  XR                  5      nU R                  U5      nU R                  U5      nU R                  U5      nX-  nU R                  b)  [        R                  R	                  XR                  5      nU R                  U5      nX-  nUR                  SSS5      nU$ )Nr   r_   r   )	rM  r   r   rP   r>   r  r  r   r  )r   r   r   s      r*   r   VitsFeedForward.forward  s    %--aA6#++Aq!4%4<<#MM--m\\JMM2M2]3%4<<#MM--m\\JMM2%4%--aA6r)   )r  r  r  r   r   r  r   s   @r*   r  r    s     $ r)   r  c            	          ^  \ rS rSrS\4U 4S jjr  SS\R                  S\R                  S\R                  S-  S\	4S	 jjr
S
rU =r$ )VitsEncoderLayeri  r   c                 d  > [         TU ]  5         [        U5      U l        [        R
                  " UR                  5      U l        [        R                  " UR                  UR                  S9U l        [        U5      U l        [        R                  " UR                  UR                  S9U l        g )Nr  )r   r   r  	attentionr   r   hidden_dropoutr   r:  r   r  
layer_normr  feed_forwardfinal_layer_normr   s     r*   r   VitsEncoderLayer.__init__  sz    &v.zz&"7"78,,v'9'9v?T?TU+F3 "V-?-?VEZEZ [r)   Nr   r   r  r  c                    UnU R                  UUUS9u  pU R                  U5      nU R                  XQ-   5      nUnU R                  X5      nU R                  U5      nU R	                  XQ-   5      nU4nU(       a  Xv4-  nU$ )N)r   r  r  )r  r   r  r  r  )r   r   r   r  r  r   r  rT   s           r*   r   VitsEncoderLayer.forward  s     !&*nn')/ '5 '
# ]3(@A ))-F]3--h.FG "&Gr)   )r  r   r  r  r  r"  )r   r    r!   r"   r   r   r$   r  r%   r  r   r(   r   r   s   @r*   r  r    s\    \z \ /3"'|| '' t+	
   r)   r  c                      ^  \ rS rSrS\4U 4S jjr    SS\R                  S\R                  S\R                  S-  S\	S-  S	\	S-  S
\	S-  S\
\-  4S jjrSrU =r$ )VitsEncoderi9  r   c                    > [         TU ]  5         Xl        [        R                  " [        UR                  5       Vs/ s H  n[        U5      PM     sn5      U l        SU l	        UR                  U l
        g s  snf r"  )r   r   r   r   r   r   num_hidden_layersr  layersgradient_checkpointing	layerdropr(  s      r*   r   VitsEncoder.__init__:  s`    mmuVMeMeGf$gGf!%5f%=Gf$gh&+#)) %hs   A7Nr   r   r  r  output_hidden_statesreturn_dictr  c                 2   U(       a  SOS nU(       a  SOS n[        U R                  UUS9nX-  n[        5       =(       d    [        U 5      n	U R                   H  n
U(       a  Xq4-   n[
        R                  R                  SS5      nU R                  =(       a    XR                  :  nU(       a  U	(       a  U
" UUUUS9nUS   nU(       a  SnU(       d  M~  UWS   4-   nM     X-  nU(       a  Xq4-   nU(       d  [        S XU4 5       5      $ [        UUUS9$ )	Nr   )r   inputs_embedsr  r   r   )r  r   r  )NNc              3   .   #    U  H  oc  M  Uv   M     g 7fr2   r   ).0vs     r*   	<genexpr>&VitsEncoder.forward.<locals>.<genexpr>u  s     m$[q$[s   	)r.   r   r   )r   r   r	   r
   r  rM   randomuniformr  r  r'   r   )r   r   r   r  r  r  r  all_hidden_statesall_self_attentionssynced_gpusencoder_layerdropout_probabilityskip_the_layerlayer_outputss                 r*   r   VitsEncoder.forwardA  s)    #7BD$5b42;;')
 &402R6LT6R![[M#$58H$H! #%))"3"3Aq"9!]]U0Cnn0TN![ -!#1!-&7	! !.a 0 ,  &9]1=M<O&O#- )0 &4 14D Dm]GZ$[mmm++*
 	
r)   )r   r  r  r  )NNNN)r   r    r!   r"   r   r   r$   r%   r  r  r'   r   r   r(   r   r   s   @r*   r  r  9  s    *z * /3)-,0#':
((:
 '':
 t+	:

  $;:
 #Tk:
 D[:
 
	 :
 :
r)   r  c                      ^  \ rS rSrSrS\4U 4S jjr    SS\R                  S\R                  S\R                  S-  S	\
S-  S
\
S-  S\
S-  S\\R                     \-  4S jjrSrU =r$ )VitsTextEncoderi~  zk
Transformer encoder that uses relative positional representation instead of absolute positional encoding.
r   c                 (  > [         TU ]  5         Xl        [        R                  " UR
                  UR                  UR                  5      U l        [        U5      U l
        [        R                  " UR                  UR                  S-  SS9U l        g )Nr_   r   )r   )r   r   r   r   	Embedding
vocab_sizer   pad_token_idembed_tokensr  encoderr   r   projectr   s     r*   r   VitsTextEncoder.__init__  sm    LL):):F<N<NPVPcPcd"6*yy!3!3V5E5E5IWXYr)   N	input_idsr   r  r  r  r  r  c           	         U R                  U5      [        R                  " U R                  R                  5      -  nU R                  UUUUUUS9nU(       d  US   OUR                  n	U R                  U	R                  SS5      5      R                  SS5      U-  n
[        R                  " XR                  R                  SS9u  pU(       d  XU4USS  -   nU$ [        U	UUUR                  UR                  S9$ )N)r   r   r  r  r  r  r   r   r_   r[   )r.   r/   r0   r   r   )r  rN  rl   r   r   r  r.   r  r?  r$   r   r   r,   r   r   )r   r  r   r  r  r  r  r   encoder_outputsr.   r   r/   r0   rT   s                 r*   r   VitsTextEncoder.forward  s     )))4tyyAXAX7YY,,'%)/!5# ' 
 7BOA.GhGh.88A>?II!QOR^^+0;;ukk>S>SYZ+[((7JKo^_^`NaaGN$/# 3)77&11
 	
r)   )r   r  r  r  )NNNT)r   r    r!   r"   r#   r   r   r$   r  r%   r  r'   r,   r   r(   r   r   s   @r*   r  r  ~  s    Zz Z /3)-,0#'#
<<#
 ''#
 t+	#

  $;#
 #Tk#
 D[#
 
u||	4	4#
 #
r)   r  c                   v    \ rS rSr% \\S'   SrSrSr\	R                  " 5       S\R                  4S j5       rSrg	)
VitsPreTrainedModeli  r   vitsr  Tmodulec                 f   U R                   R                  n[        U[        R                  5      (       aO  [
        R                  " UR                  SUS9  UR                  b!  [
        R                  " UR                  5        gg[        U[        R                  5      (       aA  [
        R                  " UR                  5        [
        R                  " UR                  5        g[        U[        R                  [        R                  45      (       a  [
        R                  " UR                  5        UR                  b_  [        R                   " UR"                  UR$                  UR&                  S   -  -  5      n[
        R(                  " UR                  U* US9  gg[        U[        R*                  5      (       ay  [
        R                  " UR                  SUS9  UR,                  bK  [/        UR                  SS5      (       d.  [
        R                  " UR                  UR,                     5        ggg[        U[0        5      (       a  U R                   R2                  (       ar  U R                   R4                  U R                   R6                  -  n[
        R                  " UR8                  US-  S	9  [
        R                  " UR:                  US-  S	9  gg[        U[<        5      (       aA  [
        R                  " UR>                  5        [
        R                  " UR@                  5        gg)
zInitialize the weightsrB   )r   stdNr   )r   r   _is_hf_initializedFrl  )r!  )!r   initializer_ranger  r   r  initnormal_r   r   zeros_r:  ones_r   r  kaiming_normal_rN  rl   r3  r   r   uniform_r  padding_idxgetattrr  r  r   r  r  r  rS  rX  rY  )r   r  r!  kr  s        r*   _init_weights!VitsPreTrainedModel._init_weights  s    kk++fbii((LLSc:{{&FKK( '--KK$JJv}}%B,>,> ?@@  /{{&IIfmmv/A/AFDVDVWXDY/YZ[fkkaR15 ' --LLSc:!!-gfmmMach6i6iFMM&*<*<=> 7j-..{{&&;;22dkk6U6UUV--8T>BV--8T>B '  566KK(()KK(() 7r)   r   N)r   r    r!   r"   r   r&   base_model_prefixmain_input_namesupports_gradient_checkpointingr$   no_gradr   Moduler-  r(   r   r)   r*   r  r    s<    !O&*#
]]_*BII * *r)   r  z@
    The complete VITS model, for text-to-speech synthesis.
    c                      ^  \ rS rSrS\4U 4S jjr\        SS\R                  S-  S\R                  S-  S\	S-  S\
S-  S	\
S-  S
\
S-  S\R                  S-  S\S-  S\\   \-  4S jj5       rSrU =r$ )	VitsModeli  r   c                   > [         TU ]  U5        Xl        [        U5      U l        [        U5      U l        [        U5      U l        UR                  (       a  [        U5      U l        O[        U5      U l        UR                  S:  a0  [        R                  " UR                  UR                   5      U l        [%        U5      U l        UR(                  U l        UR*                  U l        UR,                  U l        U R/                  5         g rU  )r   r   r   r  text_encoderr$  r.  r   decoder"use_stochastic_duration_predictionr_  duration_predictorr  num_speakersr   r  r   embed_speakerr   posterior_encoderspeaking_raterw  noise_scale_duration	post_initr   s     r*   r   VitsModel.__init__  s     +F3-f5	"6*44&Ef&MD#&;F&CD#"!#f.A.A6C`C`!aD "6f!= $11!--$*$?$?! 	r)   Nr  r  
speaker_idr  r  r  labelsr>  r  c	           	         Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nUb  [	        S5      eU R
                  R                  R                  R                  n
Ub!  UR                  S5      R                  U
5      nO4[        R                  " U5      R                  S5      R                  U
5      nU R                   R                  S:  a  Ub  SUs=::  a  U R                   R                  :  d(  O  [        SU R                   R                  S-
   S35      e[        U[         5      (       a  [        R"                  " SX0R$                  S	9nU R'                  U5      R                  S5      nOSnU R                  UUUUUUS
9nU(       d  US   OUR(                  nUR+                  SS5      nUR+                  SS5      nU(       d  US   OUR,                  nU(       d  US   OUR.                  nU R                   R0                  (       a  U R3                  UUUSU R4                  S9nOU R3                  XU5      nUc  U R6                  nSU-  n[        R8                  " [        R:                  " U5      U-  U-  5      n[        R<                  " [        R>                  " USS/5      S5      RA                  5       n[        RB                  " URE                  5       UR                  UR$                  S9nUR                  S5      UR                  S5      :  nUR                  S5      R                  UR                  5      n[        R                  " US5      [        R                  " US5      -  nURF                  u  nnnn[        RH                  " US5      RK                  UU-  S5      n[        RB                  " UUR                  UR$                  S9nUR                  S5      U:  nUR                  UR                  5      RK                  UUU5      nU[L        RN                  RQ                  U/ SQ5      SS2SS24   -
  nUR                  S5      R+                  SS5      U-  n[        RR                  " URU                  S5      U5      R+                  SS5      n[        RR                  " URU                  S5      U5      R+                  SS5      nU[        RV                  " U5      [        R:                  " U5      -  U RX                  -  -   n U R[                  U UUSS9n!U!U-  n"U R]                  U"U5      n#U#RU                  S5      n#U[^        R`                  " U R                   Rb                  5      -  n$U(       d  U#U$U"4USS -   n%U%$ [e        U#U$U"URf                  URh                  S9$ )a_  
speaker_id (`int`, *optional*):
    Which speaker embedding to use. Only used for multispeaker models.
labels (`torch.FloatTensor` of shape `(batch_size, config.spectrogram_bins, sequence_length)`, *optional*):
    Float values of target spectrogram. Timesteps set to `-100.0` are ignored (masked) for the loss
    computation.
speaking_rate (`float`, *optional*):
    Speaking rate.

Example:

```python
>>> from transformers import VitsTokenizer, VitsModel, set_seed
>>> import torch

>>> tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-eng")
>>> model = VitsModel.from_pretrained("facebook/mms-tts-eng")

>>> inputs = tokenizer(text="Hello - my dog is cute", return_tensors="pt")

>>> set_seed(555)  # make deterministic

>>> with torch.no_grad():
...     outputs = model(inputs["input_ids"])
>>> outputs.waveform.shape
torch.Size([1, 45824])
```
Nz&Training of VITS is not supported yet.rA   r   r   z Set `speaker_id` in the range 0-.r   )rp  
fill_valuerj  )r  r   r  r  r  r  r_   T)rG   rw  rZ   )rk  rj  )r   r   r   r   r   r   r   r+  )r   r   r   r   r   )5r   r  r  r  NotImplementedErrorr7  r  r   rk  	unsqueezerq  r$   	ones_liker;  rd   r  r   fullrj  r<  r.   r?  r/   r0   r9  r:  r?  r>  ceilrO   rt  rh   longarangerb   rc   rf   r  r   rP   r>   r  squeezer   rw  r.  r8  rM   prodr   r   r   r   )&r   r  r  rB  r  r  r  rC  r>  kwargs
mask_dtypeinput_padding_maskspeaker_embeddingstext_encoder_outputr   r/   r0   r  length_scaledurationpredicted_lengthsindicesoutput_padding_mask	attn_maskrO  r   output_lengthinput_lengthcum_durationvalid_indicespadded_indicesattnprior_latentsr}  r   r   r   rT   s&                                         r*   r   VitsModel.forward  sF   R 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++BYBY%&NOO&&33::@@
%!/!9!9"!=!@!@!L!&!;!E!Eb!I!L!LZ!X;;##a'J,B
=T[[%=%== #CDKKD\D\_`D`Caab!cdd*c**"ZZTjQ\Q\]
!%!3!3J!?!I!I"!M!%"//+)/!5# 0 
 7B+A.GZGlGl%//15/99!Q?4?)!,EXEdEd<G1!4M`MtMt;;9922"" 55 3 L  22=VhiL  ..M]*::eii58JJ\YZ!OOEIIhA,GKPPR ,,0446>O>U>U^o^v^vw%//25F5P5PQR5SS1;;A>AABTBZBZ[ OO$6:U__M`bd=ee	5>__2
A}l||Hb166zL7PRST,,}HNN8??[))!,|;%((9>>z<Yfg&):):=J\)]^_adbdad^d)ee''*44Q:YF ll4<<?K@JJ1aP#ll4<<?<OPZZ[\^_`#e&6&6{&CeiiPcFd&dgkgwgw&ww))M+>@R\`)a 33<<-?@##A&,rwwt{{7Q7Q/RR!1;?BUVWVXBYYGN-#-;;*55
 	
r)   )
r   r8  r:  r<  r.  rw  r?  r=  r>  r7  )NNNNNNNN)r   r    r!   r"   r   r   r   r$   r  r   r  r%   floatr'   r   r   r   r(   r   r   s   @r*   r5  r5    s    z 4  *..2!%)-,0#'+/&*D
<<$&D
 t+D
 $J	D

  $;D
 #TkD
 D[D
 !!D(D
 t|D
 
so	%D
 D
r)   r5  )Fg      @MbP?rd  rd  )@r#   rN  dataclassesr   typingr   numpyrM   r$   r    r   r$  activationsr   integrations.deepspeedr	   integrations.fsdpr
   masking_utilsr   modeling_layersr   modeling_outputsr   r   modeling_utilsr   r   r   r   r   configuration_vitsr   
get_loggerr   loggerr   r,   jitscriptr<   rW   rQ   r3  r   r   r   r   r  r$  r1  rC  rS  r_  r  r  r  r  r  r  r  r5  __all__r   r)   r*   <module>rv     sT     !     & ! @ 7 6 9 < - D D * 
		H	% 
 7k 7 7$ 
 7K 7 7   G TG%TM5%((// M5`)299 )&;299 ;|U")) Up!		 !6		 &+%BII +%\(!299 (!V!BII !$a bii a H"%BII "%JYBII Yx'bii 'T$1 $NB
")) B
J/
bii /
d "*/ "* "*J 
`
# `

`
F -
.r)   