
    Z j{"                     d    S r SSKJr  SSKJr  SSKJr  \" SS9\ " S S	\5      5       5       rS	/rg
)zVITS model configuration    )strict   )PreTrainedConfig)auto_docstringzfacebook/mms-tts-eng)
checkpointc                      \ rS rSr% SrSrSr\\S'   Sr	\\S'   Sr
\\S	'   S
r\\S'   Sr\\S'   Sr\\S'   Sr\\S'   Sr\\-  \S'   Sr\\S'   Sr\\S'   Sr\\S'   Sr\\S'   Sr\\-  \S'   Sr\\-  \S'   Sr\\-  \S'   Sr\\S'   S r\\S!'   Sr\\S"'   S#r\\S$'   S%r\\S&'   S'r\\S('   S)r \!\   \"\S*4   -  \S+'   S,r#\!\   \"\S*4   -  \S-'   S.r$\!\   \"\S*4   -  \S/'   S0r%\!\"-  \S1'   Sr&\\S2'   S
r'\\S3'   Sr(\\S4'   S5r)\\S6'   S7r*\\S8'   Sr+\\S9'   S:r,\\-  \S;'   Sr-\\S<'   S=r.\\S>'   Sr/\\S?'   Sr0\\S@'   SAr1\\SB'   SCr2\\SD'   S#r3\\SE'   SFr4\\-  \SG'   SHr5\\-  \SI'   SJr6\\SK'   SLr7\\SM'   SNr8\\SO'   SPr9\SP-  \SQ'   SR r:SSr;gP)T
VitsConfig   a  
window_size (`int`, *optional*, defaults to 4):
    Window size for the relative positional embeddings in the attention layers of the Transformer encoder.
use_bias (`bool`, *optional*, defaults to `True`):
    Whether to use bias in the key, query, value projection layers in the Transformer encoder.
ffn_kernel_size (`int`, *optional*, defaults to 3):
    Kernel size of the 1D convolution layers used by the feed-forward network in the Transformer encoder.
flow_size (`int`, *optional*, defaults to 192):
    Dimensionality of the flow layers.
spectrogram_bins (`int`, *optional*, defaults to 513):
    Number of frequency bins in the target spectrogram.
use_stochastic_duration_prediction (`bool`, *optional*, defaults to `True`):
    Whether to use the stochastic duration prediction module or the regular duration predictor.
num_speakers (`int`, *optional*, defaults to 1):
    Number of speakers if this is a multi-speaker model.
speaker_embedding_size (`int`, *optional*, defaults to 0):
    Number of channels used by the speaker embeddings. Is zero for single-speaker models.
upsample_initial_channel (`int`, *optional*, defaults to 512):
    The number of input channels into the HiFi-GAN upsampling network.
upsample_rates (`tuple[int]` or `list[int]`, *optional*, defaults to `[8, 8, 2, 2]`):
    A tuple of integers defining the stride of each 1D convolutional layer in the HiFi-GAN upsampling network.
    The length of `upsample_rates` defines the number of convolutional layers and has to match the length of
    `upsample_kernel_sizes`.
upsample_kernel_sizes (`tuple[int]` or `list[int]`, *optional*, defaults to `[16, 16, 4, 4]`):
    A tuple of integers defining the kernel size of each 1D convolutional layer in the HiFi-GAN upsampling
    network. The length of `upsample_kernel_sizes` defines the number of convolutional layers and has to match
    the length of `upsample_rates`.
resblock_kernel_sizes (`tuple[int]` or `list[int]`, *optional*, defaults to `[3, 7, 11]`):
    A tuple of integers defining the kernel sizes of the 1D convolutional layers in the HiFi-GAN
    multi-receptive field fusion (MRF) module.
resblock_dilation_sizes (`tuple[tuple[int]]` or `list[list[int]]`, *optional*, defaults to `[[1, 3, 5], [1, 3, 5], [1, 3, 5]]`):
    A nested tuple of integers defining the dilation rates of the dilated 1D convolutional layers in the
    HiFi-GAN multi-receptive field fusion (MRF) module.
leaky_relu_slope (`float`, *optional*, defaults to 0.1):
    The angle of the negative slope used by the leaky ReLU activation.
depth_separable_channels (`int`, *optional*, defaults to 2):
    Number of channels to use in each depth-separable block.
depth_separable_num_layers (`int`, *optional*, defaults to 3):
    Number of convolutional layers to use in each depth-separable block.
duration_predictor_flow_bins (`int`, *optional*, defaults to 10):
    Number of channels to map using the unonstrained rational spline in the duration predictor model.
duration_predictor_tail_bound (`float`, *optional*, defaults to 5.0):
    Value of the tail bin boundary when computing the unconstrained rational spline in the duration predictor
    model.
duration_predictor_kernel_size (`int`, *optional*, defaults to 3):
    Kernel size of the 1D convolution layers used in the duration predictor model.
duration_predictor_dropout (`float`, *optional*, defaults to 0.5):
    The dropout ratio for the duration predictor model.
duration_predictor_num_flows (`int`, *optional*, defaults to 4):
    Number of flow stages used by the duration predictor model.
duration_predictor_filter_channels (`int`, *optional*, defaults to 256):
    Number of channels for the convolution layers used in the duration predictor model.
prior_encoder_num_flows (`int`, *optional*, defaults to 4):
    Number of flow stages used by the prior encoder flow model.
prior_encoder_num_wavenet_layers (`int`, *optional*, defaults to 4):
    Number of WaveNet layers used by the prior encoder flow model.
posterior_encoder_num_wavenet_layers (`int`, *optional*, defaults to 16):
    Number of WaveNet layers used by the posterior encoder model.
wavenet_kernel_size (`int`, *optional*, defaults to 5):
    Kernel size of the 1D convolution layers used in the WaveNet model.
wavenet_dilation_rate (`int`, *optional*, defaults to 1):
    Dilation rates of the dilated 1D convolutional layers used in the WaveNet model.
wavenet_dropout (`float`, *optional*, defaults to 0.0):
    The dropout ratio for the WaveNet layers.
speaking_rate (`float`, *optional*, defaults to 1.0):
    Speaking rate. Larger values give faster synthesised speech.
noise_scale (`float`, *optional*, defaults to 0.667):
    How random the speech prediction is. Larger values create more variation in the predicted speech.
noise_scale_duration (`float`, *optional*, defaults to 0.8):
    How random the duration prediction is. Larger values create more variation in the predicted durations.

Example:

```python
>>> from transformers import VitsModel, VitsConfig

>>> # Initializing a "facebook/mms-tts-eng" style configuration
>>> configuration = VitsConfig()

>>> # Initializing a model (with random weights) from the "facebook/mms-tts-eng" style configuration
>>> model = VitsModel(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
```vits&   
vocab_size   hidden_size   num_hidden_layers   num_attention_heads   window_sizeTuse_biasi   ffn_dimg?	layerdropr   ffn_kernel_size	flow_sizei  spectrogram_binsrelu
hidden_acthidden_dropoutattention_dropoutactivation_dropoutg{Gz?initializer_rangegh㈵>layer_norm_eps"use_stochastic_duration_prediction   num_speakersr   speaker_embedding_sizei   upsample_initial_channel)   r(   r   r   .upsample_rates)   r*   r   r   upsample_kernel_sizes)r         resblock_kernel_sizes)r$   r      r/   r/   resblock_dilation_sizesleaky_relu_slopedepth_separable_channelsdepth_separable_num_layers
   duration_predictor_flow_binsg      @duration_predictor_tail_boundduration_predictor_kernel_sizeg      ?duration_predictor_dropoutduration_predictor_num_flows   "duration_predictor_filter_channelsprior_encoder_num_flows prior_encoder_num_wavenet_layersr*   $posterior_encoder_num_wavenet_layersr0   wavenet_kernel_sizewavenet_dilation_rateg        wavenet_dropoutg      ?speaking_rategMbX?noise_scaleg?noise_scale_durationi>  sampling_rateNpad_token_idc                     [        U R                  5      [        U R                  5      :w  a8  [        S[        U R                  5       S[        U R                  5       S35      eg)zOPart of `@strict`-powered validation. Validates the architecture of the config.z'The length of `upsample_kernel_sizes` (z-) must match the length of `upsample_rates` ()N)lenr+   r)   
ValueError)selfs    |/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/vits/configuration_vits.pyvalidate_architecture VitsConfig.validate_architecture   se    t))*c$2E2E.FF9#d>X>X:Y9Z [%%()<)<%=$>aA  G     )<__name__
__module____qualname____firstlineno____doc__
model_typer   int__annotations__r   r   r   r   r   boolr   r   floatr   r   r   r   strr   r   r    r!   r"   r#   r%   r&   r'   r)   listtupler+   r.   r1   r2   r3   r4   r6   r7   r8   r9   r:   r<   r=   r>   r?   r@   rA   rB   rC   rD   rE   rF   rG   rN   __static_attributes__rQ   rP   rM   r	   r	      sI   Tl JJKs  KHdGS Ius{ OSIscJ"%NECK%%(us{(&))#u# NE /3&3L#"#C#$'c'2>NDIc3h/>9G49uS#X6G9C49uS#X6C,MTE\M!e!$%c%&''(* #*+.!5.*+"C+.11() #).1&1#$S$,-$c-02(#2  !"3"#&OUS[&!$M53;$K"%%%M3#L#*#rP   r	   N)	rV   huggingface_hub.dataclassesr   configuration_utilsr   utilsr   r	   __all__rQ   rP   rM   <module>rd      sJ     . 3 # 12M! M  3M` .rP   