
    Z jTA                         S r SSKJr  SSKrSSKJrJrJrJ	r	  SSK
Jr  SSKJr  SSKJrJrJr  \R$                  " \5      r " S	 S
\5      rS
/rg)z%Feature extractor class for SpeechT5.    )AnyN   )mel_filter_bankoptimal_fft_lengthspectrogramwindow_function)SequenceFeatureExtractor)BatchFeature)PaddingStrategy
TensorTypeloggingc                   8  ^  \ rS rSrSrSS/r            S$S\S\S\S\S	\S
\S\S\	S\S\S\S\4U 4S jjjr
\ S%S\\R                     S\\R                     S\S\\R                     4S jj5       rS\R                  S\R                  4S jr         S&S\R                  \\   -  \\R                     -  \\\      -  S-  S\R                  \\   -  \\R                     -  \\\      -  S-  S\\	-  \-  S\S-  S\S\S-  S\S-  S\	\-  S-  S\S-  S\4S jjr       S'S\R                  \\   -  \\R                     -  \\\      -  S \S\\	-  \-  S\S-  S\S\S-  S\S-  S\	\-  S-  S\4S! jjrS\\	\4   4U 4S" jjrS#rU =r$ )(SpeechT5FeatureExtractor   aC  
Constructs a SpeechT5 feature extractor.

This class can pre-process a raw speech signal by (optionally) normalizing to zero-mean unit-variance, for use by
the SpeechT5 speech encoder prenet.

This class can also extract log-mel filter bank features from raw speech, for use by the SpeechT5 speech decoder
prenet.

This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains
most of the main methods. Users should refer to this superclass for more information regarding those methods.

Args:
    feature_size (`int`, *optional*, defaults to 1):
        The feature dimension of the extracted features.
    sampling_rate (`int`, *optional*, defaults to 16000):
        The sampling rate at which the audio files should be digitalized expressed in hertz (Hz).
    padding_value (`float`, *optional*, defaults to 0.0):
        The value that is used to fill the padding values.
    do_normalize (`bool`, *optional*, defaults to `False`):
        Whether or not to zero-mean unit-variance normalize the input. Normalizing can help to significantly
        improve the performance for some models.
    num_mel_bins (`int`, *optional*, defaults to 80):
        The number of mel-frequency bins in the extracted spectrogram features.
    hop_length (`int`, *optional*, defaults to 16):
        Number of ms between windows. Otherwise referred to as "shift" in many papers.
    win_length (`int`, *optional*, defaults to 64):
        Number of ms per window.
    win_function (`str`, *optional*, defaults to `"hann_window"`):
        Name for the window function used for windowing, must be accessible via `torch.{win_function}`
    fmin (`float`, *optional*, defaults to 80):
        Minimum mel frequency in Hz.
    fmax (`float`, *optional*, defaults to 7600):
        Maximum mel frequency in Hz.
    mel_floor (`float`, *optional*, defaults to 1e-10):
        Minimum value of mel frequency banks..
    return_attention_mask (`bool`, *optional*, defaults to `True`):
        Whether or not [`~SpeechT5FeatureExtractor.__call__`] should return `attention_mask`.
input_valuesattention_maskfeature_sizesampling_ratepadding_valuedo_normalizenum_mel_bins
hop_length
win_lengthwin_functionfminfmax	mel_floorreturn_attention_maskc           
        > [         TU ]  " S	XUS.UD6  X@l        Xl        XPl        X`l        Xpl        Xl        Xl        Xl	        Xl
        Xr-  S-  U l        Xb-  S-  U l        [        U R                  5      U l        U R                  S-  S-   U l        [!        U R                  U R                  SS9U l        [%        U R                  U R                  U R                  U R                  U R&                  SSS9U l        g )
N)r   r   r   i        T)window_lengthnameperiodicslaney)num_frequency_binsnum_mel_filtersmin_frequencymax_frequencyr   norm	mel_scale )super__init__r   r   r   r   r   r   r   r   r   sample_sizesample_strider   n_fftn_freqsr   windowr   r   mel_filters)selfr   r   r   r   r   r   r   r   r   r   r   r   kwargs	__class__s                 ډ/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/speecht5/feature_extraction_speecht5.pyr.   !SpeechT5FeatureExtractor.__init__H   s      	wl_lwpvw(%:"($$(		"%5='74?'(8(89


a1,%D4D4D4K\K\gkl*#|| --)))),,
    returnc                    Ub  [         R                  " U[         R                  5      n/ n[        XR	                  S5      5       Hl  u  pEXDSU R                  5       -
  [         R                  " USU R                  5       S-   5      -  nXVR                  S   :  a  X&US& UR                  U5        Mn     U$ U  Vs/ s H=  owUR                  5       -
  [         R                  " UR                  5       S-   5      -  PM?     nnU$ s  snf )zK
Every array in the list is normalized to have zero mean and unit variance
NgHz>r   )
nparrayint32zipsummeansqrtvarshapeappend)r   r   r   normed_input_valuesvectorlengthnormed_slicexs           r8   zero_mean_unit_var_norm0SpeechT5FeatureExtractor.zero_mean_unit_var_normu   s     %XXnbhh?N"$"%l4F4Fr4J"K &)=)=)? ?2776RYSY?K^K^K`cgKgChh..q11,9)#**<8 #L #" Vb"bUaPQLBGGAEEGdN4K#KUa"b"" #cs   ;ADone_waveformc                     [        UU R                  U R                  U R                  U R                  U R
                  U R                  SS9nUR                  $ )zJ
Extracts log-mel filterbank features for one waveform array (unbatched).
log10)r3   frame_lengthr   
fft_lengthr4   r   log_mel)r   r3   r/   r0   r1   r4   r   T)r5   rO   log_mel_specs      r8   _extract_mel_features.SpeechT5FeatureExtractor._extract_mel_features   sP     #;;))))zz((nn	
 ~~r:   Naudioaudio_targetpadding
max_length
truncationpad_to_multiple_ofreturn_tensorsc
                    Uc  Uc  [        S5      eU	b<  XR                  :w  a,  [        SU  SU R                   SU R                   SU	 S3	5      eO-[        R                  SU R                  R
                   S	35        Ub  U R                  " US
UUUUUU40 U
D6nOSnUb?  U R                  " USUUUUUU40 U
D6nUc  U$ US   US'   UR                  S5      nUb  XS'   U$ )a  
Main method to featurize and prepare for the model one or several sequence(s).

Pass in a value for `audio` to extract waveform features. Pass in a value for `audio_target` to extract log-mel
spectrogram features.

Args:
    audio (`np.ndarray`, `list[float]`, `list[np.ndarray]`, `list[list[float]]`, *optional*):
        The sequence or batch of sequences to be processed. Each sequence can be a numpy array, a list of float
        values, a list of numpy arrays or a list of list of float values. This outputs waveform features. Must
        be mono channel audio, not stereo, i.e. single float per timestep.
    audio_target (`np.ndarray`, `list[float]`, `list[np.ndarray]`, `list[list[float]]`, *optional*):
        The sequence or batch of sequences to be processed as targets. Each sequence can be a numpy array, a
        list of float values, a list of numpy arrays or a list of list of float values. This outputs log-mel
        spectrogram features.
    padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`):
        Select a strategy to pad the returned sequences (according to the model's padding side and padding
        index) among:

        - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
          sequence if provided).
        - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
          acceptable input length for the model if that argument is not provided.
        - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
          lengths).
    max_length (`int`, *optional*):
        Maximum length of the returned list and optionally padding length (see above).
    truncation (`bool`):
        Activates truncation to cut input sequences longer than *max_length* to *max_length*.
    pad_to_multiple_of (`int`, *optional*):
        If set will pad the sequence to a multiple of the provided value.

        This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
        `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128.
    return_attention_mask (`bool`, *optional*):
        Whether to return the attention mask. If left to the default, will return the attention mask according
        to the specific feature_extractor's default.

        [What are attention masks?](../glossary#attention-mask)

    return_tensors (`str` or [`~utils.TensorType`], *optional*):
        If set, will return tensors instead of list of python integers. Acceptable values are:

        - `'pt'`: Return PyTorch `torch.Tensor` objects.
        - `'np'`: Return Numpy `np.ndarray` objects.
    sampling_rate (`int`, *optional*):
        The sampling rate at which the `audio` or `audio_target` input was sampled. It is strongly recommended
        to pass `sampling_rate` at the forward call to prevent silent errors.
Nz9You must provide either `audio` or `audio_target` values.z3The model corresponding to this feature extractor: z& was trained using a sampling rate of zB. Please make sure that the provided audio input was sampled with z	 and not .zDIt is strongly recommended to pass the `sampling_rate` argument to `zN()`. Failing to do so can result in silent errors that might be hard to debug.FTr   labelsr   decoder_attention_mask)
ValueErrorr   loggerwarningr7   __name___process_audioget)r5   rY   rZ   r[   r\   r]   r^   r   r_   r   r6   inputsinputs_targetrc   s                 r8   __call__!SpeechT5FeatureExtractor.__call__   sb   | =\1XYY$ 2 22 I$ P**+ ,**+9]O1F  3 NNVW[WeWeWnWnVo p\ \
 (("%
 
F F# //"%
 
M ~$$#0#@x )6):):;K)L&)57M34r:   speech	is_targetc	           	      
	   [        U[        R                  5      =(       a    [        UR                  5      S:  n
U
(       a'  [        UR                  5      S:  a  [        SU  35      eU
=(       dE    [        U[        [        45      =(       a(    [        US   [        R                  [        [        45      nU(       a5  U Vs/ s H&  n[        R                  " U[        R                  S9PM(     snnOU(       dC  [        U[        R                  5      (       d$  [        R                  " U[        R                  S9nOo[        U[        R                  5      (       aP  UR                  [        R                  " [        R                  5      L a  UR                  [        R                  5      nU(       d  U/nU R                  nU(       a?  U Vs/ s H  oR                  U5      PM     nn[        SU05      nU R                   U l        O[        SU05      nU R"                  " U4UUUUUS.U	D6nXl        US   n[        US   [        R                  5      (       d9  U Vs/ s H&  n[        R                  " U[        R                  S9PM(     snUS'   GO[        U[        R                  5      (       d  [        US   [        R                  5      (       ah  US   R                  [        R                  " [        R                  5      L a4  U Vs/ s H"  nUR                  [        R                  5      PM$     snUS'   Or[        U[        R                  5      (       aS  UR                  [        R                  " [        R                  5      L a"  UR                  [        R                  5      US'   UR%                  S5      nUb7  U Vs/ s H&  n[        R                  " U[        R&                  S9PM(     snUS'   U(       dV  U R(                  (       aE  U R+                  X4S	9[,        R.                  La  UOS nU R1                  US   UU R2                  S
9US'   Ub  UR5                  U5      nU$ s  snf s  snf s  snf s  snf s  snf )Nr!   r    z2Only mono-channel audio is supported for input to r   )dtyper   )r[   r\   r]   r^   r   r   )r\   )r   r   )
isinstancer>   ndarraylenrF   rd   listtupleasarrayfloat32rq   float64astyper   rW   r
   r   padri   r@   r   _get_padding_strategiesr   
DO_NOT_PADrM   r   convert_to_tensors)r5   rn   ro   r[   r\   r]   r^   r   r_   r6   is_batched_numpy
is_batchedfeature_size_hackwaveformfeaturesencoded_inputspadded_inputsr   r?   r   s                       r8   rh   'SpeechT5FeatureExtractor._process_audio  s    &fbjj9Sc&,,>ORS>SFLL 1A 5QRVQWXYY% 
ve}-d:fQi"**V[]aIb3c 	 IOPvbjjrzz:PFJvrzz$B$BZZbjj9F

++@T0T]]2::.F XF !-- MSTV228<VHT)>8*DEN $ 1 1D)>6*BCN
!!1"7
 
 . %^4,q/2::66^j,k^jUZRZZRZZ-P^j,kM.)<44<?BJJ77Q%%"**)==S_,`S_%U\\"**-ES_,`M.)bjj11l6H6HBHHUWU_U_L`6`,8,?,?

,KM.) '**+;<%^l.m^lUZrzz%rxx/P^l.mM*+ T.. ///OWfWqWqq  
 -1,H,Hn-n\`\n\n -I -M.) %)<<^LMC Q U* -l -a /ns   -Q,Q1-Q6)Q;-R c                 P   > [         TU ]  5       n/ SQnU H  nX1;   d  M
  X	 M     U$ )N)r3   r4   r/   r0   r1   r2   )r-   to_dict)r5   outputnamesr#   r7   s       r8   r    SpeechT5FeatureExtractor.to_dictj  s2    " ^D~L  r:   )r   r   r   r   r   r4   r   r1   r2   r   r   r/   r0   r   r   r3   )r!   i>          FP      @   hann_windowr   i  g|=T)r   )	NNFNFNNNN)FFNFNNN)rg   
__module____qualname____firstlineno____doc__model_input_namesintfloatboolstrr.   staticmethodru   r>   rs   rM   rW   r   r   r
   rl   rh   dictr   r   __static_attributes____classcell__)r7   s   @r8   r   r      s&   &P ()9: """) &*+
+
 +
 	+

 +
 +
 +
 +
 +
 +
 +
 +
  $+
 +
Z  be#2::&#8<RZZ8H#Y^#	bjj	# #*jj 
* Y]_c05!% )--126$(rzzDK'$rzz*::T$u+=NNQUUr jj4;.bjj1AADeDUUX\\r o-	r
 $Jr r  $Jr  $d{r j(4/r Tzr 
rn  05!% )--126U

T%[(4

+;;d4;>OOU U o-	U
 $JU U  $JU  $d{U j(4/U 
Un	c3h 	 	r:   r   )r   typingr   numpyr>   audio_utilsr   r   r   r   !feature_extraction_sequence_utilsr	   feature_extraction_utilsr
   utilsr   r   r   
get_loggerrg   re   r   __all__r,   r:   r8   <module>r      sN    ,   \ \ I 4 9 9 
		H	%V7 Vr
 &
&r:   