
    Z jA                         S r SSKrSSKJr  SSKJrJrJr  SSK	J
r
  SSKJr  SSKJrJr  \" 5       (       a  SSKr\R"                  " \5      r " S	 S
\
5      rS
/rg)z%
Feature extractor class for Whisper
    N   )is_torch_available)mel_filter_bankspectrogramwindow_function)SequenceFeatureExtractor)BatchFeature)
TensorTypeloggingc                     ^  \ rS rSrSrS/r        SU 4S jjrS\R                  S\	S\R                  4S jr
SS	\R                  S\	S\R                  4S
 jjr\ SS\\R                     S\\R                     S\S\\R                     4S jj5       r         SS\R                  \\   -  \\R                     -  \\\      -  S\S\S-  S\	\-  S-  S\S-  S\	S-  S\S-  S\S-  S\S-  S\	S-  S\4S jjrSrU =r$ )WhisperFeatureExtractor!   a/  
Constructs a Whisper feature extractor.

This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains
most of the main methods. Users should refer to this superclass for more information regarding those methods.

This class extracts mel-filter bank features from raw speech using a custom numpy implementation of the `Short Time
Fourier Transform` which should match pytorch's `torch.stft` equivalent.

Args:
    feature_size (`int`, *optional*, defaults to 80):
        The feature dimension of the extracted features.
    sampling_rate (`int`, *optional*, defaults to 16000):
        The sampling rate at which the audio files should be digitalized expressed in hertz (Hz).
    hop_length (`int`, *optional*, defaults to 160):
        Length of the overlapping windows for the STFT used to obtain the Mel Frequency coefficients.
    chunk_length (`int`, *optional*, defaults to 30):
        The maximum number of chunks of `sampling_rate` samples used to trim and pad longer or shorter audio
        sequences.
    n_fft (`int`, *optional*, defaults to 400):
        Size of the Fourier transform.
    padding_value (`float`, *optional*, defaults to 0.0):
        Padding value used to pad the audio. Should correspond to silences.
    dither (`float`, *optional*, defaults to 0.0):
        Adds dithering. In other words, adds a small Gaussian noise to each frame.
        E.g. use 0.0001 to add dithering with a normal distribution centered
        around 0.0 with standard deviation 0.0001 (assuming [-1,+1] range of raw_speech).
        The value 0.0 means no dithering.
        Dithering has similar effect as `spectrogram(mel_floor=...)`. It reduces
        the high log_mel_fbank values for signals with hard-zero sections,
        when VAD cutoff is present in the signal.
input_featuresc	           
         > [         T
U ]  " SUUUUS.U	D6  XPl        X0l        X@l        XB-  U l        U R
                  U-  U l        X l        Xpl        [        SUS-  -   USSUSSS9U l
        g )	N)feature_sizesampling_ratepadding_valuereturn_attention_mask              g     @@slaney)num_frequency_binsnum_mel_filtersmin_frequencymax_frequencyr   norm	mel_scale )super__init__n_fft
hop_lengthchunk_length	n_samplesnb_max_framesr   ditherr   mel_filters)selfr   r   r#   r$   r"   r   r'   r   kwargs	__class__s             ڇ/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/whisper/feature_extraction_whisper.pyr!    WhisperFeatureExtractor.__init__E   s     	 	
%''"7		

 	
 
$(%5!^^z9** 5A:~( '
    waveform_batchdevicereturnc                    US:w  a  [        SU S35      e/ nU H  n[        U[        U R                  S5      U R                  U R                  SU R
                  U R                  SS9nUSS2SS	24   n[        R                  " XUR                  5       S
-
  5      nUS-   S-  nUR                  U5        M     [        R                  " U5      nU$ )z
Compute the log-mel spectrogram of the provided audio, gives similar results to Whisper's original torch
implementation with 1e-5 tolerance.
cpuzGot device `z` for feature extraction, but feature extraction on CUDA accelerator devices requires torch, which is not installed. Either set `device='cpu'`, or install torch according to the official instructions: https://pytorch.org/get-started/locally/hanng       @log10)frame_lengthr#   powerr'   r(   log_melN       @      @)
ValueErrorr   r   r"   r#   r'   r(   npmaximummaxappendarray)r)   r/   r0   log_spec_batchwaveformlog_specs         r,   _np_extract_fbank_features2WhisperFeatureExtractor._np_extract_fbank_featuresi   s    
 U?vh 'q q 
 &H"

F3!ZZ??{{ ,,	H  3B3'Hzz(LLNS,@AH 3#-H!!(+ ' .1r.   rC   c                 "   [         R                  " U5      R                  U[         R                  5      n[         R                  " U R
                  US9nU R                  S:w  aC  XR                  [         R                  " UR                  UR                  UR                  S9-  -  n[         R                  " XR
                  U R                  USS9nUSSS24   R                  5       S	-  n[         R                  " U R                  5      R                  U[         R                  5      nUR                  U-  n[         R                   " US
S9R#                  5       nUR%                  5       S	:X  a>  UR'                  S	SS9S   R'                  SSS9S   n	[         R(                  " XS-
  5      nO'[         R(                  " XR'                  5       S-
  5      nUS-   S-  nUS:w  a  UR+                  5       R-                  5       nUR/                  5       $ )z
Compute the log-mel spectrogram of the audio using PyTorch's GPU-accelerated STFT implementation with batching,
yielding results similar to cpu computing with 1e-5 tolerance.
)r0   r   )dtyper0   T)windowreturn_complex.Nr9   r   g|=)min)dimkeepdimr   r   r:   r;   r3   )torch
from_numpytofloat32hann_windowr"   r'   randnshaperH   r0   stftr#   absr(   Tclampr5   rL   r?   r>   detachr3   numpy)
r)   rC   r0   rI   rU   
magnitudesr(   mel_specrD   max_vals
             r,   _torch_extract_fbank_features5WhisperFeatureExtractor._torch_extract_fbank_features   s   
 ##H-00G""4::f=
 ;;#ekk(.._g_n_n&oooHzz(JJ_cd#ss(^'')Q.
&&t'7'78;;FEMMR==:-;;xU399;<<>Qllq$l7:>>1d>STUVG}}X}=H}}X||~/CDHsNc)U?(,,.H~~r.   input_valuesattention_maskr   c                    Ub  [         R                  " U[         R                  5      n/ n[        XR	                  S5      5       Hl  u  pEXDSU R                  5       -
  [         R                  " USU R                  5       S-   5      -  nXVR                  S   :  a  X&US& UR                  U5        Mn     U$ U  Vs/ s H=  owUR                  5       -
  [         R                  " UR                  5       S-   5      -  PM?     nnU$ s  snf )zK
Every array in the list is normalized to have zero mean and unit variance
Nr9   gHz>r   )
r=   rA   int32zipsummeansqrtvarrT   r@   )r`   ra   r   normed_input_valuesvectorlengthnormed_slicexs           r,   zero_mean_unit_var_norm/WhisperFeatureExtractor.zero_mean_unit_var_norm   s     %XXnbhh?N"$"%l4F4Fr4J"K &)=)=)? ?2776RYSY?K^K^K`cgKgChh..q11,9)#**<8 #L #" Vb"bUaPQLBGGAEEGdN4K#KUa"b"" #cs   ;ADN
max_length
raw_speech
truncationpad_to_multiple_ofreturn_tensorsr   paddingr   do_normalizec                    UbP  XR                   :w  a@  [        SU R                  R                   SU R                    SU R                    SU S3	5      eO-[        R                  SU R                  R                   S35        [        U[        R                  5      =(       a    [        UR                  5      S	:  nU(       a'  [        UR                  5      S
:  a  [        SU  35      eU=(       dE    [        U[        [        45      =(       a(    [        US   [        R                  [        [        45      nU(       a?  U Vs/ s H1  n[        R                  " U/[        R                  S9R                  PM3     nnOU(       dC  [        U[        R                  5      (       d$  [        R                  " U[        R                  S9nOo[        U[        R                  5      (       aP  UR                   [        R                   " [        R"                  5      L a  UR%                  [        R                  5      nU(       d"  [        R                  " U/5      R                  /n['        SU05      nU R)                  UUU(       a  UOU R*                  UUU=(       d    U	S9nU	(       a?  U R-                  US   US   U R.                  S9US'   [        R0                  " US   SS9US'   UR3                  S5      R5                  S
SS	5      n[7        5       (       a  U R8                  OU R:                  nU" US   U
5      n[        US   [        5      (       a8  U Vs/ s H&  n[        R                  " U[        R                  S9PM(     snUS'   OUUS'   U(       aL  US   SS2SSU R<                  24   nUS   R                  S	   U R<                  -  S:w  a  USS2SS24   nUUS'   Ub  UR?                  U5      nU$ s  snf s  snf )a  Main method to featurize and prepare for the model one or several sequence(s). Implementation uses PyTorch
for the STFT computation if available, otherwise a slower NumPy based one.

Args:
    raw_speech (`np.ndarray`, `list[float]`, `list[np.ndarray]`, `list[list[float]]`):
        The sequence or batch of sequences to be padded. Each sequence can be a numpy array, a list of float
        values, a list of numpy arrays or a list of list of float values. Must be mono channel audio, not
        stereo, i.e. single float per timestep.
    truncation (`bool`, *optional*, default to `True`):
        Activates truncation to cut input sequences longer than *max_length* to *max_length*.
    pad_to_multiple_of (`int`, *optional*, defaults to None):
        If set will pad the sequence to a multiple of the provided value.

        This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
        `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128.
    return_tensors (`str` or [`~utils.TensorType`], *optional*):
        If set, will return tensors instead of list of python integers. Acceptable values are:

        - `'pt'`: Return PyTorch `torch.Tensor` objects.
        - `'np'`: Return Numpy `np.ndarray` objects.
    return_attention_mask (`bool`, *optional*):
        Whether to return the attention mask. If left to the default, will return the attention mask according
        to the specific feature_extractor's default.

        [What are attention masks?](../glossary#attention-mask)

        <Tip>

        For Whisper models, `attention_mask` should always be passed for batched inference, to avoid subtle
        bugs.

        </Tip>
    padding (`str` or [`~utils.PaddingStrategy`], *optional*, defaults to `'max_length'`):
        Activates and controls padding. Accepts the following values:

        - `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single sequence is
          provided).
        - `'max_length'` (default): Pad to a maximum length specified with the argument `max_length` or to the
          maximum acceptable input length for the model if that argument is not provided.
        - `'do_not_pad'`: No padding (i.e., can output a batch with sequences of different lengths).
    max_length (`int`, *optional*):
        Controls the maximum length to use by one of the truncation/padding parameters.

        If left unset or set to `None`, this will use the predefined model maximum length if a maximum length
        is required by one of the truncation/padding parameters. If the model has no specific maximum input
        length (like XLNet) truncation/padding to a maximum length will be deactivated.
    sampling_rate (`int`, *optional*):
        The sampling rate at which the `raw_speech` input was sampled. It is strongly recommended to pass
        `sampling_rate` at the forward call to prevent silent errors and allow automatic speech recognition
        pipeline.
    do_normalize (`bool`, *optional*, defaults to `False`):
        Whether or not to zero-mean unit-variance normalize the input. Normalizing can help to significantly
        improve the performance of the model.
    device (`str`, *optional*, defaults to `'cpu'`):
        Specifies the device for computation of the log-mel spectrogram of audio signals in the
        `_torch_extract_fbank_features` method. (e.g., "cpu", "cuda")
    **kwargs: Not supported by WhisperFeatureExtractor.__call__() and ignored.
Nz3The model corresponding to this feature extractor: z& was trained using a sampling rate of zI. Please make sure that the provided `raw_speech` input was sampled with z	 and not .zDIt is strongly recommended to pass the `sampling_rate` argument to `zN()`. Failing to do so can result in silent errors that might be hard to debug.r   r   z2Only mono-channel audio is supported for input to r   )rH   r   )ru   rp   rr   rs   r   ra   )ra   r   )axisr9   ) r   r<   r+   __name__loggerwarning
isinstancer=   ndarraylenrT   listtupleasarrayrQ   rW   rH   float64astyper	   padr%   rn   r   stackget	transposer   r^   rE   r#   convert_to_tensors)r)   rq   rr   rs   rt   r   ru   rp   r   rv   r0   r*   is_batched_numpy
is_batchedspeechbatched_speechpadded_inputsr   extract_fbank_featuresfeaturerescaled_attention_masks                        r,   __call__ WhisperFeatureExtractor.__call__   s   P $ 2 22 I$..JaJaIb c))-););(< =))-););(<Im_TUW  3 NNVW[WeWeWnWnVo p\ \
 &j"**=[#jFVFVBWZ[B[J$4$4 5 9QRVQWXYY% 
zD%=1lz*Q-RTR\R\^ceiQj7k 	 Q[\Q[v"**fXRZZ@BBQ[J\JJz2::$F$FJbjjAJ
BJJ//J4D4DQSQ[Q[H\4\#**2::6J **j\2445J%'7&DE %/zT^^!1"7"G< ! 
 .2.J.J./,-=>"00 /K /M*+
 /1hh}EU7V]^._M*+ '**+;<FFq!QO 3E2F2FD..DLkLk 	 0q0A6JnQ'..dr.sdrY`rzz'/Tdr.sM*+ /=M*+ &34D&EaI[DOOI[F[&\#
 -.44Q7$//IQN*A!SbS&*I'.EM*+%)<<^LMy ]R /ts   8O4-O9)r$   r'   r#   r(   r"   r%   r&   r   )P   i>        i  r   r   F)r3   )r   )	TNNNrp   NNNr3   )rz   
__module____qualname____firstlineno____doc__model_input_namesr!   r=   r~   strrE   r^   staticmethodr   floatrn   boolintr
   r	   r   __static_attributes____classcell__)r+   s   @r,   r   r   !   s   B ** #"
H S UWU_U_ < bjj  #  Z\ZdZd  >  be#2::&#8<RZZ8H#Y^#	bjj	# #0  )-26-1*!%$($("YJJe,tBJJ/??$tE{BSSY Y  $J	Y
 j(4/Y  $d{Y tY $JY TzY TkY d
Y 
Y Yr.   r   )r   rZ   r=    r   audio_utilsr   r   r   !feature_extraction_sequence_utilsr   feature_extraction_utilsr	   utilsr
   r   rN   
get_loggerrz   r{   r   __all__r   r.   r,   <module>r      sY     " H H I 4 ( 			H	%u6 up	 %
%r.   