
    Z j                      |    S r SSKrSSKJr  SSKJr  SSKJrJ	r	J
r
  \
R                  " \5      r " S S\5      rS/rg)	zFeature extractor class for Dia    N   )SequenceFeatureExtractor)BatchFeature)PaddingStrategy
TensorTypeloggingc                   
  ^  \ rS rSrSrSS/r    SS\S\S\S\4U 4S	 jjjr     SS\	R                  \\   -  \\	R                     -  \\\      -  S\\-  \-  S
-  S\S
-  S\S
-  S\\-  S
-  S\S
-  S\4S jjrSrU =r$ )DiaFeatureExtractor   a
  
Constructs an Dia feature extractor.

This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains
most of the main methods. Users should refer to this superclass for more information regarding those methods.

Args:
    feature_size (`int`, *optional*, defaults to 1):
        The feature dimension of the extracted features. Use 1 for mono, 2 for stereo.
    sampling_rate (`int`, *optional*, defaults to 16000):
        The sampling rate at which the audio waveform should be digitalized, expressed in hertz (Hz).
    padding_value (`float`, *optional*, defaults to 0.0):
        The value that is used for padding.
    hop_length (`int`, *optional*, defaults to 512):
        Overlap length between successive windows.
input_valuesn_quantizersfeature_sizesampling_ratepadding_value
hop_lengthc                 8   > [         TU ]  " SXUS.UD6  X@l        g )N)r   r   r    )super__init__r   )selfr   r   r   r   kwargs	__class__s         /root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/dia/feature_extraction_dia.pyr   DiaFeatureExtractor.__init__.   s#     	wl_lwpvw$    N	raw_audiopadding
truncation
max_lengthreturn_tensorsreturnc                    Ub<  X`R                   :w  a,  [        SU  SU R                    SU R                    SU S3	5      eO-[        R                  SU R                  R
                   S35        U(       a  U(       a  [        S5      eUc  S	n[        [        U[        [        45      =(       a(    [        US
   [        R                  [        [        45      5      nU(       a>  U Vs/ s H0  n[        R                  " U[        R                  S9R                  PM2     nnOU(       dC  [        U[        R                  5      (       d$  [        R                  " U[        R                  S9nOo[        U[        R                  5      (       aP  UR                  [        R                  " [        R                   5      L a  UR#                  [        R                  5      nU(       d!  [        R                  " U5      R                  /n[%        U5       HB  u  pU R&                  S:X  d  M  U
R(                  S:X  d  M)  [        R*                  " U
S5      X'   MD     [%        U5       H  u  pU
R(                  S:  a  [        SU
R,                   35      eU R&                  S:X  a,  U
R(                  S:w  a  [        SU
R,                  S    S35      eU R&                  S:X  d  M{  U
R(                  S:w  d  M  [        SU
R,                  S    S35      e   [/        SU05      nU R&                  nSU l        U R1                  UUUUS	U R2                  S9nUR5                  S5      US'   / nUR5                  S5       H3  n
U R&                  S:X  a  U
S   n
UR7                  U
R                  5        M5     XS'   Ub  UR9                  U5      nXl        U$ s  snf )a  
Main method to featurize and prepare for the model one or several sequence(s).

Args:
    raw_audio (`np.ndarray`, `list[float]`, `list[np.ndarray]`, `list[list[float]]`):
        The sequence or batch of sequences to be processed. Each sequence can be a numpy array, a list of float
        values, a list of numpy arrays or a list of list of float values. The numpy array must be of shape
        `(num_samples,)` for mono audio (`feature_size = 1`), or `(2, num_samples)` for stereo audio
        (`feature_size = 2`).
    padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`):
        Select a strategy to pad the returned sequences (according to the model's padding side and padding
        index) among:

        - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
          sequence if provided).
        - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
          acceptable input length for the model if that argument is not provided.
        - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
          lengths).
    truncation (`bool`, *optional*, defaults to `False`):
        Activates truncation to cut input sequences longer than `max_length` to `max_length`.
    max_length (`int`, *optional*):
        Maximum length of the returned list and optionally padding length (see above).
    return_tensors (`str` or [`~utils.TensorType`], *optional*, default to 'pt'):
        If set, will return tensors instead of list of python integers. Acceptable values are:

        - `'pt'`: Return PyTorch `torch.Tensor` objects.
        - `'np'`: Return Numpy `np.ndarray` objects.
    sampling_rate (`int`, *optional*):
        The sampling rate at which the `audio` input was sampled. It is strongly recommended to pass
        `sampling_rate` at the forward call to prevent silent errors.
z3The model corresponding to this feature extractor: z& was trained using a sampling rate of zB. Please make sure that the provided audio input was sampled with z	 and not .zDIt is strongly recommended to pass the `sampling_rate` argument to `zN()`. Failing to do so can result in silent errors that might be hard to debug.zABoth padding and truncation were set. Make sure you only set one.Tr   )dtype   z6Expected input shape (channels, length) but got shape    z$Expected mono audio but example has z	 channelsz&Expected stereo audio but example has r   )r   r   r   return_attention_maskpad_to_multiple_ofattention_maskpadding_mask).N)r   
ValueErrorloggerwarningr   __name__bool
isinstancelisttuplenpndarrayasarrayfloat32Tr$   float64astype	enumerater   ndimmeanshaper   padr   popappendconvert_to_tensors)r   r   r   r   r   r    r   
is_batchedaudioidxexampler   original_feature_sizepadded_inputss                 r   __call__DiaFeatureExtractor.__call__9   se   R $ 2 22 I$ P**+ ,**+9]O1F  3 NNVW[WeWeWnWnVo p\ \
 z`aa_Gy4-0jj1PRPZPZ\acgOh6i

 LUVI5E<>>IIVIJy"**$E$E

9BJJ?I	2::..9??bhhrzzFZ3Z!((4I I.001I &i0LC  A%',,!*;!#"!5	 1
 &i0LC||a #YZaZgZgYh!ijj  A%',,!*; #GVXHYGZZc!dee  A%',,!*; #I'--XZJ[I\\e!fgg 1 $^Y$?@ !% 1 1 !!"&# ! 
 )6(9(9:J(Kn%$((8G  A%!),		* 9
 )5n%%)<<^LM 2m Ws   $7O)r   r   )r'   i>  g        i   )NFNNN)r/   
__module____qualname____firstlineno____doc__model_input_namesintfloatr   r4   r5   r2   r0   strr   r   r   rI   __static_attributes____classcell__)r   s   @r   r
   r
      s   " (8 ""	%	% 	% 		%
 	% 	% 8<"'!%26$(w::U+d2::.>>d5kARRw o-4w 4K	w
 $Jw j(4/w Tzw 
w wr   r
   )rN   numpyr4   !feature_extraction_sequence_utilsr   feature_extraction_utilsr   utilsr   r   r   
get_loggerr/   r-   r
   __all__r   r   r   <module>r[      sE    &  I 4 9 9 
		H	%V2 Vr !
!r   