
    Z j                         S SK Jr  S SKJr  S SKrS SKrSSKJrJ	r	  SSK
Jr  SSKJrJr  \	R                  " \5      r\" \" S	S	S
95       " S S\5      5       rg)    )UserDict)AnyN   )add_end_docstringslogging   )ffmpeg_read)Pipelinebuild_pipeline_init_argsT)has_feature_extractorhas_tokenizerc            	          ^  \ rS rSrSrSrSrSrSrU 4S jr	S\
R                  \-  \-  \-  S\S\\\\4      4U 4S	 jjrS
 rSS jrS rS rSrU =r$ )#ZeroShotAudioClassificationPipeline   a  
Zero shot audio classification pipeline using `ClapModel`. This pipeline predicts the class of an audio when you
provide an audio and a set of `candidate_labels`.

<Tip warning={true}>

The default `hypothesis_template` is : `"This is a sound of {}."`. Make sure you update it for your usage.

</Tip>

Example:
```python
>>> from transformers import pipeline
>>> from datasets import load_dataset

>>> dataset = load_dataset("ashraq/esc50")
>>> audio = next(iter(dataset["train"]["audio"]))["array"]
>>> classifier = pipeline(task="zero-shot-audio-classification", model="laion/clap-htsat-unfused")
>>> classifier(audio, candidate_labels=["Sound of a dog", "Sound of vacuum cleaner"])
[{'score': 0.9996, 'label': 'Sound of a dog'}, {'score': 0.0004, 'label': 'Sound of vacuum cleaner'}]
```


Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial) This audio
classification pipeline can currently be loaded from [`pipeline`] using the following task identifier:
`"zero-shot-audio-classification"`. See the list of available models on
[huggingface.co/models](https://huggingface.co/models?filter=zero-shot-audio-classification).
FTc                 &   > [         TU ]  " S0 UD6  g )N )super__init__)selfkwargs	__class__s     چ/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/pipelines/zero_shot_audio_classification.pyr   ,ZeroShotAudioClassificationPipeline.__init__C   s    "6"    audiosr   returnc                 &   > [         TU ]  " U40 UD6$ )ah  
Assign labels to the audio(s) passed as inputs.

Args:
    audios (`str`, `list[str]`, `np.array` or `list[np.array]`):
        The pipeline handles three types of inputs:
        - A string containing a http link pointing to an audio
        - A string containing a local path to an audio
        - An audio loaded in numpy
    candidate_labels (`list[str]`):
        The candidate labels for this audio. They will be formatted using *hypothesis_template*.
    hypothesis_template (`str`, *optional*, defaults to `"This is a sound of {}"`):
        The format used in conjunction with *candidate_labels* to attempt the audio classification by
        replacing the placeholder with the candidate_labels. Pass "{}" if *candidate_labels* are
        already formatted.
Return:
    A list of dictionaries containing one entry per proposed label. Each dictionary contains the
    following keys:
    - **label** (`str`) -- One of the suggested *candidate_labels*.
    - **score** (`float`) -- The score attributed by the model to that label. It is a value between
        0 and 1, computed as the `softmax` of `logits_per_audio`.
)r   __call__)r   r   r   r   s      r   r   ,ZeroShotAudioClassificationPipeline.__call__F   s    . w1&11r   c                 H    0 nSU;   a  US   US'   SU;   a  US   US'   U0 0 4$ )Ncandidate_labelshypothesis_templater   )r   r   preprocess_paramss      r   _sanitize_parameters8ZeroShotAudioClassificationPipeline._sanitize_parameters_   sI    '4:;M4N01 F*7=>S7T34 "b((r   c                 <   [        U[        5      (       aq  UR                  S5      (       d  UR                  S5      (       a   [        R                  " USS9R
                  nO%[        US5       nUR                  5       nS S S 5        [        U[        5      (       a  [        XR                  R                  5      n[        U[        R                  5      (       d  [        S5      e[        UR                   5      S:w  a  [#        S5      eU R                  U/U R                  R                  S	S
9nUR%                  U R&                  5      nX%S'   U Vs/ s H  ocR)                  U5      PM     nnU R+                  US	SS9nU/US'   U$ ! , (       d  f       GN= fs  snf )Nzhttp://zhttps://T)follow_redirectsrbz"We expect a numpy ndarray as inputr   zNWe expect a single channel audio input for ZeroShotAudioClassificationPipelinept)sampling_ratereturn_tensorsr!   )r+   paddingtext_inputs)
isinstancestr
startswithhttpxgetcontentopenreadbytesr	   feature_extractorr*   npndarray	TypeErrorlenshape
ValueErrortodtypeformat	tokenizer)	r   audior!   r"   finputsx	sequencesr-   s	            r   
preprocess.ZeroShotAudioClassificationPipeline.preprocessh   s]   eS!!	**e.>.>z.J.J 		%$?GG%&!FFHE ' eU##'='='K'KLE%,,@AAu{{q mnn''G4#9#9#G#GX\ ( 
 4::&%5!"<LM<Lq//2<L	MnnYtTnR!,}' '&  Ns   .FF
Fc                     UR                  S5      nUR                  S5      n[        US   [        5      (       a  US   nOUS   S   nU R                  " S0 UDUD6nUUR                  S.nU$ )Nr!   r-   r   )r!   logitsr   )popr.   r   modellogits_per_audio)r   model_inputsr!   r-   outputsmodel_outputss         r   _forward,ZeroShotAudioClassificationPipeline._forward   s    '++,>?"&&}5k!nh//%a.K &a.+K**;{;l; !1..
 r   c                     UR                  S5      nUS   S   nUR                  SS9nUR                  5       n[        [	        XR5      S S9 VVs/ s H	  u  pgXgS.PM     nnnU$ s  snnf )Nr!   rJ   r   )dimc                     U S   * $ )Nr   r   )rE   s    r   <lambda>AZeroShotAudioClassificationPipeline.postprocess.<locals>.<lambda>   s    _`ab_c^cr   )key)scorelabel)rK   softmaxtolistsortedzip)	r   rP   r!   rJ   probsscoresrY   candidate_labelresults	            r   postprocess/ZeroShotAudioClassificationPipeline.postprocess   s    (,,-?@x(+1% +1V1NTc*d
*d& 6*d 	 
 	
s   A%r   )NzThis is a sound of {}.)__name__
__module____qualname____firstlineno____doc___load_processor_load_image_processor_load_feature_extractor_load_tokenizerr   r8   r9   r6   r/   dictr   listr   r$   rG   rQ   rc   __static_attributes____classcell__)r   s   @r   r   r      s~    : O!"O#2rzzE1C7$> 2# 2RVW[\_ad\dWeRf 22)8" r   r   )collectionsr   typingr   r1   numpyr8   utilsr   r   audio_classificationr	   baser
   r   
get_loggerre   loggerr   r   r   r   <module>rz      s_    !    . 4 
		H	% ,4W[\]@( @ ^@r   