
    Z j3                         S SK JrJrJr  SSKJr  SSKJr  SSKJ	r	  SSK
JrJr  SSKJr  \	" 5       (       a  S S	KrSS
KJr  SSKJr  Sr " S S\SS9r " S S\5      rg	)    )Any	TypedDictoverload   )
AudioInput)GenerationConfig)is_torch_available)ChatChatType   )PipelineN)%MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING)SpeechT5HifiGanzmicrosoft/speecht5_hifiganc                   .    \ rS rSr% Sr\\S'   \\S'   Srg)AudioOutput!   z
audio (`AudioInput`):
    The generated audio waveform.
sampling_rate (`int`):
    The sampling rate of the generated audio waveform.
audiosampling_rate N)	__name__
__module____qualname____firstlineno____doc__r   __annotations__int__static_attributes__r       u/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/pipelines/text_to_audio.pyr   r   !   s     r   r   F)totalc                     ^  \ rS rSrSrSrSrSrSrSr	\
" SS9rSSS.U 4S	 jjrS
 rS r\S\S\S\4S j5       r\S\\   S\S\\   4S j5       r\S\S\S\4S j5       r\S\\   S\S\\   4S j5       rU 4S jr   SS jrS rSrU =r$ )TextToAudioPipeline-   al  
Text-to-audio generation pipeline using any `AutoModelForTextToWaveform` or `AutoModelForTextToSpectrogram`. This
pipeline generates an audio file from an input text and optional other conditional inputs.

Unless the model you're using explicitly sets these generation parameters in its configuration files
(`generation_config.json`), the following default values will be used:
- max_new_tokens: 256

Example:

```python
>>> from transformers import pipeline

>>> pipe = pipeline(model="suno/bark-small")
>>> output = pipe("Hey it's HuggingFace on the phone!")

>>> audio = output["audio"]
>>> sampling_rate = output["sampling_rate"]
```

Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)

<Tip>

You can specify parameters passed to the model by using [`TextToAudioPipeline.__call__.forward_params`] or
[`TextToAudioPipeline.__call__.generate_kwargs`].

Example:

```python
>>> from transformers import pipeline

>>> music_generator = pipeline(task="text-to-audio", model="facebook/musicgen-small")

>>> # diversify the music generation by adding randomness with a high temperature and set a maximum music length
>>> generate_kwargs = {
...     "do_sample": True,
...     "temperature": 0.7,
...     "max_new_tokens": 35,
... }

>>> outputs = music_generator("Techno music with high melodic riffs", generate_kwargs=generate_kwargs)
```

</Tip>

This pipeline can currently be loaded from [`pipeline`] using the following task identifiers: `"text-to-speech"` or
`"text-to-audio"`.

See the list of available models on [huggingface.co/models](https://huggingface.co/models?filter=text-to-speech).
TNF   )max_new_tokens)vocoderr   c                r  > [         T
U ]  " U0 UD6  S U l        U R                  R                  [
        R                  " 5       ;   aG  Uc=  [        R                  " [        5      R                  U R                  R                  5      OUU l        U R                  R                  R                  S;   a  S U l        X l        U R                  b%  U R                  R                  R                  U l        U R                  c  U R                  R                  nU R                  R                   R#                  SS 5      nUbG  UR%                  UR'                  5       R)                  5        VVs0 s H  u  pxUc  M
  Xx_M     snn5        S HL  n	[+        XYS 5      nUb  X l        M  [+        USS 5      c  M*  [+        UR,                  U	S 5      nUc  MF  X l        MN     U R                  cP  U R                  bB  [/        U R                  S5      (       a&  U R                  R0                  R                  U l        g g g g s  snnf )N)musicgenspeecht5generation_config)sample_rater   codec_configfeature_extractor)super__init__r&   model	__class__r   valuesr   from_pretrainedDEFAULT_VOCODER_IDtodeviceconfig
model_type	processorr   __dict__getupdateto_dictitemsgetattrr,   hasattrr-   )selfr&   r   argskwargsr7   
gen_configkvsampling_rate_namer1   s             r   r/   TextToAudioPipeline.__init__m   s   $)&)::#H#O#O#QQ ?  //0BCFFtzzGXGXY L ::''+CC!DN*<<#!%!4!4!B!BD% ZZ&&F,,001DdKJ%
0B0B0D0J0J0L^0LPQtqt0L^_&F" 'D I ,)6&V^T:F$+F,?,?ASUY$ZM$0-:* 'G %$..*DQUQ_Q_atIuIu!%!A!A!O!OD Jv*D% _s   (	H3
5H3
c                 x   [        U[        5      (       a  U/nU R                  R                  R                  S:X  aX  Sn[        U R                  S5      (       a!  [        U R                  R                  SS5      nUSSSS.nUR                  U5        UnU R                  b  U R                  OU R                  n[        U[        5      (       a"  UR                  " UR                  4SSS.UD6nU$ U R                  R                  R                  S	:X  a>  U Vs/ s H  owR                  S
5      (       d  SU 3OUPM!     nnUR!                  SS5        U R                  R                  R                  S:X  a,  U Vs/ s H  owR                  S
5      (       d  SU 3OUPM!     nnU" U40 UDSS0D6nU$ s  snf s  snf )Nbarkr$   semantic_configmax_input_semantic_lengthFT)
max_lengthadd_special_tokensreturn_attention_maskreturn_token_type_ids)tokenizereturn_dictcsm[z[0]rN   diaz[S1] return_tensorspt)
isinstancestrr0   r7   r8   r@   r*   r?   rK   r<   r9   	tokenizerr
   apply_chat_templatemessages
startswith
setdefault)rA   textrC   rM   
new_kwargspreprocessoroutputts           r   
preprocessTextToAudioPipeline.preprocess   s   dC  6D::''61 Jt--/@AA$T%;%;%K%KMhjmn
(&+)-).	J f%F)-)Ct~~dD!!!55  	F  zz  ++u4KOP4ac):):#aS	A4P!!"6=zz  ++u4MQRT<<+<+<%s!CTR!$F&FFF Q Ss   &F2;&F7c                    U R                  X R                  S9nUS   nUS   nU R                  R                  5       (       a  U R                  X@R                  S9nSU;  a  U R                  US'   UR                  U5        UR                  SS05        U R                  R                  R                  S;   a  SU;  a  SUS'   U R                  R                  " S0 UDUD6nOC[        U5      (       a  [        S	UR                  5        35      eU R                  " S0 UDUD6S
   nU R                  b  U R                  U5      nU$ )N)r6   forward_paramsgenerate_kwargsr*   return_dict_in_generateT)rS   output_audiozYou're using the `TextToAudioPipeline` with a forward-only model, but `generate_kwargs` is non empty. For forward-only TTA models, please use `forward_params` instead of `generate_kwargs`. For reference, the `generate_kwargs` used here are: r   r   )_ensure_tensor_on_devicer6   r0   can_generater*   r<   r7   r8   generatelen
ValueErrorkeysr&   )rA   model_inputsrC   rg   rh   rb   s         r   _forwardTextToAudioPipeline._forward   sQ   ..vkk.J 01 !23::""$$";;OT_T_;`O #/97;7M7M 34 !!/2 !!#<d"CDzz  ++w6 "759N>2ZZ((J<J>JF?## KKZK_K_KaJbd 
 ZZA,A.A!DF<<#\\&)Fr   text_inputsrg   returnc                     g Nr   rA   rt   rg   s      r   __call__TextToAudioPipeline.__call__   s    PSr   c                     g rw   r   rx   s      r   ry   rz      s    \_r   c                     g rw   r   rx   s      r   ry   rz      s    UXr   c                     g rw   r   rx   s      r   ry   rz      s    adr   c                 &   > [         TU ]  " U40 UD6$ )a  
Generates speech/audio from the inputs. See the [`TextToAudioPipeline`] documentation for more information.

Args:
    text_inputs (`str`, `list[str]`, `ChatType`, or `list[ChatType]`):
        One or several texts to generate. If strings or a list of string are passed, this pipeline will
        generate the corresponding text. Alternatively, a "chat", in the form of a list of dicts with "role"
        and "content" keys, can be passed, or a list of such chats. When chats are passed, the model's chat
        template will be used to format them before passing them to the model.
    forward_params (`dict`, *optional*):
        Parameters passed to the model generation/forward method. `forward_params` are always passed to the
        underlying model.
    generate_kwargs (`dict`, *optional*):
        The dictionary of ad-hoc parametrization of `generate_config` to be used for the generation call. For a
        complete overview of generate, check the [following
        guide](https://huggingface.co/docs/transformers/en/main_classes/text_generation). `generate_kwargs` are
        only passed to the underlying model if the latter is a generative model.

Return:
    `AudioOutput` or a list of `AudioOutput`, which is a `TypedDict` with two keys:

    - **audio** (`np.ndarray` of shape `(nb_channels, audio_length)`) -- The generated audio waveform.
    - **sampling_rate** (`int`) -- The sampling rate of the generated audio waveform.
)r.   ry   )rA   rt   rg   r1   s      r   ry   rz      s    2 w>~>>r   c                     [        U SS 5      b  U R                  US'   [        U SS 5      b  U R                  US'   U R                  US'   U(       a  UO0 U(       a  UO0 S.nUc  0 n0 nXU4$ )Nassistant_modelassistant_tokenizerrZ   )rg   rh   )r?   r   rZ   r   )rA   preprocess_paramsrg   rh   paramspostprocess_paramss         r   _sanitize_parameters(TextToAudioPipeline._sanitize_parameters  s     4*D1=151E1EO-.4.5A+/>>OK(595M5MO12 1?nB2Ar

 $ " *<<<r   c                 z   Sn[        U[        5      (       a  SU;   a  US   nO"SnUS   nO[        U[        5      (       a  US   nU(       a(  U R                  b  U R                  R	                  U5      n[        U[
        5      (       a`  U Vs/ s H<  o3R                  S[        R                  S9R                  5       R                  5       PM>     nn[        U5      S:  a  UOUS   nO:UR                  S[        R                  S9R                  5       R                  5       n[        UU R                  S	9$ s  snf )
NFr   T	sequencesr   cpu)r6   dtyper   )r   r   )rX   dicttupler9   decodelistr5   torchfloatnumpysqueezern   r   r   )rA   r   needs_decodingels       r   postprocessTextToAudioPipeline.postprocess%  s   eT""%g!%k*u%%!HEdnn8NN))%0EeT""X]^X]RTUU%u{{U;AACKKMX]E^ Z!^EqEHHEH=CCEMMOE,,
 	
 _s   AD8)r9   r   r&   )NNN)r   r   r   r   r   _pipeline_calls_generate_load_processor_load_image_processor_load_feature_extractor_load_tokenizerr   _default_generation_configr/   rd   rr   r   rY   r   r   ry   r   r   r   r   r   __classcell__)r1   s   @r   r"   r"   -   s    2h  $O!#O "2" '+$ &P &PP&P(T SCS3S;S S_DI__kIZ_ _XHXXX XdDNdcddS^N_d d?: 	=.
 
r   r"   )typingr   r   r   audio_utilsr   
generationr   utilsr	   utils.chat_template_utilsr
   r   baser   r   models.auto.modeling_autor   !models.speecht5.modeling_speecht5r   r4   r   r"   r   r   r   <module>r      sR    , + $ ) & 6  QC1 	)5 	O
( O
r   