
    Z j'                         S SK r S SKrSSKJrJr  SSKJr  SSKJ	r	J
r
Jr  SSKJr  SSKJrJr  \" 5       (       a  S SKr\R$                  " \5      r " S S	\	S
S9r " S S\
5      rS/rg)    N   )
AudioInputmake_list_of_audio)BatchFeature)ProcessingKwargsProcessorMixinUnpack)	TextInput)is_torch_availableloggingc                   2    \ rS rSrSS0SSSS.SSS	.S
.rSrg)MusicFlamingoProcessorKwargs(   paddingTi>  
max_length)sampling_ratereturn_attention_maskr   ptleft)return_tensorspadding_side)text_kwargsaudio_kwargscommon_kwargs N)__name__
__module____qualname____firstlineno__	_defaults__static_attributes__r       ڋ/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/musicflamingo/processing_musicflamingo.pyr   r   (   s2     t
 #%)#
 #"
Ir"   r   F)totalc                      ^  \ rS rSrSr     SU 4S jjrS rS rS r  SS\	\
\	   -  S	\S-  S
\S-  S\\   S\4
S jjr\S\
\   4S j5       rSrU =r$ )MusicFlamingoProcessor9   a  
Constructs an MusicFlamingo processor which wraps an MusicFlamingo feature extractor and an MusicFlamingo
tokenizer into a single processor.

[`MusicFlamingoProcessor`] offers all the functionalities of [`WhisperFeatureExtractor`] and
[`Qwen2TokenizerFast`]. See the [`~MusicFlamingoProcessor.__call__`] for more information.

Args:
    feature_extractor ([`WhisperFeatureExtractor`]):
        The feature extractor is a required input.
    tokenizer ([`Qwen2TokenizerFast`]):
        The tokenizer is a required input.
    chat_template (`Optional[str]`, *optional*):
        The Jinja template to use for formatting the conversation. If not provided, the tokenizer's default chat
        template will be used.
    audio_token (`Optional[str]`, *optional*, defaults to `"<sound>"`):
        Special token used to represent audio inputs in the chat template.
    audio_bos_token (`Optional[str]`, *optional*, defaults to `"<|sound_bos|>"`):
        Special token used to represent the beginning of audio.
    audio_eos_token (`Optional[str]`, *optional*, defaults to `"<|sound_eos|>"`):
        Special token used to represent the end of audio.
    max_audio_len (`int`, *optional*, defaults to 1200):
        Maximum length of audio sequences in seconds. Audio longer than this will be truncated.
Nc                    > X@l         UR                  U5      U l        Xpl        [        TU ]  XUS9  XPl        X`l        UR                  U5      U l        UR                  U5      U l	        g )N)chat_template)
audio_tokenconvert_tokens_to_idsaudio_token_idmax_audio_lensuper__init__audio_bos_tokenaudio_eos_tokenaudio_bos_token_idaudio_eos_token_id)	selffeature_extractor	tokenizerr)   r*   r0   r1   r-   	__class__s	           r#   r/   MusicFlamingoProcessor.__init__S   si     ''==kJ**]S.."+"A"A/"R"+"A"A/"Rr"   c                 2    US-
  S-  S-   nUS-
  S-  S-   nU$ )N      r   )r4   audio_lengthsconv_output_lengthsaudio_tokens_lengthss       r#   _get_audio_token_length.MusicFlamingoProcessor._get_audio_token_lengthf   s2    ,q0Q6: 3a 7A=A##r"   c                    [         R                  " [         R                  " UR                  S5      U5       Vs/ s H  oDR                  5       PM     sn5      nU R	                  U5      n[
        R                  " [
        R                  " U R                  5      5      n[        U5       HB  u  pUR                  U R                  U R                  U	-  -   U R                  -   X   5      X'   MD     U$ s  snf )N)torchstacksplitsumr?   recompileescaper*   	enumeratesubr0   r1   )
r4   textpadding_maskper_sample_windowssr<   r>   audio_token_patterniaudio_lengths
             r#   _expand_audio_tokens+MusicFlamingoProcessor._expand_audio_tokensk   s    ekk,BRBRSUBVXj6k$l6kUUW6k$lm#;;MJ jj43C3C)DE()=>OA)--$$t'7'7,'FFI]I]]DG  ?
  %ms   C1c                 Z    XR                   :H  XR                  :H  -  XR                  :H  -  $ N)r,   r2   r3   )r4   	input_idss     r#   _get_audio_tokens_mask-MusicFlamingoProcessor._get_audio_tokens_maskv   s5    ---33353335	
r"   rL   audiooutput_labelskwargsreturnc           
         U R                   " [        4SU R                  R                  0UD6nUS   nUS   nUR	                  S5      nUS:w  a"  [        U R                  R                   S35      e[        U[        5      (       a  U/nO=[        U[        [        45      (       a  [        S U 5       5      (       d  [        S5      e0 n	UGb  [        U5      n[        U5      [        U5      :w  a$  [        S	[        U5       S
[        U5       S35      e[        US   U R                   R"                  -  5      n
[        U R$                  U R                   R"                  -  5      n/ n/ nU H  n[        UR&                  S   5      n[)        SX-   S-
  U
-  5      nUU:  a;  [*        R-                  SXS   -  S SU R$                   SU R$                   S35        UnUR/                  U5        [1        UUU
-  5      n[3        U5       H.  nUU
-  n[1        US-   U
-  U5      nUR/                  UUU 5        M0     M     U R                   " U40 UD6n	U	R5                  S5      nUU	S'   U R7                  UUU5      nU R                  " U40 UD6n0 UEU	EnU(       aH  US   R9                  5       nSUU R;                  U5      '   SUUU R                  R<                  :H  '   UUS'   [?        UUS9$ )a  
Main method to prepare one or several text sequence(s) and audio waveform(s) for the model. This
method expands `<sound>` placeholders in the text based on the post-pool frame counts of the
audio windows, then tokenizes the provided strings as-is, and extracts log-mel features
with [`WhisperFeatureExtractor`]. If `audio` is `None`, no audio processing is performed and
the text is tokenized as-is (LM-only behavior).

Args:
    text (`str` or `list[str]`):
        Input sequence or batch of sequences.
    audio (`np.ndarray` or `list[np.ndarray]`):
        Input audio or batch of audios as NumPy arrays. If provided, there must be as many `text` inputs as
        `audio` inputs.
    output_labels (bool, *optional*, default=False):
        Whether to return labels for training.

Returns:
    [`BatchFeature`]: A dictionary with tokenized text (`input_ids`, `attention_mask`) and
    audio features (`input_features`, `input_features_mask`).
tokenizer_init_kwargsr   r   r   r   z% only supports `return_tensors='pt'`.c              3   B   #    U  H  n[        U[        5      v   M     g 7frV   )
isinstancestr).0ts     r#   	<genexpr>2MusicFlamingoProcessor.__call__.<locals>.<genexpr>   s     9[VZQR*Q:L:LVZs   zAInvalid input text. Please provide a string, or a list of stringszGot z
 text but z audios; they must match 1:1.r   r   r:   zAudio duration (z.1fzs) exceeds zs; truncating to first zs.attention_maskinput_features_maskrW   ilabels)datatensor_type) _merge_kwargsr   r6   init_kwargsget
ValueErrorr7   r   ra   rb   listtupleallr   lenintr5   chunk_lengthr-   shapemaxloggerwarningappendminrangepoprS   clonerX   pad_token_idr   )r4   rL   rZ   r[   r\   call_kwargsr   r   r   audio_inputswindow_sizemax_windowsrN   flat_chunksaudio_el	n_samplesn_wintime_caprQ   startendrM   text_inputsrj   ri   s                            r#   __call__MusicFlamingoProcessor.__call__}   s!   : (((
"&.."<"<
 
 "-0">2$)9:T! 7 788]^__dC  6DTD%=11c9[VZ9[6[6[`aa&u-E4yCJ& 4D	{*SZLHe!fgg l?;d>T>T>a>aabKd00D4J4J4W4WWXK,.,.K!q 12	A	 7! ;KL;&NN*9O7T+TUX*YYdeiewewdx  yP  QU  Qc  Qc  Pd  df  g (E"))%0y%+*=>uAOEq1u3X>C&&xc':; & ""  11+NNL'++,<=L2>L./ ,,T<ASTD nnT9[9.+..+&,,.F:>F4..v67<@F6T^^8889#DN>BBr"   c                     U R                   R                  nU R                  R                  n[        [        R                  X-   S/-   5      5      $ )Nrh   )r6   model_input_namesr5   rp   dictfromkeys)r4   	tok_names	fea_namess      r#   r   (MusicFlamingoProcessor.model_input_names   sB    NN44	**<<	DMM)"7;P:Q"QRSSr"   )r0   r2   r1   r3   r*   r,   r-   )Nz<sound>z<|sound_bos|>z<|sound_eos|>i  )NF)r   r   r   r   __doc__r/   r?   rS   rX   r
   rp   r   boolr	   r   r   r   propertyrb   r   r!   __classcell__)r7   s   @r#   r&   r&   9   s    : ''S&$
	
 $(%*	]C$y/)]C D ]C d{	]C
 56]C 
]C~ T49 T Tr"   r&   )rG   numpynpaudio_utilsr   r   feature_extraction_utilsr   processing_utilsr   r   r	   tokenization_utils_baser
   utilsr   r   rC   
get_loggerr   rx   r   r&   __all__r   r"   r#   <module>r      si   , 
  9 4 H H 0 0  
		H	%#35 "gT^ gTT $
$r"   