
    Z jS6                         S SK r S SKrSSKJrJr  SSKJr  SSKJ	r	J
r
Jr  SSKJr  SSKJrJr  \" 5       (       a  S SKr\R$                  " \5      r " S S	\	S
S9r " S S\
5      rS/rg)    N   )
AudioInputmake_list_of_audio)BatchFeature)ProcessingKwargsProcessorMixinUnpack)	TextInput)is_torch_availableloggingc                   2    \ rS rSrSS0SSSS.SSS	.S
.rSrg)AudioFlamingo3ProcessorKwargs"   paddingTi>  
max_length)sampling_ratereturn_attention_maskr   ptleft)return_tensorspadding_side)text_kwargsaudio_kwargscommon_kwargs N)__name__
__module____qualname____firstlineno__	_defaults__static_attributes__r       ڍ/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/audioflamingo3/processing_audioflamingo3.pyr   r   "   s2     t
 #%)#
 #"
Ir"   r   F)totalc                     ^  \ rS rSrSr    SU 4S jjrS rS rS r  SS	\	\
\	   -  S
\S-  S\S-  S\\   S\4
S jjr\S\
\   4S j5       r SS
\\
\   -  \-  S\\
\   -  S-  S\\   S\4S jjrSS.S jrS rS	\S\4S jrSrU =r$ )AudioFlamingo3Processor3   a  
Constructs an AudioFlamingo3 processor which wraps an AudioFlamingo3 feature extractor and an AudioFlamingo3
tokenizer into a single processor.

[`AudioFlamingo3Processor`] offers all the functionalities of [`WhisperFeatureExtractor`] and
[`Qwen2TokenizerFast`]. See the [`~AudioFlamingo3Processor.__call__`] for more information.

Args:
        feature_extractor ([`WhisperFeatureExtractor`]):
            The feature extractor is a required input.
        tokenizer ([`Qwen2TokenizerFast`]):
            The tokenizer is a required input.
        chat_template (`Optional[str]`, *optional*):
            The Jinja template to use for formatting the conversation. If not provided, the tokenizer's default chat
            template will be used.
        audio_token (`Optional[str]`, *optional*, defaults to `"<sound>"`):
            Special token used to represent audio inputs in the chat template.
        default_transcription_prompt (`str`, *optional*, defaults to `"Transcribe the input speech."`):
            Default prompt to use for transcription tasks when applying transcription requests.
        max_audio_len (`int`, *optional*, defaults to 600):
            Maximum length of audio sequences in seconds. Audio longer than this will be truncated.
Nc                 r   > X@l         UR                  U5      U l        XPl        X`l        [
        TU ]  XUS9  g )N)chat_template)audio_tokenconvert_tokens_to_idsaudio_token_iddefault_transcription_promptmax_audio_lensuper__init__)selffeature_extractor	tokenizerr)   r*   r-   r.   	__class__s          r#   r0    AudioFlamingo3Processor.__init__K   s=     ''==kJ,H)**]Sr"   c                 2    US-
  S-  S-   nUS-
  S-  S-   nU$ )N      r   )r1   audio_lengthsconv_output_lengthsaudio_tokens_lengthss       r#   _get_audio_token_length/AudioFlamingo3Processor._get_audio_token_lengthZ   s2    ,q0Q6: 3a 7A=A##r"   c                    [         R                  " [         R                  " UR                  S5      U5       Vs/ s H  oDR                  5       PM     sn5      nU R	                  U5      n[
        R                  " [
        R                  " U R                  5      5      n[        U5       H(  u  pUR                  U R                  U	-  X   5      X'   M*     U$ s  snf )N)torchstacksplitsumr<   recompileescaper*   	enumeratesub)
r1   textpadding_maskper_sample_windowssr9   r;   audio_token_patterniaudio_lengths
             r#   _expand_audio_tokens,AudioFlamingo3Processor._expand_audio_tokens_   s    ekk,BRBRSUBVXj6k$l6kUUW6k$lm#;;MJ jj43C3C)DE()=>OA)--d.>.>.MtwWDG  ? %ms   Cc                     XR                   :H  $ N)r,   )r1   	input_idss     r#   _get_audio_tokens_mask.AudioFlamingo3Processor._get_audio_tokens_maskg   s    ////r"   FrI   audiooutput_labelskwargsreturnc           
         U R                   " [        4SU R                  R                  0UD6nUS   nUS   nUR	                  S5      nUS:w  a"  [        U R                  R                   S35      e[        U[        5      (       a  U/nO=[        U[        [        45      (       a  [        S U 5       5      (       d  [        S5      e0 n	UGb  [        U5      n[        U5      [        U5      :w  a$  [        S	[        U5       S
[        U5       S35      e[        US   U R                   R"                  -  5      n
[        U R$                  U R                   R"                  -  5      n/ n/ nU H  n[        UR&                  S   5      n[)        SX-   S-
  U
-  5      nUU:  a;  [*        R-                  SXS   -  S SU R$                   SU R$                   S35        UnUR/                  U5        [1        UUU
-  5      n[3        U5       H.  nUU
-  n[1        US-   U
-  U5      nUR/                  UUU 5        M0     M     U R                   " U40 UD6n	U	R5                  S5      nUU	S'   U R7                  UUU5      nU R                  " U40 UD6n0 UEU	EnU(       aH  US   R9                  5       nSUU R;                  U5      '   SUUU R                  R<                  :H  '   UUS'   [?        UUS9$ )a  
Main method to prepare one or several text sequence(s) and audio waveform(s) for the model. This
method expands `<sound>` placeholders in the text based on the post-pool frame counts of the
audio windows, then tokenizes the provided strings as-is, and extracts log-mel features
with [`WhisperFeatureExtractor`]. If `audio` is `None`, no audio processing is performed and
the text is tokenized as-is (LM-only behavior).

Args:
    text (`str` or `list[str]`):
        Input sequence or batch of sequences.
    audio (`np.ndarray` or `list[np.ndarray]`):
        Input audio or batch of audios as NumPy arrays. If provided, there must be as many `text` inputs as
        `audio` inputs.
    output_labels (bool, *optional*, default=False):
        Whether to return labels for training.

Returns:
    [`BatchFeature`]: A dictionary with tokenized text (`input_ids`, `attention_mask`) and
    audio features (`input_features`, `input_features_mask`).
tokenizer_init_kwargsr   r   r   r   z% only supports `return_tensors='pt'`.c              3   B   #    U  H  n[        U[        5      v   M     g 7frS   
isinstancestr).0ts     r#   	<genexpr>3AudioFlamingo3Processor.__call__.<locals>.<genexpr>   s     9[VZQR*Q:L:LVZ   zAInvalid input text. Please provide a string, or a list of stringszGot z
 text but z audios; they must match 1:1.r   r   r7   zAudio duration (z.1fzs) exceeds zs; truncating to first zs.attention_maskinput_features_maskrT   ilabels)datatensor_type) _merge_kwargsr   r3   init_kwargsget
ValueErrorr4   r   r_   r`   listtupleallr   lenintr2   chunk_lengthr.   shapemaxloggerwarningappendminrangepoprP   clonerU   pad_token_idr   )r1   rI   rW   rX   rY   call_kwargsr   r   r   audio_inputswindow_sizemax_windowsrK   flat_chunksaudio_el	n_samplesn_wintime_caprN   startendrJ   text_inputsri   rh   s                            r#   __call__ AudioFlamingo3Processor.__call__j   s!   : (()
"&.."<"<
 
 "-0">2$)9:T! 7 788]^__dC  6DTD%=11c9[VZ9[6[6[`aa&u-E4yCJ& 4D	{*SZLHe!fgg l?;d>T>T>a>aabKd00D4J4J4W4WWXK,.,.K!q 12	A	 7! ;KL;&NN*9O7T+TUX*YYdeiewewdx  yP  QU  Qc  Qc  Pd  df  g (E"))%0y%+*=>uAOEq1u3X>C&&xc':; & ""  11+NNL'++,<=L2>L./ ,,T<ASTD nnT9[9.+..+&,,.F:>F4..v67<@F6T^^8889#DN>BBr"   c                     U R                   R                  nU R                  R                  n[        [        R                  X-   S/-   5      5      $ )Nrg   )r3   model_input_namesr2   ro   dictfromkeys)r1   	tok_names	fea_namess      r#   r   )AudioFlamingo3Processor.model_input_names   sB    NN44	**<<	DMM)"7;P:Q"QRSSr"   promptc           
      ~   [        U[        5      (       a  U/nO[        U[        [        45      (       a*  U(       a#  [	        S U 5       5      (       a  [        U5      nO[        [        U5      5      n[        5       (       a]  U Vs/ s HP  n[        U[        R                  5      (       a,  UR                  5       R                  5       R                  5       OUPMR     nn[        U5      nUS:X  a  [        S5      eUc  U R                  /U-  nO[        U[        5      (       a  U/U-  nO[        U[        [        45      (       a  [        U5      U:w  a  [        S[        U5       SU S35      e/ nU HT  nUc  UR                  U R                  5        M#  [        U[        5      (       a  UR                  U5        MK  [!        S5      e   O[!        S5      e[#        Xt5       V	V
s/ s H-  u  pS	S
U	S.[        U
[        5      (       a  SU
S.OSU
S./S./PM/     nn	n
U R$                  " U4SSSS.UD6$ s  snf s  sn
n	f )a  
Prepare inputs for automatic speech recognition without manually writing the default transcription prompt.

Args:
    audio (`str`, `list[str]`, `np.ndarray`, `torch.Tensor`, `list[np.ndarray]`, `list[torch.Tensor]`):
        Audio to transcribe. Strings are interpreted as local paths or URLs and will be loaded automatically by
        the chat template loader; NumPy arrays and PyTorch tensors are forwarded directly.
    prompt (`str` or `list[str]`, *optional*):
        Custom prompt(s) to include in the user turn. A list must be the same length as the batch. When `None`,
        each sample uses `"Transcribe the input speech."`.
    **kwargs:
        Additional keyword arguments forwarded to [`~AudioFlamingo3Processor.apply_chat_template`] (for example
        `text_kwargs`, `audio_kwargs`, ...).

Returns:
    [`BatchFeature`]: Processor outputs ready to be passed to [`AudioFlamingo3ForConditionalGeneration.generate`].

c              3   B   #    U  H  n[        U[        5      v   M     g 7frS   r^   )ra   els     r#   rc   FAudioFlamingo3Processor.apply_transcription_request.<locals>.<genexpr>   s     ?d^cXZ
2s@S@S^cre   r   z)`audio` must contain at least one sample.z	Received z prompt(s) for z$ audio sample(s); counts must match.z'Each prompt must be a string or `None`.z<`prompt` must be a string, a sequence of strings, or `None`.userrI   )typerI   rW   )r   path)r   rW   )rolecontentT)tokenizeadd_generation_promptreturn_dict)r_   r`   ro   rp   rq   r   r   r@   Tensordetachcpunumpyrr   rn   r-   ry   	TypeErrorzipapply_chat_template)r1   rW   r   rY   audio_itemsr   
batch_sizepromptsitemprompt_text
audio_itemconversationss               r#   apply_transcription_request3AudioFlamingo3Processor.apply_transcription_request   s-   2 eS!!38'Ke}--%C?d^c?d<d<du+K1%89K!##kvwkvegJr5<<<X<Xryy{0668^``kvw%
?HII>889JFG$$h+Gu..6{j( F}OJ<Gkl  G<NN4#D#DEc**NN4(#$MNN  Z[[ ,/w+D
 ,E' #!'=%j#66 ")*=&-
C	 
 ,E 	 
 ''
"&	

 
 	
S x4
s   AH4&4H9)strip_prefixc                    U R                   R                  " U0 UD6nU(       a   U Vs/ s H  oPR                  U5      PM     nnU$ s  snf )aB  
Forward arguments to [`~PreTrainedTokenizer.decode`] and optionally remove the assistant framing the model
was trained to produce.

AF3 transcription requests respond with sentences such as `"The spoken content of the audio is "..."."`.
Setting `strip_prefix=True` trims the fixed prefix for just the transcription text.
)r3   decode"_strip_assistant_prefix_and_quotes)r1   r   argsrY   decodedrI   s         r#   r   AudioFlamingo3Processor.decode   sJ     ..''88QXYQX>>tDQXGY Zs   Ac                 &    U R                   " U0 UD6$ )z)BC as previous examples used batch_decode)r   )r1   r   rY   s      r#   batch_decode$AudioFlamingo3Processor.batch_decode-  s    {{D+F++r"   c                 `   UR                  5       nS H7  nUR                  U5      (       d  M  U[        U5      S R                  5       n  O   UR                  S5      (       a  USS R                  5       n[        U5      S:  a(  US   US   :X  a  US   S;   a  USS R                  5       nU$ )	zY
Remove the assistant prefix and surrounding quotes from a decoded transcription string.
)z"The spoken content of the audio isz!The transcription of the audio isz!The content of the input audio isN.r?   r8   r   >   "'r7   )strip
startswithrr   endswith)r1   rI   strippedprefixs       r#   r   :AudioFlamingo3Processor._strip_assistant_prefix_and_quotes1  s    
 ::<
F
 ""6**#CKM288:
 S!!}**,Hx=A(1+""=(1+Q[B["~++-Hr"   )r*   r,   r-   r.   )Nz<sound>zTranscribe the input speech.iX  )NFrS   )r   r   r   r   __doc__r0   r<   rP   rU   r
   ro   r   boolr	   r   r   r   propertyr`   r   r   r   r   r   r!   __classcell__)r4   s   @r#   r&   r&   3   s,   6 %CT$
0 $(%*	]C$y/)]C D ]C d{	]C
 67]C 
]C~ T49 T T *.O
T#Y+O
 d3i$&O
 67	O

 
O
b */ ,s s  r"   r&   )rD   r   npaudio_utilsr   r   feature_extraction_utilsr   processing_utilsr   r   r	   tokenization_utils_baser
   utilsr   r   r@   
get_loggerr   rw   r   r&   __all__r   r"   r#   <module>r      sg     
  9 4 H H 0 0  
		H	%$4E "Tn Tn %
%r"   