
    Z j`2                        S SK r S SK Jr  SSKJr  SSKJr  SSKJr  SSKJ	r	J
r
  SSKJr  SS	KJrJrJrJr  SS
KJr  SSKJr  SSKJrJr  SSKJrJr  SSKJrJr  SSKJ r   \RB                  " \"5      r# " S S\5      r$ " S S\5      r% " S S\5      r&\" SS9 " S S\5      5       r' " S S\5      r(\" SS9 " S S \5      5       r)/ S!Qr*g)"    N)nn   )ACT2FN)Cache)create_bidirectional_mask)BaseModelOutputWithPoolingCausalLMOutputWithPast)Unpack)TransformersKwargsauto_docstringcan_return_tuplelogging)merge_with_config_defaults)capture_outputs   )Qwen2AudioEncoderQwen2AudioPreTrainedModel)VoxtralForConditionalGenerationVoxtralMultiModalProjector)WhisperAttentionWhisperEncoderLayer   )AudioFlamingo3Configc                       \ rS rSrSrg)AudioFlamingo3Attention'    N__name__
__module____qualname____firstlineno____static_attributes__r       ڊ/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/audioflamingo3/modular_audioflamingo3.pyr   r   '       r$   r   c                       \ rS rSrSrg)AudioFlamingo3EncoderLayer+   r   Nr   r   r$   r%   r(   r(   +   r&   r$   r(   c                       \ rS rSrSrg)AudioFlamingo3PreTrainedModel/   r   Nr   r   r$   r%   r+   r+   /   r&   r$   r+   zT
@auto_docstring(
    custom_intro="""
    The audio model from AudioFlamingo3 without any head or projection on top.
    """
)
class AudioFlamingo3Encoder(Qwen2AudioEncoder):
    """
    AudioFlamingo3 encoder: Whisper encoder, average pool (time/2), then LayerNorm.
    """

    _can_record_outputs = {"hidden_states": AudioFlamingo3EncoderLayer, "attentions": AudioFlamingo3Attention}

    @merge_with_config_defaults
    @capture_outputs
    def forward(
        self,
        input_features: torch.Tensor,
        input_features_mask: torch.Tensor | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple | BaseModelOutputWithPooling:
        r"""
        Args:
            input_features (`torch.FloatTensor` of shape `(batch_size, feature_size, sequence_length)`):
                Log-Mel features extracted from raw audio. Use the processor/feature extractor to compute and pad
                these features from waveform input.
            input_features_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding feature indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.
        """
        # The conv front-end downsamples the time axis (conv2 has stride 2), so the
        # post-conv sequence length and per-sample valid lengths are (T - 1) // 2 + 1.
        seq_len = (input_features.shape[-1] - 1) // 2 + 1
        input_features_lengths = input_features_mask.sum(-1)
        input_features_lengths = (input_features_lengths - 1) // 2 + 1
        attention_mask = torch.arange(seq_len, device=input_features.device)[None, :] < input_features_lengths[:, None]

        inputs_embeds = nn.functional.gelu(self.conv1(input_features))
        inputs_embeds = nn.functional.gelu(self.conv2(inputs_embeds))
        inputs_embeds = inputs_embeds.permute(0, 2, 1)
        hidden_states = inputs_embeds + self.embed_positions.weight
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)

        attention_mask = create_bidirectional_mask(
            config=self.config,
            inputs_embeds=hidden_states,
            attention_mask=attention_mask,
        )

        for encoder_layer in self.layers:
            # LayerDrop: randomly skip whole encoder layers during training.
            to_drop = self.training and torch.rand([]) < self.layerdrop
            if not to_drop:
                hidden_states = encoder_layer(hidden_states, attention_mask)

        # Average pool over time (halves the sequence length), then LayerNorm.
        hidden_states = hidden_states.permute(0, 2, 1)
        hidden_states = self.avg_pooler(hidden_states).permute(0, 2, 1)
        hidden_states = self.layer_norm(hidden_states)

        return BaseModelOutputWithPooling(last_hidden_state=hidden_states)
class AudioFlamingo3MultiModalProjector(VoxtralMultiModalProjector):
    """
    Audio adaptor (small MLP) that projects AudioFlamingo3Encoder features
    to the LLM embedding space so they can replace `<sound>` tokens.
    """

    def __init__(self, config: AudioFlamingo3Config):
        super().__init__()
        self.linear_1 = nn.Linear(
            config.audio_config.hidden_size, config.text_config.hidden_size, bias=config.projector_bias
        )
        self.act = ACT2FN[config.projector_hidden_act]
        self.linear_2 = nn.Linear(
            config.text_config.hidden_size, config.text_config.hidden_size, bias=config.projector_bias
        )
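# Illustrative sketch, not part of the modeling code: the shape bookkeeping from log-mel
# input to LLM-ready embeddings, assuming the stride-2 conv stack and the stride-2 average
# pooling described in `AudioFlamingo3Encoder`. For the standard 30-second Whisper window
# (3000 mel frames), the conv front-end yields (3000 - 1) // 2 + 1 = 1500 positions and the
# average pooling halves that to 750 frames, each of which the projector above maps to
# `text_config.hidden_size`. The helper below is hypothetical and only shows the arithmetic.
def _sketch_num_audio_embeds(num_mel_frames: int = 3000) -> int:
    post_conv = (num_mel_frames - 1) // 2 + 1  # conv2 has stride 2: 3000 -> 1500
    post_pool = post_conv // 2  # average pooling halves the time axis: 1500 -> 750
    return post_pool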
@auto_docstring(
    custom_intro="""
    The AudioFlamingo3 model which consists of a fine-tuned Whisper encoder, a multi-modal projector and a Qwen2 language model.
    """
)
class AudioFlamingo3ForConditionalGeneration(VoxtralForConditionalGeneration):
    _supports_attention_backend = True
    _tp_plan = None
    _pp_plan = None
    _keep_in_fp32_modules_strict = None

    def __init__(self, config):
        super().__init__(config)

    @can_return_tuple
    @auto_docstring(
        custom_intro="This method is used to get the audio embeddings from input features (a log mel spectrogram), meaning inferring the audio encoder and the multi-modal projector."
    )
    def get_audio_features(
        self,
        input_features: torch.FloatTensor,
        input_features_mask: torch.Tensor,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple | BaseModelOutputWithPooling:
        r"""
        input_features (`torch.FloatTensor`):
            Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be
            obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
            `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
            `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the mel features, padding
            and conversion into a tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]
        input_features_mask (`torch.Tensor` of shape `(batch_size, feature_sequence_length)`):
            Mask to avoid performing attention on padded feature indices.
        """
        audio_output = self.audio_tower(
            input_features, input_features_mask=input_features_mask, return_dict=True, **kwargs
        )
        audio_embeds = self.multi_modal_projector(audio_output.last_hidden_state)

        # Keep only the embeddings that correspond to real (non-padded) audio frames.
        input_lengths = input_features_mask.sum(-1).to(torch.long)
        _, post_lengths = self.audio_tower._get_feat_extract_output_lengths(input_lengths)
        valid_mask = torch.arange(audio_embeds.shape[1], device=audio_embeds.device)[None, :] < post_lengths[:, None]
        audio_output.pooler_output = audio_embeds[valid_mask.to(audio_embeds.device)]
        return audio_output
    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        input_features: torch.Tensor | None = None,
        input_features_mask: torch.Tensor | None = None,
        attention_mask: torch.Tensor | None = None,
        position_ids: torch.LongTensor | None = None,
        past_key_values: Cache | None = None,
        inputs_embeds: torch.Tensor | None = None,
        labels: torch.LongTensor | None = None,
        use_cache: bool | None = None,
        logits_to_keep: int | torch.Tensor = 0,
        **kwargs: Unpack[TransformersKwargs],
    ) -> CausalLMOutputWithPast:
        r"""
        input_features_mask (`torch.Tensor` of shape `(batch_size, feature_sequence_length)`):
            Mask to avoid performing attention on padding feature indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AudioFlamingo3ForConditionalGeneration, AutoProcessor

        >>> model_id = "nvidia/audio-flamingo-3-hf"
        >>> processor = AutoProcessor.from_pretrained(model_id)
        >>> model = AudioFlamingo3ForConditionalGeneration.from_pretrained(model_id, device_map="auto")

        >>> conversations = [
        >>>     [
        >>>         {
        >>>             "role": "user",
        >>>             "content": [
        >>>                 {"type": "text", "text": "Transcribe the input speech."},
        >>>                 {
        >>>                     "type": "audio",
        >>>                     "path": "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/t_837b89f2-26aa-4ee2-bdf6-f73f0dd59b26.wav",
        >>>                 },
        >>>             ],
        >>>         }
        >>>     ],
        >>>     [
        >>>         {
        >>>             "role": "user",
        >>>             "content": [
        >>>                 {
        >>>                     "type": "text",
        >>>                     "text": "This track feels really peaceful and introspective. What elements make it feel so calming and meditative?",
        >>>                 },
        >>>                 {"type": "audio", "path": "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/FPSbCAANfbJLVSwD.mp3"},
        >>>             ],
        >>>         }
        >>>     ],
        >>> ]

        >>> inputs = processor.apply_chat_template(
        >>>     conversations,
        >>>     tokenize=True,
        >>>     add_generation_prompt=True,
        >>>     return_dict=True,
        >>> ).to(model.device)

        >>> outputs = model.generate(**inputs, max_new_tokens=500)

        >>> decoded_outputs = processor.batch_decode(
        >>>     outputs[:, inputs["input_ids"].shape[1]:], skip_special_tokens=True
        >>> )
        >>> print(decoded_outputs)
        ["The spoken content of the audio is...", "The track's calming and meditative feel can be attributed to..."]
        ```"""
        if inputs_embeds is None:
            inputs_embeds = self.get_input_embeddings()(input_ids)

        if input_features is not None and input_features_mask is not None:
            audio_embeds = self.get_audio_features(
                input_features, input_features_mask, return_dict=True
            ).pooler_output

            # Replace each `<sound>` placeholder embedding with its audio frame embedding.
            audio_token_mask = (input_ids == self.config.audio_token_id).unsqueeze(-1)
            inputs_embeds = inputs_embeds.masked_scatter(
                audio_token_mask.to(inputs_embeds.device), audio_embeds.to(inputs_embeds.device)
            )

        outputs = self.language_model(
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            labels=labels,
            use_cache=use_cache,
            logits_to_keep=logits_to_keep,
            **kwargs,
        )
        return outputs

    def prepare_inputs_for_generation(self, *args, is_first_iteration: bool = False, **kwargs):
        input_features = kwargs.pop("input_features", None)
        input_features_mask = kwargs.pop("input_features_mask", None)
        model_inputs = super().prepare_inputs_for_generation(*args, **kwargs)

        # Audio features are only needed for the first (prefill) forward pass; afterwards the
        # merged audio embeddings live in the cache.
        if is_first_iteration or not model_inputs.get("use_cache", False):
            if input_features is not None:
                model_inputs["input_features"] = input_features
            if input_features_mask is not None:
                model_inputs["input_features_mask"] = input_features_mask

        return model_inputs
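# Illustrative sketch with toy sizes (hypothetical helper, not used by the modeling code):
# `masked_scatter` fills masked positions in order, so the i-th valid audio embedding
# produced by `get_audio_features` lands on the i-th `<sound>` placeholder of the batch.
def _sketch_sound_token_splice() -> torch.Tensor:
    hidden_size, audio_token_id = 4, 9
    input_ids = torch.tensor([[1, 9, 9, 2]])  # a prompt with two <sound> placeholders
    inputs_embeds = torch.zeros(1, 4, hidden_size)  # stand-in for the token embeddings
    audio_embeds = torch.ones(2, hidden_size)  # one row per valid audio frame
    audio_token_mask = (input_ids == audio_token_id).unsqueeze(-1)
    # Positions 1 and 2 now carry rows of `audio_embeds`; positions 0 and 3 keep their
    # original (text) embeddings.
    return inputs_embeds.masked_scatter(audio_token_mask, audio_embeds)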
__all__ = ["AudioFlamingo3ForConditionalGeneration", "AudioFlamingo3PreTrainedModel", "AudioFlamingo3Encoder"]