
    Z jK                        S SK r S SKJr  S SKJr  S SKJrJr  SSKJ	r
  SSKJr  SSKJr  SS	KJrJr  SS
KJr  SSKJr  SSKJrJrJrJr  SSKJr  SSKJrJr  SSK J!r!  SSK"J#r#  SSK$J%r%  \" 5       (       a  S SKr\" SS9\ " S S\5      5       5       r& " S S\!5      r'S r(S r) " S S\%5      r* " S S\5      r+\" SS 9 " S! S"\5      5       r,/ S#Qr-g)$    N)pi)strict)Tensorbroadcast_tensors   )initialization)Cache)PreTrainedConfig)BaseModelOutputWithPoolingCausalLMOutputWithPast)PreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tupleis_torch_available   )AudioFlamingo3Config)&AudioFlamingo3ForConditionalGenerationAudioFlamingo3PreTrainedModel)AudioFlamingo3Processor)CONFIG_MAPPING)MoonshineRotaryEmbeddingznvidia/music-flamingo-2601-hf)
checkpointc                   ^    \ rS rSr% SrSr\\S'   Sr\\S'   Sr	\
\S'   S	r\S	-  \S
'   S rSrg	)MusicFlamingoConfig+   a5  
audio_bos_token_id (`int`, *optional*, defaults to 151670):
    The beginning-of-audio token index used to mark the start of audio spans.
audio_eos_token_id (`int`, *optional*, defaults to 151671):
    The end-of-audio token index used to mark the end of audio spans.
audio_frame_step (`float`, *optional*, defaults to 0.01):
    Duration in seconds of one input mel frame (trained with hop_length 160 at sampling_rate 16000).

Example:

```python
>>> from transformers import MusicFlamingoForConditionalGeneration, MusicFlamingoConfig, AudioFlamingo3EncoderConfig, Qwen2Config

>>> # Initializing an MusicFlamingoEncoder config
>>> audio_config = AudioFlamingo3EncoderConfig()

>>> # Initializing a Qwen2 config
>>> text_config = Qwen2Config()

>>> # Initializing an MusicFlamingo configuration
>>> configuration = MusicFlamingoConfig(audio_config, text_config)

>>> # Initializing a model from the musicflamingo style configuration
>>> model = MusicFlamingoForConditionalGeneration(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
```ivP audio_bos_token_idiwP audio_eos_token_idg{Gz?audio_frame_stepNrope_parametersc                     [        U R                  [        5      (       aN  U R                  S   S;   a  SU R                  S'   [        U R                  S      " S
0 U R                  D6U l        O U R                  c  [        S   " 5       U l        [        U R                  [        5      (       aU  U R                  R                  SS5      U R                  S'   [        U R                  S      " S
0 U R                  D6U l        O U R                  c  [        S   " 5       U l        U R                  c  SSSS.U l        U R                  S	   U l        U R                  R                  U l	        [        R                  " S
0 UD6  g )N
model_type)Nmusicflamingo_encoderaudioflamingo3_encoderqwen2default  g?)	rope_type
rope_thetapartial_rotary_factorr*    )
isinstanceaudio_configdictr   text_configgetr!   max_position_embeddingshidden_sizehead_dimr
   __post_init__)selfkwargss     ڈ/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/musicflamingo/modular_musicflamingo.pyr5   !MusicFlamingoConfig.__post_init__P   sJ   d''..  .2QQ2J!!,/ .t/@/@/N O dRVRcRc dD& ./G H JDd&&---1-=-=-A-A,PW-XD\*-d.>.>|.LMaPTP`P`aD%-g68D'1:$il#mD '+';';L'I$))55&&00    )r.   r4   r2   r!   r0   )__name__
__module____qualname____firstlineno____doc__r   int__annotations__r   r    floatr!   r/   r5   __static_attributes__r,   r:   r8   r   r   +   s=    : %$$$"e"#'OTD['1r:   r   c                   ^   ^  \ rS rSrSr     SU 4S jjrS rS rS rS r	S r
S	 rS
rU =r$ )MusicFlamingoProcessorf   a  
Constructs an MusicFlamingo processor which wraps an MusicFlamingo feature extractor and an MusicFlamingo
tokenizer into a single processor.

[`MusicFlamingoProcessor`] offers all the functionalities of [`WhisperFeatureExtractor`] and
[`Qwen2TokenizerFast`]. See the [`~MusicFlamingoProcessor.__call__`] for more information.

Args:
    feature_extractor ([`WhisperFeatureExtractor`]):
        The feature extractor is a required input.
    tokenizer ([`Qwen2TokenizerFast`]):
        The tokenizer is a required input.
    chat_template (`Optional[str]`, *optional*):
        The Jinja template to use for formatting the conversation. If not provided, the tokenizer's default chat
        template will be used.
    audio_token (`Optional[str]`, *optional*, defaults to `"<sound>"`):
        Special token used to represent audio inputs in the chat template.
    audio_bos_token (`Optional[str]`, *optional*, defaults to `"<|sound_bos|>"`):
        Special token used to represent the beginning of audio.
    audio_eos_token (`Optional[str]`, *optional*, defaults to `"<|sound_eos|>"`):
        Special token used to represent the end of audio.
    max_audio_len (`int`, *optional*, defaults to 1200):
        Maximum length of audio sequences in seconds. Audio longer than this will be truncated.
c                    > [         TU ]  UUUUUS9  U ?XPl        X`l        UR                  U5      U l        UR                  U5      U l        g )N)chat_templateaudio_tokenmax_audio_len)super__init__default_transcription_promptaudio_bos_tokenaudio_eos_tokenconvert_tokens_to_idsr   r   )	r6   feature_extractor	tokenizerrH   rI   rN   rO   rJ   	__class__s	           r8   rL   MusicFlamingoProcessor.__init__   s`     	'#' 	 	
 -.."+"A"A/"R"+"A"A/"Rr:   c                    [         R                  " [         R                  " UR                  S5      U5       Vs/ s H  oDR                  5       PM     sn5      nU R	                  U5      n[
        R                  " [
        R                  " U R                  5      5      n[        U5       HB  u  pUR                  U R                  U R                  U	-  -   U R                  -   X   5      X'   MD     U$ s  snf )N)torchstacksplitsum_get_audio_token_lengthrecompileescaperI   	enumeratesubrN   rO   )
r6   textpadding_maskper_sample_windowssaudio_lengthsaudio_tokens_lengthsaudio_token_patterniaudio_lengths
             r8   _expand_audio_tokens+MusicFlamingoProcessor._expand_audio_tokens   s    ekk,BRBRSUBVXj6k$l6kUUW6k$lm#;;MJ jj43C3C)DE()=>OA)--$$t'7'7,'FFI]I]]DG  ?
  %ms   C1c                 Z    XR                   :H  XR                  :H  -  XR                  :H  -  $ N)audio_token_idr   r   )r6   	input_idss     r8   _get_audio_tokens_mask-MusicFlamingoProcessor._get_audio_tokens_mask   s5    ---33353335	
r:   c                     [        S5      eNz/This method is not supported for MusicFlamingo.NotImplementedErrorr6   argsr7   s      r8   apply_transcription_request2MusicFlamingoProcessor.apply_transcription_request       !"STTr:   c                     [        S5      eNz5MusicFlamingo does not need to overwrite this method.rt   rv   s      r8   decodeMusicFlamingoProcessor.decode       !"YZZr:   c                     [        S5      er|   rt   rv   s      r8   batch_decode#MusicFlamingoProcessor.batch_decode   r   r:   c                     [        S5      ers   rt   rv   s      r8   "_strip_assistant_prefix_and_quotes9MusicFlamingoProcessor._strip_assistant_prefix_and_quotes   rz   r:   )rN   r   rO   r   )Nz<sound>z<|sound_bos|>z<|sound_eos|>r(   )r;   r<   r=   r>   r?   rL   rj   rp   rx   r}   r   r   rC   __classcell__rS   s   @r8   rE   rE   f   sG    : ''S.	
U[[U Ur:   rE   c                     U R                   " / U R                  S S QSPSP76 n U R                  SS9u  p[        R                  " U* U4SS9n U R                  S5      $ )NrV   r   dim)reshapeshapeunbindrW   rX   flatten)xx1x2s      r8   rotate_halfr      s]    			'1773B<''Q'AXX"XFBbS"I2&A99R=r:   c                 N   U R                   nU R                  [        R                  5      n UR                  U 5      nUR                  U 5      nUR                  S   nU SUS 24   nU SS U24   nXa-  [        U5      U-  -   n[        R                  " Xe4SS9R                  U5      $ )NrV   .r   )dtypetorW   float64r   r   cat)hidden_statescossinoriginal_dtyperot_dimpassthroughrotateds          r8   apply_rotary_time_embr      s    "((N!$$U]]3M
&&
C
&&
CiimGWX.KC'M*G}W!5!;<G99g+477GGr:   c            	          ^  \ rS rSrSrSS\4U 4S jjjrS r\R                  " 5       S\
S\S\\
\
4   4S	 j5       rS
rU =r$ )MusicFlamingoRotaryEmbedding   a  Rotary time embedding module used by MusicFlamingo checkpoints.

This is a checkpoint-faithful integration, not a direct implementation of the RoTE formulation described in
(Goel et al., 2024): https://arxiv.org/abs/2410.12109. It applies axial rotary embeddings over the window index
within each audio sample and the encoder time index within each window, then modulates both axes with absolute
timestamps in seconds.
configc                 x   > [         TU ]  XS9  U R                  U R                  5      nU R	                  SUSS9  g )Ndeviceposition_anglesF)
persistent)rK   rL   _compute_position_anglesinv_freqregister_buffer)r6   r   r   r   rS   s       r8   rL   %MusicFlamingoRotaryEmbedding.__init__   s=    /77F.ERr:   c                 2   [         R                  " [        U R                  5      UR                  UR
                  S9nX R                  -  S[        -  -  nUR                  S5      U-  n[         R                  " USSS9nUR                  UR
                  S9$ )Nr   r   r   rV   r   )r   )
rW   aranger@   max_seq_len_cachedr   r   r   	unsqueezerepeat_interleaver   )r6   r   	positionsr   s       r8   r   5MusicFlamingoRotaryEmbedding._compute_position_angles   s    LLT%<%<!=hoo]e]k]kl	 7 771r6B	#--b1H<11/1"M!!!77r:   
timestampsseq_lenreturnc                    USS2S4   R                  U R                  R                  U R                  R                  S9nU R                  R
                  S-  U-  n[        R                  " X4-  5      U R                  -  nUR                  S5      U R                  -  n[        R                  " USSS9nUSS2SSS24   nU R                  SU SSS2SS24   n[        Xg5      u  pg[        R                  " Xg4SS9nU* S-  [        -  R                  U5      n	XR                  S5      -  nUR                  5       UR!                  5       4$ )zBCompute 2D axial rotary embeddings for window and time dimensions.Nr   r      rV   r   r   )r   r   r   r   r   r    rW   roundr   r   r   r   r   r   r   r   r   )
r6   r   r   window_startswindow_durationwindow_positionswindow_freqs
time_freqsfreqsangles
             r8   forward$MusicFlamingoRotaryEmbedding.forward   s+   
 #1a4(++4==3G3Gt}}ObOb+c++66:WD ;;}'FG$JaJaa'11"5E..|QBG $AtQJ/))(73D!QJ?
#4\#N 		<4"=q2%))%0++yy{EIIK''r:   r,   rm   )r;   r<   r=   r>   r?   r   rL   r   rW   no_gradr   r@   tupler   rC   r   r   s   @r8   r   r      sZ    S2 S S
8 ]]_(& (3 (5;P ( (r:   r   c                   F    \ rS rSrSr\R                  " 5       S 5       rSrg)MusicFlamingoPreTrainedModel   Nc                     [         R                  " X5        [        U[        5      (       a=  UR	                  UR
                  5      n[        R                  " UR                  U5        g g rm   )	r   _init_weightsr-   r   r   r   initcopy_r   )r6   modulebuffer_values      r8   r   *MusicFlamingoPreTrainedModel._init_weights   sK    %%d3f:;;!::6??KLJJv--|< <r:   r,   )	r;   r<   r=   r>   _no_split_modulesrW   r   r   rC   r,   r:   r8   r   r      s     
]]_= =r:   r   z
    The MusicFlamingo model which consists of a fine-tuned Whisper encoder, rotary time embedding, a multi-modal projector, and a Qwen2 language model.
    custom_introc                   P  ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  S\S\R                  4S jr
\\" S	S
9S\R                  S\R                  S\R                  S\\   S\\-  4
S j5       5       r\\          SS\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\S-  S\R                  S-  S\R                  S-  S\S-  S\\R                  -  S\\   S\4S jj5       5       rSrU =r$ )%MusicFlamingoForConditionalGeneration   r   c                 D   > [         TU ]  U5        [        U5      U l        g rm   )rK   rL   r   pos_emb)r6   r   rS   s     r8   rL   .MusicFlamingoForConditionalGeneration.__init__  s     3F;r:   ro   post_lengthsmax_post_lengthr   c                    XR                   R                  :H  n[        R                  " [        R                  R
                  R                  UR                  5       SSS9SS9n[        R                  " US:H  5      u  pg[        R                  " US:H  5      u  phX-
  R                  [        R                  5      n	U R                   R                  S-  n
[        R                  " X2R                  [        R                  S9U
-  n[        R                  " [        R                   " SUR                  S	9[        R"                  " USS9S S /5      n[        R"                  " U	SS9n[        R$                  " XS
S9n[        R$                  " U[        R                  " U	R&                  S   UR                  S	95      n[        R                  " UR&                  S   UR                  S	9X   -
  nUR)                  S5      U-  U
-  U-   $ )N)   r   r   )valuer   r   rV   r   r   r   T)right)r   rn   rW   diffnn
functionalpadr@   wherer   longr    r   r   float32r   zeroscumsumsearchsortedr   r   )r6   ro   r   r   audio_token_maskr   _startsendssample_lengthsaudio_embed_frame_stepframe_offsetscumsum_postcumsum_samplessample_indicessample_start_rowswindow_indicess                    r8   _build_audio_timestamps=MusicFlamingoForConditionalGeneration._build_audio_timestamps	  s    %(B(BBzz%((--112B2F2F2H&XY1Z`abKK	*	++dbj)-++EJJ7 "&!=!=!ALL1D1DEMMZ]ss 	
 iiQ|7J7J!KU\\ZflmMnorprMs tun!<++NtT "..ELL)=)=a)@I\I\]
 LL++A.|7J7JKN_Noo 	
 ''*_<?UUXeeer:   zThis method is used to get the audio embeddings from input features (a log mel spectrogram), meaning inferring the audio encoder and the multi-modal projector.r   input_featuresinput_features_maskr7   c                    U R                   " U4USS.UD6nUR                  nU R                   R                  UR                  S5      R	                  [
        R                  5      5      u  pxU R                  X8UR                  S   5      n	U R                  U	R	                  UR                  5      UR                  S   S9u  p[        XjU5      nU R                  U5      n[
        R                  " UR                  S   UR                  S9SSS24   USS2S4   :  nXR	                  UR                  5         Ul        U$ )	aR  
input_features_mask (`torch.Tensor` of shape `(batch_size, feature_sequence_length)`):
    Mask to avoid performing attention on padded feature indices.
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
    Token ids containing the audio token ID placeholders, for reconstructing rotary time embedding timestamps.
T)r   return_dictrV   r   )r   r   r   N)audio_towerlast_hidden_state _get_feat_extract_output_lengthsrZ   r   rW   r   r   r   r   r   r   multi_modal_projectorr   pooler_output)r6   r   r   ro   r7   audio_outputr   r   r   audio_timestampsr   r   audio_embeds
valid_masks                 r8   get_audio_features8MusicFlamingoForConditionalGeneration.get_audio_features+  s@   " ''
 3
 	
 %66**KKL_LcLcdfLgLjLjkpkukuLvw77	Q^QdQdegQhi<< 0 3 3M4H4H IS`SfSfgiSj<k-m#F11-@ \\,"4"4Q"7@S@STUY[\U\]`lmnptmt`uu
%1--@S@S2T%U"r:   Nattention_maskposition_idspast_key_valuesinputs_embedslabels	use_cachelogits_to_keepc                    Uc  U R                  5       " U5      nUb  Ub  U R                  X#USS9R                  nXR                  R                  :H  R                  S5      nUR                  UR                  UR                  5      UR                  UR                  5      5      nU R                  " SUUUUUU	U
S.UD6nU$ )a  
input_features_mask (`torch.Tensor` of shape `(batch_size, feature_sequence_length)`, *optional*):
    Mask to avoid performing attention on padding feature indices. Mask values selected in `[0, 1]`:

    - 1 for tokens that are **not masked**,
    - 0 for tokens that are **masked**.
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

Example:

```python
>>> from transformers import MusicFlamingoForConditionalGeneration, AutoProcessor

>>> model_id = "nvidia/music-flamingo-2601-hf"
>>> processor = AutoProcessor.from_pretrained(model_id)
>>> model = MusicFlamingoForConditionalGeneration.from_pretrained(model_id, device_map="auto")

>>> conversation = [
>>>     {
>>>         "role": "user",
>>>         "content": [
>>>             {
>>>                 "type": "text",
>>>                 "text": "Describe this track in full detail - tell me the genre, tempo, and key, then dive into the instruments, production style, and overall mood it creates.",
>>>             },
>>>             {
>>>                 "type": "audio",
>>>                 "path": "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/song_1.mp3",
>>>             },
>>>         ],
>>>     }
>>> ]

>>> inputs = processor.apply_chat_template(
>>>     conversation,
>>>     tokenize=True,
>>>     add_generation_prompt=True,
>>>     return_dict=True,
>>> ).to(model.device, model.dtype)

>>> outputs = model.generate(**inputs, max_new_tokens=100)

>>> decoded_outputs = processor.batch_decode(
>>>     outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True
>>> )
>>> print(decoded_outputs)
["This track is an uplifting Eurodance-style Trance-Pop anthem..."]
```T)ro   r   rV   )r
  r  r  r	  r  r  r  r,   )
get_input_embeddingsr  r   r   rn   r   masked_scatterr   r   language_model)r6   ro   r   r   r  r  r	  r
  r  r  r  r7   r  r   outputss                  r8   r   -MusicFlamingoForConditionalGeneration.forwardO  s    F   557	BM%)*?22yVZ 3 m 
 !*[[-G-G GRRSUV)88 ##M$8$89<??=K_K_;`M +/*=*= 	+
')%+)	+
 	+
 r:   )r   )
NNNNNNNNNr   )r;   r<   r=   r>   r   rL   rW   
LongTensorr@   FloatTensorr   r   r   r   r   r   r   r   r  r	   boolr   r   rC   r   r   s   @r8   r   r      s   <2 < f## f && f 	 f
 
		 fD  w)) #\\ ##	
 +, 
+	+ @  .23737.204(,26*.!%-.Y##d*Y ))D0Y #\\D0	Y
 t+Y &&-Y Y ((4/Y   4'Y $;Y ell*Y +,Y 
 Y  Yr:   r   )r   rE   r   r   ).r\   mathr   huggingface_hub.dataclassesr   rW   r   r    r   r   cache_utilsr	   configuration_utilsr
   modeling_outputsr   r   modeling_utilsr   processing_utilsr   utilsr   r   r   r   +audioflamingo3.configuration_audioflamingo3r   &audioflamingo3.modeling_audioflamingo3r   r   (audioflamingo3.processing_audioflamingo3r   autor   moonshine.modeling_moonshiner   r   rE   r   r   r   r   r   __all__r,   r:   r8   <module>r&     s     
  . + &   3 R - & ] ] N O ! C  :;61. 61  <61rMU4 MU`
H'(#; '(T=#@ = 
f,R f
fRr:   