
    Z jI                        S SK Jr  S SKJr  S SKJr  S SKJrJrJ	r	  SSK
Jr  SSKJr  SSKJr  SS	KJr  SS
KJrJr  SSKJr  SSKJr  SSKJr  SSKJrJrJrJ r   SSK!J"r"J#r#  SSK$J%r%  \ " 5       (       a  S SKr " S S\	RL                  5      r'\ " S S\5      5       r( " S S\	RL                  5      r)S r*S r+\" SS9 " S S\(\5      5       r,SS/r-g)     )Callable)pi)Optional)Tensorbroadcast_tensorsnn   )initialization)ACT2FN)Cache)GenerationMixin)BaseModelOutputWithPoolingCausalLMOutputWithPast)ROPE_INIT_FUNCTIONS)PreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tupleis_torch_available   )	AutoModelAutoModelForCausalLM   )MusicFlamingoConfigNc                      ^  \ rS rSr% Sr\R                  \S'   SS\4U 4S jjjr	\
   SS\S-  S\S   S	\S-  S
\S\4   4S jj5       r\R                   " 5       S\S	\S
\\\4   4S j5       rS rSrU =r$ )MusicFlamingoRotaryEmbedding-   a  Rotary time embedding module used by MusicFlamingo checkpoints.

This is a checkpoint-faithful integration, not a direct implementation of the RoTE formulation described in
(Goel et al., 2024): https://arxiv.org/abs/2410.12109. It applies axial rotary embeddings over the window index
within each audio sample and the encoder time index within each window, then modulates both axes with absolute
timestamps in seconds.
inv_freqNconfigc                   > [         TU ]  5         UR                  U l        UR                  U l        Xl        U R
                  R                  S   U l        U R                  nU R                  S:w  a  [        U R                     nU" U R
                  U5      u  o@l
        U R                  SUSS9  U R                  SUR                  5       SS9  U R                  U R                  5      nU R                  SUSS9  g )N	rope_typedefaultr   F)
persistentoriginal_inv_freqposition_angles)super__init__max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenr    rope_parametersr"   compute_default_rope_parametersr   attention_scalingregister_bufferclone_compute_position_anglesr   )selfr    devicerope_init_fnr   r&   	__class__s         ډ/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/musicflamingo/modeling_musicflamingo.pyr(   %MusicFlamingoRotaryEmbedding.__init__8   s    "("@"@$*$B$B!44[A!%!E!E>>Y&.t~~>L+7V+L((ZeD0(..2BuU77F.ER    r3   ztorch.deviceseq_lenreturnztorch.Tensorc           	      j   U R                   S   nU R                   R                  SS5      n[        U SS5      =(       d    U R                  U R                  -  n[        XT-  5      nSnSU[        R                  " SUS[        R                  S9R                  U[        R                  S	9U-  -  -  nX4$ )
aH  
Computes the inverse frequencies according to the original RoPE implementation
Args:
    config ([`~transformers.PreTrainedConfig`]):
        The model configuration.
    device (`torch.device`):
        The device to use for initialization of the inverse frequencies.
    seq_len (`int`, *optional*):
        The current sequence length. Unused for this type of RoPE.
Returns:
    Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
    post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).

rope_thetapartial_rotary_factorg      ?head_dimNr   r   dtyper3   r@   )r,   getgetattrhidden_sizenum_attention_headsinttorcharangeint64tofloat)	r    r3   r9   baser=   r>   dimattention_factorr   s	            r6   r-   <MusicFlamingoRotaryEmbedding.compute_default_rope_parametersJ   s    & %%l3 & 6 6 : :;RTW X6:t4h8J8JfNhNh8h(23 U\\!S!5;;?BB&X]XcXcBdgjjk
 ))r8   
timestampsc                    USS2S4   R                  U R                  R                  U R                  R                  S9nU R                  R
                  S-  U-  n[        R                  " X4-  5      U R                  -  nUR                  S5      U R                  -  n[        R                  " USSS9nUSS2SSS24   nU R                  SU SSS2SS24   n[        Xg5      u  pg[        R                  " Xg4SS9nU* S-  [        -  R                  U5      n	XR                  S5      -  nUR                  5       UR!                  5       4$ )zBCompute 2D axial rotary embeddings for window and time dimensions.Nr   rA      r   rM   )rJ   r   r3   r@   r    audio_frame_steprG   roundr*   	unsqueezerepeat_interleaver&   r   catr   cossin)
r2   rP   r9   window_startswindow_durationwindow_positionswindow_freqs
time_freqsfreqsangles
             r6   forward$MusicFlamingoRotaryEmbedding.forwardj   s+   
 #1a4(++4==3G3Gt}}ObOb+c++66:WD ;;}'FG$JaJaa'11"5E..|QBG $AtQJ/))(73D!QJ?
#4\#N 		<4"=q2%))%0++yy{EIIK''r8   c                 2   [         R                  " [        U R                  5      UR                  UR
                  S9nX R                  -  S[        -  -  nUR                  S5      U-  n[         R                  " USSS9nUR                  UR
                  S9$ )NrA   r   rS   rT   r?   )
rG   rH   rF   r*   r3   r@   r   rW   rX   rJ   )r2   r   	positionsr&   s       r6   r1   5MusicFlamingoRotaryEmbedding._compute_position_angles~   s    LLT%<%<!=hoo]e]k]kl	 7 771r6B	#--b1H<11/1"M!!!77r8   )r.   r    r*   r+   r"   N)NNN)__name__
__module____qualname____firstlineno____doc__rG   r   __annotations__r   r(   staticmethodr   rF   tuplerK   r-   no_gradrc   r1   __static_attributes____classcell__r5   s   @r6   r   r   -   s     llS2 S S$ -1+/"*#d**(* t* 
~u$	%	* *> ]]_(& (3 (5;P ( (&8 8r8   r   c                   x   ^  \ rS rSr% \\S'   SrSrSrSr	Sr
SrSr\R                  " 5       U 4S j5       rS	rU =r$ )
MusicFlamingoPreTrainedModel   r    model)audiotextTNpast_key_valuesc                    > [         TU ]  U5        [        U[        5      (       a=  UR	                  UR
                  5      n[        R                  " UR                  U5        g g rh   )	r'   _init_weights
isinstancer   r1   r   initcopy_r&   )r2   modulebuffer_valuer5   s      r6   r}   *MusicFlamingoPreTrainedModel._init_weights   sK    f%f:;;!::6??KLJJv--|< <r8    )ri   rj   rk   rl   r   rn   base_model_prefixinput_modalitiessupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdparG   rq   r}   rr   rs   rt   s   @r6   rv   rv      sJ    (&*#"3N
]]_= =r8   rv   c                   :   ^  \ rS rSrSrS\4U 4S jjrS rSrU =r	$ ) MusicFlamingoMultiModalProjector   z
Audio adaptor (small MLP) that projects MusicFlamingoEncoder features
to the LLM embedding space so they can replace `<sound>` tokens.
r    c                   > [         TU ]  5         [        R                  " UR                  R
                  UR                  R
                  UR                  S9U l        [        UR                     U l        [        R                  " UR                  R
                  UR                  R
                  UR                  S9U l        g )N)bias)r'   r(   r   Linearaudio_configrD   text_configprojector_biaslinear_1r   projector_hidden_actactlinear_2r2   r    r5   s     r6   r(   )MusicFlamingoMultiModalProjector.__init__   s    		++V-?-?-K-KRXRgRg
 &556		**F,>,>,J,JQWQfQf
r8   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ rh   )r   r   r   )r2   audio_featureshidden_statess      r6   rc   (MusicFlamingoMultiModalProjector.forward   s2    n5/m4r8   )r   r   r   )
ri   rj   rk   rl   rm   r   r(   rc   rr   rs   rt   s   @r6   r   r      s    

2 
 r8   r   c                     U R                   " / U R                  S S QSPSP76 n U R                  SS9u  p[        R                  " U* U4SS9n U R                  S5      $ )NrS   r   rT   )reshapeshapeunbindrG   stackflatten)xx1x2s      r6   rotate_halfr      s]    			'1773B<''Q'AXX"XFBbS"I2&A99R=r8   c                 N   U R                   nU R                  [        R                  5      n UR                  U 5      nUR                  U 5      nUR                  S   nU SUS 24   nU SS U24   nXa-  [        U5      U-  -   n[        R                  " Xe4SS9R                  U5      $ )NrS   .rT   )r@   rJ   rG   float64r   r   rY   )r   rZ   r[   original_dtyperot_dimpassthroughrotateds          r6   apply_rotary_time_embr      s    "((N!$$U]]3M
&&
C
&&
CiimGWX.KC'M*G}W!5!;<G99g+477GGr8   z
    The MusicFlamingo model which consists of a fine-tuned Whisper encoder, rotary time embedding, a multi-modal projector, and a Qwen2 language model.
    custom_introc                     ^  \ rS rSrSrSrSrSrS\4U 4S jjr	S r
S rS rS	 rS
 rS r\\" SS9S\R&                  S\R(                  S\R*                  S\\   S\\-  4
S j5       5       r\\          S$S\R*                  S-  S\R&                  S-  S\R(                  S-  S\R(                  S-  S\R*                  S-  S\S-  S\R&                  S-  S\R*                  S-  S\S-  S\\R(                  -  S\\   S\4S jj5       5       rSS.S\4U 4S jjjr S\R*                  S \R*                  S!\S\R&                  4S" jr!S#r"U =r#$ )%%MusicFlamingoForConditionalGeneration   NTr    c                 N  > [         TU ]  U5        UR                  R                  U l        [        R
                  " UR                  5      U l        [        R
                  " UR                  5      U l	        [        U5      U l        [        U5      U l        U R                  5         g rh   )r'   r(   r   
vocab_sizer   from_configr   audio_towerr   language_modelr   multi_modal_projectorr   pos_emb	post_initr   s     r6   r(   .MusicFlamingoForConditionalGeneration.__init__   sz      ,,77$001D1DE2>>v?Q?QR%Ef%M"3F; 	r8   c                 6    U R                   R                  5       $ rh   )r   get_input_embeddingsr2   s    r6   r   :MusicFlamingoForConditionalGeneration.get_input_embeddings   s    ""7799r8   c                 :    U R                   R                  U5        g rh   )r   set_input_embeddings)r2   values     r6   r   :MusicFlamingoForConditionalGeneration.set_input_embeddings   s    007r8   c                 6    U R                   R                  5       $ rh   )r   get_output_embeddingsr   s    r6   r   ;MusicFlamingoForConditionalGeneration.get_output_embeddings   s    ""88::r8   c                 :    U R                   R                  U5        g rh   )r   set_output_embeddings)r2   new_embeddingss     r6   r   ;MusicFlamingoForConditionalGeneration.set_output_embeddings   s    11.Ar8   c                 :    U R                   R                  U5        g rh   )r   set_decoder)r2   decoders     r6   r   1MusicFlamingoForConditionalGeneration.set_decoder   s    ''0r8   c                 6    U R                   R                  5       $ rh   )r   get_decoderr   s    r6   r   1MusicFlamingoForConditionalGeneration.get_decoder   s    ""..00r8   zThis method is used to get the audio embeddings from input features (a log mel spectrogram), meaning inferring the audio encoder and the multi-modal projector.r   input_featuresinput_features_mask	input_idskwargsr:   c                    U R                   " U4USS.UD6nUR                  nU R                   R                  UR                  S5      R	                  [
        R                  5      5      u  pxU R                  X8UR                  S   5      n	U R                  U	R	                  UR                  5      UR                  S   S9u  p[        XjU5      nU R                  U5      n[
        R                  " UR                  S   UR                  S9SSS24   USS2S4   :  nXR	                  UR                  5         Ul        U$ )	aR  
input_features_mask (`torch.Tensor` of shape `(batch_size, feature_sequence_length)`):
    Mask to avoid performing attention on padded feature indices.
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
    Token ids containing the audio token ID placeholders, for reconstructing rotary time embedding timestamps.
T)r   return_dictrS   r   )r9   r   r3   N)r   last_hidden_state _get_feat_extract_output_lengthssumrJ   rG   long_build_audio_timestampsr   r   r3   r   r   rH   pooler_output)r2   r   r   r   r   audio_outputr   _post_lengthsaudio_timestampsrZ   r[   audio_embeds
valid_masks                 r6   get_audio_features8MusicFlamingoForConditionalGeneration.get_audio_features   s@   " ''
 3
 	
 %66**KKL_LcLcdfLgLjLjkpkukuLvw77	Q^QdQdegQhi<< 0 3 3M4H4H IS`SfSfgiSj<k-m#F11-@ \\,"4"4Q"7@S@STUY[\U\]`lmnptmt`uu
%1--@S@S2T%U"r8   attention_maskposition_idsr{   inputs_embedslabels	use_cachelogits_to_keepc                    Uc  U R                  5       " U5      nUb  Ub  U R                  X#USS9R                  nXR                  R                  :H  R                  S5      nUR                  UR                  UR                  5      UR                  UR                  5      5      nU R                  " SUUUUUU	U
S.UD6nU$ )a  
input_features_mask (`torch.Tensor` of shape `(batch_size, feature_sequence_length)`, *optional*):
    Mask to avoid performing attention on padding feature indices. Mask values selected in `[0, 1]`:

    - 1 for tokens that are **not masked**,
    - 0 for tokens that are **masked**.
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

Example:

```python
>>> from transformers import MusicFlamingoForConditionalGeneration, AutoProcessor

>>> model_id = "nvidia/music-flamingo-2601-hf"
>>> processor = AutoProcessor.from_pretrained(model_id)
>>> model = MusicFlamingoForConditionalGeneration.from_pretrained(model_id, device_map="auto")

>>> conversation = [
...     {
...         "role": "user",
...         "content": [
...             {
...                 "type": "text",
...                 "text": "Describe this track in full detail - tell me the genre, tempo, and key, then dive into the instruments, production style, and overall mood it creates.",
...             },
...             {
...                 "type": "audio",
...                 "path": "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/song_1.mp3",
...             },
...         ],
...     }
... ]

>>> inputs = processor.apply_chat_template(
...     conversation,
...     tokenize=True,
...     add_generation_prompt=True,
...     return_dict=True,
... ).to(model.device, model.dtype)

>>> outputs = model.generate(**inputs, max_new_tokens=100)

>>> decoded_outputs = processor.batch_decode(
...     outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True
... )
>>> print(decoded_outputs)
["This track is an uplifting Eurodance-style Trance-Pop anthem..."]
```
        """
        if inputs_embeds is None:
            inputs_embeds = self.get_input_embeddings()(input_ids)

        if input_features is not None and input_ids is not None:
            audio_embeds = self.get_audio_features(
                input_features, input_features_mask, input_ids, return_dict=True
            ).pooler_output
            # Replace the audio placeholder token embeddings with the projected audio embeddings.
            audio_token_mask = (input_ids == self.config.audio_token_id).unsqueeze(-1)
            inputs_embeds = inputs_embeds.masked_scatter(
                audio_token_mask.to(inputs_embeds.device), audio_embeds.to(inputs_embeds.dtype)
            )

        outputs = self.language_model(
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            labels=labels,
            use_cache=use_cache,
            logits_to_keep=logits_to_keep,
            **kwargs,
        )
        return outputs

    def prepare_inputs_for_generation(self, *args, is_first_iteration: bool = False, **kwargs):
        input_features = kwargs.pop("input_features", None)
        input_features_mask = kwargs.pop("input_features_mask", None)
        model_inputs = super().prepare_inputs_for_generation(*args, **kwargs)

        # The audio features only need to be forwarded on the prefill step; afterwards the audio
        # embeddings are already merged into the language model's key/value cache.
        if is_first_iteration or not kwargs.get("use_cache", False):
            if input_features is not None:
                model_inputs["input_features"] = input_features
            if input_features_mask is not None:
                model_inputs["input_features_mask"] = input_features_mask
        return model_inputs

    def _build_audio_timestamps(
        self, input_ids: torch.LongTensor, post_lengths: torch.Tensor, max_post_length: int
    ) -> torch.Tensor:
        # Locate the contiguous runs of audio placeholder tokens; each run corresponds to one audio sample.
        audio_token_mask = input_ids == self.config.audio_token_id
        diff = torch.diff(torch.nn.functional.pad(audio_token_mask.int(), (1, 1), value=0), dim=-1)
        _, starts = torch.where(diff == 1)
        _, ends = torch.where(diff == -1)
        sample_lengths = (ends - starts).to(torch.long)

        # Seconds spanned by one encoder output frame, and the per-frame offsets inside a window.
        audio_embed_frame_step = self.config.audio_frame_step * 2
        frame_offsets = (
            torch.arange(max_post_length, device=post_lengths.device, dtype=torch.float32) * audio_embed_frame_step
        )

        # Map every encoder window back to the audio sample it was cut from and to its index within that sample.
        cumsum_post = torch.cat(
            [torch.zeros(1, device=post_lengths.device, dtype=post_lengths.dtype), torch.cumsum(post_lengths, dim=0)[:-1]]
        )
        cumsum_samples = torch.cumsum(sample_lengths, dim=0)
        sample_indices = torch.searchsorted(cumsum_samples, cumsum_post, right=True)
        sample_start_rows = torch.searchsorted(
            sample_indices, torch.arange(sample_lengths.shape[0], device=sample_indices.device)
        )
        window_indices = (
            torch.arange(post_lengths.shape[0], device=post_lengths.device) - sample_start_rows[sample_indices]
        )

        # Absolute timestamp (in seconds) of every encoder frame of every window.
        return window_indices.unsqueeze(-1) * max_post_length * audio_embed_frame_step + frame_offsets


__all__ = ["MusicFlamingoForConditionalGeneration", "MusicFlamingoPreTrainedModel"]