
    Z jt-                        S SK r S SK Jr  SSKJr  SSKJr  SSKJr  SSKJ	r	J
r
Jr  SSKJr  SS	KJrJrJr  SS
KJr  SSKJr  SSKJrJr  SSKJrJrJrJr  SSKJr   " S S\5      r  " S S\5      r! " S S\5      r"\" SS9 " S S\5      5       r# " S S\RH                  5      r%\" SS9 " S S\"\5      5       r&/ S Qr'g)!    N)nn   )ACT2FN)Cache)GenerationMixin)BaseModelOutputWithPastBaseModelOutputWithPoolingCausalLMOutputWithPast)Unpack)TransformersKwargsauto_docstringcan_return_tuple)merge_with_config_defaults)capture_outputs   )	AutoModelAutoModelForCausalLM)Qwen2AudioAttentionQwen2AudioEncoderQwen2AudioEncoderLayerQwen2AudioPreTrainedModel   )VoxtralConfigc                       \ rS rSrSrg)VoxtralAttention)    N__name__
__module____qualname____firstlineno____static_attributes__r       |/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/voxtral/modular_voxtral.pyr   r   )       r$   r   c                       \ rS rSrSrg)VoxtralEncoderLayer-   r   Nr   r   r$   r%   r(   r(   -   r&   r$   r(   c                   (    \ rS rSrSrSrSrSrSrSr	g)VoxtralPreTrainedModel1   TNr   )
r   r    r!   r"   _supports_flex_attn_supports_cache_class_supports_attention_backend_can_compile_fullgraph_no_split_modulesr#   r   r$   r%   r+   r+   1   s      "&!r$   r+   z:
@auto_docstring(
    custom_intro="""
    The Voxtral encoder, which is a Whisper encoder.
    """
)
class VoxtralEncoder(Qwen2AudioEncoder):
    _can_record_outputs = {
        "attentions": VoxtralAttention,
        "hidden_states": VoxtralEncoderLayer,
    }

    @capture_outputs
    @auto_docstring
    def forward(
        self,
        input_features,
        attention_mask=None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple | BaseModelOutputWithPooling:
        r"""
Args:
    input_features (`torch.FloatTensor` of shape `(batch_size, feature_size, sequence_length)`):
        Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be
        obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
        `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
        `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the mel features, padding
        and conversion into a tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]
    attention_mask (`torch.Tensor`, *optional*):
        Voxtral does not support masking of the `input_features`, this argument is preserved for compatibility,
        but it is not used. By default the silence in the input log mel spectrogram are ignored.
"""
        expected_seq_length = self.config.max_source_positions * self.conv1.stride[0] * self.conv2.stride[0]
        if input_features.shape[-1] != expected_seq_length:
            raise ValueError(
                f"Voxtral expects the mel input features to be of length {expected_seq_length}, but found {input_features.shape[-1]}. Make sure to pad the input mel features to {expected_seq_length}."
            )

        # match the conv weights' dtype/device, then embed the mel features with the two convolutions
        input_features = input_features.to(dtype=self.conv1.weight.dtype, device=self.conv1.weight.device)
        inputs_embeds = nn.functional.gelu(self.conv1(input_features))
        inputs_embeds = nn.functional.gelu(self.conv2(inputs_embeds))
        inputs_embeds = inputs_embeds.permute(0, 2, 1)
        embed_pos = self.embed_positions.weight

        hidden_states = (inputs_embeds + embed_pos).to(inputs_embeds.dtype)
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)

        for idx, encoder_layer in enumerate(self.layers):
            hidden_states = encoder_layer(hidden_states, attention_mask=None)

        hidden_states = self.layer_norm(hidden_states)
        return BaseModelOutputWithPooling(last_hidden_state=hidden_states)


# The projector maps stacked encoder frames into the language model's embedding space.
# (See get_audio_features below: encoder states are reshaped to (-1, audio_config.intermediate_size)
# before linear_1, concatenating intermediate_size // hidden_size consecutive audio frames
# per projected embedding.)
class VoxtralMultiModalProjector(nn.Module):
    def __init__(self, config: VoxtralConfig):
        super().__init__()
        self.linear_1 = nn.Linear(config.audio_config.intermediate_size, config.text_config.hidden_size, bias=False)
        self.act = ACT2FN[config.projector_hidden_act]
        self.linear_2 = nn.Linear(config.text_config.hidden_size, config.text_config.hidden_size, bias=False)

    def forward(self, audio_features):
        hidden_states = self.linear_1(audio_features)
        hidden_states = self.act(hidden_states)
        hidden_states = self.linear_2(hidden_states)
        return hidden_states
@auto_docstring(
    custom_intro="""
    The Voxtral model, which consists of a Whisper encoder, a multi-modal projector and a Llama language model.
    """
)
class VoxtralForConditionalGeneration(VoxtralPreTrainedModel, GenerationMixin):
    def __init__(self, config):
        super().__init__(config)
        self.vocab_size = config.text_config.vocab_size
        self.audio_tower = AutoModel.from_config(config.audio_config)
        self.language_model = AutoModelForCausalLM.from_config(config.text_config)
        self.multi_modal_projector = VoxtralMultiModalProjector(config)

        self.post_init()

    def get_input_embeddings(self):
        return self.language_model.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.language_model.set_input_embeddings(value)

    def get_output_embeddings(self):
        return self.language_model.get_output_embeddings()

    def set_output_embeddings(self, new_embeddings):
        self.language_model.set_output_embeddings(new_embeddings)

    def set_decoder(self, decoder):
        self.language_model.set_decoder(decoder)

    def get_decoder(self):
        return self.language_model.get_decoder()

    @can_return_tuple
    @auto_docstring(
        custom_intro="This method is used to get the audio embeddings from input features (a log mel spectrogram), meaning inferring the audio encoder and the multi-modal projector."
    )
    def get_audio_features(
        self, input_features: torch.FloatTensor, **kwargs: Unpack[TransformersKwargs]
    ) -> tuple | BaseModelOutputWithPooling:
        r"""
input_features (`torch.FloatTensor`):
    Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be
    obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
    `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
    `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the mel features, padding
    and conversion into a tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]
"""
        audio_outputs = self.audio_tower(input_features, return_dict=True, **kwargs)
        audio_hidden_states = audio_outputs.last_hidden_state
        audio_hidden_states = audio_hidden_states.reshape(-1, self.config.audio_config.intermediate_size)
        audio_embeds = self.multi_modal_projector(audio_hidden_states)
        audio_outputs.pooler_output = audio_embeds
        return audio_outputs

    @merge_with_config_defaults
    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        input_features: torch.FloatTensor | None = None,
        attention_mask: torch.Tensor | None = None,
        position_ids: torch.LongTensor | None = None,
        past_key_values: Cache | None = None,
        inputs_embeds: torch.FloatTensor | None = None,
        labels: torch.LongTensor | None = None,
        use_cache: bool | None = None,
        logits_to_keep: int | torch.Tensor = 0,
        **kwargs: Unpack[TransformersKwargs],
    ) -> CausalLMOutputWithPast:
        r"""
Example:

```python
>>> from transformers import VoxtralForConditionalGeneration, AutoProcessor
>>> import torch

>>> device = "cuda" if torch.cuda.is_available() else "cpu"
>>> repo_id = "mistralai/Voxtral-Mini-3B-2507"

>>> processor = AutoProcessor.from_pretrained(repo_id)
>>> model = VoxtralForConditionalGeneration.from_pretrained(repo_id, dtype=torch.bfloat16, device_map=device)

>>> conversation = [
    {
        "role": "user",
        "content": [
            {
                "type": "audio",
                "url": "https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/dude_where_is_my_car.wav",
            },
            {"type": "text", "text": "What can you tell me about this audio?"},
        ],
    }
]

>>> inputs = processor.apply_chat_template(conversation)
>>> inputs = inputs.to(device, dtype=torch.bfloat16)

>>> outputs = model.generate(**inputs, max_new_tokens=30)
>>> processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
["This audio is a humorous conversation between two friends, likely in English, where one of them is trying to figure out what the other's tattoo says."]
```"""
        if inputs_embeds is None:
            inputs_embeds = self.get_input_embeddings()(input_ids)

        if input_features is not None and input_ids is not None:
            audio_embeds = self.get_audio_features(input_features, return_dict=True).pooler_output

            # replace the audio token placeholders in the text embeddings with the projected audio embeddings
            audio_token_mask = (input_ids == self.config.audio_token_id).unsqueeze(-1)
            inputs_embeds = inputs_embeds.masked_scatter(
                audio_token_mask.to(inputs_embeds.device), audio_embeds.to(inputs_embeds.device, inputs_embeds.dtype)
            )

        outputs: CausalLMOutputWithPast = self.language_model(
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            labels=labels,
            use_cache=use_cache,
            logits_to_keep=logits_to_keep,
            **kwargs,
        )
        return outputs

    def prepare_inputs_for_generation(self, *args, **kwargs):
        # Overwritten -- input_features should only be passed to the model on the first
        # generation step (prefill), or on every step when decoding without a cache
        input_features = kwargs.pop("input_features", None)
        is_first_iteration = kwargs.pop("is_first_iteration", False)
        model_inputs = super().prepare_inputs_for_generation(*args, **kwargs)

        if is_first_iteration or not kwargs.get("use_cache", True):
            model_inputs["input_features"] = input_features

        return model_inputs


__all__ = ["VoxtralPreTrainedModel", "VoxtralEncoder", "VoxtralForConditionalGeneration"]