
    Z jS                        S SK r S SKJr  S SKrS SKJr  SSKJr  SSKJr  SSK	J
r
  SSKJr  SS	KJrJrJr  SS
KJrJr  SSKJr  SSKJrJrJrJr  SSKJr  SSKJr  SSKJ r J!r!  SSK"J#r#J$r$  \RJ                  " \&5      r'  S+S\RP                  S\RR                  S\RR                  S\RR                  S\RR                  S-  S\*S-  S\*4S jjr+ " S S\RP                  5      r, " S S\5      r-\ " S S \5      5       r.\" S!S"9 " S# S$\.5      5       r/ " S% S&\RP                  5      r0\" S'S"9 " S( S)\.\
5      5       r1/ S*Qr2g),    N)Callable)nn   )ACT2FN)Cache)GenerationMixin)GradientCheckpointingLayer)BaseModelOutputWithPastBaseModelOutputWithPoolingCausalLMOutputWithPast)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuplelogging)merge_with_config_defaults)capture_outputs   )	AutoModelAutoModelForCausalLM   )VoxtralConfigVoxtralEncoderConfigmodulequerykeyvalueattention_maskscalingdropoutc                    Uc  UR                  S5      S-  n[        R                  " XR                  SS5      5      U-  nUb  X-   n[        R
                  R                  USS9n[        R
                  R                  XU R                  S9n[        R                  " X5      n	U	R                  SS5      R                  5       n	X4$ )N      r   r   )dimptrainingr   )
sizetorchmatmul	transposer   
functionalsoftmaxr"   r)   
contiguous)
r   r   r   r   r    r!   r"   kwargsattn_weightsattn_outputs
class VoxtralAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(
        self,
        embed_dim: int,
        num_heads: int,
        dropout: float = 0.0,
        is_decoder: bool = False,
        bias: bool = True,
        is_causal: bool = False,
        layer_idx: int | None = None,
        config: VoxtralConfig | None = None,
    ):
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.dropout = dropout
        self.head_dim = embed_dim // num_heads
        self.config = config

        if self.head_dim * num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
                f" and `num_heads`: {num_heads})."
            )
        self.scaling = self.head_dim**-0.5
        self.is_decoder = is_decoder
        self.is_causal = is_causal

        if layer_idx is None and is_decoder:
            logger.warning_once(
                f"Instantiating a decoder {self.__class__.__name__} without passing `layer_idx` is not recommended and "
                "will lead to errors during the forward call, if caching is used. Please make sure to provide a "
                "`layer_idx` when creating this class."
            )
        self.layer_idx = layer_idx

        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=False)
        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)

    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor | None = None,
        output_attentions: bool = False,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple[torch.Tensor, torch.Tensor | None, tuple[torch.Tensor] | None]:
        """Input shape: Batch x Time x Channel"""
        bsz, tgt_len, _ = hidden_states.size()

        # get query proj (scaled up front, so the attention kernel runs with scaling=1.0)
        query_states = self._shape(self.q_proj(hidden_states) * self.scaling, tgt_len, bsz)
        key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
        value_states = self._shape(self.v_proj(hidden_states), -1, bsz)

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=0.0 if not self.training else self.dropout,
            scaling=1.0,
            output_attentions=output_attentions,
            **kwargs,
        )

        attn_output = attn_output.reshape(bsz, tgt_len, -1).contiguous()
        attn_output = self.out_proj(attn_output)

        return attn_output, attn_weights
class VoxtralEncoderLayer(GradientCheckpointingLayer):
    def __init__(self, config: VoxtralConfig):
        super().__init__()
        self.embed_dim = config.d_model
        self.self_attn = VoxtralAttention(
            embed_dim=self.embed_dim,
            num_heads=config.encoder_attention_heads,
            dropout=config.attention_dropout,
            config=config,
        )
        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
        self.dropout = config.dropout
        self.activation_fn = ACT2FN[config.activation_function]
        self.activation_dropout = config.activation_dropout
        self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim)
        self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim)
        self.final_layer_norm = nn.LayerNorm(self.embed_dim)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
        **kwargs: Unpack[TransformersKwargs],
    ) -> torch.Tensor:
        r"""
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
        """
        residual = hidden_states
        hidden_states = self.self_attn_layer_norm(hidden_states)
        hidden_states, _ = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            **kwargs,
        )
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = residual + hidden_states

        residual = hidden_states
        hidden_states = self.final_layer_norm(hidden_states)
        hidden_states = self.activation_fn(self.fc1(hidden_states))
        hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
        hidden_states = self.fc2(hidden_states)
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = residual + hidden_states

        if hidden_states.dtype == torch.float16:
            # keep fp16 activations finite: the residual stream can overflow half precision
            clamp_value = torch.finfo(hidden_states.dtype).max - 1000
            hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)

        return hidden_states


@auto_docstring
class VoxtralPreTrainedModel(PreTrainedModel):
    config: VoxtralConfig
    base_model_prefix = "model"
    input_modalities = ("audio", "text")
    supports_gradient_checkpointing = True
    _no_split_modules = None
    _skip_keys_device_placement = "past_key_values"
    _supports_flash_attn = True
    _supports_sdpa = True
    _supports_flex_attn = True
    _supports_cache_class = True
    _supports_attention_backend = True
    _can_compile_fullgraph = True


@auto_docstring(
    custom_intro="""
    The Voxtral encoder, which is a Whisper encoder.
    """
)
class VoxtralEncoder(VoxtralPreTrainedModel):
    """
    Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
    [`VoxtralEncoderLayer`].

    Args:
        config: VoxtralEncoderConfig
    """

    config: VoxtralEncoderConfig
    main_input_name = "input_features"
    _no_split_modules = ["VoxtralEncoderLayer"]
    _can_record_outputs = {
        "attentions": VoxtralAttention,
        "hidden_states": VoxtralEncoderLayer,
    }

    def __init__(self, config: VoxtralEncoderConfig):
        super().__init__(config)
        self.dropout = config.dropout
        self.layerdrop = config.encoder_layerdrop

        embed_dim = config.d_model
        self.num_mel_bins = config.num_mel_bins
        self.max_source_positions = config.max_source_positions
        self.embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0

        self.conv1 = nn.Conv1d(self.num_mel_bins, embed_dim, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(embed_dim, embed_dim, kernel_size=3, stride=2, padding=1)

        self.embed_positions = nn.Embedding(self.max_source_positions, embed_dim)
        self.embed_positions.requires_grad_(False)

        self.layers = nn.ModuleList([VoxtralEncoderLayer(config) for _ in range(config.encoder_layers)])
        self.layer_norm = nn.LayerNorm(config.d_model)

        self.avg_pooler = nn.AvgPool1d(2, stride=2)

        self.gradient_checkpointing = False
        # Initialize weights and apply final processing
        self.post_init()

    def _freeze_parameters(self):
        for param in self.parameters():
            param.requires_grad = False
        self._requires_grad = False

    def get_input_embeddings(self) -> nn.Module:
        return self.conv1

    def set_input_embeddings(self, value: nn.Module):
        self.conv1 = value

    @merge_with_config_defaults
    @capture_outputs
    def forward(
        self,
        input_features: torch.Tensor,
        attention_mask: torch.Tensor | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> BaseModelOutputWithPooling:
        r"""
        Args:
            input_features (`torch.LongTensor` of shape `(batch_size, feature_size, sequence_length)`):
                Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be
                obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
                `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
                `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the mel features, padding
                and conversion into a tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]
            attention_mask (`torch.Tensor`, *optional*):
                Voxtral does not support masking of the `input_features`, this argument is preserved for compatibility,
                but it is not used. By default the silence in the input log mel spectrogram is ignored.
        """
        expected_seq_length = self.config.max_source_positions * self.conv1.stride[0] * self.conv2.stride[0]
        if input_features.shape[-1] != expected_seq_length:
            raise ValueError(
                f"Voxtral expects the mel input features to be of length {expected_seq_length}, but found"
                f" {input_features.shape[-1]}. Make sure to pad the input mel features to {expected_seq_length}."
            )

        input_features = input_features.to(dtype=self.conv1.weight.dtype, device=self.conv1.weight.device)

        inputs_embeds = nn.functional.gelu(self.conv1(input_features))
        inputs_embeds = nn.functional.gelu(self.conv2(inputs_embeds))
        inputs_embeds = inputs_embeds.permute(0, 2, 1)

        embed_pos = self.embed_positions.weight
        hidden_states = (inputs_embeds + embed_pos).to(inputs_embeds.dtype)
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)

        for idx, encoder_layer in enumerate(self.layers):
            hidden_states = encoder_layer(
                hidden_states,
                attention_mask=None,
            )

        hidden_states = self.layer_norm(hidden_states)
        return BaseModelOutputWithPooling(last_hidden_state=hidden_states)

    def _get_feat_extract_output_lengths(self, input_lengths: torch.LongTensor):
        """
        Computes the output length of the convolutional layers and the output length of the audio encoder
        """
        input_lengths = (input_lengths - 1) // 2 + 1
        output_lengths = (input_lengths - 2) // 2 + 1
        return input_lengths, output_lengths


class VoxtralMultiModalProjector(nn.Module):
    def __init__(self, config: VoxtralConfig):
        super().__init__()
        self.linear_1 = nn.Linear(config.audio_config.intermediate_size, config.text_config.hidden_size, bias=False)
        self.act = ACT2FN[config.projector_hidden_act]
        self.linear_2 = nn.Linear(config.text_config.hidden_size, config.text_config.hidden_size, bias=False)

    def forward(self, audio_features):
        hidden_states = self.linear_1(audio_features)
        hidden_states = self.act(hidden_states)
        hidden_states = self.linear_2(hidden_states)
        return hidden_states


@auto_docstring(
    custom_intro="""
    The Voxtral model, which consists of Whisper encoder, a multi-modal projector and a Llama language model.
    """
)
class VoxtralForConditionalGeneration(VoxtralPreTrainedModel, GenerationMixin):
    def __init__(self, config):
        super().__init__(config)
        self.vocab_size = config.text_config.vocab_size
        self.audio_tower = AutoModel.from_config(config.audio_config)
        self.language_model = AutoModelForCausalLM.from_config(config.text_config)
        self.multi_modal_projector = VoxtralMultiModalProjector(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.language_model.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.language_model.set_input_embeddings(value)

    def get_output_embeddings(self):
        return self.language_model.get_output_embeddings()

    def set_output_embeddings(self, new_embeddings):
        self.language_model.set_output_embeddings(new_embeddings)

    def set_decoder(self, decoder):
        self.language_model.set_decoder(decoder)

    def get_decoder(self):
        return self.language_model.get_decoder()

    @can_return_tuple
    @auto_docstring(
        custom_intro="This method is used to get the audio embeddings from input features (a log mel spectrogram), meaning inferring the audio encoder and the multi-modal projector."
    )
    def get_audio_features(
        self, input_features: torch.FloatTensor, **kwargs: Unpack[TransformersKwargs]
    ) -> tuple | BaseModelOutputWithPooling:
        r"""
        input_features (`torch.FloatTensor`):
            Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be
            obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
            `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
            `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the mel features, padding
            and conversion into a tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]
        """
        audio_outputs = self.audio_tower(input_features, return_dict=True, **kwargs)
        audio_hidden_states = audio_outputs.last_hidden_state
        # group consecutive frames so each row has `audio_config.intermediate_size`
        # features before projection (4 frames per row for the released configs)
        audio_hidden_states = audio_hidden_states.reshape(-1, self.config.audio_config.intermediate_size)
        audio_embeds = self.multi_modal_projector(audio_hidden_states)
        audio_outputs.pooler_output = audio_embeds
        return audio_outputs

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        input_features: torch.FloatTensor | None = None,
        attention_mask: torch.Tensor | None = None,
        position_ids: torch.LongTensor | None = None,
        past_key_values: Cache | None = None,
        inputs_embeds: torch.FloatTensor | None = None,
        labels: torch.LongTensor | None = None,
        use_cache: bool | None = None,
        logits_to_keep: int | torch.Tensor = 0,
        **kwargs: Unpack[TransformersKwargs],
    ) -> CausalLMOutputWithPast:
        r"""
        Example:

        ```python
        >>> from transformers import VoxtralForConditionalGeneration, AutoProcessor
        >>> import torch

        >>> device = "cuda" if torch.cuda.is_available() else "cpu"
        >>> repo_id = "mistralai/Voxtral-Mini-3B-2507"

        >>> processor = AutoProcessor.from_pretrained(repo_id)
        >>> model = VoxtralForConditionalGeneration.from_pretrained(repo_id, dtype=torch.bfloat16, device_map=device)

        >>> conversation = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "audio",
                        "url": "https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/dude_where_is_my_car.wav",
                    },
                    {"type": "text", "text": "What can you tell me about this audio?"},
                ],
            }
        ]

        >>> inputs = processor.apply_chat_template(conversation)
        >>> inputs = inputs.to(device, dtype=torch.bfloat16)

        >>> outputs = model.generate(**inputs, max_new_tokens=30)
        >>> processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
        ["This audio is a humorous conversation between two friends, likely in English, where one of them is trying to figure out what the other's tattoo says."]
        ```"""
        if inputs_embeds is None:
            inputs_embeds = self.get_input_embeddings()(input_ids)

        if input_features is not None and input_ids is not None:
            audio_embeds = self.get_audio_features(input_features, return_dict=True).pooler_output

            # replace text-audio token placeholders with audio embeddings
            audio_token_mask = (input_ids == self.config.audio_token_id).unsqueeze(-1)
            inputs_embeds = inputs_embeds.masked_scatter(
                audio_token_mask.to(inputs_embeds.device), audio_embeds.to(inputs_embeds.device, inputs_embeds.dtype)
            )

        outputs: CausalLMOutputWithPast = self.language_model(
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            labels=labels,
            use_cache=use_cache,
            logits_to_keep=logits_to_keep,
            **kwargs,
        )
        return outputs

    def prepare_inputs_for_generation(self, *args, **kwargs):
        # Overwritten -- `input_features` should only be forwarded on the first
        # (prefill) generation step; afterwards the audio is already in the cache
        input_features = kwargs.pop("input_features", None)
        is_first_iteration = kwargs.get("is_first_iteration", False)

        model_inputs = super().prepare_inputs_for_generation(*args, **kwargs)

        if is_first_iteration or not kwargs.get("use_cache", True):
            model_inputs["input_features"] = input_features

        return model_inputs


__all__ = ["VoxtralPreTrainedModel", "VoxtralEncoder", "VoxtralForConditionalGeneration"]