
    Z jO                     .   S SK Jr  S SKrS SKJr  SSKJr  SSKJr  SSKJ	r	  SSK
Jr  SS	KJrJrJr  SS
KJr  SSKJr  SSKJrJrJr  SSKJrJr  SSKJr  SSKJr  \" S5       " S S\R>                  5      5       r  " S S\R>                  5      r! " S S\R>                  5      r"\" SS9\ " S S\5      5       5       r#\\" SS9 " S S\5      5       5       r$\ " S  S!\5      5       r%\" S"S9 " S# S$\%5      5       r&\" S%S9 " S& S'\%\	5      5       r'/ S(Qr(g))    )	dataclassN)nn   )ACT2FN)Cache)GenerationMixin)use_kernel_forward_from_hub)BaseModelOutputWithPastBaseModelOutputWithPoolingModelOutput)PreTrainedModel)Unpack)TransformersKwargsauto_docstringtorch_compilable_check)can_return_tuplemerge_with_config_defaults   )	AutoModel   )Mistral3ConfigRMSNormc                   x   ^  \ rS rSrS
S\SS4U 4S jjjrS\R                  S\R                  4S jrS r	S	r
U =r$ )Mistral3RMSNorm(   epsreturnNc                    > [         TU ]  5         [        R                  " [        R
                  " U5      5      U l        X l        g)z.
Mistral3RMSNorm is equivalent to T5LayerNorm
N)super__init__r   	Parametertorchonesweightvariance_epsilon)selfhidden_sizer   	__class__s      /root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/mistral3/modeling_mistral3.pyr    Mistral3RMSNorm.__init__*   s/     	ll5::k#:; #    hidden_statesc                    UR                   nUR                  [        R                  5      nUR	                  S5      R                  SSS9nU[        R                  " X0R                  -   5      -  nU R                  UR                  U5      -  $ )Nr   T)keepdim)	dtypetor"   float32powmeanrsqrtr%   r$   )r&   r,   input_dtypevariances       r)   forwardMistral3RMSNorm.forward2   sw    #))%((7 $$Q',,R,>%H?T?T4T(UU{{]--k:::r+   c                 ^    [        U R                  R                  5       SU R                   3$ )Nz, eps=)tupler$   shaper%   r&   s    r)   
extra_reprMistral3RMSNorm.extra_repr9   s*    ))*+6$2G2G1HIIr+   )r%   r$   )gư>)__name__
__module____qualname____firstlineno__floatr    r"   Tensorr8   r>   __static_attributes____classcell__r(   s   @r)   r   r   (   sB    $ $$ $ $;U\\ ;ell ;J Jr+   r   c                      ^  \ rS rSrSrS\4U 4S jjrS\R                  S\R                  S\R                  4S jr	S	r
U =r$ )
Mistral3PatchMerger=   z4
Learned merging of spatial_merge_size ** 2 patches
configc                   > [         TU ]  5         Xl        UR                  R                  nUR
                  U l        U R                  R                  R                  U l        [        R                  " X R
                  S-  -  USS9U l	        g )Nr   Fbias)
r   r    rL   vision_configr'   spatial_merge_size
patch_sizer   Linearmerging_layer)r&   rL   r'   r(   s      r)   r    Mistral3PatchMerger.__init__B   sn    **66"(";";++33>>YY{5L5La5O'OQ\chir+   image_featuresimage_sizesr   c                    U Vs/ s H&  o3S   U R                   -  US   U R                   -  4PM(     nnU VVs/ s H	  u  pEXE-  PM     nnnUR                  S   n/ n[        UR                  U5      5       H  u  pX)   u  pEU
R	                  XEU5      R                  SSS5      R                  S5      n[        R                  R                  R                  XR                  U R                  S9nUR	                  XpR                  S-  -  S5      R                  5       nUR                  U5        M     [        R                  " USS9nU R                  U5      nU$ s  snf s  snnf )Nr   r   r.   r   )kernel_sizestridedim)rR   r<   	enumeratesplitviewpermute	unsqueezer"   r   
functionalunfoldrQ   tappendcatrT   )r&   rV   rW   
image_sizehwtokens_per_imagedpermuted_tensorimage_indeximage_tokens
image_gridgrids                r)   r8   Mistral3PatchMerger.forwardK   s[   cn
cnU_]doo-z!}/OPcn 	 
 /::kdaAEk:  $)2>3G3GHX3Y)Z%K+DA%**13;;Aq!DNNqQJ88&&--(?(?H_H_ . D 99Q!8!8!!;;R@BBDD""4( *[ ?:++N;)
 ;s
   -EE!)rL   rT   rR   rQ   )r@   rA   rB   rC   __doc__r   r    r"   rE   r8   rF   rG   rH   s   @r)   rJ   rJ   =   sD    j~ jell  RWR^R^  r+   rJ   c                   j   ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ )Mistral3MultiModalProjectorc   rL   c                   > [         TU ]  5         [        UR                  R                  UR
                  R                  S9U l        [        U5      U l	        [        UR                  [        5      (       a  SO[        UR                  5      U l        [        R                   " UR                  R                  U R                  -  UR
                  R                  UR"                  S9U l        [&        UR(                     U l        [        R                   " UR
                  R                  UR
                  R                  UR"                  S9U l        g )N)r   r   rN   )r   r    r   rP   r'   text_configrms_norm_epsnormrJ   patch_merger
isinstancevision_feature_layerintlennum_feature_layersr   rS   multimodal_projector_biaslinear_1r   projector_hidden_actactlinear_2r&   rL   r(   s     r)   r    $Mistral3MultiModalProjector.__init__d   s    #F$8$8$D$D&J\J\JiJij	/7 F77==A3vGbGbCc 	 		  ,,t/F/FF**11

 &556		**F,>,>,J,JQWQqQq
r+   rV   rW   c                     U R                  U5      nU R                  X5      nU R                  U5      nU R                  U5      nU R	                  U5      nU$ N)ry   rz   r   r   r   )r&   rV   rW   r,   s       r)   r8   #Mistral3MultiModalProjector.forwardv   sP    >2**>Gn5/m4r+   )r   r   r   ry   r   rz   )r@   rA   rB   rC   r   r    r"   rE   r8   rF   rG   rH   s   @r)   rt   rt   c   s/    
~ 
$ell   r+   rt   zT
    Base class for Mistral3 causal language model (or autoregressive) outputs.
    custom_introc                      \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Sr
\S-  \S'   Sr\\R                     S-  \S'   Sr\\R                     S-  \S'   Sr\R                  S-  \S	'   S
rg)Mistral3CausalLMOutputWithPast   a  
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
    Language modeling loss (for next-token prediction).
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
    Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
    It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

    Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
    `past_key_values` input) to speed up sequential decoding.
image_hidden_states (`torch.FloatTensor`, *optional*):
    A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
    image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
Nlosslogitspast_key_valuesr,   
attentionsimage_hidden_states )r@   rA   rB   rC   rr   r   r"   FloatTensor__annotations__r   r   r   r,   r;   r   r   rF   r   r+   r)   r   r      s     &*D%

d
")'+FE$+$(OUT\(59M5**+d2926Je''(4/648**T18r+   r   zM
    Base class for Mistral3 outputs, with hidden states and attentions.
    c                   B    \ rS rSr% SrSr\R                  S-  \S'   Sr	g)Mistral3ModelOutputWithPast   a  
past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
    It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

    Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
    `past_key_values` input) to speed up sequential decoding.
image_hidden_states (`torch.FloatTensor`, *optional*):
    A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
    image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
Nr   r   )
r@   rA   rB   rC   rr   r   r"   r   r   rF   r   r+   r)   r   r      s    	 59**T18r+   r   c                   D    \ rS rSr% \\S'   SrSrSrSr	Sr
SrSrSrSrSrg)	Mistral3PreTrainedModel   rL   model)imagetextTr   r   N)r@   rA   rB   rC   r   r   base_model_prefixinput_modalitiessupports_gradient_checkpointing_skip_keys_device_placement_supports_flash_attn_supports_sdpa_can_compile_fullgraph_supports_flex_attn_supports_attention_backendrF   r   r+   r)   r   r      s=    (&*#"3N!"&r+   r   zx
    The Mistral3 model which consists of a vision backbone and a language model, without a language modeling head.
    c                   r  ^  \ rS rSrS\4U 4S jjrS rS r\\	\
" SS9  SS	\R                  S
\R                  S\\\   -  \\   -  S-  S\S-  S\\   S\\-  4S jj5       5       5       rS\R,                  S\R                  S\R                  4S jr\\	\
         SS\R,                  S-  S	\R                  S-  S\R                  S-  S\R,                  S-  S\S-  S\R                  S-  S\\\   -  \\   -  S-  S\S-  S
\R                  S-  S\\   S\\-  4S jj5       5       5       rSrU =r$ )Mistral3Model   rL   c                    > [         TU ]  U5        [        R                  " UR                  5      U l        [        U5      U l        [        R                  " UR                  5      U l	        U R                  5         g r   )r   r    r   from_configrP   vision_towerrt   multi_modal_projectorrw   language_model	post_initr   s     r)   r    Mistral3Model.__init__   sY     %11&2F2FG%@%H"'33F4F4FGr+   c                 6    U R                   R                  5       $ r   )r   get_input_embeddingsr=   s    r)   r   "Mistral3Model.get_input_embeddings   s    ""7799r+   c                 :    U R                   R                  U5        g r   )r   set_input_embeddingsr&   values     r)   r   "Mistral3Model.set_input_embeddings   s    007r+   zWObtains image last hidden states from the vision tower and apply multimodal projection.r   Npixel_valuesrW   r|   output_hidden_stateskwargsr   c                    UR                  5        VVs0 s H  u  pgUc  M
  Xg_M     nnnU R                  " U4USSS.UD6n[        U[        5      (       a  UR                  U   n	O3U V
s/ s H  oR                  U
   PM     nn
[
        R                  " USS9n	U R                  U	R                  S5      U5      nU R                  R                  U R                  R                  -  n[
        R                  " X,R                  S9U-  R                  SS9R                  5       n[
        R                   " UR                  S5      U5      nXl        U$ s  snnf s  sn
f )NT)rW   r   return_dictr.   r[   r   )device)itemsr   r{   r}   r,   r"   rf   r   squeezerR   rL   rQ   	as_tensorr   prodtolistr^   pooler_output)r&   r   rW   r|   r   r   kvimage_outputsselected_image_feature	layer_idxhs_poolrV   downsample_ratiosplit_sizess                  r)   get_image_features Mistral3Model.get_image_features   sN    $*<<>C>41Q$!$>C))
#!%	

 
 *C00%2%@%@AU%V"OcdOc)229=OcGd%*YYwB%?"334J4R4RST4UWbc,,77$++:X:XX__[1F1FGK[[aafhaippr 	 ^%;%;A%>L&4#3 D es   	EE*E	input_idsinputs_embedsrV   c           	      F   Ucj  X R                  5       " [        R                  " U R                  R                  [        R
                  UR                  S95      :H  nUR                  S5      nOXR                  R                  :H  nUR                  5       nUR                  S   UR                  S   -  nUR                  S5      R                  U5      R                  UR                  5      n[        X$   R                  5       UR                  5       :H  SU SU 35        U$ )z
Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
equal to the length of multimodal features. If the lengths are different, an error is raised.
)r0   r   r.   r   r   z6Image features and image tokens do not match, tokens: z, features: )r   r"   tensorrL   image_token_idlongr   allsumr<   ra   	expand_asr1   r   numel)r&   r   r   rV   special_image_maskn_image_tokensn_image_featuress          r)   get_placeholder_mask"Mistral3Model.get_placeholder_mask   s    !.2K2K2MT[[77uzzR_RfRfg3 " "4!7!7!;!*kk.H.H!H+//1)//2^5I5I!5LL/99"=GGVYYZgZnZno-3359M9M9OOD^DTT`aq`rs	
 "!r+   attention_maskposition_idsr   	use_cachec
           	          US L US L-  (       a  [        S5      eUc  U R                  5       " U5      nUbv  U R                  UUU	SS9R                  n[        R
                  " USS9R                  UR                  UR                  5      nU R                  XUS9nUR                  X5      nU R                  " S	UUUUUS.U
D6n[        UR                  UR                  UR                  UR                   Ub  WS9$ S S9$ )
Nz:You must specify exactly one of input_ids or inputs_embedsT)r   r|   rW   r   r   r[   )r   rV   )r   r   r   r   r   )last_hidden_stater   r,   r   r   r   )
ValueErrorr   r   r   r"   rf   r1   r   r0   r   masked_scatterr   r   r   r   r,   r   )r&   r   r   r   r   r   r   r|   r   rW   r   rV   r   outputss                 r)   r8   Mistral3Model.forward  s@     -t";<YZZ  557	BM#!44)%9' 	 5 
 m  #YY~1=@@AUAUWdWjWjkN!%!:!:~ "; " *889K\M%% 
)%+'
 
 +%77#33!//))2>2J
 	

 QU
 	
r+   )r   r   r   )NN)	NNNNNNNNN)r@   rA   rB   rC   r   r    r   r   r   r   r   r"   r   rE   r}   listboolr   r   r;   r   r   
LongTensorr   r   r   r8   rF   rG   rH   s   @r)   r   r      s   ~ :8  n DH,0!''! \\! "DIoS	9D@	!
 #Tk! +,! 
+	+!   
!F"))":?:K:K"]b]n]n"0   .215.204(,26CG!%+//
##d*/
 ''$./
 t+	/

 &&-/
 /
 ((4//
 "DIoS	9D@/
 $;/
 \\D(/
 +,/
 
,	,/
    /
r+   r   zV
    The MISTRAL3 model which consists of a vision backbone and a language model.
    c                   n  ^  \ rS rSrSS0rS\4U 4S jjrS rS rS\	R                  4S	 jr\\\ SS\R                   S\R"                  S\\\   -  \\   -  S
-  S\\   S\\-  4
S jj5       5       5       r\\\          SS\R2                  S
-  S\R                   S
-  S\R"                  S
-  S\R2                  S
-  S\S
-  S\R                   S
-  S\R2                  S
-  S\S
-  S\\R"                  -  S\R"                  S
-  S\\   S\\-  4S jj5       5       5       r      SU 4S jjrSrU =r $ ) Mistral3ForConditionalGenerationiK  zlm_head.weightz(model.language_model.embed_tokens.weightrL   c                    > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  R                  UR                  R                  SS9U l	        U R                  5         g )NFrN   )r   r    r   r   r   rS   rw   r'   
vocab_sizelm_headr   r   s     r)   r    )Mistral3ForConditionalGeneration.__init__S  sS     "6*
yy!3!3!?!?ASASA^A^ejkr+   c                 6    U R                   R                  5       $ r   )r   r   r=   s    r)   r   5Mistral3ForConditionalGeneration.get_input_embeddingsY  s    zz..00r+   c                 :    U R                   R                  U5        g r   )r   r   r   s     r)   r   5Mistral3ForConditionalGeneration.set_input_embeddings\  s    

''.r+   r   c                     U R                   $ r   )r   r=   s    r)   get_output_embeddings6Mistral3ForConditionalGeneration.get_output_embeddings_  s    ||r+   Nr   rW   r|   r   c                 B    U R                   R                  " SUUUS.UD6$ )N)r   rW   r|   r   )r   r   )r&   r   rW   r|   r   s        r)   r   3Mistral3ForConditionalGeneration.get_image_featuresb  s3     zz,, 
%#!5
 	
 	
r+   r   r   r   r   r   labelsr   logits_to_keepc                    U R                   " SUUUUUUUU
S.UD6nUS   n[        U	[        5      (       a  [        U	* S5      OU	nU R	                  USS2USS24   5      nSnUb3  U R
                  " SXU R                  R                  R                  S.UD6n[        UUUR                  UR                  UR                  UR                  S9$ )a  
Example:

```python
>>> from PIL import Image
>>> import httpx
>>> from io import BytesIO
>>> from transformers import AutoProcessor, Mistral3ForConditionalGeneration

>>> model = Mistral3ForConditionalGeneration.from_pretrained("mistralai/Mistral-Small-3.1-24B-Instruct-2503")
>>> processor = AutoProcessor.from_pretrained("mistralai/Mistral-Small-3.1-24B-Instruct-2503")

>>> prompt = "<s>[INST][IMG]What is the image?[/INST]"
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> with httpx.stream("GET", url) as response:
...     image = Image.open(BytesIO(response.read()))

>>> inputs = processor(images=image, text=prompt, return_tensors="pt")

>>> # Generate
>>> generate_ids = model.generate(**inputs, max_new_tokens=15)
>>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
"What is the image?The image depicts two cats lying on a pink blanket."
```)r   r   r   r   r   r   r   rW   r   N)r   r   r   )r   r   r   r,   r   r   r   )r   r{   r}   slicer   loss_functionrL   rw   r   r   r   r,   r   r   )r&   r   r   r   r   r   r   r   r   r   rW   r   r   r,   slice_indicesr   r   s                    r)   r8   (Mistral3ForConditionalGeneration.forwards  s    R ** 

%)%+'#

 

  
8B>SV8W8W~ot4]kmA}a,?@A%% 9P9P9[9[_eD .#33!//)) ' ; ;
 	
r+   c           	      z   > [         T
U ]  " U4UUUUUS.UD6n	U(       d  UR                  SS5      (       d  XIS'   U	$ )N)r   r   r   r   is_first_iterationr   Tr   )r   prepare_inputs_for_generationget)r&   r   r   r   r   r   r   r  r   model_inputsr(   s             r)   r  >Mistral3ForConditionalGeneration.prepare_inputs_for_generation  sZ     w<
+'))1
 
 VZZT%B%B
 ,8(r+   )r   r   r   )
NNNNNNNNr   N)NNNNNF)!r@   rA   rB   rC   _tied_weights_keysr   r    r   r   r   Moduler   r   r   r   r"   r   rE   r}   r   r   r   r;   r   r   r   r   r   r   r8   r  rF   rG   rH   s   @r)   r   r   K  s    +,VW~ 1/ryy   
 DH	
''
 \\
 "DIoS	9D@	

 +,
 
+	+
    
   .215.204(,26*.!%-.+/C
##d*C
 ''$.C
 t+	C

 &&-C
 C
 ((4/C
   4'C
 $;C
 ell*C
 \\D(C
 +,C
 
/	/C
    C
P   r+   r   )r   r   r   ))dataclassesr   r"   r   activationsr   cache_utilsr   
generationr   integrationsr	   modeling_outputsr
   r   r   modeling_utilsr   processing_utilsr   utilsr   r   r   utils.genericr   r   autor   configuration_mistral3r   r	  r   rJ   rt   r   r   r   r   r   __all__r   r+   r)   <module>r     sU  , "   !   ) 7 ` ` - & O O I  2 Y'Jbii J (J(#")) #L")) 8 
 9[ 9 90 
9"9 9 9 'o ' ' 
A
+ A

A
H 
I'> I
IX [r+   