# coding=utf-8
# Copyright 2025 the HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import torch
from huggingface_hub.dataclasses import strict
from torch import nn

from ...cache_utils import Cache, DynamicCache
from ...generation import GenerationConfig
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_outputs import BaseModelOutputWithPooling
from ...processing_utils import Unpack
from ...utils import (
    TransformersKwargs,
    auto_docstring,
    can_return_tuple,
    logging,
    torch_compilable_check,
)
from ..idefics3.configuration_idefics3 import Idefics3Config, Idefics3VisionConfig
from ..idefics3.image_processing_idefics3 import Idefics3ImageProcessor
from ..idefics3.image_processing_pil_idefics3 import Idefics3ImageProcessorPil
from ..idefics3.modeling_idefics3 import (
    Idefics3BaseModelOutputWithPast,
    Idefics3ForConditionalGeneration,
    Idefics3Model,
    Idefics3PreTrainedModel,
    Idefics3VisionTransformer,
)


logger = logging.get_logger(__name__)


@strict(accept_kwargs=True)
@auto_docstring(checkpoint="HuggingFaceTB/SmolVLM2-2.2B-Instruct")
class SmolVLMVisionConfig(Idefics3VisionConfig):
    r"""
    Example:

    ```python
    >>> from transformers.models.smolvlm.modeling_smolvlm import SmolVLMVisionTransformer
    >>> from transformers.models.smolvlm.configuration_smolvlm import SmolVLMVisionConfig

    >>> # Initializing a SmolVLMVisionConfig with google/siglip-so400m-patch14-384 style configuration
    >>> configuration = SmolVLMVisionConfig()

    >>> # Initializing a SmolVLMVisionTransformer (with random weights) from the google/siglip-so400m-patch14-384 style configuration
    >>> model = SmolVLMVisionTransformer(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "smolvlm_vision"


class SmolVLMPreTrainedModel(Idefics3PreTrainedModel):
    pass


class SmolVLMVisionTransformer(Idefics3VisionTransformer):
    pass
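

# The passthrough subclasses in this file only re-register Idefics3
# implementations under the SmolVLM name; the modular converter copies the
# parent code into the generated SmolVLM modeling/configuration files at
# build time.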


@strict(accept_kwargs=True)
@auto_docstring(checkpoint="HuggingFaceTB/SmolVLM2-2.2B-Instruct")
class SmolVLMConfig(Idefics3Config):
    r"""
    scale_factor (`int`, *optional*, defaults to 2):
        The scale factor for the image encoder.

    Example:
    ```python
    >>> from transformers import SmolVLMModel, SmolVLMConfig
    >>> # Initializing configuration
    >>> configuration = SmolVLMConfig()
    >>> # Initializing a model from the configuration
    >>> model = SmolVLMModel(configuration)
    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "smolvlm"


class SmolVLMImageProcessor(Idefics3ImageProcessor):
    pass


class SmolVLMImageProcessorPil(Idefics3ImageProcessorPil):
    pass


class SmolVLMBaseModelOutputWithPast(Idefics3BaseModelOutputWithPast):
    pass


class SmolVLMModel(Idefics3Model):
    """
    A subclass of Idefics3Model. We do *not* remove or block the call to inputs_merger
    in forward. Instead, we override inputs_merger here with custom logic.
    """
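
    # Worked example for inputs_merger below (illustrative numbers, not from
    # the original source): with patch_size=4 and two samples containing 8 and
    # 4 <image> tokens, num_image_tokens=[8, 4] and blocks_per_sample=[2, 1],
    # so offsets=[0, 2, 3] and block_offset=[0, 2]. Sample 0 reads rows 0-1 of
    # image_hidden_states, sample 1 reads row 2, and local_idx selects the
    # position inside each patch_size-long block.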
    def inputs_merger(
        self, input_ids: torch.LongTensor, inputs_embeds: torch.Tensor, image_hidden_states: torch.Tensor
    ) -> torch.Tensor:
        _, patch_size, _ = image_hidden_states.shape

        if input_ids is None:
            image_mask = inputs_embeds == self.get_input_embeddings()(
                torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device)
            )
            image_mask = image_mask[..., 0]  # every hidden dim agrees, keep one value per position
        else:
            image_mask = input_ids == self.config.image_token_id

        num_image_tokens = image_mask.sum(dim=1)
        torch_compilable_check(
            torch.all(num_image_tokens % patch_size == 0),
            "At least one sample has <image> tokens not divisible by patch_size.",
        )

        blocks_per_sample = num_image_tokens // patch_size

        # Map every <image> token position to a (block, offset-within-block)
        # index into the flattened image_hidden_states.
        offsets = torch.nn.functional.pad(blocks_per_sample.cumsum(dim=0), (1, 0), value=0)
        block_offset = offsets[:-1]
        row_cum = image_mask.cumsum(dim=-1)
        chunk_idx = (row_cum - 1) // patch_size
        local_idx = (row_cum - 1) % patch_size
        block_idx = block_offset.unsqueeze(1) + chunk_idx

        image_embeds = torch.zeros_like(inputs_embeds)
        image_embeds[image_mask] = image_hidden_states[block_idx[image_mask], local_idx[image_mask], :]

        merged_embeds = torch.where(image_mask.unsqueeze(-1), image_embeds, inputs_embeds)
        return merged_embeds
    @can_return_tuple
    @auto_docstring(
        custom_intro="Encodes images into continuous embeddings that can be forwarded to the language model."
    )
    def get_image_features(
        self,
        pixel_values: torch.FloatTensor,
        pixel_attention_mask: torch.Tensor | None = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> tuple | BaseModelOutputWithPooling:
        r"""
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
            The tensors corresponding to the input images.
        pixel_attention_mask (`torch.LongTensor`, *optional*):
            The attention mask indicating padded regions in the image.
        """
        batch_size, num_images, num_channels, height, width = pixel_values.shape
        pixel_values = pixel_values.to(dtype=self.dtype)  # fp16 compatibility
        pixel_values = pixel_values.view(batch_size * num_images, *pixel_values.shape[2:])

        # Remove padding images - padding images are full 0.
        nb_values_per_image = pixel_values.shape[1:].numel()
        real_images_inds = (pixel_values == 0.0).sum(dim=(-1, -2, -3)) != nb_values_per_image
        # If there are no real images, keep the first (empty) one so the
        # vision tower still receives a batch.
        real_images_inds[0] |= ~torch.any(real_images_inds)
        pixel_values = pixel_values[real_images_inds].contiguous()

        # Handle the vision attention mask
        if pixel_attention_mask is None:
            pixel_attention_mask = torch.ones(
                size=[pixel_values.shape[i] for i in (0, 2, 3)],
                dtype=torch.bool,
                device=pixel_values.device,
            )
        else:
            # Remove padding images from the mask
            pixel_attention_mask = pixel_attention_mask.view(batch_size * num_images, *pixel_attention_mask.shape[2:])
            pixel_attention_mask = pixel_attention_mask[real_images_inds].contiguous()

        patch_size = self.config.vision_config.patch_size
        patches_subgrid = pixel_attention_mask.unfold(dimension=1, size=patch_size, step=patch_size)
        patches_subgrid = patches_subgrid.unfold(dimension=2, size=patch_size, step=patch_size)
        patch_attention_mask = (patches_subgrid.sum(dim=(-1, -2)) > 0).bool()

        # Get sequence from the vision encoder
        image_outputs = self.vision_model(
            pixel_values=pixel_values, patch_attention_mask=patch_attention_mask, return_dict=True, **kwargs
        )
        image_hidden_states = image_outputs.last_hidden_state

        # Modality projection & resampling
        image_features = self.connector(image_hidden_states)
        image_outputs.pooler_output = image_features
        return image_outputs
    @can_return_tuple
    @auto_docstring(
        custom_intro="""
        Inputs fed to the model can have an arbitrary number of images. To account for this, pixel_values fed to
        the model have image padding -> (batch_size, max_num_images, 3, max_heights, max_widths) where
        max_num_images is the maximum number of images among the batch_size samples in the batch.
        Padding images are not needed beyond padding the pixel_values at the entrance of the model.
        For efficiency, we only pass through the vision_model's forward the real images by
        discarding the padding images i.e. pixel_values of size (image_batch_size, 3, height, width) where
        image_batch_size would be 7 when num_images_per_sample=[1, 3, 1, 2] and max_num_images would be 3.
        """
    )
    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        attention_mask: torch.Tensor | None = None,
        position_ids: torch.LongTensor | None = None,
        past_key_values: Cache | None = None,
        inputs_embeds: torch.FloatTensor | None = None,
        pixel_values: torch.FloatTensor | None = None,
        pixel_attention_mask: torch.BoolTensor | None = None,
        image_hidden_states: torch.FloatTensor | None = None,
        use_cache: bool | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple | SmolVLMBaseModelOutputWithPast:
        if self.training and self.text_model.gradient_checkpointing and use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
            )
            use_cache = False

        if input_ids is not None:
            batch_size, seq_length = input_ids.shape
        elif inputs_embeds is not None:
            batch_size, seq_length, _ = inputs_embeds.shape
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        if use_cache and past_key_values is None:
            past_key_values = DynamicCache(config=self.config)

        if inputs_embeds is None:
            inputs_embeds = self.get_input_embeddings()(input_ids).to(input_ids.device)

        # START VISUAL INPUTS INTEGRATION
        if pixel_values is not None and image_hidden_states is not None:
            raise ValueError("You cannot specify both pixel_values and image_hidden_states at the same time")
        elif pixel_values is not None:
            image_hidden_states = self.get_image_features(
                pixel_values, pixel_attention_mask, return_dict=True
            ).pooler_output
            image_hidden_states = image_hidden_states.to(inputs_embeds.device)
        elif image_hidden_states is not None:
            image_hidden_states = image_hidden_states.to(dtype=self.dtype, device=inputs_embeds.device)

        if image_hidden_states is not None:
            # Merge the image token embeddings into the text embeddings
            inputs_embeds = self.inputs_merger(
                input_ids=input_ids,
                inputs_embeds=inputs_embeds,
                image_hidden_states=image_hidden_states,
            )

        outputs = self.text_model(
            inputs_embeds=inputs_embeds,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            use_cache=use_cache,
            **kwargs,
        )

        return SmolVLMBaseModelOutputWithPast(
            last_hidden_state=outputs.last_hidden_state,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            image_hidden_states=image_hidden_states,
        )
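

# _tied_weights_keys below maps the tied parameter to its source tensor:
# lm_head.weight can share storage with the input embeddings when
# config.tie_word_embeddings is enabled.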
class SmolVLMForConditionalGeneration(Idefics3ForConditionalGeneration):
    _tied_weights_keys = {"lm_head.weight": "model.text_model.embed_tokens.weight"}

    def __init__(self, config):
        super().__init__(config)
        self.model = SmolVLMModel(config)
        self.model.generation_config = GenerationConfig.from_model_config(config)
        self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)
        self.post_init()

    # The override below only replaces the docstring: in modular transformers
    # files, a body of just a docstring plus a plain super().forward(...) call
    # keeps the parent implementation and only regenerates the documentation.
    def forward(self, **super_kwargs):
        r"""
        pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*):
            Mask to avoid performing attention on padding pixel indices.
        image_hidden_states (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
            The hidden states of the image encoder after modality projection.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or `model.image_token_id`. Tokens with indices set to `model.image_token_id` are
            ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> import httpx
        >>> import torch
        >>> from PIL import Image
        >>> from io import BytesIO

        >>> from transformers import AutoProcessor, AutoModelForImageTextToText
        >>> from transformers.image_utils import load_image

        >>> # Note that passing the image urls (instead of the actual pil images) to the processor is also possible
        >>> image1 = load_image("https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg")
        >>> image2 = load_image("https://cdn.britannica.com/59/94459-050-DBA42467/Skyline-Chicago.jpg")
        >>> image3 = load_image("https://cdn.britannica.com/68/170868-050-8DDE8263/Golden-Gate-Bridge-San-Francisco.jpg")

        >>> processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM2-2.2B-Instruct")
        >>> model = AutoModelForImageTextToText.from_pretrained("HuggingFaceTB/SmolVLM2-2.2B-Instruct", dtype=torch.bfloat16, device_map="auto")

        >>> # Create inputs
        >>> messages = [
        ...     {
        ...         "role": "user",
        ...         "content": [
        ...             {"type": "video", "path": "path/to/video"},
        ...             {"type": "text", "text": "What is happening in this video?"},
        ...         ]
        ...     }
        ... ]

        >>> inputs = processor.apply_chat_template([messages], add_generation_prompt=True)

        >>> # Generate
        >>> generated_ids = model.generate(**inputs, max_new_tokens=256)
        >>> generated_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)

        >>> print(generated_texts)
        ```"""
        super().forward(**super_kwargs)


__all__ = [
    "SmolVLMVisionConfig",
    "SmolVLMConfig",
    "SmolVLMImageProcessor",
    "SmolVLMImageProcessorPil",
    "SmolVLMForConditionalGeneration",
    "SmolVLMPreTrainedModel",
    "SmolVLMModel",
    "SmolVLMVisionTransformer",
]