
    Z jZ                        S SK r S SKJr  S SKJs  Jr  S SKJr  S SKJ	r	  S SK
Jr  S SKJrJrJr  S SKJrJrJrJrJrJrJrJrJrJrJrJr  SSKJr  SS	KJ r   SS
K!J"r"J#r#J$r$  SSK%J&r&J'r'  SSK(J)r)   " S S\5      r*\#" SS9\ " S S\5      5       5       r+\#" SS9\ " S S\5      5       5       r,\#" SS9\ " S S\5      5       5       r- " S S\5      r. " S S\5      r/ " S S\5      r0 " S S\Rb                  5      r2 " S S \5      r3 " S! S"\5      r4 " S# S$\5      r5 " S% S&\5      r6 " S' S(\5      r7 " S) S*\5      r8/ S+Qr9g),    N)strict)normalizers)GemmaTokenizer)SiglipConfigSiglipTextConfigSiglipVisionConfig)BaseModelOutputBaseModelOutputWithPoolingImageClassifierOutputSiglipForImageClassificationSiglipModel#SiglipMultiheadAttentionPoolingHeadSiglipOutputSiglipPreTrainedModelSiglipTextModelSiglipTextModelOutputSiglipVisionModelSiglipVisionModelOutput   )create_bidirectional_mask)Unpack)TransformersKwargsauto_docstringtorch_compilable_check)can_return_tuplemerge_with_config_defaults)capture_outputsc                      ^  \ rS rSrSr       SS\\\\4   -  S-  S\\\   -  S-  S\S\S\S	\S
\4U 4S jjjr	Sr
U =r$ )Siglip2Tokenizer2   zF
Gemma tokenizer + SigLIP2 training default: lowercase normalization.
Nvocabmerges	unk_token	bos_token	eos_token	pad_token
mask_tokenc                   > [         T
U ]  " SUUUUUUUS.UD6  [        U S5      (       aO  [        U R                  [
        5      (       a0  U R                  R                  SU R                  R                  5        [        U SS 5      n	U	bI  U	R                  b;  [        R                  " [        R                  " 5       U	R                  /5      U	l
        g g g )N)r!   r"   r#   r$   r%   r&   r'   init_kwargstokenizer_class
_tokenizer )super__init__hasattr
isinstancer)   dict
setdefault	__class____name__getattr
normalizerr   Sequence	Lowercase)selfr!   r"   r#   r$   r%   r&   r'   kwargsbackendr3   s             |/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/siglip2/modular_siglip2.pyr.   Siglip2Tokenizer.__init__7   s     	 		
!		
 		
 4''Jt7G7G,N,N''(94>>;R;RS$d37#5#5#A!,!5!5{7L7L7NPWPbPb6c!dG $B    r,   )NNz<unk>z<bos>z<eos>z<pad>z<mask>)r4   
__module____qualname____firstlineno____doc__strr1   intlistr.   __static_attributes____classcell__r3   s   @r<   r   r   2   s     .2)-    "eT#s(^#d*e d3i$&e 	e
 e e e e er>   r   z"google/siglip2-base-patch16-naflex)
checkpointc                       \ rS rSrSrg)Siglip2TextConfigV   r,   Nr4   r?   r@   rA   rF   r,   r>   r<   rK   rK   V        	r>   rK   c                   6    \ rS rSr% SrSr\\S'   \" 5       r	Sr
g)Siglip2VisionConfig\   a6  
num_patches (`int`, *optional*, defaults to 256):
    The number of patches in the image with the size of (`patch_size`, `patch_size`).
    The image is resized to fill maximum of this number of patches, and to preserve
    the aspect ratio. In case the resulted number of patches is lower, the image is
    padded in "patch" dimension.

Example:

```python
>>> from transformers import Siglip2VisionConfig, Siglip2VisionModel

>>> # Initializing a Siglip2VisionConfig with google/siglip2-base-patch16-naflex style configuration
>>> configuration = Siglip2VisionConfig()

>>> # Initializing a Siglip2VisionModel (with random weights) from the google/siglip2-base-patch16-naflex style configuration
>>> model = Siglip2VisionModel(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
```   num_patchesr,   N)r4   r?   r@   rA   rB   rS   rD   __annotations__AttributeError
image_sizerF   r,   r>   r<   rP   rP   \   s    , K!Jr>   rP   c                       \ rS rSrSrg)Siglip2Configy   r,   NrM   r,   r>   r<   rX   rX   y   rN   r>   rX   c                       \ rS rSrSrg)Siglip2VisionOutput   r,   NrM   r,   r>   r<   r[   r[          r>   r[   c                       \ rS rSrSrg)Siglip2TextOutput   r,   NrM   r,   r>   r<   r_   r_      r]   r>   r_   c                       \ rS rSrSrg)Siglip2Output   r,   NrM   r,   r>   r<   rb   rb      r]   r>   rb   c            	          ^  \ rS rSrS\4U 4S jjr\S\R                  S\R                  S\
S\R                  4S j5       rS	\R                  S\R                  S\R                  4S
 jrSrU =r$ )Siglip2VisionEmbeddings   configc                   > [         TU ]  5         Xl        UR                  U l        UR
                  U l        [        R                  " UR                  U R
                  -  U R
                  -  U R                  S9U l	        UR                  U l
        [        U R                  S-  5      U l        [        R                  " U R                  U R                  5      U l        g )N)in_featuresout_featuresg      ?)r-   r.   rg   hidden_size	embed_dim
patch_sizennLinearnum_channelspatch_embeddingrS   rD   position_embedding_size	Embeddingposition_embeddingr9   rg   r3   s     r<   r.    Siglip2VisionEmbeddings.__init__   s    ++ ++!yy++doo=O 

 "--'*4+;+;S+@'A$"$,,t/?/?"Pr>   positional_embeddingsspatial_shapes
max_lengthreturnc           	         UR                   S   nU R                   S   nU R                  n[        R                  " X2U4U R                  US9nU R                  SSS5      R                  S5      n U R                  R                  S:X  a  U R                  [        R                  5      n [        U5       H  nX   R                  5       u  p[        U	S:  S5        [        US:  S5        [        X-  U:*  S	5        [        R                  " U X4S
SSS9n
U
R                  XHU	-  5      R!                  SS5      n
U
R                  U5      n
XUSX-  24'   U
S   XgX-  S24'   M     U$ )a  
Resize positional embeddings to image-specific size and pad to a fixed size.

Args:
    positional_embeddings (`torch.Tensor`):
        Position embeddings of shape (height, width, embed_dim)
    spatial_shapes (`torch.LongTensor`):
        Spatial shapes of shape (batch_size, 2) to resize the positional embeddings to
    max_length (`int`):
        Maximum length of the positional embeddings to pad resized positional embeddings to

Returns:
    `torch.Tensor`: Embeddings of shape (batch_size, max_length, embed_dim)
r   devicedtype      cpuz8Width of resized positional embeddings must be positive.z9Height of resized positional embeddings must be positive.z0Resized positional embeddings exceed max_length.bilinearFT)sizemodealign_corners	antialiasN)shaper   torchemptyr~   permute	unsqueezetypetofloat32rangetolistr   Finterpolatereshape	transpose)rw   rx   ry   
batch_sizerl   source_dtyperesulted_positional_embeddingsiheightwidthresized_embeddingss              r<   resize_positional_embeddings4Siglip2VisionEmbeddings.resize_positional_embeddings   s   ( $))!,
)//3	,22).Y/(//*
& !6 = =aA F P PQR S !'',,5$9$<$<U]]$K!z"A*-446MF"EAI0jk"FQJ1lm"FNz#ACuv!"%_#" "4!;!;IPU~!V!`!`abde!f "4!6!6|!DBT1.>.>+>?BTUVBW*fn.>+>?+ #. .-r>   pixel_valuesc                 :   U R                   R                  R                  nU R                  UR                  US95      nU R                  R                  R                  U R                  U R                  S5      nU R                  XRUR                  S   S9nXF-   nU$ )a  
Args:
    pixel_values (`torch.FloatTensor`):
        Pixel values of shape (batch_size, max_num_patches, num_channels * patch_size * patch_size)
    spatial_shapes (`list[tuple[int, int]]`):
        Spatial shapes of shape (batch_size, 2) to resize the positional embeddings to
)r   r|   r   )ry   )	rq   weightr   r   rt   r   rr   r   r   )r9   r   rx   target_dtypepatch_embedsrw   resized_positional_embeddings
embeddingss           r<   forwardSiglip2VisionEmbeddings.forward   s     ++2288++LOO,O,OP !% 7 7 > > F F(($*F*F!
 )-(I(I!l>P>PQR>S )J )
%
 "A
r>   )rg   rl   rS   rq   rm   rt   rr   )r4   r?   r@   rA   rP   r.   staticmethodr   Tensor
LongTensorrD   r   FloatTensorr   rF   rG   rH   s   @r<   re   re      s    Q2 Q ;.$||;.((;. ;. 
	;. ;.zE$5$5 uGWGW \a\h\h  r>   re   c                       \ rS rSrSrSrSrg)Siglip2PreTrainedModel   Fr,   N)r4   r?   r@   rA   _supports_flex_attn_supports_flash_attnrF   r,   r>   r<   r   r      s     r>   r   c                      ^  \ rS rSrS\4U 4S jjr\\" SS9\S\	R                  S\	R                  S\	R                  S	\\   S
\4
S j5       5       5       rSrU =r$ )Siglip2VisionModel   rg   c                 $   > [         TU ]  U5        g N)r-   r.   ru   s     r<   r.   Siglip2VisionModel.__init__   s     r>   F)tie_last_hidden_statesr   pixel_attention_maskrx   r:   rz   c                    U R                  X5      n[        U R                  UUS9nU R                  " SUUS.UD6nUR                  nU R                  U5      nU R                  (       a  U R                  X5      OSn	[        UU	S9$ )a  
pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*):
    Mask to avoid performing attention on padding pixel indices.
spatial_shapes (`torch.LongTensor` of shape `(batch_size, 2)`):
    Tensor containing the spatial dimensions (height, width) of the input images.

Examples:

```python
>>> import httpx
>>> from io import BytesIO
>>> from PIL import Image
>>> from transformers import AutoProcessor, Siglip2VisionModel

>>> model = Siglip2VisionModel.from_pretrained("google/siglip2-base-patch16-224")
>>> processor = AutoProcessor.from_pretrained("google/siglip2-base-patch16-224")

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> with httpx.stream("GET", url) as response:
...     image = Image.open(BytesIO(response.read()))

>>> inputs = processor(images=image, return_tensors="pt")

>>> outputs = model(**inputs)
>>> last_hidden_state = outputs.last_hidden_state
>>> pooled_output = outputs.pooler_output  # pooled features
```
)rg   inputs_embedsattention_mask)r   r   N)last_hidden_statepooler_outputr,   )	r   r   rg   encoderr   post_layernormuse_headheadr
   )
r9   r   r   rx   r:   hidden_statesencoder_attention_maskencoder_outputsr   r   s
             r<   r   Siglip2VisionModel.forward   s    L E!:;;'/"
 ,0<< ,
'1,
 ,
 ,== //0ABNRmm		"3Jae)/'
 	
r>   r,   )r4   r?   r@   rA   rP   r.   r   r   r   r   r   r   r   r   r   r
   r   rF   rG   rH   s   @r<   r   r      s    !2 !  E29
''9
 $ll9
 ((	9

 +,9
 
$9
  3  9
r>   r   c                       \ rS rSrSrg)Siglip2TextModeli<  r,   NrM   r,   r>   r<   r   r   <  r]   r>   r   c                      ^  \ rS rSrS\4U 4S jjrS
S\R                  S\R                  S-  S\R                  4S jjrS	r	U =r
$ )$Siglip2MultiheadAttentionPoolingHeadi@  rg   c                 R   > [         TU ]  U5        Xl        UR                  U l        g r   )r-   r.   rg   num_attention_heads	num_headsru   s     r<   r.   -Siglip2MultiheadAttentionPoolingHead.__init__A  s"     33r>   Nhidden_stater   rz   c           	         UR                   S   nU R                  R                  USS5      nUb  UR                   S   UR                   S   pe[        U R                  UUUS9nUb  UR                  SU R
                  US5      nUR                  SXV5      nUR                  [        R                  :X  ah  [        R                  " U[        R                  " SUR                  UR                  S9[        R                  " UR                  5      R                  5      nU R                  XAXS9S   nUnU R!                  U5      nXpR#                  U5      -   nUS S 2S4   $ )Nr   r   )rg   r   r   encoder_hidden_statesr|   g        r}   )	attn_mask)r   proberepeatr   rg   r   r   r   r   boolwheretensorr~   finfomin	attention	layernormmlp)r9   r   r   r   r   
target_len
source_lenresiduals           r<   r   ,Siglip2MultiheadAttentionPoolingHead.forwardG  s;   !''*


!!*a3%%*[[^\5G5G5J
6{{#-&2	N )!/!6!6q$..*VW!X!/!7!7J!S "''5::5%*[[&S1F1FekkZEKK044&N ~~e<~bcde~~l3((<"88AqD!!r>   )rg   r   r   )r4   r?   r@   rA   rP   r.   r   r   r   rF   rG   rH   s   @r<   r   r   @  sD    42 4"ELL "%,,QUBU "afamam " "r>   r   c                      \ rS rSr\\   SS\R                  S-  S\R                  S-  S\R                  S-  S\
\   S\\-  4
S jj5       5       r\\       SS	\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S
\R                  S-  S\R                  S-  S\S-  S\
\   S\4S jj5       5       rSrg)Siglip2Modelih  Nr   r   rx   r:   rz   c                 .    U R                   " SUUUS.UD6$ )aY  
pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*):
    Mask to avoid performing attention on padding pixel indices.
spatial_shapes (`torch.LongTensor` of shape `(batch_size, 2)`):
    Tensor containing the spatial dimensions (height, width) of the input images.

Examples:

```python
>>> import torch
>>> from transformers import AutoProcessor, AutoModel
>>> from transformers.image_utils import load_image

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = load_image(url)

>>> model = AutoModel.from_pretrained("google/siglip2-base-patch16-224")
>>> processor = AutoProcessor.from_pretrained("google/siglip2-base-patch16-224")

>>> inputs = processor(images=image, return_tensors="pt")

>>> with torch.no_grad():
...     image_features = model.get_image_features(**inputs)
```
r   r   rx   r,   )vision_model)r9   r   r   rx   r:   s        r<   get_image_featuresSiglip2Model.get_image_featuresj  s0    D    
%!5)
 	
 	
r>   	input_idsr   position_idsreturn_lossc           
         U R                   " SUUUS.UD6n	U R                  " SUUUS.UD6n
U	R                  nU
R                  nXR                  SSSS9-  nXR                  SSSS9-  n[        R
                  " XR                  5       R                  UR                  5      5      nU R                  R                  UR                  5      U R                  R                  UR                  5      pXR                  5       -  U-   nUR                  5       nSnU(       a  [        R                  " UR                  S5      UR                  S	9n[        R                  " U5      * SU-  -   n[        R                  R                   R#                  UU-  5      n[        R$                  " USS
9* nUR'                  5       n[)        UUUUUU
U	S9$ )a}  
pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*):
    Mask to avoid performing attention on padding pixel indices.
spatial_shapes (`torch.LongTensor` of shape `(batch_size, 2)`):
    Tensor containing the spatial dimensions (height, width) of the input images.
return_loss (`bool`, *optional*):
    Whether or not to return the contrastive loss.

Examples:

```python
>>> from PIL import Image
>>> import httpx
>>> from io import BytesIO
>>> from transformers import AutoProcessor, AutoModel
>>> import torch

>>> model = AutoModel.from_pretrained("google/siglip2-base-patch16-224")
>>> processor = AutoProcessor.from_pretrained("google/siglip2-base-patch16-224")

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> with httpx.stream("GET", url) as response:
...     image = Image.open(BytesIO(response.read()))

>>> texts = ["a photo of 2 cats", "a photo of 2 dogs"]
>>> # important: we pass `padding=max_length` since the model was trained with this
>>> inputs = processor(text=texts, images=image, padding="max_length", return_tensors="pt")

>>> with torch.no_grad():
...     outputs = model(**inputs)

>>> logits_per_image = outputs.logits_per_image
>>> probs = torch.sigmoid(logits_per_image) # these are the probabilities
>>> print(f"{probs[0][0]:.1%} that image 0 is '{texts[0]}'")
31.9% that image 0 is 'a photo of 2 cats'
```
r   )r   r   r   r   r|   T)pdimkeepdimNr   )r~   r   )losslogits_per_imagelogits_per_texttext_embedsimage_embedstext_model_outputvision_model_outputr,   )r   
text_modelr   normr   matmultr   r~   logit_scale
logit_biasexpeyer   	ones_likern   
functional
logsigmoidsummeanrb   )r9   r   r   r   rx   r   r   r   r:   vision_outputstext_outputsr   r   r   r   r   r   r   r   m1_diag1logliknlls                         r<   r   Siglip2Model.forward  s   d 6:5F5F 6
%!5)6
 	6
 48?? 4
)%4
 	4
 &33"00 $&7&7!T&7&RR!$4$4qb$$4$OO  ,,{NN4D4G4GHZHZ4[\"&"2"2"5"5k6H6H"I4??K]K]^i^p^pKqZ)OO,==
J*,,.))O003O<R<RSC881s7BHXX((33H4NOF99V,,C88:D-+#%* .
 	
r>   r,   )NNN)NNNNNNN)r4   r?   r@   rA   r   r   r   r   r   r   r   r   tupler
   r   r   rb   r   rF   r,   r>   r<   r   r   h  sa    264826	%
''$.%
 $llT1%
 ((4/	%

 +,%
 
+	+%
  %
P  .2154826.204#'^
##d*^
 ''$.^
 $llT1	^

 ((4/^
 t+^
 &&-^
 D[^
 +,^
 
^
  ^
r>   r   c                       \ rS rSr\\    SS\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\	\
   S\4S	 jj5       5       rS
rg)Siglip2ForImageClassificationi  Nr   r   rx   labelsr:   rz   c                    U R                   " U4UUS.UD6nUR                  nUbL  US   R                  UR                  5      n[        R
                  " Xx-  SS9[        R
                  " USS9-  nO[        R                  " USS9nU R                  U5      n	Sn
Ub  U R                  XIU R                  5      n
[        U
U	UR                  UR                  S9$ )a  
pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*):
    Mask to avoid performing attention on padding pixel indices.
spatial_shapes (`torch.LongTensor` of shape `(batch_size, 2)`):
    Tensor containing the spatial dimensions (height, width) of the input images.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

Examples:

```python
>>> from transformers import AutoImageProcessor, Siglip2ForImageClassification
>>> import torch
>>> from PIL import Image
>>> import httpx
>>> from io import BytesIO

>>> torch.manual_seed(3)  # doctest: +IGNORE_RESULT
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> with httpx.stream("GET", url) as response:
...     image = Image.open(BytesIO(response.read()))

>>> # note: we are loading a `Siglip2Model` from the hub here,
>>> # so the head will be randomly initialized, hence the predictions will be random if seed is not set above.
>>> image_processor = AutoImageProcessor.from_pretrained("google/siglip2-base-patch16-224")
>>> model = Siglip2ForImageClassification.from_pretrained("google/siglip2-base-patch16-224")

>>> inputs = image_processor(images=image, return_tensors="pt")
>>> outputs = model(**inputs)
>>> logits = outputs.logits
>>> # model predicts one of the two classes
>>> predicted_class_idx = logits.argmax(-1).item()
>>> print("Predicted class:", model.config.id2label[predicted_class_idx])
Predicted class: LABEL_1
```
)r   rx   N).Nr   r   )r   logitsr   
attentions)r   r   r   r~   r   r  r  
classifierloss_functionrg   r   r   r  )r9   r   r   rx   r  r:   outputssequence_output	pool_maskr  r   s              r<   r   %Siglip2ForImageClassification.forward  s    ` /3.?.?/
!5)/
 	/
 "33  +,Y7::?;Q;QRI#ii(CKeiiXaghNiiO#jja@O 1%%fdkkBD$!//))	
 	
r>   r,   )NNNN)r4   r?   r@   rA   r   r   r   r   r   r   r   r   r   rF   r,   r>   r<   r  r    s     -14826&*J
llT)J
 $llT1J
 ((4/	J

 t#J
 +,J
 
J
  J
r>   r  )	rX   rK   rP   r   r   r   r   r  r   ):r   torch.nnrn   torch.nn.functionalr   r   huggingface_hub.dataclassesr   
tokenizersr   ,transformers.models.gemma.tokenization_gemmar   /transformers.models.siglip.configuration_siglipr   r   r   *transformers.models.siglip.modeling_siglipr	   r
   r   r   r   r   r   r   r   r   r   r   masking_utilsr   processing_utilsr   utilsr   r   r   utils.genericr   r   utils.output_capturingr   r   rK   rP   rX   r[   r_   rb   Modulere   r   r   r   r   r   r  __all__r,   r>   r<   <module>r$     sv        . " G n n    7 & 
 J 5!e~ !eH ?@	( 	  A	 ?@", "  A"6 ?@	L 	  A		1 		- 		L 	ebii eP!2 !@
* @
F	 	%"+N %"PL
; L
^N
$@ N
b
r>   