
    Z j|                        S SK Jr  S SKJr  S SKJr  S SKrS SKrS SK	J
r
  S SKJ
s  Jr  SSKJr  SSKJr  SSKJr  SS	KJr  SS
KJrJrJr  SSKJrJr  SSKJr  SSK J!r!J"r"J#r#J$r$  SSK%J&r&J'r'  SSK(J)r)  SSK*J+r+J,r,J-r-  \#" SS9\ " S S\!5      5       5       r.\\#" SS9 " S S\!5      5       5       r/\#\ " S S\!5      5       5       r0 " S S\
Rb                  5      r2 " S S\
Rb                  5      r3 S?S\
Rb                  S \Rh                  S!\Rh                  S"\Rh                  S#\Rh                  S-  S$\5S%\54S& jjr6 " S' S(\
Rb                  5      r7 " S) S*\
Rb                  5      r8 " S+ S,\5      r9\# " S- S.\5      5       r: " S/ S0\
Rb                  5      r;\#" S1S9 " S2 S3\:5      5       r<\#" S4S9 " S5 S6\:5      5       r= " S7 S8\
Rb                  5      r>\# " S9 S:\:5      5       r?\#" S;S9 " S< S=\:5      5       r@/ S>QrAg)@    )Callable)	dataclass)AnyN   )initialization)ACT2FN)create_bidirectional_mask)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPoolingImageClassifierOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)ModelOutputTransformersKwargsauto_docstringtorch_compilable_check)can_return_tuplemerge_with_config_defaults)capture_outputs   )Siglip2ConfigSiglip2TextConfigSiglip2VisionConfigz}
    Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states.
    )custom_introc                       \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Sr
\\R                  S4   S-  \S'   Sr\\R                  S4   S-  \S'   S	rg)
Siglip2VisionOutput,   z
image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
    The image embeddings obtained by applying the projection layer to the pooler_output.
Nimage_embedslast_hidden_state.hidden_states
attentions )__name__
__module____qualname____firstlineno____doc__r    torchFloatTensor__annotations__r!   r"   tupler#   __static_attributes__r$       }/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/siglip2/modeling_siglip2.pyr   r   ,   sr    
 .2L%##d*126u((4/6:>M5**C/047>7;Je'',-4;r/   r   ze
    Base class for text model's outputs that also contains a pooling of the last hidden states.
    c                       \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Sr
\\R                  S4   S-  \S'   Sr\\R                  S4   S-  \S'   S	rg)
Siglip2TextOutput>   z
text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
    The text embeddings obtained by applying the projection layer to the pooler_output.
Ntext_embedsr!   .r"   r#   r$   )r%   r&   r'   r(   r)   r4   r*   r+   r,   r!   r"   r-   r#   r.   r$   r/   r0   r2   r2   >   sr    
 -1K""T)026u((4/6:>M5**C/047>7;Je'',-4;r/   r2   c                      \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Sr
\R                  S-  \S'   Sr\R                  S-  \S'   Sr\R                  S-  \S'   Sr\\S	'   Sr\\S
'   S\\   4S jrSrg)Siglip2OutputP   aq  
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
    Contrastive loss for image-text similarity.
logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
    The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
    similarity scores.
logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
    The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
    similarity scores.
text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
    The text embeddings obtained by applying the projection layer to the pooled output of [`Siglip2TextModel`].
image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
    The image embeddings obtained by applying the projection layer to the pooled output of [`Siglip2VisionModel`].
text_model_output (`BaseModelOutputWithPooling`):
    The output of the [`Siglip2TextModel`].
vision_model_output (`BaseModelOutputWithPooling`):
    The output of the [`Siglip2VisionModel`].
Nlosslogits_per_imagelogits_per_textr4   r    text_model_outputvision_model_outputreturnc                 B    [        S U R                  5        5       5      $ )Nc              3   p   #    U  H,  n[        U[        5      (       a  UR                  5       OUv   M.     g 7fN)
isinstancer   to_tuple).0vs     r0   	<genexpr>)Siglip2Output.to_tuple.<locals>.<genexpr>o   s)     ^P]1Z;%?%?QZZ\QFP]s   46)r-   valuesselfs    r0   rB   Siglip2Output.to_tuplen   s    ^PTP[P[P]^^^r/   r$   )r%   r&   r'   r(   r)   r8   r*   r+   r,   r9   r:   r4   r    r;   r   r<   r-   r   rB   r.   r$   r/   r0   r6   r6   P   s    & &*D%

d
")15e''$.504OU&&-4,0K""T)0-1L%##d*148186:3:_%* _r/   r6   c            	          ^  \ rS rSrS\4U 4S jjr\S\R                  S\R                  S\
S\R                  4S j5       rS	\R                  S\R                  S\R                  4S
 jrSrU =r$ )Siglip2VisionEmbeddingsr   configc                   > [         TU ]  5         Xl        UR                  U l        UR
                  U l        [        R                  " UR                  U R
                  -  U R
                  -  U R                  S9U l	        UR                  U l
        [        U R                  S-  5      U l        [        R                  " U R                  U R                  5      U l        g )N)in_featuresout_featuresg      ?)super__init__rN   hidden_size	embed_dim
patch_sizennLinearnum_channelspatch_embeddingnum_patchesintposition_embedding_size	Embeddingposition_embeddingrI   rN   	__class__s     r0   rS    Siglip2VisionEmbeddings.__init__s   s    ++ ++!yy++doo=O 

 "--'*4+;+;S+@'A$"$,,t/?/?"Pr/   positional_embeddingsspatial_shapes
max_lengthr=   c           	         UR                   S   nU R                   S   nU R                  n[        R                  " X2U4U R                  US9nU R                  SSS5      R                  S5      n U R                  R                  S:X  a  U R                  [        R                  5      n [        U5       H  nX   R                  5       u  p[        U	S:  S5        [        US:  S5        [        X-  U:*  S	5        [        R                  " U X4S
SSS9n
U
R                  XHU	-  5      R!                  SS5      n
U
R                  U5      n
XUSX-  24'   U
S   XgX-  S24'   M     U$ )a  
Resize positional embeddings to image-specific size and pad to a fixed size.

Args:
    positional_embeddings (`torch.Tensor`):
        Position embeddings of shape (height, width, embed_dim)
    spatial_shapes (`torch.LongTensor`):
        Spatial shapes of shape (batch_size, 2) to resize the positional embeddings to
    max_length (`int`):
        Maximum length of the positional embeddings to pad resized positional embeddings to

Returns:
    `torch.Tensor`: Embeddings of shape (batch_size, max_length, embed_dim)
r   devicedtype   r   cpuz8Width of resized positional embeddings must be positive.z9Height of resized positional embeddings must be positive.z0Resized positional embeddings exceed max_length.bilinearFT)sizemodealign_corners	antialiasN)shaperj   r*   emptyri   permute	unsqueezetypetofloat32rangetolistr   Finterpolatereshape	transpose)rc   rd   re   
batch_sizerU   source_dtyperesulted_positional_embeddingsiheightwidthresized_embeddingss              r0   resize_positional_embeddings4Siglip2VisionEmbeddings.resize_positional_embeddings   s   ( $))!,
)//3	,22).Y/(//*
& !6 = =aA F P PQR S !'',,5$9$<$<U]]$K!z"A*-446MF"EAI0jk"FQJ1lm"FNz#ACuv!"%_#" "4!;!;IPU~!V!`!`abde!f "4!6!6|!DBT1.>.>+>?BTUVBW*fn.>+>?+ #. .-r/   pixel_valuesc                 :   U R                   R                  R                  nU R                  UR                  US95      nU R                  R                  R                  U R                  U R                  S5      nU R                  XRUR                  S   S9nXF-   nU$ )a  
Args:
    pixel_values (`torch.FloatTensor`):
        Pixel values of shape (batch_size, max_num_patches, num_channels * patch_size * patch_size)
    spatial_shapes (`list[tuple[int, int]]`):
        Spatial shapes of shape (batch_size, 2) to resize the positional embeddings to
)rj   rg   r   )re   )	rZ   weightrj   rw   r_   r}   r]   r   rr   )rI   r   rd   target_dtypepatch_embedsrc   resized_positional_embeddings
embeddingss           r0   forwardSiglip2VisionEmbeddings.forward   s     ++2288++LOO,O,OP !% 7 7 > > F F(($*F*F!
 )-(I(I!l>P>PQR>S )J )
%
 "A
r/   )rN   rU   r[   rZ   rV   r_   r]   )r%   r&   r'   r(   r   rS   staticmethodr*   Tensor
LongTensorr\   r   r+   r   r.   __classcell__ra   s   @r0   rL   rL   r   s    Q2 Q ;.$||;.((;. ;. 
	;. ;.zE$5$5 uGWGW \a\h\h  r/   rL   c            	          ^  \ rS rSrS\4U 4S jjr   SS\R                  S-  S\R                  S-  S\R                  S-  S\R                  4S	 jjr
S
rU =r$ )Siglip2TextEmbeddings   rN   c                 N  > [         TU ]  5         UR                  n[        R                  " UR
                  U5      U l        [        R                  " UR                  U5      U l        U R                  S[        R                  " UR                  5      R                  S5      SS9  g )Nposition_idsr   rg   F)
persistent)rR   rS   rT   rW   r^   
vocab_sizetoken_embeddingmax_position_embeddingsr_   register_bufferr*   arangeexpandrI   rN   rU   ra   s      r0   rS   Siglip2TextEmbeddings.__init__   s    &&	!||F,=,=yI"$,,v/M/My"Y 	ELL)G)GHOOPWXej 	 	
r/   N	input_idsr   inputs_embedsr=   c                 <   Ub  UR                   S   OUR                   S   nU R                  R                  R                   S   nXE:  a  [        SU SU 35      eUc  U R                  S S 2S U24   nUc  U R                  U5      nU R                  U5      nX6-   nU$ )Nrg   r   zRSequence length must be less than max_position_embeddings (got `sequence length`: z and max_position_embeddings: )rr   r_   r   
ValueErrorr   r   )rI   r   r   r   
seq_lengthmax_position_embeddingposition_embeddingsr   s           r0   r   Siglip2TextEmbeddings.forward   s     -6,AY__R(}GZGZ[]G^
!%!8!8!?!?!E!Ea!H.d,<=S<TV 
 ,,Q^<L  00;M"55lC"8
r/   )r_   r   NNN)r%   r&   r'   r(   r   rS   r*   r   r+   r   r   r.   r   r   s   @r0   r   r      sp    

0 

 .20426	##d* &&- ((4/	
 
 r/   r   modulequerykeyvalueattention_maskscalingdropoutc                    [         R                  " XR                  SS5      5      U-  nUb  X-   n[        R                  R                  US[         R                  S9R                  UR                  5      n[        R                  R                  XU R                  S9n[         R                  " X5      n	U	R                  SS5      R                  5       n	X4$ )Nrg   r   )dimrj   )ptrainingr   rk   )r*   matmulr~   rW   
functionalsoftmaxrx   rw   rj   r   r   
contiguous)
r   r   r   r   r   r   r   kwargsattn_weightsattn_outputs
             r0   eager_attention_forwardr     s     <<}}R'<=GL!#4==((2U]](SVVW\WbWbcL==((6??([L,,|3K''1-88:K$$r/   c            
          ^  \ rS rSrSrU 4S jr S
S\R                  S\R                  S-  S\\R                  \R                  S-  4   4S jjr	S	r
U =r$ )Siglip2Attentioni  z=Multi-headed attention from 'Attention Is All You Need' paperc                    > [         TU ]  5         Xl        UR                  U l        UR
                  U l        U R                  U R                  -  U l        U R                  U R                  -  U R                  :w  a&  [        SU R                   SU R                   S35      eU R                  S-  U l	        UR                  U l        SU l        [        R                  " U R                  U R                  5      U l        [        R                  " U R                  U R                  5      U l        [        R                  " U R                  U R                  5      U l        [        R                  " U R                  U R                  5      U l        g )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).      F)rR   rS   rN   rT   rU   num_attention_heads	num_headshead_dimr   scaleattention_dropoutr   	is_causalrW   rX   k_projv_projq_projout_projr`   s     r0   rS   Siglip2Attention.__init__  s   ++33$..8==4>>)T^^;MdnnM] ^NN#2'  ]]D(
//ii?ii?ii?		$..$..Ar/   Nr"   r   r=   c                    UR                   SS n/ UQSPU R                  P7nU R                  U5      R                  U5      R	                  SS5      nU R                  U5      R                  U5      R	                  SS5      nU R                  U5      R                  U5      R	                  SS5      n[        R                  " U R                  R                  [        5      n	U	" U UUUUU R                  U R                  U R                  (       d  SOU R                  S9u  pU
R                   " / UQSP76 R#                  5       n
U R%                  U
5      n
X4$ )z#Input shape: Batch x Time x ChannelNrg   r   rk           )r   r   r   )rr   r   r   viewr~   r   r   r   get_interfacerN   _attn_implementationr   r   r   r   r   r}   r   r   )rI   r"   r   r   input_shapehidden_shapequerieskeysrG   attention_interfacer   r   s               r0   r   Siglip2Attention.forward0  s6    $))#2.88b8$--8++m,11,?II!QO{{=)..|<FFq!L]+00>HHAN(?(M(MKK,,.E)
 %8nnJJ#}}C$,,	%
! "));;;;FFHmmK0((r/   )rN   r   rU   r   r   r   r   r   r   r   r   r@   )r%   r&   r'   r(   r)   rS   r*   r   r-   r   r.   r   r   s   @r0   r   r     s[    GB. /3!)||!) t+!)
 
u||U\\D00	1!) !)r/   r   c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )
Siglip2MLPiT  c                   > [         TU ]  5         Xl        [        UR                     U l        [        R                  " UR                  UR                  5      U l
        [        R                  " UR                  UR                  5      U l        g r@   )rR   rS   rN   r   
hidden_actactivation_fnrW   rX   rT   intermediate_sizefc1fc2r`   s     r0   rS   Siglip2MLP.__init__U  sb    #F$5$5699V//1I1IJ99V55v7I7IJr/   r"   r=   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ r@   )r   r   r   )rI   r"   s     r0   r   Siglip2MLP.forward\  s4    /**=9/r/   )r   rN   r   r   )
r%   r&   r'   r(   rS   r*   r   r   r.   r   r   s   @r0   r   r   T  s)    KU\\ ell  r/   r   c            	          ^  \ rS rSrS\\-  4U 4S jjr\S\R                  S\R                  S\
\   S\R                  4S j5       rS	rU =r$ )
Siglip2EncoderLayeric  rN   c                 <  > [         TU ]  5         UR                  U l        [        R
                  " U R                  UR                  S9U l        [        U5      U l	        [        R
                  " U R                  UR                  S9U l
        [        U5      U l        g Neps)rR   rS   rT   rU   rW   	LayerNormlayer_norm_epslayer_norm1r   	self_attnlayer_norm2r   mlpr`   s     r0   rS   Siglip2EncoderLayer.__init__d  sm    ++<<F<Q<QR)&1<<F<Q<QRf%r/   r"   r   r   r=   c                     UnU R                  U5      nU R                  " SUUS.UD6u  pXA-   nUnU R                  U5      nU R                  U5      nXA-   nU$ )N)r"   r   r$   )r   r   r   r   )rI   r"   r   r   residual_s         r0   r   Siglip2EncoderLayer.forwardl  sz     !((7>> 
')
 

 !0 ((7/ 0r/   )rU   r   r   r   r   )r%   r&   r'   r(   r   r   rS   r   r*   r   r   r   r+   r   r.   r   r   s   @r0   r   r   c  sd    &25FF & ||  +,	
 
		 r/   r   c                   |    \ rS rSr% \\S'   SrSrSr/ SQr	Sr
SrSrSr\\S.r\R$                  " 5       S	 5       rS
rg)Siglip2PreTrainedModeli  rN   siglip2)imagetextT)r   rL   r   $Siglip2MultiheadAttentionPoolingHeadF)r"   r#   c                 H   [        U[        5      (       Ga  [        U R                  [        5      (       a   U R                  R                  R
                  OU R                  R
                  n[        R                  " UR                  R                  S[        R                  " U5      -  S9  [        US5      (       a\  [        R                  " UR                  [        R                   " UR                  R"                  S   5      R%                  S5      5        gg[        U[&        R(                  5      (       a!  [        R*                  " UR                  5        g[        U[,        5      (       GaQ  [        R.                  " UR0                  R                  5        [        R.                  " UR2                  R                  5        [        R.                  " UR4                  R                  5        [        R.                  " UR6                  R                  5        [        R8                  " UR0                  R:                  5        [        R8                  " UR2                  R:                  5        [        R8                  " UR4                  R:                  5        [        R8                  " UR6                  R:                  5        g[        U[<        5      (       a  [        R.                  " UR>                  R                  5        [        R.                  " UR@                  R                  5        [        R                  " UR>                  R:                  SS9  [        R                  " UR@                  R:                  SS9  g[        U[B        5      (       au  [        R.                  " URD                  5        [        R.                  " URF                  RH                  5        [        R8                  " URF                  RJ                  5        g[        U[L        5      (       aA  [        R8                  " URN                  5        [        R8                  " URP                  5        g[        U[R        5      (       ab  [        R                  " URT                  R                  U R                  R                  R
                  S-  U R                  RV                  -  S9  g[        U[&        RX                  [&        RZ                  45      (       aO  [        R\                  " UR                  5        UR:                  b!  [        R8                  " UR:                  5        gg[        U[&        R^                  5      (       aA  [        R8                  " UR:                  5        [        R`                  " UR                  5        g[        U[b        5      (       a\  [        R                  " UR                  [        R                   " UR                  R"                  S   5      R%                  S5      5        gg)	zInitialize the weightsr   )stdr   rg   r   gư>r   N)2rA   rL   rN   r   vision_configrT   initnormal_r_   r   npsqrthasattrcopy_r   r*   r   rr   r   rW   r^   default_flax_embed_init_r   xavier_uniform_r   r   r   r   zeros_biasr   r   r   r   probe	attentionin_proj_weightin_proj_biasSiglip2Modellogit_scale
logit_biasSiglip2ForImageClassification
classifierinitializer_factorrX   Conv2dlecun_normal_r   ones_r   )rI   r   r   s      r0   _init_weights$Siglip2PreTrainedModel._init_weights  s    f566 dkk=99 ))55[[,, 
 LL2299q2775>?QRv~..

6..V=P=P=V=VWY=Z0[0b0bcj0kl /--))&--8 011  !5!56  !5!56  !5!56  !7!78KK**+KK**+KK**+KK,,-
++  !2!23  !2!23LLd3LLd3 DEE  .  !1!1!@!@AKK((556--KK**+KK))* =>>LL!!((KK--994?$++B`B`` BII 677v}}-{{&FKK( '--KK$JJv}}% 566JJv**ELL9L9L9R9RSU9V,W,^,^_f,gh 7r/   r$   N)r%   r&   r'   r(   r   r,   base_model_prefixinput_modalitiessupports_gradient_checkpointing_no_split_modules_supports_flash_attn_supports_sdpa_supports_flex_attn_supports_attention_backendr   r   _can_record_outputsr*   no_gradr  r.   r$   r/   r0   r   r     sg    !(&*# !N"& -&
 ]]_/i /ir/   r   c                   z   ^  \ rS rSrSrS\4U 4S jjr\ SS\R                  S-  S\
\   S\4S	 jj5       rS
rU =r$ )Siglip2Encoderi  z
Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
[`Siglip2EncoderLayer`].

Args:
    config: Siglip2Config
rN   c                    > [         TU ]  5         Xl        [        R                  " [        UR                  5       Vs/ s H  n[        U5      PM     sn5      U l        SU l	        g s  snf )NF)
rR   rS   rN   rW   
ModuleListry   num_hidden_layersr   layersgradient_checkpointing)rI   rN   r   ra   s      r0   rS   Siglip2Encoder.__init__  sT    mm%PVPhPhJi$jJiQ%8%@Ji$jk&+# %ks   A&Nr   r   r=   c                 R    UnU R                    H  nU" UU40 UD6nM     [        US9$ )N)r!   )r)  r   )rI   r   r   r   r"   encoder_layers         r0   r   Siglip2Encoder.forward  s>     &![[M) M ) ??r/   )rN   r*  r)  r@   )r%   r&   r'   r(   r)   r   rS   r   r*   r   r   r   r   r   r.   r   r   s   @r0   r%  r%    s_    ,} ,  /3@ t+@ +,	@
 
@ @r/   r%  zN
    The vision model from Siglip2 without any head or projection on top.
    c                      ^  \ rS rSr% \\S'   SrSrSrSr	S\4U 4S jjr
\\" SS	9\S\R                  S
\R                   S\R"                  S\\   S\4
S j5       5       5       rSrU =r$ )Siglip2VisionModeli  rN   r   r   vision_modelrZ   c                 x  > [         TU ]  U5        Xl        UR                  n[	        U5      U l        [        U5      U l        [        R                  " X!R                  S9U l        [        US5      (       d  SOUR                  U l        U R                  (       a  [        U5      U l        U R#                  5         g )Nr   vision_use_headT)rR   rS   rN   rT   rL   r   r%  encoderrW   r   r   post_layernormr  r4  use_headr   head	post_initr   s      r0   rS   Siglip2VisionModel.__init__  s     &&	1&9%f- ll9:O:OP$+F4E$F$FFLbLb==<VDDIr/   Ftie_last_hidden_statespixel_attention_maskrd   r   r=   c                    U R                  X5      n[        U R                  UUS9nU R                  " SUUS.UD6nUR                  nU R                  U5      nU R                  (       a  U R                  X5      OSn	[        UU	S9$ )a  
pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*):
    Mask to avoid performing attention on padding pixel indices.
spatial_shapes (`torch.LongTensor` of shape `(batch_size, 2)`):
    Tensor containing the spatial dimensions (height, width) of the input images.

Examples:

```python
>>> import httpx
>>> from io import BytesIO
>>> from PIL import Image
>>> from transformers import AutoProcessor, Siglip2VisionModel

>>> model = Siglip2VisionModel.from_pretrained("google/siglip2-base-patch16-224")
>>> processor = AutoProcessor.from_pretrained("google/siglip2-base-patch16-224")

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> with httpx.stream("GET", url) as response:
...     image = Image.open(BytesIO(response.read()))

>>> inputs = processor(images=image, return_tensors="pt")

>>> outputs = model(**inputs)
>>> last_hidden_state = outputs.last_hidden_state
>>> pooled_output = outputs.pooler_output  # pooled features
```
rN   r   r   r   r   Nr!   pooler_outputr$   )	r   r	   rN   r5  r!   r6  r7  r8  r   )
rI   r   r=  rd   r   r"   encoder_attention_maskencoder_outputsr!   rB  s
             r0   r   Siglip2VisionModel.forward  s    L E!:;;'/"
 ,0<< ,
'1,
 ,
 ,== //0ABNRmm		"3Jae)/'
 	
r/   )rN   r   r5  r8  r6  r7  )r%   r&   r'   r(   r   r,   main_input_namer  r  _input_embed_layerrS   r   r   r   r*   r+   r   r   r   r   r   r   r.   r   r   s   @r0   r0  r0    s      $O!&*2   E29
''9
 $ll9
 ((	9

 +,9
 
$9
  3  9
r/   r0  zL
    The text model from Siglip2 without any head or projection on top.
    c                      ^  \ rS rSr% \\S'   SrSrSrS\4U 4S jjr	\
\" SS9\   SS
\R                  S	-  S\R                  S	-  S\R                  S	-  S\\   S\4
S jj5       5       5       rSrU =r$ )Siglip2TextModeliJ  rN   )r   
text_modelr   c                 8  > [         TU ]  U5        Xl        UR                  n[	        U5      U l        [        U5      U l        [        R                  " X!R                  S9U l        [        R                  " X!R                  5      U l        U R                  5         g r   )rR   rS   rN   rT   r   r   r%  r5  rW   r   r   final_layer_normrX   projection_sizer8  r9  r   s      r0   rS   Siglip2TextModel.__init__U  so     &&	/7%f- "Y<Q<Q RIIi)?)?@	r/   Fr;  Nr   r   r   r   r=   c                 b   Uc  [        S5      eUR                  5       nUR                  SUS   5      nU R                  XS9n[	        U R
                  UUS9nU R                  " SUUS.UD6nUR                  nU R                  U5      nUSS2SSS24   n	U R                  U	5      n	[        UU	S9$ )	aX  
Examples:

```python
>>> from transformers import AutoTokenizer, Siglip2TextModel

>>> model = Siglip2TextModel.from_pretrained("google/siglip2-base-patch16-224")
>>> tokenizer = AutoTokenizer.from_pretrained("google/siglip2-base-patch16-224")

>>> # important: make sure to set padding="max_length" as that's how the model was trained
>>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding="max_length", return_tensors="pt")

>>> outputs = model(**inputs)
>>> last_hidden_state = outputs.last_hidden_state
>>> pooled_output = outputs.pooler_output  # pooled (EOS token) states
```NzYou have to specify input_idsrg   )r   r   r?  r@  rA  r$   )r   rn   r   r   r	   rN   r5  r!   rL  r8  r   )
rI   r   r   r   r   r   r"   rD  r!   pooled_outputs
             r0   r   Siglip2TextModel.forward`  s    4 <==nn&NN2{27	)W 3;;')
 ,0<< ,
'),
 ,
 ,== 112CD *!R(3		-0)/'
 	
r/   )rN   r   r5  rL  r8  r   )r%   r&   r'   r(   r   r,   r  r  rG  rS   r   r   r   r*   r   r   r   r   r   r.   r   r   s   @r0   rI  rI  J  s      $*	0 	  E2 *..2,0	6
<<$&6
 t+6
 llT)	6

 +,6
 
$6
  3  6
r/   rI  c                      ^  \ rS rSrSrS\4U 4S jjrSS\R                  S\R                  S-  S\R                  4S	 jjr	S
r
U =r$ )r   i  zMultihead Attention Pooling.rN   c                   > [         TU ]  5         [        R                  " [        R
                  " SSUR                  5      5      U l        [        R                  R                  UR                  UR                  SS9U l
        [        R                  " UR                  UR                  S9U l        [        U5      U l        Xl        UR                  U l        g )Nr   T)batch_firstr   )rR   rS   rW   	Parameterr*   randnrT   r  MultiheadAttentionr   r  r   r   	layernormr   r   rN   r   r`   s     r0   rS   -Siglip2MultiheadAttentionPoolingHead.__init__  s    \\%++aF4F4F"GH
44V5G5GIcIcqu4vf&8&8f>S>STf%33r/   Nhidden_stater   r=   c           	         UR                   S   nU R                  R                  USS5      nUb  UR                   S   UR                   S   pe[        U R                  UUUS9nUb  UR                  SU R
                  US5      nUR                  SXV5      nUR                  [        R                  :X  ah  [        R                  " U[        R                  " SUR                  UR                  S9[        R                  " UR                  5      R                  5      nU R                  XAXS9S   nUnU R!                  U5      nXpR#                  U5      -   nUS S 2S4   $ )Nr   r   )rN   r   r   encoder_hidden_statesrg   r   rh   )	attn_mask)rr   r  repeatr	   rN   r   r}   rj   r*   boolwheretensorri   finfominr  rX  r   )rI   rZ  r   r   r  
target_len
source_lenr   s           r0   r   ,Siglip2MultiheadAttentionPoolingHead.forward  s;   !''*


!!*a3%%*[[^\5G5G5J
6{{#-&2	N )!/!6!6q$..*VW!X!/!7!7J!S "''5::5%*[[&S1F1FekkZEKK044&N ~~e<~bcde~~l3((<"88AqD!!r/   )r  rN   rX  r   r   r  r@   )r%   r&   r'   r(   r)   r   rS   r*   r   r   r.   r   r   s   @r0   r   r     sG    &	42 	4"ELL "%,,QUBU "afamam " "r/   r   c                     ^  \ rS rSr% \\S'   S\4U 4S jjrS\R                  4S jr	S\R                  4S jr
\\  SS	\R                  S
\R                  S-  S\R                  S-  S\\   S\\-  4
S jj5       5       r\\   SS\R(                  S-  S\R                  S-  S\R*                  S-  S\\   S\\-  4
S jj5       5       r\\       SS	\R*                  S-  S\R(                  S-  S\R                  S-  S\R*                  S-  S
\R                  S-  S\R*                  S-  S\S-  S\\   S\4S jj5       5       rSrU =r$ )r  i  rN   c                   > [         TU ]  U5        UR                  nUR                  n[        R                  U5      U l        [        R                  U5      U l        [        R                  " [        R                  " S5      5      U l        [        R                  " [        R                  " S5      5      U l        U R                  5         g )Nr   )rR   rS   text_configr   rI  _from_configrJ  r0  r2  rW   rU  r*   rV  r  r  r9  )rI   rN   ri  r   ra   s       r0   rS   Siglip2Model.__init__  s     ((,, +77D.;;MJ<<A7,,u{{1~6 	r/   r=   c                 B    U R                   R                  R                  $ r@   rJ  r   r   rH   s    r0   get_input_embeddings!Siglip2Model.get_input_embeddings  s    ))999r/   r   c                 8    XR                   R                  l        g r@   rm  rI   r   s     r0   set_input_embeddings!Siglip2Model.set_input_embeddings  s    5:""2r/   Nr   r   r   r   c                 .    U R                   " SUUUS.UD6$ )a  
Examples:

```python
>>> from transformers import AutoTokenizer, AutoModel
>>> import torch

>>> model = AutoModel.from_pretrained("google/siglip2-base-patch16-224")
>>> tokenizer = AutoTokenizer.from_pretrained("google/siglip2-base-patch16-224")

>>> # important: make sure to set padding="max_length" as that's how the model was trained
>>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding="max_length", return_tensors="pt")
>>> with torch.no_grad():
...     text_features = model.get_text_features(**inputs)
```r   r   r   r$   )rJ  )rI   r   r   r   r   s        r0   get_text_featuresSiglip2Model.get_text_features  s-    0  
)%
 	
 	
r/   r   r=  rd   c                 .    U R                   " SUUUS.UD6$ )aY  
pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*):
    Mask to avoid performing attention on padding pixel indices.
spatial_shapes (`torch.LongTensor` of shape `(batch_size, 2)`):
    Tensor containing the spatial dimensions (height, width) of the input images.

Examples:

```python
>>> import torch
>>> from transformers import AutoProcessor, AutoModel
>>> from transformers.image_utils import load_image

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = load_image(url)

>>> model = AutoModel.from_pretrained("google/siglip2-base-patch16-224")
>>> processor = AutoProcessor.from_pretrained("google/siglip2-base-patch16-224")

>>> inputs = processor(images=image, return_tensors="pt")

>>> with torch.no_grad():
...     image_features = model.get_image_features(**inputs)
```
r   r=  rd   r$   )r2  )rI   r   r=  rd   r   s        r0   get_image_featuresSiglip2Model.get_image_features  s0    D    
%!5)
 	
 	
r/   return_lossc           
         U R                   " SUUUS.UD6n	U R                  " SUUUS.UD6n
U	R                  nU
R                  nXR                  SSSS9-  nXR                  SSSS9-  n[        R
                  " XR                  5       R                  UR                  5      5      nU R                  R                  UR                  5      U R                  R                  UR                  5      pXR                  5       -  U-   nUR                  5       nSnU(       a  [        R                  " UR                  S5      UR                  S	9n[        R                  " U5      * SU-  -   n[        R                  R                   R#                  UU-  5      n[        R$                  " USS
9* nUR'                  5       n[)        UUUUUU
U	S9$ )a}  
pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*):
    Mask to avoid performing attention on padding pixel indices.
spatial_shapes (`torch.LongTensor` of shape `(batch_size, 2)`):
    Tensor containing the spatial dimensions (height, width) of the input images.
return_loss (`bool`, *optional*):
    Whether or not to return the contrastive loss.

Examples:

```python
>>> from PIL import Image
>>> import httpx
>>> from io import BytesIO
>>> from transformers import AutoProcessor, AutoModel
>>> import torch

>>> model = AutoModel.from_pretrained("google/siglip2-base-patch16-224")
>>> processor = AutoProcessor.from_pretrained("google/siglip2-base-patch16-224")

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> with httpx.stream("GET", url) as response:
...     image = Image.open(BytesIO(response.read()))

>>> texts = ["a photo of 2 cats", "a photo of 2 dogs"]
>>> # important: we pass `padding=max_length` since the model was trained with this
>>> inputs = processor(text=texts, images=image, padding="max_length", return_tensors="pt")

>>> with torch.no_grad():
...     outputs = model(**inputs)

>>> logits_per_image = outputs.logits_per_image
>>> probs = torch.sigmoid(logits_per_image) # these are the probabilities
>>> print(f"{probs[0][0]:.1%} that image 0 is '{texts[0]}'")
31.9% that image 0 is 'a photo of 2 cats'
```
ry  ru  rk   rg   T)r   r   keepdimNr   )ri   r   )r8   r9   r:   r4   r    r;   r<   r$   )r2  rJ  rB  normr*   r   trw   ri   r  r  expeyern   	ones_likerW   r   
logsigmoidsummeanr6   )rI   r   r   r=  rd   r   r   r|  r   vision_outputstext_outputsr    r4   r:   r  r  r9   r8   r  m1_diag1logliknlls                         r0   r   Siglip2Model.forward.  s   d 6:5F5F 6
%!5)6
 	6
 48?? 4
)%4
 	4
 &33"00 $&7&7!T&7&RR!$4$4qb$$4$OO  ,,{NN4D4G4GHZHZ4[\"&"2"2"5"5k6H6H"I4??K]K]^i^p^pKqZ)OO,==
J*,,.))O003O<R<RSC881s7BHXX((33H4NOF99V,,C88:D-+#%* .
 	
r/   )r  r  rJ  r2  )NNr   )NNNNNNN)r%   r&   r'   r(   r   r,   rS   rW   Modulern  rr  r   r   r*   r   r   r   r-   r   rv  r+   r   rz  r_  r6   r   r.   r   r   s   @r0   r  r    s   }  :bii :;")) ;  /3,0	
<<
 t+
 llT)	

 +,
 
+	+
  
:  264826	%
''$.%
 $llT1%
 ((4/	%

 +,%
 
+	+%
  %
P  .2154826.204#'^
##d*^
 ''$.^
 $llT1	^

 ((4/^
 t+^
 &&-^
 D[^
 +,^
 
^
  ^
r/   r  z
    Siglip2 vision encoder with an image classification head on top (a linear layer on top of the pooled final hidden states of
    the patch tokens) e.g. for ImageNet.
    c                   0  ^  \ rS rSrSrSrS\SS4U 4S jjrS\R                  4S jr
S	\R                  4S
 jr\\    SS\R                  S-  S\R                  S-  S\R                   S-  S\R                  S-  S\\   S\4S jj5       5       rSrU =r$ )r  i  r   r1  rN   r=   Nc                 l  > [         TU ]  U5        UR                  U l        [        R	                  UR
                  5      U l        UR                  S:  a5  [        R                  " UR
                  R                  UR                  5      O[        R                  " 5       U l        U R                  5         g )Nr   )rR   rS   
num_labelsr0  rj  r   r2  rW   rX   rT   Identityr  r9  r`   s     r0   rS   &Siglip2ForImageClassification.__init__  s      ++.;;F<P<PQ OUN_N_bcNcBIIf**668I8IJikititiv 	
 	r/   c                 B    U R                   R                  R                  $ r@   r2  r   rZ   rH   s    r0   rn  2Siglip2ForImageClassification.get_input_embeddings  s      ++;;;r/   r   c                 8    XR                   R                  l        g r@   r  rq  s     r0   rr  2Siglip2ForImageClassification.set_input_embeddings  s    7<$$4r/   r=  rd   labelsr   c                    U R                   " U4UUS.UD6nUR                  nUbL  US   R                  UR                  5      n[        R
                  " Xx-  SS9[        R
                  " USS9-  nO[        R                  " USS9nU R                  U5      n	Sn
Ub  U R                  XIU R                  5      n
[        U
U	UR                  UR                  S9$ )a  
pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*):
    Mask to avoid performing attention on padding pixel indices.
spatial_shapes (`torch.LongTensor` of shape `(batch_size, 2)`):
    Tensor containing the spatial dimensions (height, width) of the input images.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

Examples:

```python
>>> from transformers import AutoImageProcessor, Siglip2ForImageClassification
>>> import torch
>>> from PIL import Image
>>> import httpx
>>> from io import BytesIO

>>> torch.manual_seed(3)  # doctest: +IGNORE_RESULT
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> with httpx.stream("GET", url) as response:
...     image = Image.open(BytesIO(response.read()))

>>> # note: we are loading a `Siglip2Model` from the hub here,
>>> # so the head will be randomly initialized, hence the predictions will be random if seed is not set above.
>>> image_processor = AutoImageProcessor.from_pretrained("google/siglip2-base-patch16-224")
>>> model = Siglip2ForImageClassification.from_pretrained("google/siglip2-base-patch16-224")

>>> inputs = image_processor(images=image, return_tensors="pt")
>>> outputs = model(**inputs)
>>> logits = outputs.logits
>>> # model predicts one of the two classes
>>> predicted_class_idx = logits.argmax(-1).item()
>>> print("Predicted class:", model.config.id2label[predicted_class_idx])
Predicted class: LABEL_1
```
)r=  rd   N).Nr   r  )r8   logitsr"   r#   )r2  r!   rw   ri   r*   r  r  r  loss_functionrN   r   r"   r#   )rI   r   r=  rd   r  r   outputssequence_output	pool_maskr  r8   s              r0   r   %Siglip2ForImageClassification.forward  s    ` /3.?.?/
!5)/
 	/
 "33  +,Y7::?;Q;QRI#ii(CKeiiXaghNiiO#jja@O 1%%fdkkBD$!//))	
 	
r/   )r  r  r2  )NNNN)r%   r&   r'   r(   rF  r  r   rS   rW   r  rn  rr  r   r   r*   r   r   r   r   r   r   r.   r   r   s   @r0   r  r    s     %O!}  <bii <=")) =  -14826&*J
llT)J
 $llT1J
 ((4/	J

 t#J
 +,J
 
J
  J
r/   r  )r  r   rI  r0  r  )r   )Bcollections.abcr   dataclassesr   typingr   numpyr  r*   torch.nnrW   torch.nn.functionalr   r{    r   r  activationsr   masking_utilsr	   modeling_layersr
   modeling_outputsr   r   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   utils.genericr   r   utils.output_capturingr   configuration_siglip2r   r   r   r   r2   r6   r  rL   r   r   floatr   r   r   r   r   r%  r0  rI  r   r  r  __all__r$   r/   r0   <module>r     so  , % !       & ! 6 9 b b F & \ \ I 5 X X 
 	<+ 	< 	< 
	< 	< 	< 
_K _  _@ebii eP%BII %^ %II%<<% 
% <<	%
 LL4'% % %.8)ryy 8)v 4 D Gi_ Gi GiT@RYY @D 
P
/ P

P
f 
J
- J

J
Z,"299 ,"^ B
) B
 B
J d
$: d
d
Nr/   