"""PyTorch Siglip model."""

from collections.abc import Callable
from dataclasses import dataclass
from typing import Any

import numpy as np
import torch
from torch import nn

from ... import initialization as init
from ...activations import ACT2FN
from ...masking_utils import create_bidirectional_mask
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, ImageClassifierOutput
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...utils import ModelOutput, TransformersKwargs, auto_docstring, can_return_tuple, torch_int
from ...utils.generic import merge_with_config_defaults
from ...utils.output_capturing import capture_outputs
from .configuration_siglip import SiglipConfig, SiglipTextConfig, SiglipVisionConfig
@dataclass
@auto_docstring(
    custom_intro="""
    Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states.
    """
)
class SiglipVisionModelOutput(ModelOutput):
    r"""
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
        The image embeddings obtained by applying the projection layer to the pooler_output.
    """

    image_embeds: torch.FloatTensor | None = None
    last_hidden_state: torch.FloatTensor | None = None
    hidden_states: tuple[torch.FloatTensor, ...] | None = None
    attentions: tuple[torch.FloatTensor, ...] | None = None
@dataclass
@auto_docstring(
    custom_intro="""
    Base class for text model's outputs that also contains a pooling of the last hidden states.
    """
)
class SiglipTextModelOutput(ModelOutput):
    r"""
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
        The text embeddings obtained by applying the projection layer to the pooler_output.
    """

    text_embeds: torch.FloatTensor | None = None
    last_hidden_state: torch.FloatTensor | None = None
    hidden_states: tuple[torch.FloatTensor, ...] | None = None
    attentions: tuple[torch.FloatTensor, ...] | None = None
@dataclass
@auto_docstring
class SiglipOutput(ModelOutput):
    r"""
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
        Contrastive loss for image-text similarity.
    logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
        The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
        similarity scores.
    logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
        The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
        similarity scores.
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
        The text embeddings obtained by applying the projection layer to the pooled output of [`SiglipTextModel`].
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
        The image embeddings obtained by applying the projection layer to the pooled output of [`SiglipVisionModel`].
    text_model_output (`BaseModelOutputWithPooling`):
        The output of the [`SiglipTextModel`].
    vision_model_output (`BaseModelOutputWithPooling`):
        The output of the [`SiglipVisionModel`].
    """

    loss: torch.FloatTensor | None = None
    logits_per_image: torch.FloatTensor | None = None
    logits_per_text: torch.FloatTensor | None = None
    text_embeds: torch.FloatTensor | None = None
    image_embeds: torch.FloatTensor | None = None
    text_model_output: BaseModelOutputWithPooling = None
    vision_model_output: BaseModelOutputWithPooling = None

    def to_tuple(self) -> tuple[Any]:
        return tuple(v.to_tuple() if isinstance(v, ModelOutput) else v for v in self.values())
 jjrSrU =r$ )SiglipVisionEmbeddingst   configc                 ^  > [         TU ]  5         Xl        UR                  U l        UR
                  U l        UR                  U l        [        R                  " UR                  U R                  U R                  U R                  SS9U l
        U R
                  U R                  -  S-  U l        U R                  U l        [        R                  " U R                  U R                  5      U l        U R                  S[         R"                  " U R                  5      R%                  S5      SS9  g )Nvalid)in_channelsout_channelskernel_sizestridepadding   position_idsr   F
persistent)super__init__rO   hidden_size	embed_dim
image_size
patch_sizer   Conv2dnum_channelspatch_embeddingnum_patchesnum_positions	Embeddingposition_embeddingregister_bufferr+   arangeexpandrJ   rO   	__class__s     r1   r^   SiglipVisionEmbeddings.__init__u   s    ++ ++ ++!yy++?? 
 !OOt>1D!--"$,,t/A/A4>>"R^U\\$:L:L-M-T-TU\-]jopr0   
embeddingsheightwidthr>   c                    UR                   S   nU R                  R                  R                   S   n[        R                  R                  5       (       d%  XE:X  a   X#:X  a  U R                  U R                  5      $ U R                  R                  R                  S5      nUR                   S   nX R                  -  nX0R                  -  n	[        US-  5      n
UR                  SXU5      nUR                  SSSS5      n[        R                  R                  UX4SSS	9nUR                  SSSS5      R                  SSU5      nU$ )
a  
This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
images. This method is also adapted to support torch.jit tracing and no class embeddings.

Adapted from:
- https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
- https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
r   r   rZ   g      ?r   rW   bicubicF)sizemodealign_corners)shaperi   weightr+   jit
is_tracingrX   	unsqueezerb   r   reshapepermuter   
functionalinterpolateview)rJ   rp   rq   rr   rf   rg   patch_pos_embeddim
new_height	new_widthsqrt_num_positionss              r1   interpolate_pos_encoding/SiglipVisionEmbeddings.interpolate_pos_encoding   s:    !&&q)//66<<Q? yy##%%+*F6?**4+<+<==1188BB1Er".
__,	&}c'9:)11!5G]`a)11!Q1=--33(	 4 
 *11!Q1=BB1b#Nr0   pixel_valuesc                 X   UR                   u    p4nU R                  R                  R                  nU R                  UR	                  US95      nUR                  S5      R                  SS5      nU(       a  XR                  XU5      -   nU$ XR                  U R                  5      -   nU$ )N)dtyperW   r   )
rx   re   ry   r   toflatten	transposer   ri   rX   )	rJ   r   r   _rq   rr   target_dtypepatch_embedsrp   s	            r1   forwardSiglipVisionEmbeddings.forward   s    *001e++2288++LOO,O,OP!))!,66q!<
##&C&CJX]&^^J  $&=&=d>O>O&PPJr0   )rO   r`   ra   rf   rg   re   rb   ri   F)r&   r'   r(   r)   r   r^   r+   Tensorintr   r,   r   r/   __classcell__rn   s   @r1   rM   rM   t   se    q1 q($5<< $ $UX $]b]i]i $L
E$5$5 
Z_ZfZf 
 
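# Quick shape sketch (hypothetical helper, not part of the original file): with a
# base-style configuration (image_size=224, patch_size=16, hidden_size=768), the
# Conv2d patchifier turns one image into a (1, 196, 768) sequence, since
# (224 // 16) ** 2 == 196 non-overlapping patches.
def _demo_vision_embedding_shapes():
    config = SiglipVisionConfig(image_size=224, patch_size=16, hidden_size=768)
    embeddings = SiglipVisionEmbeddings(config)
    pixel_values = torch.randn(1, 3, 224, 224)
    out = embeddings(pixel_values)
    assert embeddings.num_patches == (224 // 16) ** 2 == 196
    assert out.shape == (1, 196, 768)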
class SiglipTextEmbeddings(nn.Module):
    def __init__(self, config: SiglipTextConfig):
        super().__init__()
        embed_dim = config.hidden_size

        self.token_embedding = nn.Embedding(config.vocab_size, embed_dim)
        self.position_embedding = nn.Embedding(config.max_position_embeddings, embed_dim)

        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )

    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        position_ids: torch.LongTensor | None = None,
        inputs_embeds: torch.FloatTensor | None = None,
    ) -> torch.Tensor:
        seq_length = input_ids.shape[-1] if input_ids is not None else inputs_embeds.shape[-2]
        max_position_embedding = self.position_embedding.weight.shape[0]

        if seq_length > max_position_embedding:
            raise ValueError(
                f"Sequence length must be less than max_position_embeddings (got `sequence length`: "
                f"{seq_length} and max_position_embeddings: {max_position_embedding})"
            )

        if position_ids is None:
            position_ids = self.position_ids[:, :seq_length]

        if inputs_embeds is None:
            inputs_embeds = self.token_embedding(input_ids)

        position_embeddings = self.position_embedding(position_ids)
        embeddings = inputs_embeds + position_embeddings

        return embeddings


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: torch.Tensor | None,
    scaling: float,
    dropout: float = 0.0,
    **kwargs,
):
    attn_weights = torch.matmul(query, key.transpose(2, 3)) * scaling
    if attention_mask is not None:
        attn_weights = attn_weights + attention_mask

    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)

    attn_output = torch.matmul(attn_weights, value)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights
          ^  \ rS rSrSrU 4S jr S
S\R                  S\R                  S-  S\\R                  \R                  S-  4   4S jjr	S	r
U =r$ )SiglipAttention   z=Multi-headed attention from 'Attention Is All You Need' paperc                    > [         TU ]  5         Xl        UR                  U l        UR
                  U l        U R                  U R                  -  U l        U R                  U R                  -  U R                  :w  a&  [        SU R                   SU R                   S35      eU R                  S-  U l	        UR                  U l        SU l        [        R                  " U R                  U R                  5      U l        [        R                  " U R                  U R                  5      U l        [        R                  " U R                  U R                  5      U l        [        R                  " U R                  U R                  5      U l        g )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).      F)r]   r^   rO   r_   r`   num_attention_heads	num_headshead_dimr   scaleattention_dropoutr   	is_causalr   Lineark_projv_projq_projout_projrm   s     r1   r^   SiglipAttention.__init__   s   ++33$..8==4>>)T^^;MdnnM] ^NN#2'  ]]D(
//ii?ii?ii?		$..$..Ar0   Nr#   r   r>   c                    UR                   SS n/ UQSPU R                  P7nU R                  U5      R                  U5      R	                  SS5      nU R                  U5      R                  U5      R	                  SS5      nU R                  U5      R                  U5      R	                  SS5      n[        R                  " U R                  R                  [        5      n	U	" U UUUUU R                  U R                  U R                  (       d  SOU R                  S9u  pU
R                   " / UQSP76 R#                  5       n
U R%                  U
5      n
X4$ )z#Input shape: Batch x Time x ChannelNrZ   r   rW           )r   r   r   )rx   r   r   r   r   r   r   r   get_interfacerO   _attn_implementationr   r   r   r   r   r}   r   r   )rJ   r#   r   r   input_shapehidden_shapequerieskeysrH   attention_interfacer   r   s               r1   r   SiglipAttention.forward  s6    $))#2.88b8$--8++m,11,?II!QO{{=)..|<FFq!L]+00>HHAN(?(M(MKK,,.E)
 %8nnJJ#}}C$,,	%
! "));;;;FFHmmK0((r0   )rO   r   r`   r   r   r   r   r   r   r   r   rA   )r&   r'   r(   r)   r*   r^   r+   r   r.   r   r/   r   r   s   @r1   r   r      s[    GB. /3!)||!) t+!)
 
u||U\\D00	1!) !)r0   r   c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )	SiglipMLPi8  c                   > [         TU ]  5         Xl        [        UR                     U l        [        R                  " UR                  UR                  5      U l
        [        R                  " UR                  UR                  5      U l        g rA   )r]   r^   rO   r	   
hidden_actactivation_fnr   r   r_   intermediate_sizefc1fc2rm   s     r1   r^   SiglipMLP.__init__9  sb    #F$5$5699V//1I1IJ99V55v7I7IJr0   r#   r>   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ rA   )r   r   r   )rJ   r#   s     r1   r   SiglipMLP.forward@  s4    /**=9/r0   )r   rO   r   r   )
r&   r'   r(   r)   r^   r+   r   r   r/   r   r   s   @r1   r   r   8  s)    KU\\ ell  r0   r   c            	          ^  \ rS rSrS\\-  4U 4S jjr\S\R                  S\R                  S\
\   S\R                  4S j5       rS	rU =r$ )
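# Two small sanity sketches (hypothetical helpers, not part of the original file).
# The first exercises `eager_attention_forward` with random tensors; note that it
# already transposes the heads back, so the caller only reshapes
# (batch, seq, heads, head_dim) into (batch, seq, hidden). The second checks that
# `SiglipMLP` is shape-preserving (hidden -> intermediate -> hidden), which is what
# lets the encoder layer add it back onto the residual stream.
def _demo_eager_attention_shapes():
    batch, heads, seq, head_dim = 2, 4, 7, 16
    q, k, v = (torch.randn(batch, heads, seq, head_dim) for _ in range(3))
    out, weights = eager_attention_forward(nn.Module(), q, k, v, None, scaling=head_dim**-0.5)
    assert out.shape == (batch, seq, heads, head_dim)
    assert weights.shape == (batch, heads, seq, seq)


def _demo_mlp_shapes():
    config = SiglipVisionConfig(hidden_size=32, intermediate_size=64)
    mlp = SiglipMLP(config)
    hidden = torch.randn(2, 5, config.hidden_size)
    assert mlp(hidden).shape == hidden.shape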
class SiglipEncoderLayer(GradientCheckpointingLayer):
    def __init__(self, config: SiglipVisionConfig | SiglipTextConfig):
        super().__init__()
        self.embed_dim = config.hidden_size
        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
        self.self_attn = SiglipAttention(config)
        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
        self.mlp = SiglipMLP(config)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
        **kwargs: Unpack[TransformersKwargs],
    ) -> torch.Tensor:
        residual = hidden_states
        hidden_states = self.layer_norm1(hidden_states)
        hidden_states, _ = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            **kwargs,
        )
        hidden_states = residual + hidden_states

        residual = hidden_states
        hidden_states = self.layer_norm2(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states

        return hidden_states


@auto_docstring
class SiglipPreTrainedModel(PreTrainedModel):
    config: SiglipConfig
    base_model_prefix = "siglip"
    input_modalities = ("image", "text")
    supports_gradient_checkpointing = True

    _no_split_modules = [
        "SiglipTextEmbeddings",
        "SiglipVisionEmbeddings",
        "SiglipEncoderLayer",
        "SiglipMultiheadAttentionPoolingHead",
    ]
    _supports_flash_attn = True
    _supports_sdpa = True
    _supports_flex_attn = True
    _supports_attention_backend = True
    _can_record_outputs = {
        "hidden_states": SiglipEncoderLayer,
        "attentions": SiglipAttention,
    }

    @torch.no_grad()
    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, SiglipVisionEmbeddings):
            width = (
                self.config.vision_config.hidden_size
                if isinstance(self.config, SiglipConfig)
                else self.config.hidden_size
            )
            init.normal_(module.position_embedding.weight, std=1 / np.sqrt(width))
            if hasattr(module, "position_ids"):
                init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
        elif isinstance(module, nn.Embedding):
            init.default_flax_embed_init_(module.weight)
        elif isinstance(module, SiglipAttention):
            init.xavier_uniform_(module.q_proj.weight)
            init.xavier_uniform_(module.k_proj.weight)
            init.xavier_uniform_(module.v_proj.weight)
            init.xavier_uniform_(module.out_proj.weight)
            init.zeros_(module.q_proj.bias)
            init.zeros_(module.k_proj.bias)
            init.zeros_(module.v_proj.bias)
            init.zeros_(module.out_proj.bias)
        elif isinstance(module, SiglipMLP):
            init.xavier_uniform_(module.fc1.weight)
            init.xavier_uniform_(module.fc2.weight)
            init.normal_(module.fc1.bias, std=1e-6)
            init.normal_(module.fc2.bias, std=1e-6)
        elif isinstance(module, SiglipMultiheadAttentionPoolingHead):
            init.xavier_uniform_(module.probe)
            init.xavier_uniform_(module.attention.in_proj_weight)
            init.zeros_(module.attention.in_proj_bias)
        elif isinstance(module, SiglipModel):
            init.zeros_(module.logit_scale)
            init.zeros_(module.logit_bias)
        elif isinstance(module, SiglipForImageClassification):
            init.normal_(
                module.classifier.weight,
                std=self.config.vision_config.hidden_size**-0.5 * self.config.initializer_factor,
            )
        elif isinstance(module, (nn.Linear, nn.Conv2d)):
            init.lecun_normal_(module.weight)
            if module.bias is not None:
                init.zeros_(module.bias)
        elif isinstance(module, nn.LayerNorm):
            init.zeros_(module.bias)
            init.ones_(module.weight)
        elif isinstance(module, SiglipTextEmbeddings):
            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
class SiglipEncoder(nn.Module):
    """
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`SiglipEncoderLayer`].

    Args:
        config: SiglipConfig
    """

    def __init__(self, config: SiglipConfig):
        super().__init__()
        self.config = config
        self.layers = nn.ModuleList([SiglipEncoderLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    @can_return_tuple
    def forward(
        self,
        inputs_embeds,
        attention_mask: torch.Tensor | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> BaseModelOutput:
        hidden_states = inputs_embeds
        for encoder_layer in self.layers:
            hidden_states = encoder_layer(
                hidden_states,
                attention_mask,
                **kwargs,
            )
        return BaseModelOutput(last_hidden_state=hidden_states)
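# End-to-end encoder sketch on a tiny config (hypothetical helper, not part of the
# original file). It assumes the default eager attention path is selected when no
# explicit attention implementation is configured; the stack of SiglipEncoderLayer
# blocks keeps the (batch, seq, hidden) shape unchanged.
def _demo_encoder_stack():
    config = SiglipVisionConfig(hidden_size=32, intermediate_size=64, num_hidden_layers=2, num_attention_heads=4)
    encoder = SiglipEncoder(config)
    hidden = torch.randn(1, 5, config.hidden_size)
    output = encoder(inputs_embeds=hidden, attention_mask=None)
    assert output.last_hidden_state.shape == hidden.shape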
@auto_docstring(
    custom_intro="""
    The text model from SigLIP without any head or projection on top.
    """
)
class SiglipTextModel(SiglipPreTrainedModel):
    config: SiglipTextConfig
    input_modalities = "text"
    base_model_prefix = "text_model"
    _input_embed_layer = "token_embedding"

    def __init__(self, config: SiglipTextConfig):
        super().__init__(config)
        self.config = config
        embed_dim = config.hidden_size
        self.embeddings = SiglipTextEmbeddings(config)
        self.encoder = SiglipEncoder(config)
        self.final_layer_norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
        self.head = nn.Linear(embed_dim, config.projection_size)

        # Initialize weights and apply final processing
        self.post_init()

    @merge_with_config_defaults
    @capture_outputs(tie_last_hidden_states=False)
    @auto_docstring
    def forward(
        self,
        input_ids: torch.Tensor | None = None,
        attention_mask: torch.Tensor | None = None,
        position_ids: torch.Tensor | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> BaseModelOutputWithPooling:
        r"""
        Examples:

        ```python
        >>> from transformers import AutoTokenizer, SiglipTextModel

        >>> model = SiglipTextModel.from_pretrained("google/siglip-base-patch16-224")
        >>> tokenizer = AutoTokenizer.from_pretrained("google/siglip-base-patch16-224")

        >>> # important: make sure to set padding="max_length" as that's how the model was trained
        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding="max_length", return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled (EOS token) states
        ```"""
        if input_ids is None:
            raise ValueError("You have to specify input_ids")

        input_shape = input_ids.size()
        input_ids = input_ids.view(-1, input_shape[-1])

        hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids)

        encoder_attention_mask = create_bidirectional_mask(
            config=self.config,
            input_embeds=hidden_states,
            attention_mask=attention_mask,
        )

        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            attention_mask=encoder_attention_mask,
            **kwargs,
        )

        last_hidden_state = encoder_outputs.last_hidden_state
        last_hidden_state = self.final_layer_norm(last_hidden_state)

        # pool on the last token's representation
        pooled_output = last_hidden_state[:, -1, :]
        pooled_output = self.head(pooled_output)

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
        )
@auto_docstring(
    custom_intro="""
    The vision model from SigLIP without any head or projection on top.
    """
)
class SiglipVisionModel(SiglipPreTrainedModel):
    config: SiglipVisionConfig
    main_input_name = "pixel_values"
    input_modalities = "image"
    base_model_prefix = "vision_model"
    _input_embed_layer = "patch_embedding"

    def __init__(self, config: SiglipVisionConfig):
        super().__init__(config)
        self.config = config
        self.embeddings = SiglipVisionEmbeddings(config)
        self.encoder = SiglipEncoder(config)
        self.post_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.use_head = True if not hasattr(config, "vision_use_head") else config.vision_use_head
        if self.use_head:
            self.head = SiglipMultiheadAttentionPoolingHead(config)

        # Initialize weights and apply final processing
        self.post_init()

    @merge_with_config_defaults
    @capture_outputs(tie_last_hidden_states=False)
    @auto_docstring
    def forward(
        self,
        pixel_values: torch.FloatTensor,
        interpolate_pos_encoding: bool = False,
        **kwargs: Unpack[TransformersKwargs],
    ) -> BaseModelOutputWithPooling:
        r"""
        Examples:

        ```python
        >>> import httpx
        >>> from io import BytesIO
        >>> from PIL import Image
        >>> from transformers import AutoProcessor, SiglipVisionModel

        >>> model = SiglipVisionModel.from_pretrained("google/siglip-base-patch16-224")
        >>> processor = AutoProcessor.from_pretrained("google/siglip-base-patch16-224")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled features
        ```"""
        hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding)

        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            **kwargs,
        )

        last_hidden_state = encoder_outputs.last_hidden_state
        last_hidden_state = self.post_layernorm(last_hidden_state)

        pooler_output = self.head(last_hidden_state) if self.use_head else None

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooler_output,
        )


class SiglipMultiheadAttentionPoolingHead(nn.Module):
    """Multihead Attention Pooling."""

    def __init__(self, config: SiglipVisionConfig):
        super().__init__()

        self.probe = nn.Parameter(torch.randn(1, 1, config.hidden_size))
        self.attention = torch.nn.MultiheadAttention(config.hidden_size, config.num_attention_heads, batch_first=True)
        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.mlp = SiglipMLP(config)

    def forward(self, hidden_state):
        batch_size = hidden_state.shape[0]
        probe = self.probe.repeat(batch_size, 1, 1)

        hidden_state = self.attention(probe, hidden_state, hidden_state)[0]

        residual = hidden_state
        hidden_state = self.layernorm(hidden_state)
        hidden_state = residual + self.mlp(hidden_state)

        return hidden_state[:, 0]


@auto_docstring
class SiglipModel(SiglipPreTrainedModel):
    config: SiglipConfig

    def __init__(self, config: SiglipConfig):
        super().__init__(config)

        text_config = config.text_config
        vision_config = config.vision_config

        # First, initialize the text and vision models with proper attention implementation
        self.text_model = SiglipTextModel._from_config(text_config)
        self.vision_model = SiglipVisionModel._from_config(vision_config)

        self.logit_scale = nn.Parameter(torch.randn(1))
        self.logit_bias = nn.Parameter(torch.randn(1))

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.text_model.embeddings.token_embedding

    def set_input_embeddings(self, value: nn.Module):
        self.text_model.embeddings.token_embedding = value

    @can_return_tuple
    @auto_docstring
    def get_text_features(
        self,
        input_ids: torch.Tensor | None = None,
        attention_mask: torch.Tensor | None = None,
        position_ids: torch.Tensor | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> torch.FloatTensor:
        r"""
        Examples:

        ```python
        >>> from transformers import AutoTokenizer, AutoModel
        >>> import torch

        >>> model = AutoModel.from_pretrained("google/siglip-base-patch16-224")
        >>> tokenizer = AutoTokenizer.from_pretrained("google/siglip-base-patch16-224")

        >>> # important: make sure to set padding="max_length" as that's how the model was trained
        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding="max_length", return_tensors="pt")
        >>> with torch.no_grad():
        ...     text_features = model.get_text_features(**inputs)
        ```"""
        text_outputs: BaseModelOutputWithPooling = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            **kwargs,
        )
        return text_outputs.pooler_output

    @can_return_tuple
    @auto_docstring
    def get_image_features(
        self,
        pixel_values: torch.FloatTensor,
        interpolate_pos_encoding: bool = False,
        **kwargs: Unpack[TransformersKwargs],
    ) -> torch.FloatTensor:
        r"""
        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, AutoModel
        >>> from transformers.image_utils import load_image

        >>> model = AutoModel.from_pretrained("google/siglip-base-patch16-224")
        >>> processor = AutoProcessor.from_pretrained("google/siglip-base-patch16-224")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = load_image(url)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> with torch.no_grad():
        ...     image_features = model.get_image_features(**inputs)
        ```"""
        vision_outputs: BaseModelOutputWithPooling = self.vision_model(
            pixel_values=pixel_values,
            interpolate_pos_encoding=interpolate_pos_encoding,
            **kwargs,
        )
        return vision_outputs.pooler_output

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        pixel_values: torch.FloatTensor | None = None,
        attention_mask: torch.Tensor | None = None,
        position_ids: torch.LongTensor | None = None,
        return_loss: bool | None = None,
        interpolate_pos_encoding: bool = False,
        **kwargs: Unpack[TransformersKwargs],
    ) -> SiglipOutput:
        r"""
        return_loss (`bool`, *optional*):
            Whether or not to return the contrastive loss.

        Examples:

        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, AutoModel
        >>> import torch

        >>> model = AutoModel.from_pretrained("google/siglip-base-patch16-224")
        >>> processor = AutoProcessor.from_pretrained("google/siglip-base-patch16-224")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> texts = ["a photo of 2 cats", "a photo of 2 dogs"]
        >>> # important: we pass `padding=max_length` since the model was trained with this
        >>> inputs = processor(text=texts, images=image, padding="max_length", return_tensors="pt")

        >>> with torch.no_grad():
        ...     outputs = model(**inputs)

        >>> logits_per_image = outputs.logits_per_image
        >>> probs = torch.sigmoid(logits_per_image) # these are the probabilities
        >>> print(f"{probs[0][0]:.1%} that image 0 is '{texts[0]}'")
        31.9% that image 0 is 'a photo of 2 cats'
        ```"""
        vision_outputs: BaseModelOutputWithPooling = self.vision_model(
            pixel_values=pixel_values,
            interpolate_pos_encoding=interpolate_pos_encoding,
            **kwargs,
        )

        text_outputs: BaseModelOutputWithPooling = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            **kwargs,
        )

        image_embeds = vision_outputs.pooler_output
        text_embeds = text_outputs.pooler_output

        # normalized features
        image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True)
        text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)

        # cosine similarity as logits
        logits_per_text = torch.matmul(text_embeds, image_embeds.t().to(text_embeds.device))

        logit_scale, logit_bias = self.logit_scale.to(text_embeds.device), self.logit_bias.to(text_embeds.device)
        logits_per_text = logits_per_text * logit_scale.exp() + logit_bias

        logits_per_image = logits_per_text.t()

        loss = None
        if return_loss:
            # pairwise sigmoid loss: +1 labels on the diagonal, -1 everywhere else
            eye = torch.eye(logits_per_text.size(0), device=logits_per_text.device)
            m1_diag1 = -torch.ones_like(logits_per_text) + 2 * eye
            loglik = torch.nn.functional.logsigmoid(m1_diag1 * logits_per_text)
            nll = -torch.sum(loglik, dim=-1)
            loss = nll.mean()

        return SiglipOutput(
            loss=loss,
            logits_per_image=logits_per_image,
            logits_per_text=logits_per_text,
            text_embeds=text_embeds,
            image_embeds=image_embeds,
            text_model_output=text_outputs,
            vision_model_output=vision_outputs,
        )
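# Standalone sketch of the pairwise sigmoid loss used in `SiglipModel.forward`
# (hypothetical helper, not part of the original file): matching image/text pairs
# sit on the diagonal and get label +1, every other pair gets label -1, and the
# loss is the mean negative log-sigmoid of label * logit.
def _demo_sigmoid_loss():
    logits_per_text = torch.randn(3, 3)  # stand-in for the scaled similarity logits
    eye = torch.eye(logits_per_text.size(0))
    m1_diag1 = -torch.ones_like(logits_per_text) + 2 * eye
    loglik = torch.nn.functional.logsigmoid(m1_diag1 * logits_per_text)
    return -torch.sum(loglik, dim=-1).mean()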
@auto_docstring(
    custom_intro="""
    SigLIP vision encoder with an image classification head on top (a linear layer on top of the pooled final hidden states of
    the patch tokens) e.g. for ImageNet.
    """
)
class SiglipForImageClassification(SiglipPreTrainedModel):
    main_input_name = "pixel_values"
    input_modalities = "image"

    def __init__(self, config: SiglipConfig) -> None:
        super().__init__(config)

        self.num_labels = config.num_labels
        self.vision_model = SiglipVisionModel._from_config(config.vision_config)

        # Classifier head
        self.classifier = (
            nn.Linear(config.vision_config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity()
        )

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.vision_model.embeddings.patch_embedding

    def set_input_embeddings(self, value: nn.Module):
        self.vision_model.embeddings.patch_embedding = value

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        pixel_values: torch.Tensor | None = None,
        labels: torch.Tensor | None = None,
        interpolate_pos_encoding: bool = False,
        **kwargs: Unpack[TransformersKwargs],
    ) -> ImageClassifierOutput:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, SiglipForImageClassification
        >>> import torch
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO

        >>> torch.manual_seed(3)  # doctest: +IGNORE_RESULT
        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> # note: we are loading a `SiglipModel` from the hub here,
        >>> # so the head will be randomly initialized, hence the predictions will be random if seed is not set above.
        >>> image_processor = AutoImageProcessor.from_pretrained("google/siglip-base-patch16-224")
        >>> model = SiglipForImageClassification.from_pretrained("google/siglip-base-patch16-224")

        >>> inputs = image_processor(images=image, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> logits = outputs.logits
        >>> # model predicts one of the two classes
        >>> predicted_class_idx = logits.argmax(-1).item()
        >>> print("Predicted class:", model.config.id2label[predicted_class_idx])
        Predicted class: LABEL_1
        ```"""
        outputs: BaseModelOutputWithPooling = self.vision_model(
            pixel_values,
            interpolate_pos_encoding=interpolate_pos_encoding,
            **kwargs,
        )

        sequence_output = outputs.last_hidden_state

        # average pool the patch tokens
        sequence_output = torch.mean(sequence_output, dim=1)
        # apply classifier
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            loss = self.loss_function(labels, logits, self.config)

        return ImageClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


__all__ = [
    "SiglipModel",
    "SiglipPreTrainedModel",
    "SiglipTextModel",
    "SiglipVisionModel",
    "SiglipForImageClassification",
]