
    Z jp                        S SK r S SKJr  S SKJr  S SKJr  S SKrS SKJ	s  J
r  S SKJ	r	  SSKJr  SSKJr  SS	KJr  SS
KJr  SSKJr  SSKJrJr  SSKJrJr  SSKJr  SSKJ r J!r!J"r"J#r#  SSK$J%r%  SSK&J'r'  SSK(J)r)J*r*J+r+  \"\ " S S\ 5      5       5       r,\" S5       " S S\	RZ                  5      5       r. " S S\	RZ                  5      r/ " S S\	RZ                  5      r0 " S S\	RZ                  5      r1 S>S\	RZ                  S \Rd                  S!\Rd                  S"\Rd                  S#\Rd                  S-  S$\3S%\34S& jjr4 " S' S(\	RZ                  5      r5 " S) S*\5      r6 " S+ S,\	RZ                  5      r7 " S- S.\	RZ                  5      r8\" " S/ S0\5      5       r9\"" S1S29 " S3 S4\95      5       r:\"" S5S29 " S6 S7\95      5       r;S8\Rd                  S9\Rd                  4S: jr<\" " S; S<\95      5       r=/ S=Qr>g)?    N)Callable)	dataclass)Any)nn   )initialization)ACT2FN)use_kernel_forward_from_hub)create_causal_mask)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPooling)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)ModelOutputTransformersKwargsauto_docstringcan_return_tuple)merge_with_config_defaults)capture_outputs   )Aimv2ConfigAimv2TextConfigAimv2VisionConfigc                      \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Sr
\R                  S-  \S'   Sr\R                  S-  \S'   Sr\R                  S-  \S'   Sr\\S	'   Sr\\S
'   S\\   4S jrSrg)Aimv2Output-   ai  
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
    Contrastive loss for image-text similarity.
logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
    The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
    similarity scores.
logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
    The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
    similarity scores.
text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
    The text embeddings obtained by applying the projection layer to the pooled output of [`Aimv2TextModel`].
image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
    The image embeddings obtained by applying the projection layer to the pooled output of [`Aimv2VisionModel`].
text_model_output (`BaseModelOutputWithPooling`):
    The output of the [`Aimv2TextModel`].
vision_model_output (`BaseModelOutputWithPooling`):
    The output of the [`Aimv2VisionModel`].
Nlosslogits_per_imagelogits_per_texttext_embedsimage_embedstext_model_outputvision_model_outputreturnc                 B    [        S U R                  5        5       5      $ )Nc              3   p   #    U  H,  n[        U[        5      (       a  UR                  5       OUv   M.     g 7fN)
isinstancer   to_tuple).0vs     y/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/aimv2/modeling_aimv2.py	<genexpr>'Aimv2Output.to_tuple.<locals>.<genexpr>L   s)     ^P]1Z;%?%?QZZ\QFP]s   46)tuplevaluesselfs    r.   r+   Aimv2Output.to_tupleK   s    ^PTP[P[P]^^^     )__name__
__module____qualname____firstlineno____doc__r   torchFloatTensor__annotations__r    r!   r"   r#   r$   r   r%   r1   r   r+   __static_attributes__r7   r6   r.   r   r   -   s    & &*D%

d
")15e''$.504OU&&-4,0K""T)0-1L%##d*148186:3:_%* _r6   r   RMSNormc                   x   ^  \ rS rSrS
S\SS4U 4S jjjrS\R                  S\R                  4S jrS r	S	r
U =r$ )Aimv2RMSNormO   epsr&   Nc                    > [         TU ]  5         [        R                  " [        R
                  " U5      5      U l        X l        g)z+
Aimv2RMSNorm is equivalent to T5LayerNorm
N)super__init__r   	Parameterr=   onesweightvariance_epsilon)r4   hidden_sizerE   	__class__s      r.   rH   Aimv2RMSNorm.__init__Q   s/     	ll5::k#:; #r6   hidden_statesc                    UR                   nUR                  [        R                  5      nUR	                  S5      R                  SSS9nU[        R                  " X0R                  -   5      -  nU R                  UR                  U5      -  $ )N   T)keepdim)	dtypetor=   float32powmeanrsqrtrL   rK   )r4   rP   input_dtypevariances       r.   forwardAimv2RMSNorm.forwardY   sw    #))%((7 $$Q',,R,>%H?T?T4T(UU{{]--k:::r6   c                 ^    [        U R                  R                  5       SU R                   3$ )Nz, eps=)r1   rK   shaperL   r3   s    r.   
extra_reprAimv2RMSNorm.extra_repr`   s*    ))*+6$2G2G1HIIr6   )rL   rK   )gư>)r8   r9   r:   r;   floatrH   r=   Tensorr]   ra   r@   __classcell__rN   s   @r.   rC   rC   O   sB    $ $$ $ $;U\\ ;ell ;J Jr6   rC   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )Aimv2MLPd   c                   > [         TU ]  5         Xl        UR                  U l        UR                  U l        [
        R                  " U R                  U R                  UR                  S9U l        [
        R                  " U R                  U R                  UR                  S9U l	        [
        R                  " U R                  U R                  UR                  S9U l
        [        UR                     U l        g )Nbias)rG   rH   configrM   intermediate_sizer   Linearmlp_bias	gate_projup_proj	down_projr	   
hidden_actact_fnr4   rm   rN   s     r.   rH   Aimv2MLP.__init__e   s    !--!'!9!94#3#3T5K5KRXRaRabyy!1!143I3IPVP_P_`4#9#94;K;KRXRaRabV../r6   c                     U R                  U R                  U R                  U5      5      U R                  U5      -  5      nU$ r)   )rs   ru   rq   rr   )r4   xrs   s      r.   r]   Aimv2MLP.forwardo   s6    NN4;;t~~a/@#ADLLQRO#ST	r6   )ru   rm   rs   rq   rM   rn   rr   )r8   r9   r:   r;   rH   r]   r@   re   rf   s   @r.   rh   rh   d   s    0 r6   rh   c                      ^  \ rS rSrS\4U 4S jjr\SSS\R                  4S\R                  4S jj5       r
S	\R                  S\R                  4S
 jrSrU =r$ )Aimv2VisionEmbeddingst   rm   c                 B  > [         TU ]  5         Xl        UR                  U l        [        R
                  " UR                  UR                  UR                  UR                  S9U l        [        UR                  UR                  5      U l        UR                  UR                  -  S-  nU R                  R                  (       d%  [        R                  " X!R                  5      U l        U R!                  S["        R$                  " U5      R'                  S5      SS9  g )N)kernel_sizestriderR   position_idsr   rS   F
persistent)rG   rH   rm   
patch_sizer   Conv2dnum_channelsrM   patch_embedrC   rms_norm_epsrms_norm
image_size	is_native	Embeddingposition_embeddingregister_bufferr=   arangeexpand)r4   rm   num_patchesrN   s      r.   rH   Aimv2VisionEmbeddings.__init__u   s     ++99!3!3ARAR[a[l[l
 %V%7%79L9LM((F,=,==!C{{$$&(ll;@R@R&SD#^U\\+-F-M-Mg-Vchir6      g     @cpur&   c                    [         R                  " [        U5      XTS9n[         R                  " [        U 5      XTS9n[         R                  " XgSS9u  pvUS-  n[         R                  " XUS9U-  n	SX9-  -  n	UR	                  5       S   U	S S S 24   -  n
UR	                  5       S   U	S S S 24   -  n[         R
                  " U
R                  5       U
R                  5       UR                  5       UR                  5       /SS9S S S 2S S 24   $ )	NrU   devicexy)indexing   g      ?).Nr   dim)r=   r   intmeshgridflattenconcatsincos)heightwidth	embed_dimtemperaturer   rU   grid_wgrid_hpos_dimomegaout_hout_ws               r.   "build_2d_sincos_position_embedding8Aimv2VisionEmbeddings.build_2d_sincos_position_embedding   s     c%jEc&kFFq.W&AGK{)* +eD!Gn< +eD!Gn<||UYY[%))+uyy{EIIKPVWXY]_`bcYcddr6   pixel_valuesc                    UR                  5       u    p#nU R                  U5      R                  S5      R                  SS5      nU R	                  U5      nU R
                  R                  (       aT  U R                  X0R                  -  X@R                  -  U R
                  R                  UR                  UR                  S9nOU R                  U R                  5      nXV-   nU$ )NrR   r   )r   r   rU   )sizer   r   	transposer   rm   r   r   r   rM   r   rU   r   r   )r4   r   _r   r   rP   	pos_embeds          r.   r]   Aimv2VisionEmbeddings.forward   s    *//11e((6>>qAKKAqQm4;;  ??//)(++11$++#)) @ I //0A0ABI%1r6   )rm   r   r   r   r   )r8   r9   r:   r;   r   rH   staticmethodr=   rW   rd   r   r]   r@   re   rf   s   @r.   r|   r|   t   sb    j0 j !$'%u}}e	e e ELL U\\  r6   r|   c            	          ^  \ rS rSrS\4U 4S jjr   SS\R                  S-  S\R                  S-  S\R                  S-  S\R                  4S	 jjr
S
rU =r$ )Aimv2TextEmbeddings   rm   c                 N  > [         TU ]  5         UR                  n[        R                  " UR
                  U5      U l        [        R                  " UR                  U5      U l        U R                  S[        R                  " UR                  5      R                  S5      SS9  g )Nr   r   Fr   )rG   rH   rM   r   r   
vocab_sizetoken_embeddingmax_position_embeddingsr   r   r=   r   r   )r4   rm   r   rN   s      r.   rH   Aimv2TextEmbeddings.__init__   s    &&	!||F,=,=yI"$,,v/M/My"Y 	ELL)G)GHOOPWXej 	 	
r6   N	input_idsr   inputs_embedsr&   c                 <   Ub  UR                   S   OUR                   S   nU R                  R                  R                   S   nXE:  a  [        SU SU 35      eUc  U R                  S S 2S U24   nUc  U R                  U5      nU R                  U5      nX6-   nU$ )NrS   r   zRSequence length must be less than max_position_embeddings (got `sequence length`: z and max_position_embeddings: )r`   r   rK   
ValueErrorr   r   )r4   r   r   r   
seq_lengthmax_position_embeddingposition_embeddings
embeddingss           r.   r]   Aimv2TextEmbeddings.forward   s     -6,AY__R(}GZGZ[]G^
!%!8!8!?!?!E!Ea!H.d,<=S<TV 
 ,,Q^<L  00;M"55lC"8
r6   )r   r   NNN)r8   r9   r:   r;   r   rH   r=   
LongTensorr>   rd   r]   r@   re   rf   s   @r.   r   r      so    

 

 .20426	##d* &&- ((4/	
 
 r6   r   modulequerykeyvalueattention_maskscalingdropoutc                    [         R                  " XR                  SS5      5      U-  nUb  X-   n[        R                  R                  US[         R                  S9R                  UR                  5      n[        R                  R                  XU R                  S9n[         R                  " X5      n	U	R                  SS5      R                  5       n	X4$ )NrS   r   )r   rU   )ptrainingr   rR   )r=   matmulr   r   
functionalsoftmaxrW   rV   rU   r   r   
contiguous)
r   r   r   r   r   r   r   kwargsattn_weightsattn_outputs
             r.   eager_attention_forwardr      s     <<}}R'<=GL!#4==((2U]](SVVW\WbWbcL==((6??([L,,|3K''1-88:K$$r6   c            
          ^  \ rS rSrSrU 4S jr S
S\R                  S\R                  S-  S\\R                  \R                  S-  4   4S jjr	S	r
U =r$ )Aimv2Attention   z=Multi-headed attention from 'Attention Is All You Need' paperc                 h  > [         TU ]  5         Xl        UR                  U l        UR
                  U l        U R                  U R                  -  U l        U R                  U R                  -  U R                  :w  a&  [        SU R                   SU R                   S35      eU R                  S-  U l	        UR                  U l        SU l        [        R                  " U R                  U R                  UR                  S9U l        [        R                  " U R                  U R                  UR                  S9U l        [        R                  " U R                  U R                  UR                  S9U l        [        R                  " U R                  U R                  UR                  S9U l        g )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      Frk   )rG   rH   rm   rM   r   num_attention_heads	num_headshead_dimr   scaleattention_dropoutr   	is_causalr   ro   qkv_biask_projv_projq_projout_projrv   s     r.   rH   Aimv2Attention.__init__   s0   ++33$..8==4>>)T^^;MdnnM] ^NN#2'  ]]D(
//iiV__UiiV__UiiV__U		$..$..vWr6   NrP   r   r&   c                    UR                   SS n/ UQSPU R                  P7nU R                  U5      R                  U5      R	                  SS5      nU R                  U5      R                  U5      R	                  SS5      nU R                  U5      R                  U5      R	                  SS5      n[        R                  " U R                  R                  [        5      n	U	" U UUUUU R                  U R                  U R                  (       d  SOU R                  S9u  pU
R                   " / UQSP76 R#                  5       n
U R%                  U
5      n
X4$ )z#Input shape: Batch x Time x ChannelNrS   r   rR           )r   r   r   )r`   r   r   viewr   r   r   r   get_interfacerm   _attn_implementationr   r   r   r   r   reshaper   r   )r4   rP   r   r   input_shapehidden_shapequerieskeysr2   attention_interfacer   r   s               r.   r]   Aimv2Attention.forward   s6    $))#2.88b8$--8++m,11,?II!QO{{=)..|<FFq!L]+00>HHAN(?(M(MKK,,.E)
 %8nnJJ#}}C$,,	%
! "));;;;FFHmmK0((r6   )rm   r   r   r   r   r   r   r   r   r   r   r)   )r8   r9   r:   r;   r<   rH   r=   rd   r1   r]   r@   re   rf   s   @r.   r   r      s[    GX, /3!)||!) t+!)
 
u||U\\D00	1!) !)r6   r   c            	          ^  \ rS rSrS\4U 4S jjr SS\R                  S\R                  S-  S\\	   S\R                  4S	 jjr
S
rU =r$ )Aimv2EncoderLayeri!  rm   c                    > [         TU ]  5         [        U5      U l        [	        U5      U l        [        UR                  UR                  5      U l	        [        UR                  UR                  5      U l
        g r)   )rG   rH   r   	attentionrh   ffnrC   rM   r   	rms_norm1	rms_norm2rv   s     r.   rH   Aimv2EncoderLayer.__init__"  sZ    '/F#%f&8&8&:M:MN%f&8&8&:M:MNr6   NrP   r   r   r&   c                     U R                  U5      nU R                  " SXBS.UD6u  pVX-   nU R                  U5      nU R                  U5      nX-   nU$ )N)rP   r   r7   )r   r   r   r   )r4   rP   r   r   norm_hidden_statesr   r   
mlp_outputs           r.   r]   Aimv2EncoderLayer.forward)  sa     "^^M:r6Hrkqr%3!^^M:XX01
%2r6   )r   r   r   r   r)   )r8   r9   r:   r;   r   rH   r=   rd   r   r   r]   r@   re   rf   s   @r.   r   r   !  s^    O0 O /3|| t+ +,	
 
 r6   r   c                   z   ^  \ rS rSrSrS\4U 4S jjr\ SS\R                  S-  S\
\   S\4S	 jj5       rS
rU =r$ )Aimv2Encoderi:  z
Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
[`Aimv2EncoderLayer`].

Args:
    config: Aimv2Config
rm   c                    > [         TU ]  5         Xl        [        R                  " [        UR                  5       Vs/ s H  n[        U5      PM     sn5      U l        SU l	        g s  snf )NF)
rG   rH   rm   r   
ModuleListrangenum_hidden_layersr   layersgradient_checkpointing)r4   rm   r   rN   s      r.   rH   Aimv2Encoder.__init__C  sS    mmfNfNfHg$hHg1%6v%>Hg$hi&+# %is   A&Nr   r   r&   c                 R    UnU R                    H  nU" UU40 UD6nM     [        US9$ )N)last_hidden_state)r  r   )r4   r   r   r   rP   encoder_layers         r.   r]   Aimv2Encoder.forwardJ  s>     &![[M) M ) ??r6   )rm   r  r  r)   )r8   r9   r:   r;   r<   r   rH   r   r=   rd   r   r   r   r]   r@   re   rf   s   @r.   r  r  :  s_    ,{ ,  /3@ t+@ +,	@
 
@ @r6   r  c                   j   ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ )Aimv2AttentionPoolingHeadi\  rm   c                   > [         TU ]  5         UR                  U l        UR                  U l        [
        R                  " U R                  U R                  UR                  S9U l        [
        R                  " U R                  U R                  UR                  S9U l	        [
        R                  " [        R                  " SSU R                  5      5      U l        [
        R                  " U R                  U R                  SS9U l        g )Nrk   r   T)rG   rH   rM   r   r   r   ro   r   r   r   rI   r=   zeros	cls_tokenoutput_projrv   s     r.   rH   "Aimv2AttentionPoolingHead.__init__]  s    !--33ii 0 0$2B2BYii 0 0$2B2BYekk!Q8H8H&IJ99T%5%5t7G7GdSr6   rP   r&   c                    UR                   u  p#nU R                  R                  USS5      nU R                  U5      R	                  X#U R
                  X@R
                  -  5      nU R                  U5      R	                  X#U R
                  X@R
                  -  5      nUR	                  USU R
                  X@R
                  -  5      nUR                  SSSS5      nUR                  SSSS5      nUR                  SSSS5      n[        R                  " XU5      n	U	R                  SS5      R	                  USU5      n	U	R                  SS9n	U R                  U	5      n
U
$ )NrS   r   r   rR   r   r   )r`   r  r   r   r   r   r   permuteFscaled_dot_product_attentionr   rY   r  )r4   rP   
batch_sizeseq_len
hidden_dimr  r   r   r   r   outputs              r.   r]   !Aimv2AttentionPoolingHead.forwardh  s8   *7*=*='
ZNN))*b"=	kk-(00dnnV`drdrVrsM*22:XbftftXtu!!*a~~A]^kk!Q1%aAq)aAq)44UG!++Aq199*aT!&&1&-!!+.r6   )r  rM   r   r   r  r   )r8   r9   r:   r;   r   rH   r=   rd   r]   r@   re   rf   s   @r.   r  r  \  s2    	T0 	TU\\ ell  r6   r  c                      ^  \ rS rSr% Sr\\S'   SrSrSr	/ SQr
SrSrSr\R                  " 5       U 4S j5       rS	rU =r$ )
Aimv2PreTrainedModeli~  z
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models. The model is only intended for inference and doesn't support finetuning.
rm   aimv2)imageT)r   r  r|   r   c                 ^  > [         TU ]  U5        [        US5      (       a`  [        UR                  [
        R                  5      (       a6  [        R                  " UR                  [        R                  " S5      5        g g [        U[        5      (       a5  [        R                  " UR                  SU R                  R                  S9  g [        U[         5      (       a\  [        R"                  " UR$                  [&        R(                  " UR$                  R*                  S   5      R-                  S5      5        g [        U[.        5      (       a\  [        R"                  " UR$                  [&        R(                  " UR$                  R*                  S   5      R-                  S5      5        g g )Nlogit_scaleg$I$I,@r   )rY   stdrS   r   )rG   _init_weightshasattrr*   r(  r   rI   init	constant_mathlogr  normal_r  rm   initializer_ranger|   copy_r   r=   r   r`   r   r   )r4   r   rN   s     r.   r*  "Aimv2PreTrainedModel._init_weights  s   f%6=))&,,bll;;v11488H3EF < 9::LL))9V9VW 566JJv**ELL9L9L9R9RSU9V,W,^,^_f,gh 344JJv**ELL9L9L9R9RSU9V,W,^,^_f,gh 5r6   r7   )r8   r9   r:   r;   r<   r   r?   base_model_prefixinput_modalitiessupports_gradient_checkpointing_no_split_modules_supports_sdpa_supports_flash_attn_supports_flex_attnr=   no_gradr*  r@   re   rf   s   @r.   r$  r$  ~  sW    
 !&*# N
]]_
i 
ir6   r$  zL
    The Vision model from AIMv2 without any head or projection on top.
    )custom_introc                      ^  \ rS rSr% \\S'   Sr\\S.r	S\4U 4S jjr
S\R                  4S jr\\" SS	9\S
\\   S\4S j5       5       5       rSrU =r$ )Aimv2VisionModeli  rm   r   rP   
attentionsc                 >  > [         TU ]  U5        Xl        [        U5      U l        [        U5      U l        [        UR                  UR                  5      U l
        UR                  U l        U R                  (       a  [        U5      U l        U R                  5         g r)   )rG   rH   rm   r|   r   r  encoderrC   rM   r   r   use_headr  head	post_initrv   s     r.   rH   Aimv2VisionModel.__init__  so     /7#F+$V%7%79L9LM==1&9DIr6   r&   c                 .    U R                   R                  $ r)   )r   r   r3   s    r.   get_input_embeddings%Aimv2VisionModel.get_input_embeddings  s    ***r6   Ftie_last_hidden_statesr   c                     U R                  U5      nU R                  " SSU0UD6nUR                  nU R                  U5      nU R                  (       a  U R                  U5      OSn[        UUS9$ )a  
Examples:

```python
>>> from PIL import Image
>>> import httpx
>>> from io import BytesIO
>>> from transformers import AutoProcessor, Siglip2VisionModel

>>> model = Aimv2VisionModel.from_pretrained("apple/aimv2-large-patch14-native")
>>> processor = AutoProcessor.from_pretrained("apple/aimv2-large-patch14-native")

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> with httpx.stream("GET", url) as response:
...     image = Image.open(BytesIO(response.read()))

>>> inputs = processor(images=image, return_tensors="pt")

>>> outputs = model(**inputs)
>>> last_hidden_state = outputs.last_hidden_state
>>> pooled_output = outputs.pooler_output  # pooled features
```r   Nr  pooler_outputr7   )r   rB  r  r   rC  rD  r   )r4   r   r   rP   encoder_outputsr  rN  s          r.   r]   Aimv2VisionModel.forward  sx    < 5+/<< ,
',
,

 ,== MM*;<8<		"344)/'
 	
r6   )rm   r   rB  rD  r   rC  )r8   r9   r:   r;   r   r?   main_input_namer   r   _can_record_outputsrH   r   ModulerH  r   r   r   r   r   r   r]   r@   re   rf   s   @r.   r>  r>    s~     $O*$
0 +bii +  E2*
 +,*
 
$	*
  3  *
r6   r>  zJ
    The text model from AIMv2 without any head or projection on top.
    c            
          ^  \ rS rSrSr\\S.rS\4U 4S jjr	S\
R                  4S jrS r\\" S	S
9\ SS\R$                  S-  S\\   S\4S jj5       5       5       rSrU =r$ )Aimv2TextModeli  r   r?  rm   c                    > [         TU ]  U5        Xl        [        U5      U l        [        U5      U l        [        UR                  UR                  5      U l
        UR                  U l        U R                  5         g r)   )rG   rH   rm   r   r   r  rB  rC   rM   r   r   eos_token_idrE  rv   s     r.   rH   Aimv2TextModel.__init__  s_     -f5#F+$V%7%79L9LM"//r6   r&   c                 .    U R                   R                  $ r)   r   r   r3   s    r.   rH  #Aimv2TextModel.get_input_embeddings  s    ...r6   c                 $    XR                   l        g r)   rZ  )r4   r   s     r.   set_input_embeddings#Aimv2TextModel.set_input_embeddings
  s    */'r6   FrJ  Nr   r   c                    U R                  U5      nUR                  u  pVn[        R                  " U[        R                  UR
                  S9nUR                  S5      R                  US5      nUb  [        U R                  UUUS S9nU R                  " S	UUS.UD6n	U	R                  n
U R                  U
5      n
U
[        R                  " U
R                  S   U
R
                  S9UR                  [        R                  U
R
                  S9U R                  :H  R                  5       R!                  SS94   n[#        U
US9$ )
Nr   r   rS   )rm   r   r   r   past_key_values)r   r   )r   r   rM  r7   )r   r`   r=   r   longr   	unsqueezer   r   rm   rB  r  r   rV   r   rW  argmaxr   )r4   r   r   r   rP   r  r  r   r   rO  r  pooled_outputs               r.   r]   Aimv2TextModel.forward  sG    	2!.!4!4
Q||G5::mFZFZ[#--a077
BG%/{{+)- $N ,, 
')
 
 ,== MM*;< *LL*003<M<T<TU\\		2C2J2J\KtO`O``eegnnsunvx

 */'
 	
r6   )rm   r   rB  rW  r   r)   )r8   r9   r:   r;   rQ  r   r   rR  r   rH   r   rS  rH  r]  r   r   r   r=   rd   r   r   r   r]   r@   re   rf   s   @r.   rU  rU    s     "O +$
	 	/bii /0  E2 /3&
 t+&
 +,	&

 
$&
  3  &
r6   rU  tensorr&   c                     [         R                  " U S5      n[         R                  " USSS9n[         R                  " US5      nU$ )z
This method is equivalent to tensor.norm(p=2, dim=-1, keepdim=True) and used to make
model `executorch` exportable. See issue https://github.com/pytorch/executorch/issues/3566
rR   rS   T)r   rT   g      ?)r=   rX   sum)rf  square_tensor
sum_tensornormed_tensors       r.   _get_vector_normrl  9  s<    
 IIfa(M=b$?JIIj#.Mr6   c                     ^  \ rS rSrSrS\4U 4S jjr\\  SS\	R                  S\	R                  S-  S\	R                  S-  S	\\   S
\\-  4
S jj5       5       r\\ SS\	R                   S\S	\\   S
\\-  4S jj5       5       r\\   SS\	R&                  S-  S\	R                   S-  S\	R                  S-  S	\\   S
\4
S jj5       5       rSrU =r$ )
Aimv2ModeliD  Trm   c                   > [         TU ]  U5        UR                  U l        UR                  R                  U l        UR                  R                  U l        [        R                  UR                  5      U l
        [        R                  UR                  5      U l        [        R                  " U R
                  U R                  SS9U l        [        R                  " U R                  U R                  SS9U l        [        R"                  " [$        R&                  " U R(                  R*                  5      5      U l        [.        R0                  " UR2                  5      U l        U R7                  5         g )NFrk   )rG   rH   projection_dimvision_configrM   vision_embed_dimtext_configtext_embed_dimr>  _from_configvision_modelrU  
text_modelr   ro   visual_projectiontext_projectionrI   r=   rf  rm   logit_scale_init_valuer(  r.  r/  max_logit_scalemax_log_logit_scalerE  rv   s     r.   rH   Aimv2Model.__init__H  s     $33 & 4 4 @ @$00<<,99&:N:NO(55f6H6HI!#4+@+@$BUBU\a!b!yy)<)<d>Q>QX]^<<T[[5W5W(XY#'88F,B,B#C r6   Nr   r   r   r   r&   c                 x    U R                   " SUUUSS.UD6nUR                  nU R                  U5      Ul        U$ )a  
Examples:

```python
>>> import torch
>>> from transformers import AutoTokenizer, Aimv2Model

>>> model = Aimv2Model.from_pretrained("openai/aimv2-vit-base-patch32")
>>> tokenizer = AutoTokenizer.from_pretrained("openai/aimv2-vit-base-patch32")

>>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")

>>> with torch.inference_mode():
...     text_features = model.get_text_features(**inputs)
```T)r   r   r   return_dictr7   )rw  rN  ry  )r4   r   r   r   r   text_outputsrd  s          r.   get_text_featuresAimv2Model.get_text_featuresZ  sS    0 48?? 4
)%	4

 4
 %22%)%9%9-%H"r6   r   interpolate_pos_encodingc                 v    U R                   " SUUSS.UD6nUR                  nU R                  U5      Ul        U$ )a&  
Examples:

```python
>>> import torch
>>> from transformers import AutoProcessor, Aimv2Model
>>> from transformers.image_utils import load_image

>>> model = Aimv2Model.from_pretrained("openai/aimv2-vit-base-patch32")
>>> processor = AutoProcessor.from_pretrained("openai/aimv2-vit-base-patch32")

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = load_image(url)

>>> inputs = processor(images=image, return_tensors="pt")

>>> with torch.inference_mode():
...     image_features = model.get_image_features(**inputs)
```T)r   r  r  r7   )rv  rN  rx  )r4   r   r  r   vision_outputsrd  s         r.   get_image_featuresAimv2Model.get_image_features~  sR    6 6:5F5F 6
%%=6
 	6
 '44'+'='=m'L$r6   c           	          U R                   " SSU0UD6nU R                  " SUUS.UD6nUR                  nU R                  U5      nUR                  nU R	                  U5      nU[        U5      -  nU[        U5      -  nU R                  R                  SU R                  5      R                  5       R                  UR                  5      n	X-  UR                  5       -  n
U
R                  5       n[        UU
UUUUS9$ )aD  
Examples:

```python
>>> from PIL import Image
>>> import httpx
>>> from io import BytesIO
>>> from transformers import AutoProcessor, Aimv2Model

>>> model = Aimv2Model.from_pretrained("apple/aimv2-large-patch14-224-lit")
>>> processor = AutoProcessor.from_pretrained("apple/aimv2-large-patch14-224-lit")

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> with httpx.stream("GET", url) as response:
...     image = Image.open(BytesIO(response.read()))

>>> inputs = processor(
...     text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True
... )

>>> outputs = model(**inputs)
>>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
>>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
```r   )r   r   r   )r    r!   r"   r#   r$   r%   r7   )rv  rw  rN  rx  ry  rl  r(  clampr|  exprV   r   tr   )r4   r   r   r   r   r  r  r#   r"   r(  r!   r    s               r.   r]   Aimv2Model.forward  s   B 6:5F5F 6
%6
6

 48?? 4
)4
 4
 &33--l;"00**;7 $&6|&DD!$4[$AA&&,,S$2J2JKOOQTTU`UgUgh&48HH*,,.-+#%* .
 	
r6   )	r(  r|  rp  rt  rw  ry  rr  rv  rx  )NN)Fr   )r8   r9   r:   r;   r9  r   rH   r   r   r=   rd   r   r   r1   r   r  r>   boolr  r   r   r]   r@   re   rf   s   @r.   rn  rn  D  sb   { $  /3,0	 <<  t+  llT)	 
 +,  
+	+    D  */"''" #'" +,	"
 
+	+"  "H  .215.2	?
##d*?
 ''$.?
 t+	?

 +,?
 
?
  ?
r6   rn  )r>  rn  r$  rU  )r   )?r.  collections.abcr   dataclassesr   typingr   r=   torch.nn.functionalr   r   r   r   r,  activationsr	   integrationsr
   masking_utilsr   modeling_layersr   modeling_outputsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   utils.genericr   utils.output_capturingr   configuration_aimv2r   r   r   r   rS  rC   rh   r|   r   rd   rc   r   r   r   r  r  r$  r>  rU  rl  rn  __all__r7   r6   r.   <module>r     s&  ,  $ !      & ! 7 / 9 K F & V V 7 5 P P 
_+ _  _@ Y'J299 J (J(ryy  1BII 1h%")) %^ %II%<<% 
% <<	%
 LL4'% % %.7)RYY 7)t2 2@299 @D		 D i? i iD 
F
+ F

F
R 
B
) B

B
JU\\ ell  `
% `
 `
F Wr6   