
    Z j                        S r SSKJr  SSKJr  SSKJr  SSKrSSKJr  SSK	J
r  SS	KJr  SS
KJr  SSKJr  SSKJrJrJr  SSKJrJr  SSKJr  SSKJrJrJrJrJ r   SSK!J"r"J#r#  SSK$J%r%  SSK&J'r'J(r(J)r)  \RT                  " \+5      r,S\RZ                  S\RZ                  4S jr.S\RZ                  S\RZ                  4S jr/S\RZ                  S\RZ                  4S jr0\" SS9\ " S S\5      5       5       r1\" SS9\ " S  S!\5      5       5       r2\\ " S" S#\5      5       5       r3 " S$ S%\Rh                  5      r5 " S& S'\Rh                  5      r6 SKS(\Rh                  S)\RZ                  S*\RZ                  S+\RZ                  S,\RZ                  S-  S-\7S.\7S/\\   4S0 jjr8 " S1 S2\Rh                  5      r9 " S3 S4\Rh                  5      r: " S5 S6\5      r;\ " S7 S8\5      5       r< " S9 S:\Rh                  5      r=\" S;S9 " S< S=\<5      5       r>\" S>S9 " S? S@\<5      5       r?\ " SA SB\<5      5       r@\ " SC SD\<5      5       rA\ " SE SF\<5      5       rB\" SGS9 " SH SI\<5      5       rC/ SJQrDg)LzPyTorch CLIP model.    )Callable)	dataclass)AnyN)nn   )initialization)ACT2FN)create_causal_mask)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPoolingImageClassifierOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)ModelOutputTransformersKwargsauto_docstringlogging	torch_int)can_return_tuplemerge_with_config_defaults)capture_outputs   )
CLIPConfigCLIPTextConfigCLIPVisionConfiglogitsreturnc                     [         R                  R                  U [        R                  " [        U 5      U R                  S95      $ )Ndevice)r   
functionalcross_entropytorcharangelenr"   )r   s    w/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/clip/modeling_clip.pycontrastive_lossr)   /   s/    ==&&vu||CKPVP]P]/^__    
similarityc                 P    [        U 5      n[        U R                  5      nX-   S-  $ )Ng       @)r)   T)r+   caption_loss
image_losss      r(   image_text_contrastive_lossr0   3   s(    #J/L!*,,/J%,,r*   tensorc                     [         R                  " U S5      n[         R                  " USSS9n[         R                  " US5      nU$ )z
This method is equivalent to tensor.norm(p=2, dim=-1, keepdim=True) and used to make
model `executorch` exportable. See issue https://github.com/pytorch/executorch/issues/3566
   T)dimkeepdim      ?)r%   powsum)r1   square_tensor
sum_tensornormed_tensors       r(   _get_vector_normr=   9   s<    
 IIfa(M=b$?JIIj#.Mr*   z}
    Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states.
    )custom_introc                       \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Sr
\\R                  S4   S-  \S'   Sr\\R                  S4   S-  \S'   S	rg)
CLIPVisionModelOutputD   z
image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
    The image embeddings obtained by applying the projection layer to the pooler_output.
Nimage_embedslast_hidden_state.hidden_states
attentions )__name__
__module____qualname____firstlineno____doc__rB   r%   FloatTensor__annotations__rC   rD   tuplerE   __static_attributes__rF   r*   r(   r@   r@   D   sr    
 .2L%##d*126u((4/6:>M5**C/047>7;Je'',-4;r*   r@   ze
    Base class for text model's outputs that also contains a pooling of the last hidden states.
    c                       \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Sr
\\R                  S4   S-  \S'   Sr\\R                  S4   S-  \S'   S	rg)
CLIPTextModelOutputV   z
text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
    The text embeddings obtained by applying the projection layer to the pooler_output.
Ntext_embedsrC   .rD   rE   rF   )rG   rH   rI   rJ   rK   rS   r%   rL   rM   rC   rD   rN   rE   rO   rF   r*   r(   rQ   rQ   V   sr    
 -1K""T)026u((4/6:>M5**C/047>7;Je'',-4;r*   rQ   c                      \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Sr
\R                  S-  \S'   Sr\R                  S-  \S'   Sr\R                  S-  \S'   Sr\\S	'   Sr\\S
'   S\\   4S jrSrg)
CLIPOutputh   ae  
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
    Contrastive loss for image-text similarity.
logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
    The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
    similarity scores.
logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
    The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
    similarity scores.
text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
    The text embeddings obtained by applying the projection layer to the pooled output of [`CLIPTextModel`].
image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
    The image embeddings obtained by applying the projection layer to the pooled output of [`CLIPVisionModel`].
text_model_output (`BaseModelOutputWithPooling`):
    The output of the [`CLIPTextModel`].
vision_model_output (`BaseModelOutputWithPooling`):
    The output of the [`CLIPVisionModel`].
Nlosslogits_per_imagelogits_per_textrS   rB   text_model_outputvision_model_outputr   c                 B    [        S U R                  5        5       5      $ )Nc              3   p   #    U  H,  n[        U[        5      (       a  UR                  5       OUv   M.     g 7fN)
isinstancer   to_tuple).0vs     r(   	<genexpr>&CLIPOutput.to_tuple.<locals>.<genexpr>   s)     ^P]1Z;%?%?QZZ\QFP]s   46)rN   valuesselfs    r(   r`   CLIPOutput.to_tuple   s    ^PTP[P[P]^^^r*   rF   )rG   rH   rI   rJ   rK   rW   r%   rL   rM   rX   rY   rS   rB   rZ   r   r[   rN   r   r`   rO   rF   r*   r(   rU   rU   h   s    & &*D%

d
")15e''$.504OU&&-4,0K""T)0-1L%##d*148186:3:_%* _r*   rU   c                      ^  \ rS rSrS\4U 4S jjrS\R                  S\S\S\R                  4S jr	SS	\R                  S\R                  4S
 jjrSrU =r$ )CLIPVisionEmbeddings   configc                   > [         TU ]  5         Xl        UR                  U l        UR
                  U l        UR                  U l        [        R                  " [        R                  " U R                  5      5      U l        [        R                  " UR                  U R                  U R                  U R                  SS9U l        U R
                  U R                  -  S-  U l        U R                  S-   U l        [        R"                  " U R                   U R                  5      U l        U R'                  S[        R(                  " U R                   5      R+                  S5      SS9  g )NF)in_channelsout_channelskernel_sizestridebiasr3   r   position_idsr   r4   
persistent)super__init__rl   hidden_size	embed_dim
image_size
patch_sizer   	Parameterr%   randnclass_embeddingConv2dnum_channelspatch_embeddingnum_patchesnum_positions	Embeddingposition_embeddingregister_bufferr&   expandrg   rl   	__class__s     r(   rx   CLIPVisionEmbeddings.__init__   s   ++ ++ ++!||EKK,GH!yy++?? 
 !OOt>1D!--1"$,,t/A/A4>>"R^U\\$:L:L-M-T-TU\-]jopr*   
embeddingsheightwidthr   c                    UR                   S   S-
  nU R                  R                  R                  S5      nUR                   S   S-
  n[        R
                  R                  5       (       d%  XF:X  a   X#:X  a  U R                  U R                  5      $ USS2SS24   nUSS2SS24   nUR                   S   n	X R                  -  n
X0R                  -  n[        US-  5      nUR                  SXU	5      nUR                  SSSS5      n[        R                  R                  UX4SS	S
9nUR                  SSSS5      R                  SSU	5      n[        R                   " Xx4SS9$ )a  
This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
images. This method is also adapted to support torch.jit tracing.

Adapted from:
- https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
- https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
r   r   Nr4   r7   r   r3   bicubicF)sizemodealign_cornersr5   )shaper   weight	unsqueezer%   jit
is_tracingrs   r|   r   reshapepermuter   r#   interpolateviewcat)rg   r   r   r   r   r   r   class_pos_embedpatch_pos_embedr5   
new_height	new_widthsqrt_num_positionss                r(   interpolate_pos_encoding-CLIPVisionEmbeddings.interpolate_pos_encoding   si    !&&q)A-!44;;EEaH*003a7 yy##%%+*F6?**4+<+<==,QU3,QU3r".
__,	&}c'9:)11!5G]`a)11!Q1=--33(	 4 
 *11!Q1=BB1b#Nyy/;CCr*   pixel_valuesc                 ^   UR                   u  p4pVU(       dJ  XPR                  :w  d  X`R                  :w  a,  [        SU SU SU R                   SU R                   S3	5      eU R                  R                  R
                  nU R                  UR                  US95      nUR                  S5      R                  SS5      nU R                  R                  USS5      n	[        R                  " X/SS	9n
U(       a  XR                  XU5      -   n
U
$ XR                  U R                  5      -   n
U
$ )
NzInput image size (*z) doesn't match model (z).)dtyper3   r   r4   r   )r   r{   
ValueErrorr   r   r   toflatten	transposer   r   r%   r   r   r   rs   )rg   r   r   
batch_size_r   r   target_dtypepatch_embedsclass_embedsr   s              r(   forwardCLIPVisionEmbeddings.forward   s$   '3'9'9$
v'V-F%SbSbJb$VHAeW4KDOOK\\]^b^m^m]nnpq  ++2288++LOO,O,OP#++A.88A>++22:q"EYY;C
##&C&CJX]&^^J  $&=&=d>O>O&PPJr*   )	r   rl   rz   r{   r   r   r   r|   r   F)rG   rH   rI   rJ   r   rx   r%   Tensorintr   rL   r   rO   __classcell__r   s   @r(   rj   rj      sj    q/ q,'D5<< 'D 'DUX 'D]b]i]i 'DRE$5$5 Z_ZfZf  r*   rj   c            	          ^  \ rS rSrS\4U 4S jjr   SS\R                  S-  S\R                  S-  S\R                  S-  S\R                  4S	 jjr
S
rU =r$ )CLIPTextEmbeddings   rl   c                 N  > [         TU ]  5         UR                  n[        R                  " UR
                  U5      U l        [        R                  " UR                  U5      U l        U R                  S[        R                  " UR                  5      R                  S5      SS9  g )Nrs   rt   Fru   )rw   rx   ry   r   r   
vocab_sizetoken_embeddingmax_position_embeddingsr   r   r%   r&   r   rg   rl   rz   r   s      r(   rx   CLIPTextEmbeddings.__init__   s    &&	!||F,=,=yI"$,,v/M/My"Y 	ELL)G)GHOOPWXej 	 	
r*   N	input_idsrs   inputs_embedsr   c                 <   Ub  UR                   S   OUR                   S   nU R                  R                  R                   S   nXE:  a  [        SU SU 35      eUc  U R                  S S 2S U24   nUc  U R                  U5      nU R                  U5      nX6-   nU$ )Nr4   r   zRSequence length must be less than max_position_embeddings (got `sequence length`: z and max_position_embeddings: )r   r   r   r   rs   r   )rg   r   rs   r   
seq_lengthmax_position_embeddingposition_embeddingsr   s           r(   r   CLIPTextEmbeddings.forward   s     -6,AY__R(}GZGZ[]G^
!%!8!8!?!?!E!Ea!H.d,<=S<TV 
 ,,Q^<L  00;M"55lC"8
r*   )r   r   NNN)rG   rH   rI   rJ   r   rx   r%   
LongTensorrL   r   r   rO   r   r   s   @r(   r   r      so    

~ 

 .20426	##d* &&- ((4/	
 
 r*   r   modulequerykeyvalueattention_maskscalingdropoutkwargsc                    [         R                  " XR                  SS5      5      U-  nUb  X-   n[        R                  R                  US[         R                  S9R                  UR                  5      n[        R                  R                  XU R                  S9n[         R                  " X5      n	U	R                  SS5      R                  5       n	X4$ )Nr4   r   )r5   r   )ptrainingr   r3   )r%   matmulr   r   r#   softmaxfloat32r   r   r   r   
contiguous)
r   r   r   r   r   r   r   r   attn_weightsattn_outputs
             r(   eager_attention_forwardr     s     <<}}R'<=GL!#4==((2U]](SVVW\WbWbcL==((6??([L,,|3K''1-88:K$$r*   c                      ^  \ rS rSrSrS\\-  4U 4S jjr SS\R                  S\R                  S-  S\
\   S	\\R                  \R                  S-  4   4S
 jjrSrU =r$ )CLIPAttentioni  z=Multi-headed attention from 'Attention Is All You Need' paperrl   c                   > [         TU ]  5         Xl        UR                  U l        UR
                  U l        U R                  U R                  -  U l        U R                  S-  U l        UR                  U l
        SU l        [        R                  " U R                  U R                  5      U l        [        R                  " U R                  U R                  5      U l        [        R                  " U R                  U R                  5      U l        [        R                  " U R                  U R                  5      U l        g )N      F)rw   rx   rl   ry   rz   num_attention_heads	num_headshead_dimscaleattention_dropoutr   	is_causalr   Lineark_projv_projq_projout_projr   s     r(   rx   CLIPAttention.__init__  s    ++33$..8]]D(
//ii?ii?ii?		$..$..Ar*   NrD   r   r   r   c                    UR                   SS n/ UQSPU R                  P7nU R                  U5      nU R                  U5      nU R	                  U5      nUR                  U5      R                  SS5      nUR                  U5      R                  SS5      nUR                  U5      R                  SS5      n[        R                  " U R                  R                  [        5      n	U	" U UUUU4U R                  U R                  (       d  SOU R                  S.UD6u  pU
R                  " / UQSP76 R!                  5       n
U R#                  U
5      n
X4$ )z#Input shape: Batch x Time x ChannelNr4   r   r3           )r   r   )r   r   r   r   r   r   r   r   get_interfacerl   _attn_implementationr   r   r   r   r   r   r   )rg   rD   r   r   input_shapehidden_shapequerieskeysre   attention_interfacer   r   s               r(   r   CLIPAttention.forward,  sI    $))#2.88b8$--8++m,{{=)]+,,|,66q!<yy&00A6\*44Q:(?(M(MKK,,.E)
 %8	%
 JJ#}}C$,,	%
 	%
! "));;;;FFHmmK0((r*   )rl   r   rz   r   r   r   r   r   r   r   r   r^   )rG   rH   rI   rJ   rK   r   r   rx   r%   r   r   r   rN   r   rO   r   r   s   @r(   r   r     sy    GB/.@ B$ /3%)||%) t+%) +,	%)
 
u||U\\D00	1%) %)r*   r   c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )CLIPMLPiT  c                   > [         TU ]  5         Xl        [        UR                     U l        [        R                  " UR                  UR                  5      U l
        [        R                  " UR                  UR                  5      U l        g r^   )rw   rx   rl   r	   
hidden_actactivation_fnr   r   ry   intermediate_sizefc1fc2r   s     r(   rx   CLIPMLP.__init__U  sb    #F$5$5699V//1I1IJ99V55v7I7IJr*   rD   r   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ r^   )r   r   r   )rg   rD   s     r(   r   CLIPMLP.forward\  s4    /**=9/r*   )r   rl   r   r   )
rG   rH   rI   rJ   rx   r%   r   r   rO   r   r   s   @r(   r   r   T  s)    KU\\ ell  r*   r   c                      ^  \ rS rSrS\\-  4U 4S jjrS\R                  S\R                  S\	\
   S\R                  4S jrS	rU =r$ )
CLIPEncoderLayeric  rl   c                 <  > [         TU ]  5         UR                  U l        [	        U5      U l        [        R                  " U R                  UR                  S9U l	        [        U5      U l        [        R                  " U R                  UR                  S9U l        g N)eps)rw   rx   ry   rz   r   	self_attnr   	LayerNormlayer_norm_epslayer_norm1r   mlplayer_norm2r   s     r(   rx   CLIPEncoderLayer.__init__d  sl    ++&v.<<F<Q<QR6?<<F<Q<QRr*   rD   r   r   r   c                     UnU R                  U5      nU R                  " SUUS.UD6u  pXA-   nUnU R                  U5      nU R                  U5      nXA-   nU$ )N)rD   r   rF   )r  r	  r  r  )rg   rD   r   r   residualr   s         r(   r   CLIPEncoderLayer.forwardl  sz     !((7>> 
')
 

 !0 ((7/ 0r*   )rz   r  r  r  r	  )rG   rH   rI   rJ   r   r   rx   r%   r   r   r   rL   r   rO   r   r   s   @r(   r  r  c  s[    S/.@ S||  +,	
 
		 r*   r  c                   |    \ rS rSr% \\S'   SrSr/ SQrSr	Sr
SrSrSr\\S.r\R$                  " 5       S 5       rS	rg
)CLIPPreTrainedModeli  rl   clip)imagetext)r   r  rj   T)rD   rE   c                    U R                   R                  n[        U[        5      (       a  [        R
                  " UR                  R                  SUS-  S9  [        R
                  " UR                  R                  SUS-  S9  [        R                  " UR                  [        R                  " UR                  R                  S   5      R                  S5      5        GOp[        U[        5      (       Ga   [        R
                  " UR                   SUR"                  S-  U-  S9  [        R
                  " UR$                  R                  UR                   R&                  U-  S9  [        R
                  " UR                  R                  UR                   R&                  U-  S9  [        R                  " UR                  [        R                  " UR(                  5      R                  S5      5        GOZ[        U[*        5      (       a  UR"                  S-  SUR                   R,                  -  S-  -  U-  nUR"                  S-  U-  n[        R
                  " UR.                  R                  US9  [        R
                  " UR0                  R                  US9  [        R
                  " UR2                  R                  US9  [        R
                  " UR4                  R                  US9  GO^[        U[6        5      (       a  UR                   R8                  S-  SUR                   R,                  -  S-  -  U-  nSUR                   R8                  -  S-  U-  n[        R
                  " UR:                  R                  US9  [        R
                  " UR<                  R                  US9  GO[        U[>        5      (       at  [        R
                  " UR@                  R                  URB                  S-  U-  S9  [        R
                  " URD                  R                  URF                  S-  U-  S9  GO[        U[H        5      (       aD  [        R
                  " URD                  R                  U R                   R8                  S-  U-  S9  O[        U[J        5      (       aD  [        R
                  " UR@                  R                  U R                   R8                  S-  U-  S9  Ob[        U[L        5      (       aM  [        R
                  " URN                  R                  U R                   RP                  R8                  S-  U-  S9  [        U[R        RT                  5      (       a@  [        RV                  " URX                  5        [        RZ                  " UR                  5        [        U[R        R\                  5      (       a/  URX                  b!  [        RV                  " URX                  5        g	g	g	)
zInitialize the weightsr   g{Gz?)meanstdr4   rt   r   )r  r3   N)/rl   initializer_factorr_   r   initnormal_r   r   r   copy_rs   r%   r&   r   r   rj   r   rz   r   initializer_ranger   r   num_hidden_layersr   r   r   r   r   ry   r   r   	CLIPModeltext_projectiontext_embed_dimvisual_projectionvision_embed_dimCLIPVisionModelWithProjectionCLIPTextModelWithProjectionCLIPForImageClassification
classifiervision_configr   r
  zeros_rr   ones_r   )rg   r   factorin_proj_stdout_proj_stdfc_stds         r(   _init_weights!CLIPPreTrainedModel._init_weights  s4    //f011LL//66SftmTLL2299RVWJJv**ELL9L9L9R9RSU9V,W,^,^_f,gh 455LL//cv?O?OQU?UX^?^_LL//66FMM<[<[^d<deLL2299v}}?^?^ag?ghJJv**ELL9M9M,N,U,UV],^_..!++T1q6==;Z;Z7Z_c6cdgmmK",,d2f<LLL--;?LL--;?LL--;?LL//\B((!==44d:FMMDcDc@chl?lmpvvK&--333<vEFLL**7LL**<	**LL&&--))4/&8 LL((//++T1F:  =>>LL((//KK++T1F:  ;<<LL&&--KK++T1F:  :;;LL!!((KK--994?&H
 fbll++KK$JJv}}%fbii((V[[-DKK$ .E(r*   rF   N)rG   rH   rI   rJ   r   rM   base_model_prefixinput_modalities_no_split_modulessupports_gradient_checkpointing_supports_sdpa_supports_flash_attn_supports_flex_attn_supports_attention_backendr  r   _can_record_outputsr%   no_gradr1  rO   rF   r*   r(   r  r    s_    (Z&*#N"&)#
 ]]_5% 5%r*   r  c                   p   ^  \ rS rSrSrS\4U 4S jjr SS\R                  S-  S\	\
   S\4S	 jjrS
rU =r$ )CLIPEncoderi  z
Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
[`CLIPEncoderLayer`].

Args:
    config: CLIPConfig
rl   c                    > [         TU ]  5         Xl        [        R                  " [        UR                  5       Vs/ s H  n[        U5      PM     sn5      U l        SU l	        g s  snf NF)
rw   rx   rl   r   
ModuleListranger   r  layersgradient_checkpointing)rg   rl   r   r   s      r(   rx   CLIPEncoder.__init__  sS    mmuVMeMeGf$gGf!%5f%=Gf$gh&+# %hs   A&Nr   r   r   c                 R    UnU R                    H  nU" UU40 UD6nM     [        US9$ )N)rC   )rC  r   )rg   r   r   r   rD   encoder_layers         r(   r   CLIPEncoder.forward  sC     &![[M) M ) +
 	
r*   )rl   rD  rC  r^   )rG   rH   rI   rJ   rK   r   rx   r%   r   r   r   r   r   rO   r   r   s   @r(   r>  r>    sP    ,z , /3
 t+
 +,	

 

 
r*   r>  zI
    The text model from CLIP without any head or projection on top.
    c                      ^  \ rS rSr% \\S'   SrSrS\4U 4S jjr\	\
" SS9\   SS	\R                  S-  S
\R                  S-  S\R                  S-  S\\   S\4
S jj5       5       5       rSrU =r$ )CLIPTextModeli  rl   r  r   c                   > [         TU ]  U5        UR                  n[        U5      U l        [        U5      U l        [        R                  " X!R                  S9U l
        UR                  U l        U R                  5         g r  )rw   rx   ry   r   r   r>  encoderr   r
  r  final_layer_normeos_token_id	post_initr   s      r(   rx   CLIPTextModel.__init__  sd     &&	,V4"6* "Y<Q<Q R #//r*   Ftie_last_hidden_statesNr   r   rs   r   r   c                 8   Uc  [        S5      eUR                  5       nUR                  SUS   5      nU R                  XS9n[	        U R
                  UUSS9nUR                  SS5        U R                  " SUUSS.UD6nUR                  nU R                  U5      nU R                  S	:X  ae  U[        R                  " UR                  S
   UR                  S9UR                  [        R                   UR                  S9R#                  SS94   n	OU[        R                  " UR                  S
   UR                  S9UR                  [        R                   UR                  S9U R                  :H  R!                  5       R#                  SS94   n	[%        UU	S9$ )a  
Examples:

```python
>>> from transformers import AutoTokenizer, CLIPTextModel

>>> model = CLIPTextModel.from_pretrained("openai/clip-vit-base-patch32")
>>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")

>>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")

>>> outputs = model(**inputs)
>>> last_hidden_state = outputs.last_hidden_state
>>> pooled_output = outputs.pooler_output  # pooled (EOS token) states
```NzYou have to specify input_idsr4   )r   rs   )rl   r   r   past_key_valuesr   T)r   r   r   r3   r   r!   )r   r"   r   rC   pooler_outputrF   )r   r   r   r   r
   rl   poprM  rC   rN  rO  r%   r&   r   r"   r   r   argmaxr   )
rg   r   r   rs   r   r   rD   encoder_outputsrC   pooled_outputs
             r(   r   CLIPTextModel.forward  s   2 <==nn&NN2{27	)W+;;') 	
 	

;%+/<< ,
'),
 	,
 ,== 112CD! ..44Q7@Q@X@XY5995F5M5MNUUZ\U]_M ..44Q7@Q@X@XY EII6G6N6NOSWSdSddB!M */'
 	
r*   )r   rM  rO  rN  r   )rG   rH   rI   rJ   r   rM   r4  _input_embed_layerrx   r   r   r   r%   r   r   r   r   r   rO   r   r   s   @r(   rJ  rJ    s      *	~ 	  E2 *..2,0	I
<<$&I
 t+I
 llT)	I

 +,I
 
$I
  3  I
r*   rJ  zK
    The vision model from CLIP without any head or projection on top.
    c                      ^  \ rS rSr% \\S'   SrSrSrS\4U 4S jjr	\
\" SS9\  SS\R                  S	-  S
\S	-  S\\   S\4S jj5       5       5       rSrU =r$ )CLIPVisionModeliT  rl   r   r  r   c                 (  > [         TU ]  U5        UR                  n[        U5      U l        [
        R                  " X!R                  S9U l        [        U5      U l
        [
        R                  " X!R                  S9U l        U R                  5         g r  )rw   rx   ry   rj   r   r   r
  r  pre_layrnormr>  rM  post_layernormrP  r   s      r(   rx   CLIPVisionModel.__init___  sk     &&	.v6LL8M8MN"6* ll9:O:OPr*   FrR  Nr   r   r   c                     U R                  XS9nU R                  U5      nU R                  " SSU0UD6nUR                  nUSS2SSS24   nU R	                  U5      n[        UUS9$ )a  
Example:

```python
>>> from PIL import Image
>>> import httpx
>>> from io import BytesIO
>>> from transformers import AutoProcessor, CLIPVisionModel

>>> model = CLIPVisionModel.from_pretrained("openai/clip-vit-base-patch32")
>>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> with httpx.stream("GET", url) as response:
...     image = Image.open(BytesIO(response.read()))

>>> inputs = processor(images=image, return_tensors="pt")

>>> outputs = model(**inputs)
>>> last_hidden_state = outputs.last_hidden_state
>>> pooled_output = outputs.pooler_output  # pooled CLS states
```)r   r   Nr   rV  rF   )r   rb  rM  rC   rc  r   )rg   r   r   r   rD   rZ  rC   r[  s           r(   r   CLIPVisionModel.forwardi  s    > h))-8+/<< ,
',
,

 ,==)!Q'2++M:)/'
 	
r*   )r   rM  rc  rb  r@  )rG   rH   rI   rJ   r   rM   main_input_namer4  r]  rx   r   r   r   r%   rL   boolr   r   r   r   rO   r   r   s   @r(   r_  r_  T  s     $O!*/   E2 2605+
''$.+
 #'++
 +,	+

 
$+
  3  +
r*   r_  c                     ^  \ rS rSrS\4U 4S jjr\\  SS\R                  S\R                  S-  S\R                  S-  S\
\   S	\\-  4
S
 jj5       5       r\\ SS\R                  S\S\
\   S	\\-  4S jj5       5       r\\      SS\R$                  S-  S\R                  S-  S\R                  S-  S\R$                  S-  S\S-  S\S\
\   S	\4S jj5       5       rSrU =r$ )r!  i  rl   c                   > [         TU ]  U5        UR                  nUR                  nUR                  U l        UR
                  U l        UR
                  U l        [        R                  U5      U l
        [        R                  U5      U l        [        R                  " U R                  U R                  SS9U l        [        R                  " U R                  U R                  SS9U l        [        R"                  " [$        R&                  " U R(                  R*                  5      5      U l        U R/                  5         g NF)rr   )rw   rx   text_configr*  projection_dimry   r#  r%  rJ  _from_config
text_modelr_  vision_modelr   r   r$  r"  r}   r%   r1   rl   logit_scale_init_valuelogit_scalerP  )rg   rl   rl  r*  r   s       r(   rx   CLIPModel.__init__  s     ((,,$33)55 - 9 9'44[A+88G!#4+@+@$BUBU\a!b!yy)<)<d>Q>QX]^<<T[[5W5W(XY 	r*   Nr   r   rs   r   r   c                 x    U R                   " SUUUSS.UD6nUR                  nU R                  U5      Ul        U$ )a  
Examples:

```python
>>> import torch
>>> from transformers import AutoTokenizer, CLIPModel

>>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
>>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")

>>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")

>>> with torch.inference_mode():
...     text_features = model.get_text_features(**inputs)
```T)r   r   rs   return_dictrF   )ro  rW  r"  )rg   r   r   rs   r   text_outputsr[  s          r(   get_text_featuresCLIPModel.get_text_features  sS    0 48?? 4
)%	4

 4
 %22%)%9%9-%H"r*   r   r   c                 v    U R                   " SUUSS.UD6nUR                  nU R                  U5      Ul        U$ )a"  
Examples:

```python
>>> import torch
>>> from transformers import AutoProcessor, CLIPModel
>>> from transformers.image_utils import load_image

>>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
>>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = load_image(url)

>>> inputs = processor(images=image, return_tensors="pt")

>>> with torch.inference_mode():
...     image_features = model.get_image_features(**inputs)
```T)r   r   ru  rF   )rp  rW  r$  )rg   r   r   r   vision_outputsr[  s         r(   get_image_featuresCLIPModel.get_image_features  sR    6 6:5F5F 6
%%=6
 	6
 '44'+'='=m'L$r*   return_lossc           
      
   U R                   " SUUS.UD6nU R                  " SUUUS.UD6n	UR                  n
U	R                  nU
[        U
5      -  n
U[        U5      -  n[        R
                  " XR                  5       R                  UR                  5      5      nXR                  R                  5       R                  UR                  5      -  nUR                  5       nSnU(       a  [        U5      n[        UUUUU
U	US9$ )aj  
return_loss (`bool`, *optional*):
    Whether or not to return the contrastive loss.

Examples:

```python
>>> import torch
>>> from transformers import AutoProcessor, CLIPModel
>>> from transformers.image_utils import load_image

>>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
>>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = load_image(url)

>>> inputs = processor(
...     text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True
... )

>>> with torch.inference_mode():
...     outputs = model(**inputs)
>>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
>>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
```r   r   r   r   rs   N)rW   rX   rY   rS   rB   rZ   r[   rF   )r{  rw  rW  r=   r%   r   tr   r"   rr  expr0   rU   )rg   r   r   r   rs   r}  r   r   rz  rv  rB   rS   rY   rX   rW   s                  r(   r   CLIPModel.forward  s!   L 6:5L5L 6
%%=6
 6
 483I3I 4
)%4
 	4
 &33"00 $&6|&DD!$4[$AA  ,,{NN4D4G4GHZHZ4[\),<,<,@,@,B,E,EkFXFX,YY*,,..?D-+#%* .
 	
r*   )rr  rm  r#  ro  r"  r%  rp  r$  NNr   )NNNNNF)rG   rH   rI   rJ   r   rx   r   r   r%   r   r   r   rN   r   rw  rL   rh  r{  r   rU   r   rO   r   r   s   @r(   r!  r!    s   z (  /3,0	 <<  t+  llT)	 
 +,  
+	+    D  */"''" #'" +,	"
 
+	+"  "H  .215.204#').J
##d*J
 ''$.J
 t+	J

 &&-J
 D[J
 #'J
 +,J
 
J
  J
r*   r!  c                      ^  \ rS rSr% \\S'   SrS\4U 4S jjrS\R                  4S jr
S r\\   SS	\R                  S-  S
\R                  S-  S\R                  S-  S\\   S\4
S jj5       5       rSrU =r$ )r'  iI  rl   rK  c                    > [         TU ]  U5        [        R                  U5      U l        [
        R                  " UR                  UR                  SS9U l	        U R                  5         g rk  )rw   rx   rJ  rn  ro  r   r   ry   rm  r"  rP  r   s     r(   rx   $CLIPTextModelWithProjection.__init__N  sP     '44V<!yy););V=R=RY^_ 	r*   r   c                 B    U R                   R                  R                  $ r^   ro  r   r   rf   s    r(   get_input_embeddings0CLIPTextModelWithProjection.get_input_embeddingsW  s    ))999r*   c                 8    XR                   R                  l        g r^   r  )rg   r   s     r(   set_input_embeddings0CLIPTextModelWithProjection.set_input_embeddingsZ  s    5:""2r*   Nr   r   rs   r   c                     U R                   " SUUUS.UD6nUR                  nU R                  U5      n[        UUR                  UR
                  UR                  S9$ )a  
Examples:

```python
>>> import torch
>>> from transformers import AutoTokenizer, CLIPTextModelWithProjection

>>> model = CLIPTextModelWithProjection.from_pretrained("openai/clip-vit-base-patch32")
>>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")

>>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")

>>> with torch.inference_mode():
...     outputs = model(**inputs)
>>> text_embeds = outputs.text_embeds
```r  )rS   rC   rD   rE   rF   )ro  rW  r"  rQ   rC   rD   rE   )rg   r   r   rs   r   rv  r[  rS   s           r(   r   #CLIPTextModelWithProjection.forward]  sr    4 48?? 4
)%4
 	4
 %22**=9"#*<<&44#..	
 	
r*   )ro  r"  r   )rG   rH   rI   rJ   r   rM   r4  rx   r   Moduler  r  r   r   r%   r   r   r   rQ   r   rO   r   r   s   @r(   r'  r'  I  s     ~ :bii :;  *..2,0	&
<<$&&
 t+&
 llT)	&

 +,&
 
&
  &
r*   r'  c                      ^  \ rS rSr% \\S'   SrSrS\4U 4S jjrS\	R                  4S jr\\  SS\R                  S-  S	\S
\\   S\4S jj5       5       rSrU =r$ )r&  i  rl   r   r`  c                    > [         TU ]  U5        [        R                  U5      U l        [
        R                  " UR                  UR                  SS9U l	        U R                  5         g rk  )rw   rx   r_  rn  rp  r   r   ry   rm  r$  rP  r   s     r(   rx   &CLIPVisionModelWithProjection.__init__  sQ     +88@!#6+=+=v?T?T[`!a 	r*   r   c                 B    U R                   R                  R                  $ r^   )rp  r   r   rf   s    r(   r  2CLIPVisionModelWithProjection.get_input_embeddings  s      ++;;;r*   Nr   r   c                     U R                   " SUUS.UD6nUR                  nU R                  U5      n[        UUR                  UR
                  UR                  S9$ )aX  
Examples:

```python
>>> import torch
>>> from transformers import AutoProcessor, CLIPVisionModelWithProjection
>>> from transformers.image_utils import load_image

>>> model = CLIPVisionModelWithProjection.from_pretrained("openai/clip-vit-base-patch32")
>>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = load_image(url)

>>> inputs = processor(images=image, return_tensors="pt")

>>> with torch.inference_mode():
...     outputs = model(**inputs)
>>> image_embeds = outputs.image_embeds
```r  )rB   rC   rD   rE   rF   )rp  rW  r$  r@   rC   rD   rE   )rg   r   r   r   rz  r[  rB   s          r(   r   %CLIPVisionModelWithProjection.forward  sq    : 6:5F5F 6
%%=6
 6

 '44--m<$%,>>(66%00	
 	
r*   )rp  r$  r@  )rG   rH   rI   rJ   r   rM   rg  r4  rx   r   r  r  r   r   r%   rL   rh  r   r   r@   r   rO   r   r   s   @r(   r&  r&    s    $O!/ <bii <  26).(
''$.(
 #'(
 +,	(

 
(
  (
r*   r&  z
    CLIP vision encoder with an image classification head on top (a linear layer on top of the pooled final hidden states of
    the patch tokens) e.g. for ImageNet.
    c                      ^  \ rS rSrSrSrS\SS4U 4S jjr\\	  SS\
R                  S-  S\
R                  S-  S	\\   S\4S
 jj5       5       rSrU =r$ )r(  i  r   r`  rl   r   Nc                 l  > [         TU ]  U5        UR                  U l        [        R	                  UR
                  5      U l        UR                  S:  a5  [        R                  " UR
                  R                  UR                  5      O[        R                  " 5       U l        U R                  5         g )Nr   )rw   rx   
num_labelsr_  rn  r*  rp  r   r   ry   Identityr)  rP  r   s     r(   rx   #CLIPForImageClassification.__init__  s      +++889M9MN OUN_N_bcNcBIIf**668I8IJikititiv 	
 	r*   labelsr   c                 &   U R                   " U40 UD6nUR                  n[        R                  " USS2SS2SS24   SS9nU R	                  U5      nSnUb  U R                  X&U R                  5      n[        UUUR                  UR                  S9$ )ab  
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
Nr   r   )rW   r   rD   rE   )
rp  rC   r%   r  r)  loss_functionrl   r   rD   rE   )rg   r   r  r   outputssequence_outputr   rW   s           r(   r   "CLIPForImageClassification.forward  s     /3.?.?/
/

 "33**_QAX%>AF1%%fdkkBD$!//))	
 	
r*   )r)  r  rp  r  )rG   rH   rI   rJ   rg  r4  r   rx   r   r   r%   r   r   r   r   r   rO   r   r   s   @r(   r(  r(    s     %O!z d   -1&*
llT)
 t#
 +,	

 

  
r*   r(  )r!  r  rJ  r'  r_  r&  r(  )r   )ErK   collections.abcr   dataclassesr   typingr   r%   r    r   r  activationsr	   masking_utilsr
   modeling_layersr   modeling_outputsr   r   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   utils.genericr   r   utils.output_capturingr   configuration_clipr   r   r   
get_loggerrG   loggerr   r)   r0   r=   r@   rQ   rU   r  rj   r   floatr   r   r   r  r  r>  rJ  r_  r!  r'  r&  r(  __all__rF   r*   r(   <module>r     s    $ !    & ! / 9 b b F &  J 5 L L 
		H	%
`U\\ `ell `-ELL -U\\ -U\\ ell  
 	<K 	< 	< 
 	<+ 	< 	< _ _  _@P299 Pf% %^ %II%<<% 
% <<	%
 LL4'% % % '(%*7)BII 7)tbii 1 B F%/ F% F%R
")) 
D 
\
' \

\
~ 
>
) >

>
B k
# k
 k
\ ;
"5 ;
 ;
| ;
$7 ;
 ;
| 3
!4 3
3
lr*   