
    Z jC                        S r SSKrSSKJr  SSKJr  SSKrSSK	r	SSK	J
r
  SSKJr  SSKJr  SS	KJr  SS
KJr  SSKJrJr  SSKJr  SSKJr  SSKJrJrJrJrJ r J!r!  SSK"J#r#  SSK$J%r%  SSK&J'r'J(r(J)r)  \ RT                  " \+5      r,S\	RZ                  S\	RZ                  4S jr.S\	RZ                  S\	RZ                  4S jr/S\	RZ                  S\04S jr1SHS\	RZ                  S\2S\3S\0S\	RZ                  4
S jjr4SIS jr5S r6 " S S \
Rn                  5      r8 " S! S"\
Rn                  5      r9 " S# S$\
Rn                  5      r:\\ " S% S&\5      5       5       r; " S' S(\
Rn                  5      r< " S) S*\
Rn                  5      r= " S+ S,\
Rn                  5      r> " S- S.\
Rn                  5      r? " S/ S0\
Rn                  5      r@ " S1 S2\@5      rA " S3 S4\
Rn                  5      rB " S5 S6\5      rC\ " S7 S8\5      5       rD " S9 S:\
Rn                  5      rE " S; S<\
Rn                  5      rF " S= S>\D5      rG " S? S@\D5      rH " SA SB\
Rn                  5      rI " SC SD\D5      rJ\ " SE SF\D5      5       rK/ SGQrLg)JzPyTorch GroupViT model.    N)	dataclass)Any)nn   )initialization)ACT2FN)create_causal_mask)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPooling)PreTrainedModel)Unpack)ModelOutputTransformersKwargsauto_docstringcan_return_tuplelogging	torch_int)merge_with_config_defaults)capture_outputs   )GroupViTConfigGroupViTTextConfigGroupViTVisionConfiglogitsreturnc                     [         R                  R                  U [        R                  " [        U 5      U R                  S95      $ )Ndevice)r   
functionalcross_entropytorcharangelenr   )r   s    /root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/groupvit/modeling_groupvit.pycontrastive_lossr&   *   s/    ==&&vu||CKPVP]P]/^__    
similarityc                 P    [        U 5      n[        U R                  5      nX-   S-  $ )Ng       @)r&   T)r(   caption_loss
image_losss      r%   image_text_contrastive_lossr-   /   s(    #J/L!*,,/J%,,r'   dimc                     U R                  U5      nUR                  USS9S   n[        R                  " U [        R                  S9R                  XS5      nXBR                  5       -
  U-   nU$ )NTkeepdimr   memory_format      ?)softmaxmaxr"   
zeros_likelegacy_contiguous_formatscatter_detach)r   r.   y_softindexy_hardrets         r%   hard_softmaxr?   5   sf    ^^C FJJsDJ)!,EfE4R4RS\\]`ilmF
==?
"V
+CJr'   tauhardc           	      ,   [         R                  R                  R                  [         R                  " SU R
                  U R                  S9[         R                  " SU R
                  U R                  S95      nUR                  U R                  5      nX-   U-  nUR                  U5      nU(       a]  UR                  USS9S   n[         R                  " U [         R                  S9R                  X7S5      nXR                  5       -
  U-   n	U	$ Un	U	$ )N        )r   dtyper4   Tr0   r   r2   )r"   distributionsgumbelGumbeltensorr   rD   sampleshaper5   r6   r7   r8   r9   r:   )
r   r@   rA   r.   gumbel_distgumbelsr;   r<   r=   r>   s
             r%   gumbel_softmaxrM   ?   s    %%,,33SfllCSfllCK   .G3&G__S!F

3
-a0!!&8V8VW``admpq}}&/ J Jr'   c                    X-  U R                   S   -  S-  nX:  a4  [        [        R                  " X$-  5      5      nU R                   S   U-  nO3[        [        R                  " X-  5      5      nU R                   S   U-  nU R                   S   nU R                   S   nU R	                  XxXe5      n [
        R                  R                  XU4SUS9n U $ )a  
Args:
    attentions (`torch.Tensor`): attention map of shape [batch_size, groups, feat_height*feat_width]
    height (`int`): height of the output attention map
    width (`int`): width of the output attention map
    align_corners (`bool`, *optional*): the `align_corner` argument for `nn.functional.interpolate`.

Returns:
    `torch.Tensor`: resized attention map of shape [batch_size, groups, height, width]
         ?r   r   bilinearsizemodealign_corners)rJ   intnproundreshaper   r    interpolate)	
attentionsheightwidthrU   scale
feat_widthfeat_height
batch_sizegroupss	            r%   resize_attention_maprc   U   s     ^z//22s:E~%-01
 &&q)Z7"((6>23%%a(K7
!!!$Ja F##JPJ**%z + J r'   c           	      V   / n[         R                  " 5          SnU  Hj  nUR                  SSS5      R                  5       nUc  UnOX4-  n[	        UR                  SSS5      R                  5       /UQ76 nUR                  U5        Ml     SSS5        US   nU$ ! , (       d  f       N= f)a  
Args:
    attentions (`tuple(torch.FloatTensor)`: tuple of attention maps returned by `GroupViTVisionTransformer`
    hw_shape (`tuple(int)`): height and width of the output attention map
Returns:
    `torch.Tensor`: the attention map of shape [batch_size, groups, height, width]
Nr   rO   r   )r"   no_gradpermute
contiguousrc   append)r[   hw_shape	attn_mapsprev_attn_masks
attn_maskscur_attn_mapfinal_groupings          r%   get_grouping_from_attentionsrp   s   s     I	$J#++Aq!4??AJ&","1">/0G0G1a0P0[0[0]i`hiL\* % 
 r]N! 
s   A3B
B(c                   6   ^  \ rS rSrS\4U 4S jjrS rSrU =r$ )GroupViTCrossAttentionLayer   configc                   > [         TU ]  5         [        U5      U l        [        R
                  " UR                  UR                  S9U l        [        U5      U l
        [        R
                  " UR                  UR                  S9U l        g Neps)super__init__GroupViTAttentionattnr   	LayerNormhidden_sizelayer_norm_epsnorm2GroupViTMLPmlp	norm_postselfrt   	__class__s     r%   rz   $GroupViTCrossAttentionLayer.__init__   sb    %f-	\\&"4"4&:O:OP
v&f&8&8f>S>STr'   c                     UnX0R                  XS9S   -   nX0R                  U R                  U5      5      -   nU R                  U5      nU$ )N)encoder_hidden_statesr   r|   r   r   r   )r   querykeyxs       r%   forward#GroupViTCrossAttentionLayer.forward   sK    		%	;A>>A''NN1r'   r   )	__name__
__module____qualname____firstlineno__r   rz   r   __static_attributes____classcell__r   s   @r%   rr   rr      s    U3 U r'   rr   c                   @   ^  \ rS rSrS\4U 4S jjrSS jrS rSrU =r	$ )GroupViTAssignAttention   rt   c                   > [         TU ]  5         UR                  S-  U l        [        R
                  " UR                  UR                  5      U l        [        R
                  " UR                  UR                  5      U l        [        R
                  " UR                  UR                  5      U l        [        R
                  " UR                  UR                  5      U l	        UR                  U l
        g )N      )ry   rz   r~   r^   r   Linearq_projk_projv_projproj
assign_epsr   s     r%   rz    GroupViTAssignAttention.__init__   s    ''-
ii 2 2F4F4FGii 2 2F4F4FGii 2 2F4F4FGIIf00&2D2DE	 ++r'   c                     U(       a  U R                   (       a  [        USUS9nU$ U(       a  [        USS9nU$ [        R                  R                  USS9nU$ )N)r.   rA   r.   )trainingrM   r?   r   r    r5   )r   r|   rF   rA   s       r%   get_attn GroupViTAssignAttention.get_attn   sX    dmm!$BT:D  #Db1  }},,Tr,:r'   c                 `   UnU R                  U5      nU R                  U5      nU R                  U5      nXR                  SS5      -  U R                  -  nU R                  U5      nU R                  USSS9nXUR                  SSS9U R                  -   -  nXS-  nU R                  U5      nXv4$ )Nr   re   F)rF   rA   Tr.   r1   )	r   r   r   	transposer^   r   sumr   r   )r   r   r   valueraw_attnr|   	soft_attnouts           r%   r   GroupViTAssignAttention.forward   s    E" kk# E" MM"b11TZZ?}}X&MM(5uME	xxBx5GHliin~r'   )r   r   r   r   r^   r   )TT)
r   r   r   r   r   rz   r   r   r   r   r   s   @r%   r   r      s    ,3 ,	 r'   r   c                   <   ^  \ rS rSrS\4U 4S jjrS rS rSrU =r	$ )GroupViTTokenAssign   rt   c                 V  > [         TU ]  5         X0l        [        R                  " UR
                  UR                  S9U l        [        UR                  [        R                  R                  5      (       a  UR                  OUR                  UR                  4nU Vs/ s H  n[        XQR
                  -  5      PM     snu  pg[        XXc5      U l        [        R                  " UR
                  UR                  S9U l        [        R                  " UR
                  UR                  S9U l        [%        U5      U l        [)        U5      U l        [        R                  " UR
                  UR                  S9U l        [/        XR
                  XqR
                  5      U l        g s  snf rv   )ry   rz   num_output_groupr   r}   r~   r   norm_tokens
isinstanceassign_mlp_ratiocollectionsabcIterablerV   GroupViTMixerMLP	mlp_internorm_post_tokensnorm_xrr   pre_assign_attnr   assign
norm_new_xr   mlp_channels)	r   rt   num_group_tokenr   r   r   
tokens_dimchannels_dimr   s	           r%   rz   GroupViTTokenAssign.__init__   sD    0<<(:(:@U@UV &11;??3K3KLL ##))6+B+BC 	
 JZ#ZIYAC,>,>(>$?IY#Z 
)&:` "V-?-?VEZEZ [ll6#5#56;P;PQ:6B-f5,,v'9'9v?T?TU'0B0BLRdRde $[s   !F&c                 J    U R                  U5      nU R                  U5      nU$ )z
Args:
    group_tokens (torch.Tensor): group tokens, [batch_size, num_group_tokens, channels]

Returns:
    projected_group_tokens (torch.Tensor): [batch_size, num_output_groups, channels]
)r   r   )r   group_tokensprojected_group_tokenss      r%   project_group_token'GroupViTTokenAssign.project_group_token   s+     "&!=!%!6!67M!N%%r'   c                    U R                  U5      nU R                  U5      nU R                  U5      nU R                  X15      nU R	                  X15      u  pEXC-  nX@R                  U R                  U5      5      -   nXE4$ )z
Args:
    image_tokens (`torch.Tensor`): image tokens, of shape [batch_size, input_length, channels]
    group_tokens (`torch.Tensor`): group tokens, [batch_size, num_group_tokens, channels]
)r   r   r   r   r   r   r   )r   image_tokensr   r   new_image_tokens	attentions         r%   r   GroupViTTokenAssign.forward   s     ''5{{<0!%!9!9,!G!%!5!56L![&*kk2H&W#2+.?.?P`@a.bb**r'   )	r   r   r   r   r   r   r   r   r   )
r   r   r   r   r   rz   r   r   r   r   r   s   @r%   r   r      s!    f3 f*&+ +r'   r   c                   :   \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Sr
\R                  S-  \S'   Sr\R                  S-  \S'   Sr\R                  S-  \S'   Sr\R                  S-  \S	'   Sr\\S
'   Sr\\S'   S\\   4S jrSrg)GroupViTModelOutputi  a  
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
    Contrastive loss for image-text similarity.
logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
    The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
    similarity scores.
logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
    The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
    similarity scores.
segmentation_logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels, logits_height, logits_width)`):
    Classification scores for each pixel.

    <Tip warning={true}>

    The logits returned do not necessarily have the same size as the `pixel_values` passed as inputs. This is
    to avoid doing two interpolations and lose some quality when a user needs to resize the logits to the
    original image size as post-processing. You should always check your logits shape and resize as needed.

    </Tip>
text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
    The text embeddings obtained by applying the projection layer to the pooled output of
    [`GroupViTTextModel`].
image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
    The image embeddings obtained by applying the projection layer to the pooled output of
    [`GroupViTVisionModel`].
text_model_output (`BaseModelOutputWithPooling`):
    The output of the [`GroupViTTextModel`].
vision_model_output (`BaseModelOutputWithPooling`):
    The output of the [`GroupViTVisionModel`].
Nlosslogits_per_imagelogits_per_textsegmentation_logitstext_embedsimage_embedstext_model_outputvision_model_outputr   c                 J   ^  [        U 4S jT R                  5        5       5      $ )Nc              3   n   >#    U  H*  nUS ;  a  TU   O[        TU5      R                  5       v   M,     g7f))r   r   N)getattrto_tuple).0kr   s     r%   	<genexpr>/GroupViTModelOutput.to_tuple.<locals>.<genexpr>3  s<      
   LLDGRYZ^`aRbRkRkRmm s   25)tuplekeysr   s   `r%   r   GroupViTModelOutput.to_tuple2  s#     
YY[
 
 	
r'    )r   r   r   r   __doc__r   r"   FloatTensor__annotations__r   r   r   r   r   r   r   r   r   r   r   r   r   r'   r%   r   r     s    > &*D%

d
")15e''$.504OU&&-448**T18,0K""T)0-1L%##d*148186:3:
%* 
r'   r   c            	          ^  \ rS rSrSr    SS\\\   -  \\\4   -  S\\\\4   -  S\S\4U 4S jjjrSS\	R                  S	\S
\	R                  4S jjrSrU =r$ )GroupViTPatchEmbeddingsi9  z
Image to Patch Embedding.

image_size
patch_sizenum_channels	embed_dimc                 `  > [         TU ]  5         [        U[        R                  R
                  5      (       a  UOX4n[        U[        R                  R
                  5      (       a  UOX"4nUS   US   -  US   US   -  -  nXl        X l        XPl        [        R                  " X4X"S9U l        g )Nr   r   )kernel_sizestride)ry   rz   r   r   r   r   r   r   num_patchesr   Conv2d
projection)r   r   r   r   r   r   r   s         r%   rz    GroupViTPatchEmbeddings.__init__>  s     	#-j+//:R:R#S#SZZdYq
#-j+//:R:R#S#SZZdYq
!!}
15*Q-:VW=:XY$$&))Lgr'   pixel_valuesinterpolate_pos_encodingr   c                 >   UR                   u  p4pVU(       dV  XPR                  S   :w  d  X`R                  S   :w  a2  [        SU SU SU R                  S    SU R                  S    S3	5      eU R                  U5      R	                  S5      R                  SS5      nU$ )Nr   r   zInput image size (*z) doesn't match model ().rO   )rJ   r   
ValueErrorr   flattenr   )r   r   r   ra   r   r\   r]   r   s           r%   r   GroupViTPatchEmbeddings.forwardO  s    2>2D2D/
&'++u8J/J (% 9+,Adooa.@-AE  OOL)11!4>>q!Dr'   )r   r   r   r   )      r   i   F)r   r   r   r   r   rV   listr   rz   r"   Tensorboolr   r   r   r   s   @r%   r   r   9  s     9<,.h$s)OeCHo5h %S/)h 	h
 h h"	ELL 	D 	]b]i]i 	 	r'   r   c                      ^  \ rS rSrS\4U 4S jjrS\R                  S\S\S\R                  4S jr	SS	\R                  S
\
S\R                  4S jjrSrU =r$ )GroupViTVisionEmbeddingsi[  rt   c                   > [         TU ]  5         [        UR                  UR                  UR
                  UR                  S9U l        U R                  R                  n[        R                  " [        R                  " SX!R                  5      5      U l        [        R                  " UR                  5      U l        [        R                   " UR                  UR"                  S9U l        UR                  U l        Xl        g )N)r   r   r   r   r   rw   )ry   rz   r   r   r   r   r~   patch_embeddingsr   r   	Parameterr"   zerosposition_embeddingsDropoutdropoutr}   r   	layernormrt   )r   rt   r   r   s      r%   rz   !GroupViTVisionEmbeddings.__init__\  s     7((((,,((	!
 ++77#%<<A{L^L^0_#` zz&..1f&8&8f>S>ST ++r'   
embeddingsr\   r]   r   c                 ,   UR                   S   nU R                  R                   S   n[        R                  R	                  5       (       d  XE:X  a  X#:X  a  U R                  $ U R                  nUR                   S   nX R
                  -  nX0R
                  -  n	[        US-  5      n
UR                  SXU5      nUR                  SSSS5      n[        R                  R                  UX4SSS	9nUR                  SSSS5      R                  SSU5      nU$ )
a  
This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
images. This method is also adapted to support torch.jit tracing and no class embeddings.

Adapted from:
- https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
- https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
r   re   rP   r   r   rO   bicubicFrR   )rJ   r  r"   jit
is_tracingr   r   rY   rg   r   r    rZ   view)r   r  r\   r]   r   num_positionspatch_pos_embedr.   
new_height	new_widthsqrt_num_positionss              r%   r   1GroupViTVisionEmbeddings.interpolate_pos_encodingl  s    !&&q)0066q9 yy##%%+*F6?+++22r".
__,	&}c'9:)11!5G]`a)11!Q1=--33(	 4 
 *11!Q1=BB1b#Nr'   r   r   c                     UR                   u  p4pVU R                  XS9nU R                  U5      nUR                  5       u  p8n	U(       a  XpR	                  XuU5      -   nOXpR
                  -   nU R                  U5      nU$ )N)r   )rJ   r  r  rS   r   r  r  )
r   r   r   ra   r   r\   r]   r  seq_len_s
             r%   r    GroupViTVisionEmbeddings.forward  s    2>2D2D/
&**<*k
^^J/
!+!2
Q $#&C&CJX]&^^J#&>&>>J\\*-
r'   )rt   r  r  r  r   r  r  )r   r   r   r   r   rz   r"   r  rV   r   r  r   r   r   r   s   @r%   r	  r	  [  sh    3  $5<< $ $UX $]b]i]i $LELL D ]b]i]i  r'   r	  c            	          ^  \ rS rSrS\4U 4S jjr   SS\R                  S-  S\R                  S-  S\R                  S-  S\R                  4S	 jjr
S
rU =r$ )GroupViTTextEmbeddingsi  rt   c                 N  > [         TU ]  5         UR                  n[        R                  " UR
                  U5      U l        [        R                  " UR                  U5      U l        U R                  S[        R                  " UR                  5      R                  S5      SS9  g )Nposition_idsr   re   F)
persistent)ry   rz   r~   r   	Embedding
vocab_sizetoken_embeddingmax_position_embeddingsposition_embeddingregister_bufferr"   r#   expandr   rt   r   r   s      r%   rz   GroupViTTextEmbeddings.__init__  s    &&	!||F,=,=yI"$,,v/M/My"Y 	ELL)G)GHOOPWXej 	 	
r'   N	input_idsr&  inputs_embedsr   c                 <   Ub  UR                   S   OUR                   S   nU R                  R                  R                   S   nXE:  a  [        SU SU 35      eUc  U R                  S S 2S U24   nUc  U R                  U5      nU R                  U5      nX6-   nU$ )Nre   r   r   zRSequence length must be less than max_position_embeddings (got `sequence length`: z and max_position_embeddings: )rJ   r-  weightr   r&  r+  )r   r2  r&  r3  
seq_lengthmax_position_embeddingr  r  s           r%   r   GroupViTTextEmbeddings.forward  s     -6,AY__R(}GZGZ[]G^
!%!8!8!?!?!E!Ea!H.d,<=S<TV 
 ,,Q^<L  00;M"55lC"8
r'   )r-  r+  NNN)r   r   r   r   r   rz   r"   
LongTensorr   r  r   r   r   r   s   @r%   r$  r$    sp    

1 

 .20426	##d* &&- ((4/	
 
 r'   r$  c            
       &  ^  \ rS rSrSrS\S\S\S\S\4
U 4S jjr\S	 5       r	S
 r
SS\R                  S\R                  S-  S\R                  4S jjr  SS\R                  S\R                  S-  S\S-  S\\R                      4S jjrSrU =r$ )GroupViTStagei  zMThis corresponds to the `GroupingLayer` class in the GroupViT implementation.rt   depthnum_prev_group_tokenr   r   c           	      j  > [         TU ]  5         X l        X@l        US:  a;  [        R
                  " [        R                  " SXAR                  5      5      U l	        OS U l	        [        R                  " [        U5       Vs/ s H  n[        U5      PM     sn5      U l        US:  a  [        UUUS9U l        OS U l        US:  ab  US:  a\  [        R                   " [        R"                  " UR                  UR$                  S9['        XUR                  S-  U5      5      U l        g S U l        g s  snf )Nr   r   )rt   r   r   rw   rO   )ry   rz   r=  r   r   r  r"   r  r~   group_token
ModuleListrangeGroupViTEncoderLayerlayersr   
downsample
Sequentialr}   r   r   group_projector)r   rt   r=  r>  r   r   r!  r   s          r%   rz   GroupViTStage.__init__  s     	
.Q!||EKK?L^L^,_`D#Dmm5QV<$X<a%9&%A<$XYQ1 /!1DO #DO!#!(;#%==V//V5J5JK v?Q?QUV?VXgh$D 
 $(D # %Ys   D0c                     U R                   S L$ N)r@  r   s    r%   with_group_tokenGroupViTStage.with_group_token  s    t++r'   c                     U R                   (       a,  US S 2S U R                  * 24   US S 2U R                  * S 24   4$ US 4$ rJ  )rK  r   )r   r   s     r%   split_xGroupViTStage.split_x  sN      Q/4/////0!A8L8L7L7N4N2OOOd7Nr'   Nr   r@  r   c                 8    Uc  U$ [         R                  " X/SS9$ )Nr   r   )r"   cat)r   r   r@  s      r%   concat_xGroupViTStage.concat_x  s!    Hyy!)q11r'   hidden_statesprev_group_tokenoutput_attentionsc                    U R                   (       aM  U R                  R                  UR                  S5      SS5      nU R                  b  X@R	                  U5      -   nOSnUnU R                  XT5      nU R                   H
  nU" USS9nM     U R                  U5      u  pTSnU R                  b  U R                  XT5      u  pXXT4n	U(       a  X4-   n	U	$ )a  
Args:
    hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
    attention_mask (`torch.FloatTensor`): attention mask of size
        `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
        `(config.encoder_attention_heads,)`.
    output_attentions (`bool`, *optional*):
        Whether or not to return the grouping tensors of Grouping block.
r   re   N)attention_mask)	rK  r@  r/  rS   rG  rR  rD  rN  rE  )
r   rT  rU  rV  r@  r   cat_xlayerr   outputss
             r%   r   GroupViTStage.forward  s       **11-2D2DQ2GRPK##/),@,@AQ,RRKa-[[E%5E ! e,	??&??1:LA",Gr'   )r=  rE  rG  r@  rD  r   rJ  NF)r   r   r   r   r   r   rV   rz   propertyrK  rN  r"   r  rR  r  r   r   r   r   r   r   s   @r%   r<  r<    s    W ($ (  ( "	 (
  (  (D , ,2%,, 2U\\D5H 2TYT`T` 2 15).	&||&  ,,-&  $;	&
 
u  	!& &r'   r<  c            
          ^  \ rS rSr   SS\S\S-  S\S-  S\S-  4U 4S jjjrS\R                  S	\R                  4S
 jr	Sr
U =r$ )r   i+  Nrt   r~   intermediate_sizeoutput_sizec                   > [         TU ]  5         Xl        [        UR                     U l        Ub  UOUR                  nUb  UOUR                  nUb  UOUn[        R                  " X#5      U l
        [        R                  " X45      U l        g rJ  )ry   rz   rt   r   
hidden_actactivation_fnr~   r`  r   r   fc1fc2)r   rt   r~   r`  ra  r   s        r%   rz   GroupViTMLP.__init__,  s|     	#F$5$56%0%<k&BTBT1B1N-TZTlTl%0%<k+99[<99.<r'   rT  r   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ rJ  )re  rd  rf  )r   rT  s     r%   r   GroupViTMLP.forward<  s4    /**=9/r'   )rd  rt   re  rf  r9  )r   r   r   r   r   rV   rz   r"   r  r   r   r   r   s   @r%   r   r   +  sj     #'(,"&=$= 4Z= :	=
 4Z= = U\\ ell  r'   r   c                   (   ^  \ rS rSrU 4S jrSrU =r$ )r   iC  c                 f   > [         TU ]  UR                  SS5      5      nUR                  SS5      $ Nr   rO   )ry   r   r   )r   r   r   s     r%   r   GroupViTMixerMLP.forwardD  s-    GOAKK1-.{{1a  r'   r   )r   r   r   r   r   r   r   r   s   @r%   r   r   C  s    ! !r'   r   c                      ^  \ rS rSrSrU 4S jrS\R                  S\S\4S jr	  SS	\R                  S
\R                  S-  S\R                  S-  S\\R                  \R                  S-  4   4S jjrSrU =r$ )r{   iI  z=Multi-headed attention from 'Attention Is All You Need' paperc                   > [         TU ]  5         Xl        UR                  U l        UR
                  U l        U R                  U R                  -  U l        U R                  U R                  -  U R                  :w  a&  [        SU R                   SU R                   S35      eU R                  S-  U l	        UR                  U l        [        R                  " U R                  U R                  5      U l        [        R                  " U R                  U R                  5      U l        [        R                  " U R                  U R                  5      U l        [        R                  " U R                  U R                  5      U l        g )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: r   r   )ry   rz   rt   r~   r   num_attention_heads	num_headshead_dimr   r^   attention_dropoutr  r   r   r   r   r   out_projr   s     r%   rz   GroupViTAttention.__init__L  s   ++33$..8==4>>)T^^;MdnnM] ^NN#2'  ]]D(
//ii?ii?ii?		$..$..Ar'   rH   r   bszc                     UR                  X2U R                  U R                  5      R                  SS5      R	                  5       $ rl  )r  rq  rr  r   rh   )r   rH   r   rv  s       r%   _shapeGroupViTAttention._shape_  s5    {{3GQQRSUVWbbddr'   NrT  rX  r   r   c                    UR                  5       u  pVnUSLnU R                  U5      U R                  -  n	U(       aE  U R                  U R	                  U5      SU5      n
U R                  U R                  U5      SU5      nODU R                  U R	                  U5      SU5      n
U R                  U R                  U5      SU5      nXPR                  -  SU R                  4nU R                  XU5      R                  " U6 n	U
R                  " U6 n
UR                  " U6 nU
R                  S5      n[        R                  " XR                  SS5      5      nUR                  5       XPR                  -  Xm4:w  a-  [        SXPR                  -  Xm4 SUR                  5        35      eUbv  UR                  5       USXm4:w  a"  [        SUSXm4 SUR                  5        35      eUR                  XPR                  Xm5      U-   nUR                  XPR                  -  Xm5      n[        R                  R                  USS9nUR                  XPR                  Xm5      nUR                  XPR                  -  Xm5      n[        R                  R!                  XR                   U R"                  S	9n[        R                  " UU5      nUR                  5       XPR                  -  X`R                  4:w  a5  [        S
XPR                  X`R                  4 SUR                  5        35      eUR                  XPR                  X`R                  5      nUR                  SS5      nUR%                  XVU5      nU R'                  U5      nUU4$ )z#Input shape: Batch x Time x ChannelNre   r   rO   z$Attention weights should be of size z	, but is z!Attention mask should be of size r   )pr   z `attn_output` should be of size )rS   r   r^   rx  r   r   rq  rr  r  r"   bmmr   r   r   r    r5   r  r   rY   rt  )r   rT  rX  r   kwargsrv  tgt_lenr   is_cross_attentionquery_states
key_statesvalue_states
proj_shapesrc_lenattn_weightsattn_weights_reshaped
attn_probsattn_outputs                     r%   r   GroupViTAttention.forwardb  s=    #0"4"4"6i2$> {{=1DJJ>T[[1F%GSQJ;;t{{3H'I2sSLT[[%?SIJ;;t{{='A2sKLNN*B>
{{<#>CCZP__j1
#((*5//!$yy/C/CAq/IJ3#7"JJ6nn8Lg7_6` a %%'(* 
 %""$a(BB 7a8R7SS\]k]p]p]r\st  (,,S..'SVddL',,S>>-A7TL}},,\r,B !- 1 1#~~w X,11#2FY]]**<<<RVR_R_*`
ii
L9#"6!OO2CR_R_3`2a b$$&') 
 "&&sNNG]]S!++Aq1!))#	BmmK0111r'   )
rt   r  r   rr  r   rq  rt  r   r^   r   NN)r   r   r   r   r   rz   r"   r  rV   rx  r   r   r   r   r   r   s   @r%   r{   r{   I  s    GB&eU\\ eC ec e /3:>	D2||D2 t+D2  %0047	D2 
u||U\\D00	1D2 D2r'   r{   c                      ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  S\\	   S\R                  4S jrS	rU =r$ )
rC  i  rt   c                 <  > [         TU ]  5         UR                  U l        [	        U5      U l        [        R                  " U R                  UR                  S9U l	        [        U5      U l        [        R                  " U R                  UR                  S9U l        g rv   )ry   rz   r~   r   r{   	self_attnr   r}   r   layer_norm1r   r   layer_norm2r   s     r%   rz   GroupViTEncoderLayer.__init__  sm    ++*62<<F<Q<QRv&<<F<Q<QRr'   rT  rX  r}  r   c                     UnU R                  U5      nU R                  " SUUS.UD6u  pXA-   nUnU R                  U5      nU R                  U5      nXA-   nU$ )N)rT  rX  r   )r  r  r  r   )r   rT  rX  r}  residualr!  s         r%   r   GroupViTEncoderLayer.forward  sz     !((7>> 
')
 

 !0 ((7/ 0r'   )r   r  r  r   r  )r   r   r   r   r   rz   r"   r  r   r   r   r   r   r   r   s   @r%   rC  rC    sV    S3 S||  +,	
 
		 r'   rC  c                   d    \ rS rSr% \\S'   SrSrSr\	\
S.r\R                  " 5       S 5       rSrg	)
GroupViTPreTrainedModeli  rt   groupvit)imagetextT)rT  r[   c                 t   U R                   R                  n[        U[        R                  [        R
                  45      (       aN  [        R                  " UR                  SUS9  UR                  b   [        R                  " UR                  5        O[        U[        R                  [        R                  45      (       a  [        R                  " UR                  5        [        R                  " UR                  5        [        USS5      b`  [        R                  " UR                  5        [        R                  " UR                   5        [        R                  " UR"                  5        U R                   R$                  n[        U[&        5      (       a  [        R                  " UR(                  R                  SUS-  S9  [        R                  " UR*                  R                  SUS-  S9  [        R,                  " UR.                  [0        R2                  " UR.                  R4                  S   5      R7                  S5      5        g[        U[8        5      (       a  U R                   R$                  nUR:                  S-  S	UR                   R<                  -  S-  -  U-  nUR:                  S-  U-  n[        R                  " UR>                  R                  US
9  [        R                  " UR@                  R                  US
9  [        R                  " URB                  R                  US
9  [        R                  " URD                  R                  US
9  g[        U[F        5      (       a  U R                   R$                  nUR                   RH                  S-  S	UR                   R<                  -  S-  -  U-  nS	UR                   RH                  -  S-  U-  n[        R                  " URJ                  R                  US
9  [        R                  " URL                  R                  US
9  gg)zInitialize the weightsrC   )meanstdNrunning_meang{Gz?re   r'  r   rO   )r  )'rt   initializer_ranger   r   r   r   initnormal_r5  biaszeros_r}   BatchNorm1dones_r   r  running_varnum_batches_trackedinitializer_factorr$  r+  r-  copy_r&  r"   r#   rJ   r/  r{   r   num_hidden_layersr   r   r   rt  r   r~   re  rf  )r   module
init_rangefactorin_proj_stdout_proj_stdfc_stds          r%   _init_weights%GroupViTPreTrainedModel._init_weights  s    [[22
fryy"))455LLSjA{{&FKK(r~~ >??KK$JJv}}%v~t4@F//0

6--.F667//f455LL//66SftmTLL2299RVWJJv**ELL9L9L9R9RSU9V,W,^,^_f,gh 122[[33F!++T1q6==;Z;Z7Z_c6cdgmmK",,d2f<LLL--;?LL--;?LL--;?LL//\B,,[[33F!==44d:FMMDcDc@chl?lmpvvK&--333<vEFLL**7LL**< -r'   r   N)r   r   r   r   r   r   base_model_prefixinput_modalitiessupports_gradient_checkpointingrC  r{   _can_record_outputsr"   rf   r  r   r   r'   r%   r  r    sB    "(&*#-'
 ]]_"= "=r'   r  c                      ^  \ rS rSrS\SS4U 4S jjr   SS\R                  S\S-  S\S-  S	\S-  S\	\
-  4
S
 jjrSrU =r$ )GroupViTVisionEncoderi  rt   r   Nc                 j  > [         TU ]  5         Xl        [        R                  " [        [        UR                  5      5       Vs/ s HO  n[        UUR                  U   UR                  U   UR                  U   US:  a  UR                  US-
     OSS9PMQ     sn5      U l        SU l        g s  snf )Nr   r   )rt   r=  r   r   r>  F)ry   rz   rt   r   rA  rB  r$   depthsr<  num_group_tokensnum_output_groupsstagesgradient_checkpointing)r   rt   ir   s      r%   rz   GroupViTVisionEncoder.__init__  s    mm s6==12	 3A ! --*$*$;$;A$>%+%=%=a%@LMPQE)A)A!a%)HWX 3	
 ',#	s   AB0rT  output_hidden_statesrV  return_dictc                    Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nU(       a  SOS nU(       a  SOS nS n[	        U R
                  5       H=  u  pU(       a  XQ4-   nU	" XU5      n
U
S   nU
S   nU(       d  M-  U
S   c  M5  XjS   4-   nM?     U(       a  XQ4-   nU(       d  [        S XU4 5       5      $ [        XUS9$ )Nr   r   r   rO   c              3   .   #    U  H  oc  M  Uv   M     g 7frJ  r   )r   vs     r%   r   0GroupViTVisionEncoder.forward.<locals>.<genexpr>0  s     g$Uq$Us   	)last_hidden_staterT  r[   )rt   rV  r  r  	enumerater  r   r   )r   rT  r  rV  r  all_hidden_statesall_groupingsr   r  stagelayer_outputss              r%   r   GroupViTVisionEncoder.forward  s    2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++BYBY"6BD/T!$++.HA#$58H$H!!-?PQM)!,M(+L  ]1%5%A -q1A0C C /   14D Dg]}$Uggg+Yf
 	
r'   )rt   r  r  r9  )r   r   r   r   r   rz   r"   r  r  r   r   r   r   r   r   s   @r%   r  r    sq    ,3 , ,( -1)-#'%
||%
 #Tk%
  $;	%

 D[%
 
	 %
 %
r'   r  c                   v   ^  \ rS rSrSrS\4U 4S jjr SS\R                  S-  S\	\
   S\\-  4S	 jjrS
rU =r$ )GroupViTTextEncoderi6  z
Transformer encoder consisting of `config.num_hidden_layers` self-attention layers. Each layer is a
[`GroupViTEncoderLayer`].

Args:
    config: GroupViTTextConfig
rt   c                    > [         TU ]  5         Xl        [        R                  " [        UR                  5       Vs/ s H  n[        U5      PM     sn5      U l        SU l	        g s  snf r]  )
ry   rz   rt   r   rA  rB  r  rC  rD  r  )r   rt   r!  r   s      r%   rz   GroupViTTextEncoder.__init__?  sT    mm5QWQiQiKj$kKja%9&%AKj$kl&+# %ls   A&NrX  r}  r   c                 R    UnU R                    H  nU" UU40 UD6nM     [        US9$ )a  
Args:
    inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
        This is useful if you want more control over how to convert `input_ids` indices into associated vectors
        than the model's internal embedding lookup matrix.
    attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
        Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

        - 1 for tokens that are **not masked**,
        - 0 for tokens that are **masked**.

        [What are attention masks?](../glossary#attention-mask)
)r  )rD  r   )r   r3  rX  r}  rT  encoder_layers         r%   r   GroupViTTextEncoder.forwardE  sC    ( &![[M) M ) +
 	
r'   )rt   r  rD  rJ  )r   r   r   r   r   r   rz   r"   r  r   r   r   r   r   r   r   r   s   @r%   r  r  6  sV    ,1 , /3
 t+
 +,	

 
	 
 
r'   r  c                      ^  \ rS rSrS\4U 4S jjr\\" SS9\   SS\	R                  S-  S\	R                  S-  S	\	R                  S-  S
\\   S\4
S jj5       5       5       rSrU =r$ )GroupViTTextTransformerif  rt   c                   > [         TU ]  U5        UR                  n[        U5      U l        [        U5      U l        [        R                  " X!R                  S9U l
        UR                  U l        U R                  5         g rv   )ry   rz   r~   r$  r  r  encoderr   r}   r   final_layer_normeos_token_id	post_initr0  s      r%   rz    GroupViTTextTransformer.__init__g  sd     &&	08*62 "Y<Q<Q R #//r'   F)tie_last_hidden_statesNr2  rX  r&  r}  r   c                 *   Uc  [        S5      eUR                  5       nUR                  SUS   5      nU R                  XS9n[	        U R
                  UUS S9nUR                  SS 5        U R                  " SUUSS.UD6nUS   nU R                  U5      nU R                  S	:X  ae  U[        R                  " UR                  S   UR                  S
9UR                  [        R                  UR                  S9R!                  SS94   n	OU[        R                  " UR                  S   UR                  S
9UR                  [        R                  UR                  S9U R                  :H  R                  5       R!                  SS94   n	[#        UU	S9$ )NzYou have to specify input_idsre   )r2  r&  )rt   r3  rX  past_key_values	is_causalT)r3  rX  r  r   rO   r   )rD   r   r   )r  pooler_outputr   )r   rS   r  r  r	   rt   popr  r  r  r"   r#   rJ   r   torV   argmaxr   )
r   r2  rX  r&  r}  input_shaperT  encoder_outputsr  pooled_outputs
             r%   r   GroupViTTextTransformer.forwards  s    <==nn&NN2{27	)W+;;') 	
 	

;%+/<< ,
'),
 	,
 ,A. 112CD! ..44Q7@Q@X@XY5995F5M5MNUUZ\U]_M ..44Q7@Q@X@XY EII6G6N6NOSWSdSddB!M */'
 	
r'   )r  r  r  r  r9  )r   r   r   r   r   rz   r   r   r   r"   r  r   r   r   r   r   r   r   s   @r%   r  r  f  s    
1 
  E2 *..2,0	:
<<$&:
 t+:
 llT)	:

 +,:
 
$:
  3  :
r'   r  c                      ^  \ rS rSr% \\S'   SrS\4U 4S jjrS\R                  4S jr
S r\   SS	\R                  S-  S
\R                  S-  S\R                  S-  S\\   S\\-  4
S jj5       rSrU =r$ )GroupViTTextModeli  rt   )r  c                 d   > [         TU ]  U5        [        U5      U l        U R	                  5         g rJ  )ry   rz   r  
text_modelr  r   s     r%   rz   GroupViTTextModel.__init__  s&     1&9r'   r   c                 B    U R                   R                  R                  $ rJ  r  r  r+  r   s    r%   get_input_embeddings&GroupViTTextModel.get_input_embeddings  s    ))999r'   c                 8    XR                   R                  l        g rJ  r  )r   r   s     r%   set_input_embeddings&GroupViTTextModel.set_input_embeddings  s    5:""2r'   Nr2  rX  r&  r}  c                 .    U R                   " SUUUS.UD6$ )a  
Examples:

```python
>>> from transformers import CLIPTokenizer, GroupViTTextModel

>>> tokenizer = CLIPTokenizer.from_pretrained("nvidia/groupvit-gcc-yfcc")
>>> model = GroupViTTextModel.from_pretrained("nvidia/groupvit-gcc-yfcc")

>>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")

>>> outputs = model(**inputs)
>>> last_hidden_state = outputs.last_hidden_state
>>> pooled_output = outputs.pooler_output  # pooled (EOS token) states
```r2  rX  r&  r   r  )r   r2  rX  r&  r}  s        r%   r   GroupViTTextModel.forward  s-    .  
)%
 	
 	
r'   r  r9  )r   r   r   r   r   r   r  rz   r   Moduler  r  r   r"   r  r   r   r   r   r   r   r   r   s   @r%   r  r    s     1 :bii :;  *..2,0	
<<$&
 t+
 llT)	

 +,
 
+	+
 
r'   r  c                      ^  \ rS rSrS\4U 4S jjr\    SS\R                  S-  S\	S-  S\	S-  S\	S-  S	\
\-  4
S
 jj5       rSrU =r$ )GroupViTVisionTransformeri  rt   c                    > [         TU ]  5         Xl        UR                  n[	        U5      U l        [        U5      U l        [        R                  " X!R                  S9U l        g rv   )ry   rz   rt   r~   r	  r  r  r  r   r}   r   r  r0  s      r%   rz   "GroupViTVisionTransformer.__init__  sL    &&	26:,V4i5J5JKr'   Nr   r  rV  r  r   c                    Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nUc  [	        S5      eU R                  U5      nU R                  UUUUS9nUS   nU R                  U5      nUR                  SS9nU(       d	  Xx4USS  -   $ [        UUUR                  UR                  S9$ )Nz You have to specify pixel_values)rT  r  rV  r  r   r   r   )r  r  rT  r[   )rt   rV  r  r  r   r  r  r  r  r   rT  r[   )	r   r   r  rV  r  rT  r  r  r  s	            r%   r   !GroupViTVisionTransformer.forward  s     2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++BYBY?@@5,,'!5/#	 ' 
 ,A. !NN+<=)..1.5%58KKK)/')77&11	
 	
r'   )rt   r  r  r  NNNN)r   r   r   r   r   rz   r   r"   r   r  r   r   r   r   r   r   s   @r%   r  r    s    L3 L  26,0)-#''
''$.'
 #Tk'
  $;	'

 D['
 
+	+'
 '
r'   r  c                      ^  \ rS rSr% \\S'   SrSr0 rS\4U 4S jjr	S\
4S jr\    SS\R                  S-  S	\S-  S
\S-  S\S-  S\\-  4
S jj5       rSrU =r$ )GroupViTVisionModeli  rt   r   )r  c                 d   > [         TU ]  U5        [        U5      U l        U R	                  5         g rJ  )ry   rz   r  vision_modelr  r   s     r%   rz   GroupViTVisionModel.__init__  s'     5f=r'   r   c                 B    U R                   R                  R                  $ rJ  )r  r  r  r   s    r%   r  (GroupViTVisionModel.get_input_embeddings#  s      ++<<<r'   NrV  r  r  c                 &    U R                  UUUUS9$ )a  
Examples:

```python
>>> from PIL import Image
>>> import httpx
>>> from io import BytesIO
>>> from transformers import AutoProcessor, GroupViTVisionModel

>>> processor = AutoProcessor.from_pretrained("nvidia/groupvit-gcc-yfcc")
>>> model = GroupViTVisionModel.from_pretrained("nvidia/groupvit-gcc-yfcc")

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> with httpx.stream("GET", url) as response:
...     image = Image.open(BytesIO(response.read()))

>>> inputs = processor(images=image, return_tensors="pt")

>>> outputs = model(**inputs)
>>> last_hidden_state = outputs.last_hidden_state
>>> pooled_output = outputs.pooler_output  # pooled CLS states
```r   rV  r  r  r  )r   r   rV  r  r  r}  s         r%   r   GroupViTVisionModel.forward&  s(    >   %/!5#	 ! 
 	
r'   r  r  )r   r   r   r   r   r   main_input_namer  r  rz   r   r  r   r"   r   r  r   r   r   r   r   r   s   @r%   r   r     s      $O!3 =&= =  26)-,0#'#
''$.#
  $;#
 #Tk	#

 D[#
 
+	+#
 #
r'   r   c                     ^  \ rS rSr% \\S'   S\4U 4S jjr\\  SS\	R                  S\	R                  S-  S\	R                  S-  S\\   S	\\-  4
S
 jj5       5       r\\S\	R                  S\\   S	\\-  4S j5       5       r\\        SS\	R"                  S-  S\	R$                  S-  S\	R                  S-  S\	R"                  S-  S\S-  S\S-  S\S-  S\S-  S\\   S	\\-  4S jj5       5       rSrU =r$ )GroupViTModeliM  rt   c                 >  > [         TU ]  U5        [        UR                  [        5      (       d"  [        S[        UR                  5       S35      e[        UR                  [        5      (       d"  [        S[        UR                  5       S35      eUR                  nUR                  nUR                  U l	        UR                  U l
        UR                  U l        UR                  U l        [        U5      U l        [!        U5      U l        [$        R&                  " [$        R(                  " U R                  U R                  SS9[$        R*                  " U R                  5      [$        R,                  " SS9[$        R(                  " U R                  U R                  SS95      U l        [$        R&                  " [$        R(                  " U R                  U R                  SS9[$        R*                  " U R                  5      [$        R,                  " SS9[$        R(                  " U R                  U R                  SS95      U l        [$        R2                  " [4        R6                  " U R8                  R:                  5      5      U l        U R?                  5         g )NzOconfig.text_config is expected to be of type GroupViTTextConfig but is of type .zSconfig.vision_config is expected to be of type GroupViTVisionConfig but is of type T)r  )inplace) ry   rz   r   text_configr   	TypeErrortypevision_configr   projection_dimprojection_intermediate_dimr~   text_embed_dimvision_embed_dimr  r  r  r  r   rF  r   r  ReLUvisual_projectiontext_projectionr  r"   rH   rt   logit_scale_init_valuelogit_scaler  )r   rt   r  r  r   s       r%   rz   GroupViTModel.__init__Q  s    &,,.@AA++,-Q0 
 &..0DEE--./q2 
 ((,,$33+1+M+M()55 - 9 91+>5mD!#IId++T-M-MTXYNN4;;<GGD!IId668K8KRVW	"
  "}}IId))4+K+KRVWNN4;;<GGD!IId668K8KRVW	 
 <<T[[5W5W(XY 	r'   Nr2  rX  r&  r}  r   c                 x    U R                   " SUUUSS.UD6nUR                  nU R                  U5      Ul        U$ )a  
Examples:

```python
>>> import torch
>>> from transformers import CLIPTokenizer, GroupViTModel

>>> model = GroupViTModel.from_pretrained("nvidia/groupvit-gcc-yfcc")
>>> tokenizer = CLIPTokenizer.from_pretrained("nvidia/groupvit-gcc-yfcc")

>>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
>>> with torch.inference_mode():
...     text_features = model.get_text_features(**inputs)
```T)r2  rX  r&  r  r   )r  r  r  )r   r2  rX  r&  r}  text_outputsr  s          r%   get_text_featuresGroupViTModel.get_text_features|  sS    . 48?? 4
)%	4

 4
 %22%)%9%9-%H"r'   r   c                 p    U R                   " U4SS0UD6nU R                  UR                  5      Ul        U$ )a"  
Examples:

```python
>>> import torch
>>> from transformers import AutoProcessor, GroupViTModel
>>> from transformers.image_utils import load_image

>>> model = GroupViTModel.from_pretrained("nvidia/groupvit-gcc-yfcc")
>>> processor = AutoProcessor.from_pretrained("nvidia/groupvit-gcc-yfcc")

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = load_image(url)

>>> inputs = processor(images=image, return_tensors="pt")

>>> with torch.inference_mode():
...     image_features = model.get_image_features(**inputs)
```r  T)r  r  r  )r   r   r}  vision_outputss       r%   get_image_features GroupViTModel.get_image_features  s?    4 6:5F5F|5pae5pio5p'+'='=n>Z>Z'[$r'   return_lossrV  r  output_segmentationc	                    Ub  UOU R                   R                  nUb  UOU R                   R                  nU(       a  SnUb  UOU R                   R                  nU R	                  UUUSS9n
U R
                  " SUUUS.U	D6nU
R                  nU R                  U5      nUR                  nU R                  U5      nXR                  SSS9-  nXR                  SSS9-  nU R                  R                  5       n[        R                  " XR                  5       5      U-  nUR                  5       nSnU(       Gag  U
R                  nU R                  UR!                  SUR"                  S   5      5      nU
R$                  n['        UUR"                  SS 5      nUUR                  SSS9-  n[        R                  " UUR                  5       5      U-  nUR!                  UR"                  S   SUR"                  S   5      R)                  SSS	5      nUR!                  UR"                  S   UR"                  S	   S5      n[        R                  " UU5      U-  nUR!                  UR"                  S   UR"                  S	   UR"                  S   UR"                  S
   5      nSnU(       a  [+        U5      n[-        UUUUUUUU
S9$ )a  
return_loss (`bool`, *optional*):
    Whether or not to return the contrastive loss.
output_segmentation (`bool`, *optional*):
    Whether or not to return the segmentation logits.

Examples:

```python
>>> from PIL import Image
>>> import httpx
>>> from io import BytesIO
>>> from transformers import AutoProcessor, GroupViTModel

>>> model = GroupViTModel.from_pretrained("nvidia/groupvit-gcc-yfcc")
>>> processor = AutoProcessor.from_pretrained("nvidia/groupvit-gcc-yfcc")

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> with httpx.stream("GET", url) as response:
...     image = Image.open(BytesIO(response.read()))

>>> inputs = processor(
...     text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True
... )

>>> outputs = model(**inputs)
>>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
>>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
```NTr  r  re   r   rO   r   r   r   )r   r   r   r   r   r   r   r   r   )rt   rV  r'  r  r  r  r  r  r  normr  expr"   matmultr  rY   rJ   r[   rp   rg   r-   r   )r   r2  r   rX  r&  r&  rV  r  r'  r}  r#  r  r   r   r  r   r   
seg_logitsimage_group_embedsr[   groupinglogits_per_image_groupflatten_groupingr   s                           r%   r   GroupViTModel.forward  s   X 2C1N-TXT_T_TqTq#6#BHgHg 	  $$8$D $++JjJj 	
 **%/!5	 + 
 48?? 4
)%4
 	4
 &33--l;"00**;7 $&7&7B&7&MM!$4$4T$4$JJ &&**,,,{NN4DES*,,.
 "0!A!A!%!7!78J8R8RSUWiWoWoprWs8t!u'22J3J@R@RSTSU@VWH "46H6M6MRT^b6M6c!c%*\\2Dkmmo%VYd%d"%;%C%C""1%r;+<+<Q+?&gaA #
  (//q0A8>>RSCTVXY &<>NOR]]J#++  #Z%5%5a%8(..:KX^^\]M^J .?D"-+ *#%* .	
 		
r'   )	r  r  r  r  r  r  r  r  r  r  )NNNNNNNN)r   r   r   r   r   r   rz   r   r   r"   r  r   r   r   r   r   r$  r:  r   r  r   r   r   r   r   s   @r%   r  r  M  s   )~ )V  /3,0	<< t+ llT)	
 +, 
+	+  B ll +, 
+	+	  :  .215.204#')-,0+/|
##d*|
 ''$.|
 t+	|

 &&-|
 D[|
  $;|
 #Tk|
 "D[|
 +,|
 
$	$|
  |
r'   r  )r  r  r  r   )r   Fre   r  )Mr   collections.abcr   dataclassesr   typingr   numpyrW   r"   r    r   r  activationsr   masking_utilsr	   modeling_layersr
   modeling_outputsr   r   modeling_utilsr   processing_utilsr   utilsr   r   r   r   r   r   utils.genericr   utils.output_capturingr   configuration_groupvitr   r   r   
get_loggerr   loggerr  r&   r-   rV   r?   floatr  rM   rc   rp   r  rr   r   r   r   r   r	  r$  r<  r   r   r{   rC  r  r  r  r  r  r  r   r  __all__r   r'   r%   <module>rF     sa     !     & ! / 9 K - & j j 7 5 \ \ 
		H	%
`U\\ `ell `
-ELL -U\\ - C 5<< e t RU _d_k_k ,<:"))  -bii -`4+")) 4+n -
+ -
  -
`bii DGryy GV%RYY %PZBII Zz")) 0!{ !]2		 ]2B5 B -=o -= -=`7
BII 7
t-
")) -
`J
5 J
Z,
/ ,
^2
		 2
j3
1 3
l n
+ n
 n
b cr'   