"""PyTorch OWLv2 model."""

from collections.abc import Callable
from dataclasses import dataclass
from typing import Any

import torch
from torch import Tensor, nn

from ... import initialization as init
from ...activations import ACT2FN
from ...masking_utils import create_causal_mask
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...utils import ModelOutput, TransformersKwargs, auto_docstring, is_vision_available, logging, torch_int
from ...utils.generic import can_return_tuple, merge_with_config_defaults
from ...utils.output_capturing import capture_outputs
from .configuration_owlv2 import Owlv2Config, Owlv2TextConfig, Owlv2VisionConfig


if is_vision_available():
    from ...image_transforms import center_to_corners_format


logger = logging.get_logger(__name__)


def contrastive_loss(logits: torch.Tensor) -> torch.Tensor:
    return nn.functional.cross_entropy(logits, torch.arange(len(logits), device=logits.device))


def image_text_contrastive_loss(similarity: torch.Tensor) -> torch.Tensor:
    caption_loss = contrastive_loss(similarity)
    image_loss = contrastive_loss(similarity.t())
    return (caption_loss + image_loss) / 2.0


@dataclass
@auto_docstring
class Owlv2Output(ModelOutput):
    r"""
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
        Contrastive loss for image-text similarity.
    logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
        The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
        similarity scores.
    logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
        The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
        similarity scores.
    text_embeds (`torch.FloatTensor` of shape `(batch_size * num_max_text_queries, output_dim)`):
        The text embeddings obtained by applying the projection layer to the pooled output of [`Owlv2TextModel`].
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
        The image embeddings obtained by applying the projection layer to the pooled output of
        [`Owlv2VisionModel`].
    text_model_output (tuple[`BaseModelOutputWithPooling`]):
        The output of the [`Owlv2TextModel`].
    vision_model_output (`BaseModelOutputWithPooling`):
        The output of the [`Owlv2VisionModel`].
    """

    loss: torch.FloatTensor | None = None
    logits_per_image: torch.FloatTensor | None = None
    logits_per_text: torch.FloatTensor | None = None
    text_embeds: torch.FloatTensor | None = None
    image_embeds: torch.FloatTensor | None = None
    text_model_output: BaseModelOutputWithPooling = None
    vision_model_output: BaseModelOutputWithPooling = None

    def to_tuple(self) -> tuple[Any]:
        return tuple(
            self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple()
            for k in self.keys()
        )
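

# The two loss helpers above implement the symmetric CLIP-style contrastive objective:
# `image_text_contrastive_loss` averages cross-entropy over both axes of the similarity
# matrix, with matched image/text pairs on the diagonal as targets. A minimal sketch
# (illustrative only, with random logits standing in for real similarities):
#
#     similarity = torch.randn(4, 4)  # 4 images x 4 captions, diagonal = matched pairs
#     loss = image_text_contrastive_loss(similarity)
#     # equivalent to 0.5 * (cross_entropy over rows + cross_entropy over columns)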


def _upcast(t: Tensor) -> Tensor:
    # Protect from numerical overflows in multiplications by upcasting to the equivalent higher type
    if t.is_floating_point():
        return t if t.dtype in (torch.float32, torch.float64) else t.float()
    return t if t.dtype in (torch.int32, torch.int64) else t.int()


def box_area(boxes: Tensor) -> Tensor:
    """
    Computes the area of a set of bounding boxes, which are specified by its (x1, y1, x2, y2) coordinates.

    Args:
        boxes (`torch.FloatTensor` of shape `(number_of_boxes, 4)`):
            Boxes for which the area will be computed. They are expected to be in (x1, y1, x2, y2) format with `0 <= x1
            < x2` and `0 <= y1 < y2`.

    Returns:
        `torch.FloatTensor`: a tensor containing the area for each box.
    """
    boxes = _upcast(boxes)
    return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])


def box_iou(boxes1: Tensor, boxes2: Tensor) -> tuple[Tensor, Tensor]:
    area1 = box_area(boxes1)
    area2 = box_area(boxes2)

    left_top = torch.max(boxes1[:, None, :2], boxes2[:, :2])  # [N,M,2]
    right_bottom = torch.min(boxes1[:, None, 2:], boxes2[:, 2:])  # [N,M,2]

    width_height = (right_bottom - left_top).clamp(min=0)  # [N,M,2]
    inter = width_height[:, :, 0] * width_height[:, :, 1]  # [N,M]

    union = area1[:, None] + area2 - inter

    iou = inter / union
    return iou, union


def generalized_box_iou(boxes1, boxes2):
    """
    Generalized IoU from https://giou.stanford.edu/. The boxes should be in [x0, y0, x1, y1] (corner) format.

    Returns:
        `torch.FloatTensor`: a [N, M] pairwise matrix, where N = len(boxes1) and M = len(boxes2)
    """
    # Degenerate boxes give inf/nan results, so do an early sanity check
    if not (boxes1[:, 2:] >= boxes1[:, :2]).all():
        raise ValueError(f"boxes1 must be in [x0, y0, x1, y1] (corner) format, but got {boxes1}")
    if not (boxes2[:, 2:] >= boxes2[:, :2]).all():
        raise ValueError(f"boxes2 must be in [x0, y0, x1, y1] (corner) format, but got {boxes2}")
    iou, union = box_iou(boxes1, boxes2)

    top_left = torch.min(boxes1[:, None, :2], boxes2[:, :2])
    bottom_right = torch.max(boxes1[:, None, 2:], boxes2[:, 2:])

    width_height = (bottom_right - top_left).clamp(min=0)  # [N,M,2]
    area = width_height[:, :, 0] * width_height[:, :, 1]

    return iou - (area - union) / area
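

# A hand-worked sketch of what `generalized_box_iou` returns for two toy boxes in
# (x0, y0, x1, y1) corner format (values are illustrative only):
#
#     boxes1 = torch.tensor([[0.0, 0.0, 2.0, 2.0]])  # area 4
#     boxes2 = torch.tensor([[1.0, 1.0, 3.0, 3.0]])  # area 4; intersection 1, union 7
#     generalized_box_iou(boxes1, boxes2)
#     # IoU = 1/7; the smallest enclosing box [0, 0, 3, 3] has area 9, so
#     # GIoU = 1/7 - (9 - 7)/9 ≈ -0.079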


@dataclass
@auto_docstring(
    custom_intro="""
    Output type of [`Owlv2ForObjectDetection`].
    """
)
class Owlv2ObjectDetectionOutput(ModelOutput):
    r"""
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided):
        Total loss as a linear combination of a negative log-likelihood (cross-entropy) for class prediction and a
        bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized
        scale-invariant IoU loss.
    loss_dict (`Dict`, *optional*):
        A dictionary containing the individual losses. Useful for logging.
    logits (`torch.FloatTensor` of shape `(batch_size, num_patches, num_queries)`):
        Classification logits (including no-object) for all queries.
    objectness_logits (`torch.FloatTensor` of shape `(batch_size, num_patches, 1)`):
        The objectness logits of all image patches. OWLv2 represents images as a set of image patches where the
        total number of patches is (image_size / patch_size)**2.
    pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_patches, 4)`):
        Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These
        values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding
        possible padding). You can use [`~Owlv2ImageProcessor.post_process_object_detection`] to retrieve the
        unnormalized bounding boxes.
    text_embeds (`torch.FloatTensor` of shape `(batch_size, num_max_text_queries, output_dim)`):
        The text embeddings obtained by applying the projection layer to the pooled output of [`Owlv2TextModel`].
    image_embeds (`torch.FloatTensor` of shape `(batch_size, patch_size, patch_size, output_dim)`):
        Pooled output of [`Owlv2VisionModel`]. OWLv2 represents images as a set of image patches and computes
        image embeddings for each patch.
    class_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`):
        Class embeddings of all image patches. OWLv2 represents images as a set of image patches where the total
        number of patches is (image_size / patch_size)**2.
    text_model_output (tuple[`BaseModelOutputWithPooling`]):
        The output of the [`Owlv2TextModel`].
    vision_model_output (`BaseModelOutputWithPooling`):
        The output of the [`Owlv2VisionModel`].
    """

    loss: torch.FloatTensor | None = None
    loss_dict: dict | None = None
    logits: torch.FloatTensor | None = None
    objectness_logits: torch.FloatTensor | None = None
    pred_boxes: torch.FloatTensor | None = None
    text_embeds: torch.FloatTensor | None = None
    image_embeds: torch.FloatTensor | None = None
    class_embeds: torch.FloatTensor | None = None
    text_model_output: BaseModelOutputWithPooling = None
    vision_model_output: BaseModelOutputWithPooling = None

    def to_tuple(self) -> tuple[Any]:
        return tuple(
            self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple()
            for k in self.keys()
        )
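

# `pred_boxes` above are (center_x, center_y, width, height) fractions of the padded input.
# A minimal sketch of turning one such box into absolute (x0, y0, x1, y1) pixel corners by
# hand, assuming a 960x960 input (the processor's `post_process_object_detection` does this
# for you):
#
#     box = torch.tensor([0.5, 0.5, 0.2, 0.1])  # cx, cy, w, h in [0, 1]
#     corners = center_to_corners_format(box.unsqueeze(0)) * 960
#     # -> [[384., 432., 576., 528.]]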


@dataclass
@auto_docstring(
    custom_intro="""
    Output type of [`Owlv2ForObjectDetection.image_guided_detection`].
    """
)
class Owlv2ImageGuidedObjectDetectionOutput(ModelOutput):
    r"""
    logits (`torch.FloatTensor` of shape `(batch_size, num_patches, num_queries)`):
        Classification logits (including no-object) for all queries.
    image_embeds (`torch.FloatTensor` of shape `(batch_size, patch_size, patch_size, output_dim)`):
        Pooled output of [`Owlv2VisionModel`]. OWLv2 represents images as a set of image patches and computes
        image embeddings for each patch.
    query_image_embeds (`torch.FloatTensor` of shape `(batch_size, patch_size, patch_size, output_dim)`):
        Pooled output of [`Owlv2VisionModel`]. OWLv2 represents images as a set of image patches and computes
        image embeddings for each patch.
    target_pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_patches, 4)`):
        Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These
        values are normalized in [0, 1], relative to the size of each individual target image in the batch
        (disregarding possible padding). You can use [`~Owlv2ImageProcessor.post_process_object_detection`] to
        retrieve the unnormalized bounding boxes.
    query_pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_patches, 4)`):
        Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These
        values are normalized in [0, 1], relative to the size of each individual query image in the batch
        (disregarding possible padding). You can use [`~Owlv2ImageProcessor.post_process_object_detection`] to
        retrieve the unnormalized bounding boxes.
    class_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`):
        Class embeddings of all image patches. OWLv2 represents images as a set of image patches where the total
        number of patches is (image_size / patch_size)**2.
    text_model_output (tuple[`BaseModelOutputWithPooling`]):
        The output of the [`Owlv2TextModel`].
    vision_model_output (`BaseModelOutputWithPooling`):
        The output of the [`Owlv2VisionModel`].
    """

    logits: torch.FloatTensor | None = None
    image_embeds: torch.FloatTensor | None = None
    query_image_embeds: torch.FloatTensor | None = None
    target_pred_boxes: torch.FloatTensor | None = None
    query_pred_boxes: torch.FloatTensor | None = None
    class_embeds: torch.FloatTensor | None = None
    text_model_output: BaseModelOutputWithPooling = None
    vision_model_output: BaseModelOutputWithPooling = None

    def to_tuple(self) -> tuple[Any]:
        return tuple(
            self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple()
            for k in self.keys()
        )


class Owlv2VisionEmbeddings(nn.Module):
    def __init__(self, config: Owlv2VisionConfig):
        super().__init__()
        self.patch_size = config.patch_size
        self.config = config
        self.embed_dim = config.hidden_size
        self.class_embedding = nn.Parameter(torch.randn(config.hidden_size))

        self.patch_embedding = nn.Conv2d(
            in_channels=config.num_channels,
            out_channels=self.embed_dim,
            kernel_size=config.patch_size,
            stride=config.patch_size,
            bias=False,
        )

        self.num_patches = (config.image_size // config.patch_size) ** 2
        self.num_positions = self.num_patches + 1
        self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
        self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)

    def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
        """
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        """
        num_patches = embeddings.shape[1] - 1
        position_embedding = self.position_embedding.weight.unsqueeze(0)
        num_positions = position_embedding.shape[1] - 1

        # always interpolate when tracing so the exported model works for dynamic input shapes
        if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
            return self.position_embedding(self.position_ids)

        class_pos_embed = position_embedding[:, :1]
        patch_pos_embed = position_embedding[:, 1:]

        dim = embeddings.shape[-1]

        new_height = height // self.patch_size
        new_width = width // self.patch_size

        sqrt_num_positions = torch_int(num_positions**0.5)
        patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)

        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed,
            size=(new_height, new_width),
            mode="bicubic",
            align_corners=False,
        )

        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)

        return torch.cat((class_pos_embed, patch_pos_embed), dim=1)

    def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding: bool = False) -> torch.Tensor:
        batch_size, _, height, width = pixel_values.shape
        patch_embeds = self.patch_embedding(pixel_values)  # [batch_size, embed_dim, grid_h, grid_w]
        patch_embeds = patch_embeds.flatten(2).transpose(1, 2)

        class_embeds = self.class_embedding.expand(batch_size, 1, -1)
        embeddings = torch.cat([class_embeds, patch_embeds], dim=1)

        if interpolate_pos_encoding:
            embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
        else:
            embeddings = embeddings + self.position_embedding(self.position_ids)

        return embeddings
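

# Patch-grid arithmetic for the embeddings above, assuming the 960-pixel, patch-16 setup of
# the base OWLv2 checkpoints (a sketch; the real values come from the vision config):
#
#     num_patches = (960 // 16) ** 2  # 60 x 60 grid -> 3600 patch tokens
#     seq_len = num_patches + 1       # +1 for the prepended class embedding
#     # `interpolate_pos_encoding` resizes the 60x60 position grid when height/width differ.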


class Owlv2TextEmbeddings(nn.Module):
    def __init__(self, config: Owlv2TextConfig):
        super().__init__()
        self.token_embedding = nn.Embedding(config.vocab_size, config.hidden_size)
        self.position_embedding = nn.Embedding(config.max_position_embeddings, config.hidden_size)

        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )

    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        position_ids: torch.LongTensor | None = None,
        inputs_embeds: torch.FloatTensor | None = None,
    ) -> torch.Tensor:
        seq_length = input_ids.shape[-1] if input_ids is not None else inputs_embeds.shape[-2]

        if position_ids is None:
            position_ids = self.position_ids[:, :seq_length]

        if inputs_embeds is None:
            inputs_embeds = self.token_embedding(input_ids)

        position_embeddings = self.position_embedding(position_ids)
        embeddings = inputs_embeds + position_embeddings

        return embeddings


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: torch.Tensor | None,
    scaling: float | None = None,
    dropout: float = 0.0,
    **kwargs: Unpack[TransformersKwargs],
):
    if scaling is None:
        scaling = query.size(-1) ** -0.5

    attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling
    if attention_mask is not None:
        attn_weights = attn_weights + attention_mask

    attn_weights = nn.functional.softmax(attn_weights, dim=-1)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)

    attn_output = torch.matmul(attn_weights, value)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights


class Owlv2Attention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )
        self.scale = self.head_dim**-0.5
        self.dropout = config.attention_dropout
        self.is_causal = False

        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple[torch.Tensor, torch.Tensor | None]:
        """Input shape: Batch x Time x Channel"""
        input_shape = hidden_states.shape[:-1]
        hidden_shape = (*input_shape, -1, self.head_dim)

        query_states = self.q_proj(hidden_states).view(*hidden_shape).transpose(1, 2)
        key_states = self.k_proj(hidden_states).view(*hidden_shape).transpose(1, 2)
        value_states = self.v_proj(hidden_states).view(*hidden_shape).transpose(1, 2)

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            is_causal=self.is_causal,
            scaling=self.scale,
            dropout=0.0 if not self.training else self.dropout,
            **kwargs,
        )

        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
        attn_output = self.out_proj(attn_output)

        return attn_output, attn_weights


class Owlv2MLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.activation_fn = ACT2FN[config.hidden_act]
        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.fc1(hidden_states)
        hidden_states = self.activation_fn(hidden_states)
        hidden_states = self.fc2(hidden_states)
        return hidden_states


class Owlv2EncoderLayer(GradientCheckpointingLayer):
    def __init__(self, config: Owlv2TextConfig | Owlv2VisionConfig):
        super().__init__()
        self.embed_dim = config.hidden_size
        self.self_attn = Owlv2Attention(config)
        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
        self.mlp = Owlv2MLP(config)
        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
        **kwargs: Unpack[TransformersKwargs],
    ) -> torch.FloatTensor:
        residual = hidden_states

        hidden_states = self.layer_norm1(hidden_states)
        hidden_states, _ = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            **kwargs,
        )
        hidden_states = residual + hidden_states

        residual = hidden_states
        hidden_states = self.layer_norm2(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states

        return hidden_states


@auto_docstring
class Owlv2PreTrainedModel(PreTrainedModel):
    config: Owlv2Config
    base_model_prefix = "owlv2"
    input_modalities = ("image", "text")
    supports_gradient_checkpointing = True
    _supports_sdpa = True
    _supports_flash_attn = True
    _supports_flex_attn = True
    _supports_attention_backend = True
    _no_split_modules = ["Owlv2EncoderLayer"]
    _can_record_outputs = {
        "hidden_states": Owlv2EncoderLayer,
        "attentions": Owlv2Attention,
    }

    @torch.no_grad()
    def _init_weights(self, module: nn.Module):
        """Initialize the weights"""
        factor = self.config.initializer_factor
        if isinstance(module, Owlv2TextEmbeddings):
            init.normal_(module.token_embedding.weight, mean=0.0, std=factor * 0.02)
            init.normal_(module.position_embedding.weight, mean=0.0, std=factor * 0.02)
            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
        elif isinstance(module, Owlv2VisionEmbeddings):
            init.normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor)
            init.normal_(module.patch_embedding.weight, std=module.config.initializer_range * factor)
            init.normal_(module.position_embedding.weight, std=module.config.initializer_range * factor)
            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
        elif isinstance(module, Owlv2Attention):
            in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
            out_proj_std = (module.embed_dim**-0.5) * factor
            init.normal_(module.q_proj.weight, std=in_proj_std)
            init.normal_(module.k_proj.weight, std=in_proj_std)
            init.normal_(module.v_proj.weight, std=in_proj_std)
            init.normal_(module.out_proj.weight, std=out_proj_std)
        elif isinstance(module, Owlv2MLP):
            in_proj_std = (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
            fc_std = (2 * module.config.hidden_size) ** -0.5 * factor
            init.normal_(module.fc1.weight, std=fc_std)
            init.normal_(module.fc2.weight, std=in_proj_std)
        elif isinstance(module, Owlv2Model):
            init.normal_(module.text_projection.weight, std=module.text_embed_dim**-0.5 * factor)
            init.normal_(module.visual_projection.weight, std=module.vision_embed_dim**-0.5 * factor)
            init.constant_(module.logit_scale, self.config.logit_scale_init_value)
        elif isinstance(module, Owlv2ForObjectDetection):
            init.copy_(module.box_bias, module.compute_box_bias(module.num_patches_height, module.num_patches_width))
        if isinstance(module, nn.LayerNorm):
            init.zeros_(module.bias)
            init.ones_(module.weight)
        if isinstance(module, nn.Linear):
            init.normal_(module.weight, mean=0.0, std=factor)
            if module.bias is not None:
                init.zeros_(module.bias)


class Owlv2Encoder(nn.Module):
    """
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`Owlv2EncoderLayer`].

    Args:
        config: Owlv2Config
    """

    def __init__(self, config: Owlv2Config):
        super().__init__()
        self.config = config
        self.layers = nn.ModuleList([Owlv2EncoderLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    def forward(
        self,
        inputs_embeds: torch.Tensor,
        attention_mask: torch.Tensor | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> BaseModelOutput:
        hidden_states = inputs_embeds
        for encoder_layer in self.layers:
            hidden_states = encoder_layer(
                hidden_states,
                attention_mask,
                **kwargs,
            )

        return BaseModelOutput(last_hidden_state=hidden_states)
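

# A minimal sketch of driving the encoder directly (normally done by the text/vision
# transformers below); the config values here are assumptions for illustration only:
#
#     config = Owlv2VisionConfig(hidden_size=768, num_hidden_layers=12)
#     encoder = Owlv2Encoder(config)
#     hidden = torch.randn(1, 3601, 768)               # [batch, seq, hidden]
#     out = encoder(inputs_embeds=hidden, attention_mask=None)
#     out.last_hidden_state.shape                      # torch.Size([1, 3601, 768])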


class Owlv2TextTransformer(Owlv2PreTrainedModel):
    def __init__(self, config: Owlv2TextConfig):
        super().__init__(config)
        embed_dim = config.hidden_size
        self.embeddings = Owlv2TextEmbeddings(config)
        self.encoder = Owlv2Encoder(config)
        self.final_layer_norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)

        self.post_init()

    @capture_outputs(tie_last_hidden_states=False)
    @auto_docstring
    def forward(
        self,
        input_ids: torch.Tensor | None = None,
        attention_mask: torch.Tensor | None = None,
        position_ids: torch.Tensor | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple | BaseModelOutputWithPooling:
        r"""
        input_ids (`torch.LongTensor` of shape `(batch_size * num_max_text_queries, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See
            [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input
            IDs?](../glossary#input-ids)
        """
        input_shape = input_ids.size()
        input_ids = input_ids.view(-1, input_shape[-1])
        hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids)

        # OWLv2's text model uses a causal mask, prepare it here
        attention_mask = create_causal_mask(
            config=self.config,
            input_embeds=hidden_states,
            attention_mask=attention_mask,
            past_key_values=None,
        )

        encoder_outputs: BaseModelOutput = self.encoder(
            inputs_embeds=hidden_states,
            attention_mask=attention_mask,
            **kwargs,
        )

        last_hidden_state = encoder_outputs.last_hidden_state
        last_hidden_state = self.final_layer_norm(last_hidden_state)

        # take features from the end-of-token embedding (the end-of-token is the highest token id in each sequence)
        pooled_output = last_hidden_state[
            torch.arange(last_hidden_state.shape[0], device=last_hidden_state.device),
            input_ids.to(torch.int).argmax(dim=-1).to(last_hidden_state.device),
        ]

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
        )
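

# How the pooled text embedding is selected above, as a standalone sketch: the end-of-text
# token has the highest id in the CLIP-style vocabulary, so `argmax` over the raw ids finds
# its position in each (padded) sequence. The token ids below are assumed values:
#
#     input_ids = torch.tensor([[49406, 320, 1125, 49407, 0, 0]])  # 49407 = assumed EOS id
#     eos_positions = input_ids.to(torch.int).argmax(dim=-1)       # tensor([3])
#     # pooled_output = last_hidden_state[torch.arange(1), eos_positions]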


@auto_docstring
class Owlv2TextModel(Owlv2PreTrainedModel):
    config: Owlv2TextConfig
    input_modalities = ("text",)

    def __init__(self, config: Owlv2TextConfig):
        super().__init__(config)
        self.text_model = Owlv2TextTransformer(config)
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.text_model.embeddings.token_embedding

    def set_input_embeddings(self, value):
        self.text_model.embeddings.token_embedding = value

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: torch.Tensor | None = None,
        attention_mask: torch.Tensor | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple | BaseModelOutputWithPooling:
        r"""
        input_ids (`torch.LongTensor` of shape `(batch_size * num_max_text_queries, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See
            [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input
            IDs?](../glossary#input-ids)

        Examples:
        ```python
        >>> from transformers import AutoProcessor, Owlv2TextModel

        >>> model = Owlv2TextModel.from_pretrained("google/owlv2-base-patch16")
        >>> processor = AutoProcessor.from_pretrained("google/owlv2-base-patch16")
        >>> inputs = processor(
        ...     text=[["a photo of a cat", "a photo of a dog"], ["photo of an astronaut"]], return_tensors="pt"
        ... )
        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled (EOS token) states
        ```"""
        return self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            **kwargs,
        )


class Owlv2VisionTransformer(Owlv2PreTrainedModel):
    def __init__(self, config: Owlv2VisionConfig):
        super().__init__(config)
        self.embeddings = Owlv2VisionEmbeddings(config)
        self.pre_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.encoder = Owlv2Encoder(config)
        self.post_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

        self.post_init()

    @capture_outputs(tie_last_hidden_states=False)
    @auto_docstring
    def forward(
        self,
        pixel_values: torch.FloatTensor,
        interpolate_pos_encoding: bool = False,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple | BaseModelOutputWithPooling:
        # Cast the input to the expected `dtype` of the patch embedding weights
        expected_input_dtype = self.embeddings.patch_embedding.weight.dtype
        pixel_values = pixel_values.to(expected_input_dtype)

        hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding)
        hidden_states = self.pre_layernorm(hidden_states)

        encoder_outputs: BaseModelOutput = self.encoder(
            inputs_embeds=hidden_states,
            **kwargs,
        )

        last_hidden_state = encoder_outputs.last_hidden_state
        pooled_output = last_hidden_state[:, 0, :]
        pooled_output = self.post_layernorm(pooled_output)

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
        )


@auto_docstring
class Owlv2VisionModel(Owlv2PreTrainedModel):
    config: Owlv2VisionConfig
    main_input_name = "pixel_values"
    input_modalities = ("image",)

    def __init__(self, config: Owlv2VisionConfig):
        super().__init__(config)
        self.vision_model = Owlv2VisionTransformer(config)
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.vision_model.embeddings.patch_embedding

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        pixel_values: torch.FloatTensor | None = None,
        interpolate_pos_encoding: bool = False,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple | BaseModelOutputWithPooling:
        r"""
        Examples:
        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, Owlv2VisionModel

        >>> model = Owlv2VisionModel.from_pretrained("google/owlv2-base-patch16")
        >>> processor = AutoProcessor.from_pretrained("google/owlv2-base-patch16")
        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled CLS states
        ```"""
        return self.vision_model(
            pixel_values=pixel_values,
            interpolate_pos_encoding=interpolate_pos_encoding,
            **kwargs,
        )


@auto_docstring
class Owlv2Model(Owlv2PreTrainedModel):
    config: Owlv2Config

    def __init__(self, config: Owlv2Config):
        super().__init__(config)

        text_config = config.text_config
        vision_config = config.vision_config

        self.projection_dim = config.projection_dim
        self.text_embed_dim = text_config.hidden_size
        self.vision_embed_dim = vision_config.hidden_size

        self.text_model = Owlv2TextTransformer(text_config)
        self.vision_model = Owlv2VisionTransformer(vision_config)

        self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False)
        self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False)
        self.logit_scale = nn.Parameter(torch.tensor(config.logit_scale_init_value))

        # Initialize weights and apply final processing
        self.post_init()

    @can_return_tuple
    @auto_docstring
    def get_text_features(
        self,
        input_ids: torch.Tensor,
        attention_mask: torch.Tensor | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple | BaseModelOutputWithPooling:
        r"""
        input_ids (`torch.LongTensor` of shape `(batch_size * num_max_text_queries, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See
            [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input
            IDs?](../glossary#input-ids)

        Examples:
        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, Owlv2Model

        >>> model = Owlv2Model.from_pretrained("google/owlv2-base-patch16-ensemble")
        >>> processor = AutoProcessor.from_pretrained("google/owlv2-base-patch16-ensemble")
        >>> inputs = processor(
        ...     text=[["a photo of a cat", "a photo of a dog"], ["photo of an astronaut"]], return_tensors="pt"
        ... )
        >>> with torch.inference_mode():
        ...     text_features = model.get_text_features(**inputs)
        ```"""
        text_outputs = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            **kwargs,
        )
        pooled_output = text_outputs.pooler_output
        text_outputs.pooler_output = self.text_projection(pooled_output)
        return text_outputs

    @can_return_tuple
    @auto_docstring
    def get_image_features(
        self,
        pixel_values: torch.FloatTensor,
        interpolate_pos_encoding: bool = False,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple | BaseModelOutputWithPooling:
        r"""
        Examples:
        ```python
        >>> import torch
        >>> from transformers.image_utils import load_image
        >>> from transformers import AutoProcessor, Owlv2Model

        >>> model = Owlv2Model.from_pretrained("google/owlv2-base-patch16-ensemble")
        >>> processor = AutoProcessor.from_pretrained("google/owlv2-base-patch16-ensemble")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = load_image(url)

        >>> inputs = processor(images=image, return_tensors="pt")
        >>> with torch.inference_mode():
        ...     image_features = model.get_image_features(**inputs)
        ```"""
        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            interpolate_pos_encoding=interpolate_pos_encoding,
            **kwargs,
        )
        vision_outputs.pooler_output = self.visual_projection(vision_outputs.pooler_output)
        return vision_outputs

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        pixel_values: torch.FloatTensor | None = None,
        attention_mask: torch.Tensor | None = None,
        return_loss: bool | None = None,
        interpolate_pos_encoding: bool = False,
        return_base_image_embeds: bool | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple | Owlv2Output:
        r"""
        return_loss (`bool`, *optional*):
            Whether or not to return the contrastive loss.
        return_base_image_embeds (`bool`, *optional*):
            Whether or not to return the base image embeddings.

        Examples:
        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, Owlv2Model

        >>> model = Owlv2Model.from_pretrained("google/owlv2-base-patch16-ensemble")
        >>> processor = AutoProcessor.from_pretrained("google/owlv2-base-patch16-ensemble")
        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))
        >>> inputs = processor(text=[["a photo of a cat", "a photo of a dog"]], images=image, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
        >>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
        ```"""
        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            interpolate_pos_encoding=interpolate_pos_encoding,
            **kwargs,
        )
        text_outputs = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            **kwargs,
        )

        image_embeds = vision_outputs.pooler_output
        image_embeds = self.visual_projection(image_embeds)
        text_embeds = text_outputs.pooler_output
        text_embeds = self.text_projection(text_embeds)

        # normalized features
        text_embeds_norm = text_embeds / torch.linalg.norm(text_embeds, ord=2, dim=-1, keepdim=True)
        image_embeds = image_embeds / torch.linalg.norm(image_embeds, ord=2, dim=-1, keepdim=True)

        # cosine similarity as logits, on the correct device
        logit_scale = self.logit_scale.exp().to(image_embeds.device)

        logits_per_text = torch.matmul(text_embeds_norm, image_embeds.t()) * logit_scale
        logits_per_image = logits_per_text.t()

        loss = None
        if return_loss:
            loss = image_text_contrastive_loss(logits_per_text)

        text_embeds = text_embeds_norm

        return Owlv2Output(
            loss=loss,
            logits_per_image=logits_per_image,
            logits_per_text=logits_per_text,
            text_embeds=text_embeds,
            image_embeds=image_embeds,
            text_model_output=text_outputs,
            vision_model_output=vision_outputs,
        )
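

# The similarity logits above are scaled by a learned temperature: `logit_scale` is stored as
# a log-value and exponentiated in `forward`. A sketch of the effect (numbers illustrative):
#
#     cosine_sim = torch.tensor([[0.31, 0.12]])   # normalized embeddings -> values in [-1, 1]
#     logit_scale = torch.tensor(2.6592).exp()    # ~14.3, the usual CLIP-style init
#     (cosine_sim * logit_scale).softmax(dim=1)   # much sharper than softmax on raw similarities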


class Owlv2BoxPredictionHead(nn.Module):
    def __init__(self, config: Owlv2Config, out_dim: int = 4):
        super().__init__()

        width = config.vision_config.hidden_size
        self.dense0 = nn.Linear(width, width)
        self.dense1 = nn.Linear(width, width)
        self.gelu = nn.GELU()
        self.dense2 = nn.Linear(width, out_dim)

    def forward(self, image_features: torch.Tensor) -> torch.FloatTensor:
        output = self.dense0(image_features)
        output = self.gelu(output)
        output = self.dense1(output)
        output = self.gelu(output)
        output = self.dense2(output)
        return output


class Owlv2ClassPredictionHead(nn.Module):
    def __init__(self, config: Owlv2Config):
        super().__init__()

        out_dim = config.text_config.hidden_size
        self.query_dim = config.vision_config.hidden_size

        self.dense0 = nn.Linear(self.query_dim, out_dim)
        self.logit_shift = nn.Linear(self.query_dim, 1)
        self.logit_scale = nn.Linear(self.query_dim, 1)
        self.elu = nn.ELU()

    def forward(
        self,
        image_embeds: torch.FloatTensor,
        query_embeds: torch.FloatTensor | None,
        query_mask: torch.Tensor | None,
    ) -> tuple[torch.FloatTensor]:
        image_class_embeds = self.dense0(image_embeds)
        if query_embeds is None:
            device = image_class_embeds.device
            batch_size, num_patches = image_class_embeds.shape[:2]
            pred_logits = torch.zeros((batch_size, num_patches, self.query_dim)).to(device)
            return (pred_logits, image_class_embeds)

        # Normalize image and text features
        image_class_embeds = image_class_embeds / (torch.linalg.norm(image_class_embeds, dim=-1, keepdim=True) + 1e-6)
        query_embeds = query_embeds / (torch.linalg.norm(query_embeds, dim=-1, keepdim=True) + 1e-6)

        # Get class predictions
        pred_logits = torch.einsum("...pd,...qd->...pq", image_class_embeds, query_embeds)

        # Apply a learnable shift and scale to logits
        logit_shift = self.logit_shift(image_embeds)
        logit_scale = self.logit_scale(image_embeds)
        logit_scale = self.elu(logit_scale) + 1
        pred_logits = (pred_logits + logit_shift) * logit_scale

        if query_mask is not None:
            if query_mask.ndim > 1:
                query_mask = torch.unsqueeze(query_mask, dim=-2)

            pred_logits = torch.where(query_mask == 0, torch.finfo(pred_logits.dtype).min, pred_logits)
            pred_logits = pred_logits.to(torch.float32)

        return (pred_logits, image_class_embeds)


@auto_docstring
class Owlv2ForObjectDetection(Owlv2PreTrainedModel):
    config: Owlv2Config

    def __init__(self, config: Owlv2Config):
        super().__init__(config)

        self.owlv2 = Owlv2Model(config)
        self.class_head = Owlv2ClassPredictionHead(config)
        self.box_head = Owlv2BoxPredictionHead(config)
        self.objectness_head = Owlv2BoxPredictionHead(config, out_dim=1)

        self.layer_norm = nn.LayerNorm(config.vision_config.hidden_size, eps=config.vision_config.layer_norm_eps)
        self.sigmoid = nn.Sigmoid()

        self.config = config
        self.num_patches_height = self.config.vision_config.image_size // self.config.vision_config.patch_size
        self.num_patches_width = self.config.vision_config.image_size // self.config.vision_config.patch_size
        self.register_buffer(
            "box_bias",
            self.compute_box_bias(self.num_patches_height, self.num_patches_width),
            persistent=False,
        )

        # Initialize weights and apply final processing
        self.post_init()

    @staticmethod
    def normalize_grid_corner_coordinates(num_patches_height: int, num_patches_width: int) -> torch.Tensor:
        # Create grid coordinates
        x_coordinates = torch.arange(1, num_patches_width + 1, dtype=torch.float32)
        y_coordinates = torch.arange(1, num_patches_height + 1, dtype=torch.float32)
        xx, yy = torch.meshgrid(x_coordinates, y_coordinates, indexing="xy")

        # Stack the coordinates and divide by their respective patch counts
        box_coordinates = torch.stack((xx, yy), dim=-1)
        box_coordinates[..., 0] /= num_patches_width
        box_coordinates[..., 1] /= num_patches_height

        # Flatten (h, w, 2) -> (h*w, 2)
        box_coordinates = box_coordinates.view(-1, 2)

        return box_coordinates

    def objectness_predictor(self, image_features: torch.FloatTensor) -> torch.FloatTensor:
        """Predicts the probability that each image feature token is an object.

    image_features (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_dim)`)):
        Features extracted from the image.
Returns:
    Objectness scores.
r  )detachr  )rF   r  r   s      r*   objectness_predictor,Owlv2ForObjectDetection.objectness_predictor_  s4     (..0 00@-f5  r,   c                    U R                  X5      n[        R                  " USS5      n[        R                  " US-   5      [        R                  " U* S-   5      -
  n[        R
                  " US5      nUS==   U-  ss'   US==   U-  ss'   [        R                  " US-   5      [        R                  " U* S-   5      -
  n[        R                  " XF/SS9nU$ )Nr  g      ?g-C6?r  r  r   r   )r  r'   cliploglog1p	full_liker   )rF   rV  rW  r   box_coord_biasbox_sizebox_size_biasrT  s           r*   rU  (Owlv2ForObjectDetection.compute_box_biasn  s    @@ASg**_c3? ?T#9:U[[/IY\`I`=aa ??>37--..		(T/2U[[(TAQ5RR 99n<"Er,   image_featsfeature_mapr   c                     U R                  U5      nU(       a!  UR                  u  pVpuU R                  Xg5      nOU R                  nUR	                  UR
                  5      nXH-  nU R                  U5      nU$ )a  
        Args:
            image_feats:
                Features extracted from the image, returned by the `image_text_embedder` method.
            feature_map:
                A spatial re-arrangement of image_features, also returned by the `image_text_embedder` method.
            interpolate_pos_encoding:
                Whether to interpolate the pre-trained position encodings.
        Returns:
            pred_boxes:
                List of predicted boxes (cxcywh normalized to 0, 1) nested within a dictionary.
        """
        # Bounding box detection head [batch_size, num_boxes, 4].
        pred_boxes = self.box_head(image_feats)

        # Compute the location of each token on the grid and use it to compute a bias for the bbox prediction
        if interpolate_pos_encoding:
            _, num_patches_height, num_patches_width, _ = feature_map.shape
            box_bias = self.compute_box_bias(num_patches_height, num_patches_width)
        else:
            box_bias = self.box_bias
        box_bias = box_bias.to(feature_map.device)
        pred_boxes += box_bias
        pred_boxes = self.sigmoid(pred_boxes)
        return pred_boxes

    def class_predictor(
        self,
        image_feats: torch.FloatTensor,
        query_embeds: torch.FloatTensor | None = None,
        query_mask: torch.Tensor | None = None,
    ) -> tuple[torch.FloatTensor]:
        """
        Args:
            image_feats:
                Features extracted from the `image_text_embedder`.
            query_embeds:
                Text query embeddings.
            query_mask:
                Must be provided with query_embeddings. A mask indicating which query embeddings are valid.
        """
        (pred_logits, image_class_embeds) = self.class_head(image_feats, query_embeds, query_mask)

        return (pred_logits, image_class_embeds)
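
    # Inside `Owlv2ClassPredictionHead`, each image/query similarity is remapped with a learned
    # per-patch shift and a strictly positive scale before use. A sketch of how the scale stays
    # positive (values are illustrative):
    #
    #     raw_scale = torch.tensor([-0.5, 0.0, 2.0])
    #     scale = nn.ELU()(raw_scale) + 1   # elu(x) + 1 > 0 for every x
    #     # pred_logits = (similarity + logit_shift) * scale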

    def image_text_embedder(
        self,
        input_ids: torch.Tensor,
        pixel_values: torch.FloatTensor,
        attention_mask: torch.Tensor,
        interpolate_pos_encoding: bool = False,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple[torch.FloatTensor]:
        # Encode text and image
        outputs = self.owlv2(
            pixel_values=pixel_values,
            input_ids=input_ids,
            attention_mask=attention_mask,
            interpolate_pos_encoding=interpolate_pos_encoding,
            **kwargs,
        )

        if interpolate_pos_encoding:
            _, _, height, width = pixel_values.shape
            num_patches_height = height // self.config.vision_config.patch_size
            num_patches_width = width // self.config.vision_config.patch_size
        else:
            num_patches_height = self.num_patches_height
            num_patches_width = self.num_patches_width

        # Get image embeddings
        last_hidden_state = outputs.vision_model_output.last_hidden_state
        image_embeds = self.owlv2.vision_model.post_layernorm(last_hidden_state)

        # Resize class token
        class_token_out = torch.broadcast_to(image_embeds[:, :1, :], image_embeds[:, :-1].shape)

        # Merge image embedding with class tokens
        image_embeds = image_embeds[:, 1:, :] * class_token_out
        image_embeds = self.layer_norm(image_embeds)

        # Resize to [batch_size, num_patches_height, num_patches_width, hidden_size]
        new_size = (
            image_embeds.shape[0],
            num_patches_height,
            num_patches_width,
            image_embeds.shape[-1],
        )
        image_embeds = image_embeds.reshape(new_size)
        text_embeds = outputs[-4]

        return (text_embeds, image_embeds, outputs)

    def image_embedder(
        self,
        pixel_values: torch.FloatTensor,
        interpolate_pos_encoding: bool = False,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple[torch.FloatTensor]:
        # Get Owlv2Model vision embeddings (same as CLIP)
        vision_outputs = self.owlv2.vision_model(
            pixel_values=pixel_values, interpolate_pos_encoding=interpolate_pos_encoding, **kwargs
        )

        if interpolate_pos_encoding:
            _, _, height, width = pixel_values.shape
            num_patches_height = height // self.config.vision_config.patch_size
            num_patches_width = width // self.config.vision_config.patch_size
        else:
            num_patches_height = self.num_patches_height
            num_patches_width = self.num_patches_width

        # Apply post_layernorm to last_hidden_state, return non-projected output
        last_hidden_state = vision_outputs.last_hidden_state
        image_embeds = self.owlv2.vision_model.post_layernorm(last_hidden_state)

        # Resize class token
        class_token_out = torch.broadcast_to(image_embeds[:, :1, :], image_embeds[:, :-1].shape)

        # Merge image embedding with class tokens
        image_embeds = image_embeds[:, 1:, :] * class_token_out
        image_embeds = self.layer_norm(image_embeds)

        # Resize to [batch_size, num_patches_height, num_patches_width, hidden_size]
        new_size = (
            image_embeds.shape[0],
            num_patches_height,
            num_patches_width,
            image_embeds.shape[-1],
        )
        image_embeds = image_embeds.reshape(new_size)

        return (image_embeds, vision_outputs)

    def embed_image_query(
        self,
        query_image_features: torch.FloatTensor,
        query_feature_map: torch.FloatTensor,
        interpolate_pos_encoding: bool = False,
    ) -> torch.FloatTensor:
        _, class_embeds = self.class_predictor(query_image_features)
        pred_boxes = self.box_predictor(query_image_features, query_feature_map, interpolate_pos_encoding)
        pred_boxes_as_corners = center_to_corners_format(pred_boxes)

        # Loop over query images
        best_class_embeds = []
        best_box_indices = []
        pred_boxes_device = pred_boxes_as_corners.device

        for i in range(query_image_features.shape[0]):
            each_query_box = torch.tensor([[0, 0, 1, 1]], device=pred_boxes_device)
            each_query_pred_boxes = pred_boxes_as_corners[i]
            ious, _ = box_iou(each_query_box, each_query_pred_boxes)

            # If there are no overlapping boxes, fall back to generalized IoU
            if torch.all(ious[0] == 0.0):
                ious = generalized_box_iou(each_query_box, each_query_pred_boxes)

            # Use an adaptive threshold to include all boxes within 80% of the best IoU
            iou_threshold = torch.max(ious) * 0.8

            selected_inds = (ious[0] >= iou_threshold).nonzero()
            if selected_inds.numel():
                selected_embeddings = class_embeds[i][selected_inds.squeeze(1)]
                mean_embeds = torch.mean(class_embeds[i], axis=0)
                mean_sim = torch.einsum("d,id->i", mean_embeds, selected_embeddings)
                best_box_ind = selected_inds[torch.argmin(mean_sim)]
                best_class_embeds.append(class_embeds[i][best_box_ind])
                best_box_indices.append(best_box_ind)

        if best_class_embeds:
            query_embeds = torch.stack(best_class_embeds)
            box_indices = torch.stack(best_box_indices)
        else:
            query_embeds, box_indices = None, None

        return query_embeds, box_indices, pred_boxes
 8 D 188+11!45A"\\<.ARSN$9$<!nDGD yyaC((*>Q "IIdOc1M!!W5>>@M""$$&2om6K6KA6N&O##jjqA <<	;@ST,U\\(-CD!(()FG ''5' 6*  ;;'89L++&67K(2%L+[*44r,   query_pixel_valuesc                    U R                  X#S9S   nU R                   " SUUS.UD6u  pgUR                  u  pp[        R                  " XhX-  U45      nUR                  u  pp[        R                  " XXX-  U45      nU R	                  XU5      u  pnU R                  XS9u  nnU R                  XU5      n[        UUUUUUSUS9$ )a,  
query_pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
    Pixel values of query image(s) to be detected. Pass in one query image per target image.

Examples:
```python
>>> import httpx
>>> from io import BytesIO
>>> from PIL import Image
>>> import torch
>>> from transformers import AutoProcessor, Owlv2ForObjectDetection

>>> processor = AutoProcessor.from_pretrained("google/owlv2-base-patch16-ensemble")
>>> model = Owlv2ForObjectDetection.from_pretrained("google/owlv2-base-patch16-ensemble")

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> with httpx.stream("GET", url) as response:
...     image = Image.open(BytesIO(response.read()))
>>> query_url = "http://images.cocodataset.org/val2017/000000001675.jpg"
>>> with httpx.stream("GET", query_url) as response:
...     query_image = Image.open(BytesIO(response.read()))
>>> inputs = processor(images=image, query_images=query_image, return_tensors="pt")

>>> # forward pass
>>> with torch.no_grad():
...     outputs = model.image_guided_detection(**inputs)

>>> target_sizes = torch.Tensor([image.size[::-1]])

>>> # Convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax)
>>> results = processor.post_process_image_guided_detection(
...     outputs=outputs, threshold=0.9, nms_threshold=0.3, target_sizes=target_sizes
... )
>>> i = 0  # Retrieve predictions for the first image
>>> boxes, scores = results[i]["boxes"], results[i]["scores"]
>>> for box, score in zip(boxes, scores):
...     box = [round(i, 2) for i in box.tolist()]
...     print(f"Detected similar object with confidence {round(score.item(), 3)} at location {box}")
Detected similar object with confidence 0.938 at location [327.31, 54.94, 547.39, 268.06]
Detected similar object with confidence 0.959 at location [5.78, 360.65, 619.12, 366.39]
Detected similar object with confidence 0.902 at location [2.85, 360.01, 627.63, 380.8]
Detected similar object with confidence 0.985 at location [176.98, -29.45, 672.69, 182.83]
Detected similar object with confidence 1.0 at location [6.53, 14.35, 624.87, 470.82]
Detected similar object with confidence 0.998 at location [579.98, 29.14, 615.49, 489.05]
Detected similar object with confidence 0.985 at location [206.15, 10.53, 247.74, 466.01]
Detected similar object with confidence 0.947 at location [18.62, 429.72, 646.5, 457.72]
Detected similar object with confidence 0.996 at location [523.88, 20.69, 586.84, 483.18]
Detected similar object with confidence 0.998 at location [3.39, 360.59, 617.29, 499.21]
Detected similar object with confidence 0.969 at location [4.47, 449.05, 614.5, 474.76]
Detected similar object with confidence 0.966 at location [31.44, 463.65, 654.66, 471.07]
Detected similar object with confidence 0.924 at location [30.93, 468.07, 635.35, 475.39]
```
        """
        # Compute feature maps for the input and query images
        query_feature_map = self.image_embedder(
            pixel_values=query_pixel_values, interpolate_pos_encoding=interpolate_pos_encoding
        )[0]
        feature_map, vision_outputs = self.image_embedder(
            pixel_values=pixel_values,
            interpolate_pos_encoding=interpolate_pos_encoding,
            **kwargs,
        )

        batch_size, num_patches_height, num_patches_width, hidden_dim = feature_map.shape
        image_feats = torch.reshape(feature_map, (batch_size, num_patches_height * num_patches_width, hidden_dim))

        batch_size, num_patches_height, num_patches_width, hidden_dim = query_feature_map.shape
        query_image_feats = torch.reshape(
            query_feature_map, (batch_size, num_patches_height * num_patches_width, hidden_dim)
        )

        # Get top class embedding and best box index for each query image in batch
        query_embeds, best_box_indices, query_pred_boxes = self.embed_image_query(
            query_image_feats, query_feature_map, interpolate_pos_encoding
        )

        # Predict object classes [batch_size, num_patches, num_queries+1]
        (pred_logits, class_embeds) = self.class_predictor(image_feats=image_feats, query_embeds=query_embeds)

        # Predict object boxes
        target_pred_boxes = self.box_predictor(image_feats, feature_map, interpolate_pos_encoding)

        return Owlv2ImageGuidedObjectDetectionOutput(
            image_embeds=feature_map,
            query_image_embeds=query_feature_map,
            target_pred_boxes=target_pred_boxes,
            query_pred_boxes=query_pred_boxes,
            logits=pred_logits,
            class_embeds=class_embeds,
            text_model_output=None,
            vision_model_output=vision_outputs,
        )

    @auto_docstring
    def forward(
        self,
        input_ids: torch.Tensor,
        pixel_values: torch.FloatTensor,
        attention_mask: torch.Tensor | None = None,
        interpolate_pos_encoding: bool = False,
        **kwargs: Unpack[TransformersKwargs],
    ) -> Owlv2ObjectDetectionOutput:
        r"""
input_ids (`torch.LongTensor` of shape `(batch_size * num_max_text_queries, sequence_length)`, *optional*):
    Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See
    [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input
    IDs?](../glossary#input-ids).

Examples:
```python
>>> import httpx
>>> from io import BytesIO
>>> from PIL import Image
>>> import torch

>>> from transformers import Owlv2Processor, Owlv2ForObjectDetection

>>> processor = Owlv2Processor.from_pretrained("google/owlv2-base-patch16-ensemble")
>>> model = Owlv2ForObjectDetection.from_pretrained("google/owlv2-base-patch16-ensemble")

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> with httpx.stream("GET", url) as response:
...     image = Image.open(BytesIO(response.read()))
>>> text_labels = [["a photo of a cat", "a photo of a dog"]]
>>> inputs = processor(text=text_labels, images=image, return_tensors="pt")
>>> outputs = model(**inputs)

>>> # Target image sizes (height, width) to rescale box predictions [batch_size, 2]
>>> target_sizes = torch.tensor([(image.height, image.width)])
>>> # Convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax)
>>> results = processor.post_process_grounded_object_detection(
...     outputs=outputs, target_sizes=target_sizes, threshold=0.1, text_labels=text_labels
... )
>>> # Retrieve predictions for the first image for the corresponding text queries
>>> result = results[0]
>>> boxes, scores, text_labels = result["boxes"], result["scores"], result["text_labels"]
>>> for box, score, text_label in zip(boxes, scores, text_labels):
...     box = [round(i, 2) for i in box.tolist()]
...     print(f"Detected {text_label} with confidence {round(score.item(), 3)} at location {box}")
Detected a photo of a cat with confidence 0.614 at location [341.67, 23.39, 642.32, 371.35]
Detected a photo of a cat with confidence 0.665 at location [6.75, 51.96, 326.62, 473.13]
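>>> # The text_labels passed to post-processing are echoed back per detection,
>>> # mapping each predicted query index to its query string.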
```
        """
        # Embed images and text queries
        query_embeds, feature_map, outputs = self.image_text_embedder(
            input_ids=input_ids,
            pixel_values=pixel_values,
            attention_mask=attention_mask,
            interpolate_pos_encoding=interpolate_pos_encoding,
            **kwargs,
        )

        # Text and vision model outputs
        text_outputs = outputs.text_model_output
        vision_outputs = outputs.vision_model_output

        batch_size, num_patches_height, num_patches_width, hidden_dim = feature_map.shape
        image_feats = torch.reshape(feature_map, (batch_size, num_patches_height * num_patches_width, hidden_dim))

        # Reshape from [batch_size * max_text_queries, hidden_dim] -> [batch_size, max_text_queries, hidden_dim]
        max_text_queries = input_ids.shape[0] // batch_size
        query_embeds = query_embeds.reshape(batch_size, max_text_queries, query_embeds.shape[-1])

        # If first token is 0, then this is a padded query [batch_size, num_queries].
        input_ids = input_ids.reshape(batch_size, max_text_queries, input_ids.shape[-1])
        query_mask = input_ids[..., 0] > 0

        # Predict object classes [batch_size, num_patches, num_queries+1]
        (pred_logits, class_embeds) = self.class_predictor(image_feats, query_embeds, query_mask)

        # Predict objectness
        objectness_logits = self.objectness_predictor(image_feats)

        # Predict object boxes
        pred_boxes = self.box_predictor(image_feats, feature_map, interpolate_pos_encoding)

        return Owlv2ObjectDetectionOutput(
            image_embeds=feature_map,
            text_embeds=query_embeds,
            pred_boxes=pred_boxes,
            logits=pred_logits,
            objectness_logits=objectness_logits,
            class_embeds=class_embeds,
            text_model_output=text_outputs,
            vision_model_output=vision_outputs,
        )


__all__ = ["Owlv2Model", "Owlv2PreTrainedModel", "Owlv2TextModel", "Owlv2VisionModel", "Owlv2ForObjectDetection"]