
    Z j                        S SK r S SKrS SKJr  S SKJr  S SKJr  S SKrS SKJ	r	  SSK
Jr  SSKJr  SS	KJr  SS
KJr  SSKJrJr  SSKJrJr  SSKJr  SSKJrJrJrJr  SSK J!r!J"r"  SSK#J$r$  SSK%J&r&J'r'J(r(  \\ " S S\5      5       5       r)\\ " S S\5      5       5       r*\\ " S S\5      5       5       r+ " S S\	RX                  5      r- " S S\	RX                  5      r. SHS\	RX                  S\R^                  S\R^                  S \R^                  S!\R^                  S-  S"\0S#\0S$\\   4S% jjr1 " S& S'\	RX                  5      r2 " S( S)\	RX                  5      r3 " S* S+\5      r4 " S, S-\5      r5\ " S. S/\5      5       r6 " S0 S1\	RX                  5      r7 " S2 S3\65      r8\" S4S59 " S6 S7\65      5       r9\" S8S59 " S9 S:\65      5       r:S;\R^                  S<\R^                  4S= jr;S>\R^                  S<\R^                  4S? jr<S@\R^                  S<\R^                  4SA jr=\ " SB SC\65      5       r>\" SDS59 " SE SF\65      5       r?/ SGQr@g)I    N)Callable)	dataclass)Any)nn   )initialization)ACT2FN)create_causal_mask)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPooling)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)ModelOutputTransformersKwargsauto_docstring	torch_int)can_return_tuplemerge_with_config_defaults)capture_outputs   )CLIPSegConfigCLIPSegTextConfigCLIPSegVisionConfigc                      \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Sr
\R                  S-  \S'   Sr\R                  S-  \S'   Sr\R                  S-  \S'   Sr\\S	'   Sr\\S
'   S\\   4S jrSrg)CLIPSegOutput+   aq  
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
    Contrastive loss for image-text similarity.
logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
    The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
    similarity scores.
logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
    The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
    similarity scores.
text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
    The text embeddings obtained by applying the projection layer to the pooled output of [`CLIPSegTextModel`].
image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
    The image embeddings obtained by applying the projection layer to the pooled output of [`CLIPSegVisionModel`].
text_model_output (`BaseModelOutputWithPooling`):
    The output of the [`CLIPSegTextModel`].
vision_model_output (`BaseModelOutputWithPooling`):
    The output of the [`CLIPSegVisionModel`].
    """

    loss: torch.FloatTensor | None = None
    logits_per_image: torch.FloatTensor | None = None
    logits_per_text: torch.FloatTensor | None = None
    text_embeds: torch.FloatTensor | None = None
    image_embeds: torch.FloatTensor | None = None
    text_model_output: BaseModelOutputWithPooling | None = None
    vision_model_output: BaseModelOutputWithPooling | None = None

    def to_tuple(self) -> tuple[Any]:
        return tuple(v.to_tuple() if isinstance(v, ModelOutput) else v for v in self.values())


@dataclass
@auto_docstring
class CLIPSegDecoderOutput(ModelOutput):
    r"""
logits (`torch.FloatTensor` of shape `(batch_size, height, width)`):
    Classification scores for each pixel.
hidden_states (`tuple(torch.FloatTensor)`, *optional*):
    Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
    Returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`.
attentions (`tuple(torch.FloatTensor)`, *optional*):
    Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
    heads. Returned when `output_attentions=True` is passed or when `config.output_attentions=True`.
    """

    logits: torch.FloatTensor | None = None
    hidden_states: tuple[torch.FloatTensor, ...] | None = None
    attentions: tuple[torch.FloatTensor, ...] | None = None


@dataclass
@auto_docstring
class CLIPSegImageSegmentationOutput(ModelOutput):
    r"""
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
    Binary cross entropy loss for segmentation.
logits (`torch.FloatTensor` of shape `(batch_size, height, width)`):
    Classification scores for each pixel.
conditional_embeddings (`torch.FloatTensor` of shape `(batch_size, projection_dim)`):
    Conditional embeddings used for segmentation.
pooled_output (`torch.FloatTensor` of shape `(batch_size, embed_dim)`):
    Pooled output of the [`CLIPSegVisionModel`].
vision_model_output (`BaseModelOutputWithPooling`):
    The output of the [`CLIPSegVisionModel`].
decoder_output (`CLIPSegDecoderOutput`):
    The output of the [`CLIPSegDecoder`].
    """

    loss: torch.FloatTensor | None = None
    logits: torch.FloatTensor | None = None
    conditional_embeddings: torch.FloatTensor | None = None
    pooled_output: torch.FloatTensor | None = None
    vision_model_output: BaseModelOutputWithPooling | None = None
    decoder_output: CLIPSegDecoderOutput | None = None

    def to_tuple(self) -> tuple[Any]:
        return tuple(v.to_tuple() if isinstance(v, ModelOutput) else v for v in self.values())


class CLIPSegVisionEmbeddings(nn.Module):
    def __init__(self, config: CLIPSegVisionConfig):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.image_size = config.image_size
        self.patch_size = config.patch_size

        self.class_embedding = nn.Parameter(torch.randn(self.embed_dim))

        self.patch_embedding = nn.Conv2d(
            in_channels=config.num_channels,
            out_channels=self.embed_dim,
            kernel_size=self.patch_size,
            stride=self.patch_size,
            bias=False,
        )

        self.num_patches = (self.image_size // self.patch_size) ** 2
        self.num_positions = self.num_patches + 1
        self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
        self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)

    def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
        """
This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
images. This method is also adapted to support torch.jit tracing.

Adapted from:
- https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
- https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        """
        num_patches = embeddings.shape[1] - 1
        position_embedding = self.position_embedding.weight.unsqueeze(0)
        num_positions = position_embedding.shape[1] - 1

        # always interpolate when tracing to ensure the exported model works for dynamic input shapes
        if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
            return self.position_embedding(self.position_ids)

        class_pos_embed = position_embedding[:, :1]
        patch_pos_embed = position_embedding[:, 1:]

        dim = embeddings.shape[-1]

        new_height = height // self.patch_size
        new_width = width // self.patch_size

        sqrt_num_positions = torch_int(num_positions**0.5)
        patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)

        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed,
            size=(new_height, new_width),
            mode="bicubic",
            align_corners=False,
        )

        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)

        return torch.cat((class_pos_embed, patch_pos_embed), dim=1)

    def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding: bool = True) -> torch.Tensor:
        batch_size, _, height, width = pixel_values.shape
        if not interpolate_pos_encoding and (height != self.image_size or width != self.image_size):
            raise ValueError(
                f"Input image size ({height}*{width}) doesn't match model ({self.image_size}*{self.image_size})."
            )
        target_dtype = self.patch_embedding.weight.dtype
        patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))  # shape = [*, width, grid, grid]
        patch_embeds = patch_embeds.flatten(2).transpose(1, 2)

        class_embeds = self.class_embedding.expand(batch_size, 1, -1)
        embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
        if interpolate_pos_encoding:
            embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
        else:
            embeddings = embeddings + self.position_embedding(self.position_ids)
        return embeddings


class CLIPSegTextEmbeddings(nn.Module):
    def __init__(self, config: CLIPSegTextConfig):
        super().__init__()
        embed_dim = config.hidden_size

        self.token_embedding = nn.Embedding(config.vocab_size, embed_dim)
        self.position_embedding = nn.Embedding(config.max_position_embeddings, embed_dim)

        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )

    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        position_ids: torch.LongTensor | None = None,
        inputs_embeds: torch.FloatTensor | None = None,
    ) -> torch.Tensor:
        seq_length = input_ids.shape[-1] if input_ids is not None else inputs_embeds.shape[-2]
        max_position_embedding = self.position_embedding.weight.shape[0]

        if seq_length > max_position_embedding:
            raise ValueError(
                f"Sequence length must be less than max_position_embeddings (got `sequence length`: "
                f"{seq_length} and max_position_embeddings: {max_position_embedding}"
            )

        if position_ids is None:
            position_ids = self.position_ids[:, :seq_length]

        if inputs_embeds is None:
            inputs_embeds = self.token_embedding(input_ids)

        position_embeddings = self.position_embedding(position_ids)
        embeddings = inputs_embeds + position_embeddings

        return embeddings


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: torch.Tensor | None,
    scaling: float,
    dropout: float = 0.0,
    **kwargs,
):
    attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling
    if attention_mask is not None:
        attn_weights = attn_weights + attention_mask

    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)

    attn_output = torch.matmul(attn_weights, value)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights


class CLIPSegAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config: CLIPSegConfig | CLIPSegVisionConfig):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads
        self.scale = self.head_dim**-0.5
        self.dropout = config.attention_dropout
        self.is_causal = False

        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple[torch.Tensor, torch.Tensor | None]:
        """Input shape: Batch x Time x Channel"""
        input_shape = hidden_states.shape[:-1]
        hidden_shape = (*input_shape, -1, self.head_dim)

        queries = self.q_proj(hidden_states)
        keys = self.k_proj(hidden_states)
        values = self.v_proj(hidden_states)

        queries = queries.view(hidden_shape).transpose(1, 2)
        keys = keys.view(hidden_shape).transpose(1, 2)
        values = values.view(hidden_shape).transpose(1, 2)

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            queries,
            keys,
            values,
            attention_mask,
            scaling=self.scale,
            dropout=0.0 if not self.training else self.dropout,
            **kwargs,
        )

        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
        attn_output = self.out_proj(attn_output)
        return attn_output, attn_weights


class CLIPSegMLP(nn.Module):
    def __init__(self, config: CLIPSegConfig | CLIPSegVisionConfig):
        super().__init__()
        self.config = config
        self.activation_fn = ACT2FN[config.hidden_act]
        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.fc1(hidden_states)
        hidden_states = self.activation_fn(hidden_states)
        hidden_states = self.fc2(hidden_states)
        return hidden_states


class CLIPSegEncoderLayer(GradientCheckpointingLayer):
    def __init__(self, config: CLIPSegConfig | CLIPSegVisionConfig):
        super().__init__()
        self.embed_dim = config.hidden_size
        self.self_attn = CLIPSegAttention(config)
        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
        self.mlp = CLIPSegMLP(config)
        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
        **kwargs: Unpack[TransformersKwargs],
    ) -> torch.FloatTensor:
        residual = hidden_states

        hidden_states = self.layer_norm1(hidden_states)
        hidden_states, _ = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            **kwargs,
        )
        hidden_states = residual + hidden_states

        residual = hidden_states
        hidden_states = self.layer_norm2(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states

        return hidden_states


class CLIPSegDecoderLayer(GradientCheckpointingLayer):
    """
CLIPSeg decoder layer, which is identical to `CLIPSegEncoderLayer`, except that normalization is applied after
    self-attention/MLP, rather than before.
    """

    def __init__(self, config: CLIPSegConfig | CLIPSegVisionConfig):
        super().__init__()
        self.embed_dim = config.hidden_size
        self.self_attn = CLIPSegAttention(config)
        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
        self.mlp = CLIPSegMLP(config)
        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
        **kwargs: Unpack[TransformersKwargs],
    ) -> torch.FloatTensor:
        residual = hidden_states

        hidden_states, _ = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            **kwargs,
        )
        hidden_states = residual + hidden_states
        hidden_states = self.layer_norm1(hidden_states)

        residual = hidden_states
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states
        hidden_states = self.layer_norm2(hidden_states)

        return hidden_states


@auto_docstring
class CLIPSegPreTrainedModel(PreTrainedModel):
    config: CLIPSegConfig
    base_model_prefix = "clipseg"
    input_modalities = ("image", "text")
    _no_split_modules = ["CLIPSegTextEmbeddings", "CLIPSegEncoderLayer", "CLIPSegVisionEmbeddings"]
    supports_gradient_checkpointing = True
    _supports_sdpa = True
    _supports_flash_attn = True
    _supports_flex_attn = True
    _supports_attention_backend = True

    _can_record_outputs = {
        "hidden_states": CLIPSegEncoderLayer,
        "attentions": CLIPSegAttention,
    }

    @torch.no_grad()
    def _init_weights(self, module):
        """Initialize the weights"""
        factor = self.config.initializer_factor
        if isinstance(module, CLIPSegTextEmbeddings):
            nn.init.normal_(module.token_embedding.weight, mean=0.0, std=factor * 0.02)
            nn.init.normal_(module.position_embedding.weight, mean=0.0, std=factor * 0.02)
            module.position_ids.copy_(torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
        elif isinstance(module, CLIPSegVisionEmbeddings):
            nn.init.normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor)
            nn.init.normal_(module.patch_embedding.weight, std=module.config.initializer_range * factor)
            nn.init.normal_(module.position_embedding.weight, std=module.config.initializer_range * factor)
            module.position_ids.copy_(torch.arange(module.num_positions).expand((1, -1)))
        elif isinstance(module, CLIPSegAttention):
            in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
            out_proj_std = (module.embed_dim**-0.5) * factor
            nn.init.normal_(module.q_proj.weight, std=in_proj_std)
            nn.init.normal_(module.k_proj.weight, std=in_proj_std)
            nn.init.normal_(module.v_proj.weight, std=in_proj_std)
            nn.init.normal_(module.out_proj.weight, std=out_proj_std)
        elif isinstance(module, CLIPSegMLP):
            in_proj_std = (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
            fc_std = (2 * module.config.hidden_size) ** -0.5 * factor
            nn.init.normal_(module.fc1.weight, std=fc_std)
            nn.init.normal_(module.fc2.weight, std=in_proj_std)
        elif isinstance(module, CLIPSegModel):
            nn.init.normal_(module.text_projection.weight, std=module.text_embed_dim**-0.5 * factor)
            nn.init.normal_(module.visual_projection.weight, std=module.vision_embed_dim**-0.5 * factor)
        if isinstance(module, nn.LayerNorm):
            nn.init.zeros_(module.bias)
            nn.init.ones_(module.weight)
        if isinstance(module, nn.Linear) and module.bias is not None:
            nn.init.zeros_(module.bias)


class CLIPSegEncoder(nn.Module):
    """
Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`CLIPSegEncoderLayer`].

    Args:
        config: CLIPSegConfig
    """

    def __init__(self, config: CLIPSegConfig):
        super().__init__()
        self.config = config
        self.layers = nn.ModuleList([CLIPSegEncoderLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    def forward(
        self,
        inputs_embeds: torch.Tensor,
        attention_mask: torch.Tensor | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> BaseModelOutput:
        hidden_states = inputs_embeds
        for encoder_layer in self.layers:
            hidden_states = encoder_layer(
                hidden_states,
                attention_mask,
                **kwargs,
            )

        return BaseModelOutput(last_hidden_state=hidden_states)


class CLIPSegDecoder(CLIPSegPreTrainedModel):
    def __init__(self, config: CLIPSegConfig):
        super().__init__(config)

        self.conditional_layer = config.conditional_layer

        self.film_mul = nn.Linear(config.projection_dim, config.reduce_dim)
        self.film_add = nn.Linear(config.projection_dim, config.reduce_dim)

        if config.use_complex_transposed_convolution:
            transposed_kernels = (config.vision_config.patch_size // 4, config.vision_config.patch_size // 4)

            self.transposed_convolution = nn.Sequential(
                nn.Conv2d(config.reduce_dim, config.reduce_dim, kernel_size=3, padding=1),
                nn.ReLU(),
                nn.ConvTranspose2d(
                    config.reduce_dim,
                    config.reduce_dim // 2,
                    kernel_size=transposed_kernels[0],
                    stride=transposed_kernels[0],
                ),
                nn.ReLU(),
                nn.ConvTranspose2d(
                    config.reduce_dim // 2, 1, kernel_size=transposed_kernels[1], stride=transposed_kernels[1]
                ),
            )
        else:
            self.transposed_convolution = nn.ConvTranspose2d(
                config.reduce_dim, 1, config.vision_config.patch_size, stride=config.vision_config.patch_size
            )

        depth = len(config.extract_layers)
        self.reduces = nn.ModuleList(
            [nn.Linear(config.vision_config.hidden_size, config.reduce_dim) for _ in range(depth)]
        )

        decoder_config = copy.deepcopy(config.vision_config)
        decoder_config.hidden_size = config.reduce_dim
        decoder_config.num_attention_heads = config.decoder_num_attention_heads
        decoder_config.intermediate_size = config.decoder_intermediate_size
        decoder_config.hidden_act = "relu"
        self.layers = nn.ModuleList([CLIPSegDecoderLayer(decoder_config) for _ in range(len(config.extract_layers))])

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        hidden_states: tuple[torch.Tensor],
        conditional_embeddings: torch.Tensor,
        **kwargs: Unpack[TransformersKwargs],
    ) -> CLIPSegDecoderOutput:
        r"""
conditional_embeddings (`torch.FloatTensor` of shape `(batch_size, config.projection_dim)`, *optional*):
    The conditional embeddings for the query images. If provided, the model will use this instead of computing
    the embeddings from the conditional_pixel_values.
        """
        activations = hidden_states[::-1]

        output = None
        for i, (activation, layer, reduce) in enumerate(zip(activations, self.layers, self.reduces)):
            if output is not None:
                output = reduce(activation) + output
            else:
                output = reduce(activation)

            if i == self.conditional_layer:
                output = self.film_mul(conditional_embeddings) * output.permute(1, 0, 2) + self.film_add(
                    conditional_embeddings
                )
                output = output.permute(1, 0, 2)

            output = layer(output, attention_mask=None, **kwargs)

        output = output[:, 1:, :].permute(0, 2, 1)  # remove CLS token and reshape to [batch_size, reduce_dim, seq_len]

        size = int(math.sqrt(output.shape[2]))
        batch_size = conditional_embeddings.shape[0]
        output = output.view(batch_size, output.shape[1], size, size)

        logits = self.transposed_convolution(output).squeeze(1)

        return CLIPSegDecoderOutput(logits=logits)
@auto_docstring(
    custom_intro="""
    The text model from CLIPSEG without any head or projection on top.
    """
)
class CLIPSegTextModel(CLIPSegPreTrainedModel):
    config: CLIPSegTextConfig

    input_modalities = "text"
    _input_embed_layer = "token_embedding"

    def __init__(self, config: CLIPSegTextConfig):
        super().__init__(config)
        embed_dim = config.hidden_size
        self.embeddings = CLIPSegTextEmbeddings(config)
        self.encoder = CLIPSegEncoder(config)
        self.final_layer_norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)

        self.eos_token_id = config.eos_token_id

        # Initialize weights and apply final processing
        self.post_init()

    @merge_with_config_defaults
    @capture_outputs
    @auto_docstring
    def forward(
        self,
        input_ids: torch.Tensor | None = None,
        attention_mask: torch.Tensor | None = None,
        position_ids: torch.Tensor | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> BaseModelOutputWithPooling:
        r"""
        Examples:

        ```python
        >>> from transformers import AutoTokenizer, CLIPSegTextModel

        >>> tokenizer = AutoTokenizer.from_pretrained("CIDAS/clipseg-rd64-refined")
        >>> model = CLIPSegTextModel.from_pretrained("CIDAS/clipseg-rd64-refined")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled (EOS token) states
        ```"""
        if input_ids is None:
            raise ValueError("You have to specify input_ids")

        input_shape = input_ids.size()
        input_ids = input_ids.view(-1, input_shape[-1])

        hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids)

        attention_mask = create_causal_mask(
            config=self.config,
            input_embeds=hidden_states,
            attention_mask=attention_mask,
            cache_position=torch.arange(hidden_states.shape[1], device=hidden_states.device),
            past_key_values=None,
        )

        encoder_outputs: BaseModelOutput = self.encoder(
            inputs_embeds=hidden_states,
            attention_mask=attention_mask,
            **kwargs,
        )

        last_hidden_state = encoder_outputs.last_hidden_state
        last_hidden_state = self.final_layer_norm(last_hidden_state)

        if self.eos_token_id == 2:
            # The `eos_token_id` was wrong in older configs; keep the historical behavior here. Such a
            # config cannot work correctly with extra new tokens added to the vocabulary.
            pooled_output = last_hidden_state[
                torch.arange(last_hidden_state.shape[0], device=last_hidden_state.device),
                input_ids.to(dtype=torch.int, device=last_hidden_state.device).argmax(dim=-1),
            ]
        else:
            # Updated configs carry the correct `eos_token_id`, so extra new tokens are possible.
            pooled_output = last_hidden_state[
                torch.arange(last_hidden_state.shape[0], device=last_hidden_state.device),
                (input_ids.to(dtype=torch.int, device=last_hidden_state.device) == self.eos_token_id)
                .int()
                .argmax(dim=-1),
            ]

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
        )
@auto_docstring(
    custom_intro="""
    The vision model from CLIPSEG without any head or projection on top.
    """
)
class CLIPSegVisionModel(CLIPSegPreTrainedModel):
    config: CLIPSegVisionConfig

    main_input_name = "pixel_values"
    input_modalities = "image"

    def __init__(self, config: CLIPSegVisionConfig):
        super().__init__(config)
        embed_dim = config.hidden_size
        self.embeddings = CLIPSegVisionEmbeddings(config)
        self.pre_layrnorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
        self.encoder = CLIPSegEncoder(config)
        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)

        # Initialize weights and apply final processing
        self.post_init()

    @merge_with_config_defaults
    @capture_outputs
    @auto_docstring
    def forward(
        self,
        pixel_values: torch.FloatTensor | None = None,
        interpolate_pos_encoding: bool = True,
        **kwargs: Unpack[TransformersKwargs],
    ) -> BaseModelOutputWithPooling:
        r"""
        Examples:

        ```python
        >>> import httpx
        >>> from io import BytesIO
        >>> from PIL import Image
        >>> from transformers import AutoProcessor, CLIPSegVisionModel

        >>> processor = AutoProcessor.from_pretrained("CIDAS/clipseg-rd64-refined")
        >>> model = CLIPSegVisionModel.from_pretrained("CIDAS/clipseg-rd64-refined")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled CLS states
        ```"""
        hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding)
        hidden_states = self.pre_layrnorm(hidden_states)

        encoder_outputs: BaseModelOutput = self.encoder(
            inputs_embeds=hidden_states,
            **kwargs,
        )

        last_hidden_state = encoder_outputs.last_hidden_state
        pooled_output = last_hidden_state[:, 0, :]
        pooled_output = self.post_layernorm(pooled_output)

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
        )


def contrastive_loss(logits: torch.Tensor) -> torch.Tensor:
    return nn.functional.cross_entropy(logits, torch.arange(len(logits), device=logits.device))


def clipseg_loss(similarity: torch.Tensor) -> torch.Tensor:
    caption_loss = contrastive_loss(similarity)
    image_loss = contrastive_loss(similarity.t())
    return (caption_loss + image_loss) / 2.0
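# `clipseg_loss` is the symmetric contrastive (InfoNCE-style) objective: cross-entropy over rows
# (each text against all images) averaged with cross-entropy over columns (each image against all
# texts), with the matching pairs on the diagonal as targets. A tiny sanity sketch — illustrative
# only, not called by the model:
def _contrastive_loss_sketch() -> torch.Tensor:
    # a well-aligned similarity matrix (large diagonal) should yield a loss close to zero
    similarity = torch.tensor([[10.0, 0.0], [0.0, 10.0]])
    return clipseg_loss(similarity)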
def _get_vector_norm(tensor: torch.Tensor) -> torch.Tensor:
    """
    This method is equivalent to tensor.norm(p=2, dim=-1, keepdim=True) and used to make
    model `executorch` exportable. See issue https://github.com/pytorch/executorch/issues/3566
    """
    square_tensor = torch.pow(tensor, 2)
    sum_tensor = torch.sum(square_tensor, dim=-1, keepdim=True)
    normed_tensor = torch.pow(sum_tensor, 0.5)
    return normed_tensor


@auto_docstring
class CLIPSegModel(CLIPSegPreTrainedModel):
    config: CLIPSegConfig

    def __init__(self, config: CLIPSegConfig):
        super().__init__(config)

        text_config = config.text_config
        vision_config = config.vision_config

        self.projection_dim = config.projection_dim
        self.text_embed_dim = text_config.hidden_size
        self.vision_embed_dim = vision_config.hidden_size

        self.text_model = CLIPSegTextModel._from_config(text_config)
        self.vision_model = CLIPSegVisionModel._from_config(vision_config)

        self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False)
        self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False)
        self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value))

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def get_text_features(
        self,
        input_ids: torch.Tensor | None = None,
        attention_mask: torch.Tensor | None = None,
        position_ids: torch.Tensor | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> torch.FloatTensor:
        r"""
        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, CLIPSegModel

        >>> tokenizer = AutoTokenizer.from_pretrained("CIDAS/clipseg-rd64-refined")
        >>> model = CLIPSegModel.from_pretrained("CIDAS/clipseg-rd64-refined")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
        >>> with torch.inference_mode():
        ...     text_features = model.get_text_features(**inputs)
        ```"""
        text_outputs: BaseModelOutputWithPooling = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            **kwargs,
        )
        pooled_output = text_outputs.pooler_output
        text_features = self.text_projection(pooled_output)

        return text_features

    @auto_docstring
    def get_image_features(
        self,
        pixel_values: torch.FloatTensor,
        interpolate_pos_encoding: bool = True,
        **kwargs: Unpack[TransformersKwargs],
    ) -> torch.FloatTensor:
        r"""
        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, CLIPSegModel
        >>> from transformers.image_utils import load_image

        >>> processor = AutoProcessor.from_pretrained("CIDAS/clipseg-rd64-refined")
        >>> model = CLIPSegModel.from_pretrained("CIDAS/clipseg-rd64-refined")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = load_image(url)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> with torch.inference_mode():
        ...     image_features = model.get_image_features(**inputs)
        ```"""
        vision_outputs: BaseModelOutputWithPooling = self.vision_model(
            pixel_values=pixel_values,
            interpolate_pos_encoding=interpolate_pos_encoding,
            **kwargs,
        )
        pooled_output = vision_outputs.pooler_output
        image_features = self.visual_projection(pooled_output)

        return image_features

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        pixel_values: torch.FloatTensor | None = None,
        attention_mask: torch.Tensor | None = None,
        position_ids: torch.LongTensor | None = None,
        return_loss: bool | None = None,
        interpolate_pos_encoding: bool = True,
        **kwargs: Unpack[TransformersKwargs],
    ) -> CLIPSegOutput:
        r"""
        return_loss (`bool`, *optional*):
            Whether or not to return the contrastive loss.

        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, CLIPSegModel
        >>> from transformers.image_utils import load_image

        >>> processor = AutoProcessor.from_pretrained("CIDAS/clipseg-rd64-refined")
        >>> model = CLIPSegModel.from_pretrained("CIDAS/clipseg-rd64-refined")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = load_image(url)

        >>> inputs = processor(
        ...     text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True
        ... )

        >>> with torch.inference_mode():
        ...     outputs = model(**inputs)
        >>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
        >>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
        ```"""
        vision_outputs: BaseModelOutputWithPooling = self.vision_model(
            pixel_values=pixel_values,
            interpolate_pos_encoding=interpolate_pos_encoding,
            **kwargs,
        )
        text_outputs: BaseModelOutputWithPooling = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            **kwargs,
        )

        image_embeds = vision_outputs.pooler_output
        image_embeds = self.visual_projection(image_embeds)
        text_embeds = text_outputs.pooler_output
        text_embeds = self.text_projection(text_embeds)

        # normalized features
        image_embeds = image_embeds / _get_vector_norm(image_embeds)
        text_embeds = text_embeds / _get_vector_norm(text_embeds)

        # cosine similarity as logits
        logits_per_text = torch.matmul(text_embeds, image_embeds.t().to(text_embeds.device))
        logits_per_text = logits_per_text * self.logit_scale.exp().to(text_embeds.device)
        logits_per_image = logits_per_text.t()

        loss = None
        if return_loss:
            loss = clipseg_loss(logits_per_text)

        return CLIPSegOutput(
            loss=loss,
            logits_per_image=logits_per_image,
            logits_per_text=logits_per_text,
            text_embeds=text_embeds,
            image_embeds=image_embeds,
            text_model_output=text_outputs,
            vision_model_output=vision_outputs,
        )
@auto_docstring(
    custom_intro="""
    CLIPSeg model with a Transformer-based decoder on top for zero-shot and one-shot image segmentation.
    """
)
class CLIPSegForImageSegmentation(CLIPSegPreTrainedModel):
    config: CLIPSegConfig

    def __init__(self, config: CLIPSegConfig):
        super().__init__(config)

        self.config = config

        self.clip = CLIPSegModel(config)
        self.extract_layers = config.extract_layers

        self.decoder = CLIPSegDecoder(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_conditional_embeddings(
        self,
        batch_size: int | None = None,
        input_ids: torch.Tensor | None = None,
        attention_mask: torch.Tensor | None = None,
        position_ids: torch.Tensor | None = None,
        conditional_pixel_values: torch.Tensor | None = None,
    ) -> torch.FloatTensor:
        if input_ids is not None:
            # compute conditional embeddings from texts
            if len(input_ids) != batch_size:
                raise ValueError("Make sure to pass as many prompt texts as there are query images")
            with torch.no_grad():
                conditional_embeddings = self.clip.get_text_features(
                    input_ids, attention_mask=attention_mask, position_ids=position_ids
                )
        elif conditional_pixel_values is not None:
            # compute conditional embeddings from images
            if len(conditional_pixel_values) != batch_size:
                raise ValueError("Make sure to pass as many prompt images as there are query images")
            with torch.no_grad():
                conditional_embeddings = self.clip.get_image_features(conditional_pixel_values)
        else:
            raise ValueError(
                "Invalid conditional, should be either provided as `input_ids` or `conditional_pixel_values`"
            )

        return conditional_embeddings

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: torch.FloatTensor | None = None,
        pixel_values: torch.FloatTensor | None = None,
        conditional_pixel_values: torch.FloatTensor | None = None,
        conditional_embeddings: torch.FloatTensor | None = None,
        attention_mask: torch.Tensor | None = None,
        position_ids: torch.LongTensor | None = None,
        labels: torch.LongTensor | None = None,
        interpolate_pos_encoding: bool = True,
        **kwargs: Unpack[TransformersKwargs],
    ) -> CLIPSegImageSegmentationOutput:
        r"""
        conditional_pixel_values (`torch.FloatTensor`, *optional*):
            The pixel values of the conditional images.
        conditional_embeddings (`torch.FloatTensor` of shape `(batch_size, config.projection_dim)`, *optional*):
            The conditional embeddings for the query images. If provided, the model will use this instead of computing
            the embeddings from the conditional_pixel_values.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), if
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, CLIPSegForImageSegmentation
        >>> from transformers.image_utils import load_image

        >>> processor = AutoProcessor.from_pretrained("CIDAS/clipseg-rd64-refined")
        >>> model = CLIPSegForImageSegmentation.from_pretrained("CIDAS/clipseg-rd64-refined")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = load_image(url)

        >>> texts = ["a cat", "a remote", "a blanket"]
        >>> inputs = processor(text=texts, images=[image] * len(texts), padding=True, return_tensors="pt")

        >>> with torch.inference_mode():
        ...     outputs = model(**inputs)

        >>> logits = outputs.logits
        >>> print(logits.shape)
        torch.Size([3, 352, 352])
        ```"""
        # step 1: forward the query images through the frozen CLIP vision encoder
        with torch.no_grad():
            kwargs["output_hidden_states"] = True
            vision_outputs: BaseModelOutputWithPooling = self.clip.vision_model(
                pixel_values=pixel_values,
                interpolate_pos_encoding=interpolate_pos_encoding,
                **kwargs,
            )
            pooled_output = vision_outputs.pooler_output
            hidden_states = vision_outputs.hidden_states
            # we add +1 here as the hidden states also include the initial embeddings
            activations = [hidden_states[i + 1] for i in self.extract_layers]

            vision_outputs = BaseModelOutputWithPooling(
                last_hidden_state=vision_outputs.last_hidden_state,
                pooler_output=vision_outputs.pooler_output,
                hidden_states=vision_outputs.hidden_states,
                attentions=vision_outputs.attentions,
            )

        # step 2: compute conditional embeddings, either from text, images or a user-provided embedding
        if conditional_embeddings is None:
            conditional_embeddings = self.get_conditional_embeddings(
                batch_size=pixel_values.shape[0],
                input_ids=input_ids,
                attention_mask=attention_mask,
                position_ids=position_ids,
                conditional_pixel_values=conditional_pixel_values,
            )
        else:
            if conditional_embeddings.shape[0] != pixel_values.shape[0]:
                raise ValueError(
                    "Make sure to pass as many conditional embeddings as there are query images in the batch"
                )
            if conditional_embeddings.shape[1] != self.config.projection_dim:
                raise ValueError(
                    "Make sure that the feature dimension of the conditional embeddings matches"
                    " `config.projection_dim`."
                )

        # step 3: forward the activations through the lightweight decoder to predict masks
        decoder_outputs: CLIPSegDecoderOutput = self.decoder(activations, conditional_embeddings, **kwargs)
        logits = decoder_outputs.logits

        loss = None
        if labels is not None:
            # move labels to the correct device to enable PP
            labels = labels.to(logits.device)
            loss_fn = nn.BCEWithLogitsLoss()
            loss = loss_fn(logits, labels)

        return CLIPSegImageSegmentationOutput(
            loss=loss,
            logits=logits,
            conditional_embeddings=conditional_embeddings,
            pooled_output=pooled_output,
            vision_model_output=vision_outputs,
            decoder_output=decoder_outputs,
        )


__all__ = [
    "CLIPSegModel",
    "CLIPSegPreTrainedModel",
    "CLIPSegTextModel",
    "CLIPSegVisionModel",
    "CLIPSegForImageSegmentation",
]