
    Z jMh                     .   S r SSKrSSKrSSKJr  SSKJr  SSKrSSKJ	r	  SSKJ
r
  SSKJr  SS	KJr  SS
KJr  SSKJrJrJr  SSKJrJr  SSKJr  SSKJrJrJr  SSKJ r J!r!J"r"J#r#J$r$J%r%J&r&J'r'J(r(J)r)J*r*  \" SS9\	 " S S\5      5       5       r+\" SS9\	 " S S\5      5       5       r,\" SS9\	 " S S\5      5       5       r- " S S\%5      r.\\ " S S\5      5       5       r/\\ " S S\5      5       5       r0 " S S \)5      r1 " S! S"\'5      r2 " S# S$\!5      r3 " S% S&\ 5      r4 " S' S(\#5      r5 " S) S*\#5      r6\ " S+ S,\&5      5       r7 " S- S.\"5      r8 " S/ S0\75      r9 " S1 S2\(5      r: " S3 S4\*5      r; " S5 S6\$5      r<\" S7S89 " S9 S:\75      5       r=/ S;Qr>g)<zPyTorch CLIPSeg model.    N)	dataclass)Any)strict)nn   )initialization)BaseModelOutputWithPooling)Unpack)ModelOutputTransformersKwargsauto_docstring)can_return_tuplemerge_with_config_defaults)capture_outputs   )
CLIPConfigCLIPTextConfigCLIPVisionConfig)CLIPMLPCLIPAttentionCLIPEncoderCLIPEncoderLayer	CLIPModel
CLIPOutputCLIPPreTrainedModelCLIPTextEmbeddingsCLIPTextModelCLIPVisionEmbeddingsCLIPVisionModelzCIDAS/clipseg-rd64)
checkpointc                   &    \ rS rSrSr\" 5       rSrg)CLIPSegTextConfig/   a  
Example:

```python
>>> from transformers import CLIPSegTextConfig, CLIPSegTextModel

>>> # Initializing a CLIPSegTextConfig with CIDAS/clipseg-rd64 style configuration
>>> configuration = CLIPSegTextConfig()

>>> # Initializing a CLIPSegTextModel (with random weights) from the CIDAS/clipseg-rd64 style configuration
>>> model = CLIPSegTextModel(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
``` N__name__
__module____qualname____firstlineno____doc__AttributeErrorprojection_dim__static_attributes__r$       |/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/clipseg/modular_clipseg.pyr"   r"   /         $%Nr.   r"   c                   &    \ rS rSrSr\" 5       rSrg)CLIPSegVisionConfigE   a  
Example:

```python
>>> from transformers import CLIPSegVisionConfig, CLIPSegVisionModel

>>> # Initializing a CLIPSegVisionConfig with CIDAS/clipseg-rd64 style configuration
>>> configuration = CLIPSegVisionConfig()

>>> # Initializing a CLIPSegVisionModel (with random weights) from the CIDAS/clipseg-rd64 style configuration
>>> model = CLIPSegVisionModel(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
```r$   Nr%   r$   r.   r/   r2   r2   E   r0   r.   r2   c                       \ rS rSr% SrSr\\   \\S4   -  \	S'   Sr
\\	S'   Sr\\	S	'   S
r\\-  \	S'   Sr\\	S'   Sr\\	S'   Sr\\	S'   Sr\\	S'   Srg)CLIPSegConfig[   a  
extract_layers (`list[int]`, *optional*, defaults to `[3, 6, 9]`):
    Layers to extract when forwarding the query image through the frozen visual backbone of CLIP.
reduce_dim (`int`, *optional*, defaults to 64):
    Dimensionality to reduce the CLIP vision embedding.
conditional_layer (`int`, *optional*, defaults to 0):
    The layer to use of the Transformer encoder whose activations will be combined with the condition
    embeddings using FiLM (Feature-wise Linear Modulation). If 0, the last layer is used.
use_complex_transposed_convolution (`bool`, *optional*, defaults to `False`):
    Whether to use a more complex transposed convolution in the decoder, enabling more fine-grained
    segmentation..

Example:

```python
>>> from transformers import CLIPSegConfig, CLIPSegModel

>>> # Initializing a CLIPSegConfig with CIDAS/clipseg-rd64 style configuration
>>> configuration = CLIPSegConfig()

>>> # Initializing a CLIPSegModel (with random weights) from the CIDAS/clipseg-rd64 style configuration
>>> model = CLIPSegModel(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config

>>> # We can also initialize a CLIPSegConfig from a CLIPSegTextConfig and a CLIPSegVisionConfig

>>> # Initializing a CLIPSegText and CLIPSegVision configuration
>>> config_text = CLIPSegTextConfig()
>>> config_vision = CLIPSegVisionConfig()

>>> config = CLIPSegConfig(text_config=config_text, vision_config=config_vision)
```)r      	   .extract_layers@   
reduce_dim   decoder_num_attention_heads        decoder_attention_dropout
quick_geludecoder_hidden_acti   decoder_intermediate_sizer   conditional_layerF"use_complex_transposed_convolutionr$   N)r&   r'   r(   r)   r*   r9   listinttuple__annotations__r;   r=   r?   floatrA   strrB   rC   rD   boolr-   r$   r.   r/   r5   r5   [   su    !F 3<NDIc3h/;J'((-0us{0**%)s)s/4&4r.   r5   c                       \ rS rSrSrg)CLIPSegOutput   r$   Nr&   r'   r(   r)   r-   r$   r.   r/   rM   rM          r.   rM   c                       \ rS rSr% SrSr\R                  S-  \S'   Sr	\
\R                  S4   S-  \S'   Sr\
\R                  S4   S-  \S'   Srg)	CLIPSegDecoderOutput   a}  
logits (`torch.FloatTensor` of shape `(batch_size, height, width)`):
    Classification scores for each pixel.
hidden_states (`tuple(torch.FloatTensor)`, *optional*,):
    Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
    Rreturned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`
attentions (`tuple(torch.FloatTensor)`, *optional*):
    Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
    heads. Returned when `output_attentions=True` is passed or when `config.output_attentions=True`
Nlogits.hidden_states
attentionsr$   )r&   r'   r(   r)   r*   rT   torchFloatTensorrH   rU   rG   rV   r-   r$   r.   r/   rR   rR      s\    	 (,FE$+:>M5**C/047>7;Je'',-4;r.   rR   c                       \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Sr
\R                  S-  \S'   Sr\R                  S-  \S'   Sr\\S'   Sr\\S	'   S
\\   4S jrSrg)CLIPSegImageSegmentationOutput   a  
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
    Binary cross entropy loss for segmentation.
logits (`torch.FloatTensor` of shape `(batch_size, height, width)`):
    Classification scores for each pixel.
conditional_embeddings (`torch.FloatTensor` of shape `(batch_size, projection_dim)`):
    Conditional embeddings used for segmentation.
pooled_output (`torch.FloatTensor` of shape `(batch_size, embed_dim)`):
    Pooled output of the [`CLIPSegVisionModel`].
vision_model_output (`BaseModelOutputWithPooling`):
    The output of the [`CLIPSegVisionModel`].
decoder_output (`CLIPSegDecoderOutput`):
    The output of the [`CLIPSegDecoder`].
NlossrT   conditional_embeddingspooled_outputvision_model_outputdecoder_outputreturnc                 B    [        S U R                  5        5       5      $ )Nc              3   p   #    U  H,  n[        U[        5      (       a  UR                  5       OUv   M.     g 7fN)
isinstancer   to_tuple).0vs     r/   	<genexpr>:CLIPSegImageSegmentationOutput.to_tuple.<locals>.<genexpr>   s)     ^P]1Z;%?%?QZZ\QFP]s   46)rG   values)selfs    r/   rf   'CLIPSegImageSegmentationOutput.to_tuple   s    ^PTP[P[P]^^^r.   r$   )r&   r'   r(   r)   r*   r\   rW   rX   rH   rT   r]   r^   r_   r	   r`   rR   rG   r   rf   r-   r$   r.   r/   rZ   rZ      s     &*D%

d
")'+FE$+7;E--4;.2M5$$t+26:3:+/N(/_%* _r.   rZ   c                   `   ^  \ rS rSrSS\R
                  S\R                  4U 4S jjjrSrU =r	$ )CLIPSegVisionEmbeddings   pixel_valuesra   c                 $   > [         TU ]  X5        g rd   superforward)rl   rq   interpolate_pos_encoding	__class__s      r/   ru   CLIPSegVisionEmbeddings.forward   s    ?r.   r$   T)
r&   r'   r(   r)   rW   rX   Tensorru   r-   __classcell__rw   s   @r/   ro   ro      s,    @E$5$5 @Y^YeYe @ @r.   ro   c                       \ rS rSrSrg)CLIPSegTextEmbeddings   r$   NrO   r$   r.   r/   r~   r~      rP   r.   r~   c                       \ rS rSrSrg)CLIPSegAttention   r$   NrO   r$   r.   r/   r   r      rP   r.   r   c                       \ rS rSrSrg)
CLIPSegMLP   r$   NrO   r$   r.   r/   r   r      rP   r.   r   c                       \ rS rSrSrg)CLIPSegEncoderLayer   r$   NrO   r$   r.   r/   r   r      rP   r.   r   c                   p    \ rS rSrSrS\R                  S\R                  S\\R                     4S jr	Sr
g)	CLIPSegDecoderLayer   z
CLIPSeg decoder layer, which is identical to `CLIPSegEncoderLayer`, except that normalization is applied after
self-attention/MLP, rather than before.
rU   attention_maskra   c                     UnU R                   " SUUS.UD6u  pXA-   nU R                  U5      nUnU R                  U5      nXA-   nU R                  U5      nU$ )N)rU   r   r$   )	self_attnlayer_norm1mlplayer_norm2)rl   rU   r   kwargsresidual_s         r/   ru   CLIPSegDecoderLayer.forward   sz     !>> 
')
 
 !0((7 / 0((7r.   r$   N)r&   r'   r(   r)   r*   rW   rz   rG   rX   ru   r-   r$   r.   r/   r   r      s;    
|| 
 
u  	!r.   r   c                   P    \ rS rSr\\/\S.r\R                  " 5       S 5       r
Srg)CLIPSegPreTrainedModel   )rU   rV   c                 	   U R                   R                  n[        U[        5      (       a  [        R
                  " UR                  R                  SUS-  S9  [        R
                  " UR                  R                  SUS-  S9  [        R                  " UR                  [        R                  " UR                  R                  S   5      R                  S5      5        GOY[        U[        5      (       Ga   [        R
                  " UR                   SUR"                  S-  U-  S9  [        R
                  " UR$                  R                  UR                   R&                  U-  S9  [        R
                  " UR                  R                  UR                   R&                  U-  S9  [        R                  " UR                  [        R                  " UR(                  5      R                  S5      5        GOC[        U[*        5      (       a  UR"                  S-  SUR                   R,                  -  S-  -  U-  nUR"                  S-  U-  n[        R
                  " UR.                  R                  US9  [        R
                  " UR0                  R                  US9  [        R
                  " UR2                  R                  US9  [        R
                  " UR4                  R                  US9  GOG[        U[6        5      (       a  UR                   R8                  S-  SUR                   R,                  -  S-  -  U-  nSUR                   R8                  -  S-  U-  n[        R
                  " UR:                  R                  US9  [        R
                  " UR<                  R                  US9  O[        U[>        5      (       ar  [        R
                  " UR@                  R                  URB                  S-  U-  S9  [        R
                  " URD                  R                  URF                  S-  U-  S9  [        U[H        RJ                  5      (       a@  [        RL                  " URN                  5        [        RP                  " UR                  5        [        U[H        RR                  5      (       a/  URN                  b!  [        RL                  " URN                  5        g	g	g	)
zInitialize the weightsr>   g{Gz?)meanstd)   r   g      )r   r   N)*configinitializer_factorre   r~   initnormal_token_embeddingweightposition_embeddingcopy_position_idsrW   arangeshapeexpandro   class_embedding	embed_dimpatch_embeddinginitializer_rangenum_positionsr   num_hidden_layersq_projk_projv_projout_projr   hidden_sizefc1fc2CLIPSegModeltext_projectiontext_embed_dimvisual_projectionvision_embed_dimr   	LayerNormzeros_biasones_Linear)rl   modulefactorin_proj_stdout_proj_stdfc_stds         r/   _init_weights$CLIPSegPreTrainedModel._init_weights   si    //f344LL//66SftmTLL2299RVWJJv**ELL9L9L9R9RSU9V,W,^,^_f,gh 788LL//cv?O?OQU?UX^?^_LL//66FMM<[<[^d<deLL2299v}}?^?^ag?ghJJv**ELL9M9M,N,U,UV],^_ 011!++T1q6==;Z;Z7Z_c6cdgmmK",,d2f<LLL--;?LL--;?LL--;?LL//\B
++!==44d:FMMDcDc@chl?lmpvvK&--333<vEFLL**7LL**<--LL&&--))4/&8 LL((//++T1F:
 fbll++KK$JJv}}%fbii((V[[-DKK$ .E(r.   r$   N)r&   r'   r(   r)   r   r   r   _can_record_outputsrW   no_gradr   r-   r$   r.   r/   r   r      s2     ./BC&
 ]]_&% &%r.   r   c                       \ rS rSrSrg)CLIPSegEncoderi%  r$   NrO   r$   r.   r/   r   r   %  rP   r.   r   c                      ^  \ rS rSrS\4U 4S jjr\\\S\	\
R                     S\
R                  S\\   S\4S j5       5       5       rS	rU =r$ )
CLIPSegDecoderi)  r   c                 >  > [         TU ]  U5        UR                  U l        [        R                  " UR
                  UR                  5      U l        [        R                  " UR
                  UR                  5      U l        UR                  (       a  UR                  R                  S-  UR                  R                  S-  4n[        R                  " [        R                  " UR                  UR                  SSS9[        R                  " 5       [        R                  " UR                  UR                  S-  US   US   S9[        R                  " 5       [        R                  " UR                  S-  SUS   US   S95      U l        ON[        R                  " UR                  SUR                  R                  UR                  R                  S9U l        [#        UR$                  5      n[        R&                  " [)        U5       Vs/ s H8  n[        R                  " UR                  R*                  UR                  5      PM:     sn5      U l        [.        R0                  " UR                  5      nUR                  Ul        UR2                  Ul        UR6                  Ul        S	Ul        [        R&                  " [)        [#        UR$                  5      5       Vs/ s H  n[=        U5      PM     sn5      U l        U RA                  5         g s  snf s  snf )
Nr<   r   r   )kernel_sizepaddingr   r   )r   stride)r   relu)!rt   __init__rC   r   r   r,   r;   film_mulfilm_addrD   vision_config
patch_size
SequentialConv2dReLUConvTranspose2dtransposed_convolutionlenr9   
ModuleListranger   reducescopydeepcopyr=   num_attention_headsrB   intermediate_size
hidden_actr   layers	post_init)rl   r   transposed_kernelsdepthr   decoder_configrw   s         r/   r   CLIPSegDecoder.__init__*  sg    !'!9!9		&"7"79J9JK		&"7"79J9JK44"("6"6"A"AQ"FH\H\HgHgklHl!m*,--		&++V->->AWXY	""%%%%* 21 5-a0	 	""%%*A;Ma;PYklmYn+D' +-*<*<!!1f&:&:&E&EfNbNbNmNm+D' F))*}}UZ[`UabUaPQRYYv++779J9JKUab
 v';';<%+%6%6"-3-O-O*+1+K+K($*!mmRWX[\b\q\qXrRs$tRsQ%8%HRs$tu c %us   ?L#LrU   r]   r   ra   c                    USSS2   nSn[        [        X@R                  U R                  5      5       H  u  nu  pxn	Ub  U	" U5      U-   nOU	" U5      nX`R                  :X  aJ  U R                  U5      UR                  SSS5      -  U R                  U5      -   nUR                  SSS5      nU" U4SS0UD6nM     USS2SS2SS24   R                  SS5      n[        [        R                  " UR                  S   5      5      n
UR                  S   nUR                  XR                  S   X5      nU R                  U5      R                  S5      n[!        US9$ )a  
conditional_embeddings (`torch.FloatTensor` of shape `(batch_size, config.projection_dim)`, *optional*):
    The conditional embeddings for the query images. If provided, the model will use this instead of computing
    the embeddings from the conditional_pixel_values.
Nr   r   r   r   r   )rT   )	enumeratezipr   r   rC   r   permuter   	transposerF   mathsqrtr   viewr   squeezerR   )rl   rU   r]   r   activationsoutputi
activationlayerreducesize
batch_sizerT   s                r/   ru   CLIPSegDecoder.forwardV  s[    $DbD).7KVZVbVb8c.d*A*
6!
+f4
+***'=>PQSTVWAXX[_[h[h*\   1a06A$A&AF /e 12q!++Aq1499V\\!_-.+11!4
Za$E,,V4<<Q?#622r.   )rC   r   r   r   r   r   )r&   r'   r(   r)   r5   r   r   r   r   rG   rW   rz   r
   r   rR   ru   r-   r{   r|   s   @r/   r   r   )  sm    *} *X  %3U\\*%3 !&%3 +,	%3
 
%3    %3r.   r   c                   6   ^  \ rS rSrS\\-  4U 4S jjrSrU =r$ )CLIPSegTextModeli  ra   c                 $   > [         TU ]  " S0 UD6$ )a  
Examples:

```python
>>> from transformers import AutoTokenizer, CLIPSegTextModel

>>> tokenizer = AutoTokenizer.from_pretrained("CIDAS/clipseg-rd64-refined")
>>> model = CLIPSegTextModel.from_pretrained("CIDAS/clipseg-rd64-refined")

>>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")

>>> outputs = model(**inputs)
>>> last_hidden_state = outputs.last_hidden_state
>>> pooled_output = outputs.pooler_output  # pooled (EOS token) states
```r$   rs   rl   super_kwargsrw   s     r/   ru   CLIPSegTextModel.forward  s      w...r.   r$   )	r&   r'   r(   r)   rG   r	   ru   r-   r{   r|   s   @r/   r   r     s    /1K)K / /r.   r   c            
       n   ^  \ rS rSr S	S\R
                  S-  S\S-  S\\   S\	\
-  4U 4S jjjrSrU =r$ )
CLIPSegVisionModeli  rq   Nrv   r   ra   c                 &   > [         TU ]  " X40 UD6$ )a  
Examples:

```python
>>> import httpx
>>> from io import BytesIO
>>> from PIL import Image
>>> from transformers import AutoProcessor, CLIPSegVisionModel

>>> processor = AutoProcessor.from_pretrained("CIDAS/clipseg-rd64-refined")
>>> model = CLIPSegVisionModel.from_pretrained("CIDAS/clipseg-rd64-refined")

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> with httpx.stream("GET", url) as response:
...     image = Image.open(BytesIO(response.read()))

>>> inputs = processor(images=image, return_tensors="pt")

>>> outputs = model(**inputs)
>>> last_hidden_state = outputs.last_hidden_state
>>> pooled_output = outputs.pooler_output  # pooled CLS states
```rs   rl   rq   rv   r   rw   s       r/   ru   CLIPSegVisionModel.forward  s    8 w|PPPr.   r$   ry   )r&   r'   r(   r)   rW   rX   rK   r
   r   rG   r	   ru   r-   r{   r|   s   @r/   r  r    sY     15Q''$.Q #'+Q +,	Q
 
+	+Q Qr.   r  c            
          ^  \ rS rSrU 4S jr S
S\R                  S\S\\	   S\
\-  4U 4S jjjrS
S\4U 4S jjjrS	rU =r$ )r   i  c                 $   > [         TU ]  " S0 UD6$ )a  
Examples:

```python
>>> import torch
>>> from transformers import AutoTokenizer, CLIPSegModel

>>> tokenizer = AutoTokenizer.from_pretrained("CIDAS/clipseg-rd64-refined")
>>> model = CLIPSegModel.from_pretrained("CIDAS/clipseg-rd64-refined")

>>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
>>> with torch.inference_mode():
...     text_features = model.get_text_features(**inputs)
```r$   )rt   get_text_featuresr  s     r/   r  CLIPSegModel.get_text_features  s     w(8<88r.   rq   rv   r   ra   c                 &   > [         TU ]  " X40 UD6$ )a$  
Examples:

```python
>>> import torch
>>> from transformers import AutoProcessor, CLIPSegModel
>>> from transformers.image_utils import load_image

>>> processor = AutoProcessor.from_pretrained("CIDAS/clipseg-rd64-refined")
>>> model = CLIPSegModel.from_pretrained("CIDAS/clipseg-rd64-refined")

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = load_image(url)

>>> inputs = processor(images=image, return_tensors="pt")

>>> with torch.inference_mode():
...     image_features = model.get_image_features(**inputs)
```)rt   get_image_featuresr  s       r/   r  CLIPSegModel.get_image_features  s    2 w),[TZ[[r.   c                 *   > [         TU ]  " SSU0UD6  g)al  
return_loss (`bool`, *optional*):
    Whether or not to return the contrastive loss.

Examples:

```python
>>> import torch
>>> from transformers import AutoProcessor, CLIPSegModel
>>> from transformers.image_utils import load_image

>>> processor = AutoProcessor.from_pretrained("CIDAS/clipseg-rd64-refined")
>>> model = CLIPSegModel.from_pretrained("CIDAS/clipseg-rd64-refined")

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = load_image(url)

>>> inputs = processor(
...     text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True
... )

>>> with torch.inference_mode():
...     outputs = model(**inputs)
>>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
>>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
```rv   Nr$   rs   )rl   rv   r  rw   s      r/   ru   CLIPSegModel.forward  s    6 	Z1IZ\Zr.   r$   ry   )r&   r'   r(   r)   r  rW   rX   rK   r
   r   rG   r	   r  ru   r-   r{   r|   s   @r/   r   r     si    9( *.\''\ #'\ +,	\
 
+	+\ \6[ [ [r.   r   zn
    CLIPSeg model with a Transformer-based decoder on top for zero-shot and one-shot image segmentation.
    )custom_introc                   
  ^  \ rS rSr% \\S'   S\4U 4S jjr     SS\S-  S\R                  S-  S\R                  S-  S\R                  S-  S	\R                  S-  S
\R                  4S jjr\\        SS\R                  S-  S\R                  S-  S	\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\S\\   S
\\-  4S jj5       5       rSrU =r$ )CLIPSegForImageSegmentationi   r   c                    > [         TU ]  U5        [        U5      U l        UR                  U l        [        U5      U l        U R                  5         g rd   )rt   r   r   clipr9   r   decoderr   )rl   r   rw   s     r/   r   $CLIPSegForImageSegmentation.__init__  s@      (	$33%f-r.   Nr   	input_idsr   r   conditional_pixel_valuesra   c                    Ub^  [        U5      U:w  a  [        S5      e[        R                  " 5          U R                  R                  X#US9R                  nS S S 5        U$ Ub_  [        U5      U:w  a  [        S5      e[        R                  " 5          U R                  R                  U5      R                  nS S S 5        U$ [        S5      e! , (       d  f       W$ = f! , (       d  f       W$ = f)Nz@Make sure to pass as many prompt texts as there are query images)r   r   zAMake sure to pass as many prompt images as there are query imagesz[Invalid conditional, should be either provided as `input_ids` or `conditional_pixel_values`)r   
ValueErrorrW   r   r  r  pooler_outputr  )rl   r   r  r   r   r  r]   s          r/   get_conditional_embeddings6CLIPSegForImageSegmentation.get_conditional_embeddings  s      9~+ !cdd)-)D)D< *E *- ' ! &% &1+,
: !dee)-)E)EF^)_)m)m& ! &%	 m  ! &% ! &%s   %C&C!
C!
C0rq   r]   labelsrv   r   c	           	      h   [         R                  " 5          SU	S'   U R                  R                  " SUUS.U	D6n
U
R                  nU
R
                  nU R                   Vs/ s H
  oUS-      PM     nn[        U
R                  U
R                  U
R
                  U
R                  S9n
SSS5        Uc!  U R                  UR                  S   UUUUS9nO]UR                  S   UR                  S   :w  a  [        S	5      eUR                  S   U R                  R                  :w  a  [        S
5      eU R                  " WU40 U	D6nUR                   nSnUb9  UR#                  UR$                  5      n[&        R(                  " 5       nU" UU5      n[+        UUUWW
US9$ s  snf ! , (       d  f       GN= f)a  
conditional_pixel_values (`torch.FloatTensor`, *optional*):
    The pixel values of the conditional images.
conditional_embeddings (`torch.FloatTensor` of shape `(batch_size, config.projection_dim)`, *optional*):
    The conditional embeddings for the query images. If provided, the model will use this instead of computing
    the embeddings from the conditional_pixel_values.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

Examples:

```python
>>> import torch
>>> from transformers import AutoProcessor, CLIPSegForImageSegmentation
>>> from transformers.image_utils import load_image

>>> processor = AutoProcessor.from_pretrained("CIDAS/clipseg-rd64-refined")
>>> model = CLIPSegForImageSegmentation.from_pretrained("CIDAS/clipseg-rd64-refined")

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = load_image(url)

>>> texts = ["a cat", "a remote", "a blanket"]
>>> inputs = processor(text=texts, images=[image] * len(texts), padding=True, return_tensors="pt")

>>> with torch.inference_mode():
...     outputs = model(**inputs)

>>> logits = outputs.logits
>>> print(logits.shape)
torch.Size([3, 352, 352])
```Toutput_hidden_states)rq   rv   r   )last_hidden_stater  rU   rV   Nr   )r   r  r   r   r  zWMake sure to pass as many conditional embeddings as there are query images in the batchzcMake sure that the feature dimension of the conditional embeddings matches `config.projection_dim`.)r\   rT   r]   r^   r_   r`   r$   )rW   r   r  r  r  rU   r9   r	   r#  rV   r  r   r  r   r,   r  rT   todevicer   BCEWithLogitsLossrZ   )rl   r  rq   r  r]   r   r   r   rv   r   vision_outputsr^   rU   r   r   decoder_outputsrT   r\   loss_fns                      r/   ru   #CLIPSegForImageSegmentation.forward-  s   b ]]_-1F)*!YY99 ))A N
 +88M*88M9=9L9LM9LAQ/9LKM 8"0"B"B,::,::)44	N , ")%)%D%D'--a0#-))A &E &" &++A.,2D2DQ2GG m  &++A.$++2L2LL 0  ,,"
 

 !''YYv}}-F**,G66*D-#9' .*
 	
[ N _s   AF"!F26F"F""
F1)r  r  r9   )NNNNN)NNNNNNNT)r&   r'   r(   r)   r5   rH   r   rF   rW   rz   rX   r  r   r   
LongTensorrK   r
   r   rG   rM   ru   r-   r{   r|   s   @r/   r  r     s    }  "&)-.2,08<&$J& <<$&& t+	&
 llT)& #(,,"5& 
		&:  /315=A;?.204*.)-n
$$t+n
 ''$.n
 #("3"3d":	n

 !& 1 1D 8n
 t+n
 &&-n
   4'n
 #'n
 +,n
 
	n
  n
r.   r  )r5   r"   r2   r   r   r   r  r  )?r*   r   r   dataclassesr   typingr   rW   huggingface_hub.dataclassesr   r    r   r   modeling_outputsr	   processing_utilsr
   utilsr   r   r   utils.genericr   r   utils.output_capturingr   clip.configuration_clipr   r   r   clip.modeling_clipr   r   r   r   r   r   r   r   r   r   r   r"   r2   r5   rM   rR   rZ   ro   r~   r   r   r   r   r   r   r   r   r  r   r  __all__r$   r.   r/   <module>r8     s      !   .  & : & D D I 5 R R    /0& &  1&( /0&* &  1&( /0+5J +5  1+5\	J 	 <; <  <" _[ _  _6@2 @	. 		} 		 		* 	* > -%0 -% -%`	[ 	U3+ U3p/} /(Q Q@H[9 H[V 
X
"8 X

X
v	r.   