
    Z j              	          S r SSKrSSKJr  SSKrSSKJr  SSKJr	  SSK
Jr  SSKJr  SS	KJr  SS
KJr  SSKJrJrJrJr  SSKJr  \R4                  " \5      r\" SS9\ " S S\5      5       5       r\" SS9\ " S S\5      5       5       r " S S\R>                  5      r  " S S\R>                  5      r! " S S\R>                  5      r" " S S\R>                  5      r#S>S\RH                  S\%S\&S \RH                  4S! jjr' " S" S#\R>                  5      r( " S$ S%\5      r) " S& S'\R>                  5      r* " S( S)\RV                  5      r, " S* S+\R>                  5      r- " S, S-\R>                  5      r.\ " S. S/\5      5       r/\ " S0 S1\/5      5       r0S2\RH                  S3\1S \RH                  4S4 jr2S2\RH                  S5\1S6\1S \RH                  4S7 jr3 " S8 S9\R>                  5      r4\" S:S9 " S; S<\/5      5       r5/ S=Qr6g)?zPyTorch SegGpt model.    N)	dataclass)nn)
functional   )initialization)ACT2FN)GradientCheckpointingLayer)PreTrainedModel)ModelOutputauto_docstringlogging	torch_int   )SegGptConfigz1
    Output type of [`SegGptEncoderOutput`].
    )custom_introc                       \ rS rSr% Sr\R                  \S'   Sr\	\R                     S-  \S'   Sr
\	\R                     S-  \S'   Sr\	\R                     S-  \S'   Srg)	SegGptEncoderOutput"   aE  
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, patch_height, patch_width, hidden_size)`):
    Sequence of hidden-states at the output of the last layer of the model.
hidden_states (`tuple[torch.FloatTensor]`, `optional`, returned when `config.output_hidden_states=True`):
    Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
    of shape `(batch_size, patch_height, patch_width, hidden_size)`.
attentions (`tuple[torch.FloatTensor]`, `optional`, returned when `config.output_attentions=True`):
    Tuple of *torch.FloatTensor* (one for each layer) of shape
    `(batch_size, num_heads, seq_len, seq_len)`.
intermediate_hidden_states (`tuple[torch.FloatTensor]`, *optional*, returned when `config.intermediate_hidden_state_indices` is set):
    Tuple of `torch.FloatTensor` of shape `(batch_size, patch_height, patch_width, hidden_size)`.
    Each element in the Tuple corresponds to the output of the layer specified in `config.intermediate_hidden_state_indices`.
    Additionally, each feature passes through a LayerNorm.
last_hidden_stateNhidden_states
attentionsintermediate_hidden_states )__name__
__module____qualname____firstlineno____doc__torchFloatTensor__annotations__r   tupler   r   __static_attributes__r       {/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/seggpt/modeling_seggpt.pyr   r   "   sd     (((59M5**+d2926Je''(4/6BFe&7&7 84 ?Fr$   r   z;
    Output type of [`SegGptImageSegmentationOutput`].
    c                       \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Sr
\\R                     S-  \S'   Sr\\R                     S-  \S'   Srg)	SegGptImageSegmentationOutput>   a  
loss (`torch.FloatTensor`, *optional*, returned when `labels` is provided):
    The loss value.
pred_masks (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
    The predicted masks.
hidden_states (`tuple[torch.FloatTensor]`, `optional`, returned when `config.output_hidden_states=True`):
    Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
    of shape `(batch_size, patch_height, patch_width, hidden_size)`.
attentions (`tuple[torch.FloatTensor]`, `optional`, returned when `config.output_attentions=True`):
    Tuple of `torch.FloatTensor` (one for each layer) of shape
    `(batch_size, num_heads, seq_len, seq_len)`.
Nloss
pred_masksr   r   r   )r   r   r   r   r   r)   r   r    r!   r*   r   r"   r   r#   r   r$   r%   r'   r'   >   sg     &*D%

d
")+/J!!D(/59M5**+d2926Je''(4/6r$   r'   c                   2   ^  \ rS rSrSrU 4S jrS rSrU =r$ )SegGptPatchEmbeddingsY   z
This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
`hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
Transformer.
c                   > [         TU ]  5         UR                  UR                  p2UR                  UR
                  pT[        U[        R                  R                  5      (       a  UOX"4n[        U[        R                  R                  5      (       a  UOX34nUS   US   -  US   US   -  -  nX l        X0l        X@l        X`l
        [        R                  " XEX3S9U l        g )Nr   r   )kernel_sizestride)super__init__
image_size
patch_sizenum_channelshidden_size
isinstancecollectionsabcIterablenum_patchesr   Conv2d
projection)selfconfigr3   r4   r5   r6   r;   	__class__s          r%   r2   SegGptPatchEmbeddings.__init__`   s    !'!2!2F4E4EJ$*$7$79K9Kk#-j+//:R:R#S#SZZdYq
#-j+//:R:R#S#SZZdYq
!!}
15*Q-:VW=:XY$$(&))L:ir$   c                 J   UR                   u  p#pEX0R                  :w  a  [        S5      eX@R                  S   :w  d  XPR                  S   :w  a2  [        SU SU SU R                  S    SU R                  S    S3	5      eU R	                  U5      R                  SSS	S5      nU$ )
NzeMake sure that the channel dimension of the pixel values match with the one set in the configuration.r   r   zInput image size (*z) doesn't match model ().   r   )shaper5   
ValueErrorr3   r=   permute)r>   pixel_values
batch_sizer5   heightwidth
embeddingss          r%   forwardSegGptPatchEmbeddings.forwardn   s    2>2D2D/
&,,,w  __Q''5OOA4F+F$VHAeW4KDOO\]L^K__`aeapapqras`ttvw  __\2::1aAF
r$   )r3   r5   r;   r4   r=   )	r   r   r   r   r   r2   rN   r#   __classcell__r@   s   @r%   r,   r,   Y   s    j r$   r,   c                      ^  \ rS rSrSrS\SS4U 4S jjrS\S\S\R                  4S	 jr
  SS
\R                  S\R                  S\R                  S-  S\S-  S\R                  4
S jjrSrU =r$ )SegGptEmbeddings|   zP
Construct the embeddings from patch, position embeddings for input and prompt.
r?   returnNc                   > [         TU ]  5         [        R                  " [        R
                  " SSSUR                  5      5      U l        [        R                  " [        R
                  " SSSUR                  5      5      U l        [        R                  " [        R
                  " SSSUR                  5      5      U l	        [        R                  " [        R
                  " SSSUR                  5      5      U l
        [        R                  " [        R
                  " SSSUR                  5      5      U l        [        U5      U l        UR                  UR                  -  S-  S-   n[        R                  " [        R                   " SX!R                  5      5      U l        [        R$                  " UR&                  5      U l        g )Nr   rE   )r1   r2   r   	Parameterr   zerosr6   
mask_tokensegment_token_inputsegment_token_prompttype_token_semantictype_token_instancer,   patch_embeddingspretrain_image_sizer4   randnposition_embeddingsDropouthidden_dropout_probdropout)r>   r?   num_positionsr@   s      r%   r2   SegGptEmbeddings.__init__   s1   ,,u{{1aF<N<N'OP#%<<Aq!VEWEW0X#Y $&LLQ1fFXFX1Y$Z!#%<<Aq!VEWEW0X#Y #%<<Aq!VEWEW0X#Y  5f =33v7H7HHQNQRR#%<<A}N`N`0a#b zz&"<"<=r$   rK   rL   c                    U R                   S S 2SS 24   nUR                  S   n[        US-  5      n[        R                  R                  5       (       d
  XQ:w  d  XR:w  aO  [        R                  " UR                  SXUS5      R                  SSSS5      X4SSS	9nUR                  SSSS5      $ UR                  SXS5      $ )
Nr         ?r   r   rE   bicubicF)sizemodealign_corners)
ra   rF   r   r   jit
is_tracingFinterpolatereshaperH   )r>   rK   rL   patch_pos_embedr;   pretrain_patch_sizes         r%   interpolate_pos_encoding)SegGptEmbeddings.interpolate_pos_encoding   s    221ab59%++A.'S(89 99!!%8%BFYFbmm''+>UWX``abdeghjkl_#	O #**1aA66"**1fR@@r$   rI   prompt_pixel_valuesbool_masked_posembedding_typec                 :   U R                  U5      nU R                  U5      nUR                  u  pxpU R                  R                  XxU	S5      nUR	                  S5      R                  U5      R                  SXS5      nUSU-
  -  X-  -   nUb  UOSnU R                  X5      nXPR                  -   nX`R                  -   nX]-   nXm-   nUS:X  a  U R                  nO!US:X  a  U R                  nO[        SU 35      eX^-   nXn-   n[        R                  " XV4SS9nU$ )Nri   r   instancesemanticzBEmbedding type should be either 'semantic' or 'instance', but got r   dim)r^   rF   rY   expand	unsqueezetype_asrr   ru   rZ   r[   r\   r]   rG   r   cat)r>   rI   rw   rx   ry   input_embeddingsprompt_embeddingsrJ   patch_heightpatch_width_rY   w	pos_embedtype_embeddingrM   s                   r%   rN   SegGptEmbeddings.forward   sL     00> 112EF3C3I3I0
+__++JkSUV
%%b)11*=EEb,efg-Q7*.H+9+E: 11,L	 ,.F.FF-0I0II ,7-9 Z'!55Nz)!55Nabpaqrss+<->YY 0D!L
r$   )rd   rY   r^   ra   rZ   r[   r]   r\   )NN)r   r   r   r   r   r   r2   intr   Tensorru   
BoolTensorstrrN   r#   rP   rQ   s   @r%   rS   rS   |   s    >| > > As A3 A5<< A, 48%)+ll+ #\\+ ))D0	+
 d
+ 
+ +r$   rS   c                   J  ^  \ rS rSrSrU 4S jrS\S\S\R                  S\R                  4S jr	S	\R                  S
\R                  S\R                  S\R                  S\
\\4   S\
\\4   S\R                  4S jrSS\R                  S\R                  4S jjrSrU =r$ )SegGptAttention   z=Multi-head Attention block with relative position embeddings.c                   > [         TU ]  5         UR                  UR                  p2[	        U[
        R                  R                  5      (       a  UOX"4n[	        U[
        R                  R                  5      (       a  UOX34nUS   UR                  -  US   UR                  -  4nUR                  UR                  -  nUR                  U l	        US-  U l
        [        R                  " UR                  UR                  S-  UR                  S9U l        [        R                  " UR                  UR                  5      U l        UR                   U l        U R                   (       a  Uc  [#        S5      e[        R$                  " [&        R(                  " SUS   -  S-
  U5      5      U l        [        R$                  " [&        R(                  " SUS   -  S-
  U5      5      U l        g g )Nr   r   g      r   biaszBInput size must be provided if using relative positional encoding.rE   )r1   r2   r3   r4   r7   r8   r9   r:   r6   num_attention_headsscaler   Linearqkv_biasqkvproj use_relative_position_embeddingsrG   rW   r   rX   	rel_pos_h	rel_pos_w)r>   r?   r3   r4   
input_sizehead_dimr@   s         r%   r2   SegGptAttention.__init__   s   !'!2!2F4E4EJ#-j+//:R:R#S#SZZdYq
#-j+//:R:R#S#SZZdYq
 mv'8'88*Q-6K\K\:\]
%%)C)CC#)#=#= t^
99V//1C1Ca1Gfoo^IIf00&2D2DE	060W0W-00! !eff  \\%++a*Q-6G!6KX*VWDN\\%++a*Q-6G!6KX*VWDN 1r$   q_sizek_sizerel_posrU   c                 
   [        S[        X5      -  S-
  5      n[        R                  " UR	                  SUR
                  S   S5      R                  SSS5      USS9nUR	                  SU5      R                  SS5      n[        R                  " U5      SS2S4   [        X!-  S5      -  n[        R                  " U5      SSS24   [        X-  S5      -  nXg-
  US-
  [        X-  S5      -  -   nXXR                  5          $ )	aa  
Get relative positional embeddings according to the relative positions of
    query and key sizes.

Args:
    q_size (int):
        size of the query.
    k_size (int):
        size of key k.
    rel_pos (`torch.Tensor`):
        relative position embeddings (L, channel).

Returns:
    Extracted positional embeddings according to relative positions.
rE   r   r   ri   linear)rk   rl   Ng      ?)
r   maxrp   rq   rr   rF   rH   r   arangelong)	r>   r   r   r   max_rel_distrel_pos_resizedq_coordsk_coordsrelative_coordss	            r%   get_rel_posSegGptAttention.get_rel_pos   s      1s622Q67--OOAw}}Q/4<<Q1E

 *11"lCKKAqQ <<'403v3LL<<'a03v3LL#.6A:V_VYAZ2ZZ33566r$   attnqueryr   r   c                    Uu  pxUu  pU R                  XyU5      nU R                  XU5      nUR                  u  pnUR                  XX5      n[        R                  " SUU5      n[        R                  " SUU5      nUR                  XXU
5      nUUSS2SS2SS2SS2S4   -   USS2SS2SS2SSS24   -   nUR                  XU-  X-  5      nU$ )aZ  
Calculate decomposed Relative Positional Embeddings from :paper:`mvitv2`.
https://github.com/facebookresearch/mvit/blob/19786631e330df9f3622e5402b4a419a263a2c80/mvit/models/attention.py

Args:
    attn (`torch.Tensor`):
        attention map.
    query (`torch.Tensor`):
        query q in the attention layer with shape (batch_size, query_height * query_width, channel).
    rel_pos_h (`torch.Tensor`):
        relative position embeddings (Lh, channel) for height axis.
    rel_pos_w (`torch.Tensor`):
        relative position embeddings (Lw, channel) for width axis.
    q_size (tuple):
        spatial sequence size of query q with (query_height, query_width).
    k_size (tuple):
        spatial sequence size of key k with (key_height, key_width).

Returns:
    attn (`torch.Tensor`):
        attention map with added relative positional embeddings.
zbhwc,hkc->bhwkzbhwc,wkc->bhwkN)r   rF   rr   r   einsum)r>   r   r   r   r   r   r   query_heightquery_width
key_height	key_widthrelative_position_heightrelative_position_widthrJ   r   r~   reshaped_queryrel_hrel_ws                      r%   add_decomposed_rel_pos&SegGptAttention.add_decomposed_rel_pos  s    > %+! &
#'#3#3Li#X "&"2"2;9"U"[[
szR-~?WX-~?VW||JkyYeAq!Q,--aAtQ6F0GG||J{(BJDZ[r$   r   c           	         UR                   u  p4pVU R                  U5      R                  X4U-  SU R                  S5      R	                  SSSSS5      nUR                  SX0R                  -  XE-  S5      R                  S5      u  pn
XR                  -  U	R                  SS5      -  nU R                  (       a+  U R                  XU R                  U R                  XE4XE45      n[        R                  R                  R                  U[        R                   SS9R#                  UR$                  5      nU(       aA  UR'                  X0R                  XE-  S5      nUR'                  X0R                  -  XE-  S5      nOS nX-  R                  X0R                  XES5      nUR	                  SSSSS5      R                  X4US5      nU R)                  U5      nX4$ )	Nr   ri   rE   r   r      )dtyper~   )rF   r   rr   r   rH   unbindr   	transposer   r   r   r   r   r   r   softmaxfloat32tor   viewr   )r>   r   output_attentionsrJ   rK   rL   r   r   r   keyvalueattn_weightsattn_weights_reshapedattn_outputs                 r%   rN   SegGptAttention.forward9  s   '4':':$
E HH]#WZ%D4L4LbQWQ1a# 	  KK:8P8P+PRXR`bdellmnoE

*cmmB.CC0066T^^T^^f_W]VeL xx**22<u}}Z\2]``afalalm
 %1$5$5jBZBZ\b\jln$o!055jC[C[6[]c]kmopL$(!#+44ZAYAY[ajlm!))!Q1a8@@UZ\^_ii,33r$   )r   r   r   r   r   r   r   )F)r   r   r   r   r   r2   r   r   r   r   r"   r   rN   r#   rP   rQ   s   @r%   r   r      s    GX07# 7s 7U\\ 7ell 7@+ll+ ||+ <<	+
 <<+ c3h+ c3h+ 
+Z#4U\\ #4u|| #4 #4r$   r   c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )	SegGptMlpi`  c                   > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        R                  " UR
                  UR                  5      U l        [        UR                     U l
        g N)r1   r2   r   r   r6   mlp_dimlin1lin2r   
hidden_actactr>   r?   r@   s     r%   r2   SegGptMlp.__init__a  sX    IIf00&..A	IIfnnf.@.@A	&++,r$   r   rU   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ r   )r   r   r   r>   r   s     r%   rN   SegGptMlp.forwardg  s2    		-0/		-0r$   )r   r   r   )
r   r   r   r   r2   r   r   rN   r#   rP   rQ   s   @r%   r   r   `  s(    -U\\ ell  r$   r   input	drop_probtrainingrU   c                    US:X  d  U(       d  U $ SU-
  nU R                   S   4SU R                  S-
  -  -   nU[        R                  " X@R                  U R
                  S9-   nUR                  5         U R                  U5      U-  nU$ )z[
Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

        r   r   )r   r   device)rF   ndimr   randr   r   floor_div)r   r   r   	keep_probrF   random_tensoroutputs          r%   	drop_pathr   o  s    
 CxII[[^

Q 77E

5ELL YYMYYy!M1FMr$   c                      ^  \ rS rSrSrSS\S-  SS4U 4S jjjrS\R                  S\R                  4S jr	S\
4S	 jrS
rU =r$ )SegGptDropPathi  zXDrop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).Nr   rU   c                 .   > [         TU ]  5         Xl        g r   )r1   r2   r   )r>   r   r@   s     r%   r2   SegGptDropPath.__init__  s    "r$   r   c                 B    [        XR                  U R                  5      $ r   )r   r   r   r   s     r%   rN   SegGptDropPath.forward  s    FFr$   c                      SU R                    3$ )Nzp=r   r>   s    r%   
extra_reprSegGptDropPath.extra_repr  s    DNN#$$r$   r   r   )r   r   r   r   r   floatr2   r   r   rN   r   r   r#   rP   rQ   s   @r%   r   r     sQ    b#%$, #$ # #GU\\ Gell G%C % %r$   r   c                      ^  \ rS rSrS\S\SS4U 4S jjr  SS\R                  S\	S	\
S
\
S\\R                  \R                  4   \\R                     -  4
S jjrSrU =r$ )SegGptLayeri  r?   drop_path_raterU   Nc                 p  > [         TU ]  5         [        U5      U l        [	        U5      U l        US:  a  [        U5      O[        R                  " 5       U l	        [        R                  " UR                  UR                  S9U l        [        R                  " UR                  UR                  S9U l        g )Nr   eps)r1   r2   r   	attentionr   mlpr   r   Identityr   	LayerNormr6   layer_norm_epslayernorm_beforelayernorm_after)r>   r?   r   r@   s      r%   r2   SegGptLayer.__init__  s    (0V$;IC;O7UWU`U`Ub "V-?-?VEZEZ [!||F,>,>FDYDYZr$   r   ensemble_condfeature_ensembler   c                    U R                  U R                  U5      US9nUS   nUSS  nU(       a  UR                  S   S-  U:  a  UR                  UR                  S   S-  SS9u  pUS:X  a^  UR                  S   S-  n
U	R	                  SU
S5      n	U	R                  SSS9R                  U	5      n	U	R                  " UR                  6 n	OU	R                  SSS9R                  U	5      n	[        R                  " X/SS9nU R                  U5      U-   nUnU R                  U5      nU R                  U5      nXR                  U5      -   nU4U-   nU$ )	N)r   r   r   rE   r}   ri   T)r~   keepdim)r   r  rF   splitrr   mean	expand_asr   r   r   r  r  )r>   r   r  r	  r   self_attention_outputsattention_outputoutputspromptinputsnum_promptsresiduals               r%   rN   SegGptLayer.forward  sr    "&!!-0/ "0 "
 2!4(, 0 6 6q 9Q >- O-334D4J4J14MQR4RXY3ZNF!.44Q71<;;D9CCFK6D9CCFK$yy&)9qA '78=H ,,];/ >>-#@@ "W,r$   )r   r   r  r  r  )FF)r   r   r   r   r   r   r2   r   r   r   boolr"   rN   r#   rP   rQ   s   @r%   r   r     s    [| [U [t [ "'"'#||# # 	#
  # 
u||U\\)	*U5<<-@	@# #r$   r   c                   |   ^  \ rS rSrS\SS4U 4S jjr    SS\R                  S\S\S	\S
\S\	\
-  4S jjrSrU =r$ )SegGptEncoderi  r?   rU   Nc           
        > [         TU ]  5         Xl        [        R                  " SUR
                  UR                  SS9 Vs/ s H  o"R                  5       PM     nn[        R                  " [        UR                  5       Vs/ s H  n[        XU   5      PM     sn5      U l        [        R                  " UR                  UR                  S9U l        SU l        g s  snf s  snf )Nr   cpu)r   r   F)r1   r2   r?   r   linspacer   num_hidden_layersitemr   
ModuleListranger   layersr  r6   r  	layernormgradient_checkpointing)r>   r?   xdprir@   s        r%   r2   SegGptEncoder.__init__  s    !&63H3H&JbJbkp!qr!qAvvx!qrmm%PVPhPhJi$jJiQ[Q%@Ji$jkf&8&8f>S>ST&+# s$js   CC$r   r	  r   output_hidden_statesreturn_dictc                 d   U(       a  SOS nU(       a  SOS n/ n[        U R                  5       H  u  pU(       a  Xa4-   nU R                  R                  U	:  a  SOSnU
" XX#5      nUS   nXR                  R                  :X  a-  US UR                  S   S-   XR                  S   S-  S  -   S-  nXR                  R
                  ;   a   UR                  U R                  U5      5        U(       d  M  X|S   4-   nM     U(       a  Xa4-   nU(       d  [        S XXx4 5       5      $ [        UUUUS9$ )Nr   rE   r   r   rh   c              3   0   #    U  H  nUc  M  Uv   M     g 7fr   r   ).0vs     r%   	<genexpr>(SegGptEncoder.forward.<locals>.<genexpr>  s      lA ls   	)r   r   r   r   )
	enumerater!  r?   merge_indexrF   !intermediate_hidden_state_indicesappendr"  r"   r   )r>   r   r	  r   r(  r)  all_hidden_statesall_self_attentionsr   r&  layer_moduler  layer_outputss                r%   rN   SegGptEncoder.forward  sT    #7BD$5b4%'"(5OA#$58H$H! "&!8!81!<A!M(GWkM)!,MKK+++!"?M$7$7$:a$?@=QdQdefQgklQlQnCoo! KKAAA*11$..2OP  &91=M<O&O#)  6,   14D D '<Ol  
 #++*'A	
 	
r$   )r?   r#  r"  r!  )FFFT)r   r   r   r   r   r2   r   r   r  r"   r   rN   r#   rP   rQ   s   @r%   r  r    sr    ,| , , "'"'%* 0
||0
 0
  	0

 #0
 0
 
$	$0
 0
r$   r  c                   v   ^  \ rS rSrSrSSS.U 4S jjrS\R                  S\R                  4U 4S	 jjrS
r	U =r
$ )SegGptLayerNormi  a5  LayerNorm that supports two data formats: channels_last (default) or channels_first.
The ordering of the dimensions in the inputs. channels_last corresponds to inputs with shape (batch_size, height,
width, channels) while channels_first corresponds to inputs with shape (batch_size, channels, height, width).
gư>channels_last)r   data_formatc                `   > [         TU ]  " U4SU0UD6  US;  a  [        SU 35      eX0l        g )Nr   )r;  channels_firstzUnsupported data format: )r1   r2   NotImplementedErrorr<  )r>   normalized_shaper   r<  kwargsr@   s        r%   r2   SegGptLayerNorm.__init__  s=    )=s=f=AA%(A+&OPP&r$   featuresrU   c                    > U R                   S:X  a9  UR                  SSSS5      n[        TU ]  U5      nUR                  SSSS5      nU$ [        TU ]  U5      nU$ )zt
Args:
    features: Tensor of shape (batch_size, channels, height, width) OR (batch_size, height, width, channels)
r>  r   rE   r   r   )r<  rH   r1   rN   )r>   rC  r@   s     r%   rN   SegGptLayerNorm.forward  sj    
 //''1a3Hwx0H''1a3H  wx0Hr$   )r<  )r   r   r   r   r   r2   r   r   rN   r#   rP   rQ   s   @r%   r:  r:    s9    
 15/ ' '   r$   r:  c                   J   ^  \ rS rSrU 4S jrS\R                  4S jrSrU =r	$ )SegGptDecoderHeadi  c                 F  > [         TU ]  5         [        R                  " UR                  UR                  SSS9U l        [        UR                  UR                  SS9U l        [        UR                     U l        [        R                  " UR                  SSSS9U l        g )Nr   r   )r/   paddingr>  )r@  r   r<  T)r/   r   )r1   r2   r   r<   decoder_hidden_sizeconvr:  r  r"  r   r   act_fctheadr   s     r%   r2   SegGptDecoderHead.__init__  s    II&&&&	
	 )#77V=R=R`p
 f//0IIf88!QUV	r$   r   c                     U R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nU$ r   )rK  r"  rL  rM  r   s     r%   rN   SegGptDecoderHead.forward"  s@    		-0}5]3		-0r$   )rL  rK  rM  r"  )
r   r   r   r   r2   r   r    rN   r#   rP   rQ   s   @r%   rG  rG    s     WU%6%6  r$   rG  c                      ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrS\R                  4S jrSr	U =r
$ )SegGptDecoderi+  c                 <  > [         TU ]  5         [        R                  " UR                  [        UR                  5      -  UR                  S-  UR                  -  SS9U l	        [        U5      U l        UR                  U l        UR                  U l        Xl        g )NrE   Tr   )r1   r2   r   r   r6   lenr2  r4   rJ  decoder_embedrG  decoder_predr?   r   s     r%   r2   SegGptDecoder.__init__,  s    YYV%M%M!NNq 6#=#==

 .f5 ++#)#=#= r$   r   rU   c                 
   UR                   u  p#pEUR                  X#X@R                  U R                  U R                  5      nUR	                  SSSSSS5      nUR                  USX0R                  -  X@R                  -  4S9nU$ )	Nr      r   r   rE   r   ri   rF   )rF   rr   r4   rJ  rH   )r>   r   rJ   r   r   r   s         r%   _reshape_hidden_states$SegGptDecoder._reshape_hidden_states8  s    3@3F3F0
+%--k??DOOUYUmUm
 &--aAq!Q?%--r<//#A;Q`Q`C`a . 
 r$   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ r   )rU  r[  rV  r   s     r%   rN   SegGptDecoder.forwardD  s8    **=933MB))-8r$   )r?   rU  rJ  rV  r4   )r   r   r   r   r2   r   r    r[  rN   r#   rP   rQ   s   @r%   rR  rR  +  s>    

E4E4E 
%J[J[ 
U%6%6  r$   rR  c                       \ rS rSr% \\S'   SrSrSrSr	SS/r
\R                  " 5       S	\R                  S
S4S j5       rSrg)SegGptPreTrainedModeliL  r?   modelrI   )imageTrS   r   modulerU   Nc                 F   U R                   R                  n[        U[        R                  [        R
                  45      (       aO  [        R                  " UR                  SUS9  UR                  b!  [        R                  " UR                  5        gg[        U[        R                  [        45      (       aA  [        R                  " UR                  5        [        R                  " UR                  5        g[        U[        5      (       aA  [        R                  " UR                  SUS9  [        R                  " UR                   SUS9  g[        U["        5      (       a  [        R                  " UR$                  SUS9  [        R&                  " UR(                  US9  [        R&                  " UR*                  US9  [        R&                  " UR,                  US9  [        R&                  " UR.                  US9  [        R&                  " UR0                  US9  gg)zInitialize the weightsr   )r  stdN)re  )r?   initializer_ranger7   r   r   r<   inittrunc_normal_weightr   zeros_r  r:  ones_r   r   r   rS   ra   normal_rY   rZ   r[   r\   r]   )r>   rc  re  s      r%   _init_weights#SegGptPreTrainedModel._init_weightsU  sd    kk++fryy"))455v}}3C@{{&FKK( ' ?@@KK$JJv}}%00v//csCv//csC 011v99MLL**4LL33=LL44#>LL33=LL33= 2r$   r   )r   r   r   r   r   r!   base_model_prefixmain_input_nameinput_modalitiessupports_gradient_checkpointing_no_split_modulesr   no_gradr   Modulerm  r#   r   r$   r%   r`  r`  L  sT    $O!&*#+];
]]_>BII >$ > >r$   r`  c                   $  ^  \ rS rSrS\4U 4S jjrS\4S jr\       SS\	R                  S\	R                  S	\	R                  S
\	R                  S-  S\S-  S\S-  S\	R                  S-  S\S-  S\S-  S\S-  S\\-  4S jj5       rSrU =r$ )SegGptModelil  r?   c                    > [         TU ]  U5        Xl        [        U5      U l        [        U5      U l        U R                  5         g r   )r1   r2   r?   rS   rM   r  encoder	post_initr   s     r%   r2   SegGptModel.__init__n  s9     *62$V, 	r$   rU   c                 .    U R                   R                  $ r   )rM   r^   r   s    r%   get_input_embeddings SegGptModel.get_input_embeddingsx  s    ///r$   NrI   rw   prompt_masksrx   r	  ry   labelsr   r(  r)  c                    Ub  UOU R                   R                  nU	b  U	OU R                   R                  n	U
b  U
OU R                   R                  n
Ub  UOSnU R                  R
                  R                  R                  R                  nUR                  U5      nUR                  U5      n[        R                  " X!4SS9nUc  [        R                  " X34SS9O[        R                  " X74SS9nUc  Ub  [        R                  S5        Uc  U R                  R
                  R                  n[        R                  " US-  [        R                   UR"                  S9n[        R$                  " XS-  -
  [        R                   UR"                  S9n[        R                  " X/5      nUR'                  S5      nU R	                  XXdS9nU R)                  UUUU	U
S9nU$ )	a0
  
prompt_pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
    Prompt pixel values. Prompt pixel values can be obtained using [`AutoImageProcessor`]. See
    [`SegGptImageProcessor.__call__`] for details.
prompt_masks (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
    Prompt mask. Prompt mask can be obtained using [`AutoImageProcessor`]. See [`SegGptImageProcessor.__call__`] for
    details.
bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
    Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
feature_ensemble (`bool`, *optional*):
    Boolean indicating whether to use feature ensemble or not. If `True`, the model will use feature ensemble
    if we have at least two prompts. If `False`, the model will not use feature ensemble. This argument should
    be considered when doing few-shot inference on an input image i.e. more than one prompt for the same image.
embedding_type (`str`, *optional*):
    Embedding type. Indicates whether the prompt is a semantic or instance embedding. Can be either
    instance or semantic.
labels (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`, `optional`):
    Ground truth mask for input images.

Examples:

```python
>>> from transformers import SegGptImageProcessor, SegGptModel
>>> from PIL import Image
>>> import httpx
>>> from io import BytesIO

>>> image_input_url = "https://raw.githubusercontent.com/baaivision/Painter/main/SegGPT/SegGPT_inference/examples/hmbb_2.jpg"
>>> image_prompt_url = "https://raw.githubusercontent.com/baaivision/Painter/main/SegGPT/SegGPT_inference/examples/hmbb_1.jpg"
>>> mask_prompt_url = "https://raw.githubusercontent.com/baaivision/Painter/main/SegGPT/SegGPT_inference/examples/hmbb_1_target.png"

>>> with httpx.stream("GET", image_input_url) as response:
...     image_input = Image.open(BytesIO(response.read()))

>>> with httpx.stream("GET", image_prompt_url) as response:
...     image_prompt = Image.open(BytesIO(response.read()))

>>> with httpx.stream("GET", mask_prompt_url) as response:
...     mask_prompt = Image.open(BytesIO(response.read())).convert("L")

>>> checkpoint = "BAAI/seggpt-vit-large"
>>> model = SegGptModel.from_pretrained(checkpoint)
>>> image_processor = SegGptImageProcessor.from_pretrained(checkpoint)

>>> inputs = image_processor(images=image_input, prompt_images=image_prompt, prompt_masks=mask_prompt, return_tensors="pt")

>>> outputs = model(**inputs)
>>> list(outputs.last_hidden_state.shape)
[1, 56, 28, 1024]
```
FrE   r}   zLabels were provided, but bool_masked_pos were not. It will be set to default value. If you're training the model, make sure to provide a bool_masked_pos.r   r   )ry   rx   )r	  r   r(  r)  )r?   r   r(  r)  rM   r^   r=   ri  r   r   r   r   loggerwarning_oncer;   rX   r  r   onesr   ry  )r>   rI   rw   r  rx   r	  ry   r  r   r(  r)  rA  expected_dtyper;   bool_masked_pos_zerosbool_masked_pos_onesembedding_outputencoder_outputss                     r%   rN   SegGptModel.forward{  s   D 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++BYBY/?/K+QV99DDKKQQ#~6144^D yy"5!D!L ~ II|2:L1q9 	 "v'9 m "//::FFK$)KKq0@

[g[n[n$o!#(::Q..ejjI\I\$  $ii)>(UVO-77:O??n + 
 ,,-/!5# ' 
 r$   )r?   rM   ry  NNNNNNN)r   r   r   r   r   r2   r,   r}  r   r   r   r   r  r   r    r"   r   rN   r#   rP   rQ   s   @r%   rw  rw  l  s    | 0&; 0  48(,%)+/)-,0#'rllr #\\r ll	r
 ))D0r +r d
r !!D(r  $;r #Tkr D[r 
$	$r rr$   rw  tensorr4   c                     U R                   u  p#pEXA-  nXQ-  nU R                  X#XaXq4S9n U R                  SSSSSS5      n U R                  X&U-  US-  S-  4S9n U $ )NrZ  r   rE   r   r   rY  r   )rF   rr   rH   )r  r4   rJ   r5   rK   rL   r   r   s           r%   patchifyr    sz    .4ll+Jf'L%K^^:\Wb"o^pF^^Aq!Q1-F^^:k/I:WX=[\K\"]^^FMr$   r   r   c           	      N   U R                   S   n[        U R                   S   S-  S-  5      nX-  U R                   S   :w  a"  [        SU R                   S    SU SU S	35      eU R                  X1X$US4S
9n U R	                  SSSSSS5      n U R                  USX-  X$-  4S
9n U $ )Nr   ri   r   rh   r   zNumber of patches z does not match patch height (z) and width (rD   rZ  rY  rE   r   )rF   r   rG   rr   rH   )r  r   r   rJ   r4   s        r%   
unpatchifyr    s    aJfll2&*s23J!V\\!_4 a 11OP\~]jkvjwwyz
 	
 ^^:[V`bc"d^eF^^Aq!Q1-F^^:q,2K[Me"f^gFMr$   c                      ^  \ rS rSrU 4S jrS\R                  S\R                  S\R                  S\R                  4S jrSr	U =r
$ )	
SegGptLossi  c                 f   > [         TU ]  5         UR                  U l        UR                  U l        g r   )r1   r2   betar4   r   s     r%   r2   SegGptLoss.__init__  s&    KK	 ++r$   r  r*   r  rx   c                    [         R                  " X4SS9nUSS2SS2S4   R                  SSU R                  S-  S-  5      n[	        XeR
                  S   U R                  -  UR
                  S   U R                  -  5      n[        R                  " X%SU R                  S9nXv-  R                  5       UR                  5       -  nU$ )a  Computes the L1 loss between the predicted masks and the ground truth masks.

Args:
    prompt_masks (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
        Pixel values from mask prompt.

    pred_masks (`torch.FloatTensor` of shape `(batch_size, num_channels, 2*height, width)`):
        Predicted masks.

    labels (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
        Ground truth mask for input images.

    bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`):
        Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).

Returns:
    `torch.FloatTensor`: The mean L1 loss between the predicted masks and the ground truth masks.
rE   r}   Nr   r   none)	reductionr  )
r   r   repeatr4   r  rF   rp   smooth_l1_lossr  sum)r>   r  r*   r  rx   ground_truthmaskr)   s           r%   rN   SegGptLoss.forward  s    2 yy,!7Q?q!Tz*11!Q8JQ8NO$ 2 21 5 H,J\J\]^J_cgcrcrJrs
FQUQZQZ[  "TXXZ/r$   )r  r4   )r   r   r   r   r2   r   r    r   rN   r#   rP   rQ   s   @r%   r  r    sP    ,
!''! %%! !!	!
 ))! !r$   r  zM
    SegGpt model with a decoder on top for one-shot image segmentation.
    c                     ^  \ rS rSrS\4U 4S jjr\       SS\R                  S\R                  S\R                  S\R                  S-  S	\
S-  S
\S-  S\R                  S-  S\
S-  S\
S-  S\
S-  S\\-  4S jj5       rSrU =r$ )SegGptForImageSegmentationi6  r?   c                    > [         TU ]  U5        Xl        [        U5      U l        [        U5      U l        U R                  5         g r   )r1   r2   r?   rw  ra  rR  decoderrz  r   s     r%   r2   #SegGptForImageSegmentation.__init__<  s9      (
$V, 	r$   NrI   rw   r  rx   r	  ry   r  r   r(  r)  rU   c                    Ub  UOU R                   R                  nU	b  U	OU R                   R                  n	U
b  U
OU R                   R                  n
Uc  U R                  R
                  R                  R                  n[        R                  " US-  [        R                  UR                  S9n[        R                  " XS-  -
  [        R                  UR                  S9n[        R                  " X/5      nUR                  S5      nU R	                  UUUUUUUUU	U
S9
nU
(       a  UR                  OUS   n[        R                  " USS9nU R!                  U5      nSnUb  [#        U R                   5      nU" UUXt5      nU
(       d9  U4nU	(       a	  UUS   4-   nU(       a  U	(       a  SOSnUUU   4-   nUb  U4U-   nU$ [%        UUUR&                  UR(                  S	9$ )
a
  
prompt_pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
    Prompt pixel values. Prompt pixel values can be obtained using [`AutoImageProcessor`]. See
    [`SegGptImageProcessor.__call__`] for details.
prompt_masks (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
    Prompt mask. Prompt mask can be obtained using [`AutoImageProcessor`]. See [`SegGptImageProcessor.__call__`] for
    details.
bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
    Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
feature_ensemble (`bool`, *optional*):
    Boolean indicating whether to use feature ensemble or not. If `True`, the model will use feature ensemble
    if we have at least two prompts. If `False`, the model will not use feature ensemble. This argument should
    be considered when doing few-shot inference on an input image i.e. more than one prompt for the same image.
embedding_type (`str`, *optional*):
    Embedding type. Indicates whether the prompt is a semantic or instance embedding. Can be either
    instance or semantic.
labels (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`, `optional`):
    Ground truth mask for input images.

Examples:

```python
>>> from transformers import SegGptImageProcessor, SegGptForImageSegmentation
>>> from PIL import Image
>>> import httpx
>>> from io import BytesIO

>>> image_input_url = "https://raw.githubusercontent.com/baaivision/Painter/main/SegGPT/SegGPT_inference/examples/hmbb_2.jpg"
>>> image_prompt_url = "https://raw.githubusercontent.com/baaivision/Painter/main/SegGPT/SegGPT_inference/examples/hmbb_1.jpg"
>>> mask_prompt_url = "https://raw.githubusercontent.com/baaivision/Painter/main/SegGPT/SegGPT_inference/examples/hmbb_1_target.png"

>>> with httpx.stream("GET", image_input_url) as response:
...     image_input = Image.open(BytesIO(response.read()))

>>> with httpx.stream("GET", image_prompt_url) as response:
...     image_prompt = Image.open(BytesIO(response.read()))

>>> with httpx.stream("GET", mask_prompt_url) as response:
...     mask_prompt = Image.open(BytesIO(response.read())).convert("L")

>>> checkpoint = "BAAI/seggpt-vit-large"
>>> model = SegGptForImageSegmentation.from_pretrained(checkpoint)
>>> image_processor = SegGptImageProcessor.from_pretrained(checkpoint)

>>> inputs = image_processor(images=image_input, prompt_images=image_prompt, prompt_masks=mask_prompt, return_tensors="pt")
>>> outputs = model(**inputs)
>>> result = image_processor.post_process_semantic_segmentation(outputs, target_sizes=[(image_input.height, image_input.width)])[0]
>>> print(list(result.shape))
[170, 297]
```
NrE   r   r   )
rI   rw   r  rx   r	  ry   r  r   r(  r)  ri   r}   r   )r)   r*   r   r   )r?   r   r(  r)  ra  rM   r^   r;   r   rX   r  r   r  r   r   r   r  r  r'   r   r   )r>   rI   rw   r  rx   r	  ry   r  r   r(  r)  rA  r;   r  r  r  r   r*   r)   loss_fnr   idxs                         r%   rN   "SegGptForImageSegmentation.forwardF  s   D 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++BYBY"**//@@LLK$)KKq0@

[g[n[n$o!#(::Q..ejjI\I\$  $ii)>(UVO-77:O**% 3%+-)/!5#  
 LWW%G%G\cdf\g"%*YY/Ir%R"\\"<=
 -G<VMD ]F#71:-/ /aQ73</16)M,!!//))	
 	
r$   )r?   r  ra  r  )r   r   r   r   r   r2   r   r   r   r   r  r   r    r"   r'   rN   r#   rP   rQ   s   @r%   r  r  6  s    |   48(,%)+/)-,0#'x
llx
 #\\x
 ll	x

 ))D0x
 +x
 d
x
 !!D(x
  $;x
 #Tkx
 D[x
 
.	.x
 x
r$   r  )rw  r`  r  )r   F)7r   collections.abcr8   dataclassesr   r   r   torch.nnr   rp    r   rg  activationsr   modeling_layersr	   modeling_utilsr
   utilsr   r   r   r   configuration_seggptr   
get_loggerr   r  r   r'   ru  r,   rS   r   r   r   r   r  r   r   r   r  r  r:  rG  rR  r`  rw  r   r  r  r  r  __all__r   r$   r%   <module>r     s=     !   $ & ! 9 - D D . 
		H	% 
 G+ G G, 
 7K 7 7* BII  FRryy RjK4bii K4^		 U\\ e T V[VbVb  %RYY %,, ,^9
BII 9
zbll 4		 0BII B >O > >> A' A AH	U\\ 	s 	u|| 	u|| 3 S U\\ ' 'T 
D
!6 D

D
N Qr$   