
    Z j                        S r SSKrSSKJr  SSKJr  SSKrSSKJr  SSKJ	r	  SSK
Jr  SS	KJr  SS
KJr  SSKJr  SSKJrJrJr  SSKJrJr  SSKJr  SSKJrJrJrJ r J!r!  SSK"J#r#J$r$  SSK%J&r&  SSK'J(r(  \ RR                  " \*5      r+\" SS9\ " S S\5      5       5       r,\" SS9\ " S S\5      5       5       r- " S S\R\                  5      r/ " S S\R\                  5      r0 " S S \R\                  5      r1  SWS!\R\                  S"\Rd                  S#\Rd                  S$\Rd                  S%\Rd                  S-  S&\3S-  S'\3S(\\   4S) jjr4 " S* S+\R\                  5      r5 " S, S-\R\                  5      r6 " S. S/\R\                  5      r7 " S0 S1\R\                  5      r8 " S2 S3\R\                  5      r9 " S4 S5\5      r: " S6 S7\R\                  5      r;S8 r< " S9 S:\R\                  5      r= " S; S<\R\                  5      r> " S= S>\R\                  5      r? " S? S@\R\                  5      r@\ " SA SB\5      5       rA " SC SD\R\                  5      rB\ " SE SF\A5      5       rC " SG SH\R\                  5      rD " SI SJ\R\                  5      rE " SK SL\R\                  5      rF\" SMS9 " SN SO\A5      5       rG " SP SQ\R\                  5      rH " SR SS\R\                  5      rI\ " ST SU\A5      5       rJ/ SVQrKg)XzPyTorch DPT (Dense Prediction Transformers) model.

This implementation is heavily inspired by OpenMMLab's implementation, found here:
https://github.com/open-mmlab/mmsegmentation/blob/master/mmseg/models/decode_heads/dpt_head.py.

    N)Callable)	dataclass)nn)CrossEntropyLoss   )initialization)ACT2FN)load_backbone)GradientCheckpointingLayer)BaseModelOutputDepthEstimatorOutputSemanticSegmenterOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)ModelOutputTransformersKwargsauto_docstringlogging	torch_int)can_return_tuplemerge_with_config_defaults)capture_outputs   )	DPTConfigz
    Base class for model's outputs that also contains intermediate activations that can be used at later stages. Useful
    in the context of Vision models.:
    )custom_introc                   t    \ rS rSr% SrSr\R                  S-  \S'   Sr	\
\R                  S4   S-  \S'   Srg)*BaseModelOutputWithIntermediateActivations-   aW  
last_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
    Sequence of hidden-states at the output of the last layer of the model.
intermediate_activations (`tuple(torch.FloatTensor)`, *optional*):
    Intermediate activations that can be used to compute hidden states of the model at various layers.
Nlast_hidden_states.intermediate_activations )__name__
__module____qualname____firstlineno____doc__r    torchFloatTensor__annotations__r!   tuple__static_attributes__r"       u/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/dpt/modeling_dpt.pyr   r   -   s?     48))D07EIeE$5$5s$:;dBIr-   r   z
    Base class for model's outputs that also contains a pooling of the last hidden states as well as intermediate
    activations that can be used by the model at later stages.
    c                       \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Sr
\\R                  S4   S-  \S'   Sr\\R                  S4   S-  \S'   Sr\\R                  S4   S-  \S	'   S
rg)4BaseModelOutputWithPoolingAndIntermediateActivations@   a  
pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
    Last layer hidden-state of the first token of the sequence (classification token) after further processing
    through the layers used for the auxiliary pretraining task. E.g. for BERT-family of models, this returns
    the classification token after processing through a linear layer and a tanh activation function. The linear
    layer weights are trained from the next sentence prediction (classification) objective during pretraining.
intermediate_activations (`tuple(torch.FloatTensor)`, *optional*):
    Intermediate activations that can be used to compute hidden states of the model at various layers.
Nlast_hidden_statepooler_output.hidden_states
attentionsr!   r"   )r#   r$   r%   r&   r'   r2   r(   r)   r*   r3   r4   r+   r5   r!   r,   r"   r-   r.   r0   r0   @   s     37u((4/6.2M5$$t+2:>M5**C/047>7;Je'',-4;EIeE$5$5s$:;dBIr-   r0   c                      ^  \ rS rSrSrSS\S\\\4   S-  4U 4S jjjrSS jr	 SS\
R                  S	\S
\4S jjrSrU =r$ )DPTViTHybridEmbeddingsY   z
This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
`hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
Transformer.
Nconfigfeature_sizec                 x  > [         T
U ]  5         UR                  UR                  pCUR                  UR
                  pe[        U[        R                  R                  5      (       a  UOX34n[        U[        R                  R                  5      (       a  UOXD4nUS   US   -  US   US   -  -  n[        U5      U l        U R                  R                  S   n[        U R                  R                  5      S:w  a+  [        S[        U R                  R                  5       35      eSS/U l        Uc  UR                   n	U	SS  nU	S   nOG[        U[        R                  R                  5      (       a  UOX"4nU R                  R                  S   nX0l        US   U l        XPl        ["        R$                  " XSS9U l        ["        R(                  " [*        R,                  " SSUR
                  5      5      U l        ["        R(                  " [*        R,                  " SUS-   UR
                  5      5      U l        g )Nr   r   r   z1Expected backbone to have 3 output features, got kernel_size)super__init__
image_size
patch_sizenum_channelshidden_size
isinstancecollectionsabcIterabler
   backbonechannelslen
ValueErrorresidual_feature_map_indexbackbone_featmap_shaper   Conv2d
projection	Parameterr(   zeros	cls_tokenposition_embeddings)selfr9   r:   rB   rC   rD   rE   num_patchesfeature_dimfeat_map_shape	__class__s             r.   rA   DPTViTHybridEmbeddings.__init__`   s   !'!2!2F4E4EJ$*$7$79K9Kk#-j+//:R:R#S#SZZdYq
#-j+//:R:R#S#SZZdYq
!!}
15*Q-:VW=:XY%f-mm,,R0t}}%%&!+PQTUYUbUbUkUkQlPmnoo+,a&'#::N)"#.L(+K !+<9Q9Q R RYeXt  --004K$$Q-())K!Lekk!Q8J8J&KL#%<<A{QPVPbPb0c#d r-   c                 `   US S 2S U24   nUSUS 24   n[        [        U5      S-  5      nUR                  SXwS5      R                  SSSS5      n[        R
                  R                  XbU4SS9nUR                  SSSS5      R                  SX#-  S5      n[        R                  " XV/SS	9nU$ 
Nr         ?r   r<   r      bilinear)sizemodedim)	r   rL   reshapepermuter   
functionalinterpolater(   catrV   posembgrid_size_heightgrid_size_widthstart_index
posemb_tokposemb_gridold_grid_sizes           r.   _resize_pos_embed(DPTViTHybridEmbeddings._resize_pos_embed   s    A||O,
Q_-!#k"2c"9:!))!]2NVVWXZ[]^`abmm//UdBelv/w!))!Q15==aAQAceghJ4!<r-   pixel_valuesinterpolate_pos_encodingreturnc                    UR                   u  p4pVX@R                  :w  a  [        S5      eU(       dV  XPR                  S   :w  d  X`R                  S   :w  a2  [        SU SU SU R                  S    SU R                  S    S3	5      eU R	                  U R
                  XPR                  -  X`R                  -  5      nU R                  U5      nUR                  S   n	U R                   V
s/ s H  oR                  U
   PM     nn
U R                  U	5      R                  S	5      R                  SS	5      nU R                  R                  USS5      n[        R                   " X4SS
9nX-   n[#        UUS9$ s  sn
f )NeMake sure that the channel dimension of the pixel values match with the one set in the configuration.r   r   zInput image size (*z) doesn't match model (z).r<   r_   rc   )r    r!   )shaperD   rM   rB   rr   rU   rC   rJ   feature_mapsrN   rQ   flatten	transposerT   expandr(   ri   r   )rV   rt   ru   
batch_sizerD   heightwidthrU   backbone_outputfeaturesindexoutput_hidden_states
embeddings
cls_tokenss                 r.   forwardDPTViTHybridEmbeddings.forward   s    3?2D2D/
&,,,w  (++u8J/J (% 9+,Adooa.@-AE 
 #44$$f&?//AY
 --5"//3 RVQpQpqQp < <U CQpq__X.66q9CCAqI
^^**:r2>
YY
7Q?
  5
 :)%9
 	
  rs   *E5)rJ   rT   rB   rD   rC   rU   rQ   rN   Nr   F)r#   r$   r%   r&   r'   r   r+   intrA   rr   r(   Tensorboolr   r   r,   __classcell__rZ   s   @r.   r7   r7   Y   sd     ey  ec3h$8N  e  eD LQ&
!LL&
DH&
	3&
 &
r-   r7   c                   \   ^  \ rS rSrSrU 4S jrS	S jrS\R                  S\	4S jr
SrU =r$ )
DPTViTEmbeddings   z:
Construct the CLS token, position and patch embeddings.

c                   > [         TU ]  5         [        R                  " [        R
                  " SSUR                  5      5      U l        [        U5      U l	        U R                  R                  n[        R                  " [        R
                  " SUS-   UR                  5      5      U l        [        R                  " UR                  5      U l        Xl        g )Nr   )r@   rA   r   rR   r(   rS   rE   rT   DPTViTPatchEmbeddingspatch_embeddingsrW   rU   Dropouthidden_dropout_probdropoutr9   )rV   r9   rW   rZ   s      r.   rA   DPTViTEmbeddings.__init__   s    ekk!Q8J8J&KL 5f =++77#%<<A{QPVPbPb0c#d zz&"<"<=r-   c                 l   US S 2S U24   nUSUS 24   n[        UR                  S5      S-  5      nUR                  SXwS5      R                  SSSS5      n[        R
                  R                  XbU4SS9nUR                  SSSS5      R                  SX#-  S5      n[        R                  " XV/SS	9nU$ r]   )	r   ra   re   rf   r   rg   rh   r(   ri   rj   s           r.   rr   "DPTViTEmbeddings._resize_pos_embed   s    A||O,
Q_-!+"2"21"5"<=!))!]2NVVWXZ[]^`abmm//UdBelv/w!))!Q15==aAQAceghJ4!<r-   rt   rv   c                 x   UR                   u  p#pEU R                  R                  nU R                  U R                  XF-  XV-  5      nU R                  U5      nUR                  5       u  p)n
U R                  R                  USS5      n[        R                  " X4SS9nX-   nU R                  U5      n[        US9$ )Nr<   r   rc   )r    )rz   r9   rC   rr   rU   r   ra   rT   r~   r(   ri   r   r   )rV   rt   r   rD   r   r   rC   rU   r   seq_len_r   s               r.   r   DPTViTEmbeddings.forward   s    2>2D2D/
& [[++
"44$$f&:E<O
 **<8
!+!2
Q ^^**:r2>
YY
7Q?
  5
\\*-
9ZXXr-   )rT   r9   r   r   rU   r   )r#   r$   r%   r&   r'   rA   rr   r(   r   r   r   r,   r   r   s   @r.   r   r      s3    
YELL Y5_ Y Yr-   r   c                   n   ^  \ rS rSrSrS\4U 4S jjrS\R                  S\R                  4S jr	Sr
U =r$ )	r      z
Image to Patch Embedding.

r9   c                   > [         TU ]  5         UR                  UR                  p2UR                  UR
                  pT[        U[        R                  R                  5      (       a  UOX"4n[        U[        R                  R                  5      (       a  UOX34nUS   US   -  US   US   -  -  nX l        X0l        X@l        X`l
        [        R                  " XEX3S9U l        g )Nr   r   )r?   stride)r@   rA   rB   rC   rD   rE   rF   rG   rH   rI   rW   r   rP   rQ   )rV   r9   rB   rC   rD   rE   rW   rZ   s          r.   rA   DPTViTPatchEmbeddings.__init__   s    !'!2!2F4E4EJ$*$7$79K9Kk#-j+//:R:R#S#SZZdYq
#-j+//:R:R#S#SZZdYq
!!}
15*Q-:VW=:XY$$(&))L:ir-   rt   rv   c                     UR                   u  p#pEX0R                  :w  a  [        S5      eU R                  U5      R	                  S5      R                  SS5      nU$ )Nrx   r_   r   )rz   rD   rM   rQ   r|   r}   )rV   rt   r   rD   r   r   r   s          r.   r   DPTViTPatchEmbeddings.forward  s\    2>2D2D/
&,,,w  __\2::1=GG1M
r-   )rB   rD   rW   rC   rQ   r#   r$   r%   r&   r'   r   rA   r(   r   r   r,   r   r   s   @r.   r   r      s6    
jy jELL U\\  r-   r   modulequerykeyvalueattention_maskscalingr   kwargsc                    Uc  UR                  S5      S-  n[        R                  " XR                  SS5      5      U-  nUb  X-   n[        R
                  R                  USS9n[        R
                  R                  XU R                  S9n[        R                  " X5      n	U	R                  SS5      R                  5       n	X4$ )Nr<         r_   r   rc   )ptrainingr   )
ra   r(   matmulr}   r   rg   softmaxr   r   
contiguous)
r   r   r   r   r   r   r   r   attn_weightsattn_outputs
             r.   eager_attention_forwardr     s     **R.D( <<}}Q':;gEL!#4==((2(>L==((6??([L,,|3K''1-88:K$$r-   c                      ^  \ rS rSrS\4U 4S jjrS\R                  S\\	   S\
\R                  \R                  4   4S jrSrU =r$ )	DPTSelfAttentioni-  r9   c                 0  > [         TU ]  5         UR                  UR                  -  S:w  a7  [	        US5      (       d&  [        SUR                   SUR                   S35      eXl        UR                  U l        [        UR                  UR                  -  5      U l        U R                  U R                  -  U l	        UR                  U l        U R                  S-  U l        SU l        [        R                  " UR                  U R                  UR                   S9U l        [        R                  " UR                  U R                  UR                   S9U l        [        R                  " UR                  U R                  UR                   S9U l        g )	Nr   embedding_sizezThe hidden size z4 is not a multiple of the number of attention heads .r   F)bias)r@   rA   rE   num_attention_headshasattrrM   r9   r   attention_head_sizeall_head_sizeattention_probs_dropout_probdropout_probr   	is_causalr   Linearqkv_biasr   r   r   rV   r9   rZ   s     r.   rA   DPTSelfAttention.__init__.  sG    : ::a?PVXhHiHi"6#5#5"6 7334A7 
 #)#=#= #&v'9'9F<V<V'V#W !558P8PP"??//5YYv1143E3EFOO\
99V//1C1C&//ZYYv1143E3EFOO\
r-   r4   r   rv   c                    UR                   S   nUSU R                  U R                  4nU R                  U5      R                  " U6 R                  SS5      nU R                  U5      R                  " U6 R                  SS5      nU R                  U5      R                  " U6 R                  SS5      n[        R                  " U R                  R                  [        5      nU" U UUUS 4U R                  U R                  U R                  (       d  SOU R                   S.UD6u  pU	R#                  5       S S U R$                  4-   nU	R'                  U5      n	X4$ )Nr   r<   r   r_           )r   r   r   r=   )rz   r   r   r   viewr}   r   r   r   get_interfacer9   _attn_implementationr   r   r   r   r   ra   r   re   )rV   r4   r   r   	new_shape	key_layervalue_layerquery_layerattention_interfacecontext_layerattention_probsnew_context_layer_shapes               r.   r   DPTSelfAttention.forwardB  sO   
 #((+
D$<$<d>V>VV	HH]+00)<FFq!L	jj/44i@JJ1aPjj/44i@JJ1aP(?(M(MKK,,.E)
 *=
*
 nnLL#}}C$2C2C
*
 
*
& #0"4"4"6s";t?Q?Q>S"S%--.EF--r-   )
r   r   r9   r   r   r   r   r   r   r   )r#   r$   r%   r&   r   rA   r(   r   r   r   r+   r   r,   r   r   s   @r.   r   r   -  sS    ]y ](.||. +,. 
u||U\\)	*	. .r-   r   c                      ^  \ rS rSrSrS\4U 4S jjrS\R                  S\R                  S\R                  4S jr	S	r
U =r$ )
DPTViTSelfOutputie  z
The residual connection is defined in ViTLayer instead of here (as is the case with other models), due to the
layernorm applied before each block.
r9   c                    > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " UR                  5      U l        g r   )	r@   rA   r   r   rE   denser   r   r   r   s     r.   rA   DPTViTSelfOutput.__init__k  sB    YYv1163E3EF
zz&"<"<=r-   r4   input_tensorrv   c                 J    U R                  U5      nU R                  U5      nU$ r   r   r   rV   r4   r   s      r.   r   DPTViTSelfOutput.forwardp  s$    

=1]3r-   r   r   r   s   @r.   r   r   e  sB    
>y >
U\\  RWR^R^  r-   r   c                   t   ^  \ rS rSrS\4U 4S jjrS\R                  S\\	   S\R                  4S jr
SrU =r$ )	DPTViTAttentioniw  r9   c                 b   > [         TU ]  5         [        U5      U l        [	        U5      U l        g r   )r@   rA   r   	attentionr   outputr   s     r.   rA   DPTViTAttention.__init__x  s&    )&1&v.r-   r4   r   rv   c                 R    U R                   " U40 UD6u  p4U R                  X15      nU$ r   r   r   )rV   r4   r   self_attn_outputr   r   s         r.   r   DPTViTAttention.forward}  s/    
 #nn]EfE-=r-   r   )r#   r$   r%   r&   r   rA   r(   r   r   r   r   r,   r   r   s   @r.   r   r   w  sC    /y /
|| +, 
	 r-   r   c                   j   ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ )DPTViTIntermediatei  r9   c                   > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        UR                  [        5      (       a  [        UR                     U l        g UR                  U l        g r   )r@   rA   r   r   rE   intermediate_sizer   rF   
hidden_actstrr	   intermediate_act_fnr   s     r.   rA   DPTViTIntermediate.__init__  s`    YYv1163K3KL
f''--'-f.?.?'@D$'-'8'8D$r-   r4   rv   c                 J    U R                  U5      nU R                  U5      nU$ r   r   r   )rV   r4   s     r.   r   DPTViTIntermediate.forward  s&    

=100?r-   r   r#   r$   r%   r&   r   rA   r(   r   r   r,   r   r   s   @r.   r   r     s/    9y 9U\\ ell  r-   r   c                      ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  S\R                  4S jrSr	U =r
$ )	DPTViTOutputi  r9   c                    > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        R                  " UR                  5      U l	        g r   )
r@   rA   r   r   r   rE   r   r   r   r   r   s     r.   rA   DPTViTOutput.__init__  sB    YYv779K9KL
zz&"<"<=r-   r4   r   rv   c                 R    U R                  U5      nU R                  U5      nX-   nU$ r   r   r   s      r.   r   DPTViTOutput.forward  s,    

=1]3%4r-   r   r   r   s   @r.   r   r     s=    >y >
U\\  RWR^R^  r-   r   c                   x   ^  \ rS rSrSrS\4U 4S jjrS\R                  S\	\
   S\R                  4S jrS	rU =r$ )
DPTViTLayeri  z?This corresponds to the Block class in the timm implementation.r9   c                 j  > [         TU ]  5         UR                  U l        SU l        [	        U5      U l        [        U5      U l        [        U5      U l	        [        R                  " UR                  UR                  S9U l        [        R                  " UR                  UR                  S9U l        g )Nr   eps)r@   rA   chunk_size_feed_forwardseq_len_dimr   r   r   intermediater   r   r   	LayerNormrE   layer_norm_epslayernorm_beforelayernorm_afterr   s     r.   rA   DPTViTLayer.__init__  s    '-'E'E$(0.v6"6* "V-?-?VEZEZ [!||F,>,>FDYDYZr-   r4   r   rv   c                     U R                  U5      nU R                  " U40 UD6nXA-   nU R                  U5      nU R                  U5      nU R	                  XQ5      nU$ r   )r
  r   r  r  r   )rV   r4   r   hidden_states_normattention_outputlayer_outputs         r.   r   DPTViTLayer.forward  sl    
 "22=A>>*<GG )8 ++M:((6 {{<?r-   )r   r  r  r  r
  r   r  )r#   r$   r%   r&   r'   r   rA   r(   r   r   r   r   r,   r   r   s   @r.   r  r    sH    I[y [|| +, 
	 r-   r  c                      ^  \ rS rSrSrU 4S jrS rS rS
S\\	R                     S\\	R                     4S jjrS	rU =r$ )DPTReassembleStagei  a  
This class reassembles the hidden states of the backbone into image-like feature representations at various
resolutions.

This happens in 3 stages:
1. Map the N + 1 tokens to a set of N tokens, by taking into account the readout ([CLS]) token according to
   `config.readout_type`.
2. Project the channel dimension of the hidden states according to `config.neck_hidden_sizes`.
3. Resizing the spatial dimensions (height, width).

Args:
    config (`[DPTConfig]`):
        Model configuration class defining the model architecture.
c                    > [         TU ]  5         Xl        [        R                  " 5       U l        UR                  (       a  U R                  U5        OU R                  U5        UR                  U l	        g r   )
r@   rA   r9   r   
ModuleListlayers	is_hybrid_init_reassemble_dpt_hybrid_init_reassemble_dptneck_ignore_stagesr   s     r.   rA   DPTReassembleStage.__init__  sS    mmo,,V4%%f-"(";";r-   c           	         [        [        [        UR                  5      5      UR                  5       Hs  u  p#US::  a0  U R
                  R                  [        R                  " 5       5        M;  US:  d  MC  U R
                  R                  [        XR                  U   US95        Mu     UR                  S:w  a  [        SUR                   S35      e[        R                  " 5       U l        [        U5      n[        [        UR                  5      5       H  nUS::  aD  U R                  R                  [        R                  " [        R                  " 5       5      5        MM  US:  d  MU  U R                  R                  [        R                  " [        R                   " SU-  U5      ["        UR$                     5      5        M     g)z"
For DPT-Hybrid the first 2 reassemble layers are set to `nn.Identity()`, please check the official
implementation: https://github.com/isl-org/DPT/blob/f43ef9e08d70a752195028a51be5e1aff227b913/dpt/vit.py#L438
for more details.
r   rK   factorprojectzReadout type z! is not supported for DPT-Hybrid.r_   N)ziprangerL   neck_hidden_sizesreassemble_factorsr  appendr   IdentityDPTReassembleLayerreadout_typerM   r  readout_projects_get_backbone_hidden_size
Sequentialr   r	   r   )rV   r9   ir  rE   s        r.   r  .DPTReassembleStage._init_reassemble_dpt_hybrid  sI    U3v'?'?#@A6C\C\]IAAv""2;;=1Q""#5fG_G_`aGbkq#rs	 ^ )+}V-@-@,AAbcdd !#/7s63345AAv%%,,R]]2;;=-IJQ%%,,MM"))AO["I6RXRcRcKde	 6r-   c           	      B   [        [        [        UR                  5      5      UR                  5       H5  u  p#U R
                  R                  [        XR                  U   US95        M7     UR                  S:X  a  [        R                  " 5       U l        [        U5      n[        [        UR                  5      5       H\  nU R                  R                  [        R                  " [        R                  " SU-  U5      [        UR                      5      5        M^     g g )Nr  r  r_   )r   r!  rL   r"  r#  r  r$  r&  r'  r   r  r(  r)  r*  r   r	   r   )rV   r9   r+  r  rE   r   s         r.   r  'DPTReassembleStage._init_reassemble_dpt  s    U3v'?'?#@A6C\C\]IAKK1&C[C[\]C^gmno ^ )+$&MMOD!3F;K3v7789%%,,MM"))AO["I6RXRcRcKde : ,r-   r4   rv   c                    / n[        U5       GH  u  pVXPR                  ;  Ga  USS2S4   USS2SS24   pgUR                  u  pn
Ub  Ub  UR                  XX:5      nO [	        U	S-  5      nUR                  XX5      nUR                  SSSS5      R                  5       nUR                  nU R                  R                  S:X  a  UR                  S5      R                  S5      nUR                  S5      R                  U5      nU R                  U   " [        R                  " Xm4S	5      5      nUR                  SSS5      R                  U5      nONU R                  R                  S
:X  a4  UR                  S5      UR                  S	5      -   nUR                  U5      nU R                  U   " U5      nUR!                  U5        GM     U$ )z
Args:
    hidden_states (`list[torch.FloatTensor]`, each of shape `(batch_size, sequence_length + 1, hidden_size)`):
        List of hidden states from the backbone.
Nr   r   r^   r   r_   r  )r   r_   r   r<   add)	enumerater  rz   re   r   rf   r   r9   r'  r|   	unsqueeze	expand_asr(  r(   ri   r  r$  )rV   r4   patch_heightpatch_widthoutr+  hidden_staterT   r   sequence_lengthrD   ra   feature_shapereadouts                 r.   r   DPTReassembleStage.forward
  s    (7OA///*6q!t*<l1ab5>Q<<H<N<N9
\+0G#/#7#7
R]#lL$_c%9:D#/#7#7
$#]L+33Aq!Q?JJL , 2 2;;++y8#/#7#7#:#B#B9#ML'11!4>>|LG#'#8#8#;EII|F]_a<b#cL#/#7#71a#@#H#H#WL[[--6#/#7#7#:Y=P=PQS=T#TL#/#7#7#FL#{{1~l;JJ|$3  86 
r-   )r9   r  r  r(  NN)r#   r$   r%   r&   r'   rA   r  r  listr(   r   r   r,   r   r   s   @r.   r  r    sE    
<4
#T%,,%7 #aefkfrfras # #r-   r  c                     U R                   b1  [        U R                   S5      (       a  U R                   R                  $ U R                  $ )NrE   )backbone_configr   rE   )r9   s    r.   r)  r)  0  s>    )gf6L6Lm.\.\%%111!!!r-   c                   >   ^  \ rS rSrS\S\S\4U 4S jjrS rSrU =r	$ )r&  i7  r9   rK   r  c           	      P  > [         TU ]  5         [        U5      n[        R                  " XBSS9U l        US:  a  [        R                  " X"X3SS9U l        g US:X  a  [        R                  " 5       U l        g US:  a)  [        R                  " X"S[        SU-  5      SS9U l        g g )Nr   )in_channelsout_channelsr?   r   r?   r   paddingr   )
r@   rA   r)  r   rP   rQ   ConvTranspose2dresizer%  r   )rV   r9   rK   r  rE   rZ   s        r.   rA   DPTReassembleLayer.__init__8  s    /7))`ab A:,,XVlmnDKq[++-DKaZ))HAcRSV\R\oghiDK r-   c                 J    U R                  U5      nU R                  U5      nU$ r   rQ   rG  )rV   r7  s     r.   r   DPTReassembleLayer.forwardG  s$    |4{{<0r-   rJ  )
r#   r$   r%   r&   r   r   rA   r   r,   r   r   s   @r.   r&  r&  7  s+    jy jC j j r-   r&  c                   6   ^  \ rS rSrS\4U 4S jjrS rSrU =r$ )DPTFeatureFusionStageiM  r9   c                    > [         TU ]  5         [        R                  " 5       U l        [        [        UR                  5      5       H'  nU R                  R                  [        U5      5        M)     g r   )
r@   rA   r   r  r  r!  rL   r"  r$  DPTFeatureFusionLayerrV   r9   r   rZ   s      r.   rA   DPTFeatureFusionStage.__init__N  sM    mmos63345AKK4V<= 6r-   c                     US S S2   n/ nS n[        XR                  5       H*  u  pEUc	  U" U5      nOU" X45      nUR                  U5        M,     U$ )Nr<   )r   r  r$  )rV   r4   fused_hidden_statesfused_hidden_stater7  layers         r.   r   DPTFeatureFusionStage.forwardT  sg    %dd+ !#&}kk#BL!)%*<%8"%*+=%L"&&'9: $C #"r-   )r  )	r#   r$   r%   r&   r   rA   r   r,   r   r   s   @r.   rM  rM  M  s    >y ># #r-   rM  c                   n   ^  \ rS rSrSrS\4U 4S jjrS\R                  S\R                  4S jr	Sr
U =r$ )	DPTPreActResidualLayerie  z
ResidualConvUnit, pre-activate residual unit.

Args:
    config (`[DPTConfig]`):
        Model configuration class defining the model architecture.
r9   c           	        > [         TU ]  5         UR                  U l        UR                  b  UR                  OU R                  (       + n[
        R                  " 5       U l        [
        R                  " UR                  UR                  SSSUS9U l
        [
        R                  " 5       U l        [
        R                  " UR                  UR                  SSSUS9U l        U R                  (       aK  [
        R                  " UR                  5      U l        [
        R                  " UR                  5      U l        g g )Nr   r   )r?   r   rE  r   )r@   rA   !use_batch_norm_in_fusion_residualuse_batch_normuse_bias_in_fusion_residualr   ReLUactivation1rP   fusion_hidden_sizeconvolution1activation2convolution2BatchNorm2dbatch_norm1batch_norm2)rV   r9   r\  rZ   s      r.   rA   DPTPreActResidualLayer.__init__n  s   $FF 11= ..((( 	$ 779II%%%%,
 779II%%%%,
 !~~f.G.GHD!~~f.G.GHD r-   r7  rv   c                    UnU R                  U5      nU R                  U5      nU R                  (       a  U R                  U5      nU R	                  U5      nU R                  U5      nU R                  (       a  U R                  U5      nX-   $ r   )r^  r`  r[  rd  ra  rb  re  rV   r7  residuals      r.   r   DPTPreActResidualLayer.forward  s    ''5((6++L9L''5((6++L9L&&r-   )r^  ra  rd  re  r`  rb  r[  r   r   s   @r.   rX  rX  e  s7     Iy  ID'ELL 'U\\ ' 'r-   rX  c                      ^  \ rS rSrSrSS\S\4U 4S jjjrSS\R                  S\R                  S-  S	\R                  4S
 jjr
SrU =r$ )rO  i  a  Feature fusion layer, merges feature maps from different stages.

Args:
    config (`[DPTConfig]`):
        Model configuration class defining the model architecture.
    align_corners (`bool`, *optional*, defaults to `True`):
        The align_corner setting for bilinear upsample.
r9   align_cornersc                    > [         TU ]  5         X l        [        R                  " UR
                  UR
                  SSS9U l        [        U5      U l        [        U5      U l	        g )Nr   T)r?   r   )
r@   rA   rl  r   rP   r_  rQ   rX  residual_layer1residual_layer2)rV   r9   rl  rZ   s      r.   rA   DPTFeatureFusionLayer.__init__  sR    *))F$=$=v?X?Xfgnrs5f=5f=r-   Nr7  ri  rv   c                 t   Ubh  UR                   UR                   :w  a;  [        R                  R                  X!R                   S   UR                   S   4SSS9nXR	                  U5      -   nU R                  U5      n[        R                  R                  USSU R                  S9nU R                  U5      nU$ )Nr_   r   r`   Fra   rb   rl  scale_factorrb   rl  )rz   r   rg   rh   rn  ro  rl  rQ   rh  s      r.   r   DPTFeatureFusionLayer.forward  s    !!X^^3==44$6$6q$9<;M;Ma;P#QXbrw 5  (*>*>x*HHL++L9}}00qzI[I[ 1 
 |4r-   )rl  rQ   rn  ro  Tr   )r#   r$   r%   r&   r'   r   r   rA   r(   r   r   r,   r   r   s   @r.   rO  rO    sS    >y > > >ELL ELL4<O [`[g[g  r-   rO  c                      ^  \ rS rSr% \\S'   SrSrSrSr	Sr
SrSrSr\\S.r\R$                  " 5       U 4S j5       rS	rU =r$ )
DPTPreTrainedModeli  r9   dptrt   )imageT)r4   r5   c                    > [         TU ]  U5        [        U[        [        45      (       aA  [
        R                  " UR                  5        [
        R                  " UR                  5        gg)zInitialize the weightsN)	r@   _init_weightsrF   r   r7   initzeros_rT   rU   )rV   r   rZ   s     r.   r|   DPTPreTrainedModel._init_weights  sP     	f%f/1GHIIKK(()KK223 Jr-   r"   )r#   r$   r%   r&   r   r*   base_model_prefixmain_input_nameinput_modalitiessupports_gradient_checkpointing_supports_sdpa_supports_flash_attn_supports_flex_attn_supports_attention_backendr  r   _can_record_outputsr(   no_gradr|  r,   r   r   s   @r.   rx  rx    s_    $O!&*#N"&$&
 ]]_4 4r-   rx  c            	       j   ^  \ rS rSrS\4U 4S jjr S
S\R                  S\S\	\
   S\4S jjrS	rU =r$ )DPTViTEncoderi  r9   c                    > [         TU ]  5         Xl        [        R                  " [        UR                  5       Vs/ s H  n[        U5      PM     sn5      U l        g s  snf r   )	r@   rA   r9   r   r  r!  num_hidden_layersr  rU  rP  s      r.   rA   DPTViTEncoder.__init__  sG    ]]vG_G_A`#aA`AK$7A`#ab
#as   Ar4   r   r   rv   c                 J    U R                    H  nU" U5      nM     [        US9$ )N)r2   )rU  r   )rV   r4   r   r   layer_modules        r.   r   DPTViTEncoder.forward  s)     !JJL(7M ' ??r-   )r9   rU  r   )r#   r$   r%   r&   r   rA   r(   r   r   r   r   r   r   r,   r   r   s   @r.   r  r    sR    cy c IN@"\\@AE@Y_`rYs@	@ @r-   r  c            	          ^  \ rS rSrSS\S\4U 4S jjjrS r\\	" SS9\
S\R                  S	\\   S
\4S j5       5       5       rSrU =r$ )DPTModeli  r9   add_pooling_layerc                 b  > [         TU ]  U5        Xl        UR                  (       a  [	        U5      U l        O[        U5      U l        [        U5      U l        [        R                  " UR                  UR                  S9U l        U(       a  [        U5      OSU l        U R!                  5         g)z^
add_pooling_layer (bool, *optional*, defaults to `True`):
    Whether to add a pooling layer
r  N)r@   rA   r9   r  r7   r   r   r  encoderr   r  rE   r	  	layernormDPTViTPoolerpooler	post_init)rV   r9   r  rZ   s      r.   rA   DPTModel.__init__  s    
 	  4V<DO.v6DO$V,f&8&8f>S>ST.?l6*T 	r-   c                 |    U R                   R                  (       a  U R                  $ U R                  R                  $ r   )r9   r  r   r   )rV   s    r.   get_input_embeddingsDPTModel.get_input_embeddings  s)    ;;  ??"??333r-   F)tie_last_hidden_statesrt   r   rv   c                    U R                  U5      nUR                  nU R                  " U40 UD6nUR                  nU R	                  U5      nU R
                  b  U R                  U5      OS n[        UUUR                  S9$ )N)r2   r3   r!   )r   r    r  r2   r  r  r0   r!   )rV   rt   r   embedding_outputembedding_last_hidden_statesencoder_outputssequence_outputpooled_outputs           r.   r   DPTModel.forward  s     HLWcGd'7'J'J$+/<<8T+_X^+_);;..98<8OO4UYC-'%5%N%N
 	
r-   )r9   r   r  r  r  rv  )r#   r$   r%   r&   r   r   rA   r  r   r   r   r(   r)   r   r   r0   r   r,   r   r   s   @r.   r  r    sq    y T  *4  E2
''
 +,
 
>	
  3  
r-   r  c                   j   ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ )r  i%  r9   c                    > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        UR                     U l	        g r   )
r@   rA   r   r   rE   pooler_output_sizer   r	   
pooler_act
activationr   s     r.   rA   DPTViTPooler.__init__&  s>    YYv1163L3LM
 !2!23r-   r4   rv   c                 \    US S 2S4   nU R                  U5      nU R                  U5      nU$ )Nr   )r   r  )rV   r4   first_token_tensorr  s       r.   r   DPTViTPooler.forward+  s6     +1a40

#566r-   )r  r   r   r   s   @r.   r  r  %  s/    4y 4
U\\ ell  r-   r  c            
          ^  \ rS rSrSrS\4U 4S jjr  SS\\R                     S\
S-  S\
S-  S	\\R                     4S
 jjrSrU =r$ )DPTNecki4  a  
DPTNeck. A neck is a module that is normally used between the backbone and the head. It takes a list of tensors as
input and produces another list of tensors as output. For DPT, it includes 2 stages:

* DPTReassembleStage
* DPTFeatureFusionStage.

Args:
    config (dict): config dict.
r9   c                   > [         TU ]  5         Xl        UR                  b"  UR                  R                  S:X  a  S U l        O[        U5      U l        [        R                  " 5       U l	        UR                   H=  nU R                  R                  [        R                  " X!R                  SSSS95        M?     [        U5      U l        g )Nswinv2r   r   Fr?   rE  r   )r@   rA   r9   r?  
model_typereassemble_stager  r   r  convsr"  r$  rP   r_  rM  fusion_stage)rV   r9   channelrZ   s      r.   rA   DPTNeck.__init__@  s     !!-&2H2H2S2SW_2_$(D!$6v$>D!]]_
//GJJbii1J1JXYcdkpqr 0 2&9r-   Nr4   r4  r5  rv   c                    [        U[        [        45      (       d  [        S5      e[	        U5      [	        U R
                  R                  5      :w  a  [        S5      eU R                  b  U R                  XU5      n[        U5       VVs/ s H  u  pEU R                  U   " U5      PM     nnnU R                  U5      nU$ s  snnf )z
Args:
    hidden_states (`list[torch.FloatTensor]`, each of shape `(batch_size, sequence_length, hidden_size)` or `(batch_size, hidden_size, height, width)`):
        List of hidden states from the backbone.
z2hidden_states should be a tuple or list of tensorszOThe number of hidden states should be equal to the number of neck hidden sizes.)rF   r+   r=  	TypeErrorrL   r9   r"  rM   r  r1  r  r  )rV   r4   r4  r5  r+  featurer   r   s           r.   r   DPTNeck.forwardQ  s     -%77PQQ}T[[%B%B!CCnoo   , 11-{[M=F}=UV=UzqDJJqM'*=UV ""8, Ws   !C)r9   r  r  r  r<  )r#   r$   r%   r&   r'   r   rA   r=  r(   r   r   r   r,   r   r   s   @r.   r  r  4  sf    	:y :( $("&	ELL) Dj 4Z	
 
ell	 r-   r  c                   t   ^  \ rS rSrSrS\4U 4S jjrS\\R                     S\R                  4S jr
SrU =r$ )	DPTDepthEstimationHeadin  z
Output head consisting of 3 convolutional layers. It progressively halves the feature dimension and upsamples
the predictions to the input resolution after the first convolutional layer (details can be found in the paper's
supplementary material).
r9   c                   > [         TU ]  5         Xl        S U l        UR                  (       a  [
        R                  " SSSSSS9U l        UR                  n[
        R                  " [
        R                  " X"S-  SSSS9[
        R                  " SSS	S
9[
        R                  " US-  SSSSS9[
        R                  " 5       [
        R                  " SSSSSS9[
        R                  " 5       5      U l        g )N   )r   r   )r   r   rD  r_   r   r   r`   Trs      r   )r@   rA   r9   rQ   add_projectionr   rP   r_  r*  Upsampler]  headrV   r9   r   rZ   s      r.   rA   DPTDepthEstimationHead.__init__u  s       iiSfV]cdDO,,MMIIhA1QPQRKKQZtLIIh!mRQq!LGGIIIb!1a@GGI
	r-   r4   rv   c                     XR                   R                     nU R                  b,  U R                  U5      n[        R                  " 5       " U5      nU R                  U5      nUR                  SS9nU$ )Nr   rc   )r9   head_in_indexrQ   r   r]  r  squeeze)rV   r4   predicted_depths      r.   r   DPTDepthEstimationHead.forward  sc    %kk&?&?@??& OOM:MGGIm4M))M2)11a18r-   )r9   r  rQ   )r#   r$   r%   r&   r'   r   rA   r=  r(   r   r   r,   r   r   s   @r.   r  r  n  s9    
y 
&T%,,%7 ELL  r-   r  zu
    DPT Model with a depth estimation head on top (consisting of 3 convolutional layers) e.g. for KITTI, NYUv2.
    c                      ^  \ rS rSrU 4S jr\\ S
S\R                  S\R                  S-  S\
\   S\4S jj5       5       rS	rU =r$ )DPTForDepthEstimationi  c                 
  > [         TU ]  U5        S U l        UR                  SL a  UR                  b  [        U5      U l        O[        USS9U l        [        U5      U l	        [        U5      U l        U R                  5         g NF)r  )r@   rA   rJ   r  r?  r
   r  ry  r  neckr  r  r  r   s     r.   rA   DPTForDepthEstimation.__init__  sq     u$)?)?)K)&1DM%@DH FO	 +62	 	r-   Nrt   labelsr   rv   c                   ^  SnUb  [        S5      eUR                  S5      =(       d    [        T R                  SS5      nSUS'   T R                  b*  T R                  R
                  " U40 UD6nUR                  nOT R                  " U40 UD6nUR                  nT R                  R                  (       d?  [        USS 5       VV	s/ s H#  u  pUT R                  R                  ;   d  M!  U	PM%     nnn	O5UR                  n
U
R                  U 4S j[        USS 5       5       5        U
nSu  pT R                  R                  bS  T R                  R                  SL a:  UR                  u    pnT R                  R                  R                   nUU-  nUU-  nT R#                  X{U5      nT R%                  U5      n['        UUU(       a  UR                  OSUR(                  S	9$ s  sn	nf )
a  
labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
    Ground truth depth estimation maps for computing the loss.

Examples:
```python
>>> from transformers import AutoImageProcessor, DPTForDepthEstimation
>>> import torch
>>> import numpy as np
>>> from PIL import Image
>>> import httpx
>>> from io import BytesIO

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> with httpx.stream("GET", url) as response:
...     image = Image.open(BytesIO(response.read()))

>>> image_processor = AutoImageProcessor.from_pretrained("Intel/dpt-large")
>>> model = DPTForDepthEstimation.from_pretrained("Intel/dpt-large")

>>> # prepare image for the model
>>> inputs = image_processor(images=image, return_tensors="pt")

>>> with torch.no_grad():
...     outputs = model(**inputs)

>>> # interpolate to original size
>>> post_processed_output = image_processor.post_process_depth_estimation(
...     outputs,
...     target_sizes=[(image.height, image.width)],
... )

>>> # visualize the prediction
>>> predicted_depth = post_processed_output[0]["predicted_depth"]
>>> depth = predicted_depth * 255 / predicted_depth.max()
>>> depth = depth.detach().cpu().numpy()
>>> depth = Image.fromarray(depth.astype("uint8"))
```NzTraining is not implemented yetr   FTr   c              3   j   >#    U  H(  u  pUTR                   R                  S S ;   d  M$  Uv   M*     g7fr_   Nr9   backbone_out_indices.0idxr  rV   s      r.   	<genexpr>0DPTForDepthEstimation.forward.<locals>.<genexpr>  s5      .(Ddkk>>qrBB G(D   #3	3r<  )lossr  r4   r5   )NotImplementedErrorgetgetattrr9   rJ   forward_with_filtered_kwargsr{   ry  r4   r  r1  r  r!   extendr?  rz   rC   r  r  r   r5   )rV   rt   r  r   r  user_requested_hidden_statesoutputsr4   r  r  backbone_hidden_statesr4  r5  r   r   r   rC   r  s   `                 r.   r   DPTForDepthEstimation.forward  s   \ %&GHH (.zz2H'I (
WKK/N
$ *.%&==$mm@@XQWXG#00Mhh|6v6G#11M ;;((09-:K0L!0LPSW[WbWbWwWwPwG0L  ! *1)I)I&&-- .(1-2C(D. 
 !7$.!;;&&2t{{7L7LPU7U"."4"4Aq%44??J!Z/L:-K		-{K))M2#+3O'//UY))	
 	
-!s    G%,G%)rJ   ry  r  r  r   )r#   r$   r%   r&   rA   r   r   r(   r)   
LongTensorr   r   r   r   r,   r   r   s   @r.   r  r    sl    $  +/[
''[
   4'[
 +,	[

 
[
  [
r-   r  c                   p   ^  \ rS rSrS\4U 4S jjrS\\R                     S\R                  4S jr	Sr
U =r$ )DPTSemanticSegmentationHeadi  r9   c                   > [         TU ]  5         Xl        UR                  n[        R
                  " [        R                  " X"SSSS9[        R                  " U5      [        R                  " 5       [        R                  " UR                  5      [        R                  " X!R                  SS9[        R                  " SSSS	95      U l        g )
Nr   r   Fr  r>   r_   r`   Trs  )r@   rA   r9   r_  r   r*  rP   rc  r]  r   semantic_classifier_dropout
num_labelsr  r  r  s      r.   rA   $DPTSemanticSegmentationHead.__init__  s    ,,MMIIhaONN8$GGIJJv99:IIh 1 1qAKKQZtL
	r-   r4   rv   c                 X    XR                   R                     nU R                  U5      nU$ r   )r9   r  r  rV   r4   logitss      r.   r   #DPTSemanticSegmentationHead.forward  s'    %kk&?&?@=)r-   )r9   r  )r#   r$   r%   r&   r   rA   r=  r(   r   r   r,   r   r   s   @r.   r  r    s4    
y 
T%,,%7 ELL  r-   r  c                   j   ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ )DPTAuxiliaryHeadi$  r9   c                 T  > [         TU ]  5         UR                  n[        R                  " [        R
                  " X"SSSS9[        R                  " U5      [        R                  " 5       [        R                  " SS5      [        R
                  " X!R                  SS95      U l
        g )Nr   r   Fr  g?r>   )r@   rA   r_  r   r*  rP   rc  r]  r   r  r  r  s      r.   rA   DPTAuxiliaryHead.__init__%  sr    ,,MMIIhaONN8$GGIJJsE"IIh 1 1qA
	r-   r4   rv   c                 (    U R                  U5      nU$ r   r  r  s      r.   r   DPTAuxiliaryHead.forward1  s    =)r-   r  r   r   s   @r.   r  r  $  s/    

y 

U\\ ell  r-   r  c                      ^  \ rS rSrS\4U 4S jjr\\  SS\R                  S-  S\R                  S-  S\\   S\4S	 jj5       5       rS
rU =r$ )DPTForSemanticSegmentationi6  r9   c                    > [         TU ]  U5        [        USS9U l        [	        U5      U l        [        U5      U l        UR                  (       a  [        U5      OS U l
        U R                  5         g r  )r@   rA   r  ry  r  r  r  r  use_auxiliary_headr  auxiliary_headr  r   s     r.   rA   #DPTForSemanticSegmentation.__init__8  s^     Fe< FO	 07	:@:S:S.v6Y] 	r-   Nrt   r  r   rv   c                 P  ^  Ub%  T R                   R                  S:X  a  [        S5      eUR                  S5      =(       d    [	        T R                   SS5      nSUS'   T R
                  " U40 UD6nUR                  nT R                   R                  (       d?  [        USS 5       VVs/ s H#  u  pxUT R                   R                  ;   d  M!  UPM%     nnnO5UR                  n	U	R                  U 4S j[        USS 5       5       5        U	nT R                  US9nT R                  U5      n
SnT R                  b  T R                  US	   5      nSnUb  [        R                   R#                  XR$                  S
S SSS9nUb,  [        R                   R#                  XR$                  S
S SSS9n['        T R                   R(                  S9nU" X5      nU" WU5      nUT R                   R*                  U-  -   n[-        UU
U(       a  UR                  OSUR.                  S9$ s  snnf )a  
labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
    Ground truth semantic segmentation maps for computing the loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels > 1`, a classification loss is computed (Cross-Entropy).

Examples:
```python
>>> from transformers import AutoImageProcessor, DPTForSemanticSegmentation
>>> from PIL import Image
>>> import httpx
>>> from io import BytesIO

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> with httpx.stream("GET", url) as response:
...     image = Image.open(BytesIO(response.read()))

>>> image_processor = AutoImageProcessor.from_pretrained("Intel/dpt-large-ade")
>>> model = DPTForSemanticSegmentation.from_pretrained("Intel/dpt-large-ade")

>>> inputs = image_processor(images=image, return_tensors="pt")

>>> outputs = model(**inputs)
>>> logits = outputs.logits
```Nr   z/The number of labels should be greater than oner   FTc              3   j   >#    U  H(  u  pUTR                   R                  S S ;   d  M$  Uv   M*     g7fr  r  r  s      r.   r  5DPTForSemanticSegmentation.forward.<locals>.<genexpr>|  s6      *,HLCCSWS^S^SsSstutvSwLw,Hr  )r4   r<   r=   r`   rr  )ignore_index)r  r  r4   r5   )r9   r  rM   r  r  ry  r4   r  r1  r  r!   r  r  r  r   r   rg   rh   rz   r   semantic_loss_ignore_indexauxiliary_loss_weightr   r5   )rV   rt   r  r   r  r  r4   r  r  r  r  auxiliary_logitsr  upsampled_logitsupsampled_auxiliary_logitsloss_fct	main_lossauxiliary_losss   `                 r.   r   "DPTForSemanticSegmentation.forwardG  s7   @ $++"8"8A"=NOO (.zz2H'I (
WKK/N
$ *.%&HLQ]HhagHh-- {{$$,5mAB6G,H,HLCCSWS^S^SsSsLs,H  M &-%E%E"")) *,5mAB6G,H*  3M			>=)*#22=3DE!}}88\\"#.Zu  9    +-/]]-F-F$<<+<:]b .G .* (T[[5[5[\H !1:I%&@&INt{{@@>QQD&3O'//UY))	
 	
Es   ) H"H")r   ry  r  r  r<  )r#   r$   r%   r&   r   rA   r   r   r(   r)   r  r   r   r   r   r,   r   r   s   @r.   r  r  6  s{    y   26*.U
''$.U
   4'U
 +,	U

 
!U
  U
r-   r  )r  r  r  rx  )Nr   )Lr'   collections.abcrG   r   dataclassesr   r(   r   torch.nnr    r   r}  activationsr	   backbone_utilsr
   modeling_layersr   modeling_outputsr   r   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   utils.genericr   r   utils.output_capturingr   configuration_dptr   
get_loggerr#   loggerr   r0   Moduler7   r   r   r   floatr   r   r   r   r   r   r  r  r)  r&  rM  rX  rO  rx  r  r  r  r  r  r  r  r  r  __all__r"   r-   r.   <module>r"     s    $ !   % & ! + 9 ^ ^ F & X X I 5 ( 
		H	%  	J 	J 	J  J; J J$]
RYY ]
@4Yryy 4YnBII L !%II%<<% 
% <<	%
 LL4'% T\% % '(%:4.ryy 4.pryy $bii "  
299 
, De eP" ,#BII #0:'RYY :'z"BII "J 4 4 40@BII @ 1
! 1
 1
j299 7bii 7t%RYY %P 
p
. p

p
f")) ,ryy $ g
!3 g
 g
T dr-   