
    Z j>              	          S r SSKrSSKrSSKJr  SSKJr  SSKrSSKJ	r	  SSK
Jr  SSKJr  SS	KJr  SS
KJr  SSKJrJrJrJr  SSKJr  SSKJr  SSKJrJrJr  SSK J!r!  \RD                  " \#5      r$\\" SS9 " S S\5      5       5       r%SIS\RL                  S\'S\(S\RL                  4S jjr) " S S\	RT                  5      r+ " S S\	RT                  5      r, " S S\	RT                  5      r- " S  S!\	RT                  5      r. " S" S#\.5      r/ " S$ S%\	RT                  5      r0\.\/S&.r1 " S' S(\	RT                  5      r2 " S) S*\	RT                  5      r3 " S+ S,\	RT                  5      r4 " S- S.\5      r5 " S/ S0\	RT                  5      r6 " S1 S2\	RT                  5      r7\ " S3 S4\5      5       r8\ " S5 S6\85      5       r9 " S7 S8\	RT                  5      r:\" S9S9 " S: S;\85      5       r; " S< S=\	RT                  5      r< " S> S?\	RT                  5      r= " S@ SA\	RT                  5      r> " SB SC\	RT                  5      r? " SD SE\	RT                  5      r@\ " SF SG\85      5       rA/ SHQrBg)JzPyTorch Data2VecVision model.    N)	dataclass)Optional)nn)CrossEntropyLoss   )initialization)ACT2FN)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPoolingImageClassifierOutputSemanticSegmenterOutput)PreTrainedModel)#compile_compatible_method_lru_cache)auto_docstringlogging	torch_int   )Data2VecVisionConfigz7
    Class for outputs of [`Data2VecVisionModel`].
    )custom_introc                       \ rS rSrSrSrg)$Data2VecVisionModelOutputWithPooling+   a2  
pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
    Average of the last layer hidden states of the patch tokens (excluding the *[CLS]* token) if
    *config.use_mean_pooling* is set to True. If set to False, then the final hidden state of the *[CLS]* token
    will be returned.
 N)__name__
__module____qualname____firstlineno____doc____static_attributes__r       چ/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/data2vec/modeling_data2vec_vision.pyr   r   +   s    r!   r   input	drop_probtrainingreturnc                    US:X  d  U(       d  U $ SU-
  nU R                   S   4SU R                  S-
  -  -   nU[        R                  " X@R                  U R
                  S9-   nUR                  5         U R                  U5      U-  nU$ )z[
Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

        r   r   )r   )dtypedevice)shapendimtorchrandr)   r*   floor_div)r#   r$   r%   	keep_probr+   random_tensoroutputs          r"   	drop_pathr4   <   s    
 CxII[[^

Q 77E

5ELL YYMYYy!M1FMr!   c                      ^  \ rS rSrSrSS\S-  SS4U 4S jjjrS\R                  S\R                  4S jr	S\
4S	 jrS
rU =r$ )Data2VecVisionDropPathL   zXDrop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).Nr$   r&   c                 .   > [         TU ]  5         Xl        g N)super__init__r$   )selfr$   	__class__s     r"   r;   Data2VecVisionDropPath.__init__O   s    "r!   hidden_statesc                 B    [        XR                  U R                  5      $ r9   )r4   r$   r%   r<   r?   s     r"   forwardData2VecVisionDropPath.forwardS   s    FFr!   c                      SU R                    3$ )Nzp=r$   r<   s    r"   
extra_repr!Data2VecVisionDropPath.extra_reprV   s    DNN#$$r!   rE   r9   )r   r   r   r   r   floatr;   r-   TensorrB   strrG   r    __classcell__r=   s   @r"   r6   r6   L   sQ    b#%$, #$ # #GU\\ Gell G%C % %r!   r6   c                      ^  \ rS rSrSrS\SS4U 4S jjrS\R                  S\	S	\	S\R                  4S
 jr
 SS\R                  S\R                  S-  S\R                  4S jjrSrU =r$ )Data2VecVisionEmbeddings[   z[
Construct the CLS token, position and patch embeddings. Optionally, also the mask token.

configr&   Nc                 ^  > [         TU ]  5         [        R                  " [        R
                  " SSUR                  5      5      U l        UR                  (       a<  [        R                  " [        R
                  " SSUR                  5      5      U l	        OS U l	        [        U5      U l        UR                  U l        [        UR                  [        R                   R"                  5      (       a  UR                  OUR                  UR                  4U l        U R                  R$                  nUR&                  (       a?  [        R                  " [        R
                  " SUS-   UR                  5      5      U l        OS U l        [        R*                  " UR,                  5      U l        g )Nr   )r:   r;   r   	Parameterr-   zeroshidden_size	cls_tokenuse_mask_token
mask_tokenData2VecVisionPatchEmbeddingspatch_embeddings
patch_size
isinstance
image_sizecollectionsabcIterablenum_patches use_absolute_position_embeddingsposition_embeddingsDropouthidden_dropout_probdropout)r<   rQ   ra   r=   s      r"   r;   !Data2VecVisionEmbeddings.__init__a   s'   ekk!Q8J8J&KL   ll5;;q!V=O=O+PQDO"DO =f E ++ &++[__-E-EFF ##V%6%67 	
 ++7722')||EKK;QR?TZTfTf4g'hD$'+D$zz&"<"<=r!   
embeddingsheightwidthc                    UR                   S   S-
  nU R                  R                   S   S-
  n[        R                  R	                  5       (       d  XE:X  a  X#:X  a  U R                  $ U R                  SS2SS24   nU R                  SS2SS24   nUR                   S   nX R
                  -  n	X0R
                  -  n
[        US-  5      nUR                  SXU5      nUR                  SSSS5      n[        R                  R                  UX4SS	S
9nUR                  SSSS5      R                  SSU5      n[        R                  " Xg4SS9$ )a  
This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
images. This method is also adapted to support torch.jit tracing.

Adapted from:
- https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
- https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
r   Ng      ?r   r      bicubicFsizemodealign_cornersdim)r+   rc   r-   jit
is_tracingr[   r   reshapepermuter   
functionalinterpolateviewcat)r<   rh   ri   rj   ra   num_positionsclass_pos_embedpatch_pos_embedrt   
new_height	new_widthsqrt_num_positionss               r"   interpolate_pos_encoding1Data2VecVisionEmbeddings.interpolate_pos_encodingx   sS    !&&q)A-0066q9A= yy##%%+*F6?+++221bqb59221ab59r".
__,	&}c'9:)11!5G]`a)11!Q1=--33(	 4 
 *11!Q1=BB1b#Nyy/;CCr!   pixel_valuesbool_masked_posc                    UR                   u    p4nU R                  U5      u  nu  pxUR                  5       u  pnUbI  U R                  R	                  XS5      nUR                  S5      R                  U5      nUSU-
  -  X-  -   nU R                  R	                  U	SS5      n[        R                  " X4SS9nU R                  b  X`R                  XdU5      -   nU R                  U5      nXgU44$ Nrl   r   rs   )r+   rZ   rp   rX   expand	unsqueezetype_asrV   r-   r|   rc   r   rf   )r<   r   r   _ri   rj   rh   patch_heightpatch_width
batch_sizeseq_lenmask_tokensw
cls_tokenss                 r"   rB    Data2VecVisionEmbeddings.forward   s    
 +001e262G2G2U/
/\!+!2
Q&//00bIK))"-55kBA#q1u-?J^^**:r2>
YY
7Q?
##/#&C&CJX]&^^J\\*-
+666r!   )rV   rf   r]   rX   rZ   r[   rc   r9   )r   r   r   r   r   r   r;   r-   rJ   intr   
BoolTensorrB   r    rL   rM   s   @r"   rO   rO   [   s    
>3 > >.&D5<< &D &DUX &D]b]i]i &DV 487ll7 ))D07 
	7 7r!   rO   c                   f   ^  \ rS rSrSrU 4S jrS\R                  S\R                  4S jrSr	U =r
$ )rY      z
This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
`hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
Transformer.
c                    > [         TU ]  5         UR                  UR                  p2UR                  UR
                  pT[        U[        R                  R                  5      (       a  UOX"4n[        U[        R                  R                  5      (       a  UOX34nUS   US   -  US   US   -  -  nUS   US   -  US   US   -  4nX l        X0l        X@l        X`l
        Xpl        [        R                  " XEX3S9U l        g )Nr   r   kernel_sizestride)r:   r;   r]   r[   num_channelsrU   r\   r^   r_   r`   ra   patch_shaper   Conv2d
projection)	r<   rQ   r]   r[   r   rU   ra   r   r=   s	           r"   r;   &Data2VecVisionPatchEmbeddings.__init__   s    !'!2!2F4E4EJ$*$7$79K9Kk#-j+//:R:R#S#SZZdYq
#-j+//:R:R#S#SZZdYq
!!}
15*Q-:VW=:XY!!}
15z!}
ST7UV$$(&&))L:ir!   r   r&   c                 V   UR                   u  p#pEX0R                  :w  a  [        S5      eU R                  UR	                  U R                  R
                  R                  5      5      nUR                   S   UR                   S   pUR                  S5      R                  SS5      nXgU44$ )NzeMake sure that the channel dimension of the pixel values match with the one set in the configuration.rm   r   r   )	r+   r   
ValueErrorr   toweightr)   flatten	transpose)	r<   r   r   r   ri   rj   rh   r   r   s	            r"   rB   %Data2VecVisionPatchEmbeddings.forward   s    2>2D2D/
&,,,w  __\__T__5K5K5Q5Q%RS
$.$4$4Q$79I9I!9Lk''*44Q:
+666r!   )r]   r   ra   r   r[   r   )r   r   r   r   r   r;   r-   rJ   rB   r    rL   rM   s   @r"   rY   rY      s.    j"7ELL 7U\\ 7 7r!   rY   c                      ^  \ rS rSrSS\S\S-  SS4U 4S jjjr    SS\R                  S\	S	\R                  S-  S
\	S\\
   S-  S\\R                     \\R                  \R                  4   -  4S jjrSrU =r$ )Data2VecVisionSelfAttention   NrQ   window_sizer&   c                 J  > [         TU ]  5         Xl        UR                  UR                  -  S:w  a7  [        US5      (       d&  [        SUR                   SUR                   S35      eUR                  U l        [        UR                  UR                  -  5      U l        U R                  U R                  -  U l	        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  U R                  SS9U l        [        R                  " UR                  U R                  5      U l        [        R                  " UR                   5      U l        [%        U5      U l        U R&                  (       a  [)        XS9U l        g g )	Nr   embedding_sizezThe hidden size z4 is not a multiple of the number of attention heads .F)biasr   )r:   r;   rQ   rU   num_attention_headshasattrr   r   attention_head_sizeall_head_sizer   Linearquerykeyvaluerd   attention_probs_dropout_probrf   boolhas_relative_position_bias"Data2VecVisionRelativePositionBiasrelative_position_biasr<   rQ   r   r=   s      r"   r;   $Data2VecVisionSelfAttention.__init__   sN    : ::a?PVXhHiHi"6#5#5"6 7334A7 
 $*#=#= #&v'9'9F<V<V'V#W !558P8PPYYv1143E3EF
99V//1C1C%PYYv1143E3EF
zz&"E"EF*.{*;'***LV*mD' +r!   r?   output_attentionsr   r   
resolutionc                     UR                   S S n/ UQSPU R                  P7nU R                  U5      R                  U5      R	                  SS5      nU R                  U5      R                  U5      R	                  SS5      n	U R                  U5      R                  U5      R	                  SS5      n
[        R                  " XR	                  SS5      5      nU[        R                  " U R                  5      -  nU R                  (       aS  Uu  pXR                  R                  -  XR                  R                  -  4nXR                  XUR                   S   S9-   nUb  X-   n[        R                   R#                  USS9nU R%                  U5      n[        R                  " X5      nUR'                  SSSS5      R)                  5       nUR+                  5       S S U R,                  4-   nUR                  " U6 nU(       a  UU4nU$ U4nU$ )	Nrl   r   rm   dim_sizers   r   r   )r+   r   r   r{   r   r   r   r-   matmulmathsqrtr   rQ   r[   r   r   ry   softmaxrf   rx   
contiguousrp   r   )r<   r?   r   r   r   r   input_shapehidden_shapequery_layer	key_layervalue_layerattention_scoresri   rj   r   attention_probscontext_layernew_context_layer_shapeoutputss                      r"   rB   #Data2VecVisionSelfAttention.forward   s     $))#2.CCbC$*B*BCjj/44\BLLQPQRHH]+00>HHAN	jj/44\BLLQPQR !<<5H5HR5PQ+dii8P8P.QQ **&MF![[%;%;;UkkF\F\=\]K/2M2M@S@STU@V 3N 3  
 "-/H --//0@b/I ,,7_B%--aAq9DDF"/"4"4"6s";t?Q?Q>S"S%**,CD6G=/2 O\M]r!   )
r   r   rQ   rf   r   r   r   r   r   r   r9   FNFN)r   r   r   r   r   tupler;   r-   rJ   r   r   rB   r    rL   rM   s   @r"   r   r      s    n3 n%$, nZ^ n n4 #(6:).(,.||.  . !&t 3	.
 #'. #J%. 
u||	uU\\5<<%?@	@. .r!   r   c                       \ rS rSr    SS\R
                  S\S\R
                  S-  S\S\\   S-  S\\R
                     \\R
                  \R
                  4   -  4S	 jjr	S
r
g)Data2VecVisionSdpaSelfAttentioni,  Nr?   r   r   r   r   r&   c           
      H   U(       a,  [         R                  U R                  R                   S35        UR                  S S n/ UQSPU R
                  P7nU R                  U5      R                  U5      R                  SS5      nU R                  U5      R                  U5      R                  SS5      n	U R                  U5      R                  U5      R                  SS5      n
S nU R                  (       aQ  Uu  pXR                  R                  -  XR                  R                  -  4nU R                  XUR                  S   S9nUb
  Uc  UnOX-  nS[        R                   " U R
                  5      -  n["        R$                  R&                  R)                  UU	U
UU R*                  (       a  U R                  R,                  OSSUS9nUR/                  S	SSS
5      R1                  5       nUR3                  5       S S U R4                  4-   nUR                  " U6 nUS 4$ )Nz does not support `output_attentions=True`. The returned attention weights will be `None`. If you want to get attention weights, please set `attn_implementation='eager'` when loading the model.rl   r   rm   r   r(   F)	attn_mask	dropout_p	is_causalscaler   r   r   )loggerwarning_oncer=   r   r+   r   r   r{   r   r   r   r   rQ   r[   r   r   r   r-   r   ry   scaled_dot_product_attentionr%   r   rx   r   rp   r   )r<   r?   r   r   r   r   r   r   r   r   r   	attn_biasri   rj   r   scalingr   r   s                     r"   rB   'Data2VecVisionSdpaSelfAttention.forward-  s    >>**+ ,D D $))#2.CCbC$*B*BCjj/44\BLLQPQRHH]+00>HHAN	jj/44\BLLQPQR	**&MF![[%;%;;UkkF\F\=\]K33@S@STU@V 4 I
 "- 2	3	dii 8 899++HHBF--dkk>>UX I 
 &--aAq9DDF"/"4"4"6s";t?Q?Q>S"S%**,CDd""r!   r   r   )r   r   r   r   r-   rJ   r   r   r   rB   r    r   r!   r"   r   r   ,  s     #(6:).(,/#||/#  /# !&t 3	/#
 #'/# #J%/# 
u||	uU\\5<<%?@	@/# /#r!   r   c                      ^  \ rS rSrSrS\SS4U 4S jjrSS\R                  S\R                  S\R                  4S	 jjr	S
r
U =r$ )Data2VecVisionSelfOutputi`  z
The residual connection is defined in Data2VecVisionLayer instead of here (as is the case with other models), due to the
layernorm applied before each block.
rQ   r&   Nc                    > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " UR                  5      U l        g r9   )	r:   r;   r   r   rU   denserd   re   rf   r<   rQ   r=   s     r"   r;   !Data2VecVisionSelfOutput.__init__f  sB    YYv1163E3EF
zz&"<"<=r!   r?   input_tensorc                 J    U R                  U5      nU R                  U5      nU$ r9   r   rf   )r<   r?   r   gammas       r"   rB    Data2VecVisionSelfOutput.forwardk  $    

=1]3r!   r   r9   )r   r   r   r   r   r   r;   r-   rJ   rB   r    rL   rM   s   @r"   r   r   `  sJ    
>3 > >
U\\  ^c^j^j  r!   r   )eagersdpac                      ^  \ rS rSrSS\S\S-  SS4U 4S jjjr    SS\R                  S\	S	\
S
   S\	S\\   S-  S\\R                     \\R                  \R                  4   -  4S jjrSrU =r$ )Data2VecVisionAttentioniy  NrQ   r   r&   c                 z   > [         TU ]  5         [        UR                     " XS9U l        [        U5      U l        g )Nr   )r:   r;   &DATA2VEC_VISION_SELF_ATTENTION_CLASSES_attn_implementation	attentionr   r3   r   s      r"   r;    Data2VecVisionAttention.__init__z  s6    ?@[@[\
 /v6r!   r?   r   r   r   r   r   c                 h    U R                  XX4U5      nU R                  US   U5      nU4USS  -   nU$ )Nr   r   r   r3   )	r<   r?   r   r   r   r   self_outputsattention_outputr   s	            r"   rB   Data2VecVisionAttention.forward  sK     ~~.D`j
  ;;|AF#%QR(88r!   r   r9   r   )r   r   r   r   r   r   r;   r-   rJ   r   r   r   rB   r    rL   rM   s   @r"   r   r   y  s    73 7%$, 7Z^ 7 7 #(QU).(,||   !))M N	
 #' #J% 
u||	uU\\5<<%?@	@ r!   r   c                   n   ^  \ rS rSrS\SS4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ )	Data2VecVisionIntermediatei  rQ   r&   Nc                   > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        UR                  [        5      (       a  [        UR                     U l        g UR                  U l        g r9   )r:   r;   r   r   rU   intermediate_sizer   r\   
hidden_actrK   r	   intermediate_act_fnr   s     r"   r;   #Data2VecVisionIntermediate.__init__  s`    YYv1163K3KL
f''--'-f.?.?'@D$'-'8'8D$r!   r?   c                 J    U R                  U5      nU R                  U5      nU$ r9   r   r  rA   s     r"   rB   "Data2VecVisionIntermediate.forward  s&    

=100?r!   r  r   r   r   r   r   r;   r-   rJ   rB   r    rL   rM   s   @r"   r  r    s7    93 9 9U\\ ell  r!   r  c                   n   ^  \ rS rSrS\SS4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ )	Data2VecVisionOutputi  rQ   r&   Nc                    > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        R                  " UR                  5      U l	        g r9   )
r:   r;   r   r   r  rU   r   rd   re   rf   r   s     r"   r;   Data2VecVisionOutput.__init__  sB    YYv779K9KL
zz&"<"<=r!   r?   c                 J    U R                  U5      nU R                  U5      nU$ r9   r   rA   s     r"   rB   Data2VecVisionOutput.forward  r   r!   r   r
  rM   s   @r"   r  r    s7    >3 > >
U\\ ell  r!   r  c                   
  ^  \ rS rSrSr SS\S\S-  S\SS4U 4S jjjr    SS	\	R                  S
\S\	R                  S-  S\S\\\4   S-  S\\	R                     \\	R                  \	R                  4   -  4S jjrSrU =r$ )Data2VecVisionLayeri  z?This corresponds to the Block class in the timm implementation.NrQ   r   drop_path_rater&   c                   > [         TU ]  5         UR                  U l        SU l        [	        XS9U l        [        U5      U l        [        U5      U l	        [        R                  " UR                  UR                  S9U l        US:  a  [        U5      O[        R                   " 5       U l        [        R                  " UR                  UR                  S9U l        UR&                  nUS:  aw  [        R(                  " U[*        R,                  " UR                  5      -  SS9U l        [        R(                  " U[*        R,                  " UR                  5      -  SS9U l        g Su  U l        U l        g )	Nr   r   epsr(   r   T)requires_grad)NN)r:   r;   chunk_size_feed_forwardseq_len_dimr   r   r  intermediater  r3   r   	LayerNormrU   layer_norm_epslayernorm_beforer6   Identityr4   layernorm_afterlayer_scale_init_valuerS   r-   oneslambda_1lambda_2)r<   rQ   r   r  init_valuesr=   s        r"   r;   Data2VecVisionLayer.__init__  s    	'-'E'E$0Q6v>*62 "V-?-?VEZEZ [CQTWCW/?]_]h]h]j!||F,>,>FDYDYZ33?LLuzz&BTBT7U)UeijDMLLuzz&BTBT7U)UeijDM+5(DM4=r!   r?   r   r   r   r   c                    U R                  U R                  U5      UUUUS9nUS   nUSS  nU R                  b  U R                  U-  nU R                  U5      U-   nU R	                  U5      n	U R                  U	5      n	U R                  U	5      n	U R                  b  U R                  U	-  n	U R                  U	5      U-   n	U	4U-   nU$ )Nr   r   r   r   r   r   )r   r  r"  r4   r  r  r3   r#  )
r<   r?   r   r   r   r   self_attention_outputsr   r   layer_outputs
             r"   rB   Data2VecVisionLayer.forward  s     "&!!-0/#9%=! "0 "
 2!4(, ==$#}}/?? '78=H ++M:((6{{<0==$==<7L ~~l3mC/G+r!   )
r   r  r4   r  r"  r#  r  r  r3   r  )Nr(   r   )r   r   r   r   r   r   r   rI   r;   r-   rJ   r   r   rB   r    rL   rM   s   @r"   r  r    s    I gj6*69>6^c6	6 6. #(6:).-1'||'  ' !&t 3	'
 #'' #s(Od*' 
u||	uU\\5<<%?@	@' 'r!   r  c                      ^  \ rS rSrS\S\SS4U 4S jjr\" SS9S\\\4   S\	R                  4S	 j5       rSS
\S\	R                  4S jjrSrU =r$ )r   i  rQ   r   r&   Nc                    > [         TU ]  5         X l        SUS   -  S-
  SUS   -  S-
  -  S-   U l        [        R
                  " [        R                  " U R                  UR                  5      5      U l	        g )Nrm   r   r   r   )
r:   r;   r   num_relative_distancer   rS   r-   rT   r   relative_position_bias_tabler   s      r"   r;   +Data2VecVisionRelativePositionBias.__init__  sp    &&'+a.&81&<[QR^ASVWAW%X[\%\",.LLKK22F4N4NO-
)r!   
   )maxsizec                    SUS   -  S-
  SUS   -  S-
  -  S-   nUS   US   -  n[         R                  " [         R                  " US   5      [         R                  " US   5      SS9n[         R                  " U5      n[         R                  " US5      nUSS2SS2S4   USS2SSS24   -
  nUR                  SSS5      R                  5       nUSS2SS2S4==   US   S-
  -  ss'   USS2SS2S4==   US   S-
  -  ss'   USS2SS2S4==   SUS   -  S-
  -  ss'   [         R                  " US-   4S-  UR                  S9nUR                  S	5      USS2SS24'   US-
  USSS24'   US-
  USS2S4'   US-
  US
'   U$ )z
This method creates the relative position index, modified to support arbitrary window sizes,
as introduced in [MiDaS v3.1](https://huggingface.co/papers/2307.14460).
rm   r   r   r   ij)indexingN)rp   r)   rl   )r   r   )
r-   meshgridarangestackr   rx   r   rT   r)   sum)	r<   r   r-  window_areagridcoordscoords_flattenrelative_coordsrelative_position_indexs	            r"    generate_relative_position_indexCData2VecVisionRelativePositionBias.generate_relative_position_index  s    "#[^!3a!7AA<NQR<R SVW W "!n{1~5~~ell;q>:ELLUV<XcghT"vq1(At4~aqj7QQ)11!Q:EEG1a KNQ$66 1a KNQ$66 1a AA$6$:: "'++K!O3E3IQ`QfQf"g*9*=*=b*AAB')>)B12&)>)BA&(=(A%&&r!   r   c                    SU R                   S   -  S-
  nSU R                   S   -  S-
  nSUS   -  S-
  nSUS   -  S-
  nU R                  nU R                  n	Xg-  S-   n
USU	S-
   nUR                  SXTS5      R	                  SSSS5      n[
        R                  R                  U[        U5      [        U5      4SS9nUR	                  SSSS5      R                  U
S-
  S5      n[        R                  " XU	S-
  S /5      nU R                  U5      nXR                  S5         nUR                  US   US   -  S-   US   US   -  S-   S5      nUR	                  SSS5      R                  5       nU(       a?  [
        R                  R                  UR                  S5      X34SS	S
9R                  S5      nUR                  S5      $ )ze
Modification of timm.models.beit.py: Attention._get_rel_pos_bias to support arbitrary window sizes.
rm   r   r   r   Nrl   bilinear)rp   rq   Fro   )r   r.  r-  rw   rx   r   ry   rz   r   r-   r|   r?  r{   r   r   squeeze)r<   r   r   r   
old_height	old_widthr   r    old_relative_position_bias_tableold_num_relative_distancenew_num_relative_distanceold_sub_tablenew_sub_table new_relative_position_bias_tabler>  r   s                   r"   rB   *Data2VecVisionRelativePositionBias.forward  s!    ))!,,q0
((++a/	Q'!+
A&*	+/+L+L($($>$>!$.$:Q$>!89X;TWX;XY%--aKSSTUWXZ[]^_11:!6	)8L MT^ 2 
 &--aAq9AAB[^_B_acd+099=VYZ=Z=\]^,
( #'"G"G"T!AB^B^_aBb!c "8!<!<N[^+a/Q+a.1PST1TVX"
 "8!?!?1a!H!S!S!U#%']]%>%>&003)#	 &? &
 gaj # &//22r!   )r-  r.  r   )FN)r   r   r   r   r   r   r;   r   r   r-   rJ   r?  r   rB   r    rL   rM   s   @r"   r   r     ss    
3 
% 
D 
 )4'E#s(O 'PUP\P\ ' 5'0-3T -3]b]i]i -3 -3r!   r   c                      ^  \ rS rSrSS\S\S-  SS4U 4S jjjr     SS\R                  S\	S	\	S
\	S\\
\
4   S-  S\	S\\-  4S jjrSrU =r$ )Data2VecVisionEncoderiI  NrQ   r   r&   c                   > [         TU ]  5         Xl        UR                  U l        U R                  (       a  [        XS9U l        [        R                  " SUR                  UR                  SS9 Vs/ s H  o3R                  5       PM     nn[        R                  " [        UR                  5       Vs/ s H#  n[        UUR                   (       a  UOS XE   S9PM%     sn5      U l        SU l        g s  snf s  snf )Nr   r   cpu)r*   )r   r  F)r:   r;   rQ   !use_shared_relative_position_biasr   r   r   r-   linspacer  num_hidden_layersitemr   
ModuleListranger  use_relative_position_biaslayergradient_checkpointing)r<   rQ   r   xdprir=   s         r"   r;   Data2VecVisionEncoder.__init__J  s    *0*R*R'***LV*mD' "'63H3H&JbJbkp!qr!qAvvx!qr]] v778 9A $/5/P/PVZ#&6
 9	

 ',# ss   3C35*C8r?   r   output_hidden_statesr   r   return_dictc           	         U(       a  SOS nU(       a  SOS n[        U R                  5       H  u  pU(       a  Xq4-   nU R                  (       aR  Uu  pXR                  R                  -  XR                  R                  -  4nU R                  XUR                  S   S9nOS nU
" UUUUUS9nUS   nU(       d  M  XS   4-   nM     U(       a  Xq4-   nU(       d  [        S XU4 5       5      $ [        UUUS9$ )Nr   r   )r   r   r'  r   c              3   .   #    U  H  oc  M  Uv   M     g 7fr9   r   ).0vs     r"   	<genexpr>0Data2VecVisionEncoder.forward.<locals>.<genexpr>  s     m$[q$[s   	)last_hidden_stater?   
attentions)		enumeraterX  r   rQ   r[   r   r+   r   r   )r<   r?   r   r^  r   r   r_  all_hidden_statesall_self_attentionsr\  layer_moduleri   rj   r   r   layer_outputss                   r"   rB   Data2VecVisionEncoder.forward_  s    #7BD$5b4(4OA#$58H$H!.. *%)?)??++J`J`A`a)-)D)D]j]p]pqr]s *E *& *.&("3'=)A%M *!,M  &91=M<O&O#1  54   14D Dm]GZ$[mmm++*
 	
r!   )rQ   rY  r   rX  r   r9   )FFFNT)r   r   r   r   r   r   r;   r-   rJ   r   r   r   rB   r    rL   rM   s   @r"   rN  rN  I  s    ,3 ,%$, ,Z^ , ,0 #(%*).-1 /
||/
  /
 #	/

 #'/
 #s(Od*/
 /
 
	 /
 /
r!   rN  c                   |   ^  \ rS rSr% \\S'   SrSrSrSr	S/r
S/rSr\R                  " 5       U 4S	 j5       rS
rU =r$ )Data2VecVisionPreTrainedModeli  rQ   data2vec_vision)imager   Tr  z.*relative_position_index.*c                   > [         TU ]  U5        [        U[        5      (       a|  [        R
                  " UR                  5        UR                  b   [        R
                  " UR                  5        UR                  b!  [        R
                  " UR                  5        gg[        U[        5      (       a!  [        R
                  " UR                  5        g[        U[        5      (       ay  UR                  bk  [        R                  " UR                  U R                  R                  5        [        R                  " UR                   U R                  R                  5        ggg)zInitialize the weightsN)r:   _init_weightsr\   rO   initzeros_rV   rX   rc   r   r.  r  r"  	constant_rQ   r   r#  )r<   moduler=   s     r"   rs  +Data2VecVisionPreTrainedModel._init_weights  s     	f%f677KK(()  ,F--.))5F667 6 BCCKK;;< 344*v0R0RSv0R0RS + 5r!   r   )r   r   r   r   r   __annotations__base_model_prefixinput_modalitiesmain_input_namesupports_gradient_checkpointing_no_split_modules"_keys_to_ignore_on_load_unexpected_supports_sdpar-   no_gradrs  r    rL   rM   s   @r"   ro  ro    sS     ! )!$O&*#./*H)I&N
]]_T Tr!   ro  c                      ^  \ rS rSrSS\S\SS4U 4S jjjrS r\     SS\	R                  S	\	R                  S-  S
\S-  S\S-  S\S\S-  S\\-  4S jj5       rSrU =r$ )Data2VecVisionModeli  rQ   add_pooling_layerr&   Nc                   > [         TU ]  U5        Xl        [        U5      U l        [        XR                  R                  R                  S9U l        UR                  (       a  [        R                  " 5       O([        R                  " UR                  UR                  S9U l        U(       a  [!        U5      OSU l        U R%                  5         g)z_
add_pooling_layer (bool, *optional*, defaults to `False`):
    Whether to add a pooling layer
r   r  N)r:   r;   rQ   rO   rh   rN  rZ   r   encoderuse_mean_poolingr   r  r  rU   r  	layernormData2VecVisionPoolerpooler	post_init)r<   rQ   r  r=   s      r"   r;   Data2VecVisionModel.__init__  s    
 	 26:,VAaAaAmAmn $44BKKM",,vGYGY_e_t_t:u 	 7H*62T 	r!   c                 .    U R                   R                  $ r9   )rh   rZ   rF   s    r"   get_input_embeddings(Data2VecVisionModel.get_input_embeddings  s    ///r!   r   r   r   r^  r   r_  c           	         Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nU R	                  XS9u  pUR
                  SS n
U R                  UUUU
UUS9nUS   nU R                  U5      nU R                  b  U R                  U5      OSnU(       d  Ub  X4OU4nXSS -   $ [        UUUR                  UR                  S9$ )z
bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
    Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
N)r   rm   )r   r^  r   r_  r   r   r   )rf  pooler_outputr?   rg  )rQ   r   r^  r_  rh   r+   r  r  r  r   r?   rg  )r<   r   r   r   r^  r   r_  kwargsembedding_outputr   r   encoder_outputssequence_outputpooled_outputhead_outputss                  r"   rB   Data2VecVisionModel.forward  s    2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++BYBY"oolo\!''+
,,/!5!#%= ' 
 *!,..98<8OO4UY?L?XO;_n^pL!""5553-')77&11	
 	
r!   )rQ   rh   r  r  r  )F)NNNFN)r   r   r   r   r   r   r;   r  r   r-   rJ   r   r   r   rB   r    rL   rM   s   @r"   r  r    s    3  Y]  &0  48)-,0).#',
ll,
 ))D0,
  $;	,

 #Tk,
 #',
 D[,
 
5	5,
 ,
r!   r  c                   n   ^  \ rS rSrS\SS4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ )	r  i  rQ   r&   Nc                    > [         TU ]  5         UR                  (       a/  [        R                  " UR
                  UR                  S9U l        g S U l        g )Nr  )r:   r;   r  r   r  rU   r  r  r   s     r"   r;   Data2VecVisionPooler.__init__  sA    KQKbKbBLL++1F1FG 	hl 	r!   r?   c                     U R                   b0  US S 2SS 2S S 24   nU R                  UR                  S5      5      nU$ US S 2S4   nU$ )Nr   r   )r  mean)r<   r?   patch_tokensr  s       r"   rB   Data2VecVisionPooler.forward   sU    >>%(AB2L NN<+<+<Q+?@M
  *!Q$/Mr!   )r  r
  rM   s   @r"   r  r    s7    
3 
 
	U\\ 	ell 	 	r!   r  z
    Data2VecVision Model transformer with an image classification head on top (a linear layer on top of the average of
    the final hidden states of the patch tokens) e.g. for ImageNet.
    c                      ^  \ rS rSrS\SS4U 4S jjr\      SS\R                  S-  S\R                  S-  S\	S-  S	\	S-  S
\	S\	S-  S\
\-  4S jj5       rSrU =r$ )$Data2VecVisionForImageClassificationi  rQ   r&   Nc                 .  > [         TU ]  U5        UR                  U l        [        USS9U l        UR                  S:  a+  [
        R                  " UR                  UR                  5      O[
        R                  " 5       U l	        U R                  5         g )NTr  r   )r:   r;   
num_labelsr  rp  r   r   rU   r  
classifierr  r   s     r"   r;   -Data2VecVisionForImageClassification.__init__  st      ++26TR OUN_N_bcNc"))F$6$68I8IJikititiv 	r!   r   labelsr   r^  r   r_  c                 h   Ub  UOU R                   R                  nU R                  UUUUUS9nU(       a  UR                  OUS   n	U R	                  U	5      n
SnUb  U R                  X*U R                   5      nU(       d  U
4USS -   nUb  U4U-   $ U$ [        UU
UR                  UR                  S9$ )ab  
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
Nr   r^  r   r_  r   rm   losslogitsr?   rg  )	rQ   r_  rp  r  r  loss_functionr   r?   rg  )r<   r   r  r   r^  r   r_  r  r   r  r  r  r3   s                r"   rB   ,Data2VecVisionForImageClassification.forward   s    " &1%<k$++BYBY&&/!5%=# ' 
 2=--'!*/%%fdkkBDY,F)-)9TGf$EvE$!//))	
 	
r!   )r  rp  r  NNNNFN)r   r   r   r   r   r;   r   r-   rJ   r   r   r   rB   r    rL   rM   s   @r"   r  r    s    
3 
 
  -1&*)-,0).#'*
llT)*
 t#*
  $;	*

 #Tk*
 #'*
 D[*
 
&	&*
 *
r!   r  c                      ^  \ rS rSrSr   SS\S\S\\\\4   -  S\\\\4   -  \-  S\S\\\\4   -  S	S
4U 4S jjjr	S\
R                  S	\
R                  4S jrSrU =r$ )Data2VecVisionConvModuleiO  a4  
A convolutional block that bundles conv/norm/activation layers. This block simplifies the usage of convolution
layers, which are commonly used with a norm layer (e.g., BatchNorm) and activation layer (e.g., ReLU).

Based on OpenMMLab's implementation, found in https://github.com/open-mmlab/mmsegmentation.
in_channelsout_channelsr   paddingr   dilationr&   Nc           	         > [         TU ]  5         [        R                  " UUUUUUS9U l        [        R
                  " U5      U l        [        R                  " 5       U l        g )N)r  r  r   r  r   r  )	r:   r;   r   r   convBatchNorm2dbnReLU
activation)r<   r  r  r   r  r   r  r=   s          r"   r;   !Data2VecVisionConvModule.__init__W  sQ     	II#%#
	 ...'')r!   r#   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ r9   )r  r  r  )r<   r#   r3   s      r"   rB    Data2VecVisionConvModule.forwardl  s0    5!(r!   )r  r  r  )r   Fr   )r   r   r   r   r   r   r   rK   r   r;   r-   rJ   rB   r    rL   rM   s   @r"   r  r  O  s     01*+$$ $ 5c?*	$
 uS#X&,$ $ c3h'$ 
$ $*U\\ ell  r!   r  c                   v   ^  \ rS rSrS\S\S\SS4U 4S jjrS\R                  S\R                  4S	 jrS
r	U =r
$ )!Data2VecVisionPyramidPoolingBlockiu  
pool_scaler  channelsr&   Nc                    > [         TU ]  5         [        R                  " U5      [	        X#SS9/U l        [        U R
                  5       H   u  pEU R                  [        U5      U5        M"     g )Nr   r   )	r:   r;   r   AdaptiveAvgPool2dr  layersrh  
add_modulerK   )r<   r  r  r  r\  rX  r=   s         r"   r;   *Data2VecVisionPyramidPoolingBlock.__init__v  sX      ,$[J
 "$++.HAOOCFE* /r!   r#   c                 @    UnU R                    H  nU" U5      nM     U$ r9   r  )r<   r#   hidden_staterX  s       r"   rB   )Data2VecVisionPyramidPoolingBlock.forward  s%    [[E .L !r!   r  )r   r   r   r   r   r;   r-   rJ   rB   r    rL   rM   s   @r"   r  r  u  sD    +3 +S +C +D +U\\ ell  r!   r  c            
          ^  \ rS rSrSrS\\S4   S\S\S\SS	4
U 4S
 jjrS\	R                  S\\	R                     4S jrSrU =r$ )"Data2VecVisionPyramidPoolingModulei  a  
Pyramid Pooling Module (PPM) used in PSPNet.

Args:
    pool_scales (tuple[int]): Pooling scales used in Pooling Pyramid
        Module.
    in_channels (int): Input channels.
    channels (int): Channels after modules, before conv_seg.
    align_corners (bool): align_corners argument of F.interpolate.

Based on OpenMMLab's implementation, found in https://github.com/open-mmlab/mmsegmentation.
pool_scales.r  r  rr   r&   Nc                   > [         TU ]  5         Xl        X@l        X l        X0l        / U l        [        U5       HE  u  pV[        XbUS9nU R                  R                  U5        U R                  [        U5      U5        MG     g )N)r  r  r  )r:   r;   r  rr   r  r  blocksrh  r  appendr  rK   )	r<   r  r  r  rr   r\  r  blockr=   s	           r"   r;   +Data2VecVisionPyramidPoolingModule.__init__  sr    &*& &{3MA5%E KKu%OOCFE* 4r!   rZ  c                     / nU R                    HV  nU" U5      n[        R                  R                  XAR	                  5       SS  SU R
                  S9nUR                  U5        MX     U$ )Nrm   rB  ro   )r  r   ry   rz   rp   rr   r  )r<   rZ  ppm_outsppmppm_outupsampled_ppm_outs         r"   rB   *Data2VecVisionPyramidPoolingModule.forward  sg    ;;C!fG " 9 9ffhqrl4K]K] !: ! OO-.  r!   )rr   r  r  r  r  )r   r   r   r   r   r   r   r   r;   r-   rJ   listrB   r    rL   rM   s   @r"   r  r    s`    +E#s(O +# +QT +ei +nr + $u||*<  r!   r  c                   x   ^  \ rS rSrSrS\SS4U 4S jjrS rS\R                  S\R                  4S	 jr
S
rU =r$ )Data2VecVisionUperHeadi  z
Unified Perceptual Parsing for Scene Understanding. This head is the implementation of
[UPerNet](https://huggingface.co/papers/1807.10221).

Based on OpenMMLab's implementation, found in https://github.com/open-mmlab/mmsegmentation.
rQ   r&   Nc                   > [         TU ]  5         UR                  U l        UR                  /S-  U l        UR                  U l        SU l        [        R                  " U R
                  UR                  SS9U l
        [        U R                  U R                  S   U R
                  U R                  S9U l        [        U R                  S   [        U R                  5      U R
                  -  -   U R
                  SSS9U l        [        R                   " 5       U l        [        R                   " 5       U l        U R                  S S  Hm  n[        X R
                  SS9n[        U R
                  U R
                  SSS9nU R"                  R'                  U5        U R$                  R'                  U5        Mo     [        [        U R                  5      U R
                  -  U R
                  SSS9U l        g )	N   Fr   r  rl   )rr   r   r   r  )r:   r;   r  rU   r  r  rr   r   r   r  r  r  psp_modulesr  len
bottleneckrU  lateral_convs	fpn_convsr  fpn_bottleneck)r<   rQ   r  l_convfpn_convr=   s        r"   r;   Data2VecVisionUperHead.__init__  s   !--"../!3**"))DMM63D3DRST >R MM,,	
 3R 3t'7'7#84==#HHMM	
  ]]_++CR0K-k==VWXF/t}}Z[efgH%%f-NN!!(+	 1 7  !DMM1MM	
r!   c                     US   nU/nUR                  U R                  U5      5        [        R                  " USS9nU R	                  U5      nU$ r   )extendr  r-   r|   r  )r<   inputsrZ  psp_outsr3   s        r"   psp_forward"Data2VecVisionUperHead.psp_forward  sL    2J3((+,99X1-*r!   encoder_hidden_statesc           	      @   [        U R                  5       VVs/ s H  u  p#U" X   5      PM     nnnUR                  U R                  U5      5        [	        U5      n[        US-
  SS5       HP  nXBS-
     R                  SS  nXBS-
     [        R                  R                  XB   USU R                  S9-   XBS-
  '   MR     [        US-
  5       Vs/ s H  o R                  U   " XB   5      PM     nnUR                  US   5        [        US-
  SS5       HA  n[        R                  R                  Xr   US   R                  SS  SU R                  S9Xr'   MC     [        R                  " USS9nU R                  U5      nU R                  U5      nU$ s  snnf s  snf )Nr   r   rl   rm   rB  ro   rs   )rh  r  r  r  r  rV  r+   r   ry   rz   rr   r  r-   r|   r  r  )	r<   r  r\  lateral_convlateralsused_backbone_levels
prev_shapefpn_outsr3   s	            r"   rB   Data2VecVisionUperHead.forward  s   R[\`\n\nRopRoqL!6!9:Rop(()>?@  #8}+a/B7A!a%..qr2J&1uo0I0I*:TM_M_ 1J 1 HUO 8 =BBVYZBZ<[\<[qNN1%hk2<[\%+a/B7A--33(1+"3"3AB"7jX\XjXj 4 HK 8 99X1-$$X.(3 q ]s   F F)
rr   r  r  r  r  r  r  r  r  r  )r   r   r   r   r   r   r;   r  r-   rJ   rB   r    rL   rM   s   @r"   r  r    sB    $
3 $
 $
LU\\ ell  r!   r  c                      ^  \ rS rSrSr   SS\S\S\S\\\\4   -  SS4
U 4S	 jjjrS
\	R                  S\	R                  4S jrSrU =r$ )Data2VecVisionFCNHeadi  a  
Fully Convolution Networks for Semantic Segmentation. This head is implemented of
[FCNNet](https://huggingface.co/papers/1411.4038>).

Args:
    config (Data2VecVisionConfig): Configuration.
    in_channels
    kernel_size (int): The kernel size for convs in the head. Default: 3.
    dilation (int): The dilation rate for convs in the head. Default: 1.


Based on OpenMMLab's implementation, found in https://github.com/open-mmlab/mmsegmentation.
rQ   in_indexr   r  r&   Nc                 2  > [         TU ]  5         UR                  U l        UR                  U l        UR                  U l        UR                  U l	        X l
        US-  U-  n/ nUR                  [        U R                  U R
                  X5US95        [        U R                  S-
  5       H2  nUR                  [        U R
                  U R
                  X5US95        M4     U R                  S:X  a  [        R                  " 5       U l        O[        R"                  " U6 U l        U R                  (       a4  [        U R                  U R
                  -   U R
                  X3S-  S9U l        [        R&                  " U R
                  UR(                  SS9U l        g )Nrm   )r   r  r  r   r   r  r  )r:   r;   rU   r  auxiliary_channelsr  auxiliary_num_convs	num_convsauxiliary_concat_inputconcat_inputr  r  r  rV  r   r  convs
Sequentialconv_catr   r  r  )	r<   rQ   r  r   r  conv_paddingr  r\  r=   s	           r"   r;   Data2VecVisionFCNHead.__init__  sH    	!--1133"99 #q(H4$  $--[iq	

 t~~)*ALL(MM4==kjr + >>QDJ.DJ4  4==0$--[qrbrDM ))DMM63D3DRSTr!   r  c                     XR                      nU R                  U5      nU R                  (       a%  U R                  [        R
                  " X#/SS95      nU R                  U5      nU$ )Nr   rs   )r  r  r   r  r-   r|   r  )r<   r  r?   r3   s       r"   rB   Data2VecVisionFCNHead.forward:  sT    -mm<M*]]599m-D!#LMF(r!   )r  r  r   r  r  r  r  r  )rm   r   r   )r   r   r   r   r   r   r   r   r;   r-   rJ   rB   r    rL   rM   s   @r"   r  r    s    " *+$U$$U $U 	$U
 c3h'$U 
$U $ULU\\ ell  r!   r  c                      ^  \ rS rSrS\SS4U 4S jjrS r\      SS\R                  S-  S\R                  S-  S	\
S-  S
\
S-  S\
S\
S-  S\\-  4S jj5       rSrU =r$ )%Data2VecVisionForSemanticSegmentationiD  rQ   r&   Nc                 x  > [         TU ]  U5        UR                  U l        [        USS9U l        [        U R                  R                  5      S:w  a  [        S5      e[        R                  " [        R                  " UR                  UR                  SSS9[        R                  " UR                  5      [        R                  " 5       [        R                  " UR                  UR                  SSS95      U l        [        R                  " [        R                  " UR                  UR                  SSS95      U l        [        R"                  " 5       U l        [        R&                  " SSS9U l        [+        U5      U l        UR.                  (       a  [1        U5      OS U l        U R5                  5         g )NFr  r  zData2VecVisionForSemanticSegmentation requires config.out_indices to be a list of 4 integers, specifying which features to use from the backbone. One can use [3, 5, 7, 11] in case of a base-sized architecture.rm   r   )r:   r;   r  r  rp  r  rQ   out_indicesr   r   r  ConvTranspose2drU   r  GELUfpn1fpn2r  fpn3	MaxPool2dfpn4r  decode_headuse_auxiliary_headr  auxiliary_headr  r   s     r"   r;   .Data2VecVisionForSemanticSegmentation.__init__G  sQ     ++26US t{{&&'1,- 
 MMv1163E3EST]^_NN6--.GGIv1163E3EST]^_	
	 MMv1163E3EST]^_
	 KKM	LLQq9	 2&9?E?X?X3F;^b 	r!   c                 X   [         R                  R                  XR                  SS  SSS9nUb,  [         R                  R                  X#R                  SS  SSS9n[	        U R
                  R                  S9nU" XC5      nUnUb$  U" WU5      n	XR
                  R                  U	-  -  nU$ )Nr   rB  Fro   )ignore_index)r   ry   rz   r+   r   rQ   semantic_loss_ignore_indexauxiliary_loss_weight)
r<   r  auxiliary_logitsr  upsampled_logitsupsampled_auxiliary_logitsloss_fct	main_lossr  auxiliary_losss
             r"   compute_loss2Data2VecVisionForSemanticSegmentation.compute_lossg  s    ==44bc*5 5 
 ')+)B)B ||BC'8zY^ *C *& $1W1WX-6	'%&@&INKK55FFDr!   r   r  r   r^  r   r_  c           	         Ub  UOU R                   R                  nUb  UOU R                   R                  nUb%  U R                   R                  S:X  a  [	        S5      eU R                  UUSUUS9nU(       a  UR                  OUS   n	[        U	5       V
Vs/ s H&  u  pU
S-   U R                   R                  ;   d  M$  UPM(     nn
nUR                  S   nU R                   R                  U R                   R                  -  nU Vs/ s H2  oSS2SS2SS24   R                  SSS5      R                  USX5      PM4     nnU R                  U R                  U R                   U R"                  /n[%        ['        U5      5       H  nUU   " UU   5      UU'   M     U R)                  U5      nSnU R*                  b  U R+                  U5      nSnUb  U R-                  UUU5      nU(       d%  U(       a
  U4USS -   nO	U4USS -   nUb  U4U-   $ U$ [/        UUU(       a  UR                  OSUR0                  S	9$ s  snn
f s  snf )
a  
labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
    Ground truth semantic segmentation maps for computing the loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels > 1`, a classification loss is computed (Cross-Entropy).

Examples:

```python
>>> from transformers import AutoImageProcessor, Data2VecVisionForSemanticSegmentation
>>> from PIL import Image
>>> import httpx
>>> from io import BytesIO

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> with httpx.stream("GET", url) as response:
...     image = Image.open(BytesIO(response.read()))

>>> image_processor = AutoImageProcessor.from_pretrained("facebook/data2vec-vision-base")
>>> model = Data2VecVisionForSemanticSegmentation.from_pretrained("facebook/data2vec-vision-base")

>>> inputs = image_processor(images=image, return_tensors="pt")
>>> outputs = model(**inputs)
>>> # logits are of shape (batch_size, num_labels, height, width)
>>> logits = outputs.logits
```Nr   z/The number of labels should be greater than oneTr  r   rm   rl   r  )rQ   r_  r^  r  r   rp  r?   rh  r  r+   r]   r[   rx   rw   r  r  r  r  rV  r  r  r  r!  r   rg  )r<   r   r  r   r^  r   r_  r  r   r  idxfeaturefeaturesr   patch_resolutionrZ  opsr\  r  r  r  r3   s                         r"   rB   -Data2VecVisionForSemanticSegmentation.forwardz  sZ   H &1%<k$++BYBY$8$D $++JjJj 	 $++"8"8A"=NOO&&/!%%=# ' 
 :E 5 5'RS* 1::O0Pw0PTWZ[T[_c_j_j_v_vTvG0Pw!''*
;;11T[[5K5KKnv
nvijaQhK1a(00RAQdnv 	 

 yy$))TYY		:s8}%Aa&!-HQK & !!(+*#228<$$V-=vFD# WQR[0 WQR[0)-)9TGf$EvE&3G'//T))	
 	
; x
s   #H;?H;9I)r  rp  r  r  r  r  r  r  r  )r   r   r   r   r   r;   r!  r   r-   rJ   r   r   r   rB   r    rL   rM   s   @r"   r	  r	  D  s    3  @&  -1&*)-,0).#'Y
llT)Y
 t#Y
  $;	Y

 #TkY
 #'Y
 D[Y
 
(	(Y
 Y
r!   r	  )r  r	  r  ro  )r(   F)Cr   collections.abcr^   r   dataclassesr   typingr   r-   r   torch.nnr    r   rt  activationsr	   modeling_layersr
   modeling_outputsr   r   r   r   modeling_utilsr   pytorch_utilsr   utilsr   r   r   configuration_data2vec_visionr   
get_loggerr   r   r   rJ   rI   r   r4   Moduler6   rO   rY   r   r   r   r   r   r  r  r  r   rN  ro  r  r  r  r  r  r  r  r  r	  __all__r   r!   r"   <module>r9     sz   $   !    % & ! 9  . @ 7 7 ? 
		H	% +E  U\\ e T V[VbVb  %RYY %\7ryy \7@#7BII #7NF")) FT0#&A 0#hryy & )+* &bii 6 "
299 
>4 >DP3 P3hE
BII E
P TO T T8 D
7 D
 D
P299 & 8
+H 8
8
x"ryy "L		 $$ $PRRYY Rl<BII <~ N
,I N
 N
br!   