
    Z jf                        S r SSKrSSKrSSKJr  SSKrSSKJr  SSKJr	  SSK
Jr  SSKJr  SS	KJrJrJrJr  SS
KJrJr  SSKJr  SSKJrJrJrJr  SSKJrJr  SSK J!r!  SSK"J#r#  \RH                  " \%5      r& " S S\RN                  5      r( " S S\RN                  5      r)  S:S\RN                  S\RT                  S\RT                  S\RT                  S\RT                  S-  S\+S-  S\+S\\   4S jjr, " S S\RN                  5      r- " S  S!\RN                  5      r. " S" S#\RN                  5      r/ " S$ S%\RN                  5      r0 " S& S'\RN                  5      r1 " S( S)\5      r2 " S* S+\RN                  5      r3\ " S, S-\5      5       r4\ " S. S/\45      5       r5 " S0 S1\RN                  5      r6\" S2S39 " S4 S5\45      5       r7\" S6S39 " S7 S8\45      5       r8/ S9Qr9g);zPyTorch ViT model.    N)Callable)nn   )initialization)ACT2FN)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPoolingImageClassifierOutputMaskedImageModelingOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringlogging	torch_int)can_return_tuplemerge_with_config_defaults)capture_outputs   )	ViTConfigc            	          ^  \ rS rSrSrSS\S\4U 4S jjjrS\R                  S\
S\
S	\R                  4S
 jr  SS\R                  S\R                  S-  S\S	\R                  4S jjrSrU =r$ )ViTEmbeddings+   zZ
Construct the CLS token, position and patch embeddings. Optionally, also the mask token.
configuse_mask_tokenc                 `  > [         TU ]  5         [        R                  " [        R
                  " SSUR                  5      5      U l        U(       a6  [        R                  " [        R                  " SSUR                  5      5      OS U l	        [        U5      U l        U R                  R                  n[        R                  " [        R
                  " SUS-   UR                  5      5      U l        [        R                  " UR                  5      U l        UR"                  U l        Xl        g )Nr   )super__init__r   	Parametertorchrandnhidden_size	cls_tokenzeros
mask_tokenViTPatchEmbeddingspatch_embeddingsnum_patchesposition_embeddingsDropouthidden_dropout_probdropout
patch_sizer   )selfr   r   r*   	__class__s       u/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/vit/modeling_vit.pyr    ViTEmbeddings.__init__0   s    ekk!Q8J8J&KLQ_",,u{{1a9K9K'LMei 26 :++77#%<<A{QPVPbPb0c#d zz&"<"<= ++    
embeddingsheightwidthreturnc                    UR                   S   S-
  nU R                  R                   S   S-
  n[        R                  R	                  5       (       d  XE:X  a  X#:X  a  U R                  $ U R                  SS2SS24   nU R                  SS2SS24   nUR                   S   nX R
                  -  n	X0R
                  -  n
[        US-  5      nUR                  SXU5      nUR                  SSSS5      n[        R                  R                  UX4SS	S
9nUR                  SSSS5      R                  SSU5      n[        R                  " Xg4SS9$ )a  
This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
images. This method is also adapted to support torch.jit tracing.

Adapted from:
- https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
- https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
r   N      ?r   r      bicubicF)sizemodealign_cornersdim)shaper+   r"   jit
is_tracingr/   r   reshapepermuter   
functionalinterpolateviewcat)r0   r5   r6   r7   r*   num_positionsclass_pos_embedpatch_pos_embedrB   
new_height	new_widthsqrt_num_positionss               r2   interpolate_pos_encoding&ViTEmbeddings.interpolate_pos_encoding<   sS    !&&q)A-0066q9A= yy##%%+*F6?+++221bqb59221ab59r".
__,	&}c'9:)11!5G]`a)11!Q1=--33(	 4 
 *11!Q1=BB1b#Nyy/;CCr4   Npixel_valuesbool_masked_posrR   c                    UR                   u  pEpgU R                  XS9nUbX  UR                   S   n	U R                  R                  XIS5      n
UR	                  S5      R                  U
5      nUSU-
  -  X-  -   nU R                  R                  USS5      n[        R                  " X4SS9nU(       a  XR                  XU5      -   nOXR                  -   nU R                  U5      nU$ )N)rR   r   r:   g      ?rA   )rC   r)   r'   expand	unsqueezetype_asr%   r"   rK   rR   r+   r.   )r0   rT   rU   rR   
batch_sizenum_channelsr6   r7   r5   
seq_lengthmask_tokensmask
cls_tokenss                r2   forwardViTEmbeddings.forwardd   s     3?2D2D/
&**<*k
&#))!,J//00LK",,R088ED#sTz2[5GGJ ^^**:r2>
YY
7Q?
 $#&C&CJX]&^^J#&>&>>J\\*-
r4   )r%   r   r.   r'   r)   r/   r+   FNF)__name__
__module____qualname____firstlineno____doc__r   boolr    r"   TensorintrR   
BoolTensorr`   __static_attributes____classcell__r1   s   @r2   r   r   +   s    
y 
$ 
 
&D5<< &D &DUX &D]b]i]i &DV 48).	ll ))D0 #'	
 
 r4   r   c                   v   ^  \ rS rSrSrS\4U 4S jjrS
S\R                  S\	S\R                  4S jjr
S	rU =r$ )r(      z
This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
`hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
Transformer.
r   c                   > [         TU ]  5         UR                  UR                  p2UR                  UR
                  pT[        U[        R                  R                  5      (       a  UOX"4n[        U[        R                  R                  5      (       a  UOX34nUS   US   -  US   US   -  -  nX l        X0l        X@l        X`l
        [        R                  " XEX3S9U l        g )Nr   r   )kernel_sizestride)r   r    
image_sizer/   r[   r$   
isinstancecollectionsabcIterabler*   r   Conv2d
projection)r0   r   ru   r/   r[   r$   r*   r1   s          r2   r    ViTPatchEmbeddings.__init__   s    !'!2!2F4E4EJ$*$7$79K9Kk#-j+//:R:R#S#SZZdYq
#-j+//:R:R#S#SZZdYq
!!}
15*Q-:VW=:XY$$(&))L:ir4   rT   rR   r8   c                    UR                   u  p4pVX@R                  :w  a  [        SU R                   SU S35      eU(       dV  XPR                  S   :w  d  X`R                  S   :w  a2  [        SU SU SU R                  S    SU R                  S    S	3	5      eU R	                  U5      R                  S
5      R                  SS
5      nU$ )NzoMake sure that the channel dimension of the pixel values match with the one set in the configuration. Expected z	 but got .r   r   zInput image size (*z) doesn't match model (z).r<   )rC   r[   
ValueErrorru   r{   flatten	transpose)r0   rT   rR   rZ   r[   r6   r7   r5   s           r2   r`   ViTPatchEmbeddings.forward   s    2>2D2D/
&,,,!../yaI  (++u8J/J (% 9+,Adooa.@-AE  __\2::1=GG1M
r4   )ru   r[   r*   r/   r{   rb   )rd   re   rf   rg   rh   r   r    r"   rj   ri   r`   rm   rn   ro   s   @r2   r(   r(      s@    jy jELL D ]b]i]i  r4   r(   modulequerykeyvalueattention_maskscalingr.   kwargsc                    Uc  UR                  S5      S-  n[        R                  " XR                  SS5      5      U-  nUb  X-   n[        R
                  R                  USS9n[        R
                  R                  XU R                  S9n[        R                  " X5      n	U	R                  SS5      R                  5       n	X4$ )Nr:         r<   r   rA   )ptrainingr   )
r>   r"   matmulr   r   rH   softmaxr.   r   
contiguous)
r   r   r   r   r   r   r.   r   attn_weightsattn_outputs
             r2   eager_attention_forwardr      s     **R.D( <<}}Q':;gEL!#4==((2(>L==((6??([L,,|3K''1-88:K$$r4   c                      ^  \ rS rSrS\4U 4S jjrS\R                  S\\	   S\
\R                  \R                  4   4S jrSrU =r$ )	ViTSelfAttention   r   c                 0  > [         TU ]  5         UR                  UR                  -  S:w  a7  [	        US5      (       d&  [        SUR                   SUR                   S35      eXl        UR                  U l        [        UR                  UR                  -  5      U l        U R                  U R                  -  U l	        UR                  U l        U R                  S-  U l        SU l        [        R                  " UR                  U R                  UR                   S9U l        [        R                  " UR                  U R                  UR                   S9U l        [        R                  " UR                  U R                  UR                   S9U l        g )	Nr   embedding_sizezThe hidden size z4 is not a multiple of the number of attention heads r~   r   F)bias)r   r    r$   num_attention_headshasattrr   r   rk   attention_head_sizeall_head_sizeattention_probs_dropout_probdropout_probr   	is_causalr   Linearqkv_biasr   r   r   r0   r   r1   s     r2   r    ViTSelfAttention.__init__   sG    : ::a?PVXhHiHi"6#5#5"6 7334A7 
 #)#=#= #&v'9'9F<V<V'V#W !558P8PP"??//5YYv1143E3EFOO\
99V//1C1C&//ZYYv1143E3EFOO\
r4   hidden_statesr   r8   c                    UR                   S   nUSU R                  U R                  4nU R                  U5      R                  " U6 R                  SS5      nU R                  U5      R                  " U6 R                  SS5      nU R                  U5      R                  " U6 R                  SS5      n[        R                  " U R                  R                  [        5      nU" U UUUS 4U R                  U R                  U R                  (       d  SOU R                   S.UD6u  pU	R#                  5       S S U R$                  4-   nU	R'                  U5      n	X4$ )Nr   r:   r   r<           )r   r   r.   )rC   r   r   r   rJ   r   r   r   r   get_interfacer   _attn_implementationr   r   r   r   r   r>   r   rF   )r0   r   r   rZ   	new_shape	key_layervalue_layerquery_layerattention_interfacecontext_layerattention_probsnew_context_layer_shapes               r2   r`   ViTSelfAttention.forward   sO   
 #((+
D$<$<d>V>VV	HH]+00)<FFq!L	jj/44i@JJ1aPjj/44i@JJ1aP(?(M(MKK,,.E)
 *=
*
 nnLL#}}C$2C2C
*
 
*
& #0"4"4"6s";t?Q?Q>S"S%--.EF--r4   )
r   r   r   r   r   r   r   r   r   r   )rd   re   rf   rg   r   r    r"   rj   r   r   tupler`   rm   rn   ro   s   @r2   r   r      sS    ]y ](.||. +,. 
u||U\\)	*	. .r4   r   c                      ^  \ rS rSrSrS\4U 4S jjrS\R                  S\R                  S\R                  4S jr	S	r
U =r$ )
ViTSelfOutput   z
The residual connection is defined in ViTLayer instead of here (as is the case with other models), due to the
layernorm applied before each block.
r   c                    > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " UR                  5      U l        g N)	r   r    r   r   r$   denser,   r-   r.   r   s     r2   r    ViTSelfOutput.__init__  sB    YYv1163E3EF
zz&"<"<=r4   r   input_tensorr8   c                 J    U R                  U5      nU R                  U5      nU$ r   r   r.   r0   r   r   s      r2   r`   ViTSelfOutput.forward	  s$    

=1]3r4   r   )rd   re   rf   rg   rh   r   r    r"   rj   r`   rm   rn   ro   s   @r2   r   r      sB    
>y >
U\\  RWR^R^  r4   r   c                   t   ^  \ rS rSrS\4U 4S jjrS\R                  S\\	   S\R                  4S jr
SrU =r$ )	ViTAttentioni  r   c                 b   > [         TU ]  5         [        U5      U l        [	        U5      U l        g r   )r   r    r   	attentionr   outputr   s     r2   r    ViTAttention.__init__  s&    )&1#F+r4   r   r   r8   c                 R    U R                   " U40 UD6u  p4U R                  X15      nU$ r   r   r   )r0   r   r   self_attn_output_r   s         r2   r`   ViTAttention.forward  s/    
 #nn]EfE-=r4   r   )rd   re   rf   rg   r   r    r"   rj   r   r   r`   rm   rn   ro   s   @r2   r   r     sC    ,y ,
|| +, 
	 r4   r   c                   j   ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ )ViTIntermediatei  r   c                   > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        UR                  [        5      (       a  [        UR                     U l        g UR                  U l        g r   )r   r    r   r   r$   intermediate_sizer   rv   
hidden_actstrr   intermediate_act_fnr   s     r2   r    ViTIntermediate.__init__   s`    YYv1163K3KL
f''--'-f.?.?'@D$'-'8'8D$r4   r   r8   c                 J    U R                  U5      nU R                  U5      nU$ r   r   r   )r0   r   s     r2   r`   ViTIntermediate.forward(  s&    

=100?r4   r   rd   re   rf   rg   r   r    r"   rj   r`   rm   rn   ro   s   @r2   r   r     s/    9y 9U\\ ell  r4   r   c                      ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  S\R                  4S jrSr	U =r
$ )		ViTOutputi.  r   c                    > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        R                  " UR                  5      U l	        g r   )
r   r    r   r   r   r$   r   r,   r-   r.   r   s     r2   r    ViTOutput.__init__/  sB    YYv779K9KL
zz&"<"<=r4   r   r   r8   c                 R    U R                  U5      nU R                  U5      nX-   nU$ r   r   r   s      r2   r`   ViTOutput.forward4  s,    

=1]3%4r4   r   r   ro   s   @r2   r   r   .  s=    >y >
U\\  RWR^R^  r4   r   c                   x   ^  \ rS rSrSrS\4U 4S jjrS\R                  S\	\
   S\R                  4S jrS	rU =r$ )
ViTLayeri;  z?This corresponds to the Block class in the timm implementation.r   c                 j  > [         TU ]  5         UR                  U l        SU l        [	        U5      U l        [        U5      U l        [        U5      U l	        [        R                  " UR                  UR                  S9U l        [        R                  " UR                  UR                  S9U l        g )Nr   eps)r   r    chunk_size_feed_forwardseq_len_dimr   r   r   intermediater   r   r   	LayerNormr$   layer_norm_epslayernorm_beforelayernorm_afterr   s     r2   r    ViTLayer.__init__>  s    '-'E'E$%f-+F3' "V-?-?VEZEZ [!||F,>,>FDYDYZr4   r   r   r8   c                     U R                  U5      nU R                  " U40 UD6nXA-   nU R                  U5      nU R                  U5      nU R	                  XQ5      nU$ r   )r   r   r   r   r   )r0   r   r   hidden_states_normattention_outputlayer_outputs         r2   r`   ViTLayer.forwardH  sl    
 "22=A>>*<GG )8 ++M:((6 {{<?r4   )r   r   r   r   r   r   r   )rd   re   rf   rg   rh   r   r    r"   rj   r   r   r`   rm   rn   ro   s   @r2   r   r   ;  sH    I[y [|| +, 
	 r4   r   c                   `   ^  \ rS rSrS\4U 4S jjrS\R                  S\\	   S\
4S jrSrU =r$ )	
ViTEncoderi]  r   c                    > [         TU ]  5         Xl        [        R                  " [        UR                  5       Vs/ s H  n[        U5      PM     sn5      U l        SU l	        g s  snf rc   )
r   r    r   r   
ModuleListrangenum_hidden_layersr   layergradient_checkpointing)r0   r   r   r1   s      r2   r    ViTEncoder.__init__^  sR    ]]eFD\D\>]#^>]HV$4>]#^_
&+# $_s   A&r   r   r8   c                 L    U R                    H  nU" U40 UD6nM     [        US9$ )N)last_hidden_state)r   r	   )r0   r   r   layer_modules       r2   r`   ViTEncoder.forwardd  s.    
 !JJL(A&AM ' ??r4   )r   r   r   )rd   re   rf   rg   r   r    r"   rj   r   r   r	   r`   rm   rn   ro   s   @r2   r   r   ]  sD    ,y ,@||@ +,@ 
	@ @r4   r   c                       \ rS rSr% \\S'   SrSrSrSr	SS/r
SrSrSrSr\\S	.r\R&                  " 5       S
\R*                  \R,                  -  \R.                  -  4S j5       rSrg)ViTPreTrainedModelio  r   vitrT   )imageTr   r   )r   
attentionsr   c                 B   [        U[        R                  [        R                  -  5      (       ac  [        R
                  " UR                  SU R                  R                  S9  UR                  b!  [        R                  " UR                  5        gg[        U[        R                  5      (       aA  [        R                  " UR                  5        [        R                  " UR                  5        g[        U[        5      (       a  [        R
                  " UR                  SU R                  R                  S9  [        R
                  " UR                  SU R                  R                  S9  UR                   b!  [        R                  " UR                   5        ggg)zInitialize the weightsr   )meanstdN)rv   r   r   rz   inittrunc_normal_weightr   initializer_ranger   zeros_r   ones_r   r+   r%   r'   )r0   r   s     r2   _init_weights ViTPreTrainedModel._init_weights  s    fbii"))344v}}3DKK<Y<YZ{{&FKK( '--KK$JJv}}%..v99IfIfgv//ct{{?\?\]  ,F--. - /r4    N)rd   re   rf   rg   r   __annotations__base_model_prefixmain_input_nameinput_modalitiessupports_gradient_checkpointing_no_split_modules_supports_sdpa_supports_flash_attn_supports_flex_attn_supports_attention_backendr   r   _can_record_outputsr"   no_gradr   r   rz   r   r  rm   r  r4   r2   r   r   o  s    $O!&*#(*5N"&!&
 ]]_/BII		$9BLL$H / /r4   r   c                      ^  \ rS rSrSS\S\S\4U 4S jjjrS\4S jr\	\
" SS	9\   SS\R                  S
-  S\R                  S
-  S\S
-  S\\   S\4
S jj5       5       5       rSrU =r$ )ViTModeli  Fr   add_pooling_layerr   c                   > [         TU ]  U5        Xl        [        XS9U l        [        U5      U l        [        R                  " UR                  UR                  S9U l        U(       a  [        U5      OSU l        U R                  5         g)z
add_pooling_layer (bool, *optional*, defaults to `True`):
    Whether to add a pooling layer
use_mask_token (`bool`, *optional*, defaults to `False`):
    Whether to use a mask token for masked image modeling.
)r   r   N)r   r    r   r   r5   r   encoderr   r   r$   r   	layernorm	ViTPoolerpooler	post_init)r0   r   r  r   r1   s       r2   r    ViTModel.__init__  si     	 'N!&)f&8&8f>S>ST+<i'$ 	r4   r8   c                 .    U R                   R                  $ r   )r5   r)   )r0   s    r2   get_input_embeddingsViTModel.get_input_embeddings  s    ///r4   )tie_last_hidden_statesNrT   rU   rR   r   c                    Uc  [        S5      eU R                  R                  R                  R                  R
                  nUR
                  U:w  a  UR                  U5      nU R                  XUS9nU R                  U5      nUR                  nU R                  U5      nU R                  b  U R                  U5      OSn	[        XS9$ )z
bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
    Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
Nz You have to specify pixel_valuesrU   rR   )r   pooler_output)r   r5   r)   r{   r  dtypetor  r   r   r"  r
   )
r0   rT   rU   rR   r   expected_dtypeembedding_outputencoder_outputssequence_outputpooled_outputs
             r2   r`   ViTModel.forward  s     ?@@ 99DDKKQQ/'??>:L??Tl + 
 ,0<<8H+I);;..98<8OO4UY)Oiir4   )r   r5   r  r   r"  )TFNNN)rd   re   rf   rg   r   ri   r    r(   r&  r   r   r   r"   rj   rl   r   r   r
   r`   rm   rn   ro   s   @r2   r  r    s    y T Z^  &0&8 0  E2 -13704	jllT)j ))D0j #'+	j
 +,j 
$j  3  jr4   r  c                   j   ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ )r!  i  r   c                    > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        UR                     U l	        g r   )
r   r    r   r   r$   pooler_output_sizer   r   
pooler_act
activationr   s     r2   r    ViTPooler.__init__  s>    YYv1163L3LM
 !2!23r4   r   r8   c                 \    US S 2S4   nU R                  U5      nU R                  U5      nU$ )Nr   )r   r9  )r0   r   first_token_tensorr2  s       r2   r`   ViTPooler.forward  s6     +1a40

#566r4   )r9  r   r   ro   s   @r2   r!  r!    s/    4y 4
U\\ ell  r4   r!  ac  
    ViT Model with a decoder on top for masked image modeling, as proposed in [SimMIM](https://huggingface.co/papers/2111.09886).

    <Tip>

    Note that we provide a script to pre-train this model on custom data in our [examples
    directory](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-pretraining).

    </Tip>
    )custom_introc                      ^  \ rS rSrS\4U 4S jjr\\   SS\R                  S-  S\R                  S-  S\S-  S\\   S	\4
S
 jj5       5       rSrU =r$ )ViTForMaskedImageModelingi  r   c                 H  > [         TU ]  U5        [        USSS9U l        [        R
                  " [        R                  " UR                  UR                  S-  UR                  -  SS9[        R                  " UR                  5      5      U l        U R                  5         g )NFT)r  r   r<   r   )in_channelsout_channelsrs   )r   r    r  r   r   
Sequentialrz   r$   encoder_strider[   PixelShuffledecoderr#  r   s     r2   r    "ViTForMaskedImageModeling.__init__  s     FeDQ}}II"..#22A58K8KK
 OOF112
 	r4   NrT   rU   rR   r   r8   c                 <   Ubh  U R                   R                  U R                   R                  :w  a:  [        SU R                   R                   SU R                   R                   S35      eU R                  " U4UUS.UD6nUR
                  nUSS2SS24   nUR                  u  pxn	[        R                  " US-  5      =pUR                  SS	S5      R                  XyX5      nU R                  U5      nSnUGb  U R                   R                  U R                   R                  -  nUR                  S
X5      nUR                  U R                   R                  S5      R                  U R                   R                  S	5      R                  S5      R                  5       n[         R"                  R%                  XSS9nUU-  R'                  5       UR'                  5       S-   -  U R                   R(                  -  n[+        UUUR,                  UR.                  S9$ )a  
bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`):
    Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).

Examples:
```python
>>> from transformers import AutoImageProcessor, ViTForMaskedImageModeling
>>> import torch
>>> from PIL import Image
>>> import httpx
>>> from io import BytesIO

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> with httpx.stream("GET", url) as response:
...     image = Image.open(BytesIO(response.read()))

>>> image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k")
>>> model = ViTForMaskedImageModeling.from_pretrained("google/vit-base-patch16-224-in21k")

>>> num_patches = (model.config.image_size // model.config.patch_size) ** 2
>>> pixel_values = image_processor(images=image, return_tensors="pt").pixel_values
>>> # create random boolean mask of shape (batch_size, num_patches)
>>> bool_masked_pos = torch.randint(low=0, high=2, size=(1, num_patches)).bool()

>>> outputs = model(pixel_values, bool_masked_pos=bool_masked_pos)
>>> loss, reconstructed_pixel_values = outputs.loss, outputs.reconstruction
>>> list(reconstructed_pixel_values.shape)
[1, 3, 224, 224]
```NzWhen `bool_masked_pos` is provided, `patch_size` must be equal to `encoder_stride` to ensure that the reconstructed image has the same dimensions as the input. Got `patch_size` = z and `encoder_stride` = r~   r*  r   r;   r   r<   r:   none)	reductiongh㈵>)lossreconstructionr   r  )r   r/   rE  r   r   r   rC   mathfloorrG   rF   rG  ru   repeat_interleaverX   r   r   rH   l1_losssumr[   r   r   r  )r0   rT   rU   rR   r   outputsr1  rZ   sequence_lengthr[   r6   r7   reconstructed_pixel_valuesmasked_im_lossr>   r^   reconstruction_losss                    r2   r`   !ViTForMaskedImageModeling.forward  s   N &DKK,B,BdkkF`F`,`&&*kk&<&<%==UVZVaVaVpVpUqqrt  /3hh/
+%=/
 	/
 "33 *!QR%04C4I4I1
\OS$899)11!Q:BB:]ck &*\\/%B"&;;))T[[-C-CCD-55b$EO11$++2H2H!L""4;;#9#91=1	  #%--"7"7lr"7"s1D8==?488:PTCTUX\XcXcXpXppN(5!//))	
 	
r4   )rG  r   r4  )rd   re   rf   rg   r   r    r   r   r"   rj   rl   ri   r   r   r   r`   rm   rn   ro   s   @r2   r@  r@    s    y "  -13704	P
llT)P
 ))D0P
 #'+	P

 +,P
 
#P
  P
r4   r@  a  
    ViT Model transformer with an image classification head on top (a linear layer on top of the final hidden state of
    the [CLS] token) e.g. for ImageNet.

    <Tip>

        Note that it's possible to fine-tune ViT on higher resolution images than the ones it has been trained on, by
        setting `interpolate_pos_encoding` to `True` in the forward of the model. This will interpolate the pre-trained
        position embeddings to the higher resolution.

    </Tip>
    c                      ^  \ rS rSrS\4U 4S jjr\\   SS\R                  S-  S\R                  S-  S\
S-  S\\   S	\4
S
 jj5       5       rSrU =r$ )ViTForImageClassificationiO  r   c                 .  > [         TU ]  U5        UR                  U l        [        USS9U l        UR                  S:  a+  [
        R                  " UR                  UR                  5      O[
        R                  " 5       U l	        U R                  5         g )NF)r  r   )r   r    
num_labelsr  r   r   r   r$   Identity
classifierr#  r   s     r2   r    "ViTForImageClassification.__init__^  ss      ++Fe< OUN_N_bcNc"))F$6$68I8IJikititiv 	r4   NrT   labelsrR   r   r8   c                    U R                   " U4SU0UD6nUR                  nUSS2SSS24   nU R                  U5      nSn	Ub  U R                  " X(U R                  40 UD6n	[        U	UUR                  UR                  S9$ )ab  
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
rR   Nr   )rL  logitsr   r  )r   r   r^  loss_functionr   r   r   r  )
r0   rT   r`  rR   r   rS  r1  r2  rb  rL  s
             r2   r`   !ViTForImageClassification.forwardj  s      /3hh/
%=/
 /
 "33'1a0/%%fdkkLVLD$!//))	
 	
r4   )r^  r\  r   r4  )rd   re   rf   rg   r   r    r   r   r"   rj   ri   r   r   r   r`   rm   rn   ro   s   @r2   rZ  rZ  O  s    
y 
  -1&*04	!
llT)!
 t#!
 #'+	!

 +,!
 
!
  !
r4   rZ  )rZ  r@  r  r   )Nr   ):rh   collections.abcrw   rN  r   r"   r    r   r  activationsr   modeling_layersr   modeling_outputsr	   r
   r   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   utils.genericr   r   utils.output_capturingr   configuration_vitr   
get_loggerrd   loggerModuler   r(   rj   floatr   r   r   r   r   r   r   r   r   r  r!  r@  rZ  __all__r  r4   r2   <module>ru     s      $   & ! 9  G & K K I 5 ( 
		H	%UBII Up$ $\ !%II%<<% 
% <<	%
 LL4'% T\% % '(%84.ryy 4.nBII "299  bii 
		 
) D@ @$ / / /B 8j! 8j 8jv		  	d
 2 d
d
N 0
 2 0
0
f gr4   