
    Z jA              	          S r SSKrSSKrSSKJr  SSKJr  SSKJr  SSK	J
r
  SSKJr  SS	KJrJrJrJr  SS
KJr  SSKJrJrJr  SSKJr  \R4                  " \5      rS=S\S\S\S-  S\4S jjr " S S\R>                  5      r  " S S\R>                  5      r! " S S\R>                  5      r" " S S\R>                  5      r# " S S\R>                  5      r$ " S S\R>                  5      r% " S S \R>                  5      r& " S! S"\R>                  5      r' " S# S$\R>                  5      r( " S% S&\R>                  5      r) " S' S(\5      r* " S) S*\R>                  5      r+\ " S+ S,\5      5       r,\ " S- S.\,5      5       r-\" S/S09 " S1 S2\,5      5       r. " S3 S4\R>                  5      r/ " S5 S6\R>                  5      r0 " S7 S8\R>                  5      r1\" S9S09 " S: S;\,5      5       r2/ S<Qr3g)>zPyTorch MobileViT model.    N)nn)CrossEntropyLoss   )initialization)ACT2FN)GradientCheckpointingLayer)BaseModelOutputWithNoAttention(BaseModelOutputWithPoolingAndNoAttention$ImageClassifierOutputWithNoAttentionSemanticSegmenterOutput)PreTrainedModel)auto_docstringlogging	torch_int   )MobileViTConfigvaluedivisor	min_valuereturnc                 |    Uc  Un[        U[        XS-  -   5      U-  U-  5      nUSU -  :  a  X1-  n[        U5      $ )zM
Ensure that all layers have a channel count that is divisible by `divisor`.
   g?)maxint)r   r   r   	new_values       ځ/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/mobilevit/modeling_mobilevit.pymake_divisibler   )   sO     	Is5Q;#677BWLMI3;	y>    c                      ^  \ rS rSr      SS\S\S\S\S\S\S\S	\S
\S\\-  SS4U 4S jjjrS\	R                  S\	R                  4S jrSrU =r$ )MobileViTConvLayer6   configin_channelsout_channelskernel_sizestridegroupsbiasdilationuse_normalizationuse_activationr   Nc                 D  > [         TU ]  5         [        US-
  S-  5      U-  nX&-  S:w  a  [        SU SU S35      eX6-  S:w  a  [        SU SU S35      e[        R
                  " UUUUUUUUSS	9	U l        U	(       a  [        R                  " US
SSSS9U l        OS U l        U
(       an  [        U
[        5      (       a  [        U
   U l        g [        UR                  [        5      (       a  [        UR                     U l        g UR                  U l        g S U l        g )Nr   r   r   zInput channels (z) are not divisible by z groups.zOutput channels (zeros)	r#   r$   r%   r&   paddingr)   r'   r(   padding_modegh㈵>g?T)num_featuresepsmomentumaffinetrack_running_stats)super__init__r   
ValueErrorr   Conv2dconvolutionBatchNorm2dnormalization
isinstancestrr   
activation
hidden_act)selfr"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r.   	__class__s               r   r6   MobileViTConvLayer.__init__7   s,    	{Q!+,x71$/}<STZS[[cdee A%0>UV\U]]efgg99#%# 

 !#)$("D "&D.#.."("8F--s33"():):";"("3"3"DOr   featuresc                     U R                  U5      nU R                  b  U R                  U5      nU R                  b  U R                  U5      nU$ N)r9   r;   r>   )r@   rC   s     r   forwardMobileViTConvLayer.forwardm   sK    ##H-)))(3H??&x0Hr   )r>   r9   r;   )r   r   Fr   TT)__name__
__module____qualname____firstlineno__r   r   boolr=   r6   torchTensorrF   __static_attributes____classcell__rA   s   @r   r    r    6   s     "&%)4#4# 4# 	4#
 4# 4# 4# 4# 4#  4# s
4# 
4# 4#l   r   r    c                      ^  \ rS rSrSr SS\S\S\S\S\SS	4U 4S
 jjjrS\R                  S\R                  4S jr
SrU =r$ )MobileViTInvertedResidualv   zQ
Inverted residual block (MobileNetv2): https://huggingface.co/papers/1801.04381
r"   r#   r$   r&   r)   r   Nc           
      6  > [         TU ]  5         [        [        [	        X!R
                  -  5      5      S5      nUS;  a  [        SU S35      eUS:H  =(       a    X#:H  U l        [        XUSS9U l	        [        UUUSUUUS9U l
        [        UUUSS	S
9U l        g )N   )r   r   zInvalid stride .r   r#   r$   r%   r   )r#   r$   r%   r&   r'   r)   Fr#   r$   r%   r+   )r5   r6   r   r   roundexpand_ratior7   use_residualr    
expand_1x1conv_3x3
reduce_1x1)r@   r"   r#   r$   r&   r)   expanded_channelsrA   s          r   r6   "MobileViTInvertedResidual.__init__{   s     	*3u[CVCV5V/W+XZ[\vha899#q[K{/J,:KYZ
 +)*$
 -)% 
r   rC   c                     UnU R                  U5      nU R                  U5      nU R                  U5      nU R                  (       a  X!-   $ U$ rE   )r]   r^   r_   r\   )r@   rC   residuals      r   rF   !MobileViTInvertedResidual.forward   sG    ??8,==*??8,&*&7&7x"EXEr   )r^   r]   r_   r\   r   )rH   rI   rJ   rK   __doc__r   r   r6   rM   rN   rF   rO   rP   rQ   s   @r   rS   rS   v   sn    
 jk
%
47
GJ
TW
cf
	
 
BF F F Fr   rS   c                      ^  \ rS rSr SS\S\S\S\S\SS4U 4S	 jjjrS
\R                  S\R                  4S jr	Sr
U =r$ )MobileViTMobileNetLayer   r"   r#   r$   r&   
num_stagesr   Nc                    > [         TU ]  5         [        R                  " 5       U l        [        U5       H4  n[        UUUUS:X  a  UOSS9nU R                  R                  U5        UnM6     g )Nr   r   )r#   r$   r&   )r5   r6   r   
ModuleListlayerrangerS   append)	r@   r"   r#   r$   r&   rj   irm   rA   s	           r   r6    MobileViTMobileNetLayer.__init__   sc     	]]_
z"A-')!"avQ	E JJe$&K #r   rC   c                 <    U R                    H  nU" U5      nM     U$ rE   rm   )r@   rC   layer_modules      r   rF   MobileViTMobileNetLayer.forward   s     JJL#H-H 'r   rs   )r   r   rH   rI   rJ   rK   r   r   r6   rM   rN   rF   rO   rP   rQ   s   @r   rh   rh      s`    op'%'47'GJ'TW'il'	' '    r   rh   c                   r   ^  \ rS rSrS\S\SS4U 4S jjrS\R                  S\R                  4S jr	S	r
U =r$ )
MobileViTSelfAttention   r"   hidden_sizer   Nc                 r  > [         TU ]  5         X!R                  -  S:w  a  [        SU SUR                   S35      eUR                  U l        [	        X!R                  -  5      U l        U R                  U R
                  -  U l        [        R                  " X R                  UR                  S9U l
        [        R                  " X R                  UR                  S9U l        [        R                  " X R                  UR                  S9U l        [        R                  " UR                  5      U l        g )Nr   zThe hidden size z4 is not a multiple of the number of attention heads rW   )r(   )r5   r6   num_attention_headsr7   r   attention_head_sizeall_head_sizer   Linearqkv_biasquerykeyr   Dropoutattention_probs_dropout_probdropoutr@   r"   rz   rA   s      r   r6   MobileViTSelfAttention.__init__   s    333q8";- 0334A7 
 $*#=#= #&{5O5O'O#P !558P8PPYY{,>,>V__U
99[*<*<6??SYY{,>,>V__U
zz&"E"EFr   hidden_statesc                 *   UR                   S S n/ UQSPU R                  P7nU R                  U5      R                  U5      R	                  SS5      nU R                  U5      R                  U5      R	                  SS5      nU R                  U5      R                  U5      R	                  SS5      n[        R                  " XER	                  SS5      5      nU[        R                  " U R                  5      -  n[        R                  R                  USS9nU R                  U5      n[        R                  " X5      n	U	R                  SSSS5      R!                  5       n	U	R#                  5       S S U R$                  4-   n
U	R                  " U
6 n	U	$ )Nr   r   dimr   r   )shaper}   r   view	transposer   r   rM   matmulmathsqrtr   
functionalsoftmaxr   permute
contiguoussizer~   )r@   r   input_shapehidden_shapequery_layer	key_layervalue_layerattention_scoresattention_probscontext_layernew_context_layer_shapes              r   rF   MobileViTSelfAttention.forward   sn   #))#2.CCbC$*B*BCjj/44\BLLQPQRHH]+00>HHAN	jj/44\BLLQPQR !<<5H5HR5PQ+dii8P8P.QQ --//0@b/I ,,7_B%--aAq9DDF"/"4"4"6s";t?Q?Q>S"S%**,CDr   )r~   r}   r   r   r|   r   r   rv   rQ   s   @r   rx   rx      sA    G GS GT G&U\\ ell  r   rx   c                   r   ^  \ rS rSrS\S\SS4U 4S jjrS\R                  S\R                  4S jr	S	r
U =r$ )
MobileViTSelfOutput   r"   rz   r   Nc                    > [         TU ]  5         [        R                  " X"5      U l        [        R
                  " UR                  5      U l        g rE   r5   r6   r   r   denser   hidden_dropout_probr   r   s      r   r6   MobileViTSelfOutput.__init__   s4    YY{8
zz&"<"<=r   r   c                 J    U R                  U5      nU R                  U5      nU$ rE   r   r   r@   r   s     r   rF   MobileViTSelfOutput.forward   s$    

=1]3r   r   rv   rQ   s   @r   r   r      s=    > >S >T >
U\\ ell  r   r   c                   r   ^  \ rS rSrS\S\SS4U 4S jjrS\R                  S\R                  4S jr	S	r
U =r$ )
MobileViTAttention   r"   rz   r   Nc                 b   > [         TU ]  5         [        X5      U l        [	        X5      U l        g rE   )r5   r6   rx   	attentionr   outputr   s      r   r6   MobileViTAttention.__init__   s&    /D)&>r   r   c                 J    U R                  U5      nU R                  U5      nU$ rE   r   r   )r@   r   self_outputsattention_outputs       r   rF   MobileViTAttention.forward   s%    ~~m4;;|4r   r   rv   rQ   s   @r   r   r      s=    ? ?S ?T ?
 U\\  ell    r   r   c                   v   ^  \ rS rSrS\S\S\SS4U 4S jjrS\R                  S\R                  4S	 jr	S
r
U =r$ )MobileViTIntermediatei  r"   rz   intermediate_sizer   Nc                    > [         TU ]  5         [        R                  " X#5      U l        [        UR                  [        5      (       a  [        UR                     U l	        g UR                  U l	        g rE   )
r5   r6   r   r   r   r<   r?   r=   r   intermediate_act_fnr@   r"   rz   r   rA   s       r   r6   MobileViTIntermediate.__init__  sR    YY{>
f''--'-f.?.?'@D$'-'8'8D$r   r   c                 J    U R                  U5      nU R                  U5      nU$ rE   r   r   r   s     r   rF   MobileViTIntermediate.forward  s&    

=100?r   r   rv   rQ   s   @r   r   r     sF    9 9S 9UX 9]a 9U\\ ell  r   r   c                      ^  \ rS rSrS\S\S\SS4U 4S jjrS\R                  S	\R                  S\R                  4S
 jr	Sr
U =r$ )MobileViTOutputi  r"   rz   r   r   Nc                    > [         TU ]  5         [        R                  " X25      U l        [        R
                  " UR                  5      U l        g rE   r   r   s       r   r6   MobileViTOutput.__init__  s5    YY0>
zz&"<"<=r   r   input_tensorc                 R    U R                  U5      nU R                  U5      nX-   nU$ rE   r   )r@   r   r   s      r   rF   MobileViTOutput.forward  s,    

=1]3%4r   r   rv   rQ   s   @r   r   r     sT    > >S >UX >]a >
U\\  RWR^R^  r   r   c                   v   ^  \ rS rSrS\S\S\SS4U 4S jjrS\R                  S\R                  4S	 jr	S
r
U =r$ )MobileViTTransformerLayeri  r"   rz   r   r   Nc                   > [         TU ]  5         [        X5      U l        [	        XU5      U l        [        XU5      U l        [        R                  " X!R                  S9U l        [        R                  " X!R                  S9U l        g )Nr1   )r5   r6   r   r   r   intermediater   r   r   	LayerNormlayer_norm_epslayernorm_beforelayernorm_afterr   s       r   r6   "MobileViTTransformerLayer.__init__   sg    +F@1&GXY%f;LM "[>S>S T!||K=R=RSr   r   c                     U R                  U R                  U5      5      nX!-   nU R                  U5      nU R                  U5      nU R	                  X15      nU$ rE   )r   r   r   r   r   )r@   r   r   layer_outputs       r   rF   !MobileViTTransformerLayer.forward(  sX    >>$*?*?*NO(8++M:((6{{<?r   )r   r   r   r   r   rv   rQ   s   @r   r   r     sK    T TS TUX T]a TU\\ ell  r   r   c                   v   ^  \ rS rSrS\S\S\SS4U 4S jjrS\R                  S\R                  4S	 jr	S
r
U =r$ )MobileViTTransformeri2  r"   rz   rj   r   Nc           	         > [         TU ]  5         [        R                  " 5       U l        [        U5       H>  n[        UU[        X!R                  -  5      S9nU R                  R                  U5        M@     g )N)rz   r   )
r5   r6   r   rl   rm   rn   r   r   	mlp_ratioro   )r@   r"   rz   rj   _transformer_layerrA   s         r   r6   MobileViTTransformer.__init__3  sa    ]]_
z"A 9'"%k4D4D&D"E!
 JJ/0 #r   r   c                 <    U R                    H  nU" U5      nM     U$ rE   rs   )r@   r   rt   s      r   rF   MobileViTTransformer.forward?  s      JJL(7M 'r   rs   rv   rQ   s   @r   r   r   2  sE    
1 
1S 
1c 
1VZ 
1U\\ ell  r   r   c                     ^  \ rS rSrSr SS\S\S\S\S\S\S	\S
S4U 4S jjjrS\R                  S
\
\R                  \4   4S jrS\R                  S\S
\R                  4S jrS\R                  S
\R                  4S jrSrU =r$ )MobileViTLayeriE  z;
MobileViT block: https://huggingface.co/papers/2110.02178
r"   r#   r$   r&   rz   rj   r)   r   Nc           	        > [         TU ]  5         UR                  U l        UR                  U l        US:X  a(  [        UUUUS:X  a  UOSUS:  a  US-  OSS9U l        UnOS U l        [        UUUUR                  S9U l	        [        UUUSSSS9U l
        [        UUUS9U l        [        R                  " XQR                  S9U l        [        XUSS9U l        [        USU-  X!R                  S9U l        g )	Nr   r   )r#   r$   r&   r)   rX   F)r#   r$   r%   r*   r+   )rz   rj   r   )r5   r6   
patch_sizepatch_widthpatch_heightrS   downsampling_layerr    conv_kernel_sizeconv_kxkconv_1x1r   transformerr   r   r   	layernormconv_projectionfusion)	r@   r"   r#   r$   r&   rz   rj   r)   rA   s	           r   r6   MobileViTLayer.__init__J  s    	!,,"--Q;&?')!)QvA*2Q,QA'D# 'K&*D#*#$//	
 +#$# 
 0#!
 k7L7LM1+ST 
 )KkWnWn
r   rC   c                 n   U R                   U R                  p2[        X#-  5      nUR                  u  pVpx[        R
                  R                  5       (       a$  [        [        R                  " Xs-  5      U-  5      O#[        [        R                  " Xs-  5      U-  5      n	[        R
                  R                  5       (       a$  [        [        R                  " X-  5      U-  5      O#[        [        R                  " X-  5      U-  5      n
SnX:w  d  X:w  a#  [        R                  R                  XU
4SSS9nSnX-  nX-  nX-  nUR                  XV-  U-  X<U5      nUR                  SS5      nUR                  XVX5      nUR                  SS5      nUR                  XT-  US5      nXx4UUUUUUS	.nUU4$ )
NFbilinearr   modealign_cornersTr   r   r   r   )	orig_size
batch_sizechannelsinterpolatenum_patchesnum_patches_widthnum_patches_height)r   r   r   r   rM   jit
is_tracingr   ceilr   r   r   r   reshaper   )r@   rC   r   r   
patch_arear   r   orig_height
orig_width
new_height	new_widthr   num_patch_widthnum_patch_heightr   patches	info_dicts                    r   	unfoldingMobileViTLayer.unfolding  s   $($4$4d6G6G\34
8@5
k yy##%% ejj!;<|KLTYY{9:\IJ 	 yy##%% ejj!9:[HITYYz78;FG 	 "j&?}}00I6ZW\ 1 H K $2%5&8 ""!$44lU`
 ##Aq)//*P##Aq)//*"9;K &2$ &&!0"2
	 	!!r   r  r	  c                    U R                   U R                  pC[        X4-  5      nUS   nUS   nUS   nUS   n	US   n
UR                  5       R	                  XeUS5      nUR                  SS5      nUR                  Xg-  U	-  XU5      nUR                  SS	5      nUR                  XgX-  X-  5      nUS
   (       a"  [        R                  R                  XS   SSS9nU$ )Nr   r   r   r   r   r   r   r   r   r   r   r   Fr   )
r   r   r   r   r   r   r   r   r   r   )r@   r  r	  r   r   r  r   r   r   r  r  rC   s               r   foldingMobileViTLayer.folding  s   $($4$4d6G6G\34
|,
Z(.$%9:#$78 %%',,Z[RTU%%a+##!$44oU`
 %%a+##"2"A?C`
 ]#}}005JV[ 1 H r   c                    U R                   (       a  U R                  U5      nUnU R                  U5      nU R                  U5      nU R                  U5      u  p4U R	                  U5      nU R                  U5      nU R                  X45      nU R                  U5      nU R                  [        R                  " X!4SS95      nU$ Nr   r   )r   r   r   r
  r   r   r  r   r   rM   cat)r@   rC   rc   r  r	  s        r   rF   MobileViTLayer.forward  s    ""..x8H ==*==* "^^H5 ""7+..) <<3''1;;uyy()=1EFr   )	r   r   r   r   r   r   r   r   r   re   )rH   rI   rJ   rK   rf   r   r   r6   rM   rN   tupledictr
  r  rF   rO   rP   rQ   s   @r   r   r   E  s     8
8
 8
 	8

 8
 8
 8
 8
 
8
 8
t1"%,, 1"5t9K3L 1"fu||   :   r   r   c            
       p   ^  \ rS rSrS\SS4U 4S jjr  SS\R                  S\S\S\	\
-  4S	 jjrS
rU =r$ )MobileViTEncoderi  r"   r   Nc           
        > [         T
U ]  5         Xl        [        R                  " 5       U l        SU l        S=p#UR                  S:X  a  SnSnOUR                  S:X  a  SnSn[        UUR                  S   UR                  S   SSS9nU R
                  R                  U5        [        UUR                  S   UR                  S   SS	S9nU R
                  R                  U5        [        UUR                  S   UR                  S	   SUR                  S   SS
9nU R
                  R                  U5        U(       a  US-  n[        UUR                  S	   UR                  S   SUR                  S   SUS9nU R
                  R                  U5        U(       a  US-  n[        UUR                  S   UR                  S   SUR                  S   S	US9n	U R
                  R                  U	5        g )NFrV   T   r   r   )r#   r$   r&   rj   r   r   )r#   r$   r&   rz   rj      )r#   r$   r&   rz   rj   r)      )r5   r6   r"   r   rl   rm   gradient_checkpointingoutput_striderh   neck_hidden_sizesro   r   hidden_sizes)r@   r"   dilate_layer_4dilate_layer_5r)   layer_1layer_2layer_3layer_4layer_5rA   s             r   r6   MobileViTEncoder.__init__  s   ]]_
&+# +0/1$!N!N!!R'!N)00311!4
 	

'")00311!4
 	

'" 00311!4++A.
 	

'"MH 00311!4++A.
 	

'"MH 00311!4++A.
 	

'"r   r   output_hidden_statesreturn_dictc                     U(       a  SOS n[        U R                  5       H  u  pVU" U5      nU(       d  M  XA4-   nM     U(       d  [        S X4 5       5      $ [        XS9$ )N c              3   .   #    U  H  oc  M  Uv   M     g 7frE   r*  ).0vs     r   	<genexpr>+MobileViTEncoder.forward.<locals>.<genexpr>H  s     X$Fq$Fs   	)last_hidden_stater   )	enumeraterm   r  r	   )r@   r   r'  r(  all_hidden_statesrp   rt   s          r   rF   MobileViTEncoder.forward9  sc     #7BD(4OA(7M##$58H$H!	  5 X]$FXXX-oor   )r"   r  rm   )FT)rH   rI   rJ   rK   r   r6   rM   rN   rL   r  r	   rF   rO   rP   rQ   s   @r   r  r    sb    H# H#4 H#Z &+ 	p||p #p 	p
 
/	/p pr   r  c                       \ rS rSr% \\S'   SrSrSrSr	S/r
\R                  " 5       S\R                  S	S
4S j5       rSrg
)MobileViTPreTrainedModeliM  r"   	mobilevitpixel_values)imageTr   moduler   Nc                    [        U[        R                  [        R                  [        R                  45      (       a  [
        R                  " UR                  SU R                  R                  S9  UR                  b   [
        R                  " UR                  5        [        USS5      ba  [
        R                  " UR                  5        [
        R                  " UR                  5        [
        R                  " UR                   5        gg[        U[        R"                  5      (       aA  [
        R                  " UR                  5        [
        R                  " UR                  5        gg)zInitialize the weightsg        )meanstdNrunning_mean)r<   r   r   r8   r:   initnormal_weightr"   initializer_ranger(   zeros_getattrr=  ones_running_varnum_batches_trackedr   )r@   r9  s     r   _init_weights&MobileViTPreTrainedModel._init_weightsV  s     fryy"))R^^DEELLSdkk6S6ST{{&FKK(v~t4@F//0

6--.F667 A --KK$JJv}}% .r   r*  )rH   rI   rJ   rK   r   __annotations__base_model_prefixmain_input_nameinput_modalitiessupports_gradient_checkpointing_no_split_modulesrM   no_gradr   ModulerG  rO   r*  r   r   r5  r5  M  sR    #$O!&*#)*
]]_&BII &$ & &r   r5  c                      ^  \ rS rSrSS\S\4U 4S jjjr\   SS\R                  S-  S\S-  S\S-  S	\
\-  4S
 jj5       rSrU =r$ )MobileViTModelif  r"   expand_outputc                 F  > [         TU ]  U5        Xl        X l        [	        UUR
                  UR                  S   SSS9U l        [        U5      U l	        U R                  (       a+  [	        UUR                  S   UR                  S   SS9U l
        U R                  5         g	)
a%  
expand_output (`bool`, *optional*, defaults to `True`):
    Whether to expand the output of the model using a 1x1 convolution. If `True`, the model will apply an additional
    1x1 convolution to expand the output channels from `config.neck_hidden_sizes[5]` to `config.neck_hidden_sizes[6]`.
r   r   r   )r#   r$   r%   r&   r     r   rX   N)r5   r6   r"   rS  r    num_channelsr  	conv_stemr  encoderconv_1x1_exp	post_init)r@   r"   rS  rA   s      r   r6   MobileViTModel.__init__h  s     	 *+++11!4
 (/ 2"44Q7#55a8	!D 	r   Nr7  r'  r(  r   c                    Ub  UOU R                   R                  nUb  UOU R                   R                  nUc  [        S5      eU R	                  U5      nU R                  UUUS9nU R                  (       a-  U R                  US   5      n[        R                  " USS/SS9nOUS   nS nU(       d  Ub  Xx4OU4n	XSS  -   $ [        UUUR                  S	9$ )
Nz You have to specify pixel_valuesr'  r(  r   r   r   F)r   keepdimr   )r0  pooler_outputr   )r"   r'  r(  r7   rW  rX  rS  rY  rM   r;  r
   r   )
r@   r7  r'  r(  kwargsembedding_outputencoder_outputsr0  pooled_outputr   s
             r   rF   MobileViTModel.forward  s    %9$D $++JjJj 	 &1%<k$++BYBY?@@>>,7,,!5# ' 
  $ 1 1/!2D E "JJ'8r2hPUVM / 2 M;H;T'7[lZnFAB///7/')77
 	
r   )r"   rY  rW  rX  rS  )T)NNN)rH   rI   rJ   rK   r   rL   r6   r   rM   rN   r  r
   rF   rO   rP   rQ   s   @r   rR  rR  f  sw     t  >  -1,0#'	(
llT)(
 #Tk(
 D[	(
 
9	9(
 (
r   rR  z
    MobileViT model with an image classification head on top (a linear layer on top of the pooled features), e.g. for
    ImageNet.
    )custom_introc                      ^  \ rS rSrS\SS4U 4S jjr\    SS\R                  S-  S\	S-  S\R                  S-  S	\	S-  S\
\-  4
S
 jj5       rSrU =r$ )MobileViTForImageClassificationi  r"   r   Nc                 ~  > [         TU ]  U5        UR                  U l        [        U5      U l        [
        R                  " UR                  SS9U l        UR                  S:  a.  [
        R                  " UR                  S   UR                  5      O[
        R                  " 5       U l        U R                  5         g )NT)inplacer   r   )r5   r6   
num_labelsrR  r6  r   r   classifier_dropout_probr   r   r  Identity
classifierrZ  r@   r"   rA   s     r   r6   (MobileViTForImageClassification.__init__  s      ++'/ zz&"@"@$OJPJ[J[^_J_BIIf..r2F4E4EFegepeper 	
 	r   r7  r'  labelsr(  c                 j   Ub  UOU R                   R                  nU R                  XUS9nU(       a  UR                  OUS   nU R	                  U R                  U5      5      nSn	Ub  U R                  X8U R                   5      n	U(       d  U4USS -   n
U	b  U	4U
-   $ U
$ [        U	UUR                  S9$ )ab  
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss). If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
Nr]  r   r   )losslogitsr   )	r"   r(  r6  r_  rm  r   loss_functionr   r   )r@   r7  r'  rp  r(  r`  outputsrc  rs  rr  r   s              r   rF   'MobileViTForImageClassification.forward  s     &1%<k$++BYBY..fq.r1<--'!*m!<=%%fdkkBDY,F)-)9TGf$EvE3!//
 	
r   )rm  r   r6  rj  NNNN)rH   rI   rJ   rK   r   r6   r   rM   rN   rL   r  r   rF   rO   rP   rQ   s   @r   rg  rg    s     4   -1,0&*#'"
llT)"
 #Tk"
 t#	"

 D["
 
5	5"
 "
r   rg  c                   v   ^  \ rS rSrS\S\S\SS4U 4S jjrS\R                  S\R                  4S	 jr	S
r
U =r$ )MobileViTASPPPoolingi  r"   r#   r$   r   Nc           
      |   > [         TU ]  5         [        R                  " SS9U l        [        UUUSSSSS9U l        g )Nr   )output_sizeTrelu)r#   r$   r%   r&   r*   r+   )r5   r6   r   AdaptiveAvgPool2dglobal_poolr    r   )r@   r"   r#   r$   rA   s       r   r6   MobileViTASPPPooling.__init__  sB    //A>*#%"!
r   rC   c                     UR                   SS  nU R                  U5      nU R                  U5      n[        R                  R                  XSSS9nU$ )Nr   r   Fr   )r   r~  r   r   r   r   )r@   rC   spatial_sizes      r   rF   MobileViTASPPPooling.forward  sQ    ~~bc*##H-==*==,,Xzin,or   )r   r~  rv   rQ   s   @r   ry  ry    sF    
 
S 
PS 
X\ 
   r   ry  c                   r   ^  \ rS rSrSrS\SS4U 4S jjrS\R                  S\R                  4S jr	S	r
U =r$ )
MobileViTASPPi  z{
ASPP module defined in DeepLab papers: https://huggingface.co/papers/1606.00915, https://huggingface.co/papers/1706.05587
r"   r   Nc                 p  > [         TU ]  5         UR                  S   nUR                  n[	        UR
                  5      S:w  a  [        S5      e[        R                  " 5       U l	        [        UUUSSS9nU R                  R                  U5        U R                  R                  UR
                   Vs/ s H  n[        UUUSUSS9PM     sn5        [        XU5      nU R                  R                  U5        [        USU-  USSS9U l        [        R                  " UR                   S	9U l        g s  snf )
Nr   r   z"Expected 3 values for atrous_ratesr   r|  rY   )r#   r$   r%   r)   r+   r  )p)r5   r6   r  aspp_out_channelslenatrous_ratesr7   r   rl   convsr    ro   extendry  projectr   aspp_dropout_probr   )r@   r"   r#   r$   in_projectionrate
pool_layerrA   s          r   r6   MobileViTASPP.__init__  s-   ..r2//v""#q(ABB]]_
*#%!
 	

-(

 #//
 0D # +!- !!#) 0
	
 *&|L


*%)L 0|YZkq
 zzF$<$<=)
s   4D3rC   c                     / nU R                    H  nUR                  U" U5      5        M     [        R                  " USS9nU R	                  U5      nU R                  U5      nU$ r  )r  ro   rM   r  r  r   )r@   rC   pyramidconvpooled_featuress        r   rF   MobileViTASPP.forward7  sW    JJDNN4>* ))G+,,w/,,7r   )r  r   r  rH   rI   rJ   rK   rf   r   r6   rM   rN   rF   rO   rP   rQ   s   @r   r  r    s<    )> )>4 )>V   r   r  c                   r   ^  \ rS rSrSrS\SS4U 4S jjrS\R                  S\R                  4S jr	S	r
U =r$ )
MobileViTDeepLabV3iB  zB
DeepLabv3 architecture: https://huggingface.co/papers/1706.05587
r"   r   Nc           
         > [         TU ]  5         [        U5      U l        [        R
                  " UR                  5      U l        [        UUR                  UR                  SSSSS9U l        g )Nr   FT)r#   r$   r%   r*   r+   r(   )r5   r6   r  asppr   	Dropout2drk  r   r    r  rj  rm  rn  s     r   r6   MobileViTDeepLabV3.__init__G  s]    !&)	||F$B$BC,00**# 
r   r   c                 r    U R                  US   5      nU R                  U5      nU R                  U5      nU$ )Nr   )r  r   rm  )r@   r   rC   s      r   rF   MobileViTDeepLabV3.forwardW  s6    99]2./<<)??8,r   )r  rm  r   r  rQ   s   @r   r  r  B  s;    
 
4 
 U\\ ell  r   r  zX
    MobileViT model with a semantic segmentation head on top, e.g. for Pascal VOC.
    c                      ^  \ rS rSrS\SS4U 4S jjr\    SS\R                  S-  S\R                  S-  S\	S-  S	\	S-  S\
\-  4
S
 jj5       rSrU =r$ ) MobileViTForSemanticSegmentationi^  r"   r   Nc                    > [         TU ]  U5        UR                  U l        [        USS9U l        [        U5      U l        U R                  5         g )NF)rS  )r5   r6   rj  rR  r6  r  segmentation_headrZ  rn  s     r   r6   )MobileViTForSemanticSegmentation.__init__d  sD      ++'eD!3F!; 	r   r7  rp  r'  r(  c                 z   Ub  UOU R                   R                  nUb  UOU R                   R                  nUb%  U R                   R                  S:X  a  [	        S5      eU R                  USUS9nU(       a  UR                  OUS   nU R                  U5      nSn	UbQ  [        R                  R                  XR                  SS SSS	9n
[        U R                   R                  S
9nU" X5      n	U(       d%  U(       a
  U4USS -   nO	U4USS -   nU	b  U	4U-   $ U$ [        U	UU(       a  UR                  SS9$ SSS9$ )a$  
labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
    Ground truth semantic segmentation maps for computing the loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels > 1`, a classification loss is computed (Cross-Entropy).

Examples:

```python
>>> import httpx
>>> from io import BytesIO
>>> import torch
>>> from PIL import Image
>>> from transformers import AutoImageProcessor, MobileViTForSemanticSegmentation

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> with httpx.stream("GET", url) as response:
...     image = Image.open(BytesIO(response.read()))

>>> image_processor = AutoImageProcessor.from_pretrained("apple/deeplabv3-mobilevit-small")
>>> model = MobileViTForSemanticSegmentation.from_pretrained("apple/deeplabv3-mobilevit-small")

>>> inputs = image_processor(images=image, return_tensors="pt")

>>> with torch.no_grad():
...     outputs = model(**inputs)

>>> # logits are of shape (batch_size, num_labels, height, width)
>>> logits = outputs.logits
```Nr   z/The number of labels should be greater than oneTr]  r   r   Fr   )ignore_indexr   )rr  rs  r   
attentions)r"   r'  r(  rj  r7   r6  r   r  r   r   r   r   r   semantic_loss_ignore_indexr   )r@   r7  rp  r'  r(  r`  ru  encoder_hidden_statesrs  rr  upsampled_logitsloss_fctr   s                r   rF   (MobileViTForSemanticSegmentation.forwardn  sm   N %9$D $++JjJj 	 &1%<k$++BYBY$++"8"8A"=NOO..!%# ! 
 :E 5 5'RS*''(=>!}}88\\"#.Zu  9   (T[[5[5[\H,5D# WQR[0 WQR[0)-)9TGf$EvE&3G'//	
 	
 NR	
 	
r   )r6  rj  r  rw  )rH   rI   rJ   rK   r   r6   r   rM   rN   rL   r  r   rF   rO   rP   rQ   s   @r   r  r  ^  s     4   -1&*,0#'L
llT)L
 t#L
 #Tk	L

 D[L
 
(	(L
 L
r   r  )rg  r  rR  r5  )rV   N)4rf   r   rM   r   torch.nnr    r   r>  activationsr   modeling_layersr   modeling_outputsr	   r
   r   r   modeling_utilsr   utilsr   r   r   configuration_mobilevitr   
get_loggerrH   loggerr   r   rP  r    rS   rh   rx   r   r   r   r   r   r   r   r  r5  rR  rg  ry  r  r  r  __all__r*  r   r   <module>r     s        % & ! 9  . 7 7 4 
		H	%
# 
 
C$J 
RU 
= =@-F		 -F`bii .+RYY +\	")) 		  	 BII 
bii 
		 &299 &f/ fR\pryy \p~ & & &0 I
- I
 I
X 3
&> 3
3
l299 08BII 8v 8 
X
'? X

X
vr   