
    Z jG                        S r SSKrSSKrSSKJr  SSKrSSKJr  SSKJ	r	  SSK
Jr  SSKJr  SS	KJr  SS
KJrJrJrJrJrJr  SSKJr  SSKJrJr  SSKJr  \R>                  " \ 5      r!\" SS9\ " S S\5      5       5       r" " S S\RF                  5      r$ " S S\RF                  5      r% " S S\RF                  5      r& " S S\RF                  5      r' " S S\RF                  5      r( " S S\RF                  5      r) " S S \RF                  5      r* " S! S"\RF                  5      r+ " S# S$\5      r, " S% S&\RF                  5      r-\ " S' S(\5      5       r.\ " S) S*\.5      5       r/ " S+ S,\RF                  5      r0\" S-S9 " S. S/\.5      5       r1 " S0 S1\RF                  5      r2 " S2 S3\RF                  5      r3\" S4S9 " S5 S6\.5      5       r4\" S7S9 " S8 S9\.5      5       r5\" S:S9 " S; S<\.5      5       r6\ " S= S>\.5      5       r7/ S?Qr8g)@zPyTorch ViLT model.    N)	dataclass)nn)CrossEntropyLoss   )initialization)ACT2FN)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPoolingMaskedLMOutputModelOutputSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel)auto_docstringlogging   )
ViltConfigzF
    Class for outputs of [`ViltForImagesAndTextClassification`].
    )custom_introc                       \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Sr
\\\R                        S-  \S'   Sr\\\R                        S-  \S'   Srg)	(ViltForImagesAndTextClassificationOutput+   a  
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
    Classification (or regression if config.num_labels==1) loss.
logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
    Classification (or regression if config.num_labels==1) scores (before SoftMax).
hidden_states (`list[tuple(torch.FloatTensor)]`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
    List of tuples of `torch.FloatTensor` (one for each image-text pair, each tuple containing the output of
    the embeddings + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
    Hidden-states of the model at the output of each layer plus the initial embedding outputs.
Nlosslogitshidden_states
attentions )__name__
__module____qualname____firstlineno____doc__r   torchFloatTensor__annotations__r   r   listtupler   __static_attributes__r       w/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/vilt/modeling_vilt.pyr   r   +   sq    	 &*D%

d
")'+FE$+;?M4e//01D8?8<JU5,,-.5<r)   r   c                   B   ^  \ rS rSrSrU 4S jrSS jr SS jrSrU =r	$ )	ViltEmbeddingsC   z
Construct the text and patch embeddings.

Text embeddings are equivalent to BERT embeddings.

Patch embeddings are equivalent to ViT embeddings.
    def __init__(self, config):
        super().__init__()
        # text embeddings
        self.text_embeddings = TextEmbeddings(config)
        # patch embeddings
        self.cls_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
        self.patch_embeddings = ViltPatchEmbeddings(config)
        num_patches = self.patch_embeddings.num_patches
        self.position_embeddings = nn.Parameter(torch.zeros(1, num_patches + 1, config.hidden_size))
        # modality type (text/patch) embeddings
        self.token_type_embeddings = nn.Embedding(config.modality_type_vocab_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.config = config

    def visual_embed(self, pixel_values, pixel_mask, max_image_length=200):
        _, _, ph, pw = self.patch_embeddings.projection.weight.shape

        x = self.patch_embeddings(pixel_values)
        x_mask = pixel_mask[:, None, :, :].float()
        x_mask = nn.functional.interpolate(x_mask, size=(x.shape[2], x.shape[3])).long()
        x_h = x_mask[:, 0].sum(dim=1)[:, 0]
        x_w = x_mask[:, 0].sum(dim=2)[:, 0]

        batch_size, num_channels, height, width = x.shape
        patch_dim = self.config.image_size // self.config.patch_size
        spatial_pos = self.position_embeddings[:, 1:, :].transpose(1, 2).view(1, num_channels, patch_dim, patch_dim)
        pos_embed = torch.cat(
            [
                nn.functional.pad(
                    nn.functional.interpolate(
                        spatial_pos,
                        size=(h, w),
                        mode="bilinear",
                        align_corners=True,
                    ),
                    (0, width - w, 0, height - h),
                )
                for h, w in zip(x_h, x_w)
            ],
            dim=0,
        )

        pos_embed = pos_embed.flatten(2).transpose(1, 2)
        x = x.flatten(2).transpose(1, 2)
        # set `device` here, otherwise `patch_index` will always be on `CPU` and will fail near the end for torch>=1.13
        patch_index = torch.stack(
            torch.meshgrid(torch.arange(x_mask.shape[-2]), torch.arange(x_mask.shape[-1]), indexing="ij"), dim=-1
        ).to(device=x_mask.device)
        patch_index = patch_index[None, None, :, :, :]
        patch_index = patch_index.expand(x_mask.shape[0], x_mask.shape[1], -1, -1, -1)
        patch_index = patch_index.flatten(1, 3)
        x_mask = x_mask.flatten(1)

        if max_image_length < 0 or max_image_length is None or not isinstance(max_image_length, int):
            # suppose aug is 800 x 1333, then, maximum effective res is 800 x 1333 (if one side gets bigger, the other will be constrained and be shrunk)
            # (800 // self.patch_size) * (1333 // self.patch_size) is the maximum number of patches that single image can get.
            # if self.patch_size = 32, 25 * 41 = 1025
            # if res is 384 x 640, 12 * 20 = 240
            effective_resolution = x_h * x_w
            max_image_length = effective_resolution.max()
        else:
            effective_resolution = x_h * x_w
            max_image_length = min(effective_resolution.max(), max_image_length)

        valid_idx = x_mask.nonzero(as_tuple=False)
        non_valid_idx = (1 - x_mask).nonzero(as_tuple=False)
        unique_rows = valid_idx[:, 0].unique()
        valid_row_idx = [valid_idx[valid_idx[:, 0] == u] for u in unique_rows]
        non_valid_row_idx = [non_valid_idx[non_valid_idx[:, 0] == u] for u in unique_rows]

        valid_nums = [v.size(0) for v in valid_row_idx]
        non_valid_nums = [v.size(0) for v in non_valid_row_idx]
        pad_nums = [max_image_length - v for v in valid_nums]

        select = []
        for i, (v, nv, p) in enumerate(zip(valid_nums, non_valid_nums, pad_nums)):
            if p <= 0:
                # image has more valid patches than max_image_length: subsample
                valid_choice = torch.multinomial(torch.ones(v).float(), max_image_length)
                select.append(valid_row_idx[i][valid_choice])
            else:
                # pad by re-sampling masked positions
                pad_choice = torch.multinomial(torch.ones(nv).float(), p, replacement=True)
                select.append(torch.cat([valid_row_idx[i], non_valid_row_idx[i][pad_choice]], dim=0))

        select = torch.cat(select, dim=0)
        x = x[select[:, 0], select[:, 1]].view(batch_size, -1, num_channels)
        x_mask = x_mask[select[:, 0], select[:, 1]].view(batch_size, -1)
        patch_index = patch_index[select[:, 0], select[:, 1]].view(batch_size, -1, 2)
        pos_embed = pos_embed[select[:, 0], select[:, 1]].view(batch_size, -1, num_channels)

        cls_tokens = self.cls_token.expand(batch_size, -1, -1)
        x = torch.cat((cls_tokens, x), dim=1)
        pos_embed = torch.cat(
            (self.position_embeddings[:, 0, :][:, None, :].expand(batch_size, -1, -1), pos_embed), dim=1
        )
        x = x + pos_embed
        x = self.dropout(x)

        x_mask = torch.cat([torch.ones(x_mask.shape[0], 1).to(x_mask), x_mask], dim=1)

        return x, x_mask, (patch_index, (height, width))

    def forward(
        self,
        input_ids,
        attention_mask,
        token_type_ids,
        pixel_values,
        pixel_mask,
        inputs_embeds,
        image_embeds,
        image_token_type_idx=1,
    ):
        # PART 1: text embeddings
        text_embeds = self.text_embeddings(
            input_ids=input_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds
        )

        # PART 2: patch embeddings (with interpolated position encodings)
        if image_embeds is None:
            image_embeds, image_masks, patch_index = self.visual_embed(
                pixel_values, pixel_mask, max_image_length=self.config.max_image_length
            )
        else:
            image_masks = pixel_mask.flatten(1)

        # PART 3: add modality type embeddings
        # 0 indicates text, 1 indicates image, 2 is optionally used when a second image is provided (NLVR2)
        if image_token_type_idx is None:
            image_token_type_idx = 1
        text_embeds = text_embeds + self.token_type_embeddings(
            torch.zeros_like(attention_mask, dtype=torch.long, device=text_embeds.device)
        )
        image_embeds = image_embeds + self.token_type_embeddings(
            torch.full_like(image_masks, image_token_type_idx, dtype=torch.long, device=text_embeds.device)
        )

        # PART 4: concatenate
        embeddings = torch.cat([text_embeds, image_embeds], dim=1)
        masks = torch.cat([attention_mask, image_masks], dim=1)

        return embeddings, masks
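

# Editorial note (a sketch, not part of the original transformers source): `visual_embed`
# above is what lets ViLT accept images whose resolution differs from `config.image_size`.
# The position embeddings learned for the default `image_size // patch_size` grid are
# bilinearly interpolated to each image's actual patch grid, and `pixel_mask` marks which
# patches are real. When an image has fewer valid patches than `max_image_length`, masked
# positions are re-drawn with `torch.multinomial(..., replacement=True)` so that every
# image in the batch contributes the same number of patch tokens (plus one [CLS] token).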


class TextEmbeddings(nn.Module):
    """Construct the embeddings from word, position and token_type embeddings."""

    def __init__(self, config):
        super().__init__()
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)

        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )
        self.register_buffer(
            "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
        )

    def forward(self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None):
        if input_ids is not None:
            input_shape = input_ids.size()
        else:
            input_shape = inputs_embeds.size()[:-1]

        seq_length = input_shape[1]

        if position_ids is None:
            position_ids = self.position_ids[:, :seq_length]

        # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually
        # occurs when it's auto-generated; the registered buffer helps users when tracing the model without passing
        # token_type_ids
        if token_type_ids is None:
            if hasattr(self, "token_type_ids"):
                buffered_token_type_ids = self.token_type_ids[:, :seq_length]
                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
                token_type_ids = buffered_token_type_ids_expanded
            else:
                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)

        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)

        token_type_embeddings = self.token_type_embeddings(token_type_ids)

        embeddings = inputs_embeds + token_type_embeddings
        position_embeddings = self.position_embeddings(position_ids)
        embeddings += position_embeddings
        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings
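

# A minimal, self-contained sketch (editorial addition, not in the original file) of the
# patch-embedding arithmetic implemented by `ViltPatchEmbeddings` below. The concrete
# numbers assume a `dandelin/vilt-b32`-style configuration (hidden_size=768, patch_size=32,
# image_size=384); treat them as illustrative only:
#
#     import torch
#     from torch import nn
#
#     projection = nn.Conv2d(3, 768, kernel_size=32, stride=32)
#     patches = projection(torch.randn(1, 3, 384, 384))  # -> (1, 768, 12, 12)
#     tokens = patches.flatten(2).transpose(1, 2)        # -> (1, 144, 768)
#
# i.e. num_patches = (384 // 32) * (384 // 32) = 144, matching `self.num_patches`.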


class ViltPatchEmbeddings(nn.Module):
    """
    Image to Patch Embedding.
    """

    def __init__(self, config):
        super().__init__()
        image_size, patch_size = config.image_size, config.patch_size
        num_channels, hidden_size = config.num_channels, config.hidden_size

        image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
        patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
        self.image_size = image_size
        self.patch_size = patch_size
        self.num_channels = num_channels
        self.num_patches = num_patches

        self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)

    def forward(self, pixel_values):
        batch_size, num_channels, height, width = pixel_values.shape
        if num_channels != self.num_channels:
            raise ValueError(
                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
            )
        target_dtype = self.projection.weight.dtype
        x = self.projection(pixel_values.to(dtype=target_dtype))
        return x


class ViltSelfAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                f"The hidden size {config.hidden_size} is not a multiple of the number of attention heads "
                f"{config.num_attention_heads}."
            )

        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
        self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
        self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

    def forward(self, hidden_states, attention_mask=None, output_attentions=False):
        input_shape = hidden_states.shape[:-1]
        hidden_shape = (*input_shape, -1, self.attention_head_size)

        query_layer = self.query(hidden_states).view(hidden_shape).transpose(1, 2)
        key_layer = self.key(hidden_states).view(hidden_shape).transpose(1, 2)
        value_layer = self.value(hidden_states).view(hidden_shape).transpose(1, 2)

        # Take the dot product between "query" and "key" to get the raw attention scores.
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
        attention_scores = attention_scores / math.sqrt(self.attention_head_size)

        if attention_mask is not None:
            # Apply the attention mask (precomputed for all layers in the ViltModel forward() function)
            attention_scores = attention_scores + attention_mask

        # Normalize the attention scores to probabilities.
        attention_probs = nn.Softmax(dim=-1)(attention_scores)

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        attention_probs = self.dropout(attention_probs)

        context_layer = torch.matmul(attention_probs, value_layer)

        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(new_context_layer_shape)

        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)

        return outputs


class ViltSelfOutput(nn.Module):
    """
    The residual connection is defined in ViltLayer instead of here (as is the case with other models), due to the
    layernorm applied before each block.
    """

    def __init__(self, config: ViltConfig) -> None:
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)

        return hidden_states


class ViltAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.attention = ViltSelfAttention(config)
        self.output = ViltSelfOutput(config)

    def forward(self, hidden_states, attention_mask=None, output_attentions=False):
        self_outputs = self.attention(hidden_states, attention_mask, output_attentions)

        attention_output = self.output(self_outputs[0], hidden_states)

        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
        return outputs


class ViltIntermediate(nn.Module):
    def __init__(self, config: ViltConfig) -> None:
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)

        return hidden_states


class ViltOutput(nn.Module):
    def __init__(self, config: ViltConfig) -> None:
        super().__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)

        hidden_states = hidden_states + input_tensor

        return hidden_states
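

# Editorial note (not in the original file): unlike BERT-style post-norm blocks,
# `ViltLayer` below applies layernorm *before* each sub-block (the timm/ViT pre-norm
# order). Schematically:
#
#     x = x + attention(layernorm_before(x))  # residual added in ViltLayer.forward
#     x = x + mlp(layernorm_after(x))         # residual added inside ViltOutput
#
# which is why `ViltSelfOutput` above contains no residual addition of its own.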


class ViltLayer(GradientCheckpointingLayer):
    """This corresponds to the Block class in the timm implementation."""

    def __init__(self, config):
        super().__init__()
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.seq_len_dim = 1
        self.attention = ViltAttention(config)
        self.intermediate = ViltIntermediate(config)
        self.output = ViltOutput(config)
        self.layernorm_before = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.layernorm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(self, hidden_states, attention_mask=None, output_attentions=False):
        self_attention_outputs = self.attention(
            self.layernorm_before(hidden_states),  # in ViLT, layernorm is applied before self-attention
            attention_mask,
            output_attentions=output_attentions,
        )
        attention_output = self_attention_outputs[0]
        outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights

        # first residual connection
        hidden_states = attention_output + hidden_states.to(attention_output.device)

        # in ViLT, layernorm is also applied after self-attention
        layer_output = self.layernorm_after(hidden_states)
        layer_output = self.intermediate(layer_output)

        # second residual connection is done here
        layer_output = self.output(layer_output, hidden_states)

        outputs = (layer_output,) + outputs

        return outputs


class ViltEncoder(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.layer = nn.ModuleList([ViltLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        output_attentions=False,
        output_hidden_states=False,
        return_dict=True,
    ):
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None

        for i, layer_module in enumerate(self.layer):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_outputs = layer_module(hidden_states, attention_mask, output_attentions)

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )


@auto_docstring
class ViltPreTrainedModel(PreTrainedModel):
    config: ViltConfig
    base_model_prefix = "vilt"
    input_modalities = ["image", "text"]
    supports_gradient_checkpointing = True
    _no_split_modules = ["ViltEmbeddings", "ViltSelfAttention"]

    def _init_weights(self, module):
        super()._init_weights(module)
        if isinstance(module, TextEmbeddings):
            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
            init.zeros_(module.token_type_ids)


@auto_docstring
class ViltModel(ViltPreTrainedModel):
    def __init__(self, config, add_pooling_layer=True):
        r"""
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        """
        super().__init__(config)
        self.config = config

        self.embeddings = ViltEmbeddings(config)
        self.encoder = ViltEncoder(config)

        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.pooler = ViltPooler(config) if add_pooling_layer else None

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embeddings.text_embeddings.word_embeddings

    def set_input_embeddings(self, value):
        self.embeddings.text_embeddings.word_embeddings = value

    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        attention_mask: torch.FloatTensor | None = None,
        token_type_ids: torch.LongTensor | None = None,
        pixel_values: torch.FloatTensor | None = None,
        pixel_mask: torch.LongTensor | None = None,
        inputs_embeds: torch.FloatTensor | None = None,
        image_embeds: torch.FloatTensor | None = None,
        image_token_type_idx: int | None = None,
        output_attentions: bool | None = None,
        output_hidden_states: bool | None = None,
        return_dict: bool | None = None,
    ) -> BaseModelOutputWithPooling | tuple[torch.FloatTensor]:
        r"""
        image_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`, *optional*):
            Optionally, instead of passing `pixel_values`, you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert `pixel_values` into patch embeddings.
        image_token_type_idx (`int`, *optional*):
            - The token type ids for images.

        Examples:

        ```python
        >>> from transformers import ViltProcessor, ViltModel
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO

        >>> # prepare image and text
        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))
        >>> text = "hello world"

        >>> processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-mlm")
        >>> model = ViltModel.from_pretrained("dandelin/vilt-b32-mlm")

        >>> inputs = processor(image, text, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> last_hidden_states = outputs.last_hidden_state
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
            input_shape = input_ids.size()
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        text_batch_size, seq_length = input_shape
        device = input_ids.device if input_ids is not None else inputs_embeds.device

        if attention_mask is None:
            attention_mask = torch.ones((text_batch_size, seq_length), device=device)

        if pixel_values is not None and image_embeds is not None:
            raise ValueError("You cannot specify both pixel_values and image_embeds at the same time")
        elif pixel_values is None and image_embeds is None:
            raise ValueError("You have to specify either pixel_values or image_embeds")

        image_batch_size = pixel_values.shape[0] if pixel_values is not None else image_embeds.shape[0]
        if image_batch_size != text_batch_size:
            raise ValueError("The text inputs and image inputs need to have the same batch size")
        if pixel_mask is None:
            pixel_mask = torch.ones((image_batch_size, self.config.image_size, self.config.image_size), device=device)

        embedding_output, attention_mask = self.embeddings(
            input_ids,
            attention_mask,
            token_type_ids,
            pixel_values,
            pixel_mask,
            inputs_embeds,
            image_embeds,
            image_token_type_idx=image_token_type_idx,
        )

        # We can provide a self-attention mask of dimensions
        # [batch_size, from_seq_length, to_seq_length]
        # ourselves in which case we just need to make it broadcastable to all heads.
        extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape)

        encoder_outputs = self.encoder(
            embedding_output,
            attention_mask=extended_attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = encoder_outputs[0]
        sequence_output = self.layernorm(sequence_output)
        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None

        if not return_dict:
            return (sequence_output, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPooling(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


class ViltPooler(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.activation = nn.Tanh()

    def forward(self, hidden_states):
        # We "pool" the model by simply taking the hidden state corresponding
        # to the first token.
        first_token_tensor = hidden_states[:, 0]
        pooled_output = self.dense(first_token_tensor)
        pooled_output = self.activation(pooled_output)
        return pooled_output


@auto_docstring(
    custom_intro="""
    ViLT Model with a language modeling head on top as done during pretraining.
    """
)
class ViltForMaskedLM(ViltPreTrainedModel):
    _tied_weights_keys = {"mlm_score.decoder.weight": "vilt.embeddings.text_embeddings.word_embeddings.weight"}

    def __init__(self, config):
        super().__init__(config)

        self.vilt = ViltModel(config)
        self.mlm_score = ViltMLMHead(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self):
        return self.mlm_score.decoder

    def set_output_embeddings(self, new_embeddings):
        self.mlm_score.decoder = new_embeddings
        self.mlm_score.bias = new_embeddings.bias

    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        attention_mask: torch.FloatTensor | None = None,
        token_type_ids: torch.LongTensor | None = None,
        pixel_values: torch.FloatTensor | None = None,
        pixel_mask: torch.LongTensor | None = None,
        inputs_embeds: torch.FloatTensor | None = None,
        image_embeds: torch.FloatTensor | None = None,
        labels: torch.LongTensor | None = None,
        output_attentions: bool | None = None,
        output_hidden_states: bool | None = None,
        return_dict: bool | None = None,
    ) -> MaskedLMOutput | tuple[torch.FloatTensor]:
        r"""
        image_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`, *optional*):
            Optionally, instead of passing `pixel_values`, you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert `pixel_values` into patch embeddings.
        labels (*torch.LongTensor* of shape *(batch_size, sequence_length)*, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in *[-100, 0, ...,
            config.vocab_size]* (see *input_ids* docstring) Tokens with indices set to *-100* are ignored (masked), the
            loss is only computed for the tokens with labels in *[0, ..., config.vocab_size]*

        Examples:

        ```python
        >>> from transformers import ViltProcessor, ViltForMaskedLM
        >>> import httpx
        >>> from io import BytesIO
        >>> from PIL import Image
        >>> import re
        >>> import torch

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))
        >>> text = "a bunch of [MASK] laying on a [MASK]."

        >>> processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-mlm")
        >>> model = ViltForMaskedLM.from_pretrained("dandelin/vilt-b32-mlm")

        >>> # prepare inputs
        >>> encoding = processor(image, text, return_tensors="pt")

        >>> # forward pass
        >>> outputs = model(**encoding)

        >>> tl = len(re.findall("\[MASK\]", text))
        >>> inferred_token = [text]

        >>> # gradually fill in the MASK tokens, one by one
        >>> with torch.no_grad():
        ...     for i in range(tl):
        ...         encoded = processor.tokenizer(inferred_token)
        ...         input_ids = torch.tensor(encoded.input_ids)
        ...         encoded = encoded["input_ids"][0][1:-1]
        ...         outputs = model(input_ids=input_ids, pixel_values=encoding.pixel_values)
        ...         mlm_logits = outputs.logits[0]  # shape (seq_len, vocab_size)
        ...         # only take into account text features (minus CLS and SEP token)
        ...         mlm_logits = mlm_logits[1 : input_ids.shape[1] - 1, :]
        ...         mlm_values, mlm_ids = mlm_logits.softmax(dim=-1).max(dim=-1)
        ...         # only take into account text
        ...         mlm_values[torch.tensor(encoded) != 103] = 0
        ...         select = mlm_values.argmax().item()
        ...         encoded[select] = mlm_ids[select].item()
        ...         inferred_token = [processor.decode(encoded)]

        >>> selected_token = ""
        >>> encoded = processor.tokenizer(inferred_token)
        >>> output = processor.decode(encoded.input_ids[0], skip_special_tokens=True)
        >>> print(output)
        a bunch of cats laying on a couch.
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.vilt(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            pixel_values=pixel_values,
            pixel_mask=pixel_mask,
            inputs_embeds=inputs_embeds,
            image_embeds=image_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output, pooled_output = outputs[:2]
        # split up final hidden states into text and image features
        text_seq_len = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
        text_features, _ = (sequence_output[:, :text_seq_len], sequence_output[:, text_seq_len:])

        mlm_logits = self.mlm_score(text_features)

        masked_lm_loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()  # -100 index = padding token
            # move labels to the correct device to enable PP
            labels = labels.to(mlm_logits.device)
            masked_lm_loss = loss_fct(mlm_logits.view(-1, self.config.vocab_size), labels.view(-1))

        if not return_dict:
            output = (mlm_logits,) + outputs[2:]
            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output

        return MaskedLMOutput(
            loss=masked_lm_loss,
            logits=mlm_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


class ViltPredictionHeadTransform(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        if isinstance(config.hidden_act, str):
            self.transform_act_fn = ACT2FN[config.hidden_act]
        else:
            self.transform_act_fn = config.hidden_act
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(self, hidden_states):
        hidden_states = self.dense(hidden_states)
        hidden_states = self.transform_act_fn(hidden_states)
        hidden_states = self.LayerNorm(hidden_states)
        return hidden_states


class ViltMLMHead(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.transform = ViltPredictionHeadTransform(config)
        self.decoder = nn.Linear(config.hidden_size, config.vocab_size)

    def forward(self, x):
        x = self.transform(x)
        x = self.decoder(x)
        return x


@auto_docstring(
    custom_intro="""
    Vilt Model transformer with a classifier head on top (a linear layer on top of the final hidden state of the [CLS]
    token) for visual question answering, e.g. for VQAv2.
    """
)
class ViltForQuestionAnswering(ViltPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.num_labels = config.num_labels
        self.vilt = ViltModel(config)

        # Classifier head
        self.classifier = nn.Sequential(
            nn.Linear(config.hidden_size, config.hidden_size * 2),
            nn.LayerNorm(config.hidden_size * 2),
            nn.GELU(),
            nn.Linear(config.hidden_size * 2, config.num_labels),
        )

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        attention_mask: torch.FloatTensor | None = None,
        token_type_ids: torch.LongTensor | None = None,
        pixel_values: torch.FloatTensor | None = None,
        pixel_mask: torch.LongTensor | None = None,
        inputs_embeds: torch.FloatTensor | None = None,
        image_embeds: torch.FloatTensor | None = None,
        labels: torch.FloatTensor | None = None,
        output_attentions: bool | None = None,
        output_hidden_states: bool | None = None,
        return_dict: bool | None = None,
    ) -> SequenceClassifierOutput | tuple[torch.FloatTensor]:
        r"""
        image_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`, *optional*):
            Optionally, instead of passing `pixel_values`, you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert `pixel_values` into patch embeddings.
        labels (`torch.FloatTensor` of shape `(batch_size, num_labels)`, *optional*):
            Labels for computing the visual question answering loss. This tensor must be either a one-hot encoding of
            all answers that are applicable for a given example in the batch, or a soft encoding indicating which
            answers are applicable, where 1.0 is the highest score.

        Examples:

        ```python
        >>> from transformers import ViltProcessor, ViltForQuestionAnswering
        >>> import httpx
        >>> from io import BytesIO
        >>> from PIL import Image

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))
        >>> text = "How many cats are there?"

        >>> processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
        >>> model = ViltForQuestionAnswering.from_pretrained("dandelin/vilt-b32-finetuned-vqa")

        >>> # prepare inputs
        >>> encoding = processor(image, text, return_tensors="pt")

        >>> # forward pass
        >>> outputs = model(**encoding)
        >>> logits = outputs.logits
        >>> idx = logits.argmax(-1).item()
        >>> print("Predicted answer:", model.config.id2label[idx])
        Predicted answer: 2
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.vilt(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            pixel_values=pixel_values,
            pixel_mask=pixel_mask,
            inputs_embeds=inputs_embeds,
            image_embeds=image_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        pooler_output = outputs.pooler_output if return_dict else outputs[1]

        logits = self.classifier(pooler_output)

        loss = None
        if labels is not None:
            # move labels to the correct device to enable PP
            labels = labels.to(logits.device)
            loss = nn.functional.binary_cross_entropy_with_logits(logits, labels) * labels.shape[1]

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@auto_docstring(
    custom_intro="""
    Vilt Model transformer with a classifier head on top (a linear layer on top of the final hidden state of the [CLS]
    token) for image-to-text or text-to-image retrieval, e.g. MSCOCO and F30K.
    """
)
class ViltForImageAndTextRetrieval(ViltPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.vilt = ViltModel(config)

        # Classifier head
        self.rank_output = nn.Linear(config.hidden_size, 1)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        attention_mask: torch.FloatTensor | None = None,
        token_type_ids: torch.LongTensor | None = None,
        pixel_values: torch.FloatTensor | None = None,
        pixel_mask: torch.LongTensor | None = None,
        inputs_embeds: torch.FloatTensor | None = None,
        image_embeds: torch.FloatTensor | None = None,
        labels: torch.LongTensor | None = None,
        output_attentions: bool | None = None,
        output_hidden_states: bool | None = None,
        return_dict: bool | None = None,
    ) -> SequenceClassifierOutput | tuple[torch.FloatTensor]:
        r"""
        image_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`, *optional*):
            Optionally, instead of passing `pixel_values`, you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert `pixel_values` into patch embeddings.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels are currently not supported.

        Examples:

        ```python
        >>> from transformers import ViltProcessor, ViltForImageAndTextRetrieval
        >>> import httpx
        >>> from io import BytesIO
        >>> from PIL import Image

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))
        >>> texts = ["An image of two cats chilling on a couch", "A football player scoring a goal"]

        >>> processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-coco")
        >>> model = ViltForImageAndTextRetrieval.from_pretrained("dandelin/vilt-b32-finetuned-coco")

        >>> # forward pass
        >>> scores = dict()
        >>> for text in texts:
        ...     # prepare inputs
        ...     encoding = processor(image, text, return_tensors="pt")
        ...     outputs = model(**encoding)
        ...     scores[text] = outputs.logits[0, :].item()
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        loss = None
        if labels is not None:
            raise NotImplementedError("Training is not yet supported.")

        outputs = self.vilt(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            pixel_values=pixel_values,
            pixel_mask=pixel_mask,
            inputs_embeds=inputs_embeds,
            image_embeds=image_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        pooler_output = outputs.pooler_output if return_dict else outputs[1]

        logits = self.rank_output(pooler_output)

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@auto_docstring(
    custom_intro="""
    Vilt Model transformer with a classifier head on top for natural language visual reasoning, e.g. NLVR2.
    """
)
class ViltForImagesAndTextClassification(ViltPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.num_labels = config.num_labels
        self.vilt = ViltModel(config)

        # Classifier head
        num_images = config.num_images
        self.classifier = nn.Sequential(
            nn.Linear(config.hidden_size * num_images, config.hidden_size * num_images),
            nn.LayerNorm(config.hidden_size * num_images),
            nn.GELU(),
            nn.Linear(config.hidden_size * num_images, config.num_labels),
        )

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        attention_mask: torch.FloatTensor | None = None,
        token_type_ids: torch.LongTensor | None = None,
        pixel_values: torch.FloatTensor | None = None,
        pixel_mask: torch.LongTensor | None = None,
        inputs_embeds: torch.FloatTensor | None = None,
        image_embeds: torch.FloatTensor | None = None,
        labels: torch.LongTensor | None = None,
        output_attentions: bool | None = None,
        output_hidden_states: bool | None = None,
        return_dict: bool | None = None,
    ) -> ViltForImagesAndTextClassificationOutput | tuple[torch.FloatTensor]:
        r"""
        image_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`, *optional*):
            Optionally, instead of passing `pixel_values`, you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert `pixel_values` into patch embeddings.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Binary classification labels.

        Examples:

        ```python
        >>> from transformers import ViltProcessor, ViltForImagesAndTextClassification
        >>> import httpx
        >>> from io import BytesIO
        >>> from PIL import Image

        >>> url_1 = "https://lil.nlp.cornell.edu/nlvr/exs/ex0_0.jpg"
        >>> with httpx.stream("GET", url_1) as response:
        ...     image_1 = Image.open(BytesIO(response.read()))

        >>> url_2 = "https://lil.nlp.cornell.edu/nlvr/exs/ex0_1.jpg"
        >>> with httpx.stream("GET", url_2) as response:
        ...     image_2 = Image.open(BytesIO(response.read()))

        >>> text = "The left image contains twice the number of dogs as the right image."

        >>> processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-nlvr2")
        >>> model = ViltForImagesAndTextClassification.from_pretrained("dandelin/vilt-b32-finetuned-nlvr2")

        >>> # prepare inputs
        >>> encoding = processor([image_1, image_2], text, return_tensors="pt")

        >>> # forward pass
        >>> outputs = model(input_ids=encoding.input_ids, pixel_values=encoding.pixel_values.unsqueeze(0))
        >>> logits = outputs.logits
        >>> idx = logits.argmax(-1).item()
        >>> print("Predicted answer:", model.config.id2label[idx])
        Predicted answer: True
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if pixel_values is not None and pixel_values.ndim == 4:
            # add dummy num_images dimension
            pixel_values = pixel_values.unsqueeze(1)

        if image_embeds is not None and image_embeds.ndim == 3:
            # add dummy num_images dimension
            image_embeds = image_embeds.unsqueeze(1)

        num_images = pixel_values.shape[1] if pixel_values is not None else None
        if num_images is None:
            num_images = image_embeds.shape[1] if image_embeds is not None else None
        if num_images != self.config.num_images:
            raise ValueError(
                "Make sure to match the number of images in the model with the number of images in the input."
            )
        pooler_outputs = []
        hidden_states = [] if output_hidden_states else None
        attentions = [] if output_attentions else None
        for i in range(num_images):
            # forward every image through the model
            outputs = self.vilt(
                input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                pixel_values=pixel_values[:, i, :, :, :] if pixel_values is not None else None,
                pixel_mask=pixel_mask[:, i, :, :] if pixel_mask is not None else None,
                inputs_embeds=inputs_embeds,
                image_embeds=image_embeds[:, i, :, :] if image_embeds is not None else None,
                image_token_type_idx=i + 1,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
            pooler_output = outputs.pooler_output if return_dict else outputs[1]
            pooler_outputs.append(pooler_output)
            if output_hidden_states:
                hidden_states.append(outputs.hidden_states)
            if output_attentions:
                attentions.append(outputs.attentions)

        pooled_output = torch.cat(pooler_outputs, dim=-1)
        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            # move labels to the correct device to enable PP
            labels = labels.to(logits.device)
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits, hidden_states, attentions)
            return ((loss,) + output) if loss is not None else output

        return ViltForImagesAndTextClassificationOutput(
            loss=loss,
            logits=logits,
            hidden_states=hidden_states,
            attentions=attentions,
        )


@auto_docstring
class ViltForTokenClassification(ViltPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.num_labels = config.num_labels
        self.vilt = ViltModel(config, add_pooling_layer=False)

        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        attention_mask: torch.FloatTensor | None = None,
        token_type_ids: torch.LongTensor | None = None,
        pixel_values: torch.FloatTensor | None = None,
        pixel_mask: torch.LongTensor | None = None,
        inputs_embeds: torch.FloatTensor | None = None,
        image_embeds: torch.FloatTensor | None = None,
        labels: torch.LongTensor | None = None,
        output_attentions: bool | None = None,
        output_hidden_states: bool | None = None,
        return_dict: bool | None = None,
    ) -> TokenClassifierOutput | tuple[torch.FloatTensor]:
        r"""
        image_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`, *optional*):
            Optionally, instead of passing `pixel_values`, you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert `pixel_values` into patch embeddings.
        labels (`torch.LongTensor` of shape `(batch_size, text_sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.vilt(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            pixel_values=pixel_values,
            pixel_mask=pixel_mask,
            inputs_embeds=inputs_embeds,
            image_embeds=image_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]
        text_input_size = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]

        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output[:, :text_input_size])

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            # move labels to the correct device to enable PP
            labels = labels.to(logits.device)
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


__all__ = [
    "ViltForImageAndTextRetrieval",
    "ViltForImagesAndTextClassification",
    "ViltForTokenClassification",
    "ViltForMaskedLM",
    "ViltForQuestionAnswering",
    "ViltLayer",
    "ViltModel",
    "ViltPreTrainedModel",
]
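

# Editorial usage sketch (not part of the original file): `ViltForTokenClassification` is
# the only head above without a docstring example, so a hedged one is given here. The
# checkpoint path is a placeholder — substitute weights you have fine-tuned for token
# classification yourself.
#
#     from transformers import ViltProcessor, ViltForTokenClassification
#
#     processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-mlm")
#     model = ViltForTokenClassification.from_pretrained("path/to/finetuned-checkpoint")
#     inputs = processor(image, text, return_tensors="pt")
#     outputs = model(**inputs)
#     # outputs.logits covers only the text tokens: (batch_size, text_seq_len, num_labels)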