
    Z jGw                     N   S r SSKrSSKJr  SSKJr  SSKJr  SSKr	SSK
r
SSK
Jr  SSKJr  SS	KJr  SS
KJr  SSKJrJr  SSKJrJr  SSKJr  SSKJrJrJrJr  SSKJ r J!r!  SSK"J#r#J$r$  SSK%J&r&  SSK'J(r(  \RR                  " \*5      r+\" SS9\ " S S\5      5       5       r,\" SS9\ " S S\5      5       5       r-S r. " S S\R^                  5      r0 " S S\R^                  5      r1  SDS \R^                  S!\
Rd                  S"\
Rd                  S#\
Rd                  S$\
Rd                  S-  S%\3S-  S&\3S'\\   4S( jjr4 " S) S*\R^                  5      r5 " S+ S,\R^                  5      r6 " S- S.\R^                  5      r7 " S/ S0\R^                  5      r8 " S1 S2\R^                  5      r9 " S3 S4\5      r: " S5 S6\R^                  5      r;\ " S7 S8\5      5       r<\ " S9 S:\<5      5       r= " S; S<\R^                  5      r>\" S=S9 " S> S?\<5      5       r?\" S@S9 " SA SB\<5      5       r@/ SCQrAg)Ez,PyTorch VideoMAE (masked autoencoder) model.    N)Callable)deepcopy)	dataclass)nn)MSELoss   )ACT2FN)GradientCheckpointingLayer)BaseModelOutputImageClassifierOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)ModelOutputTransformersKwargsauto_docstringlogging)IMAGENET_DEFAULT_MEANIMAGENET_DEFAULT_STD)can_return_tuplemerge_with_config_defaults)capture_outputs   )VideoMAEConfigz[

@dataclass
@auto_docstring(
    custom_intro="""
    Class for VideoMAEDecoder's outputs, with potential hidden states and attentions.
    """
)
class VideoMAEDecoderOutput(ModelOutput):
    r"""
    logits (`torch.FloatTensor` of shape `(batch_size, patch_size ** 2 * num_channels)`):
        Pixel reconstruction logits.
    """

    logits: torch.FloatTensor | None = None
    hidden_states: tuple[torch.FloatTensor] | None = None
    attentions: tuple[torch.FloatTensor] | None = None

@dataclass
@auto_docstring(
    custom_intro="""
    Class for VideoMAEForPreTraining's outputs, with potential hidden states and attentions.
    """
)
class VideoMAEForPreTrainingOutput(ModelOutput):
    r"""
    loss (`torch.FloatTensor` of shape `(1,)`):
        Pixel reconstruction loss.
    logits (`torch.FloatTensor` of shape `(batch_size, patch_size ** 2 * num_channels)`):
        Pixel reconstruction logits.
    """

    loss: torch.FloatTensor | None = None
    logits: torch.FloatTensor | None = None
    hidden_states: tuple[torch.FloatTensor] | None = None
    attentions: tuple[torch.FloatTensor] | None = None


def get_sinusoid_encoding_table(n_position, d_hid):
    """Sinusoid position encoding table"""

    def get_position_angle_vec(position):
        return [position / np.power(10000, 2 * (hid_j // 2) / d_hid) for hid_j in range(d_hid)]

    sinusoid_table = np.array([get_position_angle_vec(pos_i) for pos_i in range(n_position)])
    sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2])  # dim 2i
    sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2])  # dim 2i+1

    return torch.FloatTensor(sinusoid_table).unsqueeze(0)
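
# Illustrative sanity check for the helper above (the 1568/768 sizes are the
# VideoMAE-base defaults, used here only as an example): the returned table has
# shape (1, n_position, d_hid), so it broadcasts over the batch dimension when
# added to the patch embeddings.
#
#     table = get_sinusoid_encoding_table(n_position=1568, d_hid=768)
#     assert table.shape == (1, 1568, 768)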

class VideoMAEEmbeddings(nn.Module):
    """
    Construct the patch and position embeddings.

    """

    def __init__(self, config):
        super().__init__()
        self.patch_embeddings = VideoMAEPatchEmbeddings(config)
        self.num_patches = self.patch_embeddings.num_patches
        self.position_embeddings = get_sinusoid_encoding_table(self.num_patches, config.hidden_size)
        self.config = config

    def forward(self, pixel_values, bool_masked_pos):
        # create patch embeddings
        embeddings = self.patch_embeddings(pixel_values)

        # add position embeddings (the table is fixed, hence the detach)
        embeddings = embeddings + self.position_embeddings.detach().type_as(embeddings).to(
            embeddings.device, copy=True
        )

        # only keep visible patches; ~bool_masked_pos means "visible"
        if bool_masked_pos is not None:
            batch_size, _, num_channels = embeddings.shape
            embeddings = embeddings[~bool_masked_pos]
            embeddings = embeddings.reshape(batch_size, -1, num_channels)

        return embeddings
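
# Shape sketch for the masking step above (the 75% ratio is an assumed example,
# not a fixed requirement): with 1568 patches per video and a boolean mask that
# is True for 1176 of them, indexing with `~bool_masked_pos` keeps only the 392
# visible patches, so the encoder never sees masked positions.
#
#     embeddings:      (batch_size, 1568, hidden_size)
#     bool_masked_pos: (batch_size, 1568), 1176 entries True per video
#     embeddings[~bool_masked_pos].reshape(batch_size, -1, hidden_size)
#         -> (batch_size, 392, hidden_size)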

class VideoMAEPatchEmbeddings(nn.Module):
    """
    Video to Patch Embedding. This module turns a batch of videos of shape (batch_size, num_frames, num_channels,
    height, width) into a tensor of shape (batch_size, seq_len, hidden_size) to be consumed by a Transformer encoder.

    The seq_len (the number of patches) equals (number of frames // tubelet_size) * (height // patch_size) * (width //
    patch_size).

    """

    def __init__(self, config):
        super().__init__()
        image_size = config.image_size
        patch_size = config.patch_size
        num_channels = config.num_channels
        hidden_size = config.hidden_size
        num_frames = config.num_frames
        tubelet_size = config.tubelet_size

        image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
        patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
        self.image_size = image_size
        self.patch_size = patch_size
        self.tubelet_size = int(tubelet_size)
        num_patches = (
            (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) * (num_frames // self.tubelet_size)
        )
        self.num_channels = num_channels
        self.num_patches = num_patches
        self.projection = nn.Conv3d(
            in_channels=num_channels,
            out_channels=hidden_size,
            kernel_size=(self.tubelet_size, patch_size[0], patch_size[1]),
            stride=(self.tubelet_size, patch_size[0], patch_size[1]),
        )

    def forward(self, pixel_values):
        batch_size, num_frames, num_channels, height, width = pixel_values.shape
        if num_channels != self.num_channels:
            raise ValueError(
                "Make sure that the channel dimension of the pixel values matches the one set in the configuration."
            )
        if height != self.image_size[0] or width != self.image_size[1]:
            raise ValueError(
                f"Input image size ({height}*{width}) doesn't match model ({self.image_size[0]}*{self.image_size[1]})."
            )
        # permute to (batch_size, num_channels, num_frames, height, width)
        pixel_values = pixel_values.permute(0, 2, 1, 3, 4)
        embeddings = self.projection(pixel_values).flatten(2).transpose(1, 2)
        return embeddings


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: torch.Tensor | None,
    scaling: float | None = None,
    dropout: float = 0.0,
    **kwargs: Unpack[TransformersKwargs],
):
    if scaling is None:
        scaling = query.size(-1) ** -0.5

    # Take the dot product between the queries and keys to get the raw attention scores.
    attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling

    if attention_mask is not None:
        attn_weights = attn_weights + attention_mask

    attn_weights = nn.functional.softmax(attn_weights, dim=-1)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)

    attn_output = torch.matmul(attn_weights, value)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights


class VideoMAESelfAttention(nn.Module):
    def __init__(self, config: VideoMAEConfig) -> None:
        super().__init__()
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                f"The hidden size {config.hidden_size} is not a multiple of the number of attention "
                f"heads {config.num_attention_heads}."
            )

        self.config = config
        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size
        self.dropout_prob = config.attention_probs_dropout_prob
        self.scaling = self.attention_head_size**-0.5
        self.is_causal = False

        self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
        self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
        self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)

    def forward(
        self,
        hidden_states: torch.Tensor | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple[torch.Tensor, torch.Tensor]:
        input_shape = hidden_states.shape[:-1]
        hidden_shape = (*input_shape, -1, self.attention_head_size)

        keys = self.key(hidden_states).view(hidden_shape).transpose(1, 2)
        values = self.value(hidden_states).view(hidden_shape).transpose(1, 2)
        queries = self.query(hidden_states).view(hidden_shape).transpose(1, 2)

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        context_layer, attention_probs = attention_interface(
            self,
            queries,
            keys,
            values,
            None,
            is_causal=self.is_causal,
            scaling=self.scaling,
            dropout=0.0 if not self.training else self.dropout_prob,
        )

        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.reshape(new_context_layer_shape)

        return context_layer, attention_probs
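
# Shape walk-through of the attention path above (hidden_size=768 and 12 heads
# are the base-config defaults, used here only for illustration): an input of
# shape (batch_size, seq_len, 768) is projected and reshaped to
# (batch_size, 12, seq_len, 64) per projection, attended over, transposed back
# to (batch_size, seq_len, 12, 64) inside the attention function, and finally
# merged to (batch_size, seq_len, 768) by the `reshape(new_context_layer_shape)`
# call.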

class VideoMAESelfOutput(nn.Module):
    """
    The residual connection is defined in VideoMAELayer instead of here (as is the case with other models), due to the
    layernorm applied before each block.
    """

    def __init__(self, config: VideoMAEConfig) -> None:
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)

        return hidden_states


class VideoMAEAttention(nn.Module):
    def __init__(self, config: VideoMAEConfig) -> None:
        super().__init__()
        self.attention = VideoMAESelfAttention(config)
        self.output = VideoMAESelfOutput(config)

    def forward(
        self,
        hidden_states: torch.Tensor,
        **kwargs: Unpack[TransformersKwargs],
    ) -> torch.Tensor:
        self_attn_output, _ = self.attention(hidden_states, **kwargs)
        attention_output = self.output(self_attn_output, hidden_states)

        return attention_output


class VideoMAEIntermediate(nn.Module):
    def __init__(self, config: VideoMAEConfig) -> None:
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)

        return hidden_states


class VideoMAEOutput(nn.Module):
    def __init__(self, config: VideoMAEConfig) -> None:
        super().__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)

        hidden_states = hidden_states + input_tensor

        return hidden_states

class VideoMAELayer(GradientCheckpointingLayer):
    """This corresponds to the Block class in the timm implementation."""

    def __init__(self, config: VideoMAEConfig) -> None:
        super().__init__()
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.seq_len_dim = 1
        self.attention = VideoMAEAttention(config)
        self.intermediate = VideoMAEIntermediate(config)
        self.output = VideoMAEOutput(config)
        self.layernorm_before = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.layernorm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        **kwargs: Unpack[TransformersKwargs],
    ) -> torch.Tensor:
        # in VideoMAE, layernorm is applied before self-attention
        hidden_states_norm = self.layernorm_before(hidden_states)
        attention_output = self.attention(hidden_states_norm, **kwargs)

        # first residual connection
        hidden_states = attention_output + hidden_states

        # in VideoMAE, layernorm is also applied after self-attention
        layer_output = self.layernorm_after(hidden_states)
        layer_output = self.intermediate(layer_output)

        # second residual connection is done here
        layer_output = self.output(layer_output, hidden_states)

        return layer_output


class VideoMAEEncoder(nn.Module):
    def __init__(self, config: VideoMAEConfig) -> None:
        super().__init__()
        self.config = config
        self.layer = nn.ModuleList([VideoMAELayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states: torch.Tensor,
        **kwargs: Unpack[TransformersKwargs],
    ) -> BaseModelOutput:
        for layer_module in self.layer:
            hidden_states = layer_module(hidden_states, **kwargs)

        return BaseModelOutput(last_hidden_state=hidden_states)


@auto_docstring
class VideoMAEPreTrainedModel(PreTrainedModel):
    config: VideoMAEConfig
    base_model_prefix = "videomae"
    main_input_name = "pixel_values"
    input_modalities = "video"
    supports_gradient_checkpointing = True
    _no_split_modules = ["VideoMAEEmbeddings", "VideoMAELayer"]
    _supports_sdpa = True
    _supports_flash_attn = True
    _supports_flex_attn = True
    _supports_attention_backend = True
    _can_record_outputs = {
        "hidden_states": VideoMAELayer,
        "attentions": VideoMAESelfAttention,
    }


@auto_docstring
class VideoMAEModel(VideoMAEPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.config = config

        self.embeddings = VideoMAEEmbeddings(config)
        self.encoder = VideoMAEEncoder(config)

        if config.use_mean_pooling:
            self.layernorm = None
        else:
            self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embeddings.patch_embeddings

    @merge_with_config_defaults
    @capture_outputs
    @auto_docstring
    def forward(
        self,
        pixel_values: torch.FloatTensor,
        bool_masked_pos: torch.BoolTensor | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> BaseModelOutput:
        r"""
bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Boolean masked positions. Indicates which patches are masked (1) and which aren't (0). Each video in the
    batch must have the same number of masked patches. If `None`, then all patches are considered. Sequence
    length is `(num_frames // tubelet_size) * (image_size // patch_size) ** 2`.

Examples:

```python
>>> import torch
>>> from transformers import VideoMAEVideoProcessor, VideoMAEModel
>>> from huggingface_hub import hf_hub_download

>>> # replace this with your own video file
>>> video_path = hf_hub_download(
...     repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
... )

>>> video_processor = VideoMAEVideoProcessor.from_pretrained("MCG-NJU/videomae-base")
>>> model = VideoMAEModel.from_pretrained("MCG-NJU/videomae-base")

>>> # prepare video for the model
>>> inputs = video_processor(video_path, return_tensors="pt")

>>> # forward pass
>>> with torch.no_grad():
...     outputs = model(**inputs)

>>> last_hidden_states = outputs.last_hidden_state
>>> list(last_hidden_states.shape)
[1, 1568, 768]
        ```"""
        embedding_output = self.embeddings(pixel_values, bool_masked_pos)

        encoder_outputs: BaseModelOutput = self.encoder(embedding_output)
        sequence_output = encoder_outputs.last_hidden_state

        if self.layernorm is not None:
            sequence_output = self.layernorm(sequence_output)

        return BaseModelOutput(last_hidden_state=sequence_output)


class VideoMAEDecoder(nn.Module):
    def __init__(self, config: VideoMAEConfig):
        super().__init__()
        decoder_num_labels = config.num_channels * config.tubelet_size * config.patch_size**2

        decoder_config = deepcopy(config)
        decoder_config.hidden_size = config.decoder_hidden_size
        decoder_config.num_hidden_layers = config.decoder_num_hidden_layers
        decoder_config.num_attention_heads = config.decoder_num_attention_heads
        decoder_config.intermediate_size = config.decoder_intermediate_size
        self.decoder_layers = nn.ModuleList(
            [VideoMAELayer(decoder_config) for _ in range(config.decoder_num_hidden_layers)]
        )

        self.norm = nn.LayerNorm(config.decoder_hidden_size)
        self.head = (
            nn.Linear(config.decoder_hidden_size, decoder_num_labels) if decoder_num_labels > 0 else nn.Identity()
        )

        self.gradient_checkpointing = False
        self.config = config

    def forward(self, hidden_states: torch.Tensor, return_token_num: int) -> VideoMAEDecoderOutput:
        for layer_module in self.decoder_layers:
            hidden_states = layer_module(hidden_states)

        if return_token_num > 0:
            # only keep the tokens that correspond to masked patches
            hidden_states = hidden_states[:, -return_token_num:]

        # predictor projection
        hidden_states = self.norm(hidden_states)
        logits = self.head(hidden_states)

        return VideoMAEDecoderOutput(logits=logits)


@auto_docstring(
    custom_intro="""
    The VideoMAE Model transformer with the decoder on top for self-supervised pre-training.
    """
)
class VideoMAEForPreTraining(VideoMAEPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.config = config

        self.videomae = VideoMAEModel(config)

        self.encoder_to_decoder = nn.Linear(config.hidden_size, config.decoder_hidden_size, bias=False)
        self.mask_token = nn.Parameter(torch.zeros(1, 1, config.decoder_hidden_size))
        self.position_embeddings = get_sinusoid_encoding_table(
            self.videomae.embeddings.num_patches, config.decoder_hidden_size
        )

        self.decoder = VideoMAEDecoder(config)

        # Initialize weights and apply final processing
        self.post_init()

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        pixel_values: torch.FloatTensor,
        bool_masked_pos: torch.BoolTensor,
        **kwargs: Unpack[TransformersKwargs],
    ) -> VideoMAEForPreTrainingOutput:
        r"""
bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, sequence_length)`):
    Boolean masked positions. Indicates which patches are masked (1) and which aren't (0). Each video in the
    batch must have the same number of masked patches. Sequence length is `(num_frames // tubelet_size) *
    (image_size // patch_size) ** 2`.

Examples:
```python
>>> from transformers import AutoImageProcessor, VideoMAEForPreTraining
>>> import numpy as np
>>> import torch

>>> num_frames = 16
>>> video = list(np.random.randint(0, 256, (num_frames, 3, 224, 224)))

>>> image_processor = AutoImageProcessor.from_pretrained("MCG-NJU/videomae-base")
>>> model = VideoMAEForPreTraining.from_pretrained("MCG-NJU/videomae-base")

>>> pixel_values = image_processor(video, return_tensors="pt").pixel_values

>>> num_patches_per_frame = (model.config.image_size // model.config.patch_size) ** 2
>>> seq_length = (num_frames // model.config.tubelet_size) * num_patches_per_frame
>>> bool_masked_pos = torch.randint(0, 2, (1, seq_length)).bool()

>>> outputs = model(pixel_values, bool_masked_pos=bool_masked_pos)
>>> loss = outputs.loss
        ```"""
        outputs = self.videomae(pixel_values, bool_masked_pos=bool_masked_pos, **kwargs)

        sequence_output = outputs.last_hidden_state
        sequence_output = self.encoder_to_decoder(sequence_output)  # [batch_size, num_visible_patches, decoder_hidden_size]
        batch_size, seq_len, num_channels = sequence_output.shape

        # we don't unshuffle the correct visible token order, but shuffle the position embeddings accordingly
        if bool_masked_pos is None:
            raise ValueError("One must provide a boolean mask")
        expanded_position_embeddings = self.position_embeddings.expand(batch_size, -1, -1).type_as(pixel_values)
        expanded_position_embeddings = expanded_position_embeddings.detach().to(pixel_values.device, copy=True)
        pos_emb_visible = expanded_position_embeddings[~bool_masked_pos].reshape(batch_size, -1, num_channels)
        pos_emb_mask = expanded_position_embeddings[bool_masked_pos].reshape(batch_size, -1, num_channels)

        # [batch_size, num_patches, decoder_hidden_size]
        x_full = torch.cat([sequence_output + pos_emb_visible, self.mask_token + pos_emb_mask], dim=1)

        # [batch_size, num_masked_patches, num_channels * patch_size * patch_size]
        decoder_outputs = self.decoder(x_full, pos_emb_mask.shape[1])
        logits = decoder_outputs.logits

        loss = None
        with torch.no_grad():
            # calculate the labels to be predicted
            if self.config.num_channels != 3:
                # can't unnormalize with default means/stds
                frames = pixel_values
            else:
                # first, unnormalize the frames
                device = pixel_values.device
                dtype = pixel_values.dtype
                mean = torch.as_tensor(IMAGENET_DEFAULT_MEAN).to(device=device, dtype=dtype)[None, None, :, None, None]
                std = torch.as_tensor(IMAGENET_DEFAULT_STD).to(device=device, dtype=dtype)[None, None, :, None, None]
                frames = pixel_values * std + mean  # in [0, 1]

            batch_size, time, num_channels, height, width = frames.shape
            tubelet_size, patch_size = self.config.tubelet_size, self.config.patch_size
            if self.config.norm_pix_loss:
                # step 1: split up dimensions (time by tubelet_size, height by patch_size, width by patch_size)
                frames = frames.view(
                    batch_size,
                    time // tubelet_size,
                    tubelet_size,
                    num_channels,
                    height // patch_size,
                    patch_size,
                    width // patch_size,
                    patch_size,
                )
                # step 2: move dimensions to concatenate
                frames = frames.permute(0, 1, 4, 6, 2, 5, 7, 3).contiguous()
                # step 3: concatenate
                frames = frames.view(
                    batch_size,
                    time // tubelet_size * height // patch_size * width // patch_size,
                    tubelet_size * patch_size * patch_size,
                    num_channels,
                )
                # step 4: normalize. The authors find that the mean is about 0.48 and standard deviation is about 0.08.
                frames_norm = (frames - frames.mean(dim=-2, keepdim=True)) / (
                    frames.var(dim=-2, unbiased=True, keepdim=True).sqrt() + 1e-6
                )
                # step 5: reshape to (batch_size, T // ts * H // ps * W // ps, ts * ps * ps * C)
                videos_patch = frames_norm.view(
                    batch_size,
                    time // tubelet_size * height // patch_size * width // patch_size,
                    tubelet_size * patch_size * patch_size * num_channels,
                )
            else:
                if self.config.num_channels != 3:
                    raise ValueError(
                        "Can't unnormalize non-RGB images. Consider setting config.norm_pix_loss to False."
                    )
                # step 1: split up dimensions (time by tubelet_size, height by patch_size, width by patch_size)
                frames = frames.view(
                    batch_size,
                    time // tubelet_size,
                    tubelet_size,
                    num_channels,
                    height // patch_size,
                    patch_size,
                    width // patch_size,
                    patch_size,
                )
                # step 2: move dimensions to concatenate: (batch_size, T // ts, H // ps, W // ps, ts, ps, ps, C)
                frames = frames.permute(0, 1, 4, 6, 2, 5, 7, 3).contiguous()
                # step 3: concatenate
                videos_patch = frames.view(
                    batch_size,
                    time // tubelet_size * height // patch_size * width // patch_size,
                    tubelet_size * patch_size * patch_size * num_channels,
                )

            batch_size, _, num_channels = videos_patch.shape
            labels = videos_patch[bool_masked_pos].reshape(batch_size, -1, num_channels)

        loss_fct = MSELoss()
        loss = loss_fct(logits, labels)

        return VideoMAEForPreTrainingOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

@auto_docstring(
    custom_intro="""
    VideoMAE Model transformer with a video classification head on top (a linear layer on top of the average pooled hidden
    states of all tokens) e.g. for ImageNet.
    """
)
class VideoMAEForVideoClassification(VideoMAEPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.num_labels = config.num_labels
        self.videomae = VideoMAEModel(config)

        # Classifier head
        self.fc_norm = nn.LayerNorm(config.hidden_size) if config.use_mean_pooling else None
        self.classifier = nn.Linear(config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity()

        # Initialize weights and apply final processing
        self.post_init()

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        pixel_values: torch.Tensor | None = None,
        labels: torch.Tensor | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> ImageClassifierOutput:
        r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss). If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

Examples:

```python
>>> import torch
>>> from transformers import VideoMAEVideoProcessor, VideoMAEForVideoClassification
>>> from huggingface_hub import hf_hub_download

>>> # replace this with your own video file
>>> video_path = hf_hub_download(
...     repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
... )

>>> video_processor = VideoMAEVideoProcessor.from_pretrained("MCG-NJU/videomae-base-finetuned-kinetics")
>>> model = VideoMAEForVideoClassification.from_pretrained("MCG-NJU/videomae-base-finetuned-kinetics")

>>> inputs = video_processor(video_path, return_tensors="pt")

>>> with torch.no_grad():
...     outputs = model(**inputs)
...     logits = outputs.logits

>>> # model predicts one of the 400 Kinetics-400 classes
>>> predicted_label = logits.argmax(-1).item()
>>> print(model.config.id2label[predicted_label])
eating spaghetti
        ```"""
        outputs = self.videomae(pixel_values, **kwargs)

        sequence_output = outputs.last_hidden_state

        if self.fc_norm is not None:
            sequence_output = sequence_output.mean(1)
            sequence_output = self.fc_norm(sequence_output)
        else:
            sequence_output = sequence_output[:, 0]

        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            loss = self.loss_function(labels, logits, self.config, **kwargs)

        return ImageClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


__all__ = ["VideoMAEForPreTraining", "VideoMAEModel", "VideoMAEPreTrainedModel", "VideoMAEForVideoClassification"]