"""PyTorch ViViT model."""

from collections.abc import Callable

import torch
from torch import nn

from ... import initialization as init
from ...activations import ACT2FN
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, ImageClassifierOutput
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...utils import TransformersKwargs, auto_docstring, logging, torch_int
from ...utils.generic import can_return_tuple, merge_with_config_defaults
from ...utils.output_capturing import capture_outputs
from .configuration_vivit import VivitConfig


logger = logging.get_logger(__name__)


class VivitTubeletEmbeddings(nn.Module):
    """
    Construct Vivit Tubelet embeddings.

    This module turns a batch of videos of shape (batch_size, num_frames, num_channels, height, width) into a tensor of
    shape (batch_size, seq_len, hidden_size) to be consumed by a Transformer encoder.

    The seq_len (the number of patches) equals (number of frames // tubelet_size[0]) * (height // tubelet_size[1]) *
    (width // tubelet_size[2]).
    """
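
    # Worked example, assuming the default `VivitConfig` values (num_frames=32,
    # image_size=224, tubelet_size=[2, 16, 16]):
    #   seq_len = (32 // 2) * (224 // 16) * (224 // 16) = 16 * 14 * 14 = 3136
    # Together with the CLS token prepended by `VivitEmbeddings`, this yields the
    # 3137 positions seen in the `VivitModel.forward` example further below.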

    def __init__(self, config: VivitConfig):
        super().__init__()
        self.num_frames = config.num_frames
        self.image_size = config.image_size
        self.patch_size = config.tubelet_size
        self.num_patches = (
            (self.image_size // self.patch_size[2])
            * (self.image_size // self.patch_size[1])
            * (self.num_frames // self.patch_size[0])
        )
        self.embed_dim = config.hidden_size

        self.projection = nn.Conv3d(
            config.num_channels, config.hidden_size, kernel_size=config.tubelet_size, stride=config.tubelet_size
        )

    def forward(self, pixel_values: torch.Tensor, interpolate_pos_encoding: bool = False) -> torch.Tensor:
        batch_size, num_frames, num_channels, height, width = pixel_values.shape
        if not interpolate_pos_encoding and (height != self.image_size or width != self.image_size):
            raise ValueError(
                f"Image size ({height}*{width}) doesn't match model ({self.image_size}*{self.image_size})."
            )

        # permute to (batch_size, num_channels, num_frames, height, width)
        pixel_values = pixel_values.permute(0, 2, 1, 3, 4)

        x = self.projection(pixel_values)
        # flatten the spatio-temporal patch grid into a sequence: (batch_size, seq_len, hidden_size)
        x = x.flatten(2).transpose(1, 2)
        return x
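

# Shape walk-through of the tubelet projection above, assuming the default
# configuration and a batch of 32-frame RGB clips at 224x224 resolution:
#
#     pixel_values                         (batch_size, 32, 3, 224, 224)
#     after permute                        (batch_size, 3, 32, 224, 224)
#     after Conv3d(kernel=stride=tubelet)  (batch_size, 768, 16, 14, 14)
#     after flatten(2).transpose(1, 2)     (batch_size, 3136, 768)
#
# Because kernel_size equals stride, the Conv3d amounts to a non-overlapping
# linear projection of each 2x16x16 tubelet, the video analogue of the 2D patch
# embedding used in ViT.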


class VivitEmbeddings(nn.Module):
    """
    Vivit Embeddings.

    Creates embeddings from a video using VivitTubeletEmbeddings, adds CLS token and positional embeddings.
    """

    def __init__(self, config: VivitConfig):
        super().__init__()

        self.cls_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
        self.patch_embeddings = VivitTubeletEmbeddings(config)

        self.position_embeddings = nn.Parameter(
            torch.zeros(1, self.patch_embeddings.num_patches + 1, config.hidden_size)
        )
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        # spatial (height, width) part of the tubelet size
        self.patch_size = config.tubelet_size[1:]
        self.config = config

    def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
        """
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        """
        num_patches = embeddings.shape[1] - 1
        num_positions = self.position_embeddings.shape[1] - 1

        # always interpolate when tracing so the exported model works for dynamic input shapes
        if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
            return self.position_embeddings

        class_pos_embed = self.position_embeddings[:, :1]
        patch_pos_embed = self.position_embeddings[:, 1:]

        dim = embeddings.shape[-1]

        new_height = height // self.patch_size[0]
        new_width = width // self.patch_size[1]

        # treat the learned patch positions as a square grid and resize it bicubically
        sqrt_num_positions = torch_int(num_positions**0.5)
        patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)

        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed,
            size=(new_height, new_width),
            mode="bicubic",
            align_corners=False,
        )

        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)

        return torch.cat((class_pos_embed, patch_pos_embed), dim=1)

    def forward(self, pixel_values: torch.Tensor, interpolate_pos_encoding: bool = False) -> torch.Tensor:
        batch_size, num_frames, num_channels, height, width = pixel_values.shape
        embeddings = self.patch_embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding)

        cls_tokens = self.cls_token.tile([batch_size, 1, 1])
        embeddings = torch.cat((cls_tokens, embeddings), dim=1)

        # add positional encoding to each token
        if interpolate_pos_encoding:
            embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
        else:
            embeddings = embeddings + self.position_embeddings

        embeddings = self.dropout(embeddings)

        return embeddings


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: torch.Tensor | None,
    scaling: float | None = None,
    dropout: float = 0.0,
    **kwargs: Unpack[TransformersKwargs],
) -> tuple[torch.Tensor, torch.Tensor]:
    if scaling is None:
        scaling = query.size(-1) ** -0.5

    # scaled dot-product scores between queries and keys
    attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling
    if attention_mask is not None:
        attn_weights = attn_weights + attention_mask

    attn_weights = nn.functional.softmax(attn_weights, dim=-1)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)

    attn_output = torch.matmul(attn_weights, value)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights


class VivitSelfAttention(nn.Module):
    def __init__(self, config: VivitConfig):
        super().__init__()
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                f"The hidden size {config.hidden_size} is not a multiple of the number of attention "
                f"heads {config.num_attention_heads}."
            )

        self.config = config
        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size
        self.dropout_prob = config.attention_probs_dropout_prob
        self.scaling = self.attention_head_size**-0.5
        self.is_causal = False

        self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
        self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
        self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)

    def forward(
        self, hidden_states: torch.Tensor, **kwargs: Unpack[TransformersKwargs]
    ) -> tuple[torch.Tensor, torch.Tensor]:
        batch_size = hidden_states.shape[0]
        new_shape = batch_size, -1, self.num_attention_heads, self.attention_head_size

        # project to the multi-head (batch, heads, seq_len, head_dim) layout
        key_layer = self.key(hidden_states).view(*new_shape).transpose(1, 2)
        value_layer = self.value(hidden_states).view(*new_shape).transpose(1, 2)
        query_layer = self.query(hidden_states).view(*new_shape).transpose(1, 2)

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        context_layer, attention_probs = attention_interface(
            self,
            query_layer,
            key_layer,
            value_layer,
            None,
            is_causal=self.is_causal,
            scaling=self.scaling,
            dropout=0.0 if not self.training else self.dropout_prob,
            **kwargs,
        )

        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.reshape(new_context_layer_shape)

        return context_layer, attention_probs
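

# `eager_attention_forward` above is the fallback used when
# `config._attn_implementation == "eager"`; otherwise `VivitSelfAttention`
# looks the backend (e.g. "sdpa") up in `ALL_ATTENTION_FUNCTIONS`. All backends
# share the same calling convention: query/key/value arrive as
# (batch_size, num_heads, seq_len, head_dim), and the context comes back
# transposed to (batch_size, seq_len, num_heads, head_dim) before being
# reshaped to (batch_size, seq_len, all_head_size).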


class VivitSelfOutput(nn.Module):
    """
    The residual connection is defined in VivitLayer instead of here (as is the case with other models), due to the
    layernorm applied before each block.
    """

    def __init__(self, config: VivitConfig) -> None:
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        return hidden_states


class VivitAttention(nn.Module):
    def __init__(self, config: VivitConfig):
        super().__init__()
        self.attention = VivitSelfAttention(config)
        self.output = VivitSelfOutput(config)

    def forward(self, hidden_states: torch.Tensor, **kwargs: Unpack[TransformersKwargs]) -> torch.Tensor:
        self_attn_output, _ = self.attention(hidden_states, **kwargs)
        attention_output = self.output(self_attn_output, hidden_states)
        return attention_output


class VivitIntermediate(nn.Module):
    def __init__(self, config: VivitConfig):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)
        hidden_states = self.dropout(hidden_states)
        return hidden_states


class VivitOutput(nn.Module):
    def __init__(self, config: VivitConfig):
        super().__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        # second residual connection of the block (see VivitLayer)
        hidden_states = hidden_states + input_tensor
        return hidden_states


class VivitLayer(GradientCheckpointingLayer):
    """This corresponds to the EncoderBlock class in the scenic/vivit implementation."""

    def __init__(self, config: VivitConfig):
        super().__init__()
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.seq_len_dim = 1
        self.attention = VivitAttention(config)
        self.intermediate = VivitIntermediate(config)
        self.output = VivitOutput(config)
        self.layernorm_before = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.layernorm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # in Vivit, layernorm is applied before self-attention
        hidden_states_norm = self.layernorm_before(hidden_states)
        attention_output = self.attention(hidden_states_norm)

        # first residual connection
        hidden_states = attention_output + hidden_states

        # in Vivit, layernorm is also applied after self-attention
        layer_output = self.layernorm_after(hidden_states)
        layer_output = self.intermediate(layer_output)

        # second residual connection is done inside VivitOutput
        layer_output = self.output(layer_output, hidden_states)

        return layer_output


class VivitEncoder(nn.Module):
    def __init__(self, config: VivitConfig):
        super().__init__()
        self.config = config
        self.layer = nn.ModuleList([VivitLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    def forward(self, hidden_states: torch.Tensor) -> BaseModelOutput:
        for i, layer_module in enumerate(self.layer):
            hidden_states = layer_module(hidden_states)

        return BaseModelOutput(last_hidden_state=hidden_states)


class VivitPooler(nn.Module):
    def __init__(self, config: VivitConfig):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.activation = nn.Tanh()

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # We "pool" the model by simply taking the hidden state corresponding to the first token.
        first_token_tensor = hidden_states[:, 0]
        pooled_output = self.dense(first_token_tensor)
        pooled_output = self.activation(pooled_output)
        return pooled_output


@auto_docstring
class VivitPreTrainedModel(PreTrainedModel):
    config: VivitConfig
    base_model_prefix = "vivit"
    main_input_name = "pixel_values"
    input_modalities = "video"
    supports_gradient_checkpointing = True
    _no_split_modules = []
    _supports_sdpa = True
    _supports_flash_attn = True
    _supports_flex_attn = True
    _supports_attention_backend = True

    _can_record_outputs = {
        "hidden_states": VivitLayer,
        "attentions": VivitSelfAttention,
    }

    @torch.no_grad()
    def _init_weights(self, module):
        """Initialize the weights"""
        super()._init_weights(module)
        if isinstance(module, VivitEmbeddings):
            init.zeros_(module.cls_token)
            init.zeros_(module.position_embeddings)


@auto_docstring
class VivitModel(VivitPreTrainedModel):
    def __init__(self, config: VivitConfig, add_pooling_layer: bool = True):
        r"""
        add_pooling_layer (`bool`, *optional*, defaults to `True`):
            Whether to add a pooling layer
        """
        super().__init__(config)
        self.config = config

        self.embeddings = VivitEmbeddings(config)
        self.encoder = VivitEncoder(config)

        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.pooler = VivitPooler(config) if add_pooling_layer else None

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embeddings.patch_embeddings

    @merge_with_config_defaults
    @can_return_tuple
    @capture_outputs
    def forward(
        self,
        pixel_values: torch.FloatTensor | None = None,
        interpolate_pos_encoding: bool = False,
        **kwargs: Unpack[TransformersKwargs],
    ) -> BaseModelOutputWithPooling:
        r"""
        Examples:

        ```python
        >>> import av
        >>> import numpy as np

        >>> from transformers import VivitImageProcessor, VivitModel
        >>> from huggingface_hub import hf_hub_download

        >>> np.random.seed(0)


        >>> def read_video_pyav(container, indices):
        ...     '''
        ...     Decode the video with PyAV decoder.
        ...     Args:
        ...         container (`av.container.input.InputContainer`): PyAV container.
        ...         indices (`list[int]`): List of frame indices to decode.
        ...     Returns:
        ...         result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
        ...     '''
        ...     frames = []
        ...     container.seek(0)
        ...     start_index = indices[0]
        ...     end_index = indices[-1]
        ...     for i, frame in enumerate(container.decode(video=0)):
        ...         if i > end_index:
        ...             break
        ...         if i >= start_index and i in indices:
        ...             frames.append(frame)
        ...     return np.stack([x.to_ndarray(format="rgb24") for x in frames])


        >>> def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
        ...     '''
        ...     Sample a given number of frame indices from the video.
        ...     Args:
        ...         clip_len (`int`): Total number of frames to sample.
        ...         frame_sample_rate (`int`): Sample every n-th frame.
        ...         seg_len (`int`): Maximum allowed index of sample's last frame.
        ...     Returns:
        ...         indices (`list[int]`): List of sampled frame indices
        ...     '''
        ...     converted_len = int(clip_len * frame_sample_rate)
        ...     end_idx = np.random.randint(converted_len, seg_len)
        ...     start_idx = end_idx - converted_len
        ...     indices = np.linspace(start_idx, end_idx, num=clip_len)
        ...     indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)
        ...     return indices


        >>> # video clip consists of 300 frames (10 seconds at 30 FPS)
        >>> file_path = hf_hub_download(
        ...     repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
        ... )
        >>> container = av.open(file_path)

        >>> # sample 32 frames
        >>> indices = sample_frame_indices(clip_len=32, frame_sample_rate=1, seg_len=container.streams.video[0].frames)
        >>> video = read_video_pyav(container=container, indices=indices)

        >>> image_processor = VivitImageProcessor.from_pretrained("google/vivit-b-16x2-kinetics400")
        >>> model = VivitModel.from_pretrained("google/vivit-b-16x2-kinetics400")

        >>> # prepare video for the model
        >>> inputs = image_processor(list(video), return_tensors="pt")

        >>> # forward pass
        >>> outputs = model(**inputs)
        >>> last_hidden_states = outputs.last_hidden_state
        >>> list(last_hidden_states.shape)
        [1, 3137, 768]
        ```"""
        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        embedding_output = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding)

        encoder_outputs = self.encoder(embedding_output)
        sequence_output = encoder_outputs.last_hidden_state
        sequence_output = self.layernorm(sequence_output)
        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None

        return BaseModelOutputWithPooling(last_hidden_state=sequence_output, pooler_output=pooled_output)


@auto_docstring(
    custom_intro="""
    ViViT Transformer model with a video classification head on top (a linear layer on top of the final hidden state of the
    [CLS] token) e.g. for Kinetics-400.

    <Tip>

    Note that it's possible to fine-tune ViViT on higher resolution images than the ones it has been trained on, by
    setting `interpolate_pos_encoding` to `True` in the forward of the model. This will interpolate the pre-trained
    position embeddings to the higher resolution.

    </Tip>
    """
)
class VivitForVideoClassification(VivitPreTrainedModel):
    def __init__(self, config: VivitConfig):
        super().__init__(config)

        self.num_labels = config.num_labels
        self.vivit = VivitModel(config, add_pooling_layer=False)

        # Classifier head
        self.classifier = nn.Linear(config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity()

        # Initialize weights and apply final processing
        self.post_init()

    @can_return_tuple
    @capture_outputs
    def forward(
        self,
        pixel_values: torch.FloatTensor | None = None,
        labels: torch.LongTensor | None = None,
        interpolate_pos_encoding: bool = False,
        **kwargs: Unpack[TransformersKwargs],
    ) -> ImageClassifierOutput:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss); if
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Examples:

        ```python
        >>> import av
        >>> import numpy as np
        >>> import torch

        >>> from transformers import VivitImageProcessor, VivitForVideoClassification
        >>> from huggingface_hub import hf_hub_download

        >>> np.random.seed(0)


        >>> def read_video_pyav(container, indices):
        ...     '''
        ...     Decode the video with PyAV decoder.
        ...     Args:
        ...         container (`av.container.input.InputContainer`): PyAV container.
        ...         indices (`list[int]`): List of frame indices to decode.
        ...     Returns:
        ...         result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
        ...     '''
        ...     frames = []
        ...     container.seek(0)
        ...     start_index = indices[0]
        ...     end_index = indices[-1]
        ...     for i, frame in enumerate(container.decode(video=0)):
        ...         if i > end_index:
        ...             break
        ...         if i >= start_index and i in indices:
        ...             frames.append(frame)
        ...     return np.stack([x.to_ndarray(format="rgb24") for x in frames])


        >>> def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
        ...     '''
        ...     Sample a given number of frame indices from the video.
        ...     Args:
        ...         clip_len (`int`): Total number of frames to sample.
        ...         frame_sample_rate (`int`): Sample every n-th frame.
        ...         seg_len (`int`): Maximum allowed index of sample's last frame.
        ...     Returns:
        ...         indices (`list[int]`): List of sampled frame indices
        ...     '''
        ...     converted_len = int(clip_len * frame_sample_rate)
        ...     end_idx = np.random.randint(converted_len, seg_len)
        ...     start_idx = end_idx - converted_len
        ...     indices = np.linspace(start_idx, end_idx, num=clip_len)
        ...     indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)
        ...     return indices


        >>> # video clip consists of 300 frames (10 seconds at 30 FPS)
        >>> file_path = hf_hub_download(
        ...     repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
        ... )
        >>> container = av.open(file_path)

        >>> # sample 32 frames
        >>> indices = sample_frame_indices(clip_len=32, frame_sample_rate=4, seg_len=container.streams.video[0].frames)
        >>> video = read_video_pyav(container=container, indices=indices)

        >>> image_processor = VivitImageProcessor.from_pretrained("google/vivit-b-16x2-kinetics400")
        >>> model = VivitForVideoClassification.from_pretrained("google/vivit-b-16x2-kinetics400")

        >>> inputs = image_processor(list(video), return_tensors="pt")

        >>> with torch.no_grad():
        ...     outputs = model(**inputs)
        ...     logits = outputs.logits

        >>> # model predicts one of the 400 Kinetics-400 classes
        >>> predicted_label = logits.argmax(-1).item()
        >>> print(model.config.id2label[predicted_label])
        LABEL_116
        ```"""
        outputs = self.vivit(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding, **kwargs)

        sequence_output = outputs.last_hidden_state

        # classify from the final hidden state of the CLS token
        logits = self.classifier(sequence_output[:, 0, :])

        loss = None
        if labels is not None:
            loss = self.loss_function(labels, logits, self.config, **kwargs)

        return ImageClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


__all__ = ["VivitModel", "VivitPreTrainedModel", "VivitForVideoClassification"]