
    Z j2                        S SK Jr  S SKJr  S SKrS SKJr  SSKJr  SSK	J
r
  SSKJr  SS	KJrJr  SS
KJrJr  SSKJr  SSKJrJrJrJrJr  SSKJr  SSKJrJr  SSK J!r!  \RD                  " \#5      r$\" SS9\ " S S\5      5       5       r%\" SS9\ " S S\5      5       5       r& " S S\RN                  5      r( " S S\RN                  5      r) SMS\RN                  S\RT                  S\RT                  S\RT                  S \RT                  S-  S!\+S"\+4S# jjr,S$ r- " S% S&\RN                  5      r.SNS'\RT                  S(\+S)\/S*\RT                  4S+ jjr0 " S, S-\RN                  5      r1 " S. S/\RN                  5      r2 " S0 S1\5      r3 " S2 S3\RN                  5      r4S4\RT                  S5\5\RT                     S*\RT                  4S6 jr6 " S7 S8\RN                  5      r7 " S9 S:\RN                  5      r8 " S; S<\RN                  5      r9 " S= S>\RN                  5      r: " S? S@\5      r; " SA SB\5      r< " SC SD\RN                  5      r=\ " SE SF\5      5       r>\ " SG SH\>5      5       r?\" SIS9 " SJ SK\>5      5       r@/ SLQrAg)O    )Callable)	dataclassN)nn   )initialization)ACT2FN)GradientCheckpointingLayer)BaseModelOutputImageClassifierOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)ModelOutputTransformersKwargsauto_docstringcan_return_tuplelogging)merge_with_config_defaults)OutputRecordercapture_outputs   )VJEPA2ConfigzO
    VJEPA Predictor outputs that also contains the masked encoder outputs
    )custom_introc                       \ rS rSr% Sr\R                  \S'   Sr\R                  S-  \S'   Sr	\
\R                  S4   S-  \S'   Sr\
\R                  S4   S-  \S'   Sr\R                  S-  \S	'   S
rg)$VJEPA2WithMaskedInputPredictorOutput#   a  
masked_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*, returned when `context_mask` is provided which is applied on VJEPA2Encoder outputs):
    The masked hidden state of the model.
target_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*, returned when `target_mask` is provided which is applied on VJEPA2Encoder outputs):
    The target hidden state of the model.
last_hidden_stateNmasked_hidden_state.hidden_states
attentionstarget_hidden_state )__name__
__module____qualname____firstlineno____doc__torchFloatTensor__annotations__r   r   tupler    r!   __static_attributes__r"       {/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/vjepa2/modeling_vjepa2.pyr   r   #   s     (((48**T18:>M5**C/047>7;Je'',-4;48**T18r-   r   zs
    VJEPA outputs that also contains the masked encoder outputs
    Optionally contains the predictor outputs
    c                      ^  \ rS rSr% Sr\R                  \S'   Sr\R                  S-  \S'   Sr	\
\R                  S4   S-  \S'   Sr\
\R                  S4   S-  \S'   Sr\S-  \S	'   U 4S
 jrSrU =r$ ) VJEPA2WithMaskedInputModelOutput8   a]  
masked_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*, returned when `context_mask` is provided which is applied on VJEPA2Encoder outputs):
    The masked hidden state of the model.
predictor_output (`VJEPA2WithMaskedInputPredictorOutput`, *optional*):
    The output from the Predictor module.
r   Nr   .r   r    predictor_outputc                    > [        [        TU ]	  5       5      n[        US   [        5      (       a  US   R                  5       US'   [        U5      $ )N)listsuperto_tuple
isinstancer   r+   )selfoutput	__class__s     r.   r7   )VJEPA2WithMaskedInputModelOutput.to_tupleM   sG    eg&()fRj"FGG,,.F2JV}r-   r"   )r#   r$   r%   r&   r'   r(   r)   r*   r   r   r+   r    r2   r   r7   r,   __classcell__r;   s   @r.   r0   r0   8   s     (((48**T18:>M5**C/047>7;Je'',-4;DH:TAH r-   r0   c                      ^  \ rS rSrSr SS\S\4U 4S jjjr\S 5       r	S\
R                  S\
R                  4S	 jrS
rU =r$ )VJEPA2PatchEmbeddings3DT   z
Image to Patch Embedding
confighidden_sizec                 B  > [         TU ]  5         UR                  U l        UR                  U l        X l        [
        R                  " UR                  UUR                  UR                  UR                  4UR                  UR                  UR                  4S9U l        g )N)in_channelsout_channelskernel_sizestride)	r6   __init__
patch_sizetubelet_sizerC   r   Conv3din_chansprojr9   rB   rC   r;   s      r.   rI    VJEPA2PatchEmbeddings3D.__init__Y   s    
 	 ++"//&II$,,f.?.?ARARS''):):F<M<MN	
	r-   c                     U R                   U R                  -  U R                  U R                  -  -  U R                  U R                  -  -  $ Nframes_per_cliprK   	crop_sizerJ   rB   s    r.   num_patches#VJEPA2PatchEmbeddings3D.num_patchesj   sO     ##v':'::6#4#4466#4#446	
r-   pixel_values_videosreturnc                 f    U R                  U5      R                  S5      R                  SS5      nU$ )N   r   )rN   flatten	transpose)r9   rY   xs      r.   forwardVJEPA2PatchEmbeddings3D.forwardr   s.    II)*2215??1Er-   )rC   rJ   rN   rK      )r#   r$   r%   r&   r'   r   intrI   staticmethodrW   r(   Tensorr`   r,   r=   r>   s   @r.   r@   r@   T   s]      

 
 
" 
 
5<< ELL  r-   r@   c                   v   ^  \ rS rSrSrS
S\S\4U 4S jjjrS\R                  S\R                  4S jr
S	rU =r$ )VJEPA2Embeddingsw   6
Construct mask token, position and patch embeddings.
rB   rC   c                    > [         TU ]  5         Xl        X l        [	        XS9U l        U R
                  R                  U l        UR                  U l        g )NrC   )r6   rI   rB   rC   r@   patch_embeddingsrW   rJ   rO   s      r.   rI   VJEPA2Embeddings.__init__|   sG    & 7 X00<< ++r-   rY   rZ   c                 f   UR                   S   nUR                  SSSSS5      nX R                  R                  :  a)  UR	                  SSU R                  R                  SS5      nU R
                  R                  R                  R                  nUR                  US9nU R                  U5      nU$ )Nr   r   r\   r      )dtype)
shapepermuterB   rK   repeatrm   rN   weightrq   to)r9   rY   
num_framestarget_dtype
embeddingss        r.   r`   VJEPA2Embeddings.forward   s    (..q1
 299!Q1aH 000"5"<"<Q4;;C[C[]^`a"b,,1188>>144<4H**+>?
r-   )rB   rC   rW   rm   rJ   rb   )r#   r$   r%   r&   r'   r   rd   rI   r(   rf   r`   r,   r=   r>   s   @r.   rh   rh   w   s@    ,| ,# , ,5<< ELL  r-   rh   modulequerykeyvalueattention_maskscalingdropoutc                    [         R                  " XR                  SS5      5      U-  n[        R                  R                  US[         R                  S9R                  UR                  5      n[        R                  R                  XU R                  S9n[         R                  " X5      n	U	R                  SS5      R                  5       n	X4$ )Nr4   )dimrq   )ptrainingr   r\   )r(   matmulr^   r   
functionalsoftmaxfloat32rv   rq   r   r   
contiguous)
r{   r|   r}   r~   r   r   r   kwargsattn_weightsattn_outputs
             r.   eager_attention_forwardr      s     <<}}R'<=GL ==((2U]](SVVW\WbWbcL ==((6??([L,,|3K''1-88:K$$r-   c                    U R                  5       u  p#pE[        R                  " US-  U R                  U R                  S9nXeS-  -  nSSU-  -  nUR                  S5      U-  nUR                  5       nUR                  5       n	UR                  SSSS5      nU	R                  SSSS5      n	U R                  SS5      n
U
R                  SS	9u  p[        R                  " U* U4SS	9n
U
R                  S
5      n
X	-  X-  -   $ )Nr\   rq   deviceg       @g      ?i'  r4   r   )r4   r\   r   r   )sizer(   arangerq   r   	unsqueezesincosrt   	unflattenunbindstackr]   )r_   posB	num_headsNDomegafreqemb_sinemb_cosyy1y2s                r.   rotate_queries_or_keysr      s    A!
 LLaqwwqxx@E	WE%,E==u$D hhjGhhjGnnQ1a(GnnQ1a(G 	
B AXX"XFBbS"I2&A			"AKAK((r-   c                      ^  \ rS rSr  SS\S\S\4U 4S jjjrS rS rSS	 jr	S
 r
 SS\R                  S-  S\\R                  \R                  4   4S jjrSrU =r$ )VJEPA2RopeAttention   rB   rC   num_attention_headsc                 b  > [         TU ]  5         Xl        X l        X0l        X#-  S:w  a  [        SU4 SU S35      e[        X#-  5      U l        U R                  U R                  -  U l        [        R                  " X R                  UR                  S9U l        [        R                  " X R                  UR                  S9U l        [        R                  " X R                  UR                  S9U l        [        R                  " X"5      U l        UR                   U l        [        R$                  " U R"                  5      U l        U R                  R(                  U R                  R*                  -  U l        U R                  R.                  U R                  R0                  -  U l        [        SU R                  S-  S-  -  5      U l        [        SU R                  S-  S-  -  5      U l        [        SU R                  S-  S-  -  5      U l        U R                  S-  U l        S	U l        g )
Nr   zThe hidden size z4 is not a multiple of the number of attention heads .biasr\   r         F)r6   rI   rB   rC   r   
ValueErrorrd   attention_head_sizeall_head_sizer   Linearqkv_biasr|   r}   r~   rN   attention_probs_dropout_probdropout_probDropoutr   rU   rJ   	grid_sizerT   rK   
grid_depthd_dimh_dimw_dimr   	is_causal)r9   rB   rC   r   r;   s       r.   rI   VJEPA2RopeAttention.__init__   s    	&#6 ,1"K>"2 3,-Q0 
 $'{'H#I !558P8PPYY{,>,>V__U
99[*<*<6??SYY{,>,>V__U
IIk7	"??zz$"3"34..$++2H2HH++559Q9QQt771<BCD
t771<BCD
t771<BCD
//5r-   c                 N    [        U R                  U R                  -  5      nX-  $ rR   )rd   r   )r9   idstokens_per_frames      r.   _get_frame_pos"VJEPA2RopeAttention._get_frame_pos   s#    t~~>?&&r-   c                     [        U R                  U R                  -  5      nU R                  U5      nXU-  -
  nU R                  nX-  $ rR   )rd   r   r   )r9   r   r   	frame_idstokens_per_rows        r.   _get_height_pos#VJEPA2RopeAttention._get_height_pos   sI    t~~>?'',	y00$$r-   Nc                    UR                   nUR                  S5      nUb-  UR                  S5      R                  SU R                  S5      nO[
        R                  " XCS9n[        U R                  U R                  -  5      nU R                  U5      nU R                  nU R                  U5      n	XVU-  -
  X-  -
  n
XyU
4$ )Nr   r   )r   r   r   rt   r   r(   r   rd   r   r   r   )r9   r_   masksr   
token_sizer   r   r   r   
height_ids	width_idss              r.   get_position_ids$VJEPA2RopeAttention.get_position_ids  s    VVAY
 //!$++At/G/GKC,,z9Ct~~>?'',	))#.
 i77>;VV	i//r-   c                    Uu  p4nSn[        USXfU R                  -   24   US9nX`R                  -  n[        USXfU R                  -   24   US9nX`R                  -  n[        USXfU R                  -   24   US9n	X`R                  -  nX`R                  :  a"  USUS 24   n
[
        R                  " XxX/SS9nU$ [
        R                  " XxU	/SS9nU$ )Nr   .)r   r4   r   )r   r   r   r   r   r(   cat)r9   qkpos_idsd_maskh_maskw_masksqkdqkhqkwqkrs              r.   apply_rotary_embeddings+VJEPA2RopeAttention.apply_rotary_embeddings  s    !($RQTZZ-?(?%@fM	ZZ$RQTZZ-?(?%@fM	ZZ$RQTZZ-?(?%@fM	ZZ'''S!"W+CCc/R8B 	 Cc?3B	r-   position_maskrZ   c                 .   UR                   S S n/ UQSPU R                  P7nU R                  U5      R                  U5      R	                  SS5      nU R                  U5      R                  U5      R	                  SS5      nU R                  U5      R                  U5      R	                  SS5      nU R                  XS9nU R                  Xh5      nU R                  XX5      n[        R                  " U R                  R                  [        5      n	U	" U UUUS U R                  U R                  U R                   (       d  SOU R"                  S9u  pU
R%                  5       S S U R&                  4-   nU R)                  U
R+                  U5      5      n
X4$ )Nr4   r   r\   )r           r   r   r   r   )rr   r   r|   viewr^   r}   r~   r   r   r   get_interfacerB   _attn_implementationr   r   r   r   r   r   r   rN   reshape)r9   r   r   input_shapehidden_shapequery_layer	key_layervalue_layerr   attention_interfacecontext_layerattention_probsnew_context_layer_shapes                r.   r`   VJEPA2RopeAttention.forward(  s~   
 $))#2.CCbC$*B*BCjj/44\BLLQPQRHH]+00>HHAN	jj/44\BLLQPQR'''K00D	22;H(?(M(MKK,,.E)
 *=nnLL#}}C$2C2C	*
& #0"4"4"6s";t?Q?Q>S"S		-"7"78O"PQ--r-   )r   r   rB   r   r   r   r   r   r   rC   r   r}   r   rN   r|   r   r~   r   )rc      rR   )r#   r$   r%   r&   r   rd   rI   r   r   r   r   r(   rf   r+   r`   r,   r=   r>   s   @r.   r   r      s      #%	## # !	# #J'%0*( .2!. ||d*!. 
u||U\\)	*	!. !.r-   r   input	drop_probr   rZ   c                    US:X  d  U(       d  U $ SU-
  nU R                   S   4SU R                  S-
  -  -   nU[        R                  " X@R                  U R
                  S9-   nUR                  5         U R                  U5      U-  nU$ )z[
Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

r   r   r   r   r   )rr   ndimr(   randrq   r   floor_div)r   r   r   	keep_probrr   random_tensorr:   s          r.   	drop_pathr   M  s    
 CxII[[^

Q 77E

5ELL YYMYYy!M1FMr-   c                      ^  \ rS rSrSrSS\S-  4U 4S jjjrS\R                  S\R                  4S jr	S\
4S	 jrS
rU =r$ )VJEPA2DropPathi]  zXDrop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).Nr   c                 .   > [         TU ]  5         Xl        g rR   )r6   rI   r   )r9   r   r;   s     r.   rI   VJEPA2DropPath.__init__`  s    "r-   r   rZ   c                 B    [        XR                  U R                  5      $ rR   )r   r   r   )r9   r   s     r.   r`   VJEPA2DropPath.forwardd  s    FFr-   c                      SU R                    3$ )Nzp=r   r9   s    r.   
extra_reprVJEPA2DropPath.extra_reprg  s    DNN#$$r-   r  rR   )r#   r$   r%   r&   r'   floatrI   r(   rf   r`   strr  r,   r=   r>   s   @r.   r   r   ]  sJ    b#%$, # #GU\\ Gell G%C % %r-   r   c                   v   ^  \ rS rSrS
S\S\S\4U 4S jjjrS\R                  S\R                  4S jr
S	rU =r$ )	VJEPA2MLPik  rB   rC   	mlp_ratioc                    > [         TU ]  5         U=pE[        X#-  5      n[        R                  " XFSS9U l        [        UR                     U l        [        R                  " XeSS9U l	        g NTr   )
r6   rI   rd   r   r   fc1r   
hidden_act
activationfc2)r9   rB   rC   r  in_featuresout_featureshidden_featuresr;   s          r.   rI   VJEPA2MLP.__init__l  sY    %00k5699[E !2!2399_Fr-   hidden_staterZ   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ rR   )r  r  r  )r9   r  s     r.   r`   VJEPA2MLP.forwardt  s2    xx-|4xx-r-   )r  r  r  )rc         @)r#   r$   r%   r&   r   rd   r
  rI   r(   rf   r`   r,   r=   r>   s   @r.   r  r  k  sH    G| G# GQV G GELL U\\  r-   r  c                      ^  \ rS rSrSr    SS\S\S\S\S\4
U 4S jjjr SS
\	R                  S\	R                  S	-  S\\   S\\	R                  S4   4S jjrSrU =r$ )VJEPA2Layeri{  zCThis corresponds to the Block class in the original implementation.rB   drop_path_raterC   r   r  c                   > [         TU ]  5         Xl        X0l        X@l        XPl        [        R                  " X1R                  S9U l	        [        XU5      U l        UR                  S:  a  [        U5      O[        R                  " 5       U l        [        R                  " X1R                  S9U l        [#        XUS9U l        g )Nepsr   )rC   r  )r6   rI   rB   rC   r   r  r   	LayerNormlayer_norm_epsnorm1r   	attentionr  r   Identityr   norm2r  mlp)r9   rB   r  rC   r   r  r;   s         r.   rI   VJEPA2Layer.__init__~  s     	&#6 "\\+3H3HI
,VBUV;A;P;PSV;V7\^\g\g\i\\+3H3HI
V	Rr-   Nr   r   r   rZ   .c                     UnU R                  U5      nU R                  UUS9u  pVU R                  U5      U-   nUnU R                  U5      nU R	                  U5      nU R                  U5      U-   nX4$ )N)r   )r%  r&  r   r(  r)  )r9   r   r   r   residualattention_outputr   s          r.   r`   VJEPA2Layer.forward  s     !

=1)-' *8 *
& '788C !

=1/}5@ **r-   )	r&  rB   r   rC   r)  r  r%  r(  r   )r   rc   r   r  rR   )r#   r$   r%   r&   r'   r   r
  rd   rI   r(   rf   r   r   r+   r`   r,   r=   r>   s   @r.   r  r  {  s    M
 !$#%SS S 	S
 !S S S. .2+||+ ||d*+ +,	+
 
u||S 	!+ +r-   r  c                   l   ^  \ rS rSrS\4U 4S jjr S
S\R                  S-  S\\	   S\
4S jjrS	rU =r$ )VJEPA2Encoderi  rB   c                 X  > [         TU ]  5         Xl        [        XR                  S9U l        [        UR                  5       Vs/ s H4  nUR                  S:  a  UR                  U-  UR                  S-
  -  OSPM6     nn[        R                  " [        UR                  5       Vs/ s H0  n[        UX2   UR                  UR                  UR                  S9PM2     sn5      U l        [        R                  " UR                  UR                   S9U l        SU l        g s  snf s  snf )Nrl   r   r   r  rC   r   r  r!  F)r6   rI   rB   rh   rC   ry   rangenum_hidden_layersr  r   
ModuleListr  r   r  layerr#  r$  	layernormgradient_checkpointingr9   rB   idrop_path_ratesr;   s       r.   rI   VJEPA2Encoder.__init__  s   *6?Q?QR 6334
4 LRKcKcfgKgV""Q&&*B*BQ*FGmpp4 	 
 ]] v778	 9A #2#5 & 2 2(.(B(B$.. 9	

 f&8&8f>S>ST&+##

	s   ;D")7D'NrY   r   rZ   c                     U R                  U5      n[        U R                  5       H  u  pEU" US 40 UD6nUS   nM     U R                  U5      n[	        US9$ )Nr   r   )ry   	enumerater6  r7  r
   )r9   rY   r   r   r:  layer_modulelayer_outputss          r.   r`   VJEPA2Encoder.forward  sc    
 (;<(4OA(GGM)!,M  5 }5+
 	
r-   )rB   ry   r8  r6  r7  rR   )r#   r$   r%   r&   r   rI   r(   rf   r   r   r
   r`   r,   r=   r>   s   @r.   r0  r0    sI    ,| ,4 48
"\\D0
 +,
 
	
 
r-   r0  tensorr   c                    / nU Hi  nUR                  U R                  5      nUR                  S5      R                  SSU R	                  S5      5      nU[
        R                  " U SUS9/-  nMk     [
        R                  " USS9$ )z
Args:
    tensor (`torch.Tensor`):
        Tensor of shape [batch_size, num_patches, feature_dim]
    masks (`List[torch.Tensor]`):
        List of tensors of shape [batch_size, num_patches] containing indices of patches to keep
r4   r   r   indexr   r   )rv   r   r   rt   r   r(   gatherr   )rC  r   all_masked_tensorsmask	mask_keeps        r.   apply_masksrK    sz     wwv}}%NN2&--aFKKOD	u||FKLL 
 99'Q//r-   c                      ^  \ rS rSrSrS\4U 4S jjr\S 5       r SS\	R                  S\\	R                     S\\	R                     S	\S
\\	R                  \	R                  4   4
S jjrSrU =r$ )VJEPA2PredictorEmbeddingsi  rj   rB   c                   > [         TU ]  5         Xl        [        R                  " UR
                  UR                  5      U l        SU l        UR                  U l
        UR                  U l        [        R                  " [        R                  " U R                  SSUR                  5      5      U l        UR                   U l        Xl        g )Nr   r   )r6   rI   rB   r   r   rC   pred_hidden_sizepredictor_embeddingsnum_mask_tokenspred_zero_init_mask_tokenszero_init_mask_tokenspred_num_mask_tokens	Parameterr(   zerosmask_tokensrJ   r9   rB   r;   s     r.   rI   "VJEPA2PredictorEmbeddings.__init__  s    $&IIf.@.@&BYBY$Z! %+%F%F"%::<<D4H4H!QPVPgPg(hi ++r-   c                 "   U R                   S:  aM  U R                   U R                  -  U R                  U R                  -  -  U R                  U R                  -  -  $ U R                  U R                  -  U R                  U R                  -  -  $ Nr   rS   rV   s    r.   rW   %VJEPA2PredictorEmbeddings.num_patches  s    !!A%''6+>+>>##v'8'88:##v'8'88: $$(9(99f>N>NRXRcRc>cddr-   r   context_masktarget_mask
mask_indexrZ   c                    UR                  S5      nU R                  U5      nX@R                  -  nU R                  U   nUS   R	                  5       S-   nUR                  XXS5      n[        Xs5      nUR                  [        U5      SS5      n[        R                  " Xg/SS9n	[        R                  " USS9n
[        R                  " USS9n[        R                  " X/SS9nX4$ )z
hidden_states : encoder outputs (context)
context_mask: tokens of the context (outputs from the encoder)
target_mask: tokens to predict
mask_index: index of the target mask to choose (useful for multiclip?)
r   r   r   )
r   rP  rQ  rW  maxrt   rK  lenr(   r   )r9   r   r]  r^  r_  r   contexttargetmax_patch_numry   cmtmr   s                r.   r`   !VJEPA2PredictorEmbeddings.forward  s     q!++M:  "6"66
!!*- $A**,q0q3V1 ..\!2Aq9YY0a8
 YY|+YY{*		2(*  r-   )rB   rW  rQ  rJ   rP  rS  r   )r#   r$   r%   r&   r'   r   rI   re   rW   r(   rf   r5   rd   r+   r`   r,   r=   r>   s   @r.   rM  rM    s    |  e e &!||&! 5<<(&! %,,'	&!
 &! 
u||U\\)	*&! &!r-   rM  c            
          ^  \ rS rSrS\4U 4S jjrS rS rS\R                  S\
\R                     S\
\R                     S	\\   S
\4
S jrSrU =r$ )VJEPA2Predictori-  rB   c                   > [         TU ]  5         Xl        SU l        [	        U5      U l        [        UR                  5       Vs/ s H4  nUR                  S:  a  UR                  U-  UR                  S-
  -  OSPM6     nn[        R                  " [        UR                  5       Vs/ s H0  n[        UX2   UR                  UR                  UR                  S9PM2     sn5      U l        [        R                   " UR                  UR"                  S9U l        [        R&                  " UR                  UR(                  SS9U l        g s  snf s  snf )NFr   r   r2  r!  Tr   )r6   rI   rB   r8  rM  ry   r3  pred_num_hidden_layersr  r   r5  r  rO  pred_num_attention_headspred_mlp_ratior6  r#  r$  r7  r   rC   rN   r9  s       r.   rI   VJEPA2Predictor.__init__.  s=   &+#3F; 6889
 : 0014 %%)V-J-JQ-NO : 	 
 ]] v<<=	 >A #2#5 & 7 7(.(G(G$33 >	

 f&=&=6CXCXYIIf55v7I7IPTU	+
	s   ;E	(7Ec                 .   UR                  UR                  5      n[        R                  " USUS9nUR                  UR                  5      nUR	                  S5      R                  SSUR                  S5      5      n[        R                  " USUS9nX4$ )Nr   rE  r4   )rv   r   r(   rG  r   expandr   )r9   r   position_masksargsorthidden_states_argsorts        r.   sort_tokensVJEPA2Predictor.sort_tokensJ  s    **^223n!7K **]112 ' 1 1" 5 < <R]EWEWXZE[ \]AVW,,r-   c                     UR                  UR                  5      n[        R                  " USS9nUR	                  S5      R                  SSUR                  S5      5      n[        R                  " USUS9nU$ )Nr   r   r4   rE  )rv   r   r(   rs  r   rq  r   rG  )r9   r   rs  reverse_argsorts       r.   unsort_tokensVJEPA2Predictor.unsort_tokensV  si    **]112--Q7)33B7>>r2}GYGYZ\G]^]Qr-   encoder_hidden_statesr]  r^  r   rZ   c                    [        X5      nUR                  u  pVnU R                  XU5      u  p[        R                  " U	SS9n
U R                  XU
5      u  p[        U R                  5       H  u  pU" X40 UD6nUS   nM     U R                  U5      nU R                  X5      nUS S 2US 24   nU R                  U5      n[        US9$ )Nr   r   r   r>  )rK  rr   ry   r(   rs  ru  r?  r6  r7  ry  rN   r
   )r9   r{  r]  r^  r   _N_ctxtr   r   rr  rs  r:  r@  rA  s                 r.   r`   VJEPA2Predictor.forward]  s     !,,A P,221(,8M]h(i% --A6(,(8(8X_(`%(4OA(Q&QM)!,M  5 }5**=B%aj1		-0+
 	
r-   )rB   ry   r8  r6  r7  rN   )r#   r$   r%   r&   r   rI   ru  ry  r(   rf   r5   r   r   r
   r`   r,   r=   r>   s   @r.   rj  rj  -  sq    V| V8
-
$||
 5<<(
 %,,'	

 +,
 

 
r-   rj  c            	          ^  \ rS rSrSrS\4U 4S jjr SS\R                  S\R                  S-  S\	\R                  \R                  4   4S	 jjr
S
rU =r$ )VJEPA2PoolerSelfAttentioni~  z=Multi-headed attention from 'Attention Is All You Need' paperrB   c                    > [         TU ]  5         Xl        UR                  U l        UR
                  U l        U R                  U R                  -  U l        U R                  U R                  -  U R                  :w  a&  [        SU R                   SU R                   S35      eU R                  S-  U l	        UR                  U l        SU l        [        R                  " U R                  U R                  5      U l        [        R                  " U R                  U R                  5      U l        [        R                  " U R                  U R                  5      U l        [        R                  " U R                  U R                  5      U l        g Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).r   F)r6   rI   rB   rC   	embed_dimr   r   head_dimr   scaleattention_dropoutr   r   r   r   k_projv_projq_projout_projrX  s     r.   rI   "VJEPA2PoolerSelfAttention.__init__  s   ++33$..8==4>>)T^^;MdnnM] ^NN#2'  ]]D(
//ii?ii?ii?		$..$..Ar-   Nr   r   rZ   c                    UR                   SS n/ UQSPU R                  P7nU R                  U5      R                  U5      R	                  SS5      nU R                  U5      R                  U5      R	                  SS5      nU R                  U5      R                  U5      R	                  SS5      n[        R                  " U R                  R                  [        5      nU" U UUUUU R                  U R                  U R                  (       d  SOU R                  S9u  pU	R                   " / UQSP76 R#                  5       n	U R%                  U	5      n	X4$ )#Input shape: Batch x Time x ChannelNr4   r   r\   r   r   )rr   r  r  r   r^   r  r  r   r   rB   r   r   r   r  r   r   r   r   r  )r9   r   r   r   r   querieskeysvaluesr   r   r   s              r.   r`   !VJEPA2PoolerSelfAttention.forward  s6    $))#2.88b8$--8++m,11,?II!QO{{=)..|<FFq!L]+00>HHAN(?(M(MKK,,.E)
 %8nnJJ#}}C$,,	%
! "));;;;FFHmmK0((r-   )rB   r   r  r  r   r  r   r  r  r  r  rR   r#   r$   r%   r&   r'   r   rI   r(   rf   r+   r`   r,   r=   r>   s   @r.   r  r  ~  s^    GB| B. /3)||) t+) 
u||U\\)	*	) )r-   r  c                      ^  \ rS rSrSrS\4U 4S jjr SS\R                  S\R                  S\R                  S	\R                  S-  S
\	\R                  \R                  4   4
S jjr
SrU =r$ )VJEPA2PoolerCrossAttentioni  z_It's different from other cross-attention layers, doesn't have output projection layer (o_proj)rB   c                   > [         TU ]  5         Xl        UR                  U l        UR
                  U l        U R                  U R                  -  U l        U R                  U R                  -  U R                  :w  a&  [        SU R                   SU R                   S35      eU R                  S-  U l	        UR                  U l        SU l        [        R                  " U R                  U R                  5      U l        [        R                  " U R                  U R                  5      U l        [        R                  " U R                  U R                  5      U l        g r  )r6   rI   rB   rC   r  r   r   r  r   r  r  r   r   r   r   r  r  r  rX  s     r.   rI   #VJEPA2PoolerCrossAttention.__init__  s    ++33$..8==4>>)T^^;MdnnM] ^NN#2'  ]]D(
//ii?ii?ii?r-   Nr  r  r  r   rZ   c                    UR                   u  pVnUR                   S   nU R                  U5      nU R                  U5      nU R                  U5      nUR	                  XVU R
                  U R                  5      R                  SS5      nUR	                  XXU R
                  U R                  5      R                  SS5      nUR	                  XXU R
                  U R                  5      R                  SS5      n[        R                  " U R                  R                  [        5      n	U	" U UUUUU R                  U R                  U R                  (       d  SOU R                   S9u  pU
R#                  XVU5      R%                  5       n
X4$ )r  r   r\   r   r   )rr   r  r  r  r   r   r  r^   r   r   rB   r   r   r   r  r   r   r   r   )r9   r  r  r  r   
batch_sizeq_seq_lengthr  kv_seq_lengthr   r   r   s               r.   r`   "VJEPA2PoolerCrossAttention.forward  sF    /6mm+
)

1++g&{{4 V$,,zWaabcefgyyDNNDMMR\\]^`abZV``abdef(?(M(MKK,,.E)
 %8nnJJ#}}C$,,	%
! "))*INYY[((r-   )
rB   r   r  r  r   r  r   r  r  r  rR   r  r>   s   @r.   r  r    sz    i@| @0 /3%)%) ll%) 	%)
 t+%) 
u||U\\)	*%) %)r-   r  c                      ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  S\\R                  \R                  4   4S jr	Sr
U =r$ )	VJEPA2PoolerSelfAttentionLayeri  rB   c                 *  > [         TU ]  5         [        R                  " UR                  UR
                  S9U l        [        U5      U l        [        R                  " UR                  UR
                  S9U l	        [        XR                  S9U l        g Nr!  rl   )r6   rI   r   r#  rC   r$  layer_norm1r  	self_attnlayer_norm2r  r)  rX  s     r.   rI   'VJEPA2PoolerSelfAttentionLayer.__init__  sj    <<(:(:@U@UV26:<<(:(:@U@UVV1C1CDr-   r   r   rZ   c                     UnU R                  U5      nU R                  UUS9u  pX1-   nUnU R                  U5      nU R                  U5      nX1-   nX4$ )a"  
Args:
    hidden_states (`torch.FloatTensor`):
        Input to the layer of shape `(batch, seq_len, embed_dim)`.
    attention_mask (`torch.FloatTensor`):
        Attention mask of shape `(batch, 1, q_len, k_v_seq_len)` where padding elements are indicated by very large negative values.
)r   r   )r  r  r  r)  )r9   r   r   r,  r   s        r.   r`   &VJEPA2PoolerSelfAttentionLayer.forward  ss     !((7&*nn') '5 '
# !0 ((7/ 0**r-   )r  r  r)  r  r#   r$   r%   r&   r   rI   r(   rf   r+   r`   r,   r=   r>   s   @r.   r  r    sQ    E| E+||+ + 
u||U\\)	*	+ +r-   r  c                      ^  \ rS rSrS\4U 4S jjr SS\R                  S\R                  S\R                  S-  S\\R                  \R                  4   4S	 jjr	S
r
U =r$ )VJEPA2PoolerCrossAttentionLayeri  rB   c                 *  > [         TU ]  5         [        R                  " UR                  UR
                  S9U l        [        U5      U l        [        R                  " UR                  UR
                  S9U l	        [        XR                  S9U l        g r  )r6   rI   r   r#  rC   r$  r  r  
cross_attnr  r  r)  rX  s     r.   rI   (VJEPA2PoolerCrossAttentionLayer.__init__  sj    <<(:(:@U@UV4V<<<(:(:@U@UVV1C1CDr-   Nr  r  r   rZ   c                     UnU R                  U5      nU R                  UUUUS9tp%XB-   nUnU R                  U5      nU R                  U5      nXB-   nU/UQ7$ )Nr   )r  r  r  r)  )r9   r  r  r   r,  r   s         r.   r`   'VJEPA2PoolerCrossAttentionLayer.forward#  s     ''5&*oo)	 '6 '
#  .  ''5xx-.*l**r-   )r  r  r  r)  rR   r  r>   s   @r.   r  r    si    E| E /3	++ ll+ t+	+
 
u||U\\)	*+ +r-   r  c                   n   ^  \ rS rSrSrS\4U 4S jjrS\R                  S\R                  4S jr	Sr
U =r$ )	VJEPA2AttentivePooleri=  zAttentive PoolerrB   c                 R  > [         TU ]  5         [        R                  " [        R
                  " SSUR                  5      5      U l        [        U5      U l	        [        R                  " [        UR                  5       Vs/ s H  n[        U5      PM     sn5      U l        g s  snf r[  )r6   rI   r   rU  r(   rV  rC   query_tokensr  cross_attention_layerr5  r3  num_pooler_layersr  self_attention_layers)r9   rB   r}  r;   s      r.   rI   VJEPA2AttentivePooler.__init__@  sx    LLQ6;M;M)NO%DV%L"%']]=B6C[C[=\]=\+F3=\]&
"]s   B$r  rZ   c                     U R                    H  nU" US S9S   nM     U R                  R                  UR                  S   SS5      nU R	                  X15      S   nUR                  S5      $ )Nr  r   r   )r  r  rt   rr   r  squeeze)r9   r  r6  r  s       r.   r`   VJEPA2AttentivePooler.forwardH  sn    //E dCAFL 0##**<+=+=a+@!QG11'HK##A&&r-   )r  r  r  )r#   r$   r%   r&   r'   r   rI   r(   rf   r`   r,   r=   r>   s   @r.   r  r  =  s2    
| 
'ELL 'U\\ ' 'r-   r  c                       \ rS rSr% \\S'   SrSrSrSr	/ SQr
SrSr\" \SS	9\" \S
SS9S.r\R$                  " 5       S 5       rSrg)VJEPA2PreTrainedModeliP  rB   vjepa2rY   videoT)r  r  r  rM  zencoder.layer)
layer_namer   )rF  r  )r   r    c                    U R                   R                  n[        U[        5      (       Ga  [        R
                  " UR                  US9  [        UR                  S5       Hr  u  p4X#S-  -  n[        R
                  " UR                  R                  R                  US9  [        R
                  " UR                  R                  R                  US9  Mt     U[        UR                  5      S-   S-  -  n[        R
                  " UR                  R                  R                  R                  US9  g[        U[         5      (       aR  UR"                  (       a!  [        R$                  " UR&                  5        g[        R
                  " UR&                  US9  g[        U[(        R*                  [(        R,                  [(        R.                  45      (       aN  [        R
                  " UR                  US9  UR0                  b!  [        R$                  " UR0                  5        gg[        U[(        R2                  5      (       aA  [        R$                  " UR0                  5        [        R4                  " UR                  5        gg)zInitialize the weights)stdr   g      ?N)rB   initializer_ranger8   r  inittrunc_normal_r  r?  r  r  r  ru   r)  r  rb  r  rM  rS  zeros_rW  r   r   Conv2drL   r   r#  ones_)r9   r{   init_stdr:  r6  r  s         r.   _init_weights#VJEPA2PreTrainedModel._init_weightsd  s    ;;00f344v22A%f&B&BAFS&)""5??#;#;#B#BL""599==#7#7SA G c&">">?!CKKCv;;??CCJJPST 9::++F../""6#5#58DBIIryy ABBv}}(;{{&FKK( '--KK$JJv}}% .r-   r"   N)r#   r$   r%   r&   r   r*   base_model_prefixmain_input_nameinput_modalitiessupports_gradient_checkpointing_no_split_modules_supports_sdpa_supports_flash_attnr   r  r   _can_record_outputsr(   no_gradr  r,   r"   r-   r.   r  r  P  sm     +O&*# N'P$%8o^
 ]]_& &r-   r  c                     ^  \ rS rSrS\4U 4S jjrS\4S jr\\	" SS9\
   SS	\R                  S
\\R                     S-  S\\R                     S-  S\S\\   S\4S jj5       5       5       rS\R                  4S jrSrU =r$ )VJEPA2Modeli  rB   c                    > [         TU ]  U5        Xl        [        U5      U l        [        U5      U l        U R                  5         g rR   )r6   rI   rB   r0  encoderrj  	predictor	post_initrX  s     r.   rI   VJEPA2Model.__init__  s9     $V,(0 	r-   rZ   c                 B    U R                   R                  R                  $ rR   )r  ry   rm   r  s    r.   get_input_embeddings VJEPA2Model.get_input_embeddings  s    ||&&777r-   F)tie_last_hidden_statesNrY   r]  r^  skip_predictorr   c                    Uc  [        S5      eU R                  " S
SU0UD6nUR                  nUc  Uc  UR                  S5      nUR                  S5      n	[        R
                  " XR                  S9R                  S5      R                  US45      /n[        R
                  " XR                  S9R                  S5      R                  US45      /nU(       dJ  U R                  " S
UUUS.UD6n
[        U
R                  [        Xs5      U
R                  U
R                  S9nOSn[        U[        Xr5      UR                  UR                  US	9nU$ )a"  
context_mask (`torch.Tensor` with shape `[batch_size, patch_size, 1]`, *optional*):
    The mask position ids indicating which encoder output patches are going to be exposed to the predictor.
    By default, this mask is created as torch.arange(N).unsqueeze(0).repeat(B,1), indicating full context
    available to the predictor.
target_mask (`torch.Tensor` with shape `[batch_size, patch_size, 1]`, *optional*):
    The mask position ids indicating which encoder output patches are going to be used as a prediction target
    for the predictor. By default, this mask is created as torch.arange(N).unsqueeze(0).repeat(B,1), indicating
    that the predictor should predict all encoder patches.
skip_predictor (bool):
    flag to skip the predictor forward, useful if you just need the encoder outputs
Nz'You have to specify pixel_values_videosrY   r   r   r   )r{  r]  r^  )r   r!   r   r    )r   r   r   r    r2   r"   )r   r  r   r   r(   r   r   r   rt   r  r   rK  r   r    r0   )r9   rY   r]  r^  r  r   encoder_outputssequence_outputr   r   predictor_outputsr2   encoder_outputs                r.   r`   VJEPA2Model.forward  sk   . &FGG+/<< ,
 3,
,
 *;;K$7#((+A$$Q'A!LL3M3MNXXYZ[bbdeghcijkL <<2L2LMWWXYZaacdfgbhijK15 2&5)'2 	2  D"3"E"E$/$M/==,77	   $9- +O J)77&11-
 r-   c                 :    U R                  USS9nUR                  $ )NT)r  )r`   r   )r9   rY   r  s      r.   get_vision_featuresVJEPA2Model.get_vision_features  s!    &9$O///r-   )rB   r  r  )NNF)r#   r$   r%   r&   r   rI   r@   r  r   r   r   r(   rf   r5   boolr   r   r0   r`   r  r,   r=   r>   s   @r.   r  r    s    | 8&= 8  E2 3715$;"\\; 5<<(4/; %,,'$.	;
 ; +,; 
*;  3  ;z0%,, 0 0r-   r  z}
    V-JEPA 2 Model transformer with a video classification head on top (a linear layer on top of the attentive pooler).
    c                      ^  \ rS rSrS\4U 4S jjr\\ SS\R                  S\R                  S-  S\
\   S\\-  4S	 jj5       5       rS
rU =r$ )VJEPA2ForVideoClassificationi  rB   c                   > [         TU ]  U5        UR                  U l        [        U5      U l        [        U5      U l        [        R                  " UR                  UR                  SS9U l
        U R                  5         g r  )r6   rI   
num_labelsr  r  r  poolerr   r   rC   
classifierr  rX  s     r.   rI   %VJEPA2ForVideoClassification.__init__  sd      ++!&) ,F3))F$6$68I8IPTU 	r-   NrY   labelsr   rZ   c                    U R                   " SUSS.UD6nUR                  nU R                  U5      nU R                  U5      nSnUb  U R	                  XrU R
                  S9n[        UUUR                  UR                  S9$ )a  
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

Examples:

```python
>>> import torch
>>> import numpy as np
>>> from transformers import AutoVideoProcessor, VJEPA2ForVideoClassification

>>> device = "cuda"

>>> video_processor = AutoVideoProcessor.from_pretrained("facebook/vjepa2-vitl-fpc16-256-ssv2")
>>> model = VJEPA2ForVideoClassification.from_pretrained("facebook/vjepa2-vitl-fpc16-256-ssv2").to(device)

>>> video = np.ones((64, 256, 256, 3))  # 64 frames, 256x256 RGB
>>> inputs = video_processor(video, return_tensors="pt").to(device)

>>> # For inference
>>> with torch.no_grad():
...     outputs = model(**inputs)
>>> logits = outputs.logits

>>> predicted_label = logits.argmax(-1).item()
>>> print(model.config.id2label[predicted_label])

>>> # For training
>>> labels = torch.ones(1, dtype=torch.long, device=device)
>>> loss = model(**inputs, labels=labels).loss

```T)rY   r  N)pooled_logitsr  rB   )losslogitsr   r    r"   )	r  r   r  r  loss_functionrB   r   r   r    )	r9   rY   r  r   outputsr   pooler_outputr  r  s	            r.   r`   $VJEPA2ForVideoClassification.forward  s    V ++ 
 3
 
 $55$56/%%FRVR]R]%^D$!//))	
 	
r-   )r  r  r  r  rR   )r#   r$   r%   r&   r   rI   r   r   r(   rf   r   r   r+   r   r`   r,   r=   r>   s   @r.   r  r    so    |   '+<
"\\<
 t#<
 +,	<

 
&	&<
  <
r-   r  )r  r  r  )r   )r   F)Bcollections.abcr   dataclassesr   r(   r    r   r  activationsr   modeling_layersr	   modeling_outputsr
   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   utils.genericr   utils.output_capturingr   r   configuration_vjepa2r   
get_loggerr#   loggerr   r0   Moduler@   rh   rf   r
  r   r   r   r  r   r   r  r  r0  r5   rK  rM  rj  r  r  r  r  r  r  r  r  __all__r"   r-   r.   <module>r
     s   % !   & ! 9 F F & _ _ 7 E . 
		H	% 
 9; 9 9  {  * bii  Fryy T %II%<<% 
% <<	%
 LL4'% % %4)6z.")) z.|U\\ e T V[VbVb  %RYY %		  -+, -+`(
BII (
V0 0T%,,-? 0ELL 0"C!		 C!LN
bii N
b5)		 5)p=) =)B!+%? !+H+&@ +D'BII '& +&O +& +&\ P0' P0 P0f 
L
#8 L

L
^ Sr-   