
    Z j5                     z   S r SSKrSSKJr  SSKrSSKJr  SSKJr  SSK	J
r
  SSKJr  SS	KJr  SS
KJrJrJr  SSKJr  SSKJrJr  SSKJr  \R4                  " \5      r\\ " S S\5      5       5       r " S S\R<                  5      r " S S\R<                  5      r  " S S\R<                  5      r! " S S\R<                  5      r" " S S\R<                  5      r# " S S\R<                  5      r$ " S S\R<                  5      r% " S S \5      r& " S! S"\R<                  5      r' " S# S$\R<                  5      r(\ " S% S&\5      5       r) " S' S(\R<                  5      r* " S) S*\R<                  5      r+\*\+S+.r,\" S,S-9 " S. S/\)5      5       r- " S0 S1\R<                  5      r.\" S2S-9 " S3 S4\)5      5       r// S5Qr0g)6zPyTorch TVP Model    N)	dataclass)nn   )initialization)ACT2FN)load_backbone)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPoolingModelOutput)PreTrainedModel)auto_docstringlogging   )	TvpConfigc                       \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Sr
\\R                  S4   S-  \S'   Sr\\R                  S4   S-  \S'   S	rg)
TvpVideoGroundingOutput#   a\  
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
    Temporal-Distance IoU loss for video grounding.
logits (`torch.FloatTensor` of shape `(batch_size, 2)`):
    Contains start_time/duration and end_time/duration. It is the time slot of the videos corresponding to the
    input texts.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
    Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
    sequence_length)`.
Nlosslogits.hidden_states
attentions )__name__
__module____qualname____firstlineno____doc__r   torchFloatTensor__annotations__r   r   tupler   __static_attributes__r       u/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/tvp/modeling_tvp.pyr   r   #   sq    	 &*D%

d
")'+FE$+:>M5**C/047>7;Je'',-4;r$   r   c                   D   ^  \ rS rSrSrU 4S jrS rS rS rS r	Sr
U =r$ )	TvpLoss7   ab  
This class computes the losses for `TvpForVideoGrounding`. The process happens in two steps: 1) we compute
hungarian assignment between ground truth boxes and the outputs of the model 2) we supervise each pair of matched
ground-truth / prediction (supervise class and box).

Args:
    losses (`list[str]`):
        List of all the losses to be applied.
c                    > [         TU ]  5         U R                  U R                  U R                  S.U l        U H!  nX R
                  ;  d  M  [        SU S35      e   Xl        g )NioudistancedurationzLoss z not supported)super__init__loss_iouloss_distanceloss_durationloss_map
ValueErrorlosses)selfr5   r   	__class__s      r%   r/   TvpLoss.__init__B   sa    ==****

 D==( 5n!=>>  r$   c                     [         R                  " XB5      [         R                  " X15      -
  n[         R                  " XB5      [         R                  " X15      -
  nSUR                  SS9U-  -
  nU$ )z&
Measure the intersection over union.
r   r   min)r   r;   maxclamp)	r6   
start_timeend_timecandidates_start_timecandidates_end_timer-   interunionr+   s	            r%   r0   TvpLoss.loss_iouO   s_     		-8599EZ;gg		-8599EZ;gg%++!+$u,,
r$   c                 P   [         R                  " [         R                  " X45      S5      n[         R                  " [         R                  " X5      S5      n[         R                  " [         R                  " Xg5      [         R                  " Xg5      -
  U5      R                  SS9nU$ )z%
Measure the distance of mid points.
g       @g?r:   )r   divaddr<   r;   r=   )	r6   r>   r?   r@   rA   r-   mid_candidatesmid_groundtruthdistance_diffs	            r%   r1   TvpLoss.loss_distanceY   sy     599-B#XZ]^))EIIj$CSI		IIn6>9ccem

%C%. 	 r$   c                     [         R                  " XC5      n[         R                  " X!5      n[         R                  " [         R                  " [         R                  " Xg5      U5      5      nUR	                  SS9nU$ )z%
Measure the difference of duration.
g?r:   )r   subsquarerF   r=   )	r6   r>   r?   r@   rA   r-   duration_candidatesduration_groundtruthduration_diffs	            r%   r2   TvpLoss.loss_duratione   s`     $ii(;S$yy>UYYuyy9L/cem%no%+++4r$   c                    Uu  p4n[         R                  " X5      nUSS2S4   R                  5       USS2S4   R                  5       p0 n	U R                   H*  n
U	R	                  XR
                  U
   " XEXxU5      05        M,     U	$ )a5  
This performs the loss computation.

Args:
    logits (`torch.FloatTensor`):
        The output logits of head module.
    labels (`list[torch.FloatTensor]`):
        List of tensors ([start, end, duration]), which contains start time, end time of the video corresponding to the text, and also the duration.
Nr   r   )r   mulfloatr5   updater3   )r6   r   labelsr-   r>   r?   
candidatesr@   rA   losses_dictr   s              r%   forwardTvpLoss.forwardp   s     *0&hYYv0
5?15E5K5K5MzZ[]^Z^O_OeOeOg2KKD}}T*:AVmuvw  
 r$   )r3   r5   )r   r   r   r   r   r/   r0   r1   r2   rZ   r#   __classcell__r7   s   @r%   r'   r'   7   s&    
	 r$   r'   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )TvpVisionModel   c           
        > [         TU ]  5         [        U5      U l        UR                  b  UR                  R
                  S   nO[        U R                  S5      (       aI  [        U R                  R                  S5      (       a$  U R                  R                  R
                  S   nOl[        U R                  S5      (       aF  [        U R                  R                  S5      (       a!  U R                  R                  R                  nO[        S5      e[        R                  " UUR                  SSSSSS	9U l        g )
Nconfighidden_sizeshidden_sizezBackbone config not foundr   r   F)kernel_sizestridepaddinggroupsbias)r.   r/   r   backbonebackbone_configrd   hasattrrc   re   r4   r   Conv2dgrid_encoder_conv)r6   rc   in_channelsr7   s      r%   r/   TvpVisionModel.__init__   s    %f-!!- 00==bAKT]]H--'$--:N:NP^2_2_--..;;B?KT]]H--'$--:N:NP]2^2^--..::K899!#"
r$   c                    UR                   u  p#pEnUR                  X#-  XEU5      nU R                  U5      S   S   nU R                  U5      n[        R
                  R                  USSS9n[        R
                  R                  USS9nUR                   SS  u  pnUR                  X#XU5      nUR                  SSS	S
S5      nU$ )Nfeature_mapsr      )rf   rg   T)inplacer   r      )	shapeviewrk   ro   r   
functional
max_pool2drelupermute)r6   pixel_values
batch_size
num_framesnum_channelsheightwidthgrid_feat_outputsgridnew_channel
new_height	new_widths               r%   rZ   TvpVisionModel.forward   s    >J>P>P;
e#(()@,X]^ MM,7GJ%%&78}}''!A'F}}!!$!5-1ZZ_*yy)T||Aq!Q*r$   )rk   ro   r   r   r   r   r/   rZ   r#   r\   r]   s   @r%   r_   r_      s    
. r$   r_   c                      ^  \ rS rSrSrU 4S jrS\R                  S\S\S\R                  4S jr	SS	\
4S
 jjrSS	\
4S jjrSrU =r$ )TvpVisualInputEmbedding   z3
Takes input of both image and video (multi-frame)
c                 x  > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        R                  " UR                  UR
                  5      U l        [        R                  " UR                  UR
                  5      U l
        [        R                  " SUR
                  5      U l        [        R                  " UR
                  UR                  S9U l        [        R                  " UR                   5      U l        UR                  U l        UR                  U l	        g )Nr   eps)r.   r/   r   	Embeddingmax_position_embeddingsre   position_embeddings max_grid_row_position_embeddingsrow_position_embeddings max_grid_col_position_embeddingscol_position_embeddingstoken_type_embeddings	LayerNormlayer_norm_eps
layer_normDropouthidden_dropout_probdropoutr6   rc   r7   s     r%   r/    TvpVisualInputEmbedding.__init__   s    #%<<0N0NPVPbPb#c ')||F4[4[]c]o]o'p$')||F4[4[]c]o]o'p$%'\\!V5G5G%H",,v'9'9v?T?TUzz&"<"<=060W0W-060W0W-r$   	embeddingr   r   returnc                    S=pEX R                   :  a  X R                   -  nX0R                  :  a  X0R                  -  nUR                  SSSS5      n[        R                  R                  UXE4SSS9nUR                  SSSS5      nU$ )z
This method allows to interpolate the pre-trained pad weights , to be able to use the model on collection of high
resolution images (high resolution videos).

r   r   r   rt   bicubicFscale_factormodealign_corners)r   r   r}   r   rz   interpolate)r6   r   r   r   h0w0s         r%   interpolate_pos_encoding0TvpVisualInputEmbedding.interpolate_pos_encoding   s     999???B888>>>B%%aAq1	MM--	 . 
	 %%aAq1	r$   r   c                    UR                   u  p4pV[        U R                  U5      n[        R                  " U[        R
                  UR                  S9nU R                  U5      n	S[        UR                   5      S-
  -  USU4-   n
U	R                  " U
6 n	[        U R                  U5      n[        R                  " U[        R
                  UR                  S9nU R                  U5      nUSX4nUR                  " U6 nX-   nU(       a4  X@R                  :  d  XPR                  :  a  XR                  XU5      -   nU$ X-   nU$ )a.  
Args:
    grid: (batch_size, height, width, hidden_dim)
    interpolate_pos_encoding: (`bool`, *optional*, defaults to `False`):
        Whether to interpolate the pre-trained position encodings.
Returns:
    grid + col_position_embeddings.view(*col_shape): (batch_size, *, height, width, hidden_dim)
dtypedevice)r   r   r   )rx   r;   r   r   arangelongr   r   lenry   r   r   r   )r6   r   r   r   r   r   
hidden_dim
row_heightrow_position_idsr   	row_shape	row_widthcol_position_idsr   	col_shapepositional_embeddingss                   r%   add_2d_positional_embeddings4TvpVisualInputEmbedding.add_2d_positional_embeddings   s7    15

-
E >>G
 <<
%**T[[Y"&">">?O"PC

Oa/0J:3NN	"9">">	"J ==uE	 <<	DKKX"&">">?O"PI:	"9">">	"J 7 Q $:::eFkFk>k778MW\]]D  /Dr$   c                 x   UR                   u  p4pVnUR                  S5      nU R                  XS9nUR                  USU5      nUR                   SS n	UR                  n
[
        R                  " U	[
        R                  U
S9nU R                  U5      nX-   nU R                  U5      nU R                  U5      nU$ )a  
Args:
    grid: Array of shape (batch_size, num_frames, height, width, num_channels).
        It contains processed frames extracted from videos, and is generated by Tvp image preprocessor. Note,
        num_frames can be 1
    interpolate_pos_encoding: (bool, *optional*, defaults to `False`):
        Whether to interpolate the pre-trained position encodings.

Returns:
    embeddings: The embedding of grid with size (batch_size, height*width, num_channels)

r   r   rb   Nr   )rx   meanr   ry   r   r   zerosr   r   r   r   )r6   r   r   r   r   r   r   r   visual_tokensvisual_tokens_shaper   token_type_idsr   
embeddingss                 r%   rZ   TvpVisualInputEmbedding.forward   s     ?Cjj;
|yy|000i		*b,?+11#26%% %8

SYZ $ : :> J":
__Z0
\\*-
r$   )r   r   r   r   r   r   r   r   F)r   r   r   r   r   r/   r   Tensorintr   boolr   rZ   r#   r\   r]   s   @r%   r   r      sY    
X%,,  TW \a\h\h .'4 'Rd  r$   r   c                   6   ^  \ rS rSrSrU 4S jrSS jrSrU =r$ )TvpTextInputEmbeddingsi   zGConstruct the embeddings from word, position and token_type embeddings.c                   > [         TU ]  5         [        R                  " UR                  UR
                  UR                  S9U l        [        R                  " UR                  UR
                  5      U l	        [        R                  " UR                  UR
                  5      U l        [        R                  " UR
                  UR                  S9U l        [        R                  " UR                   5      U l        g )N)padding_idxr   )r.   r/   r   r   
vocab_sizere   pad_token_idword_embeddingsr   r   type_vocab_sizer   r   r   r   r   r   r   r   s     r%   r/   TvpTextInputEmbeddings.__init__#  s    !||F,=,=v?Q?Q_e_r_rs#%<<0N0NPVPbPb#c %'\\&2H2H&J\J\%]",,v'9'9v?T?TUzz&"<"<=r$   c                 .   Ub  UR                  5       nOUR                  5       S S nUS   nUb  UR                  OUR                  nUcD  [        R                  " U[        R                  US9nUR                  S5      R                  U5      nUc$  [        R                  " U[        R                  US9nUc  U R                  U5      nU R                  U5      nU R                  U5      n	XH-   U	-   n
U R                  U
5      n
U R                  U
5      n
U
$ )Nrb   r   r   r   )sizer   r   r   r   	unsqueezeexpandr   r   r   r   r   r   )r6   	input_idsr   position_idsinputs_embedsinput_shape
seq_lengthr   r   r   r   s              r%   rZ   TvpTextInputEmbeddings.forward+  s    #..*K',,.s3K ^
%.%:!!@T@T <<
%**VTL'11!4;;KHL!"[[EJJvVN  00;M"66|D $ : :> J"8;PP
__Z0
\\*-
r$   )r   r   r   r   r   NNNN	r   r   r   r   r   r/   rZ   r#   r\   r]   s   @r%   r   r      s    Q> r$   r   c                   n   ^  \ rS rSrU 4S jrS\R                  S\S\4S jr  SS\	S-  4S	 jjr
S
rU =r$ )TvpAttentioniD  c                   > [         TU ]  5         UR                  UR                  -  S:w  a6  [	        US5      (       d%  [        SUR                   SUR                   35      eUR                  U l        [        UR                  UR                  -  5      U l        U R                  U R                  -  U l        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  5      U l        [        R                  " UR                  UR                  5      U l        [        R$                  " UR                  UR&                  S9U l        [        R                  " UR*                  5      U l        g )Nr   embedding_sizezThe hidden size z4 is not a multiple of the number of attention heads r   )r.   r/   re   num_attention_headsrm   r4   r   attention_head_sizeall_head_sizer   Linearquerykeyvaluer   attention_probs_dropout_probattn_dropoutdenser   r   r   r   r   r   s     r%   r/   TvpAttention.__init__E  s{    : ::a?PVXhHiHi"6#5#5"66jkq  lF  lF  kG  H  $*#=#= #&v'9'9F<V<V'V#W !558P8PPYYv1143E3EF
99V//1C1CDYYv1143E3EF
JJv'J'JKYYv1163E3EF
,,v'9'9v?T?TUzz&"<"<=r$   tensorsequence_lengthr   c                     UR                  X2U R                  U R                  5      R                  SS5      R	                  5       $ )Nr   rt   )ry   r   r   	transpose
contiguous)r6   r   r   r   s       r%   _reshapeTvpAttention._reshapeY  s5    KK
T5M5MtOgOghYq!_Z\	
r$   Noutput_attentionsc                 $   UR                   S S u  pEU R                  U5      nU R                  U5      nU R                  U5      nU R	                  XeU5      n	U R	                  XuU5      n
U R	                  XU5      n[
        R                  " XR                  SS5      5      nU[        R                  " U R                  5      -  nUb  X-   n[        R                  R                  USS9nU R                  U5      n[
        R                  " X5      nUR                  SS5      R                  5       nUR!                  XEU R"                  5      nU R%                  U5      nU R'                  U5      nU R)                  X-   5      nU(       a  X4nU$ U4nU$ )Nrt   rb   dimr   )rx   r   r   r   r   r   matmulr   mathsqrtr   r   rz   softmaxr   r   reshaper   r   r   r   )r6   r   attention_maskr   r   r   mixed_query_layermixed_key_layermixed_value_layerquery_layer	key_layervalue_layerattention_scoresattention_probsattn_outputoutputss                   r%   rZ   TvpAttention.forward`  sy    '4&9&9"1&=#
 JJ}5((=1 JJ}5mm$5
SMM/JO	mm$5
S !<<5H5HR5PQ+dii8P8P.QQ%/@ --//0@b/I ++O<ll?@!++Aq1<<>!))*tGYGYZjj-ll;/ook&AB4E;0 MX>r$   )
r   r   r   r   r   r   r   r   r   r   NN)r   r   r   r   r/   r   r   r   r   r   rZ   r#   r\   r]   s   @r%   r   r   D  sF    >(
u|| 
c 
s 
 )-	&  $;	& &r$   r   c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )TvpIntermediatei  c                   > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        UR                  [        5      (       a  [        UR                     U l        g UR                  U l        g N)r.   r/   r   r   re   intermediate_sizer   
isinstance
hidden_actstrr   intermediate_act_fnr   s     r%   r/   TvpIntermediate.__init__  s`    YYv1163K3KL
f''--'-f.?.?'@D$'-'8'8D$r$   r   r   c                 J    U R                  U5      nU R                  U5      nU$ r  r   r  )r6   r   s     r%   rZ   TvpIntermediate.forward  s&    

=100?r$   r  
r   r   r   r   r/   r   r   rZ   r#   r\   r]   s   @r%   r  r    s(    9U\\ ell  r$   r  c                   z   ^  \ rS rSrU 4S jrS\R                  S\R                  S\R                  4S jrSrU =r	$ )TvpOutputLayeri  c                 (  > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        R                  " UR
                  UR                  S9U l	        [        R                  " UR                  5      U l        g )Nr   )r.   r/   r   r   r  re   r   r   r   r   r   r   r   r   s     r%   r/   TvpOutputLayer.__init__  s`    YYv779K9KL
,,v'9'9v?T?TUzz&"<"<=r$   r   input_tensorr   c                 p    U R                  U5      nU R                  U5      nU R                  X-   5      nU$ r  r   r   r   )r6   r   r   s      r%   rZ   TvpOutputLayer.forward  s5    

=1]3(DEr$   r"  r  r]   s   @r%   r  r    s6    >U\\  RWR^R^  r$   r  c                   D   ^  \ rS rSrU 4S jr  SS\S-  4S jjrSrU =r$ )TvpEncodeLayeri  c                    > [         TU ]  5         [        U5      U l        [	        U5      U l        [        U5      U l        g r  )r.   r/   r   	attentionr  intermediater  outputr   s     r%   r/   TvpEncodeLayer.__init__  s3    %f-+F3$V,r$   Nr   c                     U R                  UUUS9nUS   nUSS  nU R                  U5      nU R                  Xu5      nU4U-   nU$ )N)r   r   r   r'  r(  r)  )	r6   r   r  r   self_attention_outputsattention_outputr  intermediate_outputlayer_outputs	            r%   rZ   TvpEncodeLayer.forward  sl     "&/ "0 "

 2!4(,"//0@A{{#6I/G+r$   r,  r  )	r   r   r   r   r/   r   rZ   r#   r\   r]   s   @r%   r%  r%    s(    - )-	  $;	 r$   r%  c            
       f   ^  \ rS rSrU 4S jr    S
S\S-  S\S-  S\S-  S\\-  4S jjrS	r	U =r
$ )
TvpEncoderi  c                    > [         TU ]  5         Xl        [        R                  " [        UR                  5       Vs/ s H  n[        U5      PM     sn5      U l        SU l	        g s  snf )NF)
r.   r/   rc   r   
ModuleListrangenum_hidden_layersr%  layergradient_checkpointing)r6   rc   _r7   s      r%   r/   TvpEncoder.__init__  sR    ]]E&JbJbDc#dDcqN6$:Dc#de
&+# $es   A&Nr   output_hidden_statesreturn_dictr   c                    Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nSnSn[	        U R
                  5       H0  u  pU(       a  Xa4-   nU	" XU5      n
U
S   nU(       d  M(  XzS   4-   nM2     U(       a  Xa4-   nU(       d  U4nU(       a  X4-   nU(       a  X4-   nU$ [        UU(       a  UOS U(       a  US9$ S S9$ )Nr   r   r   )last_hidden_stater   r   )rc   r=  r   r<  	enumerater8  r
   )r6   r   r  r   r<  r=  all_hidden_statesall_attentionsilayer_modulelayer_outputsr  s               r%   rZ   TvpEncoder.forward  s    &1%<k$++BYBY1B1N-TXT_T_TqTq$8$D $++JjJj 	 (4OA#$58H$H!(HYZM)!,M  !/3C2E!E  5   14D D$&G#!$88 !$55N+/C+):~
 	
 AE
 	
r$   )rc   r9  r8  r   )r   r   r   r   r/   r   r"   r
   rZ   r#   r\   r]   s   @r%   r3  r3    sY    , )-,0#'*
  $;	*

 #Tk*
 D[*
 
	 *
 *
r$   r3  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )	TvpPooleri  c                    > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " 5       U l        g r  )r.   r/   r   r   re   r   Tanh
activationr   s     r%   r/   TvpPooler.__init__  s9    YYv1163E3EF
'')r$   r   r   c                 \    US S 2S4   nU R                  U5      nU R                  U5      nU$ )Nr   )r   rK  )r6   r   first_token_tensorpooled_outputs       r%   rZ   TvpPooler.forward  s6     +1a40

#566r$   )rK  r   r  r]   s   @r%   rH  rH    s(    $
U\\ ell  r$   rH  c                   v    \ rS rSr% \\S'   SrSrSr\	R                  " 5       S\R                  4S j5       rSrg	)
TvpPreTrainedModeli  rc   model)videotextTmodulec                    [        U[        R                  [        R                  45      (       a6  [        R
                  " UR                  SU R                  R                  S9  GO[        U[        R                  5      (       aA  [        R                  " UR                  5        [        R                  " UR                  5        O[        U[        R                  5      (       aO  [        R                  " UR                  SSS9  UR                  b!  [        R                  " UR                  S5        O5[        U[         5      (       a   [        R
                  " UR"                  5        [        U[        R                  5      (       a-  UR                  b   [        R                  " UR                  5        [%        US5      (       a   [        R
                  " UR&                  5        [%        US	5      (       a   [        R
                  " UR(                  5        [%        US
5      (       a   [        R
                  " UR*                  5        [%        US5      (       a!  [        R
                  " UR,                  5        gg)zInitialize the weights        )r   stdfan_outr|   )r   nonlinearityNr   pad_uppad_downpad_left	pad_right)r  r   r   r   initnormal_weightrc   initializer_ranger   zeros_rj   ones_rn   kaiming_normal_	constant_TvpModeltext_promptrm   r\  r]  r^  r_  )r6   rV  s     r%   _init_weights TvpPreTrainedModel._init_weights  su    fryy",,788LLSdkk6S6ST--KK$JJv}}%		**  YVT{{&v{{A.))LL++,fbii((V[[-DKK$68$$LL'6:&&LL)6:&&LL)6;''LL))* (r$   r   N)r   r   r   r   r   r!   base_model_prefixinput_modalitiessupports_gradient_checkpointingr   no_gradr   Modulerj  r#   r   r$   r%   rR  rR    s=    (&*#
]]_+BII + +r$   rR  c                   2   ^  \ rS rSrSrU 4S jrS rSrU =r$ )TvpFrameDownPadPrompteri'  z6
Pad frames extracted from videos only at the bottom.
c           	        > UR                   S;  a  [        S5      e[        TU ]  5         UR                  U l        UR
                  U l        UR                  U l        UR                   U l         [        R                  " [        R                  " SUR
                  SUR                  UR                  /5      5      U l        g )NrG   replaceremove9`visual_prompter_apply` must be in (add, replace, remove)r   r   )visual_prompter_applyr4   r.   r/   visual_prompt_size	frame_nummax_img_sizer   	Parameterr   randnr]  r   s     r%   r/    TvpFrameDownPadPrompter.__init__,  s    ''/KKXYY"(";";))"//%+%A%A"KKF,,a1J1JFL_L_`a
r$   c                    U R                   S:w  ao  [        R                  " U R                  U R                  /UR                  UR
                  S9nSX R                  U R                  -
  U R                  2S S 24'   X-  nU R                   S:w  a  [        R                  " UR                  S   UR                  S   SU R                  U R                  /UR
                  S9nU R                  U R                  -
  nU R                  US S 2S S 2S S 2X@R                  2S S 24'   XR                  UR                  5      -  nU$ )	NrG   r   rX  rv  r   r   r   r   )rx  r   onesr{  r   r   ry  r   rx   r]  to)r6   r~   visual_prompt_maskpromptstart_points        r%   rZ   TvpFrameDownPadPrompter.forward:  s(   %%.!&""D$5$56l>P>PYeYlYl" fi0043J3JJTM^M^^`aab.L%%1[[##A&(:(:1(=q$BSBSUYUfUfg#**F ++d.E.EEKBF--F1aK*;*;;Q>?IIl&8&899Lr$   )rz  r{  r]  ry  rx  r   r]   s   @r%   rr  rr  '  s    
 r$   rr  c                      ^  \ rS rSrSrU 4S jrS\R                  S\S\S\R                  4S jr	SS	\
4S
 jjrSrU =r$ )TvpFramePadPrompteriL  z7
Pad frames extracted from videos in the surroundings.
c           
        > UR                   S;  a  [        S5      e[        TU ]  5         UR                  U l        UR
                  U l        UR                   U l         UR
                  UR                  S-  -
  U l        [        R                  " [        R                  " SUR                  SUR                  UR
                  /5      5      U l        [        R                  " [        R                  " SUR                  SUR                  UR
                  /5      5      U l        [        R                  " [        R                  " SUR                  SUR
                  UR                  S-  -
  UR                  /5      5      U l        [        R                  " [        R                  " SUR                  SUR
                  UR                  S-  -
  UR                  /5      5      U l        g )Nrt  rw  rt   r   r   )rx  r4   r.   r/   r   r{  ry  	base_sizer   r|  r   r}  r\  r]  r^  r_  r   s     r%   r/   TvpFramePadPrompter.__init__Q  s   ''/KKXYY ++"//%+%A%A",,v/H/H1/LLllKKF--q&2K2KVM`M`ab
 KKF--q&2K2KVM`M`ab
 KK%%''&*C*Ca*GG--

 KK%%''&*C*Ca*GG--

r$   r  r   r   r   c                     X R                   -  X0R                   -  pTUR                  u  pgpn
UR                  Xg-  XU
5      n[        R                  R                  UXE4SSS9nUR                  XgXU5      nU$ )z
This method allows to interpolate the pre-trained pad weights, to be able to use the model on collection of high
resolution images (high resolution videos).

r   Fr   )r{  rx   r   r   rz   r   )r6   r  r   r   r   r   batchr   channelsprompt_heightprompt_widths              r%   interpolate_pad_encoding,TvpFramePadPrompter.interpolate_pad_encodingw  s     +++U5F5F-FBCI<<@8L  2H\Z**	 + 
 8UKr$   r  c           	      ^   U(       a  UR                   S   UR                   S   4OU R                  U R                  4u  p4U R                  S;  a  [        SU R                   35      eU R                  S;   a/  [        R
                  " X4/UR                  UR                  S9nX-  nU R                  S;   a  [        R                  " SU R                  S	U R                  U R                  UR                  S
9n[        R                  " U R                  X`R                  /SS9n[        R                  " U R                  XpR                  /S	S9n[        R                  " UR!                  S5      U/-  5      nU(       a  U R#                  XsU5      nXR%                  UR                  5      -   nU$ )Nr   rb   )rG   rv  ru  z$Invalid visual_prompter_apply value )ru  rv  r   )ru  rG   r   r   r  rw   r   r   )rx   r{  rx  r4   r   r  r   r   r   r   r  catr^  r_  r\  r]  r   r  r  )r6   r~   r  r   r   r  baser  s           r%   rZ   TvpFramePadPrompter.forward  sl    ( #\%7%7%;<##T%6%67 	
 %%-IICDD^D^C_`aa%%)>>!&VO<CUCU^j^q^q!r.L%%);;;;q$//1dnndnn]i]p]pqDYYt^^D!LFYYV]]CKFYY|003vh>?F'66vuM'))L4F4F*GGLr$   )r  r{  r   r]  r^  r_  r\  rx  r   )r   r   r   r   r   r/   r   r   r   r  r   rZ   r#   r\   r]   s   @r%   r  r  L  sL    $
Lu|| S QT Y^YeYe 0d  r$   r  )framedownpadframepadzw
    The bare Tvp Model transformer outputting BaseModelOutputWithPooling object without any specific head on top.
    )custom_introc                      ^  \ rS rSrU 4S jrS rS r\       SS\R                  S-  S\R                  S-  S\R                  S-  S	\S-  S
\S-  S\S-  S\S\\-  4S jj5       rSrU =r$ )rh  i  c                 ,  > [         TU ]  U5        Xl        [        U5      U l        [        U5      U l        [        U5      U l        [        U5      U l
        [        U5      U l        [        R                  " [        R                   " SSUR"                  /5      5      U l        [        R&                  " UR(                  5      U l        UR,                  [.        ;  a  [1        S5      e[.        UR,                     " U5      U l        U R5                  5         g )Nr   
   z:`visual_prompter_type` must be in (framedownpad, framepad))r.   r/   rc   r_   vision_modelr   r   r   visual_embeddingsr3  encoderrH  poolerr   r|  r   r}  re   ri  r   r   r   visual_prompter_typeTVP_PROMPTER_CLASSES_MAPPINGr4   visual_prompter	post_initr   s     r%   r/   TvpModel.__init__  s     *6208!8!@!&)'<<QF<N<N4O(PQzz&"<"<=&&.JJYZZ;F<W<WXY_`r$   c                 .    U R                   R                  $ r  r   r   )r6   s    r%   get_input_embeddingsTvpModel.get_input_embeddings  s    ...r$   c                 $    XR                   l        g r  r  )r6   r   s     r%   set_input_embeddingsTvpModel.set_input_embeddings  s    */'r$   Nr   r~   r  r   r<  r=  r   r   c                    Ub  UOU R                   R                  nU R                  U R                  X'S95      nU R	                  US9n	U R                  X'S9n
Ub  UR                  U
R                  SS 5      n[        R                  " UR                  S   S5      R                  UR                  UR                  S9n[        R                  " XU/S	S
9nU R                  X1R                  5       5      R                  UR                  5      nU R                   R#                  U	R                  S   S	S	5      n[        R                  " XU
/SS
9nU R%                  UUUUUS9nU(       a  UR&                  OUS   nU R)                  U5      nU R+                  U5      nU R+                  U5      nU(       d
  UU4USS -   $ [-        UUUR.                  UR0                  S9$ )a  
Examples:
```python
>>> import torch
>>> from transformers import AutoConfig, AutoTokenizer, TvpModel

>>> model = TvpModel.from_pretrained("Jiqing/tiny-random-tvp")

>>> tokenizer = AutoTokenizer.from_pretrained("Jiqing/tiny-random-tvp")

>>> pixel_values = torch.rand(1, 1, 3, 448, 448)
>>> text_inputs = tokenizer("This is an example input", return_tensors="pt")
>>> output = model(text_inputs.input_ids, pixel_values, text_inputs.attention_mask)
```N)r  )r   r   rt   r   r  )r   r   rb   r   r   )r  r   r<  r=  )r?  pooler_outputr   r   )rc   r=  r  r  r   r  new_onesrx   r   r  r  r   r   r  get_extended_attention_maskr   ri  r   r  r?  r  r   r   r   r   )r6   r   r~   r  r   r<  r=  r   kwargstext_embedding_outputvisual_embedding_outputvisual_attention_maskpt_maskri  embedding_outputencoder_outputsr?  rO  s                     r%   rZ   TvpModel.forward  s   4 &1%<k$++BYBY((   a
 !%) D"&"8"8 #9 #
 %$2$;$;<S<Y<YZ\[\<]$^!jj!5!5a!8"=@@%,,N4H4H A G #YYAV'W]_`N "==nnnN^_bbclcscstN&&--.C.I.I!.LbRTU 99kJa%bhij,,)/!5# ' 
 BMO==RabcRd$56 LL):;]3%}58KKK)/')77&11	
 	
r$   )	rc   r   r   r  r  ri  r  r  r  )NNNNNNF)r   r   r   r   r/   r  r  r   r   
LongTensorr    r   r"   r   rZ   r#   r\   r]   s   @r%   rh  rh    s     /0  .21526)-,0#').E
##d*E
 ''$.E
 ((4/	E

  $;E
 #TkE
 D[E
 #'E
 
+	+E
 E
r$   rh  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )TvpVideoGroundingHeadi  c                 B  > [         TU ]  5         [        R                  " UR                  UR                  S-  5      U l        [        R                  " UR                  S-  S5      U l        [        R                  " 5       U l        [        R                  " 5       U l
        g )Nrt   )r.   r/   r   r   re   layer_0layer_1ReLUactivation_0Sigmoidactivation_1r   s     r%   r/   TvpVideoGroundingHead.__init__  sj    yy!3!3V5G5G!5KLyy!3!3a!7;GGIJJLr$   c                     U R                  U R                  U5      5      nU R                  U R                  U5      5      nU$ r  )r  r  r  r  )r6   r  r   s      r%   rZ   TvpVideoGroundingHead.forward  s9    ""4<<#>?""4<<#78r$   )r  r  r  r  r   r]   s   @r%   r  r    s    ) r$   r  zb
    Tvp Model with a video grounding head on top computing IoU, distance, and duration loss.
    c                      ^  \ rS rSrU 4S jr\        SS\R                  S-  S\R                  S-  S\R                  S-  S\	\R                     S-  S\S-  S	\S-  S
\S-  S\S\	\-  4S jj5       rSrU =r$ )TvpForVideoGroundingi  c                    > [         TU ]  U5        Xl        [        U5      U l        [        U5      U l        U R                  5         g r  )r.   r/   rc   rh  rS  r  video_grounding_headr  r   s     r%   r/   TvpForVideoGrounding.__init__%  s8     f%
$9&$A!r$   Nr   r~   r  rW   r   r<  r=  r   r   c	           
         Ub  UOU R                   R                  nU R                  UUUUUUUS9n
U
S   nU R                  U5      nSnUbo  [	        / SQ5      nUR                  U R                  5        U" X5      nUS   U R                   R                  US   -  -   U R                   R                  US   -  -   nU(       d  U4U
SS -   n
Ub  U4U
-   n
U
$ [        UUU
R                  U
R                  S	9$ )
a  
labels (`torch.FloatTensor` of shape `(batch_size, 3)`, *optional*):
    The labels contains duration, start time, and end time of the video corresponding to the text.

Examples:
```python
>>> import torch
>>> from transformers import AutoConfig, AutoTokenizer, TvpForVideoGrounding

>>> model = TvpForVideoGrounding.from_pretrained("Jiqing/tiny-random-tvp")

>>> tokenizer = AutoTokenizer.from_pretrained("Jiqing/tiny-random-tvp")

>>> pixel_values = torch.rand(1, 1, 3, 448, 448)
>>> text_inputs = tokenizer("This is an example input", return_tensors="pt")
>>> output = model(text_inputs.input_ids, pixel_values, text_inputs.attention_mask)
```N)r   r<  r=  r   r   r*   r+   r,   r-   rt   )r   r   r   r   )rc   r=  rS  r  r'   r  r   distance_loss_weightduration_loss_weightr   r   r   )r6   r   r~   r  rW   r   r<  r=  r   r  r  r  r   r   	criterion	loss_dicts                   r%   rZ   TvpForVideoGrounding.forward-  s'   < &1%<k$++BYBY**/!5#%=  
  
**=9 ?@ILL%!&1I% ++22Yz5JJK++22Yz5JJK 
 i'!"+-G'G+N&!//))	
 	
r$   )rc   rS  r  )NNNNNNNF)r   r   r   r   r/   r   r   r  r    r"   r   r   r   rZ   r#   r\   r]   s   @r%   r  r    s      .21526-1)-,0#').?
##d*?
 ''$.?
 ((4/	?

 ell#d*?
  $;?
 #Tk?
 D[?
 #'?
 
(	(?
 ?
r$   r  )rh  rR  r  )1r   r   dataclassesr   r   r    r   r`  activationsr   backbone_utilsr   modeling_layersr	   modeling_outputsr
   r   r   modeling_utilsr   utilsr   r   configuration_tvpr   
get_loggerr   loggerr   rp  r'   r_   r   r   r   r  r  r%  r3  rH  rR  rr  r  r  rh  r  r  __all__r   r$   r%   <module>r     s     !   & ! + 9 X X - , ( 
		H	% <k <  <$Mbii M`%RYY %Pnbii nb!RYY !HB299 BLbii RYY / 41
 1
j		  + + +B"bii "JW")) Wv ,#   
]
! ]

]
@BII  
I
- I

I
X Er$   