
    Z j[                        S SK r S SKJr  S SKJr  S SKJr  S SKrS SKJr  SSK	J
r  SSKJr  SS	KJr  SS
KJr  SSKJrJrJrJrJrJr  SSKJrJr  SSKJr  SSKJr  SSK J!r!J"r"J#r#J$r$J%r%J&r&  SSK'J(r(  SSK)J*r*J+r+  SSK,J-r-J.r.J/r/  SSK0J1r1J2r2J3r3  \%Rh                  " \55      r6 " S S\Rn                  5      r8 " S S\Rn                  5      r9\# " S S\5      5       r: SKS\Rn                  S\Rv                  S\Rv                  S\Rv                  S \Rv                  S-  S!\<S"\<4S# jjr= " S$ S%\Rn                  5      r> " S& S'\Rn                  5      r? " S( S)\5      r@ " S* S+\Rn                  5      rA " S, S-\:5      rB " S. S/\Rn                  5      rC " S0 S1\Rn                  5      rD " S2 S3\Rn                  5      rE " S4 S5\Rn                  5      rF " S6 S7\Rn                  5      rG " S8 S9\5      rH " S: S;\Rn                  5      rI " S< S=\:5      rJ\#" S>S?9\ " S@ SA\!5      5       5       rK\#" SBS?9 " SC SD\:5      5       rL\#\ " SE SF\5      5       5       rM\#" SGS?9 " SH SI\:\5      5       rN/ SJQrOg)L    N)Callable)	dataclass)Any)nn   )initialization)ACT2FN)GenerationMixin)GradientCheckpointingLayer)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentionsBaseModelOutputWithPooling,BaseModelOutputWithPoolingAndCrossAttentionsCausalLMOutputWithPastSeq2SeqLMOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)apply_chunking_to_forward)ModelOutputTransformersKwargsauto_docstringcan_return_tuplelogging	torch_int)merge_with_config_defaults)OutputRecordercapture_outputs   )	AutoModelAutoModelForCausalLMAutoModelForSeq2SeqLM   )InstructBlipVideoConfigInstructBlipVideoQFormerConfigInstructBlipVideoVisionConfigc                      ^  \ rS rSrS\4U 4S jjrS\R                  S\S\S\R                  4S jr	SS	\R                  S
\S\R                  4S jjrSrU =r$ )!InstructBlipVideoVisionEmbeddings;   configc                 r  > [         TU ]  5         Xl        UR                  U l        UR
                  U l        UR                  U l        [        R                  " [        R                  " SSU R                  5      5      U l        [        R                  " SU R                  U R                  U R                  S9U l        U R
                  U R                  -  S-  U l        U R                  S-   U l        [        R                  " [        R                  " SU R                  U R                  5      5      U l        g )Nr#   r   )in_channelsout_channelskernel_sizestrider   )super__init__r*   hidden_size	embed_dim
image_size
patch_sizer   	Parametertorchrandnclass_embeddingConv2dpatch_embeddingnum_patchesnum_positionsposition_embeddingselfr*   	__class__s     ڑ/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/instructblipvideo/modeling_instructblipvideo.pyr1   *InstructBlipVideoVisionEmbeddings.__init__<   s    ++ ++ ++!||EKK1dnn,MN!yyDOO\`\k\k 
 !OOt>1D!--1"$,,u{{1d>P>PRVR`R`/a"b    
embeddingsheightwidthreturnc                    UR                   S   S-
  nU R                  R                   S   S-
  n[        R                  R	                  5       (       d  XE:X  a  X#:X  a  U R                  $ U R                  SS2SS24   nU R                  SS2SS24   nUR                   S   nX R
                  -  n	X0R
                  -  n
[        US-  5      nUR                  SXU5      nUR                  SSSS5      n[        R                  R                  UX4SS	S
9nUR                  SSSS5      R                  SSU5      n[        R                  " Xg4SS9$ )a  
This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
images. This method is also adapted to support torch.jit tracing.

Adapted from:
- https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
- https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
r#   Ng      ?r   r   r   bicubicF)sizemodealign_cornersdim)shaper>   r7   jit
is_tracingr5   r   reshapepermuter   
functionalinterpolateviewcat)r@   rE   rF   rG   r<   r=   class_pos_embedpatch_pos_embedrP   
new_height	new_widthsqrt_num_positionss               rB   interpolate_pos_encoding:InstructBlipVideoVisionEmbeddings.interpolate_pos_encodingN   sS    !&&q)A-//55a81< yy##%%+*F6?***11!RaR%811!QR%8r".
__,	&}c'9:)11!5G]`a)11!Q1=--33(	 4 
 *11!Q1=BB1b#Nyy/;CCrD   pixel_valuesr_   c                    UR                   u  p4pVU R                  R                  R                  nU R                  UR	                  US95      nUR                  S5      R                  SS5      nU R                  R                  USS5      R	                  U5      n	[        R                  " X/SS9n
U(       a  U R                  XU5      nOU R                  nXS S 2S U
R                  S5      2S S 24   R	                  U5      -   n
U
$ )Ndtyper   r#   rJ   rO   )rQ   r;   weightrd   toflatten	transposer9   expandr7   rY   r_   r>   rL   )r@   ra   r_   
batch_size_rF   rG   target_dtypepatch_embedsclass_embedsrE   r>   s               rB   forward)InstructBlipVideoVisionEmbeddings.forwardv   s    '3'9'9$
v++2288++LOO,O,OP#++A.88A>++22:q"EHHVYY;C
#!%!>!>zSX!Y!%!8!8Q8L*//!:L8La5O"P"S"ST`"aa
rD   )	r9   r*   r3   r4   r<   r=   r;   r5   r>   F)__name__
__module____qualname____firstlineno__r&   r1   r7   Tensorintr_   FloatTensorboolro   __static_attributes____classcell__rA   s   @rB   r(   r(   ;   sr    c< c$&D5<< &D &DUX &D]b]i]i &DPE$5$5 QU bgbnbn  rD   r(   c                   >   ^  \ rS rSrSrU 4S jr    SS jrSrU =r$ )"InstructBlipVideoQFormerEmbeddings   z;Construct the embeddings from word and position embeddings.c                 "  > [         TU ]  5         [        R                  " UR                  UR
                  UR                  S9U l        [        R                  " UR                  UR
                  5      U l	        [        R                  " UR
                  UR                  S9U l        [        R                  " UR                  5      U l        U R!                  S["        R$                  " UR                  5      R'                  S5      SS9  Xl        g )N)padding_idxepsposition_idsr#   rJ   F)
persistent)r0   r1   r   	Embedding
vocab_sizer2   pad_token_idword_embeddingsmax_position_embeddingsposition_embeddings	LayerNormlayer_norm_eps	layernormDropouthidden_dropout_probdropoutregister_bufferr7   arangeri   r*   r?   s     rB   r1   +InstructBlipVideoQFormerEmbeddings.__init__   s    !||F,=,=v?Q?Q_e_r_rs#%<<0N0NPVPbPb#c f&8&8f>S>STzz&"<"<= 	ELL)G)GHOOPWXej 	 	
 rD   c                    Ub  UR                  5       S   nOSnUc%  U R                  S S 2XEU-   24   R                  5       nUbY  U R                  U5      nU R	                  UR                  UR                  5      5      nXg-   nUb  [        R                  " X64SS9nOUnUR                  U R                  R                  R                  5      nU R                  U5      nU R                  U5      nU$ )Nr#   r   rO   )rL   r   cloner   r   rf   devicer7   rY   r   re   rd   r   )r@   	input_idsr   query_embedspast_key_values_length
seq_lengthrE   r   s           rB   ro   *InstructBlipVideoQFormerEmbeddings.forward   s      ")!,JJ,,Q0FVlIl0l-lmssuL --i8J"&":":<??:K\K\;]"^#9J'"YY'AqI
%J]]4>>#8#8#>#>?
^^J/
\\*-
rD   )r*   r   r   r   r   NNNr   )	rr   rs   rt   ru   __doc__r1   ro   rz   r{   r|   s   @rB   r~   r~      s#    E"   rD   r~   c                      ^  \ rS rSr% \\S'   SrSrSrSr	Sr
SrSrSr/ SQr\R                   " 5       U 4S j5       rSrU =r$ )	 InstructBlipVideoPreTrainedModel   r*   blip)videotextT)r~   InstructBlipVideoAttention*InstructBlipVideoQFormerMultiHeadAttention"InstructBlipVideoQFormerSelfOutputc                 V  > [         TU ]  U5        U R                  R                  n[	        U[
        5      (       aA  [        R                  " UR                  SUS9  [        R                  " UR                  SUS9  g[	        U[        [        45      (       a!  [        R                  " UR                  5        g[	        U[        5      (       a\  [        R                  " UR                   ["        R$                  " UR                   R&                  S   5      R)                  S5      5        gg)zInitialize the weights        )meanstdrJ   r   N)r0   _init_weightsr*   initializer_range
isinstancer(   inittrunc_normal_r>   r9   )InstructBlipVideoForConditionalGenerationInstructBlipVideoModelzeros_query_tokensr~   copy_r   r7   r   rQ   ri   )r@   modulefactorrA   s      rB   r   .InstructBlipVideoPreTrainedModel._init_weights   s     	f%..f?@@v88sOv55CVL!JLb cddKK++, BCCJJv**ELL9L9L9R9RSU9V,W,^,^_f,gh DrD    )rr   rs   rt   ru   r$   __annotations__base_model_prefixinput_modalitiessupports_gradient_checkpointing_supports_attention_backend_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraph_no_split_modulesr7   no_gradr   rz   r{   r|   s   @rB   r   r      s\    ##(&*#"&N! ]]_
i 
irD   r   r   querykeyvalueattention_maskscalingr   c                 `   [         R                  " XR                  SS5      5      U-  nUb  X-   n[        R                  R                  USS9n[        R                  R                  XU R                  S9n[         R                  " X5      n	U	R                  SS5      R                  5       n	X4$ )NrJ   rO   )ptrainingr#   r   )	r7   matmulrh   r   rV   softmaxr   r   
contiguous)
r   r   r   r   r   r   r   kwargsattn_weightsattn_outputs
             rB   eager_attention_forwardr      s     <<}}R'<=GL!#4==((2(>L==((6??([L,,|3K''1-88:K$$rD   c                      ^  \ rS rSrSrU 4S jrS\R                  S\S\4S jr	S\R                  S	\
\R                  \R                  S
-  \
\R                     S
-  4   4S jrSrU =r$ )r      z=Multi-headed attention from 'Attention Is All You Need' paperc                   > [         TU ]  5         Xl        UR                  U l        UR
                  U l        U R                  U R                  -  U l        U R                  U R                  -  U R                  :w  a&  [        SU R                   SU R                   S35      eU R                  S-  U l	        SU l
        UR                  U l        [        R                  " U R                  SU R                  -  SS9U l        UR                  (       ai  [        R                   " ["        R$                  " U R                  5      5      n[        R                   " ["        R$                  " U R                  5      5      nOS nS nUbQ  ["        R&                  " U["        R(                  " USS9U45      n[        R                   " U5      U R                  l        [        R                  " U R                  U R                  5      U l        g )	Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      Fr   )bias)requires_grad)r0   r1   r*   r2   r3   num_attention_heads	num_headshead_dim
ValueErrorscale	is_causalattention_dropoutr   Linearqkvqkv_biasr6   r7   zerosrY   
zeros_liker   
projection)r@   r*   q_biasv_biasr   rA   s        rB   r1   #InstructBlipVideoAttention.__init__   ss   ++33$..8==4>>)T^^;MdnnM] ^NN#2'  ]]D(
!'!9!9 99T^^Q-?eL??\\%++dnn"=>F\\%++dnn"=>FFFyy&%*:*:6QV*WY_!`aHLL2DHHM))DNNDNNCrD   tensorseq_lenbszc                     UR                  X2U R                  U R                  5      R                  SS5      R	                  5       $ )Nr#   r   )rX   r   r   rh   r   )r@   r   r   r   s       rB   _shape!InstructBlipVideoAttention._shape  s5    {{3GQQRSUVWbbddrD   hidden_statesrH   Nc                    UR                  5       u  p4nU R                  U5      nUR                  X4SU R                  XPR                  -  5      R	                  SSSSS5      nUS   US   US   pn[
        R                  " U R                  R                  [        5      n
U
" U UUU	4SU R                  (       d  SOU R                  U R                  S.UD6u  pUR                  X4S	5      R                  5       nU R                  U5      nX4$ )
z#Input shape: Batch x Time x Channelr   r   r   r#      Nr   )r   r   r   rJ   )rL   r   rT   r   rU   r   get_interfacer*   _attn_implementationr   r   r   r   r   r   )r@   r   r   r   tgt_lenr3   	mixed_qkvquery_states
key_statesvalue_statesattention_interfacer   r   s                rB   ro   "InstructBlipVideoAttention.forward  s    #0"4"4"6iHH]+	%%cAt~~yTbTbGbckkq!Q
	 2;1y|YWX\,(?(M(MKK,,.E)
 %8		%

  #}}C$2H2HJJ	%
 	%
! "))#;FFHook2((rD   )	r   r*   r3   r   r   r   r   r   r   )rr   rs   rt   ru   r   r1   r7   rv   rw   r   tuplero   rz   r{   r|   s   @rB   r   r      su    GD>eU\\ eC ec e")||") 
u||U\\D0%2E2LL	M	") ")rD   r   c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )InstructBlipVideoMLPi;  c                   > [         TU ]  5         Xl        [        UR                     U l        [        R                  " UR                  UR                  5      U l
        [        R                  " UR                  UR                  5      U l        g N)r0   r1   r*   r	   
hidden_actactivation_fnr   r   r2   intermediate_sizefc1fc2r?   s     rB   r1   InstructBlipVideoMLP.__init__<  sb    #F$5$5699V//1I1IJ99V55v7I7IJrD   r   rH   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ r  )r  r  r  r@   r   s     rB   ro   InstructBlipVideoMLP.forwardC  s4    /**=9/rD   )r  r*   r  r  
rr   rs   rt   ru   r1   r7   rv   ro   rz   r{   r|   s   @rB   r   r   ;  s)    KU\\ ell  rD   r   c                   ~   ^  \ rS rSrS\4U 4S jjr\S\R                  S\	\
   S\R                  4S j5       rSrU =r$ )	InstructBlipVideoEncoderLayeriJ  r*   c                 <  > [         TU ]  5         UR                  U l        [	        U5      U l        [        R                  " U R                  UR                  S9U l	        [        U5      U l        [        R                  " U R                  UR                  S9U l        g Nr   )r0   r1   r2   r3   r   	self_attnr   r   r   layer_norm1r   mlplayer_norm2r?   s     rB   r1   &InstructBlipVideoEncoderLayer.__init__K  sm    ++3F;<<F<Q<QR'/<<F<Q<QRrD   r   r   rH   c                     UnU R                  U5      nU R                  " SSU0UD6u  pX-   nUnU R                  U5      nU R                  U5      nX-   nU$ )Nr   r   )r  r  r  r  )r@   r   r   residualrk   s        rB   ro   %InstructBlipVideoEncoderLayer.forwardS  su     !((7>> 
'

 &0 ((7/%0rD   )r3   r  r  r  r  )rr   rs   rt   ru   r$   r1   r   r7   rv   r   r   rx   ro   rz   r{   r|   s   @rB   r  r  J  sR    S6 S || +, 
			 rD   r  c                   \   ^  \ rS rSrSrS\4U 4S jjr\S\\	   S\
\-  4S j5       rSrU =r$ )	InstructBlipVideoEncoderij  a
  
Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
[`InstructBlipVideoEncoderLayer`].

Args:
    config (`InstructBlipVideoConfig`):
        The corresponding vision configuration for the `InstructBlipVideoEncoder`.
r*   c                    > [         TU ]  5         Xl        [        R                  " [        UR                  5       Vs/ s H  n[        U5      PM     sn5      U l        SU l	        g s  snf NF)
r0   r1   r*   r   
ModuleListrangenum_hidden_layersr  layersgradient_checkpointing)r@   r*   rk   rA   s      rB   r1   !InstructBlipVideoEncoder.__init__t  sU    mmTYZ`ZrZrTs$tTsq%B6%JTs$tu&+# %u   A&r   rH   c                 P    UnU R                    H  nU" U40 UD6nM     [        US9$ )Nlast_hidden_state)r  r   )r@   inputs_embedsr   r   encoder_layers        rB   ro    InstructBlipVideoEncoder.forwardz  s9     &![[M)M ) ??rD   )r*   r   r  )rr   rs   rt   ru   r   r$   r1   r   r   r   r   r   ro   rz   r{   r|   s   @rB   r  r  j  sL    ,6 , @ +,@ 
	 	@ @rD   r  c                      ^  \ rS rSr% SrSr\\S'   \\	S.r
S\4U 4S jjr\\" SS9\  SS\R                   S	-  S
\S\\   S\\-  4S jj5       5       5       rS rSrU =r$ )InstructBlipVideoVisionModeli  ra   r   r*   )r   
attentionsc                    > [         TU ]  U5        Xl        UR                  n[	        U5      U l        [        U5      U l        [        R                  " X!R                  S9U l        U R                  5         g r  )r0   r1   r*   r2   r(   rE   r  encoderr   r   r   post_layernorm	post_init)r@   r*   r3   rA   s      rB   r1   %InstructBlipVideoVisionModel.__init__  sY     &&	;FC/7 ll9:O:OPrD   F)tie_last_hidden_statesNr_   r   rH   c                     Uc  [        S5      eU R                  XS9nU R                  " SSU0UD6nUR                  nU R	                  U5      nUS S 2SS S 24   nU R	                  U5      n[        UUS9$ )Nz You have to specify pixel_values)r_   r&  r   r%  pooler_outputr   )r   rE   r-  r%  r.  r   )r@   ra   r_   r   r   encoder_outputsr%  pooled_outputs           rB   ro   $InstructBlipVideoVisionModel.forward  s     ?@@h+/<< ,
',
,

 ,== //0AB)!Q'2++M:)/'
 	
rD   c                     U R                   $ r  )rE   r@   s    rB   get_input_embeddings1InstructBlipVideoVisionModel.get_input_embeddings  s    rD   )r*   rE   r-  r.  r  )rr   rs   rt   ru   main_input_namer   r&   r   r  r   _can_record_outputsr1   r   r   r   r7   rx   ry   r   r   r   r   ro   r:  rz   r{   r|   s   @rB   r*  r*    s    $O))60
	< 	  E2 26).
''$.
 #'
 +,	

 
+	+
  3  
6 rD   r*  c                   h   ^  \ rS rSrSU 4S jjrS rS rS rS rS r	   SS\
\   4S	 jjrS
rU =r$ )r   i  c                   > [         TU ]  5         Xl        UR                  UR                  -  S:w  a5  [        US5      (       d$  [        SUR                  UR                  4-  5      eUR                  U l        [        UR                  UR                  -  5      U l        U R                  U R                  -  U l	        [        R                  " UR                  U R                  5      U l        U(       aa  [        R                  " UR                  U R                  5      U l        [        R                  " UR                  U R                  5      U l        O`[        R                  " UR                  U R                  5      U l        [        R                  " UR                  U R                  5      U l        [        R                   " UR"                  5      U l        SU l        g )Nr   embedding_sizezLThe hidden size (%d) is not a multiple of the number of attention heads (%d)F)r0   r1   r*   r2   r   hasattrr   rw   attention_head_sizeall_head_sizer   r   r   encoder_hidden_sizer   r   r   attention_probs_dropout_probr   save_attentionr@   r*   is_cross_attentionrA   s      rB   r1   3InstructBlipVideoQFormerMultiHeadAttention.__init__  sb    : ::a?PVXhHiHi^%%v'A'ABC 
 $*#=#= #&v'9'9F<V<V'V#W !558P8PPYYv1143E3EF
yy!;!;T=O=OPDH6#=#=t?Q?QRDJyy!3!3T5G5GHDH6#5#5t7I7IJDJzz&"E"EF#rD   c                     Xl         g r  attn_gradients)r@   rL  s     rB   save_attn_gradients>InstructBlipVideoQFormerMultiHeadAttention.save_attn_gradients  s    ,rD   c                     U R                   $ r  rK  r9  s    rB   get_attn_gradients=InstructBlipVideoQFormerMultiHeadAttention.get_attn_gradients  s    """rD   c                     Xl         g r  attention_map)r@   rT  s     rB   save_attention_map=InstructBlipVideoQFormerMultiHeadAttention.save_attention_map  s    *rD   c                     U R                   $ r  rS  r9  s    rB   get_attention_map<InstructBlipVideoQFormerMultiHeadAttention.get_attention_map  s    !!!rD   c                     UR                  5       S S U R                  U R                  4-   nUR                  " U6 nUR	                  SSSS5      $ )NrJ   r   r   r#   r   )rL   r   rB  rX   rU   )r@   xnew_x_shapes      rB   transpose_for_scores?InstructBlipVideoQFormerMultiHeadAttention.transpose_for_scores  sL    ffhsmt'?'?AYAY&ZZFFK yyAq!$$rD   r   c                    US LnU(       aC  U R                  U R                  U5      5      nU R                  U R                  U5      5      nUnO@U R                  U R                  U5      5      nU R                  U R                  U5      5      nU R                  U5      n	U R                  U	5      n
[        R
                  " XR                  SS5      5      nU[        R                  " U R                  5      -  nUR                  nUb  X-   n[        R                  " SS9" U5      R                  U5      nU(       a=  U R                  (       a,  U R                  U5        UR!                  U R"                  5        U R%                  U5      n[        R
                  " X5      nUR'                  SSSS5      R)                  5       nUR+                  5       S S U R,                  4-   nUR.                  " U6 nX4$ )NrJ   r   rO   r   r   r#   r   )r]  r   r   r   r7   r   rh   mathsqrtrB  rd   r   Softmaxrf   rF  rU  register_hookrM  r   rU   r   rL   rC  rX   )r@   r   r   encoder_hidden_statesencoder_attention_maskr   rH  	key_layervalue_layermixed_query_layerquery_layerattention_scoresattention_scores_dtypeattention_probsattention_probs_droppedcontext_layernew_context_layer_shapes                    rB   ro   2InstructBlipVideoQFormerMultiHeadAttention.forward  s    3$>11$((;P2QRI33DJJ?T4UVK3N11$((=2IJI33DJJ}4MNK JJ}5//0AB !<<5H5HR5PQ+dii8P8P.QQ!1!7!7%/@ **,-=>AABXY$"5"5##O4))$*B*BC #',,"?%<J%--aAq9DDF"/"4"4"6s";t?Q?Q>S"S%**,CD--rD   )rC  rB  rT  rL  r*   r   r   r   r   rF  r   rq   NNN)rr   rs   rt   ru   r1   rM  rP  rU  rX  r]  r   r   ro   rz   r{   r|   s   @rB   r   r     sF    $0-#+"% "#4. +,4. 4.rD   r   c                   z   ^  \ rS rSrU 4S jrS\R                  S\R                  S\R                  4S jrSrU =r	$ )r   i!  c                 (  > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " UR                  UR                  S9U l        [        R                  " UR                  5      U l
        g r  )r0   r1   r   r   r2   denser   r   r   r   r   r?   s     rB   r1   +InstructBlipVideoQFormerSelfOutput.__init__"  s`    YYv1163E3EF
f&8&8f>S>STzz&"<"<=rD   r   input_tensorrH   c                 p    U R                  U5      nU R                  U5      nU R                  X-   5      nU$ r  rt  r   r   r@   r   rv  s      rB   ro   *InstructBlipVideoQFormerSelfOutput.forward(  5    

=1]3}'CDrD   r   rt  r   r  r|   s   @rB   r   r   !  6    >U\\  RWR^R^  rD   r   c                      ^  \ rS rSrSU 4S jjr   SS\R                  S\R                  S-  S\R                  S-  S\R                  S-  S\\	   S	\R                  4S
 jjr
SrU =r$ )!InstructBlipVideoQFormerAttentioni/  c                 b   > [         TU ]  5         [        X5      U l        [	        U5      U l        g r  )r0   r1   r   	attentionr   outputrG  s      rB   r1   *InstructBlipVideoQFormerAttention.__init__0  s&    CF_8@rD   Nr   r   rd  re  r   rH   c                 Z    U R                   " SUUUUS.UD6u  pgU R                  Xa5      nU$ )N)r   r   rd  re  r   r  r  )	r@   r   r   rd  re  r   r   rk   attention_outputs	            rB   ro   )InstructBlipVideoQFormerAttention.forward5  sF      
')"7#9	

 
  ;;{BrD   r  rq   rq  )rr   rs   rt   ru   r1   r7   rv   rx   r   r   ro   rz   r{   r|   s   @rB   r  r  /  s    A 48:>;? ||  ))D0   %0047	 
 !& 1 1D 8  +,  
   rD   r  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )$InstructBlipVideoQFormerIntermediateiH  c                   > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        UR                  [        5      (       a  [        UR                     U l        g UR                  U l        g r  )r0   r1   r   r   r2   r  rt  r   r  strr	   intermediate_act_fnr?   s     rB   r1   -InstructBlipVideoQFormerIntermediate.__init__I  s`    YYv1163K3KL
f''--'-f.?.?'@D$'-'8'8D$rD   r   rH   c                 J    U R                  U5      nU R                  U5      nU$ r  rt  r  r	  s     rB   ro   ,InstructBlipVideoQFormerIntermediate.forwardQ  s&    

=100?rD   r  r  r|   s   @rB   r  r  H  s(    9U\\ ell  rD   r  c                   z   ^  \ rS rSrU 4S jrS\R                  S\R                  S\R                  4S jrSrU =r	$ )InstructBlipVideoQFormerOutputiW  c                 (  > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        R                  " UR
                  UR                  S9U l        [        R                  " UR                  5      U l        g r  )r0   r1   r   r   r  r2   rt  r   r   r   r   r   r?   s     rB   r1   'InstructBlipVideoQFormerOutput.__init__X  s`    YYv779K9KL
f&8&8f>S>STzz&"<"<=rD   r   rv  rH   c                 p    U R                  U5      nU R                  U5      nU R                  X-   5      nU$ r  rx  ry  s      rB   ro   &InstructBlipVideoQFormerOutput.forward^  r{  rD   r|  r  r|   s   @rB   r  r  W  r}  rD   r  c                   T   ^  \ rS rSrU 4S jr    SS\\   4S jjrS rS r	Sr
U =r$ )	InstructBlipVideoQFormerLayerie  c                 ^  > [         TU ]  5         UR                  U l        SU l        [	        U5      U l        X l        X!R                  -  S:X  a  [	        USS9U l        SU l	        OSU l	        [        U5      U l        [        U5      U l        [        U5      U l        [        U5      U l        g )Nr#   r   T)rH  F)r0   r1   chunk_size_feed_forwardseq_len_dimr  r  	layer_idxcross_attention_frequencycrossattentionhas_cross_attentionr  intermediater  r  intermediate_queryoutput_queryr@   r*   r  rA   s      rB   r1   &InstructBlipVideoQFormerLayer.__init__f  s    '-'E'E$:6B"7771<"CF_c"dD'+D$',D$@H4V<"Fv"N:6BrD   r   c           
      l   U R                   " U4SU0UD6nUS:  a  US S 2S U2S S 24   nU R                  (       a%  Uc  [        S5      eU R                  " U4UUUS.UD6n[	        U R
                  U R                  U R                  U5      n	UR                  S   U:  ag  [	        U R                  U R                  U R                  US S 2US 2S S 24   5      R                  U	R                  5      n
[        R                  " X/SS9n	U	$ [	        U R                  U R                  U R                  U5      n	U	$ )Nr   r   z>encoder_hidden_states must be given for cross-attention layers)r   rd  re  r#   rO   )r  r  r   r  r   feed_forward_chunk_queryr  r  rQ   feed_forward_chunkrf   r   r7   rY   )r@   r   r   rd  re  query_lengthr   r  query_attention_outputlayer_outputlayer_output_texts              rB   ro   %InstructBlipVideoQFormerLayer.forwardz  si     >>
)
 
 !%5a,6I%J"''(0$%eff)-)<)<**#1*?+A	*
 *& 5--,,  &	L  %%a(<7$=++00$$$Qq%89	%
 "\(() "  %yy,)JPQR  5'',,   	L rD   c                 J    U R                  U5      nU R                  X!5      nU$ r  )r  r  r@   r  intermediate_outputr  s       rB   r  0InstructBlipVideoQFormerLayer.feed_forward_chunk  s)    "//0@A{{#6IrD   c                 J    U R                  U5      nU R                  X!5      nU$ r  )r  r  r  s       rB   r  6InstructBlipVideoQFormerLayer.feed_forward_chunk_query  s+    "556FG(()<OrD   )
r  r  r  r  r  r  r  r  r  r  r   )rr   rs   rt   ru   r1   r   r   ro   r  r  rz   r{   r|   s   @rB   r  r  e  s<    C. "#3 +,3j
 rD   r  c                   R   ^  \ rS rSrU 4S jr\    SS\\   4S jj5       rSr	U =r
$ )InstructBlipVideoQFormerEncoderi  c           	         > [         TU ]  5         Xl        [        R                  " [        UR                  5       Vs/ s H  n[        X5      PM     sn5      U l        SU l	        g s  snf r  )
r0   r1   r*   r   r  r  r  r  layerr   r  s      rB   r1   (InstructBlipVideoQFormerEncoder.__init__  sY    ]]OTU[UmUmOnoOn)*6=Ono

 ',# pr"  r   c                     [        U R                  R                  5       H   nU R                  U   nU" UUU4UUS.UD6nM"     [	        US9$ )N)re  r  r$  )r  r*   r  r  r   )	r@   r   r   rd  re  r  r   ilayer_modules	            rB   ro   'InstructBlipVideoQFormerEncoder.forward  sf     t{{445A::a=L(% (>) M 6 9+
 	
rD   )r*   r   r  r   )rr   rs   rt   ru   r1   r   r   r   ro   rz   r{   r|   s   @rB   r  r    s:    ,  "#
 +,
 
rD   r  c                     ^  \ rS rSrSrSrSrSrSr\	\
" \SSS9/\
" \SSS9/S.rS	\4U 4S
 jjrS rS r SS\R$                  S\\   S\R*                  S\S\R$                  4
S jjr\\\     SS\R6                  S\R8                  S-  S\R6                  S-  S\R$                  S-  S\R8                  S-  S\R8                  S-  S\\   S\\R8                     \-  4S jj5       5       5       r Sr!U =r"$ )InstructBlipVideoQFormerModeli  z
Querying Transformer (Q-Former), used in InstructBlipVideo. Slightly modified from BLIP-2 as it also takes the
instruction as input.
Fr#   z
.attention)index
layer_namez.crossattention)r   r+  cross_attentionsr*   c                    > [         TU ]  U5        Xl        [        U5      U l        [        U5      U l        U R                  5         g r  )r0   r1   r*   r~   rE   r  r-  r/  r?   s     rB   r1   &InstructBlipVideoQFormerModel.__init__  s7     <VD6v>rD   c                 .    U R                   R                  $ r  rE   r   r9  s    rB   r:  2InstructBlipVideoQFormerModel.get_input_embeddings  s    ...rD   c                 $    XR                   l        g r  r  r@   r   s     rB   set_input_embeddings2InstructBlipVideoQFormerModel.set_input_embeddings   s    */'rD   r   input_shaper   	has_queryrH   c                    UR                  5       S:X  a  USS2SSS2SS24   nO>UR                  5       S:X  a  USS2SSSS24   nO[        SU SUR                   S35      eUR                  U R                  S9nSU-
  S	-  nU$ )
a  
Makes broadcastable attention and causal masks so that future and masked tokens are ignored.

Arguments:
    attention_mask (`torch.Tensor`):
        Mask with ones indicating tokens to attend to, zeros for tokens to ignore.
    input_shape (`tuple[int]`):
        The shape of the input to the model.
    device: (`torch.device`):
        The device of the input to the model.

Returns:
    `torch.Tensor` The extended attention mask, with a the same dtype as `attention_mask.dtype`.
r   Nr   z!Wrong shape for input_ids (shape z) or attention_mask (shape )rc   g      ?g     )rP   r   rQ   rf   rd   )r@   r   r  r   r  extended_attention_masks         rB   get_extended_attention_mask9InstructBlipVideoQFormerModel.get_extended_attention_mask  s    . 1$&4Qa]&C#!Q& '5QdA5E&F#3K=@[\j\p\p[qqrs  #:"<"<4::"<"N#&)@#@H"L&&rD   Nr   r   r   rd  re  r   c                    Uc  Uc  [        S5      eUb  UR                  S   OSnU R                  UUUS9n	U	R                  5       SS n
U
u  pU	R                  nUc  [
        R                  " X4US9nU R                  X*U5      nUb  [        U[        5      (       a  US   R                  5       u  nnnOUR                  5       u  nnnUU4n[        U[        5      (       a"  U Vs/ s H  nU R                  U5      PM     nnO>Uc'  [
        R                  " UUS9nU R                  U5      nOU R                  U5      nOSnU R                  " U	4UUUUS.UD6nUR                  nUSS2SSS24   n[        UUS	9$ s  snf )
a  
query_embeds (`torch.FloatTensor`  of shape `(batch_size, sequence_length, hidden_size)`):
    Hidden states to be used in the attention computation. If cross-attention,
    will be used for the query (i.e., key and value will use the encoder_hidden_states).
Nz7You have to specify query_embeds when input_ids is Noner#   r   )r   r   r   rJ   )r   )r   rd  re  r  r3  )r   rQ   rE   rL   r   r7   onesr  r   listinvert_attention_maskr-  r%  r   )r@   r   r   r   r   rd  re  r   r  embedding_outputr  rj   r   r   r  encoder_batch_sizeencoder_sequence_lengthrk   encoder_hidden_shapemaskencoder_extended_attention_maskr5  sequence_outputr6  s                           rB   ro   %InstructBlipVideoQFormerModel.forward.  s   $ !5VWW0<0H|))!,a??%% + 
 '++-cr2!,
!((!"ZZ*)A6RN #'"B"B>`f"g !,/66AVWXAYA^A^A`>"$;QAVA[A[A]>"$;Q$68O#P 0$77`v2w`vX\43M3Md3S`v/2w/'/).4HQW)X&262L2LMc2d/262L2LMc2d/.2++/<<,
2"7#B%,
 ,
 *;;'1a0;-'
 	
) 3xs   &E<)r*   rE   r-  rq   )NNNNN)#rr   rs   rt   ru   r   r   r   r   r   r  r   r   r=  r%   r1   r:  r  r7   rv   r   rw   r   ry   r  r   r   r   
LongTensorrx   r   r   r   ro   rz   r{   r|   s   @rB   r  r    s   
 #( N 7EQ[gh
 EQ[lm
= /0  )')' 3Z)' 	)'
 )' 
)'V   4804,0:>;?F
##F
 ))D0F
 &&-	F

 llT)F
  %0047F
 !& 1 1D 8F
 +,F
 
u  	!$P	PF
    F
rD   r  zV
    Class defining the outputs of [`InstructBlipVideoForConditionalGeneration`].
    )custom_introc                       \ rS rSr% SrSr\\R                     S-  \	S'   Sr
\\R                     S-  \	S'   Sr\S-  \	S'   Sr\S-  \	S'   Sr\\-  S-  \	S'   S	\\   4S
 jrSrg)4InstructBlipVideoForConditionalGenerationModelOutputiz  a~  
loss (`torch.FloatTensor`, *optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
    Language modeling loss from the language model.
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
    Prediction scores of the language modeling head of the language model.
vision_outputs (`BaseModelOutputWithPooling`):
    Outputs of the vision encoder.
qformer_outputs (`BaseModelOutputWithPoolingAndCrossAttentions`):
    Outputs of the Q-Former (Querying Transformer).
language_model_outputs (`CausalLMOutputWithPast` or `Seq2SeqLMOutput`):
    Outputs of the language model.
Nlosslogitsvision_outputsqformer_outputslanguage_model_outputsrH   c                 J   ^  [        U 4S jT R                  5        5       5      $ )Nc              3   n   >#    U  H*  nUS ;  a  TU   O[        TU5      R                  5       v   M,     g7f)r  r  r  N)getattrto_tuple).0kr@   s     rB   	<genexpr>PInstructBlipVideoForConditionalGenerationModelOutput.to_tuple.<locals>.<genexpr>  sC      
 ! WW Gq!**,- !s   25)r   keysr9  s   `rB   r  =InstructBlipVideoForConditionalGenerationModelOutput.to_tuple  s%     
 YY[	
 
 	
rD   r   )rr   rs   rt   ru   r   r  r   r7   rx   r   r  r  r   r  r   r  r   r   r   r  rz   r   rD   rB   r  r  z  s     -1D%!!
"T
)0.2FE%##$t+28<N.5<KOOADHONR2_DtKR
%* 
rD   r  z`
    InstructBlipVideo base Model consisting of language model, qformer and vision encoder.
    c                     ^  \ rS rSrSrS/rS\4U 4S jjrS rS r	S r
S	\R                  S
\R                  4S jr\\        SS\R                  S\R                  S\R                  S-  S	\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S
\R"                  S-  S\S\S-  S\\   S\\-  4S jj5       5       rSrU =r$ )r   i  ra   r   r*   c                   > [         TU ]  U5        [        UR                  5      U l        [
        R                  " [        R                  " SUR                  UR                  R                  5      5      U l        [        UR                  5      U l        [
        R                  " UR                  R                  UR                   R                  5      U l        [$        R&                  " UR                   5      U l        U R+                  5         g Nr#   )r0   r1   r*  vision_configvision_modelr   r6   r7   r   num_query_tokensqformer_configr2   r   r  qformerr   text_configlanguage_projectionr    from_configlanguage_modelr/  r?   s     rB   r1   InstructBlipVideoModel.__init__  s     89M9MNLLQ8O8OQWQfQfQrQr)st4V5J5JK#%99V-B-B-N-NPVPbPbPnPn#o '33F4F4FG 	rD   c                 6    U R                   R                  5       $ r  r  r:  r9  s    rB   r:  +InstructBlipVideoModel.get_input_embeddings      ""7799rD   c                 :    U R                   R                  U5        g r  r  r  r  s     rB   r  +InstructBlipVideoModel.set_input_embeddings      007rD   c                 "   U R                   n[        U5      S:  a=  SU;  a7  [        R                  R	                  5       S:  a  [
        R                  S5        [        U R                  S5      (       a  SU R                  R                  l
        ggz
Some pre-processing hacks to make the model `accelerate` compatible. Check
https://github.com/huggingface/transformers/pull/21707 for more details.
r#   r  a  The `language_model` is not in the `hf_device_map` dictionary and you are running your script in a multi-GPU environment. this may lead to unexpected behavior when using `accelerate`. Please pass a `device_map` that contains `language_model` to remove this warning. Please refer to https://github.com/huggingface/blog/blob/main/accelerate-large-models.md for more details on creating a `device_map` for large models._hf_hookTNhf_device_maplenr7   cudadevice_countloggerwarningrA  r  r  io_same_devicer@   r
  s     rB   _preprocess_accelerate-InstructBlipVideoModel._preprocess_accelerate  |    
 **}!&6m&KPUPZPZPgPgPilmPmNNM 4&&
33:>D((7 4rD   r   r&  c           	         Ucj  X R                  5       " [        R                  " U R                  R                  [        R
                  UR                  S95      :H  nUR                  S5      nOXR                  R                  :H  nUR                  S5      R                  U5      R                  UR                  5      nU$ zJ
Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`.
rd   r   rJ   )r:  r7   r   r*   image_token_idlongr   all	unsqueeze	expand_asrf   r@   r   r&  special_image_masks       rB   get_placeholder_mask+InstructBlipVideoModel.get_placeholder_mask       !.2K2K2MT[[77uzzR_RfRfg3 " "4!7!7!;!*kk.H.H!H/99"=GGVYYZgZnZno!!rD   Nqformer_input_idsqformer_attention_maskr   decoder_input_idsdecoder_attention_maskr_   	use_cacher   rH   c           	      \   UR                   u  ppnUR                  X-  XU5      nU R                  " SUU	S.UD6nUS   n[        R                  " UR                  5       SS [        R                  UR                  S9nU R                  R                  UR                   S   SS5      n[        R                  " UR                  5       SS [        R                  UR                  S9nUc  [        R                  " U5      nUR                  USS9nUR                  USS9n[        R                  " UU/SS9nU R                  " SUUUUUS.UD6nUS   SS2SUR                  S5      2SS24   nU R                  U5      nUR                  XR                  R                   U-  S5      nUcR  U R"                  R%                  5       " U5      nX@R                  R&                  :H  nUc  [        R                  " U5      nOiXR%                  5       " [        R(                  " U R                  R&                  [        R                  UR                  S95      :H  nUR+                  S5      nUR-                  S5      R/                  U5      R1                  UR                  5      nUR1                  UR                  UR2                  5      nUR5                  UU5      nU R                  R6                  (       a  U R"                  " SUUU
S	.UD6nOU R"                  " SUUUUU
S
.UD6n[9        UUUS9$ )aU  
qformer_input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Indices of input sequence tokens in the vocabulary of the Q-Former. Input tokens can optionally be provided
    to serve as text prompt, which the Q-Former model will encode.

    Indices can be obtained using [`InstructBlipVideoProcessor`]. See [`InstructBlipVideoProcessor.__call__`] for
    details.

    [What are input IDs?](../glossary#input-ids)
qformer_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
    Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

    - 1 for tokens that are **not masked**,
    - 0 for tokens that are **masked**.

    [What are attention masks?](../glossary#attention-mask)
decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
    be used by default.

    Only relevant in case an encoder-decoder language model (like T5) is used.
ra   r_   r   NrJ   r  rO   r#   r   r   r   rd  re  r&  r   r&  )r&  r   r$  r%  r&  r  r   )rQ   rT   r  r7   r  rL   r  r   r   ri   	ones_likerepeat_interleaverY   r  r  r*   r  r  r:  video_token_idr   r  r  r  rf   rd   masked_scatteruse_decoder_only_language_modelr  )r@   ra   r"  r#  r   r   r$  r%  r&  r_   r&  r   rj   frameschannelrF   rG   r  image_embedsimage_attention_maskr   query_attention_maskquery_outputsquery_outputlanguage_model_inputsr  outputss                              rB   ro   InstructBlipVideoModel.forward  sG   P 6B5G5G2
GU#++J,?RWX** 
%%=
 

 &a(  %zz,*;*;*=cr*B%**]i]p]pq ((//0B0B10Er2N$zz,*;*;*=cr*B%**]i]p]pq!)%*__5F%G"-??A?N!7!I!I&VW!I!X!&,@BX+Y_`!a 
'1%".#7
 
 %Q'+A\->->q-A+A1(DE !% 8 8 F !6 = =j++JfJfioJoqs t  //DDFyQM!*kk.H.H!H%!&!;!.2K2K2MT[[77uzzR_RfRfg3 " "4!7!7!;/99"=GGVYYZgZnZno 5 8 89M9M}ObOb c%445GI^_;;66)) +-# 	G )) +-"3'=# G D))#*
 	
rD   r  r  r  r   r  )NNNNNNFN)rr   rs   rt   ru   r<  _keep_in_fp32_modulesr$   r1   r:  r  r  r7   r  rx   r  r   r   rv   ry   r   r   r   r  ro   rz   r{   r|   s   @rB   r   r     sd    %O+,6 :8?("e.>.> "uO`O` " 
 ;?.22659:>-1).!%p
''p
 !,,p
 !& 0 04 7	p

 $$t+p
 ((4/p
 !++d2p
 !& 0 04 7p
 ||d*p
 #'p
 $;p
 +,p
 
E	Ep
  p
rD   r   c                   B    \ rS rSr% SrSr\S-  \S'   Sr\	S-  \S'   Sr
g)'BaseModelOutputWithVisionQformerOutputsiQ  z
vision_outputs (`BaseModelOutputWithPooling`):
    Outputs of the vision encoder.
qformer_outputs (`BaseModelOutputWithPoolingAndCrossAttentions`):
    Outputs of the Q-Former (Querying Transformer).
Nr  r  r   )rr   rs   rt   ru   r   r  r   r   r  r   rz   r   rD   rB   r=  r=  Q  s)     9=N.5<KOOADHOrD   r=  a  
    InstructBlipVideo Model for generating text given an image and an optional text prompt. The model consists of a vision
    encoder, Querying Transformer (Q-Former) and a language model.

    One can optionally pass `input_ids` to the model, which serve as a text prompt, to make the language model continue
    the prompt. Otherwise, the language model starts generating text from the [BOS] (beginning-of-sequence) token.
    c                     ^  \ rS rSr% \\S'   SrSrS/rS\4U 4S jjr	S r
S rS	 rS
\R                  4S jrS U 4S jjrS rS rS\R(                  S\R*                  4S jr\\         S!S\R*                  S\R*                  S\R(                  S-  S\R*                  S-  S\R(                  S-  S\R(                  S-  S\R(                  S-  S\R*                  S-  S\R(                  S-  S\S\S-  S\\   S
\\-  4S jj5       5       r\R>                  " 5             S"S\R*                  S\R(                  S-  S\R(                  S-  S\R(                  S-  S\R(                  S-  S\R*                  S-  S\S
\R(                  4S jj5       r \\  S#S\R*                  S\R(                  S\R(                  S-  S\S-  S\\   S
\\!-  4S jj5       5       r"Sr#U =r$$ )$r   i_  r*   ra   Tr   c                   > [         TU ]  U5        [        R                  UR                  5      U l        [        R                  " [        R                  " SUR                  UR                  R                  5      5      U l        [        R                  UR                  5      U l        [        R                   " UR                  R                  UR"                  R                  5      U l        UR&                  (       a!  [(        R*                  " UR"                  5      nO [,        R*                  " UR"                  5      nX l        U R1                  5         g r  )r0   r1   r*  _from_configr  r  r   r6   r7   r   r  r  r2   r   r  r  r   r  r  r/  r!   r  r"   r  r/  )r@   r*   r  rA   s      rB   r1   2InstructBlipVideoForConditionalGeneration.__init__o  s     8EEfFZFZ[LLQ8O8OQWQfQfQrQr)st4AA&BWBWX#%99V-B-B-N-NPVPbPbPnPn#o 111==f>P>PQN2>>v?Q?QRN, 	rD   c                 6    U R                   R                  5       $ r  r  r9  s    rB   r:  >InstructBlipVideoForConditionalGeneration.get_input_embeddings  r  rD   c                 :    U R                   R                  U5        g r  r  r  s     rB   r  >InstructBlipVideoForConditionalGeneration.set_input_embeddings  r  rD   c                 :    U R                   R                  U5        g r  )r  set_output_embeddings)r@   new_embeddingss     rB   rG  ?InstructBlipVideoForConditionalGeneration.set_output_embeddings  s    11.ArD   rH   c                 6    U R                   R                  5       $ r  )r  get_output_embeddingsr9  s    rB   rK  ?InstructBlipVideoForConditionalGeneration.get_output_embeddings  s    ""88::rD   Nc                 X   > Uc  U R                   R                  5       $ [        TU ]  US9$ )N)modality)r  get_encoderr0   )r@   rN  rA   s     rB   rO  5InstructBlipVideoForConditionalGeneration.get_encoder  s1    &&22447&&99rD   c                 6    U R                   R                  5       $ r  )r  get_decoderr9  s    rB   rR  5InstructBlipVideoForConditionalGeneration.get_decoder  s    ""..00rD   c                 "   U R                   n[        U5      S:  a=  SU;  a7  [        R                  R	                  5       S:  a  [
        R                  S5        [        U R                  S5      (       a  SU R                  R                  l
        ggr  r	  r  s     rB   r  @InstructBlipVideoForConditionalGeneration._preprocess_accelerate  r  rD   r   r&  c           	         Ucj  X R                  5       " [        R                  " U R                  R                  [        R
                  UR                  S95      :H  nUR                  S5      nOXR                  R                  :H  nUR                  S5      R                  U5      R                  UR                  5      nU$ r  )r:  r7   r   r*   r-  r  r   r  r  r  rf   r  s       rB   r  >InstructBlipVideoForConditionalGeneration.get_placeholder_mask  r!  rD   r"  r#  r   r$  r%  labelsr_   r&  r   c           
         U R                   " U4UUU
S.UD6nUR                  nUR                  nUR                  nUc  U R	                  5       " U5      nUc  [
        R                  " U5      nUR                  UR                  UR                  5      nU R                  XHS9nUR                  UU5      nU R                  R                  (       aT  U R                  " S	UUUS.UD6nUS   nSnU	b3  U R                  " S	UXR                  R                   R"                  S.UD6nO1U R                  " S	UUUUU	US.UD6nUR$                  nUR&                  n[)        UUUUUS9$ )
a	  
qformer_input_ids (`torch.LongTensor` of shape (batch_size, sequence_length)):
    The sequence used as a prompt to be fed to the Q-Former module.
qformer_attention_mask (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
    Mask to avoid performing attention on padding token indices.

Examples:

```python
>>> from transformers import InstructBlipVideoProcessor, InstructBlipVideoForConditionalGeneration
>>> import torch
>>> from huggingface_hub import hf_hub_download
>>> import av
>>> import numpy as np

>>> def read_video_pyav(container, indices):
...     '''
...     Decode the video with PyAV decoder.
...     Args:
...         container (`av.container.input.InputContainer`): PyAV container.
...         indices (`list[int]`): List of frame indices to decode.
...     Returns:
...         result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
...     '''
...     frames = []
...     container.seek(0)
...     start_index = indices[0]
...     end_index = indices[-1]
...     for i, frame in enumerate(container.decode(video=0)):
...         if i > end_index:
...             break
...         if i >= start_index and i in indices:
...             frames.append(frame)
...     return np.stack([x.to_ndarray(format="rgb24") for x in frames])

>>> model = InstructBlipVideoForConditionalGeneration.from_pretrained("Salesforce/instructblip-vicuna-7b", device_map="auto")
>>> processor = InstructBlipVideoProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b")

>>> file_path = hf_hub_download(
...       repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
... )
>>> container = av.open(file_path)

>>> # sample uniformly 4 frames from the videWhy is this video funny?o
>>> total_frames = container.streams.video[0].frames
>>> indices = np.arange(0, total_frames, total_frames / 4).astype(int)
>>> clip = read_video_pyav(container, indices)

>>> prompt = "What is happening in the video?"
>>> inputs = processor(text=prompt, images=clip, return_tensors="pt").to(model.device)

>>> outputs = model.generate(
...     **inputs,
...     do_sample=False,
...     num_beams=5,
...     max_length=256,
...     repetition_penalty=1.5,
...     length_penalty=1.0,
... )
>>> generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0].strip()
>>> print(generated_text)
"A person is eating a bowl of pasta, and they are using a fork to eat it. The person is sitting at a table, and the plate of pasta is on the table in front"
```r"  r#  r_   Nr&  r*  r   )r  rX  r   )r&  r   r$  r%  rX  r&  )r  r  r  r  r  r   )get_video_featuresr4  r  r  r:  r7   r+  rf   r   rd   r  r.  r*   r/  r  loss_functionr  r   r  r  r  )r@   ra   r"  r#  r   r   r$  r%  r&  rX  r_   r&  r   video_featuresr7  r  r  r  r8  r  r  s                        rB   ro   1InstructBlipVideoForConditionalGeneration.forward  s   ` CGBYBYC
/#9%=	C

 C
 !/ < <(88'66  557	BM!"__Y7N 5 8 89M9M}ObOb c!66y6^%445GI^_;;66)) +-# 	G QZFD!)) !&[[=T=T=_=_ci
 )) +-"3'=# G <<D^^FC)+#*
 	
rD   c                 X   [        U S5      (       a  U R                  5         UR                  S   n	U R                  UUUUS9n
U
R                  nUc  Uc  U R
                  R                  /U R
                  R                  -  S-  nXR
                  R                  R                  /-   n[        R                  " U/[        R                  UR                  S9nUR                  U	S5      nU R                  5       " U5      nUc  [        R                   " U5      nUR#                  UR                  UR$                  5      nU R'                  XFS9nUR)                  X5      nXeS.nU R*                  R
                  R,                  (       d  XOS	'   U R*                  R.                  " S
0 UDUD6nU$ )aA  
Overrides `generate` function to be able to use the model as a conditional generator.

Args:
    pixel_values (`torch.FloatTensor` of shape (batch_size, num_channels, height, width) or
        (batch_size, num_frames, num_channels, height, width)): Input images or videos to be processed.
    qformer_input_ids (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
        The sequence used as a prompt to be fed to the Q-Former module.
    qformer_attention_mask (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
        Mask to avoid performing attention on padding token indices.
    input_ids (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
        The sequence used as a prompt for the generation.
    attention_mask (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
        Mask to avoid performing attention on padding token indices.
    inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        Embedded representation of the inputs. Should be float, not int tokens.
    interpolate_pos_encoding (`bool`, *optional*, defaults to `False`):
        Whether to interpolate the positional encoding of the image embeddings.

Returns:
    captions (list): A list of strings of length batch_size * num_captions.
r
  r   rZ  r   r  r#   r[  )r&  r   r   r   )rA  r  rQ   r\  r4  r*   video_token_indexr  r  bos_token_idr7   r   r  r   repeatr:  r+  rf   rd   r  r.  r  is_encoder_decodergenerate)r@   ra   r"  r#  r   r   r&  r_   generate_kwargsrj   r^  r7  video_tokensstart_tokensr  inputsr8  s                    rB   re  2InstructBlipVideoForConditionalGeneration.generateC  s   D 4))'')!''*
BFBYBY/#9%=	 CZ C
 !/ < <   $ = =>A]A]]`aa+{{/F/F/S/S.TT!LL,uzzR^ReRef	%,,Z;	 557	BM!"__Y7N 5 8 89M9M}ObOb c!66y6^%445G_#0S""))<<"+;%%..KK?KrD   c           	         UR                   u  pgpn
UR                  Xg-  XU
5      nU R                  " S
UUS.UD6n[        UR                  UR
                  UR                  UR                  USS9nUS   n[        R                  " UR                  5       SS [        R                  UR                  S9nU R                  R                  UR                   S   SS5      n[        R                  " UR                  5       SS [        R                  UR                  S9nUc  [        R                  " U5      nUR!                  USS9nUR!                  USS9n[        R"                  " X/SS9nU R$                  " S
UUUUUS	.UD6nUUl        US   SS2SUR                  S5      2SS24   nU R)                  U5      nUR                  X`R*                  R,                  U-  S5      nUUl        U$ )a  
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
    The tensors corresponding to the input images.
qformer_input_ids (`torch.LongTensor` of shape (batch_size, sequence_length)):
    The sequence used as a prompt to be fed to the Q-Former module.
qformer_attention_mask (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
    Mask to avoid performing attention on padding token indices.
r(  N)r%  r4  r   r+  r  r  r   rJ   r  rO   r#   r)  r   )rQ   rT   r  r=  r%  r4  r   r+  r7   r  rL   r  r   r   ri   r+  r,  rY   r  r  r  r*   r  )r@   ra   r"  r#  r_   r   rj   r0  r1  rF   rG   r  r2  r3  r   r4  r  r6  r^  s                      rB   r\  <InstructBlipVideoForConditionalGeneration.get_video_features  s   ( 6B5G5G2
GU#++J,?RWX595F5F 6
%%=6
 6

 A,>>(66(66%00) 
 &a(  %zz,*;*;*=cr*B%**]i]p]pq ((//0B0B10Er2N$zz,*;*;*=cr*B%**]i]p]pq!)%*__5F%G"-??A?N!7!I!I&VW!I!X!&,@+Y_`!a,, 
'1%".#7
 
 *9&&q)!-C|/@/@/C-CQ*FG 11,? (//
KK<X<X[a<acef'5$rD   r:  r  )	NNNNNNNFN)NNNNNFr  )%rr   rs   rt   ru   r$   r   r<  r   r;  r1   r:  r  rG  r   ModulerK  rO  rR  r  r7   r  rx   r  r   r   ry   r   r   r   r  ro   r   re  r=  r\  rz   r{   r|   s   @rB   r   r   _  s    $#$O!+,6 (:8B;ryy ;:1?("e.>.> "uO`O` " 
 ;?.22659:>26*.).!%D
''D
 !,,D
 !& 0 04 7	D

 $$t+D
 ((4/D
 !++d2D
 !& 0 04 7D
 ((4/D
   4'D
 #'D
 $;D
 +,D
 
E	ED
  D
L ]]_ 6::>-12626).C''C !++d2C !& 0 04 7	C
 ##d*C ((4/C ((4/C #'C 
		C CJ 
 ;?05C''C !++C !& 0 04 7	C
 #'+C +,C 
8	8C  CrD   r   )r*  r   r  r   r   )r   )Pr`  collections.abcr   dataclassesr   typingr   r7   r    r   r   activationsr	   
generationr
   modeling_layersr   modeling_outputsr   r   r   r   r   r   modeling_utilsr   r   processing_utilsr   pytorch_utilsr   utilsr   r   r   r   r   r   utils.genericr   utils.output_capturingr   r   autor    r!   r"   configuration_instructblipvideor$   r%   r&   
get_loggerrr   r  rm  r(   r~   r   rv   floatr   r   r   r  r  r*  r   r   r  r  r  r  r  r  r  r   r=  r   __all__r   rD   rB   <module>r     s  ,  $ !    & ! ) 9  G & 6 j j 7 E I I  
		H	%G		 GT/ /d i i iR %II%<<% 
% <<	%
 LL4'% % %.G) G)T299 $> @@ryy @@3#C 3l^. ^.B  		  2299 RYY R$> Rj!
bii !
HY
$D Y
x 
 
; 
 
: 
l
= l

l
^ 
	P.H 	P  	P f0PRa ffRrD   