import torch
from huggingface_hub.dataclasses import strict

from transformers.models.instructblip.configuration_instructblip import (
    InstructBlipConfig,
    InstructBlipQFormerConfig,
    InstructBlipVisionConfig,
)
from transformers.models.instructblip.modeling_instructblip import (
    BaseModelOutputWithVisionQformerOutputs,
    InstructBlipForConditionalGeneration,
    InstructBlipForConditionalGenerationModelOutput,
    InstructBlipModel,
    InstructBlipPreTrainedModel,
    InstructBlipQFormerModel,
    InstructBlipVisionModel,
    TransformersKwargs,
)

from ...modeling_outputs import BaseModelOutputWithPooling
from ...processing_utils import Unpack
from ...utils import auto_docstring, can_return_tuple


@auto_docstring(checkpoint="Salesforce/instructblip-flan-t5-xl")
@strict
class InstructBlipVideoVisionConfig(InstructBlipVisionConfig):
    r"""
    Example:

    ```python
    >>> from transformers import InstructBlipVideoVisionConfig, InstructBlipVideoVisionModel

    >>> # Initializing a InstructBlipVideoVisionConfig with Salesforce/instructblip-flan-t5-xl style configuration
    >>> configuration = InstructBlipVideoVisionConfig()

    >>> # Initializing a InstructBlipVideoVisionModel (with random weights) from the Salesforce/instructblip-flan-t5-xl style configuration
    >>> model = InstructBlipVideoVisionModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""
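

# As in other `modular_*.py` files, the thin subclasses in this module are expanded by
# the modular converter into standalone `configuration_instructblipvideo.py` and
# `modeling_instructblipvideo.py` files; an empty class body means the InstructBlip
# parent's behavior is inherited unchanged under the video-specific name.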


@auto_docstring(checkpoint="Salesforce/instructblip-flan-t5-xl")
@strict
class InstructBlipVideoQFormerConfig(InstructBlipQFormerConfig):
    r"""
    cross_attention_frequency (`int`, *optional*, defaults to 2):
        The frequency of adding cross-attention to the Transformer layers.
    encoder_hidden_size (`int`, *optional*, defaults to 1408):
        The hidden size of the hidden states for cross-attention.

    Examples:

    ```python
    >>> from transformers import InstructBlipVideoQFormerConfig, InstructBlipVideoQFormerModel

    >>> # Initializing a InstructBlipVideo Salesforce/instructblip-flan-t5-xl style configuration
    >>> configuration = InstructBlipVideoQFormerConfig()

    >>> # Initializing a model (with random weights) from the Salesforce/instructblip-flan-t5-xl style configuration
    >>> model = InstructBlipVideoQFormerModel(configuration)
    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""
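

# In the composite config below, `attribute_map` aliases the serialized `video_token_id`
# to the `video_token_index` attribute used by the modeling code, and assigning
# `AttributeError()` to an inherited field such as `image_token_index` signals the
# modular converter to drop it from the generated class.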
@auto_docstring(checkpoint="Salesforce/instructblip-flan-t5-xl")
@strict
class InstructBlipVideoConfig(InstructBlipConfig):
    r"""
    qformer_config (`dict`, *optional*):
        Dictionary of configuration options used to initialize [`InstructBlipVideoQFormerConfig`].
    num_query_tokens (`int`, *optional*, defaults to 32):
        The number of query tokens passed through the Transformer.

    Example:

    ```python
    >>> from transformers import (
    ...     InstructBlipVideoVisionConfig,
    ...     InstructBlipVideoQFormerConfig,
    ...     OPTConfig,
    ...     InstructBlipVideoConfig,
    ...     InstructBlipVideoForConditionalGeneration,
    ... )

    >>> # Initializing a InstructBlipVideoConfig with Salesforce/instructblip-flan-t5-xl style configuration
    >>> configuration = InstructBlipVideoConfig()

    >>> # Initializing a InstructBlipVideoForConditionalGeneration (with random weights) from the Salesforce/instructblip-flan-t5-xl style configuration
    >>> model = InstructBlipVideoForConditionalGeneration(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config

    >>> # We can also initialize a InstructBlipVideoConfig from a InstructBlipVideoVisionConfig, InstructBlipVideoQFormerConfig and any PreTrainedConfig

    >>> # Initializing Instructblipvideo vision, Instructblipvideo Q-Former and language model configurations
    >>> vision_config = InstructBlipVideoVisionConfig()
    >>> qformer_config = InstructBlipVideoQFormerConfig()
    >>> text_config = OPTConfig()

    >>> config = InstructBlipVideoConfig(vision_config=vision_config, qformer_config=qformer_config, text_config=text_config)
    ```"""

    attribute_map = {"video_token_id": "video_token_index"}
    video_token_index: int | None = None
    image_token_index = AttributeError()


class InstructBlipVideoPreTrainedModel(InstructBlipPreTrainedModel):
    input_modalities = ("video", "text")


class InstructBlipVideoVisionModel(InstructBlipVisionModel):
    input_modalities = "video"


class InstructBlipVideoQFormerModel(InstructBlipQFormerModel):
    pass


class InstructBlipVideoForConditionalGenerationModelOutput(InstructBlipForConditionalGenerationModelOutput):
    pass


class InstructBlipVideoModel(InstructBlipModel):
    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        pixel_values: torch.FloatTensor,
        qformer_input_ids: torch.FloatTensor,
        qformer_attention_mask: torch.LongTensor | None = None,
        input_ids: torch.FloatTensor | None = None,
        attention_mask: torch.Tensor | None = None,
        decoder_input_ids: torch.LongTensor | None = None,
        decoder_attention_mask: torch.LongTensor | None = None,
        inputs_embeds: torch.FloatTensor | None = None,
        interpolate_pos_encoding: bool = False,
        use_cache: bool | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple | InstructBlipVideoForConditionalGenerationModelOutput:
        batch_size, frames, channel, height, width = pixel_values.shape
        pixel_values = pixel_values.reshape(batch_size * frames, channel, height, width)

        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            interpolate_pos_encoding=interpolate_pos_encoding,
            **kwargs,
        )
        image_embeds = vision_outputs[0]

        image_attention_mask = torch.ones(image_embeds.size()[:-1], dtype=torch.long, device=image_embeds.device)

        query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1)
        query_attention_mask = torch.ones(query_tokens.size()[:-1], dtype=torch.long, device=image_embeds.device)
        if qformer_attention_mask is None:
            qformer_attention_mask = torch.ones_like(qformer_input_ids)

        qformer_input_ids = qformer_input_ids.repeat_interleave(frames, dim=0)
        qformer_attention_mask = qformer_attention_mask.repeat_interleave(frames, dim=0)
        qformer_attention_mask = torch.cat([query_attention_mask, qformer_attention_mask], dim=1)
        query_outputs = self.qformer(
            input_ids=qformer_input_ids,
            attention_mask=qformer_attention_mask,
            query_embeds=query_tokens,
            encoder_hidden_states=image_embeds,
            encoder_attention_mask=image_attention_mask,
            **kwargs,
        )
        query_output = query_outputs[0][:, : query_tokens.size(1), :]

        language_model_inputs = self.language_projection(query_output)

        # unbatch inputs back, each video-frame gets `num_query_tokens` seq length
        language_model_inputs = language_model_inputs.reshape(batch_size, self.config.num_query_tokens * frames, -1)

        if inputs_embeds is None:
            inputs_embeds = self.language_model.get_input_embeddings()(input_ids)
            special_image_mask = input_ids == self.config.video_token_id
            if attention_mask is None:
                attention_mask = torch.ones_like(input_ids)
        else:
            special_image_mask = inputs_embeds == self.language_model.get_input_embeddings()(
                torch.tensor(self.config.video_token_id, dtype=torch.long, device=inputs_embeds.device)
            )
            special_image_mask = special_image_mask.all(-1)

        special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
        language_model_inputs = language_model_inputs.to(inputs_embeds.device, inputs_embeds.dtype)
        inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, language_model_inputs)

        if self.config.use_decoder_only_language_model:
            outputs = self.language_model(
                inputs_embeds=inputs_embeds,
                attention_mask=attention_mask,
                use_cache=use_cache,
                **kwargs,
            )
        else:
            outputs = self.language_model(
                inputs_embeds=inputs_embeds,
                attention_mask=attention_mask,
                decoder_input_ids=decoder_input_ids,
                decoder_attention_mask=decoder_attention_mask,
                use_cache=use_cache,
                **kwargs,
            )

        return InstructBlipVideoForConditionalGenerationModelOutput(
            vision_outputs=vision_outputs,
            qformer_outputs=query_outputs,
            language_model_outputs=outputs,
        )
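

# Shape contract used by `InstructBlipVideoModel.forward` above and the
# conditional-generation class below (illustrative numbers, assuming the processor's
# default 4-frame sampling and the default `num_query_tokens=32`):
#   pixel_values          (batch, 4, 3, H, W)   -> flattened to (batch * 4, 3, H, W)
#   image_embeds          (batch * 4, vis_seq, vis_hidden)  from the vision tower
#   query_output          (batch * 4, 32, qformer_hidden)   from the Q-Former
#   language_model_inputs (batch, 32 * 4, lm_hidden)        after projection + reshape,
# which is scattered into the video placeholder positions of the text prompt.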


class InstructBlipVideoForConditionalGeneration(InstructBlipForConditionalGeneration):
    @can_return_tuple
    @auto_docstring
    def get_video_features(
        self,
        pixel_values: torch.FloatTensor,
        qformer_input_ids: torch.LongTensor,
        qformer_attention_mask: torch.LongTensor | None = None,
        interpolate_pos_encoding: bool | None = False,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple | BaseModelOutputWithVisionQformerOutputs:
        r"""
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
            The tensors corresponding to the input images.
        qformer_input_ids (`torch.LongTensor` of shape (batch_size, sequence_length)):
            The sequence used as a prompt to be fed to the Q-Former module.
        qformer_attention_mask (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
            Mask to avoid performing attention on padding token indices.
        """
        batch_size, frames, channel, height, width = pixel_values.shape
        pixel_values = pixel_values.reshape(batch_size * frames, channel, height, width)

        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            interpolate_pos_encoding=interpolate_pos_encoding,
            **kwargs,
        )
        vision_outputs = BaseModelOutputWithPooling(
            last_hidden_state=vision_outputs.last_hidden_state,
            pooler_output=vision_outputs.pooler_output,
            hidden_states=vision_outputs.hidden_states,
            attentions=vision_outputs.attentions,
        )
        image_embeds = vision_outputs[0]

        image_attention_mask = torch.ones(image_embeds.size()[:-1], dtype=torch.long, device=image_embeds.device)

        query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1)
        query_attention_mask = torch.ones(query_tokens.size()[:-1], dtype=torch.long, device=image_embeds.device)
        if qformer_attention_mask is None:
            qformer_attention_mask = torch.ones_like(qformer_input_ids)

        qformer_input_ids = qformer_input_ids.repeat_interleave(frames, dim=0)
        qformer_attention_mask = qformer_attention_mask.repeat_interleave(frames, dim=0)
        qformer_attention_mask = torch.cat([query_attention_mask, qformer_attention_mask], dim=1)
        query_outputs = self.qformer(
            input_ids=qformer_input_ids,
            attention_mask=qformer_attention_mask,
            query_embeds=query_tokens,
            encoder_hidden_states=image_embeds,
            encoder_attention_mask=image_attention_mask,
            **kwargs,
        )
        query_output = query_outputs[0][:, : query_tokens.size(1), :]

        language_model_inputs = self.language_projection(query_output)

        # unbatch inputs back, each video-frame gets `num_query_tokens` seq length
        video_features = language_model_inputs.reshape(batch_size, self.config.num_query_tokens * frames, -1)

        return BaseModelOutputWithVisionQformerOutputs(
            language_model_inputs=video_features,
            vision_outputs=vision_outputs,
            qformer_outputs=query_outputs,
        )

    def get_image_features(self, **super_kwargs):
        raise AttributeError("No need to inherit as this architecture only supports videos.")
    def get_placeholder_mask(self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor):
        """
        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`.
        """
        if input_ids is None:
            special_image_mask = inputs_embeds == self.get_input_embeddings()(
                torch.tensor(self.config.video_token_id, dtype=torch.long, device=inputs_embeds.device)
            )
            special_image_mask = special_image_mask.all(-1)
        else:
            special_image_mask = input_ids == self.config.video_token_id

        special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
        return special_image_mask

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        pixel_values: torch.FloatTensor,
        qformer_input_ids: torch.LongTensor,
        qformer_attention_mask: torch.LongTensor | None = None,
        input_ids: torch.LongTensor | None = None,
        attention_mask: torch.LongTensor | None = None,
        decoder_input_ids: torch.LongTensor | None = None,
        decoder_attention_mask: torch.LongTensor | None = None,
        inputs_embeds: torch.FloatTensor | None = None,
        labels: torch.LongTensor | None = None,
        interpolate_pos_encoding: bool = False,
        use_cache: bool | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple | InstructBlipVideoForConditionalGenerationModelOutput:
        r"""
        qformer_input_ids (`torch.LongTensor` of shape (batch_size, sequence_length)):
            The sequence used as a prompt to be fed to the Q-Former module.
        qformer_attention_mask (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
            Mask to avoid performing attention on padding token indices.

        Examples:

        ```python
        >>> from transformers import InstructBlipVideoProcessor, InstructBlipVideoForConditionalGeneration
        >>> import torch
        >>> from huggingface_hub import hf_hub_download
        >>> import av
        >>> import numpy as np

        >>> def read_video_pyav(container, indices):
        ...     '''
        ...     Decode the video with PyAV decoder.
        ...     Args:
        ...         container (`av.container.input.InputContainer`): PyAV container.
        ...         indices (`list[int]`): List of frame indices to decode.
        ...     Returns:
        ...         result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
        ...     '''
        ...     frames = []
        ...     container.seek(0)
        ...     start_index = indices[0]
        ...     end_index = indices[-1]
        ...     for i, frame in enumerate(container.decode(video=0)):
        ...         if i > end_index:
        ...             break
        ...         if i >= start_index and i in indices:
        ...             frames.append(frame)
        ...     return np.stack([x.to_ndarray(format="rgb24") for x in frames])

        >>> model = InstructBlipVideoForConditionalGeneration.from_pretrained("Salesforce/instructblip-vicuna-7b", device_map="auto")
        >>> processor = InstructBlipVideoProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b")

        >>> file_path = hf_hub_download(
        ...       repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
        ... )
        >>> container = av.open(file_path)

        >>> # sample uniformly 4 frames from the video
        >>> total_frames = container.streams.video[0].frames
        >>> indices = np.arange(0, total_frames, total_frames / 4).astype(int)
        >>> clip = read_video_pyav(container, indices)

        >>> prompt = "What is happening in the video?"
        >>> inputs = processor(text=prompt, images=clip, return_tensors="pt").to(model.device)

        >>> outputs = model.generate(
        ...     **inputs,
        ...     do_sample=False,
        ...     num_beams=5,
        ...     max_length=256,
        ...     repetition_penalty=1.5,
        ...     length_penalty=1.0,
        ... )
        >>> generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0].strip()
        >>> print(generated_text)
        "A person is eating a bowl of pasta, and they are using a fork to eat it. The person is sitting at a table, and the plate of pasta is on the table in front"
        ```"""
        video_features = self.get_video_features(
            pixel_values,
            qformer_input_ids=qformer_input_ids,
            qformer_attention_mask=qformer_attention_mask,
            interpolate_pos_encoding=interpolate_pos_encoding,
            **kwargs,
        )
        language_model_inputs = video_features.language_model_inputs
        vision_outputs = video_features.vision_outputs
        query_outputs = video_features.qformer_outputs

        if inputs_embeds is None:
            inputs_embeds = self.get_input_embeddings()(input_ids)
        if attention_mask is None:
            attention_mask = torch.ones_like(input_ids)

        language_model_inputs = language_model_inputs.to(inputs_embeds.device, inputs_embeds.dtype)
        special_image_mask = self.get_placeholder_mask(input_ids, inputs_embeds=inputs_embeds)
        inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, language_model_inputs)

        if self.config.use_decoder_only_language_model:
            outputs = self.language_model(
                inputs_embeds=inputs_embeds,
                attention_mask=attention_mask,
                use_cache=use_cache,
                **kwargs,
            )
            logits = outputs[0]
            loss = None
            if labels is not None:
                loss = self.loss_function(
                    logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **kwargs
                )
        else:
            outputs = self.language_model(
                inputs_embeds=inputs_embeds,
                attention_mask=attention_mask,
                decoder_input_ids=decoder_input_ids,
                decoder_attention_mask=decoder_attention_mask,
                labels=labels,
                use_cache=use_cache,
                **kwargs,
            )
            loss = outputs.loss
            logits = outputs.logits

        return InstructBlipVideoForConditionalGenerationModelOutput(
            loss=loss,
            logits=logits,
            vision_outputs=vision_outputs,
            qformer_outputs=query_outputs,
            language_model_outputs=outputs,
        )

    @torch.no_grad()
    def generate(
        self,
        pixel_values: torch.FloatTensor,
        qformer_input_ids: torch.LongTensor | None = None,
        qformer_attention_mask: torch.LongTensor | None = None,
        input_ids: torch.LongTensor | None = None,
        attention_mask: torch.LongTensor | None = None,
        inputs_embeds: torch.FloatTensor | None = None,
        interpolate_pos_encoding: bool = False,
        **generate_kwargs,
    ) -> torch.LongTensor:
        r"""
        Overrides `generate` function to be able to use the model as a conditional generator.

        Args:
            pixel_values (`torch.FloatTensor` of shape (batch_size, num_channels, height, width) or
                (batch_size, num_frames, num_channels, height, width)): Input images or videos to be processed.
            qformer_input_ids (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
                The sequence used as a prompt to be fed to the Q-Former module.
            qformer_attention_mask (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
                Mask to avoid performing attention on padding token indices.
            input_ids (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
                The sequence used as a prompt for the generation.
            attention_mask (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
                Mask to avoid performing attention on padding token indices.
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Embedded representation of the inputs. Should be float, not int tokens.
            interpolate_pos_encoding (`bool`, *optional*, defaults to `False`):
                Whether to interpolate the positional encoding of the image embeddings.

        Returns:
            captions (list): A list of strings of length batch_size * num_captions.
        """
        if hasattr(self, "hf_device_map"):
            # preprocess for `accelerate`
            self._preprocess_accelerate()

        batch_size = pixel_values.shape[0]
        video_features = self.get_video_features(
            pixel_values,
            qformer_input_ids=qformer_input_ids,
            qformer_attention_mask=qformer_attention_mask,
            interpolate_pos_encoding=interpolate_pos_encoding,
        )
        language_model_inputs = video_features.language_model_inputs

        if input_ids is None:
            video_tokens = [self.config.video_token_index] * self.config.num_query_tokens * 4
            start_tokens = video_tokens + [self.config.text_config.bos_token_id]
            input_ids = torch.tensor([start_tokens], dtype=torch.long, device=pixel_values.device)
            input_ids = input_ids.repeat(batch_size, 1)

        if inputs_embeds is None:
            inputs_embeds = self.get_input_embeddings()(input_ids)
        if attention_mask is None:
            attention_mask = torch.ones_like(input_ids)

        language_model_inputs = language_model_inputs.to(inputs_embeds.device, inputs_embeds.dtype)
        special_image_mask = self.get_placeholder_mask(input_ids, inputs_embeds=inputs_embeds)
        inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, language_model_inputs)

        inputs = {"inputs_embeds": inputs_embeds, "attention_mask": attention_mask}
        if not self.language_model.config.is_encoder_decoder:
            inputs["input_ids"] = input_ids

        outputs = self.language_model.generate(**inputs, **generate_kwargs)

        return outputs


__all__ = [
    "InstructBlipVideoConfig",
    "InstructBlipVideoQFormerConfig",
    "InstructBlipVideoVisionConfig",
    "InstructBlipVideoVisionModel",
    "InstructBlipVideoPreTrainedModel",
    "InstructBlipVideoQFormerModel",
    "InstructBlipVideoModel",
    "InstructBlipVideoForConditionalGeneration",
]