
    Z jK                        S SK Jr  S SKrS SKJr  SSKJr  SSKJr  SSKJ	r	  SSK
JrJrJr  SS	KJr  SS
KJr  SSKJrJrJr  SSKJr  SSKJr  SSKJr  \\" SS9 " S S\5      5       5       r\" SS9\ " S S\5      5       5       r " S S\R<                  5      r\ " S S\5      5       r \" SS9 " S S\ 5      5       r!\" SS9 " S  S!\ \	5      5       r"/ S"Qr#g)#    )	dataclassN)nn   )ACT2FN)Cache)GenerationMixin)BaseModelOutputWithPastBaseModelOutputWithPoolingModelOutput)PreTrainedModel)Unpack)TransformersKwargsauto_docstringtorch_compilable_check)can_return_tuple   )	AutoModel   )VipLlavaConfigzM
    Base class for VipLlava outputs, with hidden states and attentions.
    custom_introc                   B    \ rS rSr% SrSr\R                  S-  \S'   Sr	g)VipLlavaModelOutputWithPast&   a  
past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
    It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

    Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
    `past_key_values` input) to speed up sequential decoding.
image_hidden_states (`torch.FloatTensor`, *optional*):
    A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
    image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
Nimage_hidden_states )
__name__
__module____qualname____firstlineno____doc__r   torchFloatTensor__annotations____static_attributes__r       /root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/vipllava/modeling_vipllava.pyr   r   &   s    	 59**T18r&   r   zT
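
# Sketch of how the fields above are consumed downstream (assumed call pattern, for
# illustration only):
#
#   out = model(input_ids=ids, pixel_values=pixels)
#   out.last_hidden_state     # (batch_size, sequence_length, hidden_size)
#   out.image_hidden_states   # projected vision features, or None when no image was passed
#
# `image_hidden_states` is the only field added on top of `BaseModelOutputWithPast`.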

@dataclass
@auto_docstring(
    custom_intro="""
    Base class for VipLlava causal language model (or autoregressive) outputs.
    """
)
class VipLlavaCausalLMOutputWithPast(ModelOutput):
    r"""
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    """

    loss: torch.FloatTensor | None = None
    logits: torch.FloatTensor | None = None
    past_key_values: Cache | None = None
    hidden_states: tuple[torch.FloatTensor] | None = None
    attentions: tuple[torch.FloatTensor] | None = None
    image_hidden_states: torch.FloatTensor | None = None


class VipLlavaMultiModalProjector(nn.Module):
    def __init__(self, config: VipLlavaConfig):
        super().__init__()
        num_feature_layers = 1 if isinstance(config.vision_feature_layers, int) else len(config.vision_feature_layers)
        self.projector_layernorm = nn.LayerNorm(
            num_feature_layers * config.vision_config.hidden_size, eps=config.projector_layernorm_eps
        )

        self.linear_1 = nn.Linear(
            num_feature_layers * config.vision_config.hidden_size,
            config.text_config.hidden_size,
            bias=True,
        )
        self.act = ACT2FN[config.projector_hidden_act]
        self.linear_2 = nn.Linear(config.text_config.hidden_size, config.text_config.hidden_size, bias=True)

    def forward(self, hidden_states):
        hidden_states = self.projector_layernorm(hidden_states)
        hidden_states = self.linear_1(hidden_states)
        hidden_states = self.act(hidden_states)
        hidden_states = self.linear_2(hidden_states)
        return hidden_states


@auto_docstring
class VipLlavaPreTrainedModel(PreTrainedModel):
    config: VipLlavaConfig
    base_model_prefix = "model"
    input_modalities = ("image", "text")
    supports_gradient_checkpointing = True
    _skip_keys_device_placement = "past_key_values"

    _supports_flash_attn = True
    _supports_sdpa = True

    _can_compile_fullgraph = True
    _supports_flex_attn = True
    _supports_attention_backend = True

@auto_docstring(
    custom_intro="""
    The VipLlava model which consists of a vision backbone and a language model, without a language modeling head.
    """
)
class VipLlavaModel(VipLlavaPreTrainedModel):
    def __init__(self, config: VipLlavaConfig):
        super().__init__(config)
        self.vision_tower = AutoModel.from_config(config.vision_config)
        self.multi_modal_projector = VipLlavaMultiModalProjector(config)
        self.language_model = AutoModel.from_config(config.text_config)
        self.post_init()

    def get_input_embeddings(self):
        return self.language_model.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.language_model.set_input_embeddings(value)

    @can_return_tuple
    @auto_docstring(
        custom_intro="Obtains image last hidden states from the vision tower and applies multimodal projection."
    )
    def get_image_features(
        self,
        pixel_values: torch.FloatTensor,
        vision_feature_layers: int | list[int] | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> BaseModelOutputWithPooling:
        r"""
        pixel_values (`torch.FloatTensor` of shape `(batch_size, channels, height, width)`):
            The tensors corresponding to the input images.
        vision_feature_layers (`Union[int, list[int]]`, *optional*):
            The vision feature layer, or the list of indexes of the layers to select
            the vision feature.
        """
        vision_feature_layers = (
            vision_feature_layers if vision_feature_layers is not None else self.config.vision_feature_layers
        )
        kwargs["output_hidden_states"] = True
        image_outputs = self.vision_tower(pixel_values, **kwargs)

        # If multiple feature layers are provided (which is usually the case), the features are
        # concatenated along the channel dimension after the CLS token is removed from each layer.
        if isinstance(vision_feature_layers, int):
            image_features = image_outputs.hidden_states[vision_feature_layers][:, 1:]
        else:
            image_features = [image_outputs.hidden_states[index][:, 1:] for index in vision_feature_layers]
            image_features = torch.cat(image_features, dim=-1)
        image_features = self.multi_modal_projector(image_features)
        image_outputs.pooler_output = image_features
        return image_outputs

    def get_placeholder_mask(
        self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor, image_features: torch.FloatTensor
    ):
        """
        Obtains the multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder
        token count is equal to the length of the multimodal features. If the lengths differ, an error is raised.
        """
        if input_ids is None:
            special_image_mask = inputs_embeds == self.get_input_embeddings()(
                torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device)
            )
            special_image_mask = special_image_mask.all(-1)
        else:
            special_image_mask = input_ids == self.config.image_token_id

        n_image_tokens = special_image_mask.sum()
        special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
        n_image_features = image_features.shape[0] * image_features.shape[1]
        torch_compilable_check(
            inputs_embeds[special_image_mask].numel() == image_features.numel(),
            f"Image features and image tokens do not match, tokens: {n_image_tokens}, features: {n_image_features}",
        )
        return special_image_mask

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        pixel_values: torch.FloatTensor | None = None,
        attention_mask: torch.Tensor | None = None,
        position_ids: torch.LongTensor | None = None,
        past_key_values: Cache | None = None,
        inputs_embeds: torch.FloatTensor | None = None,
        vision_feature_layers: int | list[int] | None = None,
        use_cache: bool | None = None,
        **lm_kwargs: Unpack[TransformersKwargs],
    ) -> VipLlavaModelOutputWithPast:
        r"""
        vision_feature_layers (`Union[int, list[int]]`, *optional*):
            The vision feature layer, or the list of indexes of the layers to select
            the vision feature.
        """
        vision_feature_layers = (
            vision_feature_layers if vision_feature_layers is not None else self.config.vision_feature_layers
        )

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if inputs_embeds is None:
            inputs_embeds = self.get_input_embeddings()(input_ids)

        if pixel_values is not None:
            image_features = self.get_image_features(
                pixel_values=pixel_values, vision_feature_layers=vision_feature_layers
            ).pooler_output
            image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
            special_image_mask = self.get_placeholder_mask(
                input_ids, inputs_embeds=inputs_embeds, image_features=image_features
            )
            inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)

        outputs = self.language_model(
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            **lm_kwargs,
        )

        output = VipLlavaModelOutputWithPast(
            last_hidden_state=outputs.last_hidden_state,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            image_hidden_states=image_features if pixel_values is not None else None,
        )
        return output

@auto_docstring(
    custom_intro="""
    The VIPLLAVA model which consists of a vision backbone and a language model.
    """
)
class VipLlavaForConditionalGeneration(VipLlavaPreTrainedModel, GenerationMixin):
    _tied_weights_keys = {"lm_head.weight": "model.language_model.embed_tokens.weight"}

    def __init__(self, config: VipLlavaConfig):
        super().__init__(config)
        self.model = VipLlavaModel(config)
        self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)
        self.post_init()

    def get_input_embeddings(self):
        return self.model.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.model.set_input_embeddings(value)

    def get_output_embeddings(self) -> nn.Module:
        return self.lm_head

    @can_return_tuple
    @auto_docstring(
        custom_intro="Obtains image last hidden states from the vision tower and applies multimodal projection."
    )
    def get_image_features(
        self,
        pixel_values: torch.FloatTensor,
        vision_feature_layers: int | list[int] | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> BaseModelOutputWithPooling:
        r"""
        pixel_values (`torch.FloatTensor` of shape `(batch_size, channels, height, width)`):
            The tensors corresponding to the input images.
        vision_feature_layers (`Union[int, list[int]]`, *optional*):
            The vision feature layer, or the list of indexes of the layers to select
            the vision feature.
        """
        return self.model.get_image_features(
            pixel_values=pixel_values, vision_feature_layers=vision_feature_layers, **kwargs
        )

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        pixel_values: torch.FloatTensor | None = None,
        attention_mask: torch.Tensor | None = None,
        position_ids: torch.LongTensor | None = None,
        past_key_values: Cache | None = None,
        inputs_embeds: torch.FloatTensor | None = None,
        vision_feature_layers: int | list[int] | None = None,
        labels: torch.LongTensor | None = None,
        use_cache: bool | None = None,
        logits_to_keep: int | torch.Tensor = 0,
        **lm_kwargs: Unpack[TransformersKwargs],
    ) -> VipLlavaCausalLMOutputWithPast:
        r"""
        vision_feature_layers (`Union[int, list[int]]`, *optional*):
            The vision feature layer, or the list of indexes of the layers to select
            the vision feature.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> import torch
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, VipLlavaForConditionalGeneration

        >>> model = VipLlavaForConditionalGeneration.from_pretrained("llava-hf/vip-llava-7b-hf", device_map="auto", dtype=torch.float16)
        >>> processor = AutoProcessor.from_pretrained("llava-hf/vip-llava-7b-hf")

        >>> prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.###Human: <image>\n{}###Assistant:"
        >>> question = "Can you please describe this image?"
        >>> prompt = prompt.format(question)
        >>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/compel-neg.png"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> inputs = processor(text=prompt, images=image, return_tensors="pt").to(0, torch.float16)

        >>> # Generate
        >>> generate_ids = model.generate(**inputs, max_new_tokens=20)
        >>> processor.decode(generate_ids[0][len(inputs["input_ids"][0]):], skip_special_tokens=True)
        The image features a brown and white cat sitting on a green surface, with a red ball in its
        ```"""
        vision_feature_layers = (
            vision_feature_layers if vision_feature_layers is not None else self.config.vision_feature_layers
        )

        outputs = self.model(
            input_ids=input_ids,
            pixel_values=pixel_values,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            vision_feature_layers=vision_feature_layers,
            **lm_kwargs,
        )

        hidden_states = outputs.last_hidden_state
        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        logits = self.lm_head(hidden_states[:, slice_indices, :])

        loss = None
        if labels is not None:
            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size)

        return VipLlavaCausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            image_hidden_states=outputs.image_hidden_states,
        )

    def prepare_inputs_for_generation(
        self,
        input_ids,
        past_key_values=None,
        inputs_embeds=None,
        pixel_values=None,
        attention_mask=None,
        logits_to_keep=None,
        is_first_iteration=False,
        **kwargs,
    ):
        # Overwritten -- in specific circumstances we don't want to forward image inputs to the model

        model_inputs = super().prepare_inputs_for_generation(
            input_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            attention_mask=attention_mask,
            logits_to_keep=logits_to_keep,
            is_first_iteration=is_first_iteration,
            **kwargs,
        )

        if is_first_iteration or not kwargs.get("use_cache", True):
            model_inputs["pixel_values"] = pixel_values

        return model_inputs
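
# Generation note (editorial): `prepare_inputs_for_generation` above only forwards
# `pixel_values` on the prefill step (or when the KV cache is disabled). After the first step
# the image content already lives in the cache and the shrunken `input_ids` no longer contain
# `<image>` placeholder tokens, so re-running the vision tower would be wasted work.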

__all__ = ["VipLlavaModel", "VipLlavaForConditionalGeneration", "VipLlavaPreTrainedModel"]