
    Z jh.                     X   S SK r S SK Jr  S SKJrJrJrJrJr  SSKJ	r	  SSK
Jr  SSKJrJr  SSKJr  SS	KJrJrJr  SS
KJr  SSKJr  \R2                  " \5      r " S S\5      r " S S\5      r " S S\R<                  5      r " S S\5      r  " S S\5      r! " S S\5      r"/ SQr#g)    N)nn)LlavaCausalLMOutputWithPastLlavaForConditionalGeneration
LlavaModelLlavaModelOutputWithPastLlavaPreTrainedModel   )ACT2FN)Cache)BaseModelOutputWithPastBaseModelOutputWithPooling)Unpack)TransformersKwargsauto_docstringlogging)can_return_tuple   )VipLlavaConfigc                       \ rS rSrSrg)VipLlavaModelOutputWithPast&    N__name__
__module____qualname____firstlineno____static_attributes__r       ~/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/vipllava/modular_vipllava.pyr   r   &       r   r   c                       \ rS rSrSrg)VipLlavaCausalLMOutputWithPast*   r   Nr   r   r   r    r#   r#   *   r!   r   r#   c                   6   ^  \ rS rSrS\4U 4S jjrS rSrU =r$ )VipLlavaMultiModalProjector.   configc                 B  > [         TU ]  5         [        UR                  [        5      (       a  SO[        UR                  5      n[        R                  " X!R                  R                  -  UR                  S9U l        [        R                  " X!R                  R                  -  UR                  R                  SS9U l        [        UR                      U l        [        R                  " UR                  R                  UR                  R                  SS9U l        g )Nr   )epsT)bias)super__init__
isinstancevision_feature_layersintlenr   	LayerNormvision_confighidden_sizeprojector_layernorm_epsprojector_layernormLineartext_configlinear_1r
   projector_hidden_actactlinear_2)selfr(   num_feature_layers	__class__s      r    r-   $VipLlavaMultiModalProjector.__init__/   s    ",V-I-I3"O"OQUXY_YuYuUv#%<<!5!5!A!AAvGeGe$
  		!5!5!A!AA**

 &556		&"4"4"@"@&BTBTB`B`gklr   c                     U R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nU$ N)r6   r9   r;   r<   )r=   hidden_statess     r    forward#VipLlavaMultiModalProjector.forward>   sB    00?m4/m4r   )r;   r9   r<   r6   )	r   r   r   r   r   r-   rD   r   __classcell__)r?   s   @r    r&   r&   .   s    m~ m r   r&   c                       \ rS rSrSrg)VipLlavaPreTrainedModelF   r   Nr   r   r   r    rH   rH   F   r!   r   rH   c                      \ rS rSr\\" SS9 SS\R                  S\\	\   -  S-  S\
\   S\\-  4S	 jj5       5       r\\        SS
\R                  S-  S\R                  S-  S\R                   S-  S\R                  S-  S\S-  S\R                  S-  S\\	\   -  S-  S\S-  S\
\   S\\-  4S jj5       5       rSrg)VipLlavaModelJ   zWObtains image last hidden states from the vision tower and apply multimodal projection.)custom_introNpixel_valuesr/   kwargsreturnc                 x   Ub  UOU R                   R                  nSUS'   U R                  " U40 UD6n[        U[        5      (       a  UR
                  U   SS2SS24   nO<U Vs/ s H  odR
                  U   SS2SS24   PM     nn[        R                  " USS9nU R                  U5      nXTl	        U$ s  snf ),  
pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`):
    The tensors corresponding to the input images.
vision_feature_layers (`Union[int, list[int]]`, *optional*):
    The vision feature layer, or the list of indexes of the layers to select
    the vision feature.
NToutput_hidden_statesr   )dim)
r(   r/   vision_towerr.   r0   rC   torchcatmulti_modal_projectorpooler_output)r=   rN   r/   rO   image_outputsimage_featuresindexs          r    get_image_features VipLlavaModel.get_image_featuresK   s    $ &;%F!DKKLmLm 	 *.%&))

 +S11*889NOPQSTSUPUVN VkkUjE99%@ABGUjNk"YY~2>N33NC&4# ls   &!B7	input_idsattention_maskposition_idspast_key_valuesinputs_embeds	use_cache	lm_kwargsc	           	      
   Ub  UOU R                   R                  nUSL USL-  (       a  [        S5      eUc  U R                  5       " U5      nUb`  U R	                  X'S9R
                  n
U
R                  UR                  UR                  5      n
U R                  XU
S9nUR                  X5      nU R                  " SUUUUUS.U	D6n[        UR                  UR                  UR                  UR                   Ub  W
OSS9nU$ )z
vision_feature_layers (`Union[int, list[int]]`, *optional*):
    The vision feature layer, or the list of indexes of the layers to select
    the vision feature.
Nz:You must specify exactly one of input_ids or inputs_embedsrN   r/   )rd   r\   )ra   rb   rc   rd   re   )last_hidden_staterc   rC   
attentionsimage_hidden_statesr   )r(   r/   
ValueErrorget_input_embeddingsr^   rZ   todevicedtypeget_placeholder_maskmasked_scatterlanguage_modelr   ri   rc   rC   rj   )r=   r`   rN   ra   rb   rc   rd   r/   re   rf   r\   special_image_maskoutputsoutputs                 r    rD   VipLlavaModel.forwards   s;   ( &;%F!DKKLmLm 	 -t";<YZZ  557	BM#!44) 5 m  ,..}/C/C]EXEXYN!%!:!:~ "; " *889K\M+/+>+> ,
)%+',
 ,
 -%77#33!//))2>2JPT
 r   r   rB   )NNNNNNNN)r   r   r   r   r   r   rW   FloatTensorr0   listr   r   tupler   r^   
LongTensorTensorr   boolr   rD   r   r   r   r    rK   rK   J   s]   n 9="''"  #T#Y5" +,	"
 
+	+" "H  .215.204(,268<!%5##d*5 ''$.5 t+	5
 &&-5 5 ((4/5  #T#Y55 $;5 ./5 
,	,5  5r   rK   c                      \ rS rSr\ SS\R                  S\\\   -  S-  S\	\
   S\\-  4S jj5       r\\          SS\R                  S-  S\R                  S-  S	\R                   S-  S
\R                  S-  S\S-  S\R                  S-  S\\\   -  S-  S\R                  S-  S\S-  S\\R                   -  S\	\
   S\\-  4S jj5       5       rSrg) VipLlavaForConditionalGeneration   NrN   r/   rO   rP   c                 >    U R                   R                  " SXS.UD6$ )rR   rh   r   )modelr^   )r=   rN   r/   rO   s       r    r^   3VipLlavaForConditionalGeneration.get_image_features   s+     zz,, 
%
V\
 	
r   r`   ra   rb   rc   rd   labelsre   logits_to_keeprf   c                    Ub  UOU R                   R                  nU R                  " SUUUUUUU	US.UD6nUR                  n[	        U
[
        5      (       a  [        U
* S5      OU
nU R                  USS2USS24   5      nSnUb.  U R                  XU R                   R                  R                  S9n[        UUUR                  UR                  UR                  UR                  S9$ )a  
vision_feature_layers (`Union[int, list[int]]`, *optional*):
    The vision feature layer, or the list of indexes of the layers to select
    the vision feature.
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

Example:

```python
>>> import torch
>>> from PIL import Image
>>> import httpx
>>> from io import BytesIO
>>> from transformers import AutoProcessor, VipLlavaForConditionalGeneration

>>> model = VipLlavaForConditionalGeneration.from_pretrained("llava-hf/vip-llava-7b-hf", device_map="auto", dtype=torch.float16)
>>> processor = AutoProcessor.from_pretrained("llava-hf/vip-llava-7b-hf")

>>> prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.###Human: <image>\n{}###Assistant:"
>>> question = "Can you please describe this image?"
>>> prompt = prompt.format(question)
>>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/compel-neg.png"
>>> with httpx.stream("GET", url) as response:
...     image = Image.open(BytesIO(response.read()))

>>> inputs = processor(text=text, images=image, return_tensors="pt").to(0, torch.float16)

>>> # Generate
>>> generate_ids = model.generate(**inputs, max_new_tokens=20)
>>> processor.decode(generate_ids[0][len(inputs["input_ids"][0]):], skip_special_tokens=True)
The image features a brown and white cat sitting on a green surface, with a red ball in its
```N)r`   rN   ra   rb   rc   rd   re   r/   )logitsr   
vocab_size)lossr   rc   rC   rj   rk   r   )r(   r/   r   ri   r.   r0   slicelm_headloss_functionr8   r   r#   rc   rC   rj   rk   )r=   r`   rN   ra   rb   rc   rd   r/   r   re   r   rf   ru   rC   slice_indicesr   r   s                    r    rD   (VipLlavaForConditionalGeneration.forward   s   j &;%F!DKKLmLm 	 04zz 
0
%)%+'"7
0
 
0
  118B>SV8W8W~ot4]kmA}a,?@A%%Vt{{OfOfOqOq%rD-#33!//)) ' ; ;
 	
r   r   rB   )
NNNNNNNNNr   )r   r   r   r   r   rW   rx   r0   ry   r   r   rz   r   r^   r   r{   r|   r   r}   r#   rD   r   r   r   r    r   r      s    9=
''
  #T#Y5
 +,	

 
+	+
 
"  .215.204(,268<*.!%-.R
##d*R
 ''$.R
 t+	R

 &&-R
 R
 ((4/R
  #T#Y5R
   4'R
 $;R
 ell*R
 ./R
 
/	/R
  R
r   r   )rK   r   rH   )$rW   r   (transformers.models.llava.modeling_llavar   r   r   r   r   activationsr
   cache_utilsr   modeling_outputsr   r   processing_utilsr   utilsr   r   r   utils.genericr   configuration_vipllavar   
get_loggerr   loggerr   r#   Moduler&   rH   rK   r   __all__r   r   r    <module>r      s       "   S & @ @ - 2 
		H	%	": 		%@ 	")) 0	2 	`J `Fg
'D g
T [r   