
    Z jG                        S r SSKJr  SSKrSSKJr  SSKJr  SSKJr  SSK	J
r
  SS	KJrJrJr  SS
KJr  SSKJr  SSKJrJrJrJr  SSKJrJr  SSKJr  SSKJr  \R>                  " \ 5      r!\\" SS9 " S S\5      5       5       r"\" SS9\ " S S\5      5       5       r# " S S\RH                  5      r%\ " S S\5      5       r&\" SS9 " S S\&5      5       r'\" S S9 " S! S"\&\
5      5       r(/ S#Qr)g)$zPyTorch Llava model.    )	dataclassN)nn   )ACT2FN)Cache)GenerationMixin)BaseModelOutputWithPastBaseModelOutputWithPoolingModelOutput)PreTrainedModel)Unpack)TransformersKwargsauto_docstringloggingtorch_compilable_check)can_return_tuplemerge_with_config_defaults   )	AutoModel   )LlavaConfigzJ
    Base class for Llava outputs, with hidden states and attentions.
    custom_introc                   B    \ rS rSr% SrSr\R                  S-  \S'   Sr	g)LlavaModelOutputWithPast$   a  
past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
    It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

    Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
    `past_key_values` input) to speed up sequential decoding.
image_hidden_states (`torch.FloatTensor`, *optional*):
    A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
    image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
Nimage_hidden_states )
__name__
__module____qualname____firstlineno____doc__r   torchFloatTensor__annotations____static_attributes__r       y/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/llava/modeling_llava.pyr   r   $   s    	 59**T18r(   r   zQ
    Base class for Llava causal language model (or autoregressive) outputs.
    c                      \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Sr
\S-  \S'   Sr\\R                     S-  \S'   Sr\\R                     S-  \S'   Sr\R                  S-  \S	'   S
rg)LlavaCausalLMOutputWithPast9   a  
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
    Language modeling loss (for next-token prediction).
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
    Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
    It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

    Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
    `past_key_values` input) to speed up sequential decoding.
image_hidden_states (`torch.FloatTensor`, *optional*):
    A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
    image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
Nlosslogitspast_key_valueshidden_states
attentionsr   r   )r   r    r!   r"   r#   r-   r$   r%   r&   r.   r/   r   r0   tupler1   r   r'   r   r(   r)   r+   r+   9   s     &*D%

d
")'+FE$+$(OUT\(59M5**+d2926Je''(4/648**T18r(   r+   c                   6   ^  \ rS rSrS\4U 4S jjrS rSrU =r$ )LlavaMultiModalProjectorW   configc                   > [         TU ]  5         [        UR                  [        5      (       a  SO[        UR                  5      n[        R                  " UR                  R                  U-  UR                  R                  UR                  S9U l        [        UR                     U l        [        R                  " UR                  R                  UR                  R                  UR                  S9U l        g )Nr   bias)super__init__
isinstancevision_feature_layerintlenr   Linearvision_confighidden_sizetext_configmultimodal_projector_biaslinear_1r   projector_hidden_actactlinear_2)selfr6   num_feature_layers	__class__s      r)   r;   !LlavaMultiModalProjector.__init__X   s    ",V-H-H#"N"NQTWX^XsXsTt		  ,,/AA**11

 &556		**F,>,>,J,JQWQqQq
r(   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ N)rE   rG   rH   )rI   image_featuresr0   s      r)   forward LlavaMultiModalProjector.forwardf   s2    n5/m4r(   )rG   rE   rH   )	r   r    r!   r"   r   r;   rP   r'   __classcell__rK   s   @r)   r4   r4   W   s    
{ 
 r(   r4   c                   D    \ rS rSr% \\S'   SrSrSrSr	Sr
SrSrSrSrSrg)	LlavaPreTrainedModelm   r6   model)imagetextTr/   r   N)r   r    r!   r"   r   r&   base_model_prefixinput_modalitiessupports_gradient_checkpointing_skip_keys_device_placement_supports_flash_attn_supports_sdpa_can_compile_fullgraph_supports_flex_attn_supports_attention_backendr'   r   r(   r)   rU   rU   m   s=    (&*#"3N!"&r(   rU   zu
@auto_docstring(
    custom_intro="""
    The Llava model which consists of a vision backbone and a language model, without a language modeling head.
    """
)
class LlavaModel(LlavaPreTrainedModel):
    def __init__(self, config: LlavaConfig):
        super().__init__(config)
        self.vision_tower = AutoModel.from_config(config.vision_config)
        self.multi_modal_projector = LlavaMultiModalProjector(config)
        self.language_model = AutoModel.from_config(config.text_config)
        self.post_init()

    def get_input_embeddings(self):
        return self.language_model.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.language_model.set_input_embeddings(value)

    @merge_with_config_defaults
    @auto_docstring(
        custom_intro="Obtains image last hidden states from the vision tower and applies the multimodal projection."
    )
    def get_image_features(
        self,
        pixel_values: torch.FloatTensor,
        vision_feature_layer: int | list[int] | None = None,
        vision_feature_select_strategy: str | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ):
        kwargs = {k: v for k, v in kwargs.items() if v is not None}
        image_outputs = self.vision_tower(pixel_values, output_hidden_states=True, return_dict=True, **kwargs)

        # Select the hidden states of the requested vision feature layer; when several layers are
        # requested, gather them all and concatenate along the feature dimension.
        if isinstance(vision_feature_layer, int):
            selected_image_feature = image_outputs.hidden_states[vision_feature_layer]
            if vision_feature_select_strategy == "default":
                selected_image_feature = selected_image_feature[:, 1:]
        else:
            hs_pool = [image_outputs.hidden_states[layer_idx] for layer_idx in vision_feature_layer]
            # For the "default" strategy, crop the CLS token from each hidden state in the pool
            if vision_feature_select_strategy == "default":
                hs_pool = [hs[:, 1:] for hs in hs_pool]
            selected_image_feature = torch.cat(hs_pool, dim=-1)

        image_features = self.multi_modal_projector(selected_image_feature)

        # Vision backbones that support variable resolutions return a single flattened patch sequence;
        # `image_sizes` is used to split it back into per-image features.
        if kwargs.get("image_sizes") is not None:
            split_sizes = (
                (
                    torch.as_tensor(kwargs["image_sizes"], device=image_features.device)
                    // self.config.vision_config.patch_size
                )
                .prod(dim=-1)
                .tolist()
            )
            image_features = torch.split(image_features.squeeze(0), split_sizes)
        else:
            image_features = list(image_features)
        return image_features

    def get_placeholder_mask(
        self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor, image_features: torch.FloatTensor
    ):
        """
        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
        equal to the length of multimodal features. If the lengths are different, an error is raised.
        """
        if input_ids is None:
            special_image_mask = inputs_embeds == self.get_input_embeddings()(
                torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device)
            )
            special_image_mask = special_image_mask.all(-1)
        else:
            special_image_mask = input_ids == self.config.image_token_id

        n_image_tokens = special_image_mask.sum()
        special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
        n_image_features = image_features.shape[0] * image_features.shape[1]
        # Compile-friendly validation of the placeholder/feature count
        torch_compilable_check(
            inputs_embeds[special_image_mask].numel() == image_features.numel(),
            f"Image features and image tokens do not match, tokens: {n_image_tokens}, features: {n_image_features}",
        )
        return special_image_mask

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        pixel_values: torch.FloatTensor | None = None,
        attention_mask: torch.Tensor | None = None,
        position_ids: torch.LongTensor | None = None,
        past_key_values: Cache | None = None,
        inputs_embeds: torch.FloatTensor | None = None,
        vision_feature_layer: int | list[int] | None = None,
        vision_feature_select_strategy: str | None = None,
        image_sizes: torch.Tensor | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple | LlavaModelOutputWithPast:
        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if inputs_embeds is None:
            inputs_embeds = self.get_input_embeddings()(input_ids)

        if pixel_values is not None:
            image_features = self.get_image_features(
                pixel_values=pixel_values,
                vision_feature_layer=vision_feature_layer,
                vision_feature_select_strategy=vision_feature_select_strategy,
                image_sizes=image_sizes,
            )
            image_features = torch.cat(image_features, dim=0).to(inputs_embeds.device, inputs_embeds.dtype)
            # Replace the `<image>` placeholder embeddings with the projected image features
            special_image_mask = self.get_placeholder_mask(
                input_ids, inputs_embeds=inputs_embeds, image_features=image_features
            )
            inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)

        outputs = self.language_model(
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            **kwargs,
        )

        return LlavaModelOutputWithPast(
            last_hidden_state=outputs.last_hidden_state,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            image_hidden_states=image_features if pixel_values is not None else None,
        )
@auto_docstring(
    custom_intro="""
    The LLAVA model which consists of a vision backbone and a language model.
    """
)
class LlavaForConditionalGeneration(LlavaPreTrainedModel, GenerationMixin):
    _tied_weights_keys = {"lm_head.weight": "model.language_model.embed_tokens.weight"}

    def __init__(self, config: LlavaConfig):
        super().__init__(config)
        self.model = LlavaModel(config)
        self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)
        self.post_init()

    def get_input_embeddings(self):
        return self.model.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.model.set_input_embeddings(value)

    def get_output_embeddings(self) -> nn.Module:
        return self.lm_head

    def get_image_features(
        self,
        pixel_values: torch.FloatTensor,
        vision_feature_layer: int | list[int] | None = None,
        vision_feature_select_strategy: str | None = None,
        **kwargs,
    ):
        return self.model.get_image_features(
            pixel_values=pixel_values,
            vision_feature_layer=vision_feature_layer,
            vision_feature_select_strategy=vision_feature_select_strategy,
            **kwargs,
        )

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        pixel_values: torch.FloatTensor | None = None,
        attention_mask: torch.Tensor | None = None,
        position_ids: torch.LongTensor | None = None,
        past_key_values: Cache | None = None,
        inputs_embeds: torch.FloatTensor | None = None,
        vision_feature_layer: int | list[int] | None = None,
        vision_feature_select_strategy: str | None = None,
        labels: torch.LongTensor | None = None,
        logits_to_keep: int | torch.Tensor = 0,
        image_sizes: torch.Tensor | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple | LlavaCausalLMOutputWithPast:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, LlavaForConditionalGeneration

        >>> model = LlavaForConditionalGeneration.from_pretrained("llava-hf/llava-1.5-7b-hf")
        >>> processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf")

        >>> prompt = "USER: <image>\nWhat's the content of the image? ASSISTANT:"
        >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> inputs = processor(images=image, text=prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(**inputs, max_new_tokens=15)
        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "USER:  \nWhat's the content of the image? ASSISTANT: The image features a busy city street with a stop sign prominently displayed"
        ```"""
        outputs = self.model(
            input_ids=input_ids,
            pixel_values=pixel_values,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            vision_feature_layer=vision_feature_layer,
            vision_feature_select_strategy=vision_feature_select_strategy,
            image_sizes=image_sizes,
            **kwargs,
        )

        hidden_states = outputs[0]
        # Only compute logits for the requested positions to avoid upcasting the full sequence
        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        logits = self.lm_head(hidden_states[:, slice_indices, :])

        loss = None
        if labels is not None:
            loss = self.loss_function(
                logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **kwargs
            )

        return LlavaCausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            image_hidden_states=outputs.image_hidden_states,
        )

    def prepare_inputs_for_generation(
        self,
        input_ids,
        past_key_values=None,
        inputs_embeds=None,
        pixel_values=None,
        attention_mask=None,
        logits_to_keep=None,
        is_first_iteration=False,
        **kwargs,
    ):
        # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
        model_inputs = super().prepare_inputs_for_generation(
            input_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            attention_mask=attention_mask,
            logits_to_keep=logits_to_keep,
            is_first_iteration=is_first_iteration,
            **kwargs,
        )

        # Pass pixel values only on the first generation step (prefill) or when the cache is disabled;
        # afterwards the image embeddings already live in the KV cache.
        if is_first_iteration or not kwargs.get("use_cache", True):
            model_inputs["pixel_values"] = pixel_values

        return model_inputs


__all__ = ["LlavaForConditionalGeneration", "LlavaPreTrainedModel", "LlavaModel"]