
    Z j<                     .   S r SSKrSSKJr  SSKJr  SSKJr  SSKJrJ	r	  SSK
Jr  SS	KJr  SS
KJr  SSKJrJrJrJrJr  SSKJr  \R0                  " \5      r\ " S S\5      5       r\" SS9 " S S\5      5       r\" SS9 " S S\\5      5       r/ SQrg)zPyTorch Fuyu model.    N)nn   )Cache)GenerationMixin)BaseModelOutputWithPoolingCausalLMOutputWithPast)PreTrainedModel)	AutoModel)Unpack)TransformersKwargsauto_docstringcan_return_tupleloggingtorch_compilable_check   )
FuyuConfigc                   D    \ rS rSr% \\S'   SrSrSrSr	Sr
SrSr/ rSrSrg)	FuyuPreTrainedModel    configmodel)imagetextTpast_key_values N)__name__
__module____qualname____firstlineno__r   __annotations__base_model_prefixinput_modalitiessupports_gradient_checkpointing_supports_attention_backend_supports_flash_attn_supports_sdpa_supports_flex_attn_no_split_modules_skip_keys_device_placement__static_attributes__r       w/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/fuyu/modeling_fuyu.pyr   r       s=    (&*#"&N"3r+   r   zt
    The Fuyu model which consists of a vision backbone and a language model, without a language modeling head.
    )custom_introc                   V  ^  \ rS rSrS\4U 4S jjrS rS rS\R                  S\
\R                     S\R                  S	\R                  4S
 jr\\S\R                  S\\   S	\\-  4S j5       5       rS\R(                  S\R                  S\R                  4S jr\\        SS\R(                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\R(                  S-  S\S-  S\R                  S-  S\S-  S\\   S	\\-  4S jj5       5       rSrU =r$ )	FuyuModel.   r   c                   > [         TU ]  U5        UR                  U l        UR                  R
                  U l        [        R                  " UR                  5      U l        [        R                  " UR                  UR                  -  UR                  -  UR                  5      U l        SU l        U R!                  5         g )NF)super__init__pad_token_idpadding_idxtext_config
vocab_sizer
   from_configlanguage_modelr   Linear
patch_sizenum_channelshidden_sizevision_embed_tokensgradient_checkpointing	post_initselfr   	__class__s     r,   r3   FuyuModel.__init__4   s     !.. ,,77'33F4F4FG#%99 1 11F4G4GGI[I[$
  ',#r+   c                 6    U R                   R                  5       $ N)r9   get_input_embeddingsrB   s    r,   rG   FuyuModel.get_input_embeddingsA   s    ""7799r+   c                 :    U R                   R                  U5        g rF   )r9   set_input_embeddingsrB   values     r,   rK   FuyuModel.set_input_embeddingsD   s    007r+   word_embeddingscontinuous_embeddingsimage_patch_input_indicesreturnc           
         UR                   S   [        U5      :X  d)  [        S[        U5      < SUR                   S   < 35      eUR                  5       n[	        UR                   S   5       H  n[
        R                  " X5   S:  SS9S   nX5   U   nUR                   S   X%   R                   S   :  a-  [        SX%   R                   < SUR                   < SU S	35      eX%   U   R                  UR                  5      XEU4'   M     U$ )
ay  This function places the continuous_embeddings into the word_embeddings at the locations
indicated by image_patch_input_indices. Different batch elements can have different numbers of continuous
embeddings.

Args:
    word_embeddings (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        Tensor of word embeddings.
    continuous_embeddings (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`):
        Tensor of continuous embeddings. The length of the list is the batch size. Each entry is shape
        [num_image_embeddings, hidden], and num_image_embeddings needs to match the number of non-negative
        indices in image_patch_input_indices for that batch element.
    image_patch_input_indices (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
        Tensor of indices of the image patches in the input_ids tensor.
r   z7Batch sizes must match! Got len(continuous_embeddings)=z and word_embeddings.shape[0]=T)as_tuplezGNumber of continuous embeddings continuous_embeddings[batch_idx].shape=zA does not match number of continuous token ids src_indices.shape=z in batch element .)	shapelen
ValueErrorclonerangetorchnonzerotodevice)rB   rO   rP   rQ   output_embeddings	batch_idxdst_indicessrc_indicess           r,   gather_continuous_embeddings&FuyuModel.gather_continuous_embeddingsG   sI   (  %%a(C0E,FFJs3H/I.KKjQ`QfQfghQiPkl  ,11344Q78I  --(A(LPQ(Q\`abcdK 4>{KK  #&;&F&L&LQ&OO ^7L7W7]7]6_ `I6A6G6G5II[\e[ffgi  9N8XYd8e8h8h!((945 9  ! r+   pixel_valueskwargsc                 6    U R                  U5      n[        US9$ )z
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
    The tensors corresponding to the input images.
)last_hidden_state)r>   r   )rB   re   rf   patch_embeddingss       r,   get_image_featuresFuyuModel.get_image_featuress   s!      33LA)<LMMr+   	input_idsinputs_embedsimage_featuresc           	      F   Ucj  X R                  5       " [        R                  " U R                  R                  [        R
                  UR                  S95      :H  nUR                  S5      nOXR                  R                  :H  nUR                  5       nUR                  S   UR                  S   -  nUR                  S5      R                  U5      R                  UR                  5      n[        X$   R                  5       UR                  5       :H  SU SU 35        U$ )z
Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
equal to the length of multimodal features. If the lengths are different, an error is raised.
dtyper^   r   r   z6Image features and image tokens do not match, tokens: z, features: )rG   r[   tensorr   image_token_idlongr^   allsumrV   	unsqueeze	expand_asr]   r   numel)rB   rl   rm   rn   special_image_maskn_image_tokensn_image_featuress          r,   get_placeholder_maskFuyuModel.get_placeholder_mask   s    !.2K2K2MT[[77uzzR_RfRfg3 " "4!7!7!;!*kk.H.H!H+//1)//2^5I5I!5LL/99"=GGVYYZgZnZno-3359M9M9OOD^DTT`aq`rs	
 "!r+   Nimage_patchesimage_patches_indicesattention_maskposition_idsr   	use_cachec	           	      j   USL USL-  (       a  [        S5      eUc   U R                  R                  5       " U5      nUR                  S   n
Uch  Ub  UR                  OUR                  nUb  UR                  5       OSn[        R                  " XU-   [        R                  US9nUR                  S5      nUba  U R                  USS9R                  nUR                  UR                  UR                  5      nU R                  XUS9nUR                  X5      nU R                  " S
UUUUUS	.U	D6nU$ )a  
image_patches (`torch.FloatTensor` of shape `(batch_size, num_total_patches, patch_size_ x patch_size x num_channels)`, *optional*):
    Image patches to be used as continuous embeddings. The patches are flattened and then projected to the
    hidden size of the model.
image_patches_indices (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Tensor of indices of the image patches in the input_ids tensor.
Nz:You must specify exactly one of input_ids or inputs_embedsr   r   rp   T)return_dict)rm   rn   )rm   r   r   r   r   r   )rX   r9   rG   rV   r^   get_seq_lengthr[   arangeru   rx   rj   rh   r]   rq   r~   masked_scatter)rB   rl   r   r   r   r   r   rm   r   rf   seq_lenr^   past_key_values_lengthri   r{   outputss                   r,   forwardFuyuModel.forward   sa   , -t";<YZZ  //DDFyQM%%a()2)>Y%%MDXDXFIXId_%C%C%Ejk" <<&2H(HPUPZPZciL (11!4L$#66}RV6Wii/22=3G3GI\I\]!%!:!:GW "; " *889K^M%% 
')%+
 
 r+   )r?   r9   r5   r>   r7   )NNNNNNNN)r   r   r   r   r   r3   rG   rK   r[   Tensorlistrc   r   r   FloatTensorr   r   tupler   rj   
LongTensorr~   r   boolr   r   r*   __classcell__rC   s   @r,   r/   r/   .   s   z :8*!*!  $ELL1*! $)<<	*!
 
*!X N!--N9?@R9SN	+	+N  N"))":?:K:K"]b]n]n"0  .2-159.204(,26!%5##d*5 ||d*	5
  %||d25 t+5 &&-5 5 ((4/5 $;5 +,5 
'	'5  5r+   r/   zz
    Fuyu Model with a language modeling head on top for causal language model conditioned on image patches and text.
    c                     ^  \ rS rSrSS0rS\4U 4S jjrS rS r\	\
          SS	\R                  S-  S
\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\S-  S\R                  S-  S\S-  S\R                  S-  S\S-  S\\   S\\-  4S jj5       5       r      SU 4S jjrSrU =r$ )FuyuForCausalLM   zlm_head.weightz(model.language_model.embed_tokens.weightr   c                    > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  R                  UR                  R                  SS9U l	        U R                  5         g )NF)bias)r2   r3   r/   r   r   r:   r6   r=   r7   lm_headr@   rA   s     r,   r3   FuyuForCausalLM.__init__   sS     v&
yy!3!3!?!?ASASA^A^ejkr+   c                 6    U R                   R                  5       $ rF   )r   rG   rH   s    r,   rG   $FuyuForCausalLM.get_input_embeddings   s    zz..00r+   c                 :    U R                   R                  U5        g rF   )r   rK   rL   s     r,   rK   $FuyuForCausalLM.set_input_embeddings   s    

''.r+   Nrl   r   r   r   r   r   rm   r   labelslogits_to_keeprf   rR   c                    U R                   " SUUUUUUUUS.UD6nUS   n[        U
[        5      (       a  [        U
* S5      OU
nU R	                  USS2USS24   5      nSnU	b3  U R
                  " SXU R                  R                  R                  S.UD6n[        UUUR                  UR                  UR                  S9$ )a  
image_patches (`torch.FloatTensor` of shape `(batch_size, num_total_patches, patch_size_ x patch_size x num_channels)`, *optional*):
    Image patches to be used as continuous embeddings. The patches are flattened and then projected to the
    hidden size of the model.
image_patches_indices (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Tensor of indices of the image patches in the input_ids tensor.
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.

Examples:

```python
>>> from transformers import FuyuProcessor, FuyuForCausalLM
>>> from PIL import Image
>>> import httpx
>>> from io import BytesIO

>>> processor = FuyuProcessor.from_pretrained("adept/fuyu-8b")
>>> model = FuyuForCausalLM.from_pretrained("adept/fuyu-8b")

>>> url = "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/bus.png"
>>> with httpx.stream("GET", url) as response:
...     image = Image.open(BytesIO(response.read()))
>>> prompt = "Generate a coco-style caption.\n"

>>> inputs = processor(images=image, text=prompt, return_tensors="pt")
>>> outputs = model(**inputs)

>>> generated_ids = model.generate(**inputs, max_new_tokens=7)
>>> generation_text = processor.batch_decode(generated_ids[:, -7:], skip_special_tokens=True)
>>> print(generation_text[0])
A blue bus parked on the side of a road.
```)rl   r   r   rm   r   r   r   r   r   N)logitsr   r7   )lossr   r   hidden_states
attentionsr   )r   
isinstanceintslicer   loss_functionr   r6   r7   r   r   r   r   )rB   rl   r   r   r   r   r   rm   r   r   r   rf   r   r   slice_indicesr   r   s                    r,   r   FuyuForCausalLM.forward   s    j ** 

'"7')%+

 

  
8B>SV8W8W~ot4]kmA}a,?@A%% 9P9P9[9[_eD &#33!//))
 	
r+   c           
         > [         T
U ]  " U4UUUUUUS.UD6n	U(       d!  UR                  SS5      (       a
  S U	S'   S U	S'   U	$ )N)r   r   rm   r   r   is_first_iterationr   Tr   r   )r2   prepare_inputs_for_generationget)rB   rl   r   r   rm   r   r   r   rf   model_inputsrC   s             r,   r   -FuyuForCausalLM.prepare_inputs_for_generation9  sh     w<	
+)''"71	
 	
 "fjjd&C&C48L01,0L)r+   )r   r   )
NNNNNNNNNr   )NNNNNF)r   r   r   r   _tied_weights_keysr   r3   rG   rK   r   r   r[   r   r   r   r   r   r   r   r   r   r   r   r   r*   r   r   s   @r,   r   r      sc    +,VWz 1/  .2-159.204(,26!%&*%&P
##d*P
 ||d*	P

  %||d2P
 t+P
 &&-P
 P
 ((4/P
 $;P
 t#P
 d
P
 +,P
 
'	'P
  P
j "  r+   r   )r   r   r/   )__doc__r[   r   cache_utilsr   
generationr   modeling_outputsr   r   modeling_utilsr	   models.auto.modeling_autor
   processing_utilsr   utilsr   r   r   r   r   configuration_fuyur   
get_loggerr   loggerr   r/   r   __all__r   r+   r,   <module>r      s         ) R - 2 & j j * 
		H	% 
4/ 
4 
4 
[# [
[| 
@)? @
@F Br+   