ó
    Z– jÆ<  ã                   ó.  • S r SSKrSSKJr  SSKJr  SSKJr  SSKJrJ	r	  SSK
Jr  SS	KJr  SS
KJr  SSKJrJrJrJrJr  SSKJr  \R0                  " \5      r\ " S S\5      5       r\" SS9 " S S\5      5       r\" SS9 " S S\\5      5       r/ SQrg)zPyTorch Fuyu model.é    N)Únné   )ÚCache)ÚGenerationMixin)ÚBaseModelOutputWithPoolingÚCausalLMOutputWithPast)ÚPreTrainedModel)Ú	AutoModel)ÚUnpack)ÚTransformersKwargsÚauto_docstringÚcan_return_tupleÚloggingÚtorch_compilable_checké   )Ú
FuyuConfigc                   óD   • \ rS rSr% \\S'   SrSrSrSr	Sr
SrSr/ rSrSrg)	ÚFuyuPreTrainedModelé    ÚconfigÚmodel)ÚimageÚtextTÚpast_key_values© N)Ú__name__Ú
__module__Ú__qualname__Ú__firstlineno__r   Ú__annotations__Úbase_model_prefixÚinput_modalitiesÚsupports_gradient_checkpointingÚ_supports_attention_backendÚ_supports_flash_attnÚ_supports_sdpaÚ_supports_flex_attnÚ_no_split_modulesÚ_skip_keys_device_placementÚ__static_attributes__r   ó    Úw/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/fuyu/modeling_fuyu.pyr   r       s=   ‡ àÓØÐØ(ÐØ&*Ð#Ø"&ÐØÐØ€NØÐØÐØ"3Ór+   r   zt
    The Fuyu model which consists of a vision backbone and a language model, without a language modeling head.
    )Úcustom_introc                   óV  ^ • \ rS rSrS\4U 4S jjrS rS rS\R                  S\
\R                     S\R                  S	\R                  4S
 jr\\S\R                  S\\   S	\\-  4S j5       5       rS\R(                  S\R                  S\R                  4S jr\\        SS\R(                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\R(                  S-  S\S-  S\R                  S-  S\S-  S\\   S	\\-  4S jj5       5       rSrU =r$ )Ú	FuyuModelé.   r   c                 óˆ  >• [         TU ]  U5        UR                  U l        UR                  R
                  U l        [        R                  " UR                  5      U l        [        R                  " UR                  UR                  -  UR                  -  UR                  5      U l        SU l        U R!                  5         g )NF)ÚsuperÚ__init__Úpad_token_idÚpadding_idxÚtext_configÚ
vocab_sizer
   Úfrom_configÚlanguage_modelr   ÚLinearÚ
patch_sizeÚnum_channelsÚhidden_sizeÚvision_embed_tokensÚgradient_checkpointingÚ	post_init©Úselfr   Ú	__class__s     €r,   r3   ÚFuyuModel.__init__4   s–   ø€ Ü‰Ñ˜Ô Ø!×.Ñ.ˆÔØ ×,Ñ,×7Ñ7ˆŒÜ'×3Ò3°F×4FÑ4FÓGˆÔÜ#%§9¢9Ø×Ñ × 1Ñ 1Ñ1°F×4GÑ4GÑGÈ×I[ÑI[ó$
ˆÔ ð ',ˆÔ#à‰Õr+   c                 ó6   • U R                   R                  5       $ ©N)r9   Úget_input_embeddings©rB   s    r,   rG   ÚFuyuModel.get_input_embeddingsA   s   € Ø×"Ñ"×7Ñ7Ó9Ð9r+   c                 ó:   • U R                   R                  U5        g rF   )r9   Úset_input_embeddings©rB   Úvalues     r,   rK   ÚFuyuModel.set_input_embeddingsD   s   € Ø×Ñ×0Ñ0°Õ7r+   Úword_embeddingsÚcontinuous_embeddingsÚimage_patch_input_indicesÚreturnc           
      ó  • UR                   S   [        U5      :X  d)  [        S[        U5      < SUR                   S   < 35      eUR                  5       n[	        UR                   S   5       Hš  n[
        R                  " X5   S:¬  SS9S   nX5   U   nUR                   S   X%   R                   S   :”  a-  [        SX%   R                   < SUR                   < SU S	35      eX%   U   R                  UR                  5      XEU4'   Mœ     U$ )
ay  This function places the continuous_embeddings into the word_embeddings at the locations
indicated by image_patch_input_indices. Different batch elements can have different numbers of continuous
embeddings.

Args:
    word_embeddings (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        Tensor of word embeddings.
    continuous_embeddings (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`):
        Tensor of continuous embeddings. The length of the list is the batch size. Each entry is shape
        [num_image_embeddings, hidden], and num_image_embeddings needs to match the number of non-negative
        indices in image_patch_input_indices for that batch element.
    image_patch_input_indices (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
        Tensor of indices of the image patches in the input_ids tensor.
r   z7Batch sizes must match! Got len(continuous_embeddings)=z and word_embeddings.shape[0]=T)Úas_tuplezGNumber of continuous embeddings continuous_embeddings[batch_idx].shape=zA does not match number of continuous token ids src_indices.shape=z in batch element Ú.)	ÚshapeÚlenÚ
ValueErrorÚcloneÚrangeÚtorchÚnonzeroÚtoÚdevice)rB   rO   rP   rQ   Úoutput_embeddingsÚ	batch_idxÚdst_indicesÚsrc_indicess           r,   Úgather_continuous_embeddingsÚ&FuyuModel.gather_continuous_embeddingsG   sI  € ð(  ×%Ñ% aÑ(¬CÐ0EÓ,FÓFÜØJ¬sÐ3HÓ/IÑ.KÐKjÐQ`×QfÑQfÐghÑQiÑPkÐlóð ð ,×1Ñ1Ó3ÐÜ˜×4Ñ4°QÑ7Ö8ˆIô  Ÿ-š-Ð(AÑ(LÐPQÑ(QÐ\`ÑaÐbcÑdˆKð 4Ñ>¸{ÑKˆKà× Ñ  Ñ#Ð&;Ñ&F×&LÑ&LÈQÑ&OÓOÜ Ø^Ð7LÑ7W×7]Ñ7]Ñ6_ð `IØ6A×6GÑ6GÑ5IÐI[Ð\eÐ[fÐfgðióð ð 9NÑ8XÐYdÑ8e×8hÑ8hØ!×(Ñ(ó9Ð¨Ð4Ó5ñ 9ð  !Ð r+   Úpixel_valuesÚkwargsc                 ó6   • U R                  U5      n[        US9$ )z–
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
    The tensors corresponding to the input images.
)Úlast_hidden_state)r>   r   )rB   re   rf   Úpatch_embeddingss       r,   Úget_image_featuresÚFuyuModel.get_image_featuress   s!   € ð  ×3Ñ3°LÓAÐÜ)Ð<LÑMÐMr+   Ú	input_idsÚinputs_embedsÚimage_featuresc           	      óF  • Ucj  X R                  5       " [        R                  " U R                  R                  [        R
                  UR                  S95      :H  nUR                  S5      nOXR                  R                  :H  nUR                  5       nUR                  S   UR                  S   -  nUR                  S5      R                  U5      R                  UR                  5      n[        X$   R                  5       UR                  5       :H  SU SU 35        U$ )z×
Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
equal to the length of multimodal features. If the lengths are different, an error is raised.
©Údtyper^   éÿÿÿÿr   r   z6Image features and image tokens do not match, tokens: z, features: )rG   r[   Útensorr   Úimage_token_idÚlongr^   ÚallÚsumrV   Ú	unsqueezeÚ	expand_asr]   r   Únumel)rB   rl   rm   rn   Úspecial_image_maskÚn_image_tokensÚn_image_featuress          r,   Úget_placeholder_maskÚFuyuModel.get_placeholder_mask   s  € ð ÑØ!.×2KÑ2KÔ2MÜ—’˜TŸ[™[×7Ñ7¼u¿z¹zÐR_×RfÑRfÑgó3ñ "Ðð "4×!7Ñ!7¸Ó!;Ñà!*¯k©k×.HÑ.HÑ!HÐà+×/Ñ/Ó1ˆØ)×/Ñ/°Ñ2°^×5IÑ5IÈ!Ñ5LÑLÐØ/×9Ñ9¸"Ó=×GÑGÈÓV×YÑYÐZg×ZnÑZnÓoÐÜØÑ-×3Ñ3Ó5¸×9MÑ9MÓ9OÑOØDÀ^ÐDTÐT`ÐaqÐ`rÐsô	
ð "Ð!r+   NÚimage_patchesÚimage_patches_indicesÚattention_maskÚposition_idsr   Ú	use_cachec	           	      ój  • USL USL-  (       a  [        S5      eUc   U R                  R                  5       " U5      nUR                  S   n
Uch  Ub  UR                  OUR                  nUb  UR                  5       OSn[        R                  " XÊU-   [        R                  US9nUR                  S5      nUba  U R                  USS9R                  nUR                  UR                  UR                  5      nU R                  XUS9nUR                  Xí5      nU R                  " S
UUUUUS	.U	D6nU$ )a´  
image_patches (`torch.FloatTensor` of shape `(batch_size, num_total_patches, patch_size_ x patch_size x num_channels)`, *optional*):
    Image patches to be used as continuous embeddings. The patches are flattened and then projected to the
    hidden size of the model.
image_patches_indices (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Tensor of indices of the image patches in the input_ids tensor.
Nz:You must specify exactly one of input_ids or inputs_embedsr   r   rp   T)Úreturn_dict)rm   rn   )rm   r‚   rƒ   r   r„   r   )rX   r9   rG   rV   r^   Úget_seq_lengthr[   Úarangeru   rx   rj   rh   r]   rq   r~   Úmasked_scatter)rB   rl   r€   r   r‚   rƒ   r   rm   r„   rf   Úseq_lenr^   Úpast_key_values_lengthri   r{   Úoutputss                   r,   ÚforwardÚFuyuModel.forward—   sa  € ð, ˜Ð -°tÐ";×<ÜÐYÓZÐZàÑ Ø ×/Ñ/×DÑDÔFÀyÓQˆMà×%Ñ% aÑ(ˆàÑØ)2Ñ)>Y×%Ò%ÀM×DXÑDXˆFØIXÑId _×%CÑ%CÔ%EÐjkÐ"Ü Ÿ<š<Ø&Ð2HÑ(HÔPU×PZÑPZÐciñˆLð (×1Ñ1°!Ó4ˆLàÑ$Ø#×6Ñ6°}ÐRVÐ6ÐW×iÑiÐØ/×2Ñ2°=×3GÑ3GÈ×I\ÑI\Ó]ÐØ!%×!:Ñ!:ØÐGWð ";ð "Ðð *×8Ñ8Ð9KÓ^ˆMà×%Ò%ð 
Ø'Ø)Ø%Ø+Øñ
ð ñ
ˆð ˆr+   )r?   r9   r5   r>   r7   )NNNNNNNN)r   r   r   r   r   r3   rG   rK   r[   ÚTensorÚlistrc   r   r   ÚFloatTensorr   r   Útupler   rj   Ú
LongTensorr~   r   Úboolr   r   r*   Ú__classcell__©rC   s   @r,   r/   r/   .   s¼  ø† ð˜z÷ ò:ò8ð*!àŸ™ð*!ð  $ E§L¡LÑ1ð*!ð $)§<¡<ð	*!ð
 
‰ô*!ðX ØðNØ!×-Ñ-ðNØ9?Ð@RÑ9SðNà	Ð+Ñ	+óNó ó ðNð"Ø×)Ñ)ð"Ø:?×:KÑ:Kð"Ø]b×]nÑ]nô"ð0 Øð .2à-1Ø59Ø.2Ø04Ø(,Ø26Ø!%ñ5à×#Ñ# dÑ*ð5ð —|‘| dÑ*ð	5ð
  %Ÿ|™|¨dÑ2ð5ð Ÿ™ tÑ+ð5ð ×&Ñ&¨Ñ-ð5ð  ™ð5ð ×(Ñ(¨4Ñ/ð5ð ˜$‘;ð5ð Ð+Ñ,ð5ð 
Ð'Ñ	'ô5ó ó ö5r+   r/   zz
    Fuyu Model with a language modeling head on top for causal language model conditioned on image patches and text.
    c                   óš  ^ • \ rS rSrSS0rS\4U 4S jjrS rS r\	\
          SS	\R                  S-  S
\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\S-  S\R                  S-  S\S-  S\R                  S-  S\S-  S\\   S\\-  4S jj5       5       r      SU 4S jjrSrU =r$ )ÚFuyuForCausalLMéÑ   zlm_head.weightz(model.language_model.embed_tokens.weightr   c                 óê   >• [         TU ]  U5        [        U5      U l        [        R
                  " UR                  R                  UR                  R                  SS9U l	        U R                  5         g )NF)Úbias)r2   r3   r/   r   r   r:   r6   r=   r7   Úlm_headr@   rA   s     €r,   r3   ÚFuyuForCausalLM.__init__Ù   sS   ø€ Ü‰Ñ˜Ô Ü˜vÓ&ˆŒ
Ü—y’y ×!3Ñ!3×!?Ñ!?À×ASÑAS×A^ÑA^ÐejÑkˆŒØ‰Õr+   c                 ó6   • U R                   R                  5       $ rF   )r   rG   rH   s    r,   rG   Ú$FuyuForCausalLM.get_input_embeddingsß   s   € Øz‰z×.Ñ.Ó0Ð0r+   c                 ó:   • U R                   R                  U5        g rF   )r   rK   rL   s     r,   rK   Ú$FuyuForCausalLM.set_input_embeddingsâ   s   € Ø
‰
×'Ñ'¨Õ.r+   Nrl   r€   r   r‚   rƒ   r   rm   r„   ÚlabelsÚlogits_to_keeprf   rR   c                 ó†  • U R                   " SUUUUUUUUS.UD6nUS   n[        U
[        5      (       a  [        U
* S5      OU
nU R	                  USS2USS24   5      nSnU	b3  U R
                  " SXùU R                  R                  R                  S.UD6n[        UUUR                  UR                  UR                  S9$ )a±  
image_patches (`torch.FloatTensor` of shape `(batch_size, num_total_patches, patch_size_ x patch_size x num_channels)`, *optional*):
    Image patches to be used as continuous embeddings. The patches are flattened and then projected to the
    hidden size of the model.
image_patches_indices (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Tensor of indices of the image patches in the input_ids tensor.
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.

Examples:

```python
>>> from transformers import FuyuProcessor, FuyuForCausalLM
>>> from PIL import Image
>>> import httpx
>>> from io import BytesIO

>>> processor = FuyuProcessor.from_pretrained("adept/fuyu-8b")
>>> model = FuyuForCausalLM.from_pretrained("adept/fuyu-8b")

>>> url = "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/bus.png"
>>> with httpx.stream("GET", url) as response:
...     image = Image.open(BytesIO(response.read()))
>>> prompt = "Generate a coco-style caption.\n"

>>> inputs = processor(images=image, text=prompt, return_tensors="pt")
>>> outputs = model(**inputs)

>>> generated_ids = model.generate(**inputs, max_new_tokens=7)
>>> generation_text = processor.batch_decode(generated_ids[:, -7:], skip_special_tokens=True)
>>> print(generation_text[0])
A blue bus parked on the side of a road.
```)rl   r€   r   rm   r‚   rƒ   r   r„   r   N)Úlogitsr¢   r7   )Úlossr¥   r   Úhidden_statesÚ
attentionsr   )r   Ú
isinstanceÚintÚslicerœ   Úloss_functionr   r6   r7   r   r   r§   r¨   )rB   rl   r€   r   r‚   rƒ   r   rm   r„   r¢   r£   rf   rŒ   r§   Úslice_indicesr¥   r¦   s                    r,   r   ÚFuyuForCausalLM.forwardå   sê   € ðj —*’*ð 

ØØ'Ø"7Ø'Ø)Ø%Ø+Øñ

ð ñ

ˆð   ™
ˆä8BÀ>ÔSV×8WÑ8Wœ˜~˜o¨tÔ4Ð]kˆØ—‘˜mªA¨}ºaÐ,?Ñ@ÓAˆàˆØÑØ×%Ò%ð Ø¸¿¹×9PÑ9P×9[Ñ9[ñØ_eñˆDô &ØØØ#×3Ñ3Ø!×/Ñ/Ø×)Ñ)ñ
ð 	
r+   c           
      óˆ   >• [         T
U ]  " U4UUUUUUS.UD6n	U(       d!  UR                  SS5      (       a
  S U	S'   S U	S'   U	$ )N)r   r‚   rm   r€   r   Úis_first_iterationr„   Tr   r€   )r2   Úprepare_inputs_for_generationÚget)rB   rl   r   r‚   rm   r€   r   r°   rf   Úmodel_inputsrC   s             €r,   r±   Ú-FuyuForCausalLM.prepare_inputs_for_generation9  sh   ø€ ô ‘wÒ<Øð	
à+Ø)Ø'Ø'Ø"7Ø1ñ	
ð ñ	
ˆö " f§j¡j°¸d×&CÑ&Cà48ˆLÐ0Ñ1Ø,0ˆL˜Ñ)àÐr+   )rœ   r   )
NNNNNNNNNr   )NNNNNF)r   r   r   r   Ú_tied_weights_keysr   r3   rG   rK   r   r   r[   r“   r   r   r‘   r”   rª   r   r   r’   r   r   r±   r*   r•   r–   s   @r,   r˜   r˜   Ñ   sc  ø† ð +Ð,VÐWÐð˜z÷ ò1ò/ð Øð .2à-1Ø59Ø.2Ø04Ø(,Ø26Ø!%Ø&*Ø%&ñP
à×#Ñ# dÑ*ðP
ð —|‘| dÑ*ð	P
ð
  %Ÿ|™|¨dÑ2ðP
ð Ÿ™ tÑ+ðP
ð ×&Ñ&¨Ñ-ðP
ð  ™ðP
ð ×(Ñ(¨4Ñ/ðP
ð ˜$‘;ðP
ð —‘˜tÑ#ðP
ð ˜d™
ðP
ð Ð+Ñ,ðP
ð 
Ð'Ñ	'ôP
ó ó ðP
ðj ØØØØ"Ø ÷õ r+   r˜   )r˜   r   r/   )Ú__doc__r[   r   Úcache_utilsr   Ú
generationr   Úmodeling_outputsr   r   Úmodeling_utilsr	   Úmodels.auto.modeling_autor
   Úprocessing_utilsr   Úutilsr   r   r   r   r   Úconfiguration_fuyur   Ú
get_loggerr   Úloggerr   r/   r˜   Ú__all__r   r+   r,   Ú<module>rÂ      s¶   ðñ ã Ý å  Ý )ß RÝ -Ý 2Ý &ß jÕ jÝ *ð 
×	Ò	˜HÓ	%€ð ô
4˜/ó 
4ó ð
4ñ ðñô
[Ð#ó [óð
[ñ| ðñô
@Ð)¨?ó @óð
@òF Br+   