
    Z ja^                        S r SSKJr  SSKJr  SSKrSSKJr  SSKJr  SSK	J
r
  SS	KJr  SS
KJr  SSKJr  SSKJrJr  SSKJr  SSKJr  SSKJrJrJrJrJrJr  SSKJ r   SSK!J"r"  SSK#J$r$  \RJ                  " \&5      r'\\" SS9 " S S\5      5       5       r(\" SS9\ " S S\5      5       5       r) " S S\RT                  5      r+S\RX                  S\4S  jr-\ " S!S"S#S$9    S6S%\
S#\RX                  S&\RX                  S-  S'\S-  S(\RX                  S-  S)\RX                  S-  S*\R\                  S-  S+\/S-  S,\/S-  S\04S- jj5       r1\ " S. S/\5      5       r2\" S0S9 " S1 S2\25      5       r3\" S0S9 " S3 S4\2\5      5       r4/ S5Qr5g)7zPyTorch PaliGemmamodel.    )Callable)	dataclassN)nn   )Cache)PreTrainedConfig)GenerationMixin)create_masks_for_generate)FlashAttentionKwargs)BaseModelOutputWithPastBaseModelOutputWithPooling)PreTrainedModel)Unpack)ModelOutputTransformersKwargsauto_docstringcan_return_tupleloggingtorch_compilable_check)deprecate_kwarg   )	AutoModel   )PaliGemmaConfigzN
    Base class for Paligemma outputs, with hidden states and attentions.
    custom_introc                   B    \ rS rSr% SrSr\R                  S-  \S'   Sr	g)PaligemmaModelOutputWithPast.   a  
image_hidden_states (`torch.FloatTensor`, *optional*):
    A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
    image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
Nimage_hidden_states )
__name__
__module____qualname____firstlineno____doc__r    torchFloatTensor__annotations____static_attributes__r!       ځ/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/paligemma/modeling_paligemma.pyr   r   .   s     59**T18r+   r   zU
    Base class for PaliGemma causal language model (or autoregressive) outputs.
    c                      \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Sr
\S-  \S'   Sr\\R                     S-  \S'   Sr\\R                     S-  \S'   Sr\R                  S-  \S	'   S
rg)PaliGemmaCausalLMOutputWithPast>   a  
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
    Language modeling loss (for next-token prediction).
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.text_config.vocab_size)`):
    Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
    It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

    Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
    `past_key_values` input) to speed up sequential decoding.
image_hidden_states (`torch.FloatTensor`, *optional*):
    A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
    image_hidden_states of the model produced by the vision encoder after projecting last hidden state.
Nlosslogitspast_key_valueshidden_states
attentionsr    r!   )r"   r#   r$   r%   r&   r0   r'   r(   r)   r1   r2   r   r3   tupler4   r    r*   r!   r+   r,   r.   r.   >   s     &*D%

d
")'+FE$+$(OUT\(59M5**+d2926Je''(4/648**T18r+   r.   c                   6   ^  \ rS rSrS\4U 4S jjrS rSrU =r$ )PaliGemmaMultiModalProjector\   configc                    > [         TU ]  5         [        R                  " UR                  R
                  UR                  R                  SS9U l        g )NTbias)super__init__r   Linearvision_confighidden_sizeprojection_dimlinearselfr9   	__class__s     r,   r>   %PaliGemmaMultiModalProjector.__init__]   s;    ii 4 4 @ @&BVBVBeBelpqr+   c                 (    U R                  U5      nU$ NrC   )rE   image_featuresr3   s      r,   forward$PaliGemmaMultiModalProjector.forwarda   s    N3r+   rJ   )	r"   r#   r$   r%   r   r>   rL   r*   __classcell__rF   s   @r,   r7   r7   \   s    r r r+   r7   	group_idsreturnc           
      T   ^  S[         S[         S[         S[         S[        4
U 4S jjnU$ )aY  
This function adds the correct offsets to the `q_idx` and `kv_idx` as the torch API can only accept lengths,
not start and end indices.
Args:
    group_ids (`torch.Tensor`):
        A tensor of shape `(bs, len)` assigning each token to a vision group. Tokens with the same group
        come from the same input image. Text is denoted by `-1`.
	batch_idxhead_idxq_idxkv_idxrQ   c                    > T	R                   S   nUR                  US-
  S9nUR                  US-
  S9nT	X4   nT	X4   n[        R                  " X$:  US5      n[        R                  " X4:  US5      nXx:H  US:  -  $ )Nr   )maxr   )shapeclampr'   where)
rS   rT   rU   rV   
seq_lengthq_idx_clampedkv_idx_clampedq_groupkv_grouprP   s
            r,   
inner_mask0token_type_ids_mask_function.<locals>.inner_maskq   s    __R(
 
Q7*q.9 I45Y67++e0'2>;;v2HbA#155r+   )intbool)rP   rb   s   ` r,   token_type_ids_mask_functionrf   g   s3    6c 6S 6 6c 6d 6 r+   input_embeds5.6.0inputs_embedsversionnew_namer9   attention_maskr2   position_idstoken_type_idspixel_valuesis_trainingis_first_iterationc	                    U(       a  Uc  [        S5      eU R                  5       UUUUS.n
U(       a  UO&USL =(       d    UR                  (       + =(       d    USLnU(       d  U	R                  SS5      (       d>  Ub  SU-
  nO5[        R                  S5        [        R                  " U5      SS2SS2S4   nUb  U(       a  US:H  R                  UR                  5      n[        R                  R                  US	SS
9SS2SS24   nX) -  n[        R                  " UR                  5       SS9S-
  n[        R                  " X[        R                   " US5      5      n[#        U5      U
S'   [%        S0 U
D6$ )a  
Overwrites the base `create_masks_for_generate` with `token_type_ids` masking to create the causal mask mapping
for all kinds of forward passes. Paligemma uses a bidirectional mask on the prompt tokens.

Uses `pixel_values` as an optional input to disambiguate edge cases.
Nz;`token_type_ids` is required as a model input when training)r9   ri   rm   r2   rn   	use_cacheTr   zIt is a prefill stage but The `token_type_ids` is not provided. We recommend passing `token_type_ids` to the model to prevent bad attention masking.r   )r   r   )valuerX   )dimor_mask_functionr!   )
ValueErrorget_text_configis_initializedgetloggerwarning_oncer'   	ones_liketodevicer   
functionalpadcumsumrd   r\   	full_likerf   r
   )r9   ri   rm   r2   rn   ro   rp   rq   rr   kwargsmask_kwargsis_imageis_previous_imagenew_image_startrP   s                  r,   create_causal_mask_mappingr      ss   & ~-VWW ((*&(*$K  	%g_-K-K)Kg|cgOg  K!>!>% /NZ
 #__];Aq!GDN
 !&8 #a'++M,@,@AMM--ha-HCRCP"%77LL!4!4!6A>B	KKU__^UW5XY	*Fy*Q&'$3{33r+   c                   J    \ rS rSr% \\S'   SrSrSrS/r	Sr
SrSrSrSrSrS	rg
)PaliGemmaPreTrainedModel   r9   model)imagetextTr7   r2   Fr!   N)r"   r#   r$   r%   r   r)   base_model_prefixinput_modalitiessupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_can_compile_fullgraph_supports_flash_attn_supports_sdpa_supports_flex_attn_supports_attention_backendr*   r!   r+   r,   r   r      sF    (&*#78"3"N"&r+   r   z|
    The Base Paligemma model which consists of a vision backbone and a language model without language modeling head.,
    c                     ^  \ rS rSrSrS\4U 4S jjrS rS r\	\
" SS9S	\R                  S
\\   S\\-  4S j5       5       rS\R$                  S\R                  S\R                  4S jr\	\
         SS\R$                  S-  S	\R                  S-  S\R(                  S-  S\R$                  S-  S\S-  S\R$                  S-  S\R                  S-  S\R$                  S-  S\S-  S
\\   S\\-  4S jj5       5       rSrU =r$ )PaliGemmaModel   Fr9   c                   > [         TU ]  U5        [        R                  " UR                  S9U l        [        U5      U l        UR                  R                  U l	        [        R                  " UR                  S9nX l
        U R                  R                  5       R                  =(       d    U R                  U l        U R                  5         g )N)r9   )r=   r>   r   from_configr@   vision_towerr7   multi_modal_projectortext_config
vocab_sizelanguage_modelr9   ry   dtypetext_config_dtype	post_init)rE   r9   r   rF   s      r,   r>   PaliGemmaModel.__init__   s     %119M9MN%A&%I" ,,77"..f6H6HI,!%!<!<!>!D!D!R

r+   c                 6    U R                   R                  5       $ rI   )r   get_input_embeddings)rE   s    r,   r   #PaliGemmaModel.get_input_embeddings   s    ""7799r+   c                 :    U R                   R                  U5        g rI   )r   set_input_embeddings)rE   ru   s     r,   r   #PaliGemmaModel.set_input_embeddings   s    007r+   zWObtains image last hidden states from the vision tower and apply multimodal projection.r   rp   r   rQ   c                 r    U R                   " U40 UD6nUR                  nU R                  U5      nXSl        U$ rI   )r   last_hidden_stater   pooler_output)rE   rp   r   image_outputsselected_image_featurerK   s         r,   get_image_features!PaliGemmaModel.get_image_features   sB     )),A&A!.!@!@334JK&4#r+   	input_idsri   rK   c           	      F   Ucj  X R                  5       " [        R                  " U R                  R                  [        R
                  UR                  S95      :H  nUR                  S5      nOXR                  R                  :H  nUR                  5       nUR                  S   UR                  S   -  nUR                  S5      R                  U5      R                  UR                  5      n[        X$   R                  5       UR                  5       :H  SU SU 35        U$ )z
Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
equal to the length of multimodal features. If the lengths are different, an error is raised.
)r   r   rX   r   r   z6Image features and image tokens do not match, tokens: z, features: )r   r'   tensorr9   image_token_idlongr   allsumrZ   	unsqueeze	expand_asr   r   numel)rE   r   ri   rK   special_image_maskn_image_tokensn_image_featuress          r,   get_placeholder_mask#PaliGemmaModel.get_placeholder_mask  s    !.2K2K2MT[[77uzzR_RfRfg3 " "4!7!7!;!*kk.H.H!H+//1)//2^5I5I!5LL/99"=GGVYYZgZnZno-3359M9M9OOD^DTT`aq`rs	
 "!r+   Nrm   rn   r2   ro   labelsrt   c
                    USL USL-  (       a  [        S5      eUbQ  U R                  R                  U R                  :  a-  XR                  R                  :H  nUR	                  5       nSX'   OUnUc  U R                  5       " U5      nUcX  Ub  UR                  5       OSn[        R                  " UR                  S   UR                  S9U-   nUR                  S5      S-   nUbb  U R                  U5      R                  nUR                  UR                  UR                  5      nU R!                  XUS9nUR#                  X5      n[%        U=n[&        5      (       d$  [)        U R                  UUUUUUU R*                  S9nU R,                  " S
UUUUU	S.U
D6n[/        UR0                  UR2                  UR4                  UR6                  Ub  WS	9$ SS	9$ )
  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.

Example:

```python
>>> from PIL import Image
>>> import httpx
>>> from io import BytesIO
>>> from transformers import AutoProcessor, PaliGemmaForConditionalGeneration

>>> model = PaliGemmaForConditionalGeneration.from_pretrained("google/paligemma2-3b-mix-224")
>>> processor = AutoProcessor.from_pretrained("google/paligemma2-3b-mix-224")

>>> prompt = "Where is the cat standing?"
>>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
>>> with httpx.stream("GET", url) as response:
...     image = Image.open(BytesIO(response.read()))

>>> inputs = processor(images=image, text=prompt,  return_tensors="pt")

>>> # Generate
>>> generate_ids = model.generate(**inputs,)
>>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
"Where is the cat standing?\nsnow"
```Nz:You must specify exactly one of input_ids or inputs_embedsr   r   )r   )ri   rK   )rq   )rm   rn   r2   ri   rt   )r   r2   r3   r4   r    r!   )rx   r9   r   r   cloner   get_seq_lengthr'   arangerZ   r   r   r   r   r   r   r   masked_scatter
isinstancedictr   trainingr   r   r   r2   r3   r4   )rE   r   rp   rm   rn   r2   ro   ri   r   rt   r   r   llm_input_idspast_seen_tokensrK   causal_mask_mappingoutputss                    r,   rL   PaliGemmaModel.forward  s   Z -t";<YZZ  T[[%?%?4??%R!*kk.H.H!H%OO-M01M-%M  557FMCRC^==?de <<(;(;A(>}G[G[\_ooL'11!4q8L #!44\BPPN+..}/C/C]EXEXYN!%!:!:~ "; " *889K\M ?-FF"< MM	# %% 
.%+'
 
 ,%77#33!//))2>2J
 	

 QU
 	
r+   )r   r   r   r   r   )	NNNNNNNNN)r"   r#   r$   r%   accepts_loss_kwargsr   r>   r   r   r   r   r'   r(   r   r   r5   r   r   
LongTensorr   Tensorr   re   r   r   rL   r*   rN   rO   s   @r,   r   r      s     
 
:8 n!--9?@R9S	+	+ "))":?:K:K"]b]n]n"0  .215.204(,2626*.!%c
##d*c
 ''$.c
 t+	c

 &&-c
 c
 ((4/c
 ((4/c
   4'c
 $;c
 -.c
 
-	-c
  c
r+   r   c                     ^  \ rS rSrSS0rS\4U 4S jjr\S\R                  S\
\   4S j5       r\\          SS
\R                  S	-  S\R                  S	-  S\R                  S	-  S\R                  S	-  S\S	-  S\R                  S	-  S\R                  S	-  S\R                  S	-  S\S	-  S\\R                  -  S\
\   S\\-  4S jj5       5       r          SU 4S jjr\\" SSSS9  SS\S\R                  S\R                  S	-  S\S	-  S\R                  S	-  S\R                  S	-  S\S	-  S\4S jj5       5       rSrU =r$ )!PaliGemmaForConditionalGenerationi  zlm_head.weightz(model.language_model.embed_tokens.weightr9   c                    > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  R                  UR                  R                  SS9U l	        U R                  5         g )NFr;   )r=   r>   r   r   r   r?   r   rA   r   lm_headr   rD   s     r,   r>   *PaliGemmaForConditionalGeneration.__init__  sS     #F+
yy!3!3!?!?ASASA^A^ejkr+   rp   r   c                 <    U R                   R                  " U40 UD6$ rI   )r   r   )rE   rp   r   s      r,   r   4PaliGemmaForConditionalGeneration.get_image_features  s    zz,,\DVDDr+   Nr   rm   rn   r2   ro   ri   r   rt   logits_to_keeprQ   c                    U R                   " SUUUUUUUU	US.	UD6nUS   n[        U
[        5      (       a  [        U
* S5      OU
nU R	                  USS2USS24   5      nSnUb3  U R
                  " SXU R                  R                  R                  S.UD6n[        UUUR                  UR                  UR                  UR                  S9$ )r   )	r   rp   ro   rm   rn   r2   ri   rt   r   r   N)r1   r   r   )r0   r1   r2   r3   r4   r    r!   )r   r   rd   slicer   loss_functionr9   r   r   r.   r2   r3   r4   r    )rE   r   rp   rm   rn   r2   ro   ri   r   rt   r   r   r   r3   slice_indicesr1   r0   s                    r,   rL   )PaliGemmaForConditionalGeneration.forward  s    Z ** 
%))%+'
 
  
8B>SV8W8W~ot4]kmA}a,?@A%% 9P9P9[9[_eD /#33!//)) ' ; ;
 	
r+   c                    > [         TU ]  " U4UUUUUU	UUS.UD6nUR                  S5      b  US   S-   US'   U(       d  U(       d  X]S'   U$ )N)r2   ri   rm   rn   rt   r   ro   rr   rn   r   rp   )r=   prepare_inputs_for_generationr{   )rE   r   r2   ri   rn   rp   rm   ro   rt   r   r   rr   r   model_inputsrF   s                 r,   r   ?PaliGemmaForConditionalGeneration.prepare_inputs_for_generation  sz      w<
+')%))1
 
 N+7+7+G!+KL( Y+7(r+   rg   rh   rj   rr   c           
          [        U UUUUU4SU0UR                  5        VV	s0 s H  u  pUS:w  d  M  X_M     sn	nD6$ s  sn	nf )Nrr   rp   )r   items)
r9   ri   rm   r2   rn   ro   rr   r   kvs
             r,   r
   ;PaliGemmaForConditionalGeneration.create_masks_for_generate  s_     *	
  2	
 !'F!~2EtqtF	
 		
 Gs   ??)r   r   )
NNNNNNNNNr   )
NNNNNNTNNF)NF)r"   r#   r$   r%   _tied_weights_keysr   r>   r   r'   r(   r   r   r   r   r   r   r   re   rd   r5   r.   rL   r   staticmethodr   r   r   r
   r*   rN   rO   s   @r,   r   r     s?    +,VW  Eu/@/@ EFSeLf E E  .215.204(,2626*.!%-.J
##d*J
 ''$.J
 t+	J

 &&-J
 J
 ((4/J
 ((4/J
   4'J
 $;J
 ell*J
 +,J
 
0	0J
  J
^  )V ^WO /3*/
 
||
 t+
 	

 llT)
 t+
 !4K
 

 P 
r+   r   )r   r   r   )NNFN)6r&   collections.abcr   dataclassesr   r'   r   cache_utilsr   configuration_utilsr   
generationr	   masking_utilsr
   modeling_flash_attention_utilsr   modeling_outputsr   r   modeling_utilsr   processing_utilsr   utilsr   r   r   r   r   r   utils.deprecationr   autor   configuration_paligemmar   
get_loggerr"   r|   r   r.   Moduler7   r   rf   r(   re   r   r   r   r   r   __all__r!   r+   r,   <module>r      s!    $ !     3 ) 6 B S - &  1  4 
		H	% 
9#: 9 9 
 9k 9 90299 ELL X 6 ?K +/-1$&*C4C4<<C4 LL4'C4 T\	C4
 ,,%C4 LL4'C4 ##d*C4 C4 tC4 
C4 LC4L ' ' ' 
c
- c

c
L 
\
(@/ \

\
~ ^r+   