
    Z j                     l   S SK Jr  S SKJrJr  S SKrS SKJr  S SKJ	r	  SSK
Jr  SSKJrJr  SSKJr  SS	KJrJrJr  SS
KJrJr  SSKJrJrJr  SSKJrJr  SSK J!r!J"r"  SSK#J$r$  SSK%J&r&J'r'J(r(J)r)  SSK*J+r+  SSK,J-r-  SSK.J/r/  SSK0J1r1J2r2J3r3J4r4J5r5J6r6J7r7J8r8J9r9  SSK:J;r;J<r<J=r=J>r>J?r?  SSK@JArA  \)R                  " \C5      rD\'" SS9\	 " S S\/\5      5       5       rE\'" SS9\	 " S S\5      5       5       rF " S S\>5      rG " S S \;5      rH " S! S"\R                  5      rJ " S# S$\35      rK " S% S&\65      rL " S' S(\7\R                  5      rN " S) S*\15      rO " S+ S,\5      rPSrQ " S- S.\55      rRS/\SS0\\S\S\S\S/\T4   4S1 jrU " S2 S3\45      rV " S4 S5\25      rW " S6 S7\R                  5      rX\+" S8S9S:S;9  SLS<\S:\R                  S=\R                  S-  S>\S-  S?\R                  S-  S@\R                  S-  SA\TS-  S0\Z4SB jj5       r[ " SC SD\=5      r\ " SE SF\<5      r] " SG SH\R5      r^ " SI SJ\\R5      r_/ SKQr`g)M    )Callable)AnyOptionalN)strict   )initialization)CacheDynamicCache)PreTrainedConfig)create_causal_maskcreate_masks_for_generate!create_sliding_window_causal_mask) GenericForSequenceClassificationGradientCheckpointingLayer)BaseModelOutputWithPastBaseModelOutputWithPooling SequenceClassifierOutputWithPast)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuplelogging)deprecate_kwarg)maybe_autocast   )Gemma2Config)	Gemma2AttentionGemma2ForCausalLM	Gemma2MLPGemma2ModelGemma2PreTrainedModelGemma2RMSNormGemma2RotaryEmbeddingapply_rotary_pos_embeager_attention_forward)PaliGemmaCausalLMOutputWithPast!PaliGemmaForConditionalGenerationPaliGemmaModelPaligemmaModelOutputWithPasttoken_type_ids_mask_function)SiglipVisionConfigzgoogle/gemma-3-4b-it)
checkpointc            
           \ rS rSr% SrSrSSSSSSSSSS.	rSS	S
.rSr\	\
S'   Sr\	\
S'   Sr\\   S-  \
S'   Sr\S-  \
S'   Sr\S-  \
S'   Sr\S-  \
S'   Sr\S-  \
S'   S rS rSrg)Gemma3TextConfig>   a  
query_pre_attn_scalar (`float`, *optional*, defaults to 256):
    scaling factor used on the attention scores
final_logit_softcapping (`float`, *optional*):
    Scaling factor when applying tanh softcapping on the logits.
attn_logit_softcapping (`float`, *optional*):
    Scaling factor when applying tanh softcapping on the attention scores.
use_bidirectional_attention (`bool`, *optional*, defaults to `False`):
    If True, the model will attend to all text tokens instead of using a causal mask. This does not change
    behavior for vision tokens.

```python
>>> from transformers import Gemma3TextModel, Gemma3TextConfig
>>> # Initializing a Gemma3Text gemma3_text-7b style configuration
>>> configuration = Gemma3TextConfig()
>>> # Initializing a model from the gemma3_text-7b style configuration
>>> model = Gemma3TextModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```
gemma3_textcolwisereplicated_with_grad_allreducerowwise)	zlayers.*.self_attn.q_projzlayers.*.self_attn.k_projzlayers.*.self_attn.v_projzlayers.*.self_attn.q_normzlayers.*.self_attn.k_normzlayers.*.self_attn.o_projzlayers.*.mlp.gate_projzlayers.*.mlp.up_projzlayers.*.mlp.down_projg    .Ag     @)globallocali@  
vocab_sizei   max_position_embeddingsNlayer_typesfinal_logit_softcappingattn_logit_softcappingrope_parametersFuse_bidirectional_attentionc                 l   U R                   (       a  U R                  S-  S-   U l        UR                  SS5      U l        U R                  cM  [        U R                  5       Vs/ s H'  n[        US-   U R                  -  5      (       a  SOSPM)     snU l        [        R                  " S0 UD6  g s  snf )Nr      sliding_window_pattern   sliding_attentionfull_attention )
r@   sliding_windowget_sliding_window_patternr<   rangenum_hidden_layersboolr   __post_init__)selfkwargsis      z/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/gemma3/modular_gemma3.pyrN   Gemma3TextConfig.__post_init__m   s    ++#'#6#6!#;q"@D (.zz2JA'N$# t556 6A (,QUd6R6R,R'S'S#Yii6 D
 	&&00 s   $.B1c                    UR                  SS 5      nSS0SS0S.nU R                  b  U R                  OUU l        Ub  U R                  S   R                  U5        U R                  R                  S5      c  SS0U R                  S'   U R                  S   R	                  SUR                  SU R
                  S   5      5        U R                  R                  S5      c  SS0U R                  S'   U R                  S   R	                  SUR                  S	U R
                  S
   5      5        U R                  5         U$ )Nrope_scaling	rope_typedefault)rE   rF   rF   
rope_thetar8   rE   rope_local_base_freqr9   )popr?   updaterI   
setdefaultdefault_thetastandardize_rope_params)rO   rP   rU   default_rope_paramss       rR   convert_rope_params_to_dict,Gemma3TextConfig.convert_rope_params_to_dict|   sI   zz.$7
 #.y!9*I6
 8<7K7K7Wt33]p#  !1299,G ##$45=6A95MD  !12-.99&**\43E3Eh3OP	
 ##$78@9Di8PD  !4501<<&**%;T=O=OPW=XY	

 	$$&    )rJ   r<   r?   rH   )__name__
__module____qualname____firstlineno____doc__
model_typebase_model_tp_planr]   r:   int__annotations__r;   r<   liststrr=   floatr>   r?   dictr@   rM   rN   r`   __static_attributes__rG   rb   rR   r2   r2   >   s    , J%.%.%.%E%E%."+ )"+
  +X>MJ#*S*$(KcT!(,0UT\0+/EDL/#'OTD['/441rb   r2   c                     ^  \ rS rSr% SrSrSSSS.r\\S.r	S	r
\\\\4   -  S	-  \S
'   S	r\\\\4   -  S	-  \S'   Sr\S	-  \S'   Sr\S	-  \S'   Sr\S	-  \S'   Sr\S	-  \S'   Sr\S	-  \S'   Sr\S	-  \S'   U 4S jrSrU =r$ )Gemma3Config   a  
mm_tokens_per_image (`int`, *optional*, defaults to 256):
    The number of tokens per image embedding.
boi_token_index (`int`, *optional*, defaults to 255999):
    The begin-of-image token index to wrap the image prompt.
eoi_token_index (`int`, *optional*, defaults to 256000):
    The end-of-image token index to wrap the image prompt.

Example:

```python
>>> from transformers import Gemma3ForConditionalGeneration, Gemma3Config, SiglipVisionConfig, Gemma3TextConfig

>>> # Initializing a Siglip-like vision config
>>> vision_config = SiglipVisionConfig()

>>> # Initializing a Gemma3 Text config
>>> text_config = Gemma3TextConfig()

>>> # Initializing a Gemma3 gemma-3-4b style configuration
>>> configuration = Gemma3Config(vision_config, text_config)

>>> # Initializing a model from the gemma-3-4b style configuration
>>> model = Gemma3TextConfig(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
```gemma3image_token_indexboi_token_indexeoi_token_index)image_token_idboi_token_ideoi_token_id)text_configvision_configNr{   r|      mm_tokens_per_imagei i  i   g{Gz?initializer_rangeTtie_word_embeddingsc                   > U R                   c%  [        5       U l         [        R                  S5        O9[	        U R                   [
        5      (       a  [        S0 U R                   D6U l         [	        U R                  [
        5      (       a  [        S0 U R                  D6U l        O1U R                  c$  [        5       U l        [        R                  S5        [        TU ]$  " S0 UD6  g )Nz@text_config is None, using default Gemma3TextConfig text config.zFvision_config is None, using default SiglipVisionConfig vision config.rG   )
r{   r2   loggerinfo
isinstancero   r|   r/   superrN   )rO   rP   	__class__s     rR   rN   Gemma3Config.__post_init__   s    #/1DKKZ[(($///C$2B2BCDd(($//!3!Id6H6H!ID'!3!5DKK`a''rb   )rc   rd   re   rf   rg   rh   attribute_mapr2   r/   sub_configsr{   ro   rm   r   rk   r|   r~   rj   rv   rw   ru   r   rn   r   rM   rN   rp   __classcell__r   s   @rR   rr   rr      s    : J-))M (+K
 =AK!DcN2T9@@DM%S#X6=D&)t)")OS4Z)")OS4Z)$+sTz+&*ut|*'++( (rb   rr   c                       \ rS rSrSrg)Gemma3ModelOutputWithPast   rG   Nrc   rd   re   rf   rp   rG   rb   rR   r   r          rb   r   c                       \ rS rSrSrg)Gemma3CausalLMOutputWithPast   rG   Nr   rG   rb   rR   r   r      r   rb   r   c            	       l   ^  \ rS rSrSrSS\S\S\S\4U 4S jjjrS\R                  4U 4S	 jjr
S
rU =r$ )Gemma3TextScaledWordEmbedding   zT
This module overrides nn.Embeddings' forward by multiplying with embeddings scale.
num_embeddingsembedding_dimpadding_idxembed_scalec                 |   > [         TU ]  XU5        X@l        U R                  S[        R
                  " U5      SS9  g )Nr   F
persistent)r   __init__scalar_embed_scaleregister_buffertorchtensor)rO   r   r   r   r   r   s        rR   r   &Gemma3TextScaledWordEmbedding.__init__   s7    D"-]ELL,ERWXrb   	input_idsc                    > [         TU ]  U5      U R                  R                  U R                  R
                  5      -  $ N)r   forwardr   toweightdtype)rO   r   r   s     rR   r   %Gemma3TextScaledWordEmbedding.forward   s2    wy)D,<,<,?,?@Q@Q,RRRrb   )r   )      ?)rc   rd   re   rf   rg   rj   rn   r   r   Tensorr   rp   r   r   s   @rR   r   r      sM    Ys Y3 YS Y_d Y Y
S S Srb   r   c                   0   ^  \ rS rSrS\4U 4S jjrSrU =r$ )	Gemma3MLP   configc                 $   > [         TU ]  U5        g r   r   r   rO   r   r   s     rR   r   Gemma3MLP.__init__   s     rb   rG   )rc   rd   re   rf   r2   r   rp   r   r   s   @rR   r   r      s    !/ ! !rb   r   c                   8   ^  \ rS rSrSS\S\4U 4S jjjrSrU =r$ )Gemma3RMSNorm   dimepsc                     > [         TU ]  XS9  g )Nr   r   r   )rO   r   r   r   s      rR   r   Gemma3RMSNorm.__init__   s    S*rb   rG   )gư>)	rc   rd   re   rf   rj   rn   r   rp   r   r   s   @rR   r   r      s    +C +e + +rb   r   c                       \ rS rSrS\4S jr\    SS\S-  S\S   S\S-  S\	S-  S	\
S
\4   4
S jj5       r\R                  " 5       \SS j5       5       rSrg)Gemma3RotaryEmbedding   r   c                    [         R                  R                  U 5        UR                  U l        UR                  U l        Xl        [        [        UR                  5      5      U l	        0 U l
        U R                   H  nU R                  R                  U   nUc  M!  US   U R                  U'   U R                  nU R                  U   S:w  a  [        U R                  U      nU" U R                  US9u  pVU R                  U S3USS9  U R                  U S3UR                  5       SS9  [!        X S3U5        M     g )	NrV   rW   
layer_type	_inv_freqFr   _original_inv_freq_attention_scaling)nnModuler   r;   max_seq_len_cachedoriginal_max_seq_lenr   rl   setr<   rV   r?   compute_default_rope_parametersr   r   clonesetattr)rO   r   r   rope_paramsrope_init_fncurr_inv_freqcurr_attention_scalings          rR   r   Gemma3RotaryEmbedding.__init__   s.   
		4 "("@"@$*$B$B!F$6$6 78**J++55jAK")4[)ADNN:&%)%I%IL~~j)Y624>>*3MN4@Yc4d1M  J<y!9=UZ [  J</A!BMDWDWDYfk lDL(:;=ST +rb   Ndeviceztorch.deviceseq_lenr   returnztorch.Tensorc           	         U R                   U   S   n[        U SS5      =(       d    U R                  U R                  -  nSnSU[        R
                  " SUS[        R                  S9R                  U[        R                  S9U-  -  -  nXv4$ )	a  
Computes the inverse frequencies according to the original RoPE implementation
Args:
    config ([`~transformers.PreTrainedConfig`]):
        The model configuration.
    device (`torch.device`):
        The device to use for initialization of the inverse frequencies.
    seq_len (`int`, *optional*):
        The current sequence length. Unused for this type of RoPE.
    layer_type (`str`, *optional*):
        The current layer type if the model has different RoPE parameters per type.
        Should not be used unless `config.layer_types is not None`

Returns:
    Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
    post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
rX   head_dimNr   r   r   r   r   r   )	r?   getattrhidden_sizenum_attention_headsr   arangeint64r   rn   )r   r   r   r   baser   attention_factorinv_freqs           rR   r   5Gemma3RotaryEmbedding.compute_default_rope_parameters  s    2 %%j1,?fj$/c63E3EIcIc3c U\\!S!5;;?BB&X]XcXcBdgjjk
 ))rb   c                 H   [        X S35      n[        X S35      nUS S S 2S 4   R                  5       R                  UR                  S   SS5      R	                  UR
                  5      nUS S 2S S S 24   R                  5       n[        UR
                  R                  [        5      (       a0  UR
                  R                  S:w  a  UR
                  R                  OSn[        USS	9   UR                  5       UR                  5       -  R                  SS
5      n	[        R                  " X4SS9n
U
R                  5       U-  nU
R                  5       U-  nS S S 5        WR	                  UR                  S9WR	                  UR                  S94$ ! , (       d  f       N@= f)Nr   r   r   rB   mpscpuF)device_typeenabledr   r   r   )r   rn   expandshaper   r   r   typerm   r   	transposer   catcossinr   )rO   xposition_idsr   r   attention_scalinginv_freq_expandedposition_ids_expandedr   freqsembr   r   s                rR   r   Gemma3RotaryEmbedding.forward8  sd    4<y!9:#DL8J*KL$T1d]399;BB<CUCUVWCXZ\^_`ccdedldlm ,QaZ 8 > > @'1!((--'E'E!((--[`J`ahhmmfkUC&,,.1F1L1L1NNYYZ[]^_E))UN3C'')//C'')//C	 D vvAGGv$cff177f&;;; DCs   +A.F
F!)r   r<   r   r   rV   NNNNr   )rc   rd   re   rf   r2   r   staticmethodr   rj   rm   tuplern   r   r   no_gradr   r   rp   rG   rb   rR   r   r      s    U/ U* *.+/"!%	!* 4'!*(!* t!* $J	!*
 
~u$	%!* !*F ]]_<  <rb   r   c                     ^  \ rS rSrS\S\4U 4S jjr   SS\R                  S\R                  S\R                  S-  S	\	S-  S
\
\   S\\R                  \R                  S-  \\R                     S-  4   4S jjrSrU =r$ )Gemma3AttentioniL  r   	layer_idxc                 ^  > [         TU ]  X5        U R                  S:X  a  UR                  OS U l        U R                  S:H  U l        U R
                  R                  (       + U l        [        UR                  UR                  S9U l        [        UR                  UR                  S9U l        g )NrE   r   )r   r   r   rH   
is_slidingr   r@   	is_causalr   r   rms_norm_epsq_normk_normrO   r   r   r   s      rR   r   Gemma3Attention.__init__M  s    +7;J]7]f33cg//-@@![[DDD#V=P=PQ#V=P=PQrb   Nhidden_statesposition_embeddingsattention_maskpast_key_valuesrP   r   c                 b   UR                   S S n/ UQSPU R                  P7nU R                  U5      R                  U5      R	                  SS5      nU R                  U5      R                  U5      R	                  SS5      n	U R                  U5      R                  U5      R	                  SS5      n
U R                  U5      nU R                  U	5      n	Uu  p[        XX5      u  pUb  UR                  XU R                  5      u  p[        R                  " U R                  R                  [         5      nU" U UU	U
U4U R"                  (       a  U R$                  OSU R&                  U R(                  S.UD6u  pUR*                  " / UQSP76 R-                  5       nU R/                  U5      nX4$ )Nr   rB   r   g        )dropoutscalingrH   )r   r   q_projviewr   k_projv_projr  r  r(   r[   r   r   get_interfacer   _attn_implementationr)   trainingattention_dropoutr  rH   reshape
contiguouso_proj)rO   r	  r
  r  r  rP   input_shapehidden_shapequery_states
key_statesvalue_statesr   r   attention_interfaceattn_outputattn_weightss                   rR   r   Gemma3Attention.forwardV  s    $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST{{<0[[,
&#7RU#[ &'6'='=jX\XfXf'g$J(?(M(MKK,,.E)
 %8
%
 /3mmD**LL..
%
 
%
! "));;;;FFHkk+.((rb   )r  r  r  r  rH   )NNN)rc   rd   re   rf   r2   rj   r   r   r   r	   r   r   r   r   rp   r   r   s   @rR   r   r   L  s    R/ RC R -1.2(,*)||*) #\\*) t+	*)
 *) +,*) 
u||U\\D0%2E2LL	M*) *)rb   r   c                   $  ^  \ rS rSrS\S\4U 4S jjr    SS\R                  S\R                  S\R                  S-  S	\R                  S-  S
\
S-  S\\   S\\R                  \\R                  \R                  4   S-  4   4S jjrSrU =r$ )Gemma3DecoderLayeri  r   r   c                   > [         TU ]  5         Xl        UR                  U l        X l        [        XS9U l        [        U5      U l        [        U R                  UR                  S9U l        [        U R                  UR                  S9U l        [        U R                  UR                  S9U l        [        U R                  UR                  S9U l        g )N)r   r   r   )r   r   r   r   r   r   	self_attnr   mlpr   r  input_layernormpost_attention_layernormpre_feedforward_layernormpost_feedforward_layernormr  s      rR   r   Gemma3DecoderLayer.__init__  s    !--"(LV$,T-=-=6CVCVW(5d6F6FFL_L_(`%)6t7G7GVM`M`)a&*78H8HfNaNa*b'rb   Nr	  r
  r  r   r  rP   r   c           	          UnU R                  U5      nU R                  " SUUUUUS.UD6u  pU R                  U5      nXq-   nUnU R                  U5      nU R	                  U5      nU R                  U5      nXq-   nU$ )N)r	  r
  r  r   r  rG   )r*  r(  r+  r,  r)  r-  )	rO   r	  r
  r  r   r  rP   residual_s	            rR   r   Gemma3DecoderLayer.forward  s     !,,];>> 
' 3)%+
 
 55mD 0 66}E/77F 0rb   )	r   r   r*  r   r)  r+  r-  r,  r(  r   )rc   rd   re   rf   r2   rj   r   r   r   
LongTensorr	   r   r   r   FloatTensorr   rp   r   r   s   @rR   r%  r%    s    
c/ 
cC 
c -1.204(,|| #\\ t+	
 &&-  +, 
u  %(9(95;L;L(L"MPT"TT	U rb   r%  c                   R    \ rS rSrSrSr/ SQr\R                  " 5       S 5       r	Sr
g)Gemma3PreTrainedModeli  model)imagetext)r%  SiglipVisionEmbeddingsSiglipEncoderLayer#SiglipMultiheadAttentionPoolingHeadc                    [         R                  " X5        [        U[        5      (       a!  [        R
                  " UR                  5        g SUR                  R                  ;   a!  [        R
                  " UR                  5        g [        U[        5      (       a,  [        R                  " UR                  UR                  5        g [        U[        5      (       a  UR                   H  nUR                   nUR"                  U   S:w  a  [$        UR"                  U      nU" UR&                  US9u  pE[        R(                  " [+        X S35      U5        [        R(                  " [+        X S35      U5        M     g g )NRMSNormrW   r   r   r   )r   _init_weightsr   Gemma3MultiModalProjectorinitzeros_mm_input_projection_weightr   rc   r   r   	constant_r   r   r   r<   r   rV   r   r   copy_r   )rO   moduler   r   r   r1  s         rR   r?  #Gemma3PreTrainedModel._init_weights  s   %%d3f788KK99:&**333KK& =>>NN6--v/H/HI 566$00
%EE##J/9<#6v7G7G
7S#TL#/*#U 

76\+CDmT

76\9K+LM}] 1 7rb   rG   N)rc   rd   re   rf   base_model_prefixinput_modalities_no_split_modulesr   r   r?  rp   rG   rb   rR   r6  r6    s2    ( ]]_^ ^rb   r6  rH   r   c           
      T   ^  S[         S[         S[         S[         S[        4
U 4S jjnU$ )z9
Enables a bidirectional mask within the sliding window.
	batch_idxhead_idxq_idxkv_idxr   c                 $   > [        X#-
  5      T:  $ )zA token can attend to any other token if their absolute distance is within
the (exclusive) sliding window size (distance < sliding_window).)abs)rL  rM  rN  rO  rH   s       rR   
inner_mask1_bidirectional_window_overlay.<locals>.inner_mask  s     5>"^33rb   )rj   rM   )rH   rR  s   ` rR   _bidirectional_window_overlayrT    s3    
4c 4S 4 4c 4d 4
 rb   c                      ^  \ rS rSr% \\S'   SrS\4U 4S jjr      SS\R                  S-  S\R                  S-  S\R                  S-  S	\S-  S
\R                  S-  S\S-  S\\   S\4S jjrSrU =r$ )Gemma3TextModeli  r   r9  c                    > [         TU ]  U5        [        UR                  UR                  U R
                  U R                  R                  S-  S9U l        g )N      ?)r   )r   r   r   r:   r   r   r   embed_tokensr   s     rR   r   Gemma3TextModel.__init__  sM      :v1143C3CQUQ\Q\QhQhjmQm
rb   Nr   r  r   r  inputs_embeds	use_cacherP   r   c           	      
   US L US L-  (       a  [        S5      eUc  U R                  U5      nU(       a  Uc  [        U R                  S9nUcU  Ub  UR	                  5       OSn[
        R                  " UR                  S   UR                  S9U-   nUR                  S5      n[        U=n	[        5      (       d|  U R                  UUUUS.n
U
R                  5       nU R                  R                  (       a(  S U
S'   [        U R                  R                  5      US'   [!        S0 U
D6[#        S0 UD6S	.n	Un0 n[%        U R                  R&                  5       H  nU R)                  XU5      X'   M     [+        U R,                  S U R                  R.                   5       HF  u  nnU" U4XR                  R&                  U      XR                  R&                  U      UUS
.UD6nMH     U R1                  U5      n[3        UUS9$ )N:You must specify exactly one of input_ids or inputs_embeds)r   r   rB   r   r   r\  r  r  r   c                  H    [         R                  " S[         R                  S9$ )NTr   )r   r   rM   )argss    rR   <lambda>)Gemma3TextModel.forward.<locals>.<lambda>  s    TY^YcYc@drb   or_mask_function)rF   rE   )r  r
  r   r  )last_hidden_stater  rG   )
ValueErrorrZ  r
   r   get_seq_lengthr   r   r   r   	unsqueezer   ro   copyr@   rT  rH   r   r   r   r<   
rotary_emb	enumeratelayersrL   normr   )rO   r   r  r   r  r\  r]  rP   past_seen_tokenscausal_mask_mappingmask_kwargssliding_mask_kwargsr	  r
  r   rQ   decoder_layers                    rR   r   Gemma3TextModel.forward  s    -t";<YZZ  --i8M0*$++>OCRC^==?de <<(;(;A(>}G[G[\_ooL'11!4L ?-FF ++!."0#2 ,K #."2"2"4{{662d./:WX\XcXcXrXr:s#$67 #5"C{"C%F%]I\%]# & dkk556J.2oom[e.f+ 7 !*$++6U8U8U*V WA})2;;3J3J13MN$78O8OPQ8R$S) / M !X 		-0&++
 	
rb   )rZ  )NNNNNN)rc   rd   re   rf   r2   rk   rI  r   r   r3  r   r	   r4  rM   r   r   r   r   rp   r   r   s   @rR   rV  rV    s     
/ 
 .2.204(,26!%C
##d*C
 t+C
 &&-	C

 C
 ((4/C
 $;C
 +,C
 
!C
 C
rb   rV  c                   <   ^  \ rS rSr% \\S'   S\4U 4S jjrSrU =r$ )Gemma3ForCausalLMi1  r   c                 D   > [         TU ]  U5        [        U5      U l        g r   )r   r   rV  r7  r   s     rR   r   Gemma3ForCausalLM.__init__4  s     $V,
rb   )r7  )	rc   rd   re   rf   r2   rk   r   rp   r   r   s   @rR   rw  rw  1  s    -/ - -rb   rw  c                   R   ^  \ rS rSrS\4U 4S jjrS\R                  4S jrSr	U =r
$ )r@  i9  r   c                   > [         TU ]  5         [        R                  " [        R
                  " UR                  R                  UR                  R                  5      5      U l	        [        UR                  R                  UR                  R                  S9U l        [        UR                  R                  UR                  R                  -  5      U l        [        UR"                  S-  5      U l        U R                   U R$                  -  U l        [        R(                  " U R&                  U R&                  S9U l        g )Nr'  rY  )kernel_sizestride)r   r   r   	Parameterr   zerosr|   r   r{   rC  r   layer_norm_epsmm_soft_emb_normrj   
image_size
patch_sizepatches_per_imager~   tokens_per_sider|  	AvgPool2davg_poolr   s     rR   r   "Gemma3MultiModalProjector.__init__:  s    *,,,KK,,88&:L:L:X:XY+
' !.  ,,&2F2F2U2U!
 "%V%9%9%D%DH\H\HgHg%g!h"6#=#=s#BC11T5I5II1A1A$JZJZ[rb   vision_outputsc                    UR                   u  p#nUR                  SS5      nUR                  X$U R                  U R                  5      nUR	                  5       nU R                  U5      nUR                  S5      nUR                  SS5      nU R                  U5      n[        R                  " XpR                  5      nUR                  U5      $ )NrB   r   )r   r   r  r  r  r  flattenr  r   matmulrC  type_as)	rO   r  
batch_sizer1  r   reshaped_vision_outputspooled_vision_outputsnormed_vision_outputsprojected_vision_outputss	            rR   r   !Gemma3MultiModalProjector.forwardJ  s    %3%9%9"
{"0":":1a"@"9"A"AT%;%;T=S=S#
 #:"D"D"F $.E F 5 = =a @ 5 ? ?1 E $ 5 56K L#(<<0EGfGf#g '//??rb   )r  r|  rC  r  r  r  )rc   rd   re   rf   rr   r   r   r   r   rp   r   r   s   @rR   r@  r@  9  s)    \| \ @ell @ @rb   r@  input_embedsz5.6.0r\  )versionnew_namer   r  r  r   token_type_idsis_first_iterationc                 v   U R                  5       UUUUS.nUb  US:H  R                  UR                  5      n	[        R                  R                  U	SSS9SS2SS24   n
X) -  n[        R                  " UR                  5       SS9S-
  n[        R                  " XS5      n[        U5      US	'   [        S
0 UD6$ )a  
Overwrites the base `create_masks_for_generate` with `token_type_ids` masking to create the causal mask mapping
for all kinds of forward passes. Gemma3 uses a bidirectional mask for images.

Uses `pixel_values` as an optional input to disambiguate edge cases.
ra  NrB   )rB   r   r   )valuer   r   rf  rG   )get_text_configr   r   r   
functionalpadr   cumsumrj   wherer.   r   )r   r\  r  r  r   r  r  rP   rr  is_imageis_previous_imagenew_image_start	group_idss                rR   create_causal_mask_mappingr  ]  s    $ ((*&(*$K ! #a'++M,@,@AMM--ha-HCRCP"%77LL!4!4!6A>B	KKR8	*Fy*Q&'$3{33rb   c                     ^  \ rS rSrSrS\4U 4S jjr\\" SS9S\	R                  S\\   S	\\-  4S
 j5       5       r\\         SS\	R                   S-  S\	R                  S-  S\	R"                  S-  S\	R                   S-  S\S-  S\	R                   S-  S\	R                  S-  S\	R                   S-  S\S-  S\\   S	\\-  4S jj5       5       rSrU =r$ )Gemma3Modeli  Fr   c                 (   > [         TU ]  U5        U ?g r   )r   r   text_config_dtyper   s     rR   r   Gemma3Model.__init__  s     "rb   zOProjects the last hidden state from the vision model into language model space.)custom_intropixel_valuesrP   r   c                 t    U R                   " SUSS.UD6nUR                  nU R                  U5      Ul        U$ )NT)r  return_dictrG   )vision_towerrg  multi_modal_projectorpooler_output)rO   r  rP   r  rg  s        rR   get_image_featuresGemma3Model.get_image_features  sF    
 **aRVaZ`a*<<'+'A'ABS'T$rb   Nr   r  r   r  r  r\  labelsr]  	lm_kwargsc
           
         US L US L-  (       a  [        S5      eUbQ  U R                  R                  U R                  :  a-  XR                  R                  :H  nUR	                  5       nSX'   OUnUc  U R                  5       " U5      nUba  U R                  USS9R                  nUR                  UR                  UR                  5      nU R                  XUS9nUR                  X5      n[        U=n[        5      (       d  [        U R                  UUUUUS9nU R                   " S	UUUUU	SS.U
D6n[#        UR$                  UR&                  UR(                  UR*                  Ub  WS9$ S S9$ )
Nr_  r   T)r  )r\  image_features)r\  r  r  r   r  )r  r   r  r\  r]  r  )rg  r  r	  
attentionsimage_hidden_statesrG   )rh  r   rx   r:   r   get_input_embeddingsr  r  r   r   r   get_placeholder_maskmasked_scatterr   ro   r  language_modelr   rg  r  r	  r  )rO   r   r  r  r   r  r  r\  r  r]  r  special_image_maskllm_input_idsr  rq  outputss                   rR   r   Gemma3Model.forward  s    -t";<YZZ  T[[%?%?4??%R!*kk.H.H!H%OO-M01M-%M  557FM #!44\t4TbbN+..}/C/C]EXEXYN!%!:!:~ "; " *889K\M ?-FF"<+- /)-# %% 
.%+'
 
 )%77#33!//))2>2J
 	

 QU
 	
rb   rG   	NNNNNNNNN)rc   rd   re   rf   accepts_loss_kwargsrr   r   r   r   r   r4  r   r   r   r   r  r3  r   r	   rM   r   r   rp   r   r   s   @rR   r  r    sf   #| # !rs!--9?@R9S	+	+ t   .215.204(,2626*.!%?
##d*?
 ''$.?
 t+	?

 &&-?
 ?
 ((4/?
 ((4/?
   4'?
 $;?
 ./?
 
*	*?
  ?
rb   r  c                     ^  \ rS rSrSr\\          SS\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\S-  S	\R                  S-  S
\R                  S-  S\R                  S-  S\S-  S\\R                  -  S\\   S\\-  4S jj5       5       r          SU 4S jjrSrU =r$ )Gemma3ForConditionalGenerationi  FNr   r  r  r   r  r  r\  r  r]  logits_to_keepr  r   c                    U R                   " S	UUUUUUUU	USS.
UD6nUS   n[        U
[        5      (       a  [        U
* S5      OU
nU R	                  USS2USS24   5      nSnUGbQ  UR                  5       nUSSS2SS24   nUSSS24   nUb  USS2UR                  S   * S24   R                  UR                  5      nUUR                  UR                  5      S:g     R                  5       nUUR                  UR                  5      S:g     R                  5       nO UR                  5       nUR                  5       n[        R                  " 5       nUR                  SU R                  R                  R                  5      nUR                  S5      R                  UR                  5      nU" UU5      n[!        UUUR"                  UR$                  UR&                  UR(                  S9$ )
a  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.

Example:

```python
>>> from PIL import Image
>>> import httpx
>>> from io import BytesIO
>>> from transformers import AutoProcessor, Gemma3ForConditionalGeneration

>>> model = Gemma3ForConditionalGeneration.from_pretrained("google/gemma-3-4b-it")
>>> processor = AutoProcessor.from_pretrained("google/gemma-3-4b-it")

>>> messages = [
...     {
...         "role": "system",
...         "content": [
...             {"type": "text", "text": "You are a helpful assistant."}
...         ]
...     },
...     {
...         "role": "user", "content": [
...             {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"},
...             {"type": "text", "text": "Where is the cat standing?"},
...         ]
...     },
... ]

>>> inputs = processor.apply_chat_template(
...     messages,
...     tokenize=True,
...     return_dict=True,
...     return_tensors="pt",
...     add_generation_prompt=True
... )
>>> # Generate
>>> generate_ids = model.generate(**inputs)
>>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
"user\nYou are a helpful assistant.\n\n\n\n\n\nWhere is the cat standing?\nmodel\nBased on the image, the cat is standing in a snowy area, likely outdoors. It appears to"
```
T)
r   r  r  r  r   r  r\  r]  r  r  r   N.r   rB   )losslogitsr  r	  r  r  rG   )r7  r   rj   slicelm_headrn   r   r   r   r  r   CrossEntropyLossr  r   r{   r:   r   r  r	  r  r  )rO   r   r  r  r   r  r  r\  r  r]  r  r  r  r	  slice_indicesr  r  shift_logitsshift_labelsshift_attention_maskloss_fctflat_logitsflat_labelss                          rR   r   &Gemma3ForConditionalGeneration.forward  s   z ** 
%))%+'
 
  
8B>SV8W8W~ot4]kmA}a,?@A\\^F!#ssA+.L!#qr'?L) (6a,:L:LQ:O9O9Q6Q'R'U'UV\VcVc'd$+,@,C,CFMM,RVW,WXcce+,@,C,CLDWDW,X\],]^iik+668+668**,H&++B0G0G0R0RSK&++B/22<3F3FGKK5D+#33!//)) ' ; ;
 	
rb   c                 n   > [         TU ]  " U4UUUUUU	UUS.UD6nU(       d  U(       d  X]S'   U$ S US'   U$ )N)r  r\  r  r   r]  r  r  r  r  r  )r   prepare_inputs_for_generation)rO   r   r  r\  r   r  r  r  r]  r  r  r  rP   model_inputsr   s                 rR   r  <Gemma3ForConditionalGeneration.prepare_inputs_for_generationP  sh      w<
+')%))1
 
" Y+7(
  .2L)*rb   rG   )
NNNNNNNNNr   )
NNNNNNTNNF)rc   rd   re   rf   r  r   r   r   r3  r4  r   r	   rM   rj   r   r   r   r   r   r  rp   r   r   s   @rR   r  r    s\      .215.204(,2626*.!%-.k
##d*k
 ''$.k
 t+	k

 &&-k
 k
 ((4/k
 ((4/k
   4'k
 $;k
 ell*k
 ./k
 
-	-k
  k
`  ' 'rb   r  c                   \  ^  \ rS rSrU 4S jrS rS r\\         SS\	R                  S-  S\	R                  S-  S\	R                  S-  S	\	R                  S-  S
\S-  S\	R                  S-  S\	R                  S-  S\	R                  S-  S\S-  S\\   S\4S jj5       5       rSrU =r$ )Gemma3ForSequenceClassificationiz  c                    > [         TU ]  U5        UR                  U l        [        U5      U l        [
        R                  " UR                  R                  U R                  SS9U l	        U R                  5         g )NF)bias)r   r   
num_labelsr  r7  r   Linearr{   r   score	post_initr   s     rR   r   (Gemma3ForSequenceClassification.__init__{  sZ      ++ (
YYv11==tUZ[
 	rb   c                 6    U R                   R                  5       $ r   )r7  r  )rO   s    rR   r  4Gemma3ForSequenceClassification.get_input_embeddings  s    zz..00rb   c                 :    U R                   R                  U5        g r   )r7  set_input_embeddings)rO   r  s     rR   r  4Gemma3ForSequenceClassification.set_input_embeddings  s    

''.rb   Nr   r  r  r   r  r\  r  r  r]  rP   r   c
                    U R                   " U4UUUUUUU	SS.U
D6nUR                  nU R                  U5      nUb  UR                  S   nOUR                  S   nU R                  R
                  R                  c  US:w  a  [        S5      eU R                  R
                  R                  c  SnOUb  XR                  R
                  R                  :g  R                  UR                  [        R                  5      n[        R                  " UR                  S   UR                  [        R                  S9nUU-  R                  S5      nO.Sn[        R                  U R                   R"                   S	35        U[        R                  " XR                  S
9U4   nSnUb  U R%                  XUU R                  S9n['        UUUR(                  UR*                  UR,                  S9$ )ae  
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
T)r  r  r   r  r\  r  r]  r  Nr   rB   z=Cannot handle batch sizes > 1 if no padding token is defined.r   r   z will not detect padding tokens in `inputs_embeds`. Results may be unexpected if using padding tokens in conjunction with `inputs_embeds.`r`  )r  r  pooled_logitsr   )r  r  r  r	  r  )r7  rg  r  r   r   r{   pad_token_idrh  r   r   r   int32r   argmaxr   warning_oncer   rc   loss_functionr   r  r	  r  )rO   r   r  r  r   r  r\  r  r  r]  rP   transformer_outputsr	  r  r  last_non_pad_tokennon_pad_masktoken_indicesr  r  s                       rR   r   'Gemma3ForSequenceClassification.forward  s   , #jj
)%%+')
 
 ,==M* "+J&,,Q/J;;""//7J!O\]];;""//7!#"%)@)@)M)MMQQRXR_R_afalalmL!LL)<V]]Z_ZeZefM"/,">!F!Fr!J!#>>**+ ,Z Z
 u||J}}MOaab%%VR_hlhshs%tD/ /??-;;*55
 	
rb   )r7  r  r  r  )rc   rd   re   rf   r   r  r  r   r   r   r3  r4  r   r	   rM   r   r   r   r   rp   r   r   s   @rR   r  r  z  s   1/  .215.204(,2626*.!%D
##d*D
 ''$.D
 t+	D

 &&-D
 D
 ((4/D
 ((4/D
   4'D
 $;D
 +,D
 
*D
  D
rb   r  c                   (    \ rS rSr% Sr\\S'   SrSrg)#Gemma3TextForSequenceClassificationi  z
Gemma3TextForSequenceClassification is a text-only sequence classification model that works with Gemma3TextConfig.
It uses the generic sequence classification implementation for efficiency and consistency.
r   rW  rG   N)	rc   rd   re   rf   rg   r2   rk   rI  rp   rG   rb   rR   r  r    s    
  rb   r  )	rr   r2   r6  rV  rw  r  r  r  r  )NN)acollections.abcr   typingr   r   r   torch.nnr   huggingface_hub.dataclassesr    r   rA  cache_utilsr	   r
   configuration_utilsr   masking_utilsr   r   r   modeling_layersr   r   modeling_outputsr   r   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   utils.deprecationr   utils.genericr   gemma2.configuration_gemma2r    gemma2.modeling_gemma2r!   r"   r#   r$   r%   r&   r'   r(   r)   paligemma.modeling_paligemmar*   r+   r,   r-   r.   siglipr/   
get_loggerrc   r   r2   rr   r   r   	Embeddingr   r   r   r   r   r   r%  GEMMA3_START_DOCSTRINGr6  rj   rM   rT  rV  rw  r@  r   ro   r  r  r  r  r  __all__rG   rb   rR   <module>r     s   %     . & . 3 m m [ u u G & R R 0 + 6
 
 
  ( 
		H	% 12W|%5 W  3Wt 12?(# ?(  3?(D	 < 		#B 	SBLL S!	 !
+M +
J<1299 J<\4)o 4)n+3 +\  ^1 ^<
# 
(CcSVCWY]C]:^ 
O
k O
d-) -!@		 !@H ?K +/&*$4$4<<$4 LL4'$4 T\	$4
 ,,%$4 LL4'$4 t$4 
$4 L$4NT
. T
n[%F [|V
&; V
r!*JLa !
rb   