
    Z j                     <   S SK Jr  S SKJr  S SKJr  S SKrS SKJr  SSK	J
r  SSKJr  SSKJrJr  SS	KJr  SS
KJr  SSKJrJr  SSKJrJrJr  SSKJrJr  SSKJ r J!r!J"r"J#r#  SSK$J%r%J&r&  SSK'J(r(J)r)  SSK*J+r+  SSK,J-r-J.r.J/r/J0r0J1r1J2r2  SSK3J4r4  SSK5J6r6J7r7  SSK8J9r9  SSK:J;r;  SSK<J=r=J>r>  \1R~                  " \@5      rA\\/" SS9 " S S\ 5      5       5       rB\/" SS9\ " S S \-5      5       5       rC " S! S"\R                  5      rE " S# S$\R                  5      rG " S% S&\R                  5      rH " S' S(\R                  5      rIS) rJ\" S*5      S]S+ j5       rKS,\R                  S-\MS.\R                  4S/ jrN   S^S0\R                  S1\R                  S2\R                  S3\R                  S4\R                  S-  S5\O\M-  S6\OS-  S7\OS-  S.\P\R                  \R                  4   4S8 jjrQ\" \K5       " S9 S:\R                  5      5       rR " S; S<\5      rS\/ " S= S>\)5      5       rTS?\MS.\\M\M\M\M/\U4   4S@ jrV\/ " SA SB\T5      5       rW\/ " SC SD\T\5      5       rX " SE SF\R                  5      rYSG\R                  S.\4SH jrZ\4" SISJSKSL9  S_SM\SK\R                  S4\R                  S-  SN\S-  SO\R                  S-  SP\R                  S-  SQ\US-  S.\[4SR jj5       r\\/" SSS9 " ST SU\T5      5       r]\/" SSS9 " SV SW\T\5      5       r^ " SX SY\T5      r_ " SZ S[\\T5      r`/ S\Qrag)`    )Callable)	dataclass)OptionalN   )initialization)ACT2FN)CacheDynamicCache)PreTrainedConfig)GenerationMixin)use_kernel_func_from_hubuse_kernelized_func)create_causal_maskcreate_masks_for_generate!create_sliding_window_causal_mask) GenericForSequenceClassificationGradientCheckpointingLayer)BaseModelOutputWithPastBaseModelOutputWithPoolingCausalLMOutputWithPast SequenceClassifierOutputWithPast)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)ModelOutputTransformersKwargsauto_docstringcan_return_tupleloggingtorch_compilable_check)deprecate_kwarg)maybe_autocastmerge_with_config_defaults)capture_outputs   )	AutoModel   )Gemma3ConfigGemma3TextConfigzK
    Base class for Gemma3 outputs, with hidden states and attentions.
    custom_introc                   B    \ rS rSr% SrSr\R                  S-  \S'   Sr	g)Gemma3ModelOutputWithPast8   a  
image_hidden_states (`torch.FloatTensor`, *optional*):
    A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
    image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
Nimage_hidden_states )
__name__
__module____qualname____firstlineno____doc__r1   torchFloatTensor__annotations____static_attributes__r2       {/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/gemma3/modeling_gemma3.pyr/   r/   8   s     59**T18r<   r/   zR
    Base class for Gemma3 causal language model (or autoregressive) outputs.
    c                      \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Sr
\S-  \S'   Sr\\R                     S-  \S'   Sr\\R                     S-  \S'   Sr\R                  S-  \S	'   S
rg)Gemma3CausalLMOutputWithPastH   a  
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
    Language modeling loss (for next-token prediction).
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.text_config.vocab_size)`):
    Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
    It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

    Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
    `past_key_values` input) to speed up sequential decoding.
image_hidden_states (`torch.FloatTensor`, *optional*):
    A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
    image_hidden_states of the model produced by the vision encoder after projecting last hidden state.
Nlosslogitspast_key_valueshidden_states
attentionsr1   r2   )r3   r4   r5   r6   r7   rA   r8   r9   r:   rB   rC   r	   rD   tuplerE   r1   r;   r2   r<   r=   r?   r?   H   s     &*D%

d
")'+FE$+$(OUT\(59M5**+d2926Je''(4/648**T18r<   r?   c            	       l   ^  \ rS rSrSrSS\S\S\S\4U 4S jjjrS\R                  4U 4S	 jjr
S
rU =r$ )Gemma3TextScaledWordEmbeddingf   zT
This module overrides nn.Embeddings' forward by multiplying with embeddings scale.
num_embeddingsembedding_dimpadding_idxembed_scalec                 |   > [         TU ]  XU5        X@l        U R                  S[        R
                  " U5      SS9  g )NrM   F
persistent)super__init__scalar_embed_scaleregister_bufferr8   tensor)selfrJ   rK   rL   rM   	__class__s        r=   rR   &Gemma3TextScaledWordEmbedding.__init__k   s7    D"-]ELL,ERWXr<   	input_idsc                    > [         TU ]  U5      U R                  R                  U R                  R
                  5      -  $ N)rQ   forwardrM   toweightdtype)rV   rY   rW   s     r=   r\   %Gemma3TextScaledWordEmbedding.forwardp   s2    wy)D,<,<,?,?@Q@Q,RRRr<   )rS   )      ?)r3   r4   r5   r6   r7   intfloatrR   r8   Tensorr\   r;   __classcell__rW   s   @r=   rH   rH   f   sM    Ys Y3 YS Y_d Y Y
S S Sr<   rH   c                   6   ^  \ rS rSrS\4U 4S jjrS rSrU =r$ )	Gemma3MLPt   configc                   > [         TU ]  5         Xl        UR                  U l        UR                  U l        [
        R                  " U R                  U R                  SS9U l        [
        R                  " U R                  U R                  SS9U l        [
        R                  " U R                  U R                  SS9U l	        [        UR                     U l        g NFbias)rQ   rR   rj   hidden_sizeintermediate_sizennLinear	gate_projup_proj	down_projr   hidden_activationact_fnrV   rj   rW   s     r=   rR   Gemma3MLP.__init__u   s    !--!'!9!94#3#3T5K5KRWXyy!1!143I3IPUV4#9#94;K;KRWXV556r<   c                     U R                  U R                  U R                  U5      5      U R                  U5      -  5      nU$ r[   )ru   rw   rs   rt   )rV   xru   s      r=   r\   Gemma3MLP.forward   s6    NN4;;t~~a/@#ADLLQRO#ST	r<   )rw   rj   ru   rs   ro   rp   rt   )	r3   r4   r5   r6   r+   rR   r\   r;   re   rf   s   @r=   rh   rh   t   s    7/ 7 r<   rh   c                   J   ^  \ rS rSrS	S\S\4U 4S jjjrS rS rS r	Sr
U =r$ )
Gemma3RMSNorm   dimepsc                    > [         TU ]  5         X l        [        R                  " [
        R                  " U5      5      U l        g r[   )rQ   rR   r   rq   	Parameterr8   zerosr^   )rV   r   r   rW   s      r=   rR   Gemma3RMSNorm.__init__   s,    ll5;;s#34r<   c                     U[         R                  " UR                  S5      R                  SSS9U R                  -   5      -  $ )Nr'   T)keepdim)r8   rsqrtpowmeanr   )rV   r{   s     r=   _normGemma3RMSNorm._norm   s4    5;;quuQx}}R}>IJJJr<   c                     U R                  UR                  5       5      nUSU R                  R                  5       -   -  nUR                  U5      $ )Nra   )r   rc   r^   type_as)rV   r{   outputs      r=   r\   Gemma3RMSNorm.forward   sC    AGGI& 3!2!2!445~~a  r<   c                 ^    [        U R                  R                  5       SU R                   3$ )Nz, eps=)rF   r^   shaper   rV   s    r=   
extra_reprGemma3RMSNorm.extra_repr   s'    ))*+6$((<<r<   )r   r^   )gư>)r3   r4   r5   r6   rb   rc   rR   r   r\   r   r;   re   rf   s   @r=   r~   r~      s0    5C 5e 5 5
K!= =r<   r~   c                      ^  \ rS rSr% \R
                  \S'   S\4U 4S jjr\	    SS\S-  S\
S   S\S-  S	\S-  S
\S\4   4
S jj5       r\R                   " 5       \SS j5       5       rSrU =r$ )Gemma3RotaryEmbedding   inv_freqrj   c                 f  > [         TU ]  5         UR                  U l        UR                  U l        Xl        [        [        UR                  5      5      U l        0 U l	        U R                   H  nU R
                  R                  U   nUc  M!  US   U R                  U'   U R                  nU R                  U   S:w  a  [        U R                  U      nU" U R
                  US9u  pVU R                  U S3USS9  U R                  U S3UR                  5       SS9  [        X S3U5        M     g )	N	rope_typedefault
layer_type	_inv_freqFrO   _original_inv_freq_attention_scaling)rQ   rR   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenrj   listsetlayer_typesr   rope_parameterscompute_default_rope_parametersr   rT   clonesetattr)rV   rj   r   rope_paramsrope_init_fncurr_inv_freqcurr_attention_scalingrW   s          r=   rR   Gemma3RotaryEmbedding.__init__   s(   "("@"@$*$B$B!F$6$6 78**J++55jAK")4[)ADNN:&%)%I%IL~~j)Y624>>*3MN4@Yc4d1M  J<y!9=UZ [  J</A!BMDWDWDYfk lDL(:;=ST +r<   Ndeviceztorch.deviceseq_lenr   returnztorch.Tensorc           	         U R                   U   S   n[        U SS5      =(       d    U R                  U R                  -  nSnSU[        R
                  " SUS[        R                  S9R                  U[        R                  S9U-  -  -  nXv4$ )	a  
Computes the inverse frequencies according to the original RoPE implementation
Args:
    config ([`~transformers.PreTrainedConfig`]):
        The model configuration.
    device (`torch.device`):
        The device to use for initialization of the inverse frequencies.
    seq_len (`int`, *optional*):
        The current sequence length. Unused for this type of RoPE.
    layer_type (`str`, *optional*):
        The current layer type if the model has different RoPE parameters per type.
        Should not be used unless `config.layer_types is not None`

Returns:
    Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
    post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).

rope_thetahead_dimNra   r   r'   r_   r   r_   )	r   getattrro   num_attention_headsr8   arangeint64r]   rc   )rj   r   r   r   baser   attention_factorr   s           r=   r   5Gemma3RotaryEmbedding.compute_default_rope_parameters   s    2 %%j1,?fj$/c63E3EIcIc3c U\\!S!5;;?BB&X]XcXcBdgjjk
 ))r<   c                 H   [        X S35      n[        X S35      nUS S S 2S 4   R                  5       R                  UR                  S   SS5      R	                  UR
                  5      nUS S 2S S S 24   R                  5       n[        UR
                  R                  [        5      (       a0  UR
                  R                  S:w  a  UR
                  R                  OSn[        USS	9   UR                  5       UR                  5       -  R                  SS
5      n	[        R                  " X4SS9n
U
R                  5       U-  nU
R                  5       U-  nS S S 5        WR	                  UR                  S9WR	                  UR                  S94$ ! , (       d  f       N@= f)Nr   r   r   r   r)   mpscpuF)device_typeenabledr'   r   r   )r   rc   expandr   r]   r   
isinstancetypestrr$   	transposer8   catcossinr_   )rV   r{   position_idsr   r   attention_scalinginv_freq_expandedposition_ids_expandedr   freqsembr   r   s                r=   r\   Gemma3RotaryEmbedding.forward   sd    4<y!9:#DL8J*KL$T1d]399;BB<CUCUVWCXZ\^_`ccdedldlm ,QaZ 8 > > @'1!((--'E'E!((--[`J`ahhmmfkUC&,,.1F1L1L1NNYYZ[]^_E))UN3C'')//C'')//C	 D vvAGGv$cff177f&;;; DCs   +A.F
F!)rj   r   r   r   r   NNNNr[   )r3   r4   r5   r6   r8   rd   r:   r+   rR   staticmethodr   rb   r   rF   rc   r   no_gradr   r\   r;   re   rf   s   @r=   r   r      s    llU/ U* *.+/"!%	!* 4'!*(!* t!* $J	!*
 
~u$	%!* !*F ]]_<  <r<   r   c                     U SSU R                   S   S-  24   nU SU R                   S   S-  S24   n[        R                  " U* U4SS9$ )z*Rotates half the hidden dims of the input..Nr   r'   r   )r   r8   r   )r{   x1x2s      r=   rotate_halfr      sZ    	
3"!''"+"""	#B	
3q ""	#B99rc2YB''r<   rotary_pos_embc                     UR                  U5      nUR                  U5      nX-  [        U 5      U-  -   nX-  [        U5      U-  -   nXV4$ )aI  Applies Rotary Position Embedding to the query and key tensors.

Args:
    q (`torch.Tensor`): The query tensor.
    k (`torch.Tensor`): The key tensor.
    cos (`torch.Tensor`): The cosine part of the rotary embedding.
    sin (`torch.Tensor`): The sine part of the rotary embedding.
    unsqueeze_dim (`int`, *optional*, defaults to 1):
        The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
        sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
        that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
        k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
        cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
        the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
Returns:
    `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
)	unsqueezer   )qkr   r   unsqueeze_dimq_embedk_embeds          r=   apply_rotary_pos_embr      sS    & --
&C
--
&Cw;q>C/0Gw;q>C/0Gr<   rD   n_repr   c                     U R                   u  p#pEUS:X  a  U $ U SS2SS2SSS2SS24   R                  X#XU5      n U R                  X#U-  XE5      $ )z
This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
r)   N)r   r   reshape)rD   r   batchnum_key_value_headsslenr   s         r=   	repeat_kvr     s_    
 2?1D1D.Ez!!Qa"23::5W\dlmM  e(CTTTr<   modulequerykeyvalueattention_maskdropoutscalingsoftcapc                 j   Uc  U R                   S-  n[        X R                  5      n	[        X0R                  5      n
[        R                  " XR                  SS5      5      U-  nUb  X-  n[        R                  " U5      nX-  nUb  X-   n[        R                  R                  US[        R                  S9R                  UR                  5      n[        R                  R                  XU R                  S9n[        R                  " X5      nUR                  SS5      R                  5       nX4$ )N      r'   r   r   )r   r_   )ptrainingr)   )r   r   num_key_value_groupsr8   matmulr   tanhrq   
functionalsoftmaxfloat32r]   r_   r   r   
contiguous)r   r   r   r   r   r   r   r   kwargs
key_statesvalue_statesattn_weightsattn_outputs                r=   eager_attention_forwardr    s    //4'3 ; ;<JU$?$?@L<<';';Aq'ABWLL#-zz,/#-!#4 ==((2U]](SVVW\WbWbcL==((6??([L,,|:K''1-88:K$$r<   c                     ^  \ rS rSrSrS\S\4U 4S jjr   SS\R                  S\R                  S	\R                  S-  S
\
S-  S\\   S\\R                  \R                  S-  \\R                     S-  4   4S jjrSrU =r$ )Gemma3Attentioni6  z=Multi-headed attention from 'Attention Is All You Need' paperrj   	layer_idxc                   > [         TU ]  5         [        US5      (       a  UR                  U   OS U l        Xl        X l        [        USUR                  UR                  -  5      U l
        UR                  UR                  -  U l        UR                  S-  U l        U R
                  R                  U l        U R
                  R                   (       + U l        [$        R&                  " UR                  UR                  U R                  -  UR(                  S9U l        [$        R&                  " UR                  UR                  U R                  -  UR(                  S9U l        [$        R&                  " UR                  UR                  U R                  -  UR(                  S9U l        [$        R&                  " UR                  U R                  -  UR                  UR(                  S9U l        U R
                  R2                  U l        U R                  S:X  a  UR4                  OS U l        U R                  S:H  U l        [9        UR                  UR:                  S9U l        [9        UR                  UR:                  S9U l        g )Nr   r   r   rm   sliding_attention)r   r   ) rQ   rR   hasattrr   r   rj   r  r   ro   r   r   r   r   query_pre_attn_scalarr   attention_dropoutuse_bidirectional_attention	is_causalrq   rr   attention_biasq_projk_projv_projo_projattn_logit_softcappingsliding_window
is_slidingr~   rms_norm_epsq_normk_normrV   rj   r  rW   s      r=   rR   Gemma3Attention.__init__:  s   ;B6=;Y;Y&,,Y7_c"
F4F4F&JdJd4de$*$>$>&B\B\$\!33T9!%!>!>![[DDDii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii&&68J8JQWQfQf
 '+kk&H&H#7;J]7]f33cg//-@@#V=P=PQ#V=P=PQr<   NrD   position_embeddingsr   rC   r   r   c                 b   UR                   S S n/ UQSPU R                  P7nU R                  U5      R                  U5      R	                  SS5      nU R                  U5      R                  U5      R	                  SS5      n	U R                  U5      R                  U5      R	                  SS5      n
U R                  U5      nU R                  U	5      n	Uu  p[        XX5      u  pUb  UR                  XU R                  5      u  p[        R                  " U R                  R                  [         5      nU" U UU	U
U4U R"                  (       a  U R$                  OSU R&                  U R(                  S.UD6u  pUR*                  " / UQSP76 R-                  5       nU R/                  U5      nX4$ )Nr   r)   r'           )r   r   r  )r   r   r  viewr   r  r  r  r  r   updater  r   get_interfacerj   _attn_implementationr  r   r  r   r  r   r   r  )rV   rD   r  r   rC   r   input_shapehidden_shapequery_statesr  r  r   r   attention_interfacer  r  s                   r=   r\   Gemma3Attention.forwardX  s    $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST{{<0[[,
&#7RU#[ &'6'='=jX\XfXf'g$J(?(M(MKK,,.E)
 %8
%
 /3mmD**LL..
%
 
%
! "));;;;FFHkk+.((r<   )r  r  rj   r   r  r  r  r  r  r   r   r  r  r  r   r  r  )NNN)r3   r4   r5   r6   r7   r+   rb   rR   r8   rd   r	   r   r   rF   r\   r;   re   rf   s   @r=   r  r  6  s    GR/ RC RB -1.2(,*)||*) #\\*) t+	*)
 *) +,*) 
u||U\\D0%2E2LL	M*) *)r<   r  c                   $  ^  \ rS rSrS\S\4U 4S jjr    SS\R                  S\R                  S\R                  S-  S	\R                  S-  S
\
S-  S\\   S\\R                  \\R                  \R                  4   S-  4   4S jjrSrU =r$ )Gemma3DecoderLayeri  rj   r  c                   > [         TU ]  5         Xl        UR                  U l        X l        [        XS9U l        [        U5      U l        [        U R                  UR                  S9U l        [        U R                  UR                  S9U l        [        U R                  UR                  S9U l        [        U R                  UR                  S9U l        g )N)rj   r  r   )rQ   rR   rj   ro   r  r  	self_attnrh   mlpr~   r  input_layernormpost_attention_layernormpre_feedforward_layernormpost_feedforward_layernormr  s      r=   rR   Gemma3DecoderLayer.__init__  s    !--"(LV$,T-=-=6CVCVW(5d6F6FFL_L_(`%)6t7G7GVM`M`)a&*78H8HfNaNa*b'r<   NrD   r  r   r   rC   r   r   c           	          UnU R                  U5      nU R                  " SUUUUUS.UD6u  pU R                  U5      nXq-   nUnU R                  U5      nU R	                  U5      nU R                  U5      nXq-   nU$ )N)rD   r  r   r   rC   r2   )r/  r-  r0  r1  r.  r2  )	rV   rD   r  r   r   rC   r   residual_s	            r=   r\   Gemma3DecoderLayer.forward  s     !,,];>> 
' 3)%+
 
 55mD 0 66}E/77F 0r<   )	rj   ro   r/  r  r.  r0  r2  r1  r-  r   )r3   r4   r5   r6   r+   rb   rR   r8   rd   
LongTensorr	   r   r   rF   r9   r\   r;   re   rf   s   @r=   r*  r*    s    
c/ 
cC 
c -1.204(,|| #\\ t+	
 &&-  +, 
u  %(9(95;L;L(L"MPT"TT	U r<   r*  c                      ^  \ rS rSr% \\S'   SrSr/ SQrS/r	Sr
SrSrSrSr\\S.rSr\R(                  " 5       U 4S	 j5       rS
rU =r$ )Gemma3PreTrainedModeli  rj   modelT)r*  SiglipVisionEmbeddingsSiglipEncoderLayer#SiglipMultiheadAttentionPoolingHeadrC   )rD   rE   )imagetextc                   > [         TU ]  U5        [        U[        5      (       a!  [        R
                  " UR                  5        g SUR                  R                  ;   a!  [        R
                  " UR                  5        g [        U[        5      (       a,  [        R                  " UR                  UR                  5        g [        U[        5      (       a  UR                   H  nUR                   nUR"                  U   S:w  a  [$        UR"                  U      nU" UR&                  US9u  pE[        R(                  " [+        X S35      U5        [        R(                  " [+        X S35      U5        M     g g )NRMSNormr   r   r   r   )rQ   _init_weightsr   Gemma3MultiModalProjectorinitzeros_mm_input_projection_weightrW   r3   r^   rH   	constant_rM   rS   r   r   r   r   r   rj   copy_r   )rV   r   r   r   r   r6  rW   s         r=   rC  #Gemma3PreTrainedModel._init_weights  s   f%f788KK99:&**333KK& =>>NN6--v/H/HI 566$00
%EE##J/9<#6v7G7G
7S#TL#/*#U 

76\+CDmT

76\9K+LM}] 1 7r<   r2   )r3   r4   r5   r6   r*   r:   base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraph_supports_attention_backendr*  r  _can_record_outputsinput_modalitiesr8   r   rC  r;   re   rf   s   @r=   r:  r:    su    &*# $5"5N!"&+% )
]]_^ ^r<   r:  r  c           
      T   ^  S[         S[         S[         S[         S[        4
U 4S jjnU$ )z9
Enables a bidirectional mask within the sliding window.
	batch_idxhead_idxq_idxkv_idxr   c                 $   > [        X#-
  5      T:  $ )zA token can attend to any other token if their absolute distance is within
the (exclusive) sliding window size (distance < sliding_window).)abs)rW  rX  rY  rZ  r  s       r=   
inner_mask1_bidirectional_window_overlay.<locals>.inner_mask  s     5>"^33r<   rb   bool)r  r]  s   ` r=   _bidirectional_window_overlayra    s3    
4c 4S 4 4c 4d 4
 r<   c                     ^  \ rS rSr% \\S'   SrS\4U 4S jjr\\	\
      SS\R                  S-  S\R                  S-  S\R                  S-  S	\S-  S
\R                  S-  S\S-  S\\   S\4S jj5       5       5       rSrU =r$ )Gemma3TextModeli  rj   r@  c           	      "  > [         TU ]  U5        UR                  U l        UR                  U l        [        UR                  UR                  U R                  U R                  R                  S-  S9U l        [        R                  " [        UR                  5       Vs/ s H  n[        X5      PM     sn5      U l        [        UR                  UR                   S9U l        [%        U5      U l        SU l        U R+                  5         g s  snf )N      ?)rM   r,  F)rQ   rR   pad_token_idrL   
vocab_sizerH   ro   rj   embed_tokensrq   
ModuleListrangenum_hidden_layersr*  layersr~   r  normr   
rotary_embgradient_checkpointing	post_initr  s      r=   rR   Gemma3TextModel.__init__  s     !.. ++ :v1143C3CQUQ\Q\QhQhjmQm
 mmDI&JbJbDcdDcy2Dcd
 "&"4"4&:M:MN	/7&+# 	 es    DNrY   r   r   rC   inputs_embeds	use_cacher   r   c           	      
   US L US L-  (       a  [        S5      eUc  U R                  U5      nU(       a  Uc  [        U R                  S9nUcU  Ub  UR	                  5       OSn[
        R                  " UR                  S   UR                  S9U-   nUR                  S5      n[        U=n	[        5      (       d|  U R                  UUUUS.n
U
R                  5       nU R                  R                  (       a(  S U
S'   [        U R                  R                  5      US'   [!        S0 U
D6[#        S0 UD6S	.n	Un0 n[%        U R                  R&                  5       H  nU R)                  XU5      X'   M     [+        U R,                  S U R                  R.                   5       HF  u  nnU" U4XR                  R&                  U      XR                  R&                  U      UUS
.UD6nMH     U R1                  U5      n[3        UUS9$ )N:You must specify exactly one of input_ids or inputs_embedsrj   r   r)   r   rj   rs  r   rC   r   c                  H    [         R                  " S[         R                  S9$ )NTr   )r8   rU   r`  )argss    r=   <lambda>)Gemma3TextModel.forward.<locals>.<lambda>,  s    TY^YcYc@dr<   or_mask_function)full_attentionr
  )r   r  r   rC   )last_hidden_staterC   r2   )
ValueErrorri  r
   rj   get_seq_lengthr8   r   r   r   r   r   dictcopyr  ra  r  r   r   r   r   ro  	enumeraterm  rl  rn  r   )rV   rY   r   r   rC   rs  rt  r   past_seen_tokenscausal_mask_mappingmask_kwargssliding_mask_kwargsrD   r  r   idecoder_layers                    r=   r\   Gemma3TextModel.forward  s    -t";<YZZ  --i8M0*$++>OCRC^==?de <<(;(;A(>}G[G[\_ooL'11!4L ?-FF ++!."0#2 ,K #."2"2"4{{662d./:WX\XcXcXrXr:s#$67 #5"C{"C%F%]I\%]# & dkk556J.2oom[e.f+ 7 !*$++6U8U8U*V WA})2;;3J3J13MN$78O8OPQ8R$S) / M !X 		-0&++
 	
r<   )ri  rp  rm  rn  rL   ro  rh  )NNNNNN)r3   r4   r5   r6   r+   r:   rU  rR   r%   r&   r   r8   r8  rd   r	   r9   r`  r   r   r   r\   r;   re   rf   s   @r=   rc  rc    s     / &   .2.204(,26!%C
##d*C
 t+C
 &&-	C

 C
 ((4/C
 $;C
 +,C
 
!C
    C
r<   rc  c                   d  ^  \ rS rSr% SS0rSS0rSS/S/40r\\S'   S\4U 4S	 jjr	\
\        SS\R                  S
-  S\R                  S
-  S\R                  S
-  S\S
-  S\R                   S
-  S\R                  S
-  S\S
-  S\\R                  -  S\\   S\4S jj5       5       rSrU =r$ )Gemma3ForCausalLMiM  lm_head.weightzmodel.embed_tokens.weightlm_headcolwise_gather_outputrD   rB   rj   c                    > [         TU ]  U5        [        U5      U l        UR                  U l        [
        R                  " UR                  UR                  SS9U l        U R                  5         g rl   )
rQ   rR   rc  r;  rh  rq   rr   ro   r  rq  rx   s     r=   rR   Gemma3ForCausalLM.__init__T  sU     $V,
 ++yy!3!3V5F5FUS 	r<   NrY   r   r   rC   rs  labelsrt  logits_to_keepr   r   c	           
          U R                   " SUUUUUUS.U	D6n
U
R                  n[        U[        5      (       a  [	        U* S5      OUnU R                  USS2USS24   5      nU R                  R                  bF  XR                  R                  -  n[        R                  " U5      nXR                  R                  -  nSnUb  U R                  " XU R                  40 U	D6n[        UUU
R                  U
R                  U
R                  S9$ )a"  
Example:

```python
>>> from transformers import AutoTokenizer, Gemma3ForCausalLM

>>> model = Gemma3ForCausalLM.from_pretrained("google/gemma-2-9b")
>>> tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b")

>>> prompt = "What is your favorite condiment?"
>>> inputs = tokenizer(prompt, return_tensors="pt")

>>> # Generate
>>> generate_ids = model.generate(inputs.input_ids, max_length=30)
>>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
"What is your favorite condiment?"
```)rY   r   r   rC   rs  rt  NrA   rB   rC   rD   rE   r2   )r;  r  r   rb   slicer  rj   final_logit_softcappingr8   r   loss_functionrh  r   rC   rD   rE   )rV   rY   r   r   rC   rs  r  rt  r  r   outputsrD   slice_indicesrB   rA   s                  r=   r\   Gemma3ForCausalLM.forward]  s   @ ,0:: ,
)%+',
 ,
  118B>SV8W8W~ot4]kmA}a,?@A;;..:kkAAAFZZ'FkkAAAF%%fdooPPD%#33!//))
 	
r<   )r  r;  rh  )NNNNNNNr   )r3   r4   r5   r6   _tied_weights_keys_tp_plan_pp_planr+   r:   rR   r    r   r8   r8  rd   r	   r9   r`  rb   r   r   r   r\   r;   re   rf   s   @r=   r  r  M  s%   *,GH23H_-z:;H/   .2.204(,26*.!%-.;
##d*;
 t+;
 &&-	;

 ;
 ((4/;
   4';
 $;;
 ell*;
 +,;
 
 ;
  ;
r<   r  c                   R   ^  \ rS rSrS\4U 4S jjrS\R                  4S jrSr	U =r
$ )rD  i  rj   c                   > [         TU ]  5         [        R                  " [        R
                  " UR                  R                  UR                  R                  5      5      U l	        [        UR                  R                  UR                  R                  S9U l        [        UR                  R                  UR                  R                  -  5      U l        [        UR"                  S-  5      U l        U R                   U R$                  -  U l        [        R(                  " U R&                  U R&                  S9U l        g )Nr,  rf  )kernel_sizestride)rQ   rR   rq   r   r8   r   vision_configro   text_configrG  r~   layer_norm_epsmm_soft_emb_normrb   
image_size
patch_sizepatches_per_imagemm_tokens_per_imagetokens_per_sider  	AvgPool2davg_poolrx   s     r=   rR   "Gemma3MultiModalProjector.__init__  s    *,,,KK,,88&:L:L:X:XY+
' !.  ,,&2F2F2U2U!
 "%V%9%9%D%DH\H\HgHg%g!h"6#=#=s#BC11T5I5II1A1A$JZJZ[r<   vision_outputsc                    UR                   u  p#nUR                  SS5      nUR                  X$U R                  U R                  5      nUR	                  5       nU R                  U5      nUR                  S5      nUR                  SS5      nU R                  U5      n[        R                  " XpR                  5      nUR                  U5      $ )Nr)   r'   )r   r   r   r  r   r  flattenr  r8   r   rG  r   )	rV   r  
batch_sizer6  ro   reshaped_vision_outputspooled_vision_outputsnormed_vision_outputsprojected_vision_outputss	            r=   r\   !Gemma3MultiModalProjector.forward  s    %3%9%9"
{"0":":1a"@"9"A"AT%;%;T=S=S#
 #:"D"D"F $.E F 5 = =a @ 5 ? ?1 E $ 5 56K L#(<<0EGfGf#g '//??r<   )r  r  rG  r  r  r  )r3   r4   r5   r6   r*   rR   r8   rd   r\   r;   re   rf   s   @r=   rD  rD    s)    \| \ @ell @ @r<   rD  	group_idsc           
      T   ^  S[         S[         S[         S[         S[        4
U 4S jjnU$ )aY  
This function adds the correct offsets to the `q_idx` and `kv_idx` as the torch API can only accept lengths,
not start and end indices.
Args:
    group_ids (`torch.Tensor`):
        A tensor of shape `(bs, len)` assigning each token to a vision group. Tokens with the same group
        come from the same input image. Text is denoted by `-1`.
rW  rX  rY  rZ  r   c                    > T	R                   S   nUR                  US-
  S9nUR                  US-
  S9nT	X4   nT	X4   n[        R                  " X$:  US5      n[        R                  " X4:  US5      nXx:H  US:  -  $ )Nr   r)   )maxr   )r   clampr8   where)
rW  rX  rY  rZ  
seq_lengthq_idx_clampedkv_idx_clampedq_groupkv_groupr  s
            r=   r]  0token_type_ids_mask_function.<locals>.inner_mask  s    __R(
 
Q7*q.9 I45Y67++e0'2>;;v2HbA#155r<   r_  )r  r]  s   ` r=   token_type_ids_mask_functionr    s3    6c 6S 6 6c 6d 6 r<   input_embeds5.6.0rs  versionnew_namerj   rC   r   token_type_idsis_first_iterationc                 v   U R                  5       UUUUS.nUb  US:H  R                  UR                  5      n	[        R                  R                  U	SSS9SS2SS24   n
X) -  n[        R                  " UR                  5       SS9S-
  n[        R                  " XS5      n[        U5      US	'   [        S
0 UD6$ )a  
Overwrites the base `create_masks_for_generate` with `token_type_ids` masking to create the causal mask mapping
for all kinds of forward passes. Gemma3 uses a bidirectional mask for images.

Uses `pixel_values` as an optional input to disambiguate edge cases.
ry  Nr)   )r)   r   r   )r   r   r   r~  r2   )get_text_configr]   r   rq   r   padr8   cumsumrb   r  r  r   )rj   rs  r   rC   r   r  r  r   r  is_imageis_previous_imagenew_image_startr  s                r=   create_causal_mask_mappingr    s    $ ((*&(*$K ! #a'++M,@,@AMM--ha-HCRCP"%77LL!4!4!6A>B	KKR8	*Fy*Q&'$3{33r<   zy
    The Base Gemma3 model which consists of a vision backbone and a language model without language modeling head.,
    c                     ^  \ rS rSrSrS\4U 4S jjrS rS r\	\
" SS9S	\R                  S
\\   S\\-  4S j5       5       rS\R$                  S\R                  S\R                  4S jr\	\
         SS\R$                  S-  S	\R                  S-  S\R(                  S-  S\R$                  S-  S\S-  S\R$                  S-  S\R                  S-  S\R$                  S-  S\S-  S\\   S\\-  4S jj5       5       rSrU =r$ )Gemma3Modeli  Frj   c                 (  > [         TU ]  U5        [        R                  " UR                  S9U l        [        U5      U l        UR                  R                  U l	        [        R                  " UR                  S9nX l
        U R                  5         g )Nrw  )rQ   rR   r(   from_configr  vision_towerrD  multi_modal_projectorr  rh  language_modelrq  )rV   rj   r  rW   s      r=   rR   Gemma3Model.__init__  so     %119M9MN%>v%F" ,,77"..f6H6HI,r<   c                 6    U R                   R                  5       $ r[   )r  get_input_embeddingsr   s    r=   r   Gemma3Model.get_input_embeddings  s    ""7799r<   c                 :    U R                   R                  U5        g r[   )r  set_input_embeddingsrV   r   s     r=   r   Gemma3Model.set_input_embeddings  s    007r<   zOProjects the last hidden state from the vision model into language model space.r,   pixel_valuesr   r   c                 t    U R                   " SUSS.UD6nUR                  nU R                  U5      Ul        U$ )NT)r  return_dictr2   )r  r  r  pooler_output)rV   r  r   r  r  s        r=   get_image_featuresGemma3Model.get_image_features  sF    
 **aRVaZ`a*<<'+'A'ABS'T$r<   rY   rs  image_featuresc           	      F   Ucj  X R                  5       " [        R                  " U R                  R                  [        R
                  UR                  S95      :H  nUR                  S5      nOXR                  R                  :H  nUR                  5       nUR                  S   UR                  S   -  nUR                  S5      R                  U5      R                  UR                  5      n[        X$   R                  5       UR                  5       :H  SU SU 35        U$ )z
Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
equal to the length of multimodal features. If the lengths are different, an error is raised.
)r_   r   r   r   r)   z6Image features and image tokens do not match, tokens: z, features: )r  r8   rU   rj   image_token_idlongr   allsumr   r   	expand_asr]   r"   numel)rV   rY   rs  r  special_image_maskn_image_tokensn_image_featuress          r=   get_placeholder_mask Gemma3Model.get_placeholder_mask(  s    !.2K2K2MT[[77uzzR_RfRfg3 " "4!7!7!;!*kk.H.H!H+//1)//2^5I5I!5LL/99"=GGVYYZgZnZno-3359M9M9OOD^DTT`aq`rs	
 "!r<   Nr   r   rC   r  r  rt  	lm_kwargsc
           
         USL USL-  (       a  [        S5      eUbQ  U R                  R                  U R                  :  a-  XR                  R                  :H  nUR	                  5       nSX'   OUnUc  U R                  5       " U5      nUba  U R                  USS9R                  nUR                  UR                  UR                  5      nU R                  XUS9nUR                  X5      n[        U=n[        5      (       d  [        U R                  UUUUUS9nU R                   " S
UUUUU	SS.U
D6n[#        UR$                  UR&                  UR(                  UR*                  Ub  WS	9$ SS	9$ )a  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.

Example:

```python
>>> from PIL import Image
>>> import httpx
>>> from io import BytesIO
>>> from transformers import AutoProcessor, Gemma3ForConditionalGeneration

>>> model = Gemma3ForConditionalGeneration.from_pretrained("google/gemma32-3b-mix-224")
>>> processor = AutoProcessor.from_pretrained("google/gemma32-3b-mix-224")

>>> prompt = "Where is the cat standing?"
>>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
>>> with httpx.stream("GET", url) as response:
...     image = Image.open(BytesIO(response.read()))

>>> inputs = processor(images=image, text=prompt,  return_tensors="pt")

>>> # Generate
>>> generate_ids = model.generate(**inputs,)
>>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
"Where is the cat standing?\nsnow"
```Nrv  r   T)r  )rs  r  )rs  r   rC   r   r  )r   r   rC   rs  rt  r  )r  rC   rD   rE   r1   r2   )r  rj   r  rh  r   r  r  r  r]   r   r_   r  masked_scatterr   r  r  r  r/   r  rC   rD   rE   )rV   rY   r  r   r   rC   r  rs  r  rt  r  r  llm_input_idsr  r  r  s                   r=   r\   Gemma3Model.forward@  s   X -t";<YZZ  T[[%?%?4??%R!*kk.H.H!H%OO-M01M-%M  557FM #!44\t4TbbN+..}/C/C]EXEXYN!%!:!:~ "; " *889K\M ?-FF"<+- /)-# %% 
.%+'
 
 )%77#33!//))2>2J
 	

 QU
 	
r<   )r  r  r  rh  	NNNNNNNNN)r3   r4   r5   r6   accepts_loss_kwargsr*   rR   r  r  r    r   r8   r9   r   r   rF   r   r  r8  r  rd   r	   r`  r/   r\   r;   re   rf   s   @r=   r  r    s     | :8 !rs!--9?@R9S	+	+ t "))":?:K:K"]b]n]n"0  .215.204(,2626*.!%\
##d*\
 ''$.\
 t+	\

 &&-\
 \
 ((4/\
 ((4/\
   4'\
 $;\
 ./\
 
*	*\
  \
r<   r  c                     ^  \ rS rSrSS0rSrS\4U 4S jjr\S\	R                  S\\   4S	 j5       r\\          SS\	R                  S
-  S\	R                  S
-  S\	R                   S
-  S\	R                  S
-  S\S
-  S\	R                  S
-  S\	R                  S
-  S\	R                  S
-  S\S
-  S\\	R                   -  S\\   S\\-  4S jj5       5       r          SU 4S jjr\\" SSSS9  S S\S\	R                   S\	R                   S
-  S\S
-  S\	R                   S
-  S\	R                   S
-  S\S
-  S\4S jj5       5       rSrU =r$ )!Gemma3ForConditionalGenerationi  r  z(model.language_model.embed_tokens.weightFrj   c                    > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  R                  UR                  R                  SS9U l	        U R                  5         g rl   )rQ   rR   r  r;  rq   rr   r  ro   rh  r  rq  rx   s     r=   rR   'Gemma3ForConditionalGeneration.__init__  sS      (
yy!3!3!?!?ASASA^A^ejkr<   r  r   c                 <    U R                   R                  " U40 UD6$ r[   )r;  r  )rV   r  r   s      r=   r  1Gemma3ForConditionalGeneration.get_image_features  s    zz,,\DVDDr<   NrY   r   r   rC   r  rs  r  rt  r  r  r   c                    U R                   " S	UUUUUUUU	USS.
UD6nUS   n[        U
[        5      (       a  [        U
* S5      OU
nU R	                  USS2USS24   5      nSnUGbQ  UR                  5       nUSSS2SS24   nUSSS24   nUb  USS2UR                  S   * S24   R                  UR                  5      nUUR                  UR                  5      S:g     R                  5       nUUR                  UR                  5      S:g     R                  5       nO UR                  5       nUR                  5       n[        R                  " 5       nUR                  SU R                  R                  R                  5      nUR                  S5      R                  UR                  5      nU" UU5      n[!        UUUR"                  UR$                  UR&                  UR(                  S9$ )
a  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.

Example:

```python
>>> from PIL import Image
>>> import httpx
>>> from io import BytesIO
>>> from transformers import AutoProcessor, Gemma3ForConditionalGeneration

>>> model = Gemma3ForConditionalGeneration.from_pretrained("google/gemma-3-4b-it")
>>> processor = AutoProcessor.from_pretrained("google/gemma-3-4b-it")

>>> messages = [
...     {
...         "role": "system",
...         "content": [
...             {"type": "text", "text": "You are a helpful assistant."}
...         ]
...     },
...     {
...         "role": "user", "content": [
...             {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"},
...             {"type": "text", "text": "Where is the cat standing?"},
...         ]
...     },
... ]

>>> inputs = processor.apply_chat_template(
...     messages,
...     tokenize=True,
...     return_dict=True,
...     return_tensors="pt",
...     add_generation_prompt=True
... )
>>> # Generate
>>> generate_ids = model.generate(**inputs)
>>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
"user\nYou are a helpful assistant.\n\n\n\n\n\nWhere is the cat standing?\nmodel\nBased on the image, the cat is standing in a snowy area, likely outdoors. It appears to"
```
T)
rY   r  r  r   r   rC   rs  rt  r  r  r   N.r   r)   )rA   rB   rC   rD   rE   r1   r2   )r;  r   rb   r  r  rc   r   r]   r   r   rq   CrossEntropyLossr   rj   r  rh  r?   rC   rD   rE   r1   )rV   rY   r  r   r   rC   r  rs  r  rt  r  r  r  rD   r  rB   rA   shift_logitsshift_labelsshift_attention_maskloss_fctflat_logitsflat_labelss                          r=   r\   &Gemma3ForConditionalGeneration.forward  s   z ** 
%))%+'
 
  
8B>SV8W8W~ot4]kmA}a,?@A\\^F!#ssA+.L!#qr'?L) (6a,:L:LQ:O9O9Q6Q'R'U'UV\VcVc'd$+,@,C,CFMM,RVW,WXcce+,@,C,CLDWDW,X\],]^iik+668+668**,H&++B0G0G0R0RSK&++B/22<3F3FGKK5D+#33!//)) ' ; ;
 	
r<   c                 n   > [         TU ]  " U4UUUUUU	UUS.UD6nU(       d  U(       d  X]S'   U$ S US'   U$ )N)rC   rs  r   r   rt  r  r  r  r  r  )rQ   prepare_inputs_for_generation)rV   rY   rC   rs  r   r  r   r  rt  r  r  r  r   model_inputsrW   s                 r=   r  <Gemma3ForConditionalGeneration.prepare_inputs_for_generation%  sh      w<
+')%))1
 
" Y+7(
  .2L)*r<   r  r  r  r  c           
          [        U UUUUU4SU0UR                  5        VV	s0 s H  u  pUS:w  d  M  X_M     sn	nD6$ s  sn	nf )Nr  r  )r  items)
rj   rs  r   rC   r   r  r  r   r   vs
             r=   r   8Gemma3ForConditionalGeneration.create_masks_for_generateN  s_     *	
  2	
 !'F!~2EtqtF	
 		
 Gs   ??)r  r;  )
NNNNNNNNNr   )
NNNNNNTNNF)NF)r3   r4   r5   r6   r  r  r*   rR   r   r8   r9   r   r   r  r    r8  rd   r	   r`  rb   rF   r?   r\   r  r   r#   r   r  r   r;   re   rf   s   @r=   r   r     sG    +,VW  |  Eu/@/@ EFSeLf E E  .215.204(,2626*.!%-.k
##d*k
 ''$.k
 t+	k

 &&-k
 k
 ((4/k
 ((4/k
   4'k
 $;k
 ell*k
 ./k
 
-	-k
  k
`  'R ^WO /3*/
 
||
 t+
 	

 llT)
 t+
 !4K
 

 P 
r<   r   c                   \  ^  \ rS rSrU 4S jrS rS r\\         SS\	R                  S-  S\	R                  S-  S\	R                  S-  S	\	R                  S-  S
\S-  S\	R                  S-  S\	R                  S-  S\	R                  S-  S\S-  S\\   S\4S jj5       5       rSrU =r$ )Gemma3ForSequenceClassificationig  c                    > [         TU ]  U5        UR                  U l        [        U5      U l        [
        R                  " UR                  R                  U R                  SS9U l	        U R                  5         g rl   )rQ   rR   
num_labelsr  r;  rq   rr   r  ro   scorerq  rx   s     r=   rR   (Gemma3ForSequenceClassification.__init__h  sZ      ++ (
YYv11==tUZ[
 	r<   c                 6    U R                   R                  5       $ r[   )r;  r  r   s    r=   r  4Gemma3ForSequenceClassification.get_input_embeddingsq  s    zz..00r<   c                 :    U R                   R                  U5        g r[   )r;  r  r  s     r=   r  4Gemma3ForSequenceClassification.set_input_embeddingst  s    

''.r<   NrY   r  r   r   rC   rs  r  r  rt  r   r   c
                    U R                   " U4UUUUUUU	SS.U
D6nUR                  nU R                  U5      nUb  UR                  S   nOUR                  S   nU R                  R
                  R                  c  US:w  a  [        S5      eU R                  R
                  R                  c  SnOUb  XR                  R
                  R                  :g  R                  UR                  [        R                  5      n[        R                  " UR                  S   UR                  [        R                  S9nUU-  R                  S5      nO.Sn[        R                  U R                   R"                   S	35        U[        R                  " XR                  S
9U4   nSnUb  U R%                  XUU R                  S9n['        UUUR(                  UR*                  UR,                  S9$ )ae  
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
T)r   r  r   rC   rs  r  rt  r  Nr   r)   z=Cannot handle batch sizes > 1 if no padding token is defined.r   r   z will not detect padding tokens in `inputs_embeds`. Results may be unexpected if using padding tokens in conjunction with `inputs_embeds.`rx  )rB   r  pooled_logitsrj   r  )r;  r  r  r   rj   r  rg  r  r]   r   r8   int32r   argmaxloggerwarning_oncerW   r3   r  r   rC   rD   rE   )rV   rY   r  r   r   rC   rs  r  r  rt  r   transformer_outputsrD   rB   r  last_non_pad_tokennon_pad_masktoken_indicesr!  rA   s                       r=   r\   'Gemma3ForSequenceClassification.forwardw  s   , #jj
)%%+')
 
 ,==M* "+J&,,Q/J;;""//7J!O\]];;""//7!#"%)@)@)M)MMQQRXR_R_afalalmL!LL)<V]]Z_ZeZefM"/,">!F!Fr!J!#>>**+ ,Z Z
 u||J}}MOaab%%VR_hlhshs%tD/ /??-;;*55
 	
r<   )r;  r  r  r  )r3   r4   r5   r6   rR   r  r  r    r   r8   r8  r9   rd   r	   r`  r   r   r   r\   r;   re   rf   s   @r=   r  r  g  s   1/  .215.204(,2626*.!%D
##d*D
 ''$.D
 t+	D

 &&-D
 D
 ((4/D
 ((4/D
   4'D
 $;D
 +,D
 
*D
  D
r<   r  c                   (    \ rS rSr% Sr\\S'   SrSrg)#Gemma3TextForSequenceClassificationi  z
Gemma3TextForSequenceClassification is a text-only sequence classification model that works with Gemma3TextConfig.
It uses the generic sequence classification implementation for efficiency and consistency.
rj   rd  r2   N)	r3   r4   r5   r6   r7   r+   r:   rU  r;   r2   r<   r=   r,  r,    s    
  r<   r,  )r:  rc  r  r   r  r  r,  )r)   )r  NN)NN)bcollections.abcr   dataclassesr   typingr   r8   torch.nnrq    r   rE  activationsr   cache_utilsr	   r
   configuration_utilsr   
generationr   integrationsr   r   masking_utilsr   r   r   modeling_layersr   r   modeling_outputsr   r   r   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r    r!   r"   utils.deprecationr#   utils.genericr$   r%   utils.output_capturingr&   autor(   configuration_gemma3r*   r+   
get_loggerr3   r$  r/   r?   	EmbeddingrH   Modulerh   r~   r   r   r   rd   rb   r   rc   rF   r  r  r*  r:  r`  ra  rc  r  rD  r  r  r  r  r   r  r,  __all__r2   r<   r=   <module>rG     s  * % !    & ! . 3 ) I m m [  L F & w w 0 G 5  @ 
		H	% 
9 7 9 9 
 9; 9 90SBLL S		  =BII =(L<BII L<^( *+ ,2	UU\\ 	U# 	U%,, 	U$   %II%<<% 
% <<	%
 LL4'% S[% T\% T\% 5<<%&%D )*K)bii K) +K)\+3 +\ (^O (^ (^V
# 
(CcSVCWY]C]:^ 
 ]
+ ]
 ]
@ L
- L
 L
^!@		 !@HELL X 6 ?K +/&*$4$4<<$4 LL4'$4 T\	$4
 ,,%$4 LL4'$4 t$4 
$4 L$4N 
U
' U

U
p 
~
%:O ~

~
BV
&; V
r!*JLa !r<   