
    Z j                    t
   S SK r S SKJr  S SKJr  S SKJr  S SKJr  S SK	r	S SK	J
r
  S SKJr  SS	KJr  SS
KJr  SSKJrJr  SSKJr  SSKJr  SSKJrJr  SSKJrJrJ r J!r!  SSK"J#r#  SSK$J%r%  SSK&J'r'J(r(  SSK)J*r*J+r+  SSK,J-r-J.r.  SSK/J0r0  SSK1J2r2J3r3J4r4J5r5J6r6J7r7  SSK8J9r9J:r:  SSK;J<r<J=r=  SSK>J?r?  SSK@JArAJBrBJCrCJDrD  \6" 5       (       a  S SKEJFrF  \\4" SS9 " S  S!\'5      5       5       rG\4" S"S9\ " S# S$\25      5       5       rH\ " S% S&\'5      5       rI\4\ " S' S(\(5      5       5       rJ " S) S*\
R                  5      rL " S+ S,\
R                  5      rM " S- S.\
R                  5      rN " S/ S0\
R                  5      rO " S1 S2\
R                  5      rP " S3 S4\
R                  5      rQ " S5 S6\
R                  5      rR " S7 S8\
R                  5      rT " S9 S:\
R                  5      rU " S; S<\
R                  5      rV " S= S>\
R                  5      rW " S? S@\
R                  5      rX " SA SB\
R                  5      rY " SC SD\
R                  5      rZSE r[SSF\	R                  SG\	R                  SH\	R                  SI\]4SJ jjr^SK\	R                  SL\]SM\	R                  4SN jr_   SSO\
R                  SP\	R                  SQ\	R                  SR\	R                  SS\	R                  S-  ST\`\]-  SU\`S-  SV\`S-  SM\a\	R                  \	R                  4   4SW jjrb SSF\	R                  SG\	R                  SH\	R                  SX\	R                  SI\]SM\	R                  4SY jjrc\" \^5       " SZ S[\
R                  5      5       rd " S\ S]\%5      re " S^ S_\
R                  5      rf " S` Sa\
R                  5      rg " Sb Sc\
R                  5      rh " Sd Se\
R                  5      ri\ " Sf Sg\
R                  5      5       rj " Sh Si\
R                  5      rk " Sj Sk\%5      rl " Sl Sm\
R                  5      rn\4 " Sn So\.5      5       ro\4" SpS9 " Sq Sr\o5      5       rp\4" SsS9 " St Su\o\5      5       rqSv\a\]\]4   SM\4Sw jrr " Sx Sy\o5      rs " Sz S{\o5      rt " S| S}\
R                  5      ruS~\	R                  S-  S\	R                  S-  SM\S-  4S jrv  SS\S\	R                  SS\	R                  S-  S\S-  SX\	R                  S-  S\	R                  S-  S\wS-  SM\x4S jjry\4" SS9 " S S\o5      5       rz\4" SS9 " S S\o\5      5       r{/ SQr|g)    N)Callable)	dataclass)cached_property)Optional)nn)
functional   )initialization)ACT2FN)CacheDynamicCache)PreTrainedConfig)GenerationMixin)use_experts_implementationuse_kernelized_func)create_bidirectional_maskcreate_causal_maskcreate_masks_for_generate!create_sliding_window_causal_mask)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputWithPastBaseModelOutputWithPooling)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)ModelOutputTransformersKwargsauto_docstringcan_return_tupleis_accelerate_availabletorch_compilable_check)maybe_autocastmerge_with_config_defaults)OutputRecordercapture_outputs   )	AutoModel   )Gemma4AudioConfigGemma4ConfigGemma4TextConfigGemma4VisionConfig)add_hook_to_modulezK
    Base class for Gemma4 outputs, with hidden states and attentions.
    custom_introc                       \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Sr
\\\\R                  \R                  4   4   S-  \S'   Srg)Gemma4ModelOutputWithPastC   aw  
past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
    It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

    Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
    `past_key_values` input) to speed up sequential decoding.
image_hidden_states (`torch.FloatTensor`, *optional*):
    A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
    image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
audio_hidden_states (`torch.FloatTensor`, *optional*):
    A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
    audio_hidden_states of the model produced by the audio encoder and after projecting the last hidden state.
shared_kv_states (`dict`, *optional*):
    Dictionary mapping layer type strings to tuples of (key_states, value_states) tensors.
    Used to pass shared KV states between layers during KV sharing.
Nimage_hidden_statesaudio_hidden_statesshared_kv_states )__name__
__module____qualname____firstlineno____doc__r6   torchFloatTensor__annotations__r7   r8   dictstrtupleTensor__static_attributes__r9       {/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/gemma4/modeling_gemma4.pyr4   r4   C   sa    " 59**T1848**T18LPd3ellELL&@ AABTIPrG   r4   zR
    Base class for Gemma4 causal language model (or autoregressive) outputs.
    c                   z   \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Sr
\S-  \S'   Sr\\R                     S-  \S'   Sr\\R                     S-  \S'   Sr\R                  S-  \S	'   Sr\R                  S-  \S
'   Sr\\\\R(                  \R(                  4   4   S-  \S'   Srg)Gemma4CausalLMOutputWithPastb   a  
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
    Language modeling loss (for next-token prediction).
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.text_config.vocab_size)`):
    Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
    It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

    Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
    `past_key_values` input) to speed up sequential decoding.
image_hidden_states (`torch.FloatTensor`, *optional*):
    A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
    image_hidden_states of the model produced by the vision encoder after projecting last hidden state.
audio_hidden_states (`torch.FloatTensor`, *optional*):
    A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
    audio_hidden_states of the model produced by the audio encoder and after projecting the last hidden state.
shared_kv_states (`dict`, *optional*):
    Dictionary mapping layer type strings to tuples of (key_states, value_states) tensors.
    Used to pass shared KV states between layers during KV sharing.
Nlosslogitspast_key_valueshidden_states
attentionsr6   r7   r8   r9   )r:   r;   r<   r=   r>   rL   r?   r@   rA   rM   rN   r   rO   rD   rP   r6   r7   r8   rB   rC   rE   rF   r9   rG   rH   rJ   rJ   b   s    * &*D%

d
")'+FE$+$(OUT\(59M5**+d2926Je''(4/648**T1848**T18LPd3ellELL&@ AABTIPrG   rJ   c                   j    \ rS rSr% SrSr\\\\	R                  \	R                  4   4   S-  \S'   Srg)Gemma4TextModelOutputWithPast   a!  
BaseModelOutputWithPast extended with shared_kv_states for KV sharing.

Args:
    shared_kv_states (`dict`, *optional*):
        Dictionary mapping layer type strings to tuples of (key_states, value_states) tensors.
        Used to pass shared KV states between layers during KV sharing.
Nr8   r9   )r:   r;   r<   r=   r>   r8   rB   rC   rD   r?   rE   rA   rF   r9   rG   rH   rR   rR      s7     MQd3ellELL&@ AABTIPrG   rR   c                   B    \ rS rSr% SrSr\R                  S-  \S'   Sr	g)Gemma4AudioModelOutput   z
attention_mask (`torch.BoolTensor`, *optional*):
    A torch.BoolTensor of shape `(batch_size, num_frames)`. True for valid positions, False for padding.
Nattention_maskr9   )
r:   r;   r<   r=   r>   rW   r?   
BoolTensorrA   rF   r9   rG   rH   rU   rU      s    
 /3NE$$t+2rG   rU   c                   |   ^  \ rS rSrS\\-  S\S\SS4U 4S jjrS\R                  S\R                  4S	 jr
S
rU =r$ )Gemma4ClippableLinear   configin_featuresout_featuresreturnNc                   > [         TU ]  5         UR                  U l        [        R                  " X#SS9U l        U R                  (       a  U R                  S[        R                  " [        S5      * 5      5        U R                  S[        R                  " [        S5      5      5        U R                  S[        R                  " [        S5      * 5      5        U R                  S[        R                  " [        S5      5      5        g g )NFbias	input_mininf	input_max
output_min
output_max)
super__init__use_clipped_linearsr   Linearlinearregister_bufferr?   tensorfloat)selfr\   r]   r^   	__class__s       rH   ri   Gemma4ClippableLinear.__init__   s     	#)#=#= iiF##  ellE%L=.IJ  ell5<.HI  u||U5\M/JK  u||E%L/IJ	 $rG   rO   c                    U R                   (       a+  [        R                  " XR                  U R                  5      nU R                  U5      nU R                   (       a+  [        R                  " XR                  U R                  5      nU$ N)rj   r?   clamprc   re   rl   rf   rg   )rp   rO   s     rH   forwardGemma4ClippableLinear.forward   sX    ##!KK~~t~~VMM2##!KKXMrG   )rl   rj   )r:   r;   r<   r=   r/   r,   intri   r?   rE   rv   rF   __classcell__rq   s   @rH   rZ   rZ      sY    K"%66K K 	K
 
K 	U\\ 	ell 	 	rG   rZ   c                      ^  \ rS rSrSS\S\S\4U 4S jjjrS\R                  4S jr
S\R                  S\R                  4S	 jrS
rU =r$ )Gemma4RMSNorm   dimeps
with_scalec                    > [         TU ]  5         X l        X0l        U R                  (       a/  [        R
                  " [        R                  " U5      SS9U l        g g )NT)requires_grad)	rh   ri   r   r   r   	Parameterr?   onesweight)rp   r~   r   r   rq   s       rH   ri   Gemma4RMSNorm.__init__   s>    $??,,uzz#dKDK rG   rO   c                     UR                  S5      R                  SSS9U R                  -   nU[        R                   " US5      -  $ )Nr)   T)keepdim      )powmeanr   r?   )rp   rO   mean_squareds      rH   _normGemma4RMSNorm._norm   sA    $((+00T0BTXXMuyyt<<<rG   r_   c                     U R                  UR                  5       5      nU R                  (       a  X R                  R                  5       -  nUR	                  U5      $ rt   )r   ro   r   r   type_as)rp   rO   normed_outputs      rH   rv   Gemma4RMSNorm.forward   sF    

=#6#6#89??)KK,=,=,??M$$]33rG   )r   r   r   )gư>T)r:   r;   r<   r=   rx   ro   boolri   r?   rE   r   rv   rF   ry   rz   s   @rH   r|   r|      sW    LC Le L L L=5<< =
4U\\ 4ell 4 4rG   r|   c                      ^  \ rS rSr% Sr\R                  \S'   S\4U 4S jjr	\R                  " 5       S\R                  S\R                  4S j5       rS	rU =r$ )
 Gemma4AudioRelPositionalEncoding   zSinusoidal relative positional encoding for the audio encoder.

Produces position embeddings of shape [1, context_size // 2 + 1, hidden_size] with
concatenated [sin..., cos...] layout matching the original Gemma4 convention.
inv_timescalesr\   c                   > [         TU ]  5         UR                  U l        UR                  UR                  -   S-
  UR
                  -   U l        SnSnU R                  S-  n[        R                  " X2-  5      [        US-
  S5      -  nU[        R                  " [        R                  " U5      U* -  5      -  nU R                  SUR                  S5      R                  S5      SS9  g )	Nr+         ?     @r)   r   r   F
persistent)rh   ri   hidden_sizeattention_chunk_sizeattention_context_leftattention_context_rightcontext_sizemathlogmaxr?   exparangerm   	unsqueeze)rp   r\   min_timescalemax_timescalenum_timescaleslog_timescale_incrementr   rq   s          rH   ri   )Gemma4AudioRelPositionalEncoding.__init__   s    !--''&*G*GG!KfNlNll 	 ))Q."&((=+H"ICP^abPbdeLf"f&5<<3OSjRj3j)kk-~/G/G/J/T/TUV/WdijrG   rO   r_   c                 b   [         R                  " U R                  S-  SSUR                  S9nUS   nX R                  R                  UR                  S9-  n[         R                  " [         R                  " U5      [         R                  " U5      /SS9nUR                  UR                  S9$ )Nr)   r   device.Nr~   dtype)
r?   r   r   r   r   tocatsincosr   )rp   rO   position_idsscaled_time	pos_embeds        rH   rv   (Gemma4AudioRelPositionalEncoding.forward   s    ||D$5$5$:B=K_K_`#I."%8%8%;%;=CWCW%;%XXIIuyy5uyy7MNTVW	||-"5"5|66rG   )r   r   )r:   r;   r<   r=   r>   r?   rE   rA   r,   ri   no_gradrv   rF   ry   rz   s   @rH   r   r      sS     LL k0 k ]]_7U\\ 7ell 7 7rG   r   c                   f  ^  \ rS rSrSrS\S\4U 4S jjrS\R                  S\R                  4S jr
S\R                  S\R                  4S	 jrS
\R                  S\R                  4S jr SS\R                  S\R                  S\R                  S-  S\\R                  S4   4S jjrSrU =r$ )Gemma4AudioAttention   z3Chunked local attention with relative position biasr\   	layer_idxc                   > [         TU ]  5         Xl        X l        UR                  U l        UR                  UR                  -  U l        UR                  U l	        U R                  S-  [        R                  " S5      -  U l        [        R                  " S[        R                  -   5      [        R                  " S5      -  U l        UR                  U l        UR"                  S-
  U l        UR&                  U l        U R                   U R$                  -   U R(                  -   U l        [-        XR                  U R                  U R                  -  5      U l        [-        XR                  U R                  U R                  -  5      U l        [-        XR                  U R                  U R                  -  5      U l        [-        XR                  UR                  5      U l        [6        R8                  " UR                  U R                  U R                  -  SS9U l        [6        R<                  " [>        R@                  " U R                  5      5      U l!        U RE                  S[>        RF                  " U R
                  5      SS9  g )Nr   r)   r+   Fra   softcapr   )$rh   ri   r\   r   attention_logit_capattention_logits_soft_capr   num_attention_headshead_dim	num_headsr   r   q_scaleek_scaler   
chunk_sizer   max_past_horizonr   max_future_horizonr   rZ   q_projk_projv_projpostr   rk   relative_k_projr   r?   zerosper_dim_scalerm   rn   rp   r\   r   rq   s      rH   ri   Gemma4AudioAttention.__init__   s   ")/)C)C&**f.H.HH33t+txx{:xxDFF
+dhhqk9 55 & = = A"("@"@ OOd.C.CCdF]F]]+F4F4FY]YfYfHfg+F4F4FY]YfYfHfg+F4F4FY]YfYfHfg)&2D2DfFXFXY	!yy););T^^dmm=[bgh\\%++dmm*DEYT5S5S(TafgrG   rO   r_   c           	         UR                   u  p#pEX0R                  -   S-
  U R                  -  nX`R                  -  U-
  n[        R                  " USSSSSU45      nUR	                  X&U R                  XE5      R                  5       $ )zSplits a `(batch_size, seq_len, num_heads, head_dim)` tensor into non-overlapping blocks of `chunk_size` along the sequence dim.r+   r   )shaper   Fpadreshape
contiguous)rp   rO   
batch_sizeseq_lenr   r   
num_blocksr   s           rH   _convert_to_block&Gemma4AudioAttention._convert_to_block  s|    3@3F3F0
Y/!3G
??*W4maAq!S-AB$$ZT__ibmmoorG   c           
      @   UR                   u  p#pE[        R                  " USSSSU R                  U R                  U R
                  -   S-
  45      nUR                  SU R                  U R
                  5      n[        R                  " USS5      nUR                  5       $ )z`Extracts overlapping context windows of `context_size` for every block, strided by `chunk_size`.r   r+   r   r)   )r   r   r   r   r   r   unfoldr   r?   movedimr   )rp   rO   r   r   r   r   s         rH   _extract_block_context+Gemma4AudioAttention._extract_block_context  s    3@3F3F0
YAq!Q(=(=t?V?VY]YhYh?hkl?lm
 &,,Q0A0A4??SmR;''))rG   xc                     UR                   u  p#pEnU R                  n[        R                  " USUS-   U-
  45      nUR	                  X#XEUS-   -  5      nUSSXW-  24   nUR	                  X#XEU5      $ )zjRelative position shift for blocked attention. See appendix B of https://huggingface.co/papers/1901.02860.r   r+   .N)r   r   r   r   view)rp   r   r   r   r   
block_sizeposition_lengthr   s           rH   
_rel_shiftGemma4AudioAttention._rel_shift#  s    IJF
z((EE!a)O;<=FF:*LSTDT6UVc.Z.../vvjZ\RRrG   Nposition_embeddingsrW   c                    UR                   u  pEnXEU R                  U R                  4nU R                  U5      R	                  5       R                  U5      nU R                  U5      R	                  5       R                  U5      n	U R                  U5      R	                  5       R                  U5      n
XR                  -  [        R                  " U R                  5      -  nXR                  -  n	U R                  U5      nU R                  U	5      n	U R                  U
5      n
UR                   S   nU R                  U5      nUR                  SU R                  U R                  5      nUR!                  UR"                  S9nUR%                  SSSSS5      nXR%                  SSSSS5      -  nUR'                  X@R                  SU R                  5      nXR%                  SSS5      -  nUR'                  X@R                  XR(                  S5      nU R+                  U5      nUU-   nUU R,                  -  n[.        R0                  " U5      nUU R,                  -  nUb4  UR3                  UR5                  5       U R6                  R8                  5      n[        R:                  " US[.        R<                  S9R!                  U
R"                  5      nUU
R%                  SSSSS5      -  nUR%                  SSSSS5      R'                  XKU R(                  -  S5      nUS S 2S U24   R?                  5       nU RA                  UR!                  U R@                  RB                  RD                  R"                  S95      nUU4$ )	Nr+   r   r   r   r	   r)      r~   r   )#r   r   r   r   ro   r   r   r   r   r   softplusr   r   r   r   r   r   r   permuter   r   r   r   r?   tanhmasked_filllogical_notr\   attention_invalid_logits_valuesoftmaxfloat32r   r   rl   r   )rp   rO   r   rW   r   
seq_length_hidden_shapequery_states
key_statesvalue_statesr   relative_key_statesqueries	matrix_acqueries_flat	matrix_bdattn_weightsattn_outputs                      rH   rv   Gemma4AudioAttention.forward,  s    %2$7$7!
"N{{=1779>>|L[[/557<<\J
{{=1779>>|L#ll2QZZ@R@R5SS,,.
--l;00<
22<@!''*
"223FG166r4>>4==Y144<;M;M4N&&q!Q1500Aq!Q??	z>>2t}}U #>#>q!Q#GG	%%j..*oo_ab	OOI.	 9,#dll2zz,/#dll2%'33**,dkk.X.XL yy2U]]KNN|OaOab"\%9%9!Q1a%HH!))!Q1a8@@Z^ZiZiMikmn!![j[.1<<>iiTYY5E5E5L5L5R5R STL((rG   )r   r   r\   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rt   )r:   r;   r<   r=   r>   r,   rx   ri   r?   rE   r   r   r   rX   rD   rv   rF   ry   rz   s   @rH   r   r      s    =h0 hS h4pu|| p p*ELL *U\\ *SELL SU\\ S 37	1)||1) #\\1) ((4/	1)
 
u||T!	"1) 1)rG   r   c                   l   ^  \ rS rSrU 4S jrSS\R                  S\R                  S-  4S jjrSrU =r	$ )	'Gemma4AudioSubSampleConvProjectionLayeri`  c           	         > [         TU ]  5         [        R                  " UUSSSSS9U l        [        R
                  " X#SSS9U l        [        R                  " 5       U l        g )N)r	   r	   )r)   r)   r+   F)in_channelsout_channelskernel_sizestridepaddingrb   T)r   elementwise_affinerb   )	rh   ri   r   Conv2dconv	LayerNormnormReLUact)rp   r  r  norm_epsrq   s       rH   ri   0Gemma4AudioSubSampleConvProjectionLayer.__init__a  sU    II#%
	 LLPT[`a	779rG   NrO   maskc           
         Ub(  UR                  UR                  S9nXS S 2S S S 2S 4   -  nU R                  UR                  U R                  R                  R                  5      5      nU R                  U R                  UR                  SSSS5      5      R                  SSSS5      R                  5       5      nUb  US S 2S S S24   nX4$ )Nr   r   r)   r	   r+   )	r   r   r  r   r   r  r  r   r   )rp   rO   r  s      rH   rv   /Gemma4AudioSubSampleConvProjectionLayer.forwardn  s    77-"6"677D)D!T1A,BBM		-"2"24993C3C3I3I"JK=+@+@Aq!+L!M!U!UVWYZ\]_`!a!l!l!no3Q3<D""rG   )r  r  r  rt   )
r:   r;   r<   r=   ri   r?   rE   rv   rF   ry   rz   s   @rH   r
  r
  `  s-    #U\\ #9L # #rG   r
  c            	          ^  \ rS rSrS\4U 4S jjr S
S\R                  S\R                  S-  S\\R                  \R                  4   4S jjr	S	r
U =r$ )"Gemma4AudioSubSampleConvProjectioni|  r\   c                 d  > [         TU ]  5         [        SUR                  S   UR                  S9U l        [        UR                  S   UR                  S   UR                  S9U l        UR                  S   S-  UR                  S   -  n[        R                  " X!R                  SS9U l
        g )Nr+   r   )r  r  r  r   Fra   )rh   ri   r
  subsampling_conv_channelsrms_norm_epslayer0layer1r   rk   r   input_proj_linear)rp   r\   proj_input_dimrq   s      rH   ri   +Gemma4AudioSubSampleConvProjection.__init__}  s    =99!<((

 >88;99!<((

 !::1=BfFfFfghFii!#>;M;MTY!ZrG   Ninput_featuresinput_features_maskr_   c                    UR                  S5      nU R                  X25      u  p4U R                  X45      u  p4UR                  u  pVpvUR	                  SSSS5      R                  5       R                  XWS5      nU R                  U5      U4$ )Nr+   r   r)   r	   r   )r   r"  r#  r   r   r   r   r$  )rp   r'  r(  rO   r  r   r   r   s           rH   rv   *Gemma4AudioSubSampleConvProjection.forward  s    
 '003"kk-M"kk->$1$7$7!
w%--aAq9DDFNNzdfg%%m4d::rG   )r$  r"  r#  rt   )r:   r;   r<   r=   r,   ri   r?   rE   rD   rv   rF   ry   rz   s   @rH   r  r  |  s\    [0 [$ 48;; #\\D0; 
u||U\\)	*	; ;rG   r  c                   j   ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ )Gemma4AudioFeedForwardi  r\   c                   > [         TU ]  5         Xl        [        XR                  UR                  S-  5      U l        [        XR                  S-  UR                  5      U l        [        UR                  5      U l        [        UR                  5      U l	        [        UR                     U l        UR                  U l        UR                  U l        g )Nr   )rh   ri   r\   rZ   r   ffw_layer_1ffw_layer_2r|   pre_layer_normpost_layer_normr   
hidden_actact_fngradient_clippingresidual_weightpost_layer_scalerp   r\   rq   s     rH   ri   Gemma4AudioFeedForward.__init__  s    09K9KVM_M_bcMcd09K9Ka9OQWQcQcd+F,>,>?,V-?-?@V../!'!9!9 & 6 6rG   rO   r_   c                    [        U R                  [        R                  " U R                  R
                  R                  R                  5      R                  5      nUn[        R                  " X* U5      nU R                  U5      nU R	                  U5      nU R                  U5      nU R                  U5      n[        R                  " X* U5      nU R                  U5      nXR                  -  nX-  nU$ rt   )minr4  r?   finfor.  rl   r   r   r   ru   r0  r3  r/  r1  r6  )rp   rO   r4  residuals       rH   rv   Gemma4AudioFeedForward.forward  s     6 6DDTDTD[D[DbDbDhDh8i8m8mn M3EGXY++M:((7M2((7M3EGXY,,];...!rG   )r3  r\   r.  r/  r4  r1  r6  r0  r:   r;   r<   r=   r,   ri   r?   rE   rv   rF   ry   rz   s   @rH   r,  r,    s0    70 7U\\ ell  rG   r,  c                   l   ^  \ rS rSr\S 5       rS\R                  S\R                  4U 4S jjrSr	U =r
$ )Gemma4AudioCausalConv1di  c                 n    U R                   S   S-
  U R                  S   -  S-   nXR                  S   -
  $ )Nr   r+   )r  dilationr  )rp   effective_kernel_sizes     rH   left_pad Gemma4AudioCausalConv1d.left_pad  s<    !%!1!1!!4q!8DMM!<L Lq P${{1~55rG   r   r_   c                 x   > [         R                  R                  XR                  S45      n[        TU ]  U5      $ Nr   )r   r   r   rD  rh   rv   )rp   r   rq   s     rH   rv   Gemma4AudioCausalConv1d.forward  s1     MMa--!34wq!!rG   r9   )r:   r;   r<   r=   r   rD  r?   rE   rv   rF   ry   rz   s   @rH   r@  r@    s;     6 6"<<" 
	" "rG   r@  c                   j   ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ )Gemma4AudioLightConv1di  r\   c                   > [         TU ]  5         Xl        [        XR                  UR                  S-  5      U l        [        XR                  UR                  5      U l        [        UR                  UR                  UR                  UR                  SS9U l	        [        UR                  UR                  SS9U l        [        UR                  UR                  SS9U l        [        UR                     U l        UR"                  U l        g )Nr)   F)r  r  r  groupsrb   Tr   r   )rh   ri   r\   rZ   r   linear_start
linear_endr@  conv_kernel_sizedepthwise_conv1dr|   r!  r0  	conv_normr   r2  r3  r4  r7  s     rH   ri   Gemma4AudioLightConv1d.__init__  s    1&:L:LfN`N`cdNde/8J8JFL^L^_ 7**++//%%!
 ,F,>,>FDWDWdhi&v'9'9v?R?R_cdV../!'!9!9rG   rO   r_   c                 B   UnU R                  U5      nU R                  U5      n[        R                  R	                  USS9nU R                  UR                  SS5      5      R                  SS5      n[        U R                  [        R                  " U R                  R                  R                  R                  5      R                  5      n[        R                  " X* U5      nU R!                  U5      nU R#                  U5      nU R%                  U5      nX-  nU$ )Nr   r   r+   r)   )r0  rN  r   r   glurQ  	transposer:  r4  r?   r;  rl   r   r   r   ru   rR  r3  rO  )rp   rO   r<  r4  s       rH   rv   Gemma4AudioLightConv1d.forward  s     ++M:))-8))-R)@--m.E.Ea.KLVVWXZ[\   6 6DDUDUD\D\DcDcDiDi8j8n8noM3EGXY}5M26!rG   )r3  r\   rR  rQ  r4  rO  rN  r0  r>  rz   s   @rH   rJ  rJ    s0    :0 :(U\\ ell  rG   rJ  c            
          ^  \ rS rSrS\S\4U 4S jjrS\R                  S\R                  S-  S\R                  S	\
\   S
\R                  4
S jrSrU =r$ )Gemma4AudioLayeri  r\   r   c                 l  > [         TU ]  5         Xl        [        U5      U l        [        U5      U l        [        X5      U l        [        U5      U l	        [        UR                  5      U l        [        UR                  5      U l        [        UR                  5      U l        UR                  U l        g rt   )rh   ri   r\   r,  feed_forward1feed_forward2r   	self_attnrJ  lconv1dr|   r   norm_pre_attnnorm_post_attnnorm_outr4  r   s      rH   ri   Gemma4AudioLayer.__init__	  s    3F;3F;-f@-f5*6+=+=>+F,>,>?%f&8&89!'!9!9rG   rO   rW   Nr   kwargsr_   c                 8   [        U R                  [        R                  " U R                  R
                  R                  5      R                  5      nU R                  U5      nUn[        R                  " X* U5      nU R	                  U5      nU R                  UUUS9u  p[        R                  " X* U5      nU R                  U5      nX-  nU R                  U5      nU R                  U5      n[        R                  " X* U5      nU R                  U5      nU$ )N)rO   r   rW   )r:  r4  r?   r;  r_  r   r   r   r[  ru   r]  r`  r^  r\  ra  )rp   rO   rW   r   rc  r4  r<  r   s           rH   rv   Gemma4AudioLayer.forward  s      6 6DDVDVD]D]DcDc8d8h8hi**=9 M3EGXY**=9>>' 3) * 
 M3EGXY++M:!]3**=9M3EGXYm4rG   )	r\   r[  r\  r4  r^  ra  r`  r_  r]  )r:   r;   r<   r=   r,   rx   ri   r?   rE   rX   r   r    rv   rF   ry   rz   s   @rH   rY  rY    sn    :0 :S : ||  ((4/  #\\	 
 +,  
   rG   rY  c                      ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  S\R                  4S jrS\R                  S\R                  S\R                  S\R                  4S	 jr	S
r
U =r$ )Gemma4VisionPatchEmbedderi>  r\   c                   > [         TU ]  5         Xl        UR                  U l        UR                  U l        UR
                  U l        [        R                  " SU R                  S-  -  U R                  SS9U l        [        R                  " [        R                  " SU R
                  U R                  5      5      U l        g )Nr	   r)   Fra   )rh   ri   r\   r   
patch_sizeposition_embedding_sizer   rk   
input_projr   r?   r   position_embedding_tabler7  s     rH   ri   "Gemma4VisionPatchEmbedder.__init__?  s    !-- ++'-'E'E$))A(:$:D<L<LSXY(*UZZ4C_C_aeaqaq5r(s%rG   pixel_position_idspadding_positionsr_   c                 B   UR                  SS9n[        R                  " X0R                  S9nUR	                  SSSS5      R                  U R                  5      nX@R                  -  nUR                  SS9n[        R                  " UR                  S5      S	U5      nU$ )
zDPrepare patch positions map for matmul with positon embedding table.r   r:  num_classesr)   r+   r	   r   r           )ru   r   one_hotrj  r   r   rl  sumr?   wherer   )rp   rn  ro  clamped_positionsru  r   s         rH   _position_embeddings.Gemma4VisionPatchEmbedder._position_embeddingsI  s     /444;))-;W;WX//!Q1-001N1NO%(E(EE155!5<#kk*;*E*Eb*I3Pcd""rG   pixel_valuesc                     SUS-
  -  nU R                  UR                  U R                   R                  R                  5      5      nU R	                  X#5      nXE-   $ )Nr)         ?)rk  r   r   r   ry  )rp   r{  rn  ro  rO   r   s         rH   rv   !Gemma4VisionPatchEmbedder.forwardV  sU     L3./8N8N8T8T(UV"778J^22rG   )r\   r   rk  ri  rj  rl  )r:   r;   r<   r=   r/   ri   r?   rE   ry  rv   rF   ry   rz   s   @rH   rg  rg  >  sz    t1 t#u|| #X]XdXd #iniuiu #3!LL3>Cll3_d_k_k3	3 3rG   rg  c                   @  ^  \ rS rSrSrS\4U 4S jjrS\R                  S\R                  S\	S\
\R                  \R                  4   4S	 jr SS\R                  S\R                  S\R                  S\	S
-  S\
\R                  \R                  4   4
S jjrSrU =r$ )Gemma4VisionPooleri`  z9Scaling and optional spatial pooling for vision encodingsr\   c                 l   > [         TU ]  5         UR                  U l        U R                  S-  U l        g )Nr}  )rh   ri   r   root_hidden_sizer7  s     rH   ri   Gemma4VisionPooler.__init__c  s/    !-- $ 0 0# 5rG   rO   rn  lengthr_   c                 z   UR                   S   n[        XC-  S-  5      nUS-  nXc-  U:w  a'  [        SUR                    SU SU< SU< SU S	35      eUR                  S
S9nUS   R	                  SSS9S
   S-   n[
        R                  " XuSS9n	U	S   X-  U	S   -  -   n	[        R                  " U	R                  5       U5      R                  5       U-  n
U
R                  SS5      UR                  5       -  n[
        R                  " U
S
:H  R                  SS95      nUR                  UR                  5      U4$ )z
2D spatial pooling according to patch positions.
Pools the input tokens by averaging patches within a `k^2` grid, where `k` is determined by the ratio between
input and output lengths
r+   r}  r)   zCannot pool z to z: k=z^2 times length=z	 must be .r   rq  .r   r   Tr~   r   floor)rounding_mode).r+   r   )r   rx   
ValueErrorru   r   r?   divr   ru  longro   rV  r   allr   r   )rp   rO   rn  r  input_seq_lenk	k_squaredrx  max_xkernel_idxsweightsoutputr  s                rH   _avg_pool_by_positions)Gemma4VisionPooler._avg_pool_by_positionsh  s`    &++A.(S01qD	.}2234xu!EVviW`an`oopq  /444;!&)--"d-CAFJii 1GL!&)UZ;v;N,NN))K,,.7==?)K""1a(=+>+>+@@  'Q,!3!3!3!:;yy,,-t33rG   Nro  output_lengthc                    XAR                   S   :  a  [        SU SUR                   S    S35      eUR                  UR                  S5      S5      nUR                   S   U:w  a  U R	                  XU5      u  pXR
                  -  nX4$ )Nr+   z*Cannot output more soft tokens (requested z) than there are patches (z9). Change the value of `num_soft_tokens` when processing.r   rt  )r   r  r   r   r  r  )rp   rO   rn  ro  r  s        rH   rv   Gemma4VisionPooler.forward  s     ..q11<]O L"((+,,eg 
 &112C2M2Mb2QSVWq!]2/3/J/J=0,M 	...//rG   )r   r  rt   )r:   r;   r<   r=   r>   r/   ri   r?   rE   rx   rD   r  rv   rF   ry   rz   s   @rH   r  r  `  s    C61 6
4"\\4?D||4UX4	u||U\\)	*4@ %)0||0 "LL0 !<<	0
 Tz0 
u||U\\)	*0 0rG   r  c                   6   ^  \ rS rSrS\4U 4S jjrS rSrU =r$ )Gemma4VisionMLPi  r\   c                   > [         TU ]  5         Xl        UR                  U l        UR                  U l        [        XR                  U R                  5      U l        [        XR                  U R                  5      U l        [        XR                  U R                  5      U l        [        UR                     U l        g rt   )rh   ri   r\   r   intermediate_sizerZ   	gate_projup_proj	down_projr   hidden_activationr3  r7  s     rH   ri   Gemma4VisionMLP.__init__  s    !--!'!9!9.v7G7GI_I_`,V5E5EtG]G]^.v7M7MtO_O_`V556rG   c                     U R                  U R                  U R                  U5      5      U R                  U5      -  5      nU$ rt   r  r3  r  r  rp   r   r  s      rH   rv   Gemma4VisionMLP.forward  6    NN4;;t~~a/@#ADLLQRO#ST	rG   r3  r\   r  r  r   r  r  )	r:   r;   r<   r=   r/   ri   rv   rF   ry   rz   s   @rH   r  r    s    71 7 rG   r  c                      ^  \ rS rSr% \R
                  \S'   SS\4U 4S jjjr\	   SS\S-  S\R                  S-  S\S-  S\S	\4   4S
 jj5       r\R                  " 5       \S 5       5       rSrU =r$ )Gemma4VisionRotaryEmbeddingi  inv_freqNr\   c                   > [         TU ]  5         UR                  U l        UR                  U l        Xl        U R
                  R                  S   U l        U R                  nU R                  S:w  a  [        U R                     nU" U R
                  U5      u  o@l
        U R                  SUSS9  U R                  SUR                  5       SS9  g )N	rope_typedefaultr  Fr   original_inv_freq)rh   ri   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenr\   rope_parametersr  compute_default_rope_parametersr   attention_scalingrm   clone)rp   r\   r   rope_init_fnr  rq   s        rH   ri   $Gemma4VisionRotaryEmbedding.__init__  s    "("@"@$*$B$B!44[A!%!E!E>>Y&.t~~>L+7V+L((ZeD0(..2BuUrG   r   r   r_   torch.Tensorc           	      "   U R                   S   n[        U SS5      =(       d    U R                  U R                  -  nUS-  nSnSU[        R
                  " SUS[        R                  S9R                  U[        R                  S9U-  -  -  nXv4$ )	aH  
Computes the inverse frequencies according to the original RoPE implementation
Args:
    config ([`~transformers.PreTrainedConfig`]):
        The model configuration.
    device (`torch.device`):
        The device to use for initialization of the inverse frequencies.
    seq_len (`int`, *optional*):
        The current sequence length. Unused for this type of RoPE.
Returns:
    Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
    post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).

rope_thetar   Nr)   r   r   r   r   r   	r  getattrr   r   r?   r   int64r   ro   )r\   r   r   baser~   spatial_dimattention_factorr  s           rH   r  ;Gemma4VisionRotaryEmbedding.compute_default_rope_parameters  s    & %%l3fj$/c63E3EIcIc3c QhQQekkBEEV[`[f[fEgjuuw
 ))rG   c                 $   U R                   S S S 2S 4   R                  5       R                  UR                  S   SS5      R	                  UR
                  5      n[        UR
                  R                  [        5      (       a0  UR
                  R                  S:w  a  UR
                  R                  OSn/ / pe[        S5       H  nUS S 2S S 2U4   nUS S 2S S S 24   R                  5       n	[        USS9   UR                  5       U	R                  5       -  R                  SS5      n
[        R                  " X4SS	9nUR                  5       U R                  -  nUR!                  5       U R                  -  nS S S 5        UR#                  W5        UR#                  W5        M     [        R                  " USS	9R	                  UR$                  S
9n[        R                  " USS	9R	                  UR$                  S
9nX4$ ! , (       d  f       N= f)Nr   r   r+   mpscpur)   Fdevice_typeenabledr   r   )r  ro   expandr   r   r   
isinstancetyperC   ranger%   rV  r?   r   r   r  r   appendr   )rp   r   r   inv_freq_expandedr  all_cosall_sinidim_position_idsdim_position_ids_expandedfreqsembr   r   s                 rH   rv   #Gemma4VisionRotaryEmbedding.forward  s    !MM$4-8>>@GGHZHZ[\H]_acdehhijiqiqr'1!((--'E'E!((--[`J`ahhmmfk rqA+Aq!G4(8D!(D(J(J(L%KG*0025N5T5T5VVaabcefgiiB7ggi$"8"88ggi$"8"88	 H
 NN3NN3  iiR(++!''+:iiR(++!''+:x HGs   6BH
H	)r  r\   r  r  r  rt   NNN)r:   r;   r<   r=   r?   rE   rA   r/   ri   staticmethodr   rx   rD   ro   r  r   r   rv   rF   ry   rz   s   @rH   r  r    s    llV1 V V  ,0&*" *"T) *t# * t * 
~u$	%	 *  *D ]]_  rG   r  c                     U SSU R                   S   S-  24   nU SU R                   S   S-  S24   n[        R                  " U* U4SS9$ )z*Rotates half the hidden dims of the input..Nr   r)   r   )r   r?   r   )r   x1x2s      rH   rotate_halfr    sZ    	
3"!''"+"""	#B	
3q ""	#B99rc2YB''rG   r   r   r   unsqueeze_dimc                 l    UR                  U5      nUR                  U5      nX-  [        U 5      U-  -   $ )a$  Applies Rotary Position Embedding to the query and key tensors.

Args:
    x (`torch.Tensor`): The tensor to embed.
    cos (`torch.Tensor`): The cosine part of the rotary embedding.
    sin (`torch.Tensor`): The sine part of the rotary embedding.
    unsqueeze_dim (`int`, *optional*, defaults to 1):
        The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
        sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
        that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
        k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
        cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
        the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
Returns:
    `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
)r   r  r   r   r   r  s       rH   apply_rotary_pos_embr    s6    " --
&C
--
&CGA,--rG   rO   n_repr_   c                     U R                   u  p#pEUS:X  a  U $ U SS2SS2SSS2SS24   R                  X#XU5      n U R                  X#U-  XE5      $ )z
This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
r+   N)r   r  r   )rO   r  batchnum_key_value_headsslenr   s         rH   	repeat_kvr    s_    
 2?1D1D.Ez!!Qa"23::5W\dlmM  e(CTTTrG   modulequerykeyvaluerW   dropoutscalingr   c                 j   Uc  U R                   S-  n[        X R                  5      n	[        X0R                  5      n
[        R                  " XR                  SS5      5      U-  nUb  X-  n[        R                  " U5      nX-  nUb  X-   n[        R                  R                  US[        R                  S9R                  UR                  5      n[        R                  R                  XU R                  S9n[        R                  " X5      nUR                  SS5      R                  5       nX4$ )Nr   r)   r	   r   r   )ptrainingr+   )r   r  num_key_value_groupsr?   matmulrV  r   r   r   r   r   r   r   r  r  r   )r  r  r  r  rW   r  r  r   rc  r   r   r  r  s                rH   eager_attention_forwardr  #  s    //4'3 ; ;<JU$?$?@L<<';';Aq'ABWLL#-zz,/#-!#4 ==((2U]](SVVW\WbWbcL==((6??([L,,|:K''1-88:K$$rG   r   c           
         UR                   S   nU R                   S   nSUSU-  -  -  nUS::  a  [        SU SU SU S35      eU/U-  n[        R                  " XSS9n	[        R                  " XSS9n
[        R                  " X(SS9n[	        U5       Vs/ s H  n[        X   X   X   US	9PM     nn[        R                  " USS9$ s  snf )
a#  Applies multidimensional RoPE to inputs.

Args:
    x (`torch.Tensor`): The tensor to embed.
    cos (`torch.Tensor`): The cosine part of the rotary embedding.
    sin (`torch.Tensor`): The sine part of the rotary embedding.
    position_ids (`torch.Tensor`, *optional*):
        If position_ids.ndim + 2 == x.ndim, then this function passes through to `apply_rotary_pos_emb()`.
        Otherwise, position_ids is used to split the inputs, x, into multiple pieces, where each piece is fed to
        `apply_rotary_pos_emb()`, and then concatenated back together.
    unsqueeze_dim (`int`, *optional*, defaults to 1):
        The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
        sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
        that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
        k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
        cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
        the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.

Returns:
  Tensor of shape [B, L, N, H] with RoPE applied.
r   r)   r   zEInvalid configuration: num_rotated_channels_per_dim must be > 0, got z (num_input_channels=z, ndim=)r   r  )r   r  r?   splitr  r  r   )r   r   r   r   r  ndimnum_input_channelsnum_rotated_channels_per_dimsplit_sizesx_parts	cos_parts	sin_partsr  y_partss                 rH   apply_multidimensional_roper   E  s
   8 b!D#$(:q4x(H#I #q(,--BCUBV WF!
 	
 0047Kkk!b1GC"5IC"5I t A 	j'		
    99W"%%s   C
c                      ^  \ rS rSrSrS\S\4U 4S jjr   SS\R                  S\R                  S	\R                  S-  S
\R                  S-  S\\   S\\R                  \R                  S-  \\R                     S-  4   4S jjrSrU =r$ )Gemma4VisionAttentioni}  =Multi-headed attention from 'Attention Is All You Need' paperr\   r   c                   > [         TU ]  5         [        US5      (       a  UR                  U   OS U l        Xl        X l        [        USUR                  UR                  -  5      U l
        UR                  UR                  -  U l        SU l        U R
                  R                  U l        SU l        [!        XR                  UR                  U R                  -  5      U l        [!        XR                  UR                  U R                  -  5      U l        [!        XR                  UR                  U R                  -  5      U l        [!        XR                  U R                  -  UR                  5      U l        [+        UR                  UR,                  S9U l        [+        UR                  UR,                  S9U l        [+        U R                  UR,                  SS9U l        g )Nlayer_typesr   r   Fr~   r   rM  )rh   ri   hasattrr  
layer_typer\   r   r  r   r   r   r  r  r  attention_dropout	is_causalrZ   r   r   r   o_projr|   r!  q_normk_normv_normr   s      rH   ri   Gemma4VisionAttention.__init__  sw   ;B6=;Y;Y&,,Y7_c"
F4F4F&JdJd4de$*$>$>&B\B\$\!!%!>!>+F4F4FHbHbeiererHrs+F4F4FHbHbeiererHrs+F4F4FHbHbeiererHrs+F4N4NQUQ^Q^4^`f`r`rs#V=P=PQ#V=P=PQ#DMMv7J7JW\]rG   NrO   r   rW   r   rc  r_   c                 L   UR                   S S n/ UQSPU R                  P7nUu  pU R                  U5      R                  U5      n
U R	                  U
5      n
[        XX5      n
U
R                  SS5      n
U R                  U5      R                  U5      nU R                  U5      n[        XX5      nUR                  SS5      nU R                  U5      R                  U5      nU R                  U5      nUR                  SS5      n[        R                  " U R                  R                  [        5      nU" U U
UUU4U R                   (       a  U R"                  OSU R$                  S.UD6u  pUR&                  " / UQSP76 R)                  5       nU R+                  U5      nX4$ )Nr   r+   r)   rt  )r  r  )r   r   r   r   r  r   rV  r   r  r   r  r   get_interfacer\   _attn_implementationr  r  r	  r  r   r   r  )rp   rO   r   rW   r   rc  input_shaper   r   r   r   r   r   attention_interfacer  r  s                   rH   rv   Gemma4VisionAttention.forward  s    $))#2.88b8$--8&{{=166|D{{<02<cX#--a3[[/44\B
[[,
0#T
))!Q/
{{=166|D{{<0#--a3(?(M(MKK,,.E)
 %8	%
 /3mmD**LL	%
 	%
! "));;;;FFHkk+.((rG   )r	  r\   r   r
  r  r   r   r  r  r  r  r   r  r  r   r  )r:   r;   r<   r=   r>   r/   rx   ri   r?   rE   
LongTensorr   r    rD   rv   rF   ry   rz   s   @rH   r  r  }  s    G^1 ^c ^, -1.204,)||,) #\\,) t+	,)
 &&-,) +,,) 
u||U\\D0%2E2LL	M,) ,)rG   r  c                     ^  \ rS rSrS\S\4U 4S jjr   SS\R                  S\R                  S\R                  S-  S	\R                  S-  S
\
\   S\\R                  \\R                  \R                  4   S-  4   4S jjrSrU =r$ )Gemma4VisionEncoderLayeri  r\   r   c                   > [         TU ]  5         Xl        UR                  U l        X l        [        XS9U l        [        U5      U l        [        U R                  UR                  S9U l        [        U R                  UR                  S9U l        [        U R                  UR                  S9U l        [        U R                  UR                  S9U l        g )Nr\   r   r   )rh   ri   r\   r   r   r  r]  r  mlpr|   r!  input_layernormpost_attention_layernormpre_feedforward_layernormpost_feedforward_layernormr   s      rH   ri   !Gemma4VisionEncoderLayer.__init__  s    !--".fR"6*,T-=-=6CVCVW(5d6F6FFL_L_(`%)6t7G7GVM`M`)a&*78H8HfNaNa*b'rG   NrO   r   rW   r   rc  r_   c                     UnU R                  U5      nU R                  " SUUUUS.UD6u  pU R                  U5      nXa-   nUnU R                  U5      nU R	                  U5      nU R                  U5      nXa-   nU$ )N)rO   r   rW   r   r9   )r  r]  r  r  r  r   )rp   rO   r   rW   r   rc  r<  r   s           rH   rv    Gemma4VisionEncoderLayer.forward  s     !,,];>> 
' 3)%	

 
 55mD 0 66}E/77F 0rG   )	r\   r   r  r   r  r  r   r  r]  r  )r:   r;   r<   r=   r/   rx   ri   r?   rE   r  r   r    rD   r@   rv   rF   ry   rz   s   @rH   r  r    s    
c1 
cc 
c -1.204|| #\\ t+	
 &&- +, 
u  %(9(95;L;L(L"MPT"TT	U rG   r  c                      ^  \ rS rSrS\4U 4S jjr SS\R                  S\R                  S\R                  S-  S\	\
   S	\4
S
 jjrSrU =r$ )Gemma4VisionEncoderi  r\   c           
        > [         TU ]  5         Xl        UR                  U l        [        U5      U l        [        R                  " [        U R                  5       Vs/ s H  n[        XS9PM     sn5      U l        g s  snf )Nr  )rh   ri   r\   num_hidden_layers
num_layersr  
rotary_embr   
ModuleListr  r  layers)rp   r\   r  rq   s      rH   ri   Gemma4VisionEncoder.__init__  se     225f=mmKPQUQ`Q`KabKaa%VAKab
bs   A>Ninputs_embedsrW   rn  rc  r_   c                     [        U R                  UUS9nUnU R                  XS5      nU R                  SU R                  R                    H  nU" U4UUUS.UD6nM     [        US9$ )zw
pixel_position_ids (torch.Tensor):
    Patch positions as (x, y) coordinates in the image as [batch, num_patches, 2].
)r\   r-  rW   N)rW   r   r   last_hidden_state)r   r\   r)  r+  r'  r   )rp   r-  rW   rn  rc  rO   r   decoder_layers           rH   rv   Gemma4VisionEncoder.forward  s     3;;')
 &"oomP "[[)H4;;+H+HIM)-$7/	
 M J 'GGrG   )r\   r+  r(  r)  rt   )r:   r;   r<   r=   r/   ri   r?   rE   r  r   r    r   rv   rF   ry   rz   s   @rH   r%  r%    so    
1 
 7;	H||H H ",,t3	H
 +,H 
!H HrG   r%  c                   :   ^  \ rS rSrS\S\4U 4S jjrS rSrU =r	$ )Gemma4TextMLPi  r\   r   c                 X  > [         TU ]  5         UR                  UR                  -
  nX#s=:  =(       a    S:  Os  nUR                  =(       a    UnXl        UR                  U l        UR                  U(       a  SOS-  U l        [        R                  " U R                  U R                  SS9U l
        [        R                  " U R                  U R                  SS9U l        [        R                  " U R                  U R                  SS9U l        [        UR                     U l        g )Nr   r)   r+   Fra   )rh   ri   r'  num_kv_shared_layersuse_double_wide_mlpr\   r   r  r   rk   r  r  r  r   r  r3  )rp   r\   r   first_kv_shared_layer_idxis_kv_shared_layerr7  rq   s         rH   ri   Gemma4TextMLP.__init__  s    $*$<$<v?Z?Z$Z!&GGaG$88O=O!--!'!9!9BUQ[\!]4#3#3T5K5KRWXyy!1!143I3IPUV4#9#94;K;KRWXV556rG   c                     U R                  U R                  U R                  U5      5      U R                  U5      -  5      nU$ rt   r  r  s      rH   rv   Gemma4TextMLP.forward)  r  rG   r  )
r:   r;   r<   r=   r.   rx   ri   rv   rF   ry   rz   s   @rH   r4  r4    s!    7/ 7C 7 rG   r4  c                      ^  \ rS rSr% \R
                  \S'   SS\4U 4S jjjr\	    SS\S-  S\
S   S\S-  S	\S-  S
\S\4   4
S jj5       r\R                   " 5       \SS j5       5       rSrU =r$ )Gemma4TextRotaryEmbeddingi.  r  Nr\   c                 |  > [         T
U ]  5         UR                  U l        UR                  U l        Xl        [        UR                  5      U l        0 U l        0 U l	        U R                   H  nU R
                  R                  U   nUc  M!  US   =nS:w  a
  [        U   nOU R                  nX`R                  U'   XPR                  U'   X#S.nUS:X  a  US:X  a  SUS'   U" U R
                  40 UD6u  pU R                  U S3US	S
9  U R                  U S3UR                  5       S	S
9  [        X S3U	5        M     g )Nr  r  )r   r  full_attentionproportionalglobal_head_dimhead_dim_key	_inv_freqFr   _original_inv_freq_attention_scaling)rh   ri   r  r  r  r\   setr  rope_init_fnsr  r  r   r  rm   r  setattr)rp   r\   r   r  rope_paramsr  r  rope_init_fn_kwargscurr_inv_freqcurr_attention_scalingrq   s             rH   ri   "Gemma4TextRotaryEmbedding.__init__1  sQ   "("@"@$*$B$B!v112SU)+**J++55jAK"(55	)C29=#CC-9z*)2NN:&-3"N--)~2M6G#N34@4dPc4d1M  J<y!9=UZ [  J</A!BMDWDWDYfk lDL(:;=ST) +rG   r   ztorch.devicer   r  r_   r  c           	         U R                   U   S   n[        U SS5      =(       d    U R                  U R                  -  nSnSU[        R
                  " SUS[        R                  S9R                  U[        R                  S9U-  -  -  nXv4$ )	a  
Computes the inverse frequencies according to the original RoPE implementation
Args:
    config ([`~transformers.PreTrainedConfig`]):
        The model configuration.
    device (`torch.device`):
        The device to use for initialization of the inverse frequencies.
    seq_len (`int`, *optional*):
        The current sequence length. Unused for this type of RoPE.
    layer_type (`str`, *optional*):
        The current layer type if the model has different RoPE parameters per type.
        Should not be used unless `config.layer_types is not None`

Returns:
    Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
    post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
r  r   Nr   r   r)   r   r  r  )r\   r   r   r  r  r~   r  r  s           rH   r  9Gemma4TextRotaryEmbedding.compute_default_rope_parametersQ  s    2 %%j1,?fj$/c63E3EIcIc3c U\\!S!5;;?BB&X]XcXcBdgjjk
 ))rG   c                 H   [        X S35      n[        X S35      nUS S S 2S 4   R                  5       R                  UR                  S   SS5      R	                  UR
                  5      nUS S 2S S S 24   R                  5       n[        UR
                  R                  [        5      (       a0  UR
                  R                  S:w  a  UR
                  R                  OSn[        USS	9   UR                  5       UR                  5       -  R                  SS
5      n	[        R                  " X4SS9n
U
R                  5       U-  nU
R                  5       U-  nS S S 5        WR	                  UR                  S9WR	                  UR                  S94$ ! , (       d  f       N@= f)NrD  rF  r   r   r+   r  r  Fr  r)   r   r   )r  ro   r  r   r   r   r  r  rC   r%   rV  r?   r   r   r   r   )rp   r   r   r  r  r  r  position_ids_expandedr  r  r  r   r   s                rH   rv   !Gemma4TextRotaryEmbedding.forwardu  sd    4<y!9:#DL8J*KL$T1d]399;BB<CUCUVWCXZ\^_`ccdedldlm ,QaZ 8 > > @'1!((--'E'E!((--[`J`ahhmmfkUC&,,.1F1L1L1NNYYZ[]^_E))UN3C'')//C'')//C	 D vvAGGv$cff177f&;;; DCs   +A.F
F!)r\   r  r  r  rH  r  NN)NNNNrt   )r:   r;   r<   r=   r?   rE   rA   r.   ri   r  r   rx   rC   rD   ro   r  r   r   rv   rF   ry   rz   s   @rH   r>  r>  .  s    llU/ U U@ *.+/"!%	!* 4'!*(!* t!* $J	!*
 
~u$	%!* !*F ]]_<  <rG   r>  c                   &  ^  \ rS rSrSrS\S\4U 4S jjr SS\R                  S\R                  S	\R                  S-  S
\
\\\R                  \R                  4   4   S\S-  S\\   S\\R                  \R                  S-  4   4S jjrSrU =r$ )Gemma4TextAttentioni  r  r\   r   c                    > [         TU ]  5         [        US5      (       a  UR                  U   OS U l        Xl        X l        U R                  S:H  U l        U R                  (       a  UR                  OS U l        U R                  (       d  UR                  (       a  UR                  OUR                  U l
        UR                  =(       a    U R                  (       + U l        U R                  (       a  UR                  OUR                  nUR                  U-  U l        SU l        U R
                  R$                  U l        UR&                  S:g  U l        U R
                  R*                  [-        U R
                  SS5      -
  nX$s=:  =(       a    S:  Os  U l        UR                  S U nU R.                  (       + =(       a6    U[1        U5      S-
  US S S2   R3                  UR                  U   5      -
  :H  U l        [6        R8                  " UR:                  UR                  U R                  -  UR<                  S	9U l        [A        U R                  URB                  S
9U l"        U R.                  (       d  [A        U R                  URB                  S
9U l#        [A        U R                  URB                  SS9U l$        [6        R8                  " UR:                  X0R                  -  UR<                  S	9U l%        U R                  (       d6  [6        R8                  " UR:                  X0R                  -  UR<                  S	9OS U l&        [6        R8                  " UR                  U R                  -  UR:                  UR<                  S	9U l'        g )Nr  sliding_attentionr   r  r6  r   r+   r   ra   r  FrM  )(rh   ri   r  r  r  r\   r   
is_slidingsliding_windowrB  r   attention_k_eq_vuse_alternative_attentionnum_global_key_value_headsr  r   r  r  r	  use_bidirectional_attentionr
  r'  r  r9  lenindexstore_full_length_kvr   rk   r   attention_biasr   r|   r!  r  r  r  r   r   r  )rp   r\   r   r  r8  prev_layersrq   s         rH   ri   Gemma4TextAttention.__init__  s   ;B6=;Y;Y&,,Y7_c"//-@@7;f33D6:oo&J`J`..flfufu)/)@)@)XEX&151O1OF--U[UoUo 	 %+$>$>BU$U!!%!>!>;;uD %)KK$A$AGDKKYoqrDs$s!"+"M"MA"M(()C*CD(,(?(?$? %/IQTU`QadeQehsbDi

%""9-
.R/ E/! ii : :T]] JQWQfQf
 $6;N;NO &&'DMMv?R?RSDK'6;N;N[`aDK))""$7--$GfNcNcDK
 55 		&,,.AMM.QX^XmXmn K ii&&68J8JQWQfQf
rG   NrO   r   rW   r8   rN   rc  r_   c                    UR                   S S n/ UQSPU R                  P7nUu  pU R                  U5      R                  U5      nU R	                  U5      n[        XU
SS9nUR                  SS5      nU R                  (       aG  X@R                     u  pUR                  UR                  5      nUR                  UR                  5      nOU R                  U5      R                  U5      nU R                  b   U R                  U5      R                  U5      OUnU R                  U5      n[        XU
SS9nUR                  SS5      nU R                  U5      nUR                  SS5      nUb/  U R                  (       d  UR                  XU R                   5      u  pU R"                  (       a  X4X@R                  '   [$        R&                  " U R(                  R*                  [,        5      nU" U UUUU4U R.                  (       a  U R0                  OSU R2                  U R4                  S.UD6u  nnUR6                  " / UQSP76 R9                  5       nU R;                  U5      nUU4$ )Nr   r)   )r  r+   rt  )r  r  rZ  )r   r   r   r   r  r  rV  r9  r  r   r   r   r   r  r  updater   ra  r   r  r\   r  r  r  r	  r  rZ  r   r   r  )rp   rO   r   rW   r8   rN   rc  r  r   r   r   r   r   r   r  r  r  s                    rH   rv   Gemma4TextAttention.forward  s>    $))#2.88b8$--8&{{=166|D{{<0+LsRST#--a3
 ""'7'H$J#|':':;J'??<+>+>?L]388FJLPKKLc4;;}5::<HisLZ0J-jsRSTJ#--a3J;;|4L'11!Q7L&t/F/F'6'='=jX\XfXf'g$J$$0:0H__-(?(M(MKK,,.E)
 %8
%
 /3mmD**LL..
%
 
%
!\ "));;;;FFHkk+.L((rG   )r	  r\   r   r
  r9  rY  r  r   r   r  r  r  r  r   r  rZ  ra  r\  r  r   rt   )r:   r;   r<   r=   r>   r.   rx   ri   r?   rE   rB   rC   rD   r   r   r   rv   rF   ry   rz   s   @rH   rV  rV    s    G/
/ /
C /
n )-=)||=) #\\=) t+	=)
 sE%,,*D$EEF=) =) -.=) 
u||U\\D00	1=) =)rG   rV  c                      ^  \ rS rSrSrS\4U 4S jjrS\R                  S\R                  S\R                  S\R                  4S	 jr	S
r
U =r$ )Gemma4TextExpertsi  z2Collection of expert weights stored as 3D tensors.r\   c                   > [         TU ]  5         UR                  U l        UR                  U l        UR
                  U l        [        R                  " [        R                  " U R                  SU R                  -  U R                  5      5      U l        [        R                  " [        R                  " U R                  U R                  U R                  5      5      U l        [        UR                     U l        g )Nr)   )rh   ri   num_expertsr   
hidden_dimmoe_intermediate_sizeintermediate_dimr   r   r?   emptygate_up_projr  r   r  r3  r7  s     rH   ri   Gemma4TextExperts.__init__   s    !-- ,, & < <LLT5E5Eq4K`K`G`bfbqbq)rsekk$2B2BDOOUYUjUj&klV556rG   rO   top_k_indextop_k_weightsr_   c                 X   [         R                  " U5      n[         R                  " 5          [         R                  R                  R                  X R                  S9nUR                  SSS5      n[         R                  " UR                  SS9S5      R                  5       nS S S 5        W H  nUS   nXpR                  :X  a  M  [         R                  " WU   5      u  pX   n
[        R                  R                  XR                  U   5      R                  SSS9u  pU R                  U5      U-  n[        R                  R                  XR                   U   5      nXXS 4   -  nUR#                  SXR%                  UR&                  5      5        M     U$ ! , (       d  f       N= f)Nrr  r)   r+   r   )r   r   r   )r?   
zeros_liker   r   r   ru  rk  r   greaterrv  nonzerorw  rl   rp  chunkr3  r  
index_add_r   r   )rp   rO   rr  rs  final_hidden_statesexpert_mask
expert_hit
expert_idx	top_k_pos	token_idxcurrent_stategateupcurrent_hidden_statess                 rH   rv   Gemma4TextExperts.forward	  so    $..}=]]_((--55kO_O_5`K%--aA6K{8'DaHPPRJ 
 %J#AJ---#(;;{:/F#G I)4M}}++M;L;LZ;XY__`agi_jHD$(KK$5$:!$&MM$8$89NP^P^_iPj$k!$9)`dJd<e$e!**1i9Q9QReRkRk9lm % #"# _s   A7F
F))r3  r  rp  rl  rn  rk  )r:   r;   r<   r=   r>   r.   ri   r?   rE   rv   rF   ry   rz   s   @rH   ri  ri    sS    <7/ 7#||# \\# ||	#
 
# #rG   ri  c                      ^  \ rS rSrS\4U 4S jjrS\R                  S\\R                  \R                  4   4S jr	Sr
U =r$ )Gemma4TextRouteri$  r\   c                 $  > [         TU ]  5         Xl        UR                  U l        U R                  S-  U l        UR
                  U l        [        U R                  U R                  SS9U l        [        R                  " UR                  UR                  SS9U l        [        R                  " [        R                  " U R                  5      5      U l        [        R                  " [        R                  " UR                  5      5      U l        g )Nr   FrM  ra   )rh   ri   r\   r   scalar_root_sizer!  r   r|   r  r   rk   rk  projr   r?   r   scaleper_expert_scaler7  s     rH   ri   Gemma4TextRouter.__init__%  s    !-- $ 0 0$ 6&&!$"2"2US	IIf00&2D2D5Q	\\%**T-=-=">?
 "UZZ8J8J-K LrG   rO   r_   c                 ^   U R                  U5      nXR                  -  U R                  -  nU R                  U5      n[        R
                  R                  USS9n[        R                  " UU R                  R                  SS9u  pEXDR                  SSS9-  nX@R                  U   -  nX4U4$ )Nr   r   )r  r~   Tr  )r  r  r  r  r   r   r   r?   topkr\   top_k_expertsrv  r  )rp   rO   expert_scoresrouter_probabilitiesrs  rr  s         rH   rv   Gemma4TextRouter.forward1  s    		-0%

2T5J5JJ		-0!}}44]4K &+ZZ kk''&
" 	**r4*@@ &(=(=k(JJ#K??rG   )r\   r   r   r  r  r  r  r  )r:   r;   r<   r=   r.   ri   r?   rE   rD   rv   rF   ry   rz   s   @rH   r  r  $  sD    
M/ 
M@U\\ @eELL%,,<V6W @ @rG   r  c                   @  ^  \ rS rSrS\\-  S\4U 4S jjr      SS\R                  S\R                  S\
\\\R                  \R                  4   4   S-  S	\R                  S
\R                  S-  S\R                  S-  S\S-  S\R                  4S jjrSrU =r$ )Gemma4TextDecoderLayeriH  r\   r   c                   > [         TU ]  5         Xl        UR                  U l        X l        [        XS9U l        [        X5      U l        [        U R                  UR                  S9U l        [        U R                  UR                  S9U l        [        U R                  UR                  S9U l        [        U R                  UR                  S9U l        U R                  S[         R"                  " S5      5        UR$                  U l        U R$                  (       a  [&        UR(                     U l        [,        R.                  " U R                  U R$                  SS9U l        [,        R.                  " U R$                  U R                  SS9U l        [        U R                  UR                  S9U l        UR6                  U l        U R6                  (       a  [9        U5      U l        [=        U5      U l        [        U R                  UR                  S9U l         [        U R                  UR                  S9U l!        [        U R                  UR                  S9U l"        g g )Nr  r  layer_scalarr+   Fra   )#rh   ri   r\   r   r   rV  r]  r4  r  r|   r!  r  r  r  r   rm   r?   r   hidden_size_per_layer_inputr   r  r3  r   rk   per_layer_input_gateper_layer_projectionpost_per_layer_input_normenable_moe_blockr  routerri  expertspost_feedforward_layernorm_1post_feedforward_layernorm_2pre_feedforward_layernorm_2r   s      rH   ri   Gemma4TextDecoderLayer.__init__I  s   !--",FP 3,T-=-=6CVCVW(5d6F6FFL_L_(`%)6t7G7GVM`M`)a&*78H8HfNaNa*b'^UZZ];+1+M+M(++ !9!9:DK(*		$2B2BDDdDdkp(qD%(*		$2R2RTXTdTdkp(qD%-:4;K;KQWQdQd-eD* & 7 7  *62DK,V4DL0=d>N>NTZTgTg0hD-0=d>N>NTZTgTg0hD-/<T=M=MSYSfSf/gD, !rG   NrO   per_layer_inputr8   r   rW   r   rN   r_   c           
      (   Un	U R                  U5      nU R                  " SUUUUUUS.UD6u  pU R                  U5      nX-   nUn	U R                  U5      nU R	                  U5      nU R
                  (       a  U R                  U5      nU	R                  SU	R                  S   5      nU R                  U5      u  pnU R                  U5      nU R                  XU5      nUR                  U	R                  5      nU R                  U5      nX-   nU R                  U5      nX-   nU R                  (       aN  Un	U R                  U5      nU R!                  U5      nX-  nU R#                  U5      nU R%                  U5      nX-   nXR&                  -  nU$ )N)rO   r   rW   r8   r   rN   r   r9   )r  r]  r  r  r  r  r  r   r   r  r  r  r  r   r  r  r3  r  r  r  )rp   rO   r  r8   r   rW   r   rN   rc  r<  r   hidden_states_1hidden_states_flatrs  rr  hidden_states_2s                   rH   rv   Gemma4TextDecoderLayer.forwarde  s    !,,];>> 
' 3)-%+
 
 55mD 0 66}E/  "??NO "*!1!1"hnnR6H!I,0KK8J,K)Ak">>?QRO"ll?WO-55hnnEO"??PO ,=M77F 0++$H 55mDM KK6M);M 55mDM ::=IM$4M***rG   )r3  r\   r  r  r   r  r  r   r  r  r  r  r   r  r  r  r  r  r  r]  )NNNNNN)r:   r;   r<   r=   r.   r/   rx   ri   r?   rE   rB   rC   rD   r  r   rv   rF   ry   rz   s   @rH   r  r  H  s    h/2DD hQT h> )-PT,0.204(,9||9 9 sE%,,*D$EEFM	9
 #\\9 t+9 &&-9 9 
9 9rG   r  c            	       l   ^  \ rS rSrSrSS\S\S\S\4U 4S jjjrS\R                  4U 4S	 jjr
S
rU =r$ )Gemma4TextScaledWordEmbeddingi  zT
This module overrides nn.Embeddings' forward by multiplying with embeddings scale.
num_embeddingsembedding_dimpadding_idxembed_scalec                 |   > [         TU ]  XU5        X@l        U R                  S[        R
                  " U5      SS9  g )Nr  Fr   )rh   ri   scalar_embed_scalerm   r?   rn   )rp   r  r  r  r  rq   s        rH   ri   &Gemma4TextScaledWordEmbedding.__init__  s7    D"-]ELL,ERWXrG   	input_idsc                    > [         TU ]  U5      U R                  R                  U R                  R
                  5      -  $ rt   )rh   rv   r  r   r   r   )rp   r  rq   s     rH   rv   %Gemma4TextScaledWordEmbedding.forward  s2    wy)D,<,<,?,?@Q@Q,RRRrG   )r  )r   )r:   r;   r<   r=   r>   rx   ro   ri   r?   rE   rv   rF   ry   rz   s   @rH   r  r    sM    Ys Y3 YS Y_d Y Y
S S SrG   r  c            	         ^  \ rS rSr% \\S'   SrSr/ SQrSS/r	Sr
SrSrSrSrSrS	r\R$                  " 5       U 4S
 j5       rS rS r   SS\S-  S\S-  S\S\R2                  4U 4S jjjr   SS\S-  S\S-  S\4S jjrSrU =r$ )Gemma4PreTrainedModeli  r\   modelT)r  r  rY  rN   r8   N)imagetextvideoaudioc                 n
  > [         TU ]  U5        [        U[        5      (       a!  [        R
                  " UR                  5        g [        U[        5      (       a  SnSnUR                  S-  n[        R                  " X2-  5      [        US-
  S5      -  nU[        R                  " [        R                  " U5      U* -  5      -  n[        R                  " UR                   UR#                  S5      R#                  S5      5        g [        U[$        5      (       aL  [        R&                  " UR(                  UR*                  5        [        R,                  " UR.                  5        g [        U[0        5      (       a  UR2                  R5                  5        H  u  pxSU0n	US:X  a  UR6                  U   S:X  a  S	U	S
'   U" UR8                  40 U	D6u  p[        R                  " [;        X S35      U
5        [        R                  " [;        X S35      U
5        M     g [        U[<        5      (       a  UR6                  S:w  a  [>        UR6                     OUR@                  nU" UR8                  5      u  p[        R                  " URB                  U5        [        R                  " URD                  U5        g [        U[F        5      (       a,  [        R&                  " URH                  URJ                  5        g [        U[L        5      (       aA  [        R
                  " URN                  5        [        R
                  " URP                  5        g [        U[R        5      (       aW  U R8                  RT                  n[        RV                  " URX                  SUS9  [        RV                  " URZ                  SUS9  g [        U[\        5      (       a!  [        R
                  " UR^                  5        g [        U[`        5      (       a  URb                  (       a  [        R&                  " URd                  [g        S5      * 5        [        R&                  " URh                  [g        S5      5        [        R&                  " URj                  [g        S5      * 5        [        R&                  " URl                  [g        S5      5        g [        U[n        5      (       a]  UR8                  Rp                  (       aA  [        R,                  " URr                  5        [        R
                  " URt                  5        g g g )Nr   r   r)   r+   r   r  r@  rA  rB  rC  rD  rE  r  rt  )r   stdrd   );rh   _init_weightsr  rg  initones_rl  r   r   r   r   r   r?   r   r   copy_r   r   r   	constant_r   r   zeros_r   r>  rH  itemsr  r\   r  r  r   r  r  r  r  r  r  r  r  r  ri  initializer_rangenormal_rp  r  r  r  rZ   rj   rc   ro   re   rf   rg   Gemma4VisionModelstandardizestd_bias	std_scale)rp   r  r   r   r   r   r   r  r  rK  rL  r   rope_fnbuffer_valuer  rq   s                  rH   r  #Gemma4PreTrainedModel._init_weights  s   f%f788JJv667 @AAM#M#//14N&*hh}/L&MPSTbefTfhiPj&j#*UYYu||N7SWnVn7n-ooNJJv,,n.F.Fq.I.S.STU.VW 455NN6>>6+K+KLKK,,- 9::,2,@,@,F,F,H(
'3Z&@#!11f6F6Fz6RVd6d:K'7#/#UAT#U 

76\+CDmT

76\9K+LM}] -I  ;<< ##y0 $F$4$45;; 
 &fmm4OLJJv5JJv//> =>>NN6--v/H/HI 011JJv||$JJv../ 122++//CLL,,3C@LL))= 677JJv**+ 5666;U;UNN6++eEl];NN6++U5\:NN6,,uU|m<NN6,,eEl; 122v}}7P7PKK(JJv''( 8Q2rG   c                 .    U R                   R                  $ rt   
base_modelembed_tokens_per_layerrp   s    rH   get_per_layer_input_embeddings4Gemma4PreTrainedModel.get_per_layer_input_embeddings  s    555rG   c                 $    XR                   l        g rt   r  rp   r  s     rH   set_per_layer_input_embeddings4Gemma4PreTrainedModel.set_per_layer_input_embeddings  s    16.rG   new_num_tokenspad_to_multiple_ofmean_resizingr_   c                 J   > [         TU ]  UUUS9nU R                  XU5        U$ )N)r  r  r  )rh   resize_token_embeddings_resize_per_layer_embeddings)rp   r  r  r  r-  rq   s        rH   r  -Gemma4PreTrainedModel.resize_token_embeddings  s:     7)1' 8 

 	)).m\rG   c                    U R                   U R                  R                  5       l        U R                  R                  5       R                  (       a  U R                  5       nU R                  XAX#5      n[        US5      (       a  UR                  n[        XV5        UR                  UR                  R                  5        U R                  U5        g g )N_hf_hook)
vocab_sizer\   get_text_configvocab_size_per_layer_inputr  r  _get_resized_embeddingsr  r  r0   requires_grad_r   r   r  )rp   r  r  r  r  new_embeddings_per_layerhooks          rH   r  2Gemma4PreTrainedModel._resize_per_layer_embeddings  s     DH??##%@;;&&(DD%)%H%H%J"'+'C'C&8J($ -z::-66"#;B$334J4Q4Q4_4_`//0HI ErG   r9   )NNT)r:   r;   r<   r=   r-   rA   base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraph_supports_attention_backend_can_record_outputsinput_modalitiesr?   r   r  r  r  rx   r   r   	Embeddingr  r  rF   ry   rz   s   @rH   r  r    s    &*#b#46H"IN!"&:
]]_2) 2)h67
 &*)-"	d
  $J 	
 
   &*)-"	Jd
J  $JJ 	J JrG   r  zAThe base Gemma 4 language model without a language modeling head.c                     ^  \ rS rSr% \\S'   Sr\" \SS9\	\
S.rS\4U 4S jjr\\\       SS	\R"                  S-  S
\R$                  S-  S\R"                  S-  S\S-  S\R(                  S-  S\R$                  S-  S\S-  S\\   S\4S jj5       5       5       rS	\R$                  S-  S\R$                  S-  S\R$                  4S jr SS\R$                  S\R$                  S-  S\R$                  4S jjrSrU =r$ )Gemma4TextModeli  r\   )r  r   )r`  )router_logitsrO   rP   c           
      &  > [         TU ]  U5        UR                  U l        UR                  U l        [        UR                  UR                  U R                  U R                  R                  S-  S9U l        [        R                  " [        UR                  5       Vs/ s H  n[        X5      PM     sn5      U l        [        UR                  UR                   S9U l        [%        U5      U l        SU l        [+        U R                  R,                  5      U l        UR0                  U l        U R0                  (       a  [        UR2                  UR                  UR0                  -  U R                  UR0                  S-  S9U l        SU l        [        R8                  " UR                  UR                  UR0                  -  SS9U l        UR                  S-  U l        [        UR0                  UR                   S9U l        / U l         [C        U R                  5       HT  u  p4URD                  RF                  (       d  M"  U R@                  RI                  S Vs/ s H  nS	U S
U 3PM     sn5        MV     U RK                  5         g s  snf s  snf )Nr}  )r  r  Fg;f?ra   r   )r   r   r  r  zlayers.z.self_attn.)&rh   ri   pad_token_idr  r  r  r   r\   embed_tokensr   r*  r  r'  r  r+  r|   r!  r  r>  r)  gradient_checkpointingrG  r  unique_layer_typesr  r  r  per_layer_input_scalerk   per_layer_model_projection per_layer_model_projection_scaleper_layer_projection_norm"_keys_to_ignore_on_load_unexpected	enumerater]  r9  extend	post_init)rp   r\   r   r  layernamerq   s         rH   ri   Gemma4TextModel.__init__%  s*    !.. ++ :v1143C3CQUQ\Q\QhQhjmQm
 mmHMfNfNfHghHg9#F6Hgh
 "&"4"4&:M:MN	3F;&+#"%dkk&=&=">
 ,2+M+M(++*G11((6+M+MM  ">>C	+D' *3D&.0ii""((6+M+MM/D+
 5;4F4F4LD1-:6;];]cicvcv-wD* 35/!$++.HA11177>>@hi@hwqcTF3@hi / 	I i@ js    J	J
Nr  rW   r   rN   r-  per_layer_inputs	use_cacherc  r_   c           
      2   USL USL-  (       a  [        S5      eUb  U R                  U5      nU R                  (       a%  Uc  U R                  X5      nU R	                  XV5      nU(       a  Uc  [        U R                  S9nUcU  Ub  UR                  5       OSn	[        R                  " UR                  S   UR                  S9U	-   nUR                  S5      n[        U=n
[        5      (       d)  U R                  UUUUS.n[        S0 UD6[!        S0 UD6S.n
Un0 nU R"                   H  nU R%                  XU5      X'   M     UR'                  S	0 5      n[)        U R*                  SU R                  R,                   5       H\  u  nnUb  USS2SS2USS24   OSnU" UU4UXR                  R.                  U      XR                  R.                  U      UUS
.UD6nM^     U R1                  U5      n[3        UUUR5                  SS5      (       a  US9$ SS9$ )u9  
per_layer_inputs (`torch.Tensor` of shape `(batch_size, sequence_length, num_hidden_layers, hidden_size_per_layer_input)`, *optional*):
    Pre-computed per-layer input embeddings. When provided, these are used directly instead of being
    computed from `input_ids` via `get_per_layer_inputs()`. This is primarily used by the multimodal
    model (`Gemma4Model`) which pre-computes per-layer inputs from the original `input_ids` *before*
    merging multimodal soft tokens into `inputs_embeds` — at which point the original token ids are
    no longer recoverable.
N:You must specify exactly one of input_ids or inputs_embedsr\   r   r+   r   r\   r-  rW   rN   r   r@  rX  r8   )r8   r   rW   r   rN   return_shared_kv_statesF)r0  rN   r8   r9   )r  r  r  get_per_layer_inputsproject_per_layer_inputsr   r\   get_seq_lengthr?   r   r   r   r   r  rB   r   r   r  r)  popr  r+  r'  r  r  rR   get)rp   r  rW   r   rN   r-  r	  r
  rc  past_seen_tokenscausal_mask_mappingmask_kwargsrO   r   r  r8   r  r1  r  s                      rH   rv   Gemma4TextModel.forwardU  sQ   , -t";<YZZ  --i8M++'#'#<#<Y#V #<<]]0*$++>OCRC^==?de <<(;(;A(>}G[G[\_ooL'11!4L ?-FF ++!."0#2 ,K #5"C{"C%F%U%U# & 11J.2oom[e.f+ 2 "::&8"= !*$++6U8U8U*V WA}>N>Z.q!Qz:`dO)	 "2$78O8OPQ8R$S2;;3J3J13MN) /	 	M !X 		-0,++17<UW\1]1]-
 	
 dh
 	
rG   c                    U R                   (       d  [        SU R                   35      eUc  [        R                  " 5          USS2SS2SSS24   U R
                  R                  SSSS2SS24   U R                  R                  S-  -  :H  R                  SS9R                  5       SS2S4   n UR                  UR                  SS 5      n SSS5        U R                  U5      R                  " / UR                  QU R                  R                  PU R                   P76 $ ! [         a    [        S5      ef = f! , (       d  f       Nt= f)a  Compute the token-identity component of Per-Layer Embeddings (PLE).

Looks up `input_ids` in `embed_tokens_per_layer` (a scaled embedding that multiplies
by `sqrt(hidden_size_per_layer_input)`) and reshapes the packed output from
`[batch, seq, num_hidden_layers * hidden_size_per_layer_input]` to
`[batch, seq, num_hidden_layers, hidden_size_per_layer_input]`.

If only `inputs_embeds` is provided (no `input_ids`), reverses the main embedding
to recover `input_ids` for the PLE lookup.
z}Attempting to call get_per_layer_inputs() from a model initialized with a config that does not support per-layer embeddings. Nr}  r	   r   r)   a)  It seems like you tried to call `forward` from `inputs_embeds` without providing `input_ids`, and that the `inputs_embeds` you provided do not exactly match the embedding weights. Since Gemma4 needs to reverse the embedding to compute another embedding, make sure you provide exact `inputs_embeds`)r  RuntimeErrorr\   r?   r   r  r   r   r  rx  r   r   r  r   r'  )rp   r  r-  s      rH   r  $Gemma4TextModel.get_per_layer_inputs  sK    //**.++8   &aD!m4,,33D$14DEH_H_adHdde SQSZWYq!t%  )}/B/B2A/F GI !$ **95== 
__
KK))
 ,,
 	
 $ &r  !s   A.D>1D%%D;;D>>
Ec                 `   U R                   (       d  [        SU R                   35      eU R                  U5      U R                  -  nUR
                  " / UR                  SS QU R                  R                  PU R                   P76 nU R                  U5      nUc  U$ X2-   U R                  -  $ )aL  Compute the context-aware component of PLE and combine with token-identity.

Projects `inputs_embeds` through `per_layer_model_projection` (Linear), scales by
`1/sqrt(hidden_size)`, reshapes to `[batch, seq, num_layers, ple_dim]`, and normalizes
with `per_layer_projection_norm` (RMSNorm).

If `per_layer_inputs` (the token-identity component from `get_per_layer_inputs()`)
is provided, combines both: `(context_projection + token_identity) * (1/sqrt(2))`.
If `per_layer_inputs` is None (e.g. for multimodal inputs where input_ids are not
available), returns just the context projection.
zAttempting to call project_per_layer_inputs() from a model initialized with a config that does not support per-layer embeddings. Nr   )
r  r  r\   r  r   r   r   r'  r  r  )rp   r-  r	  r  s       rH   r  (Gemma4TextModel.project_per_layer_inputs  s      //226++@ 
  $>>}MPTPuPuu3;;  
  "% 
KK)) 
 ,, 

  $==>RS#''$74;U;UUUrG   )r  r  r  r  r  r+  r  r  r  r  r   r  r)  r  r  )NNNNNNNrt   )r:   r;   r<   r=   r.   rA   r  r'   r  r  rV  r  ri   r&   r(   r!   r?   r  rE   r   r@   r   r   r    rR   rv   r  r  rF   ry   rz   s   @rH   r  r    s    '(8B/)./ .`   .2.204(,2604!%T
##d*T
 t+T
 &&-	T

 T
 ((4/T
  ,,-T
 $;T
 +,T
 
'T
    T
l*
ellT.A *
RWR^R^aeRe *
jojvjv *
^ 15!V||!V  ,,-!V 
	!V !VrG   r  z>The base Gemma 4 language model with a language modeling head.c                   h  ^  \ rS rSr% SS0rSS0rSS/S/40r\\S'   S	r	S\4U 4S
 jjr
\\        SS\R                  S-  S\R                  S-  S\R                  S-  S\S-  S\R"                  S-  S\R                  S-  S\S-  S\\R                  -  S\\   S\4S jj5       5       rSrU =r$ )Gemma4ForCausalLMi  lm_head.weightzmodel.embed_tokens.weightlm_headcolwise_gather_outputrO   rM   r\   r  c                 L  > [         TU ]  U5        [        U5      U l        UR                  U l        [
        R                  " UR                  UR                  SS9U l        U R                  R                   Vs/ s H  nSU 3PM
     snU l	        U R                  5         g s  snf NFra   zmodel.)rh   ri   r  r  r  r   rk   r   r"  r  r  rp   r\   r  rq   s      rH   ri   Gemma4ForCausalLM.__init__  s     $V,
 ++yy!3!3V5F5FUS )-

(U(U3
(UfTFO(U3
/
 	3
s   9B!Nr  rW   r   rN   r-  labelsr
  logits_to_keeprc  r_   c	           
      6   U R                   " SUUUUUUS.U	D6n
U
R                  n[        U[        5      (       a  [	        U* S5      OUnU R                  USS2USS24   5      nU R                  R                  bF  XR                  R                  -  n[        R                  " U5      nXR                  R                  -  nSnUb  U R                  " XU R                  40 U	D6n[        UUU
R                  U
R                  U
R                  U
R                   S9$ )a"  
Example:

```python
>>> from transformers import AutoTokenizer, Gemma4ForCausalLM

>>> model = Gemma4ForCausalLM.from_pretrained("google/gemma-2-9b")
>>> tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b")

>>> prompt = "What is your favorite condiment?"
>>> inputs = tokenizer(prompt, return_tensors="pt")

>>> # Generate
>>> generate_ids = model.generate(inputs.input_ids, max_length=30)
>>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
"What is your favorite condiment?"
```)r  rW   r   rN   r-  r
  N)rL   rM   rN   rO   rP   r8   r9   )r  r0  r  rx   slicer"  r\   final_logit_softcappingr?   r   loss_functionr  rJ   rN   rO   rP   r8   )rp   r  rW   r   rN   r-  r(  r
  r)  rc  outputsrO   slice_indicesrM   rL   s                  rH   rv   Gemma4ForCausalLM.forward  s   @ 26 2
)%+'2
 2
  118B>SV8W8W~ot4]kmA}a,?@A;;..:kkAAAFZZ'FkkAAAF%%fdooPPD+#33!//))$55
 	
rG   )r  r"  r  r  )NNNNNNNr   )r:   r;   r<   r=   _tied_weights_keys_tp_plan_pp_planr.   rA   r  ri   r"   r!   r?   r  rE   r   r@   r   rx   r   r    rJ   rv   rF   ry   rz   s   @rH   r   r     s+   *,GH23H_-z:;H/   .2.204(,26*.!%-.<
##d*<
 t+<
 &&-	<

 <
 ((4/<
   4'<
 $;<
 ell*<
 +,<
 
&<
  <
rG   r   rZ  c           
      T   ^  S[         S[         S[         S[         S[        4
U 4S jjnU$ )zD
This creates uni/bidirectional attention mask with sliding window.
	batch_idxhead_idxq_idxkv_idxr_   c                 H   > T	u  pEX#-
  nUS:  Xd:  -  nUS:  U* U:  -  nXx-  $ rG  r9   )
r5  r6  r7  r8  left_window_sizeright_window_sizedist	left_mask
right_maskrZ  s
            rH   
inner_mask0sliding_window_mask_function.<locals>.inner_maskY  sC    .<+~QY4#:;	QhD5+<#<=
%%rG   rx   r   )rZ  r?  s   ` rH   sliding_window_mask_functionrB  T  s3    
&c &S & &c &d & rG   c                   8  ^  \ rS rSr% Sr\\S'   SrSr\	\
S.rS\4U 4S jjrS\R                  S	\R                  4S
 jr\\\" SS9 SS\R                  S\R                  S-  S\\   S	\\R                  \R,                  4   4S jj5       5       5       rSrU =r$ )Gemma4AudioModelid  znAn audio encoder based on the [Universal Speech Model](https://huggingface.co/papers/2303.01037) architecture.r\   r'  zmodel.audio_towerrO   rP   c           	        > [         TU ]  U5        Xl        [        U5      U l        [        U5      U l        [        R                  " [        UR                  5       Vs/ s H  n[        X5      PM     sn5      U l        [        R                  " UR                  UR                  SS9U l        U R#                  5         g s  snf )NTra   )rh   ri   r\   r  subsample_conv_projectionr   rel_pos_encr   r*  r  r'  rY  r+  rk   r   output_proj_dimsoutput_projr  r   s      rH   ri   Gemma4AudioModel.__init__o  s     )KF)S&;FCmmBGH`H`BabBaYf0Bab
 99V%7%79P9PW[\	 cs   B?mask_4dr_   c                 ^   UR                   u  p#pCUR                  nU R                  R                  nU R                  R                  S-
  nU R                  R
                  nXF-   S-
  U-  n	X-  n
X-
  n[        R                  " USUSU4SS9nUR                  USXU
5      n[        R                  " XU4SS9n[        R                  " XS9U-  n[        R                  " Xg-   U-   US9nUSS2S4   USSS24   -   nUSSSS2SSS24   R                  USSUS5      nUR                  SU5      $ )z
Convert a standard 4D attention mask `[batch_size, 1, seq_len, seq_len]` to the 5D blocked format
`[batch_size, 1, num_blocks, chunk_size, context_size]` expected by the chunked local attention,
r+   r   F)r  r   Nr   )r   r   r\   r   r   r   r   r   r   r?   r   r  gather)rp   rL  r   r   r   r   r   r   r   r   padded_seq_len
pad_amountmask_5dblock_startsoffsets
kv_indicess                   rH   _convert_4d_mask_to_blocked_5d/Gemma4AudioModel._convert_4d_mask_to_blocked_5d|  s:   
 %,MM!
w[[55
;;==A![[@@*Q.:=
#0#-
%%!ZJ!?uM//*aX%%4F!GuU||J>K,,z<?QQZ`a!!T'*WT1W-==
dAtQ 67>>z1bR\^`a
~~b*--rG   z&Encodes audio features to soft tokens.r1   NrW   rc  c           	         U R                  X5      u  pEU R                  U5      n[        U R                  UU[	        U R                  R
                  S-
  U R                  R                  45      S9nUb  U R                  U5      nU R                  S U R                  R                    H  nU" U4UUS.UD6nM     U R                  U5      n[        XES9$ )Nr+   )r\   r-  rW   and_mask_function)rW   r   )r0  rW   )rG  rH  r   r\   rB  r   r   rU  r+  r'  rJ  rU   )rp   r'  rW   rc  rO   output_maskr   encoder_layers           rH   rv   Gemma4AudioModel.forward  s     &*%C%CN%c""..}=2;;'&:33a79\9\]	
 %!@@PN![[)H4;;+H+HIM)-$7 	M J ((7%bbrG   )r\   r+  rJ  rH  rG  rt   )r:   r;   r<   r=   r>   r,   rA   main_input_namer  rY  r   r  ri   r?   rE   rU  r&   r(   r!   r   r    rD   rX   rv   rF   ry   rz   s   @rH   rD  rD  d  s    x&O+)*
0 .ell .u|| .6  !IJ /3cc t+c +,	c
 
u||U---	.c K   crG   rD  c                      ^  \ rS rSrSr\r\\S.r	S\4U 4S jjr
\\\" SS9S\R                  S	\R                   S
\\   S\4S j5       5       5       rSrU =r$ )r  i  zThe Gemma 4 Vision Encoder.rE  r\   c                   > [         TU ]  U5        [        U5      U l        [	        U5      U l        [        U5      U l        U R                  R                  (       at  U R                  S[        R                  " U R                  R                  5      5        U R                  S[        R                  " U R                  R                  5      5        U R                  5         g )Nr  r  )rh   ri   rg  patch_embedderr%  encoderr  poolerr\   r  rm   r?   ro  r   r  r7  s     rH   ri   Gemma4VisionModel.__init__  s     7?*62(0;;""  U[[9P9P-QR  ekk$++:Q:Q.RSrG   z1Encodes image pixels to soft tokens from patches.r1   r{  rn  rc  r_   c                    U R                   R                  nUR                  S   XD-  -  nUS:H  R                  SS9nU R	                  XU5      nU R
                  " SUU) US.UD6nU R                  UR                  UUUS9u  pX   n	U R                   R                  (       a  XR                  -
  U R                  -  n	[        U	S9$ )a  
pixel_values (`torch.FloatTensor` or `list[torch.FloatTensor]`):
    The images to encode. Either a single `[batch, channels, height, width]` tensor
    (all images same size) or a list of `[1, channels, height, width]` tensors (different sizes).
pixel_position_ids (`torch.LongTensor` of shape `(batch_size, max_patches, 2)`):
    The patch positions as (x, y) coordinates in the image. Padding patches are indicated by (-1, -1).
ru  r   r   )r-  rW   rn  )rO   rn  ro  r  r/  r9   )r\   pooling_kernel_sizer   r  r_  r`  ra  r0  r  r  r  r   )rp   r{  rn  rc  rd  r  ro  r-  r  rO   pooler_masks              rH   rv   Gemma4VisionModel.forward  s      #kk==$**2.3F3\]/25::r:B++LN_` 
'--1
 	
 &*[[ 221/'	 &1 &
" &2;;""*]]:dnnLM&GGrG   )r`  r_  ra  )r:   r;   r<   r=   r>   r/   r\   r  r  r  ri   r&   r(   r!   r?   r@   r  r   r    r   rv   rF   ry   rz   s   @rH   r  r    s    %F1+

1 
  !TU&H''&H ",,&H +,	&H
 
!&H V   &HrG   r  c                   x   ^  \ rS rSrSrS\\-  S\4U 4S jjrS\	R                  S\	R                  4S jrS	rU =r$ )
Gemma4MultimodalEmbedderi  zQEmbeds token ids or soft tokens for multimodal content into language model space.multimodal_configtext_configc                 D  > [         TU ]  5         [        USUR                  5      U l        UR
                  U l        UR                  U l        [        R                  " U R                  U R                  SS9U l
        [        U R                  U R                  SS9U l        g )NrI  Fra   rM  )rh   ri   r  r   multimodal_hidden_sizer!  r   text_hidden_sizer   rk   embedding_projectionr|   embedding_pre_projection_norm)rp   ri  rj  rq   s      rH   ri   !Gemma4MultimodalEmbedder.__init__  s    
 	&-.?ASUfUrUr&s#$11 + 7 7$&IId.I.I4K`K`gl$m!-:4;V;V\`\d\dqv-w*rG   r-  r_   c                 F    U R                  U5      nU R                  U5      $ )a  Embeds token ids or soft tokens for multimodal content into language model space.
Args:
    inputs_embeds: A torch.Tensor containing the soft tokens to embed.
Returns:
    A torch.Tensor of embeddings with shape `[batch_size, seq_len, self.config.text_config.hidden_size]`.
)ro  rn  )rp   r-  embs_normeds      rH   rv    Gemma4MultimodalEmbedder.forward  s%     88G((55rG   )ro  rn  r   rl  rm  )r:   r;   r<   r=   r>   r,   r/   r.   ri   r?   rE   rv   rF   ry   rz   s   @rH   rh  rh    sF    [x,/AAx &x6U\\ 6ell 6 6rG   rh  token_type_idsimage_group_idsc           
      \   ^ U c  gS[         S[         S[         S[         S[        4
U4S jjnU$ )z
This function adds the correct offsets to the `q_idx` and `kv_idx` as the torch API can only accept lengths,
not start and end indices.
Nr5  r6  r7  r8  r_   c                    > T	R                   S   nUR                  US-
  S9nUR                  US-
  S9nT	X4   nT	X4   n[        R                  " X$:  US5      n[        R                  " X4:  US5      nXx:H  US:  -  $ )Nr   r+   )r   r   )r   ru   r?   rw  )
r5  r6  r7  r8  r   q_idx_clampedkv_idx_clampedq_groupkv_groupru  s
            rH   r?  0token_type_ids_mask_function.<locals>.inner_mask#  s    $**2.
 
Q7*q.9 ")":;"9#<=++e0'2>;;v2HbA#155rG   rA  )rt  ru  r?  s    ` rH   token_type_ids_mask_functionr}    s>     6c 6S 6 6c 6d 6 rG   r\   r-  rN   mm_token_type_idsis_first_iterationc                    U R                  5       UUUUS.nUR                  5       n	Ub  US:H  US:H  -  n
[        R                  " U
SSS9nSUS'   X) -  n[        R                  " UR                  5       SS9S-
  n[        R                  " XS5      n[        UR                  UR                  5      U5      U	S	'   [        S0 UD6[        S0 U	D6S
.$ )a  
Overwrites the base `create_masks_for_generate` with `token_type_ids` masking to create the causal mask mapping
for all kinds of forward passes. Gemma4 uses a bidirectional mask for images.

Uses `pixel_values` as an optional input to disambiguate edge cases.
r  r+   r)   r   )shiftsdimsFr  r   or_mask_functionr  r9   )r  copyr?   rollcumsumrx   rw  r}  r   r   r   r   )r\   r-  rW   rN   r   r~  r  rc  r  sliding_mask_kwargs	is_visionis_prev_visionnew_vision_startsvision_group_idss                 rH   create_causal_mask_mappingr  5  s    " ((*&(*$K &**,$ '!+0AQ0FG	IabA!&v%7 <<(9(=(=(?QG!K ;;yBG2N  !5!568H3
./
 -;{;>UATU rG   z
    The base Gemma 4 model comprising a vision backbone, an audio backbone, and a language model without a
    language modeling head.
    c            "         ^  \ rS rSrSrS\4U 4S jjrS rS r\	\
" SS9 S#S
\R                  S\R                  S	-  S\\   S\4S jj5       5       r  S$S\R                  S	-  S\R                  S	-  S\\R&                  \R&                  \R&                  4   4S jjr\\	\
             S%S\R                  S	-  S
\R                  S	-  S\R                  S	-  S\R                  S	-  S\R,                  S	-  S\R,                  S	-  S\R                  S	-  S\S	-  S\R                  S	-  S\R                  S	-  S\S	-  S\R                  S	-  S\R                  S	-  S\\   S\4S jj5       5       5       rS rS r\	\
" SS9S\R,                  S\R,                  S\\   S\\-  4S j5       5       r\	\
" S S9 S#S\R                  S\R                  S	-  S\\   S\4S! jj5       5       rS"r U =r!$ )&Gemma4Modelid  Fr\   c                    > [         TU ]  U5        UR                  b   [        R                  " UR                  5      OS U l        UR                  R                  U l        [        R                  " UR                  S9nX l        UR                  R                  U l	        UR                  b   [        R                  " UR                  5      OS U l        UR                  b   [        UR                  UR                  5      OS U l        UR                  b   [        UR                  UR                  5      OS U l        U R                  R                   Vs/ s H  nSU 3PM
     snU l        U R!                  5         g s  snf )Nr  zlanguage_model.)rh   ri   vision_configr*   from_configvision_towerrj  r  language_modelr  audio_configaudio_towerrh  embed_visionembed_audior  r  )rp   r\   r  r  rq   s       rH   ri   Gemma4Model.__init__n  sF    KQK_K_KkI11&2F2FGqu ,,77"..f6H6HI,*0*<*<*W*W'IOI\I\Ih9001D1DEnr ##/ %V%9%96;M;MN 	 "". %V%8%8&:L:LM 	 261D1D1g1g3
1godV$1g3
/ 	3
s   E;c                 6    U R                   R                  5       $ rt   )r  get_input_embeddingsr  s    rH   r   Gemma4Model.get_input_embeddings  s    ""7799rG   c                 :    U R                   R                  U5        g rt   )r  set_input_embeddingsr  s     rH   r   Gemma4Model.set_input_embeddings  s    007rG   zOProjects the last hidden state from the vision model into language model space.r1   Nr{  image_position_idsrc  r_   c                 p    U R                   " SUUS.UD6nUR                  nU R                  US9Ul        U$ )z
image_position_ids (`torch.LongTensor` of shape `(batch_size, max_patches, 2)`, *optional*):
    The patch positions as (x, y) coordinates in the image. Padding patches are indicated by (-1, -1).
r{  rn  r-  r9   )r  r0  r  pooler_output)rp   r{  r  rc  vision_outputsr0  s         rH   get_image_featuresGemma4Model.get_image_features  sT     ** 
%1
 

 +<<'+'8'8GX'8'Y$rG   r  r-  c           	         UbJ  XR                   R                  :H  nXR                   R                  :H  nXR                   R                  :H  nGO8UU R	                  5       " [
        R                  " U R                   R                  [
        R                  UR                  S95      :H  R                  S5      nUU R	                  5       " [
        R                  " U R                   R                  [
        R                  UR                  S95      :H  R                  S5      nUU R	                  5       " [
        R                  " U R                   R                  [
        R                  UR                  S95      :H  R                  S5      nX4U4$ )a;  
Obtains mask for multimodal placeholders (replaced by soft tokens) and hard text tokens.

Masks will be obtained from `mm_token_type_ids`, `input_ids`, or `inputs_embeds` as available and in that
precedence order. If passing `input_ids` or `inputs_embeds`, the image mask will be derived using
`config.image_token_id`. Same goes for audio and video masks

Args:
    input_ids: A tensor containing the hard token IDs from the text tokenizer.
    inputs_embeds: A tensor containing the embeddings for all hard text tokens.

Returns:
    image_mask, video_mask, audio_mask
)r   r   r   )
r\   image_token_idvideo_token_idaudio_token_idr  r?   rn   r  r   r  )rp   r  r-  special_image_maskspecial_video_maskspecial_audio_masks         rH   get_placeholder_mask Gemma4Model.get_placeholder_mask  sA   &  !*kk.H.H!H!*kk.H.H!H!*kk.H.H!H ,,.LL!;!;5::VcVjVjk c"g  ,,.LL!;!;5::VcVjVjk c"g  ,,.LL!;!;5::VcVjVjk c"g  "7IIIrG   pixel_values_videosr'  rW   r(  r   rN   r~  r
  video_position_idsc                 D   USL U
SL-  (       a  [        S5      eU R                  X5      u  nnnUU-  U-  nSnU
cI  UR                  5       nU R                  R                  R
                  UU'   U R                  5       " U5      n
U R                  R                  5       R                  (       a  U R                  R                  R                  U R                  R                  R
                  SS24   n[        R                  " US   UR                  SSS5      U
5      nU R                  R                  UU5      nOSnUGb  U R!                  X,SS9R"                  nUR%                  U
R&                  U
R(                  5      nUR+                  5       nUR-                  S5      R/                  U
5      R%                  U
R&                  5      n[1        X   R3                  5       UR3                  5       :H  SU S	UR4                  S
    35        U
R7                  UR%                  U
R&                  5      UR%                  U
R&                  5      5      n
UGb  U R9                  X=SS9R"                  nUR%                  U
R&                  U
R(                  5      nUR+                  5       nUR-                  S5      R/                  U
5      R%                  U
R&                  5      n[1        U
U   R3                  5       UR3                  5       :H  SU S	UR4                  S
    35        U
R7                  UR%                  U
R&                  5      UR%                  U
R&                  5      5      n
UGb  UGb  U R;                  XFSS9nUR"                  nUR<                  nUU   nUR+                  5       nUR-                  S5      R/                  U
5      R%                  U
R&                  5      n[1        U
U   R3                  5       UR3                  5       :H  SU S	UR4                  S
   UR4                  S   -   35        U
R7                  UR%                  U
R&                  5      UR%                  U
R&                  5      5      n
UcU  Ub  UR?                  5       OS
n[        R@                  " U
R4                  S   U
R&                  S9U-   nUR-                  S
5      n[C        U=n [D        5      (       dZ  U R                  R                  5       RF                  S:X  a  [I        U R                  U
UUUU	S9n O[K        U R                  U
UUU5      n U R                  " SUU UUU
USS.UD6n![M        U!RN                  U!RP                  U!RR                  U!RT                  Ub  WOSUb  WOSU!RV                  S9$ )  
input_features_mask (`torch.FloatTensor]` of shape `(num_images, seq_length)`):
    The attention mask for the input audio.
image_position_ids (`torch.LongTensor` of shape `(batch_size, max_patches, 2)`, *optional*):
    2D patch position coordinates from the image processor, with `(-1, -1)` indicating padding.
    Passed through to the vision encoder for positional embedding computation.
video_position_ids (`torch.LongTensor` of shape `(num_videos, num_frames, max_patches, 2)`, *optional*):
    2D patch position coordinates from the video processor, with `(-1, -1)` indicating padding.
    Passed through to the vision encoder for positional embedding computation.
Nr  r   r+   r   T)return_dictz6Image features and image tokens do not match, tokens: z, features: r   z6Video features and video tokens do not match, tokens: z6Audio features and audio tokens do not match, tokens: r   vision)r-  rW   rN   r   r~  )r	  rW   r   rN   r-  r
  r  )r0  rN   rO   rP   r6   r7   r8   r9   ),r  r  r  r\   rj  r  r  r  r  r  r  r   r?   rw  r   r  r  r  r   r   r   rv  r   	expand_asr$   numelr   masked_scatterget_video_featuresget_audio_featuresrW   r  r   r  rB   r^  r  r   r4   r0  rN   rO   rP   r8   )"rp   r  r{  r  r'  rW   r(  r   rN   r~  r-  r
  r  r  rc  
image_mask
video_mask
audio_maskmultimodal_maskllm_input_idspad_embeddingllm_inputs_embedsr	  image_featuresn_image_tokensvideo_featuresn_video_tokensaudio_outputaudio_featuresaudio_mask_from_encodern_audio_tokensr  r  r.  s"                                     rH   rv   Gemma4Model.forward  s   < -t";<YZZ-1-F-Fy-`*
J
$z1J>  %OO-M-1[[-D-D-Q-QM/* 557FM;;&&(DD //<<CCDKKD[D[DhDhjkDklM %OI,FHZHZ[\^_acHdfs t#22GGWhi# #!44\cg4hvvN+..}/C/C]EXEXYN (^^-N#--b1;;MJMMmNbNbcJ")//1^5I5I5KKHHX Y"((+,. *88m223^5F5F}G[G[5\M *!44#T 5 m  ,..}/C/C]EXEXYN (^^-N#--b1;;MJMMmNbNbcJ"j)//1^5I5I5KKHHX Y"((+,. *88m223^5F5F}G[G[5\M
 %*=*I22>dh2iL)77N&2&A&A#
 ,,CDN'^^-N#--b1;;MJMMmNbNbcJ"j)//1^5I5I5KKHHX Y"((+n.B.B1.EEFH *88m223^5F5F}G[G[5\M
 CRC^==?de <<(;(;A(>}G[G[\_ooL'11!4L?-FF{{**,HHHT&@KK"/#1$3!-&7'# '@KK!"# '# %% 	
-.%+'	
 	
 )%77#33!//))2>2JPT2@2LRV$55
 	
rG   c                 .    U R                   R                  $ rt   r  r  r  s    rH   r  *Gemma4Model.get_per_layer_input_embeddingsn	  s    ""999rG   c                 $    XR                   l        g rt   r  r  s     rH   r  *Gemma4Model.set_per_layer_input_embeddingsq	  s    5:2rG   zPProjects the last hidden state from the audio encoder into language model space.c                     U R                   c  [        S5      eU R                   " X4SS0UD6nU R                  UR                  S9Ul        U$ )a  
input_features (`torch.FloatTensor]` of shape `(num_images, seq_length, num_features)`):
    The tensors corresponding to the input audio.
input_features_mask (`torch.FloatTensor]` of shape `(num_images, seq_length)`):
    The attention mask for the input audio.
zAudio features were requested, but the model was initialized without an audio_config. Cannot process audio without an audio tower and audio embedder.r  Tr  )r  r  r  r0  r  )rp   r'  r(  rc  audio_outputss        rH   r  Gemma4Model.get_audio_featurest	  sc     #R 
 ((iZ^ibhi&*&6&6]EdEd&6&e#rG   zQProjects the last hidden state from the vision encoder into language model space.c                     UR                  SS5      nUR                  SS5      nU R                  " SUUS.UD6nUR                  nU R                  US9Ul        U$ )a  
video_position_ids (`torch.LongTensor` of shape `(num_videos, num_frames, max_patches, 2)`, *optional*):
    2D patch position coordinates from the video processor, with `(-1, -1)` indicating padding.
    Passed through to the vision encoder for positional embedding computation.
r   r+   r  r  r9   )flattenr  r0  r  r  )rp   r  r  rc  r  r0  s         rH   r  Gemma4Model.get_video_features	  sz     299!Q?/771=** 
,1
 

 +<<'+'8'8GX'8'Y$rG   )r  r  r  r  r  r  r  r  rt   rT  )NNNNNNNNNNNNN)"r:   r;   r<   r=   accepts_loss_kwargsr-   ri   r  r  r"   r!   r?   r@   r  r   r    r   r  rD   rX   r  r&   rE   r   r   r4   rv   r  r  rU   r  r  rF   ry   rz   s   @rH   r  r  d  s*     | 4:8 !rs 7;'' ",,t3 +,	
 
$ t * .226+J##d*+J ((4/+J 
u!1!153C3CC	D	+JZ   .2158<37.23704(,5926!%6:6:Y
##d*Y
 ''$.Y
 #..5	Y

 ))D0Y
 t+Y
 #\\D0Y
 &&-Y
 Y
 !++d2Y
 ((4/Y
 $;Y
 ",,t3Y
 ",,t3Y
 +,Y
  
#!Y
    Y
v:; !st #\\ +,	
 
'	' u . !tu 7;".. ",,t3 +,	
 
$ v rG   r  z
    The base Gemma 4 model comprising a vision backbone, an audio backbone, a language model, and a language modeling
    head.
    c            %       b  ^  \ rS rSrSS0rSrSrS\4U 4S jjr\	 S"S	\
R                  S
\
R                  S-  S\\   4S jj5       r\\	               S#S\
R                  S-  S	\
R                  S-  S\
R                  S-  S\
R                  S-  S\
R"                  S-  S\
R"                  S-  S\
R                  S-  S
\
R                  S-  S\
R                  S-  S\S-  S\
R                  S-  S\
R                  S-  S\
R                  S-  S\S-  S\\
R"                  -  S\\   S\4"S jj5       5       r             S$U 4S jjrS rS r\  S%S\S\
R"                  S\
R"                  S-  S\S-  S\
R"                  S-  S\
R"                  S-  S\S-  S\4S  jj5       rS!rU =r$ )&Gemma4ForConditionalGenerationi	  r!  z(model.language_model.embed_tokens.weightFr  r\   c                 R  > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  R                  UR                  R                  SS9U l	        U R                  R                   Vs/ s H  nSU 3PM
     snU l
        U R                  5         g s  snf r%  )rh   ri   r  r  r   rk   rj  r   r  r"  r  r  r&  s      rH   ri   'Gemma4ForConditionalGeneration.__init__	  s      (
yy!3!3!?!?ASASA^A^ejk )-

(U(U3
(UfTFO(U3
/ 	3
s   <B$Nr{  r  rc  c                 <    U R                   R                  " X40 UD6$ )a  
image_position_ids (`torch.LongTensor` of shape `(batch_size, max_patches, 2)`, *optional*):
    2D patch position coordinates from the image processor, with `(-1, -1)` indicating padding.
    Passed through to the vision encoder for positional embedding computation.
)r  r  )rp   r{  r  rc  s       rH   r  1Gemma4ForConditionalGeneration.get_image_features	  s     zz,,\XQWXXrG   r  r  r'  rW   r(  r   r  rN   r~  r-  r(  r
  r)  r_   c                 x   U R                   " SUUUUUUUU
UUUUUU	SS.UD6nUR                  n[        U[        5      (       a  [	        U* S5      OUnU R                  USS2USS24   5      nU R                  R                  5       R                  =nb   UU-  n[        R                  " U5      nUU-  nSnUb6  U R                  " UXR                  R                  5       R                  40 UD6n[        UUUR                  UR                  UR                   UR"                  UR$                  UR&                  S9$ )r  T)r  r{  r  r'  rW   r(  r   rN   r~  r-  r(  r
  r  r  r  N)rL   rM   rN   rO   rP   r6   r7   r8   r9   )r  r0  r  rx   r+  r"  r\   r  r,  r?   r   r-  r  rJ   rN   rO   rP   r6   r7   r8   )rp   r  r{  r  r'  rW   r(  r   r  r  rN   r~  r-  r(  r
  r)  rc  r.  rO   r/  rM   r,  rL   s                          rH   rv   &Gemma4ForConditionalGeneration.forward	  sU   > ** 
% 3)) 3%+/'11
  !
&  118B>SV8W8W~ot4]kmA}a,?@A'+{{'B'B'D'\'\\#i55FZZ'F55F%%ffkk6Q6Q6S6^6^ibhiD+#33!//)) ' ; ; ' ; ;$55	
 		
rG   c                    > [         TU ]  " U4UUUUUUU
US.UD6nU(       d  U(       d  UUS'   UUS'   UUS'   U	US'   U$ S US'   U$ )N)rN   r-  rW   r   r
  r)  rt  r  r{  r  r'  r(  r~  )rh   prepare_inputs_for_generation)rp   r  rN   r-  r   r{  r  r'  rW   r(  rt  r
  r)  r(  r  rc  model_inputsrq   s                    rH   r  <Gemma4ForConditionalGeneration.prepare_inputs_for_generation
  s    & w<
+')%))1
 
 Y+7L(2EL./-;L)*2EL./
  15L,-rG   c                 6    U R                   R                  5       $ rt   )r  r  r  s    rH   r  =Gemma4ForConditionalGeneration.get_per_layer_input_embeddings?
  s    zz88::rG   c                 :    U R                   R                  U5        g rt   )r  r  r  s     rH   r  =Gemma4ForConditionalGeneration.set_per_layer_input_embeddingsB
  s    

11%8rG   r  c           
          [        U R                  5       SS 5      S:X  a>  [        U UUUUU4SU0UR                  5        VV	s0 s H  u  pUS:w  d  M  X_M     sn	nD6$ [	        XX#U40 UD6$ s  sn	nf )Nr^  r  r  r{  )r  r  r  r  r   )
r\   r-  rW   rN   r   r~  r  rc  r  vs
             rH   r   8Gemma4ForConditionalGeneration.create_masks_for_generateE
  s     6))+-JDQU]]-!	 $6	 %+LLNJNDAa>6I414NJ	 	 -~X^ 	 Ks   A,A,)r  r"  r  rt   )NNNNNNNNNNNNNNr   )NNNNNNNNNTNNF)NF) r:   r;   r<   r=   r1  r  r  r-   ri   r!   r?   r@   r  r   r    r  r"   rE   r   r   rx   rJ   rv   r  r  r  r  r   rB   r   rF   ry   rz   s   @rH   r  r  	  s    +,VW|   7;Y''Y ",,t3Y +,	Y Y  .2158<37.237046:6:(,5926*.!%-.!F
##d*F
 ''$.F
 #..5	F

 ))D0F
 t+F
 #\\D0F
 &&-F
 ",,t3F
 ",,t3F
 F
 !++d2F
 ((4/F
   4'F
 $;F
  ell*!F
" +,#F
$ 
&%F
  F
V    *X;9  26*/ || t+ 	
 llT) !<<$. !4K 
 rG   r  )rD  r   r  r  r  r  r  )r+   )rt  NN)r)   rT  )}r   collections.abcr   dataclassesr   	functoolsr   typingr   r?   r   torch.nnr   r    r
   r  activationsr   cache_utilsr   r   configuration_utilsr   
generationr   integrationsr   r   masking_utilsr   r   r   r   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r    r!   r"   r#   r$   utils.genericr%   r&   utils.output_capturingr'   r(   auto.modeling_autor*   configuration_gemma4r,   r-   r.   r/   accelerate.hooksr0   r4   rJ   rR   rU   ModulerZ   r|   r   r   r
  r  r,  Conv1dr@  rJ  rY  rg  r  r  r  r  rE   rx   r  r  ro   rD   r  r   r  r  r%  r4  r>  rV  ri  r  r  r  r  r  r  r   rB  rD  r  rh  r}  r   rB   r  r  r  __all__r9   rG   rH   <module>r     s  *  $ ! %    $ & ! . 3 ) K  C 9 S K F &  H E * g g 3 
Q 7 Q Q2 
 Q; Q QD 
Q$; 
Q 
Q 
37 3  3BII :4BII 4*7ryy 7>i)299 i)X#bii #8; ;< RYY  H"bii "B&RYY &R0ryy 0l3		 3D80 80vbii  L")) L^(.ELL .u|| .%,, ._b .,	UU\\ 	U# 	U%,, 	U$   %II%<<% 
% <<	%
 LL4'% S[% T\% T\% 5<<%&%N 5&||5&	5& 
5& ,,	5&
 5& \\5&p )*B)BII B) +B)J)9 )X)H")) )HXBII &W<		 W<tq)")) q)h $#		 $# $#N!@ryy !@HV7 VrSBLL S hJO hJ hJV `a_V+ _V b_VD ]^R
- R
 _R
jsCx X  Sc, Scl>H- >HB6ryy 68LL4'\\D( _H .2&*,,<<, LL4', T\	,
 ,,%, ||d*, t, 
,^ y' yyx	 t%:O ttnrG   