
    Z j}                       S SK r S SKJr  S SKJr  S SKJr  S SKrS SKJr  S SK	J
r  SSKJr  SS	KJr  SS
KJrJr  SSKJr  SSKJrJrJrJr  SSKJr  SSKJrJr  SSK J!r!J"r"  SSK#J$r$J%r%  SSK&J'r'  SSK(J)r)J*r*J+r+J,r,J-r-J.r.  SSK/J0r0J1r1  SSK2J3r3J4r4  SSK5J6r6  SSK7J8r8J9r9J:r:J;r;J<r<J=r=J>r>  SSK?J@r@JArAJBrBJCrCJDrDJErEJFrFJGrGJHrH  SSKIJJrJ  SSKKJLrL  SSKMJNrN  SSKOJPrPJQrQJRrRJSrS  \," 5       (       a   \-R                  " \U5      rV " S S\C5      rW " S  S!\@5      rX\ " S" S#\5      5       rY\*\ " S$ S%\5      5       5       rZ " S& S'\R                  5      r\ " S( S)\F5      r] " S* S+\R                  5      r^ " S, S-\R                  5      r_ " S. S/\R                  5      r` " S0 S1\R                  5      ra " S2 S3\R                  5      rb " S4 S5\R                  5      rd " S6 S7\R                  5      re " S8 S9\R                  5      rf " S: S;\R                  5      rg " S< S=\R                  5      rh " S> S?\;5      ri S}S@\R                  SA\R                  SB\R                  SC\R                  SD\kSE\R                  4SF jjrl " SG SH\J5      rm " SI SJ\85      rn " SK SL\95      ro " SM SN\R                  5      rp " SO SP\;5      rq " SQ SR\<5      rr " SS ST\R                  5      rs " SU SV\L5      rt " SW SX\R                  5      ru " SY SZ\95      rv " S[ S\\>5      rw " S] S^\E5      rx\*" S_S`9 " Sa Sb\=5      5       ry\*" ScS`9 " Sd Se\:5      5       rz " Sf Sg\x5      r{ " Sh Si\x5      r| " Sj Sk\D5      r}Sl\R                  S-  Sm\R                  S-  SE\S-  4Sn jr~  S~So\Sp\R                  Sq\R                  S-  Sr\S-  SC\R                  S-  Ss\R                  S-  St\S-  SE\4Su jjr\*" SvS`9 " Sw Sx\B5      5       r\*" SyS`9 " Sz S{\A5      5       r/ S|Qrg)    N)Callable)	dataclass)cached_property)nn)
functional   )initialization)ACT2FN)CacheDynamicCache)PreTrainedConfig)create_bidirectional_maskcreate_causal_maskcreate_masks_for_generate!create_sliding_window_causal_mask)FlashAttentionKwargs)BaseModelOutputWithPastBaseModelOutputWithPooling)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tupleis_accelerate_availableloggingtorch_compilable_check)maybe_autocastmerge_with_config_defaults)OutputRecordercapture_outputs   )	AutoModel)Gemma3AttentionGemma3DecoderLayerGemma3ForCausalLM	Gemma3MLPGemma3RotaryEmbeddingGemma3TextModelGemma3TextScaledWordEmbedding)	Gemma3nCausalLMOutputWithPastGemma3nForConditionalGenerationGemma3nModelGemma3nModelOutputWithPastGemma3nMultimodalEmbedderGemma3nPreTrainedModelGemma3nRMSNormapply_rotary_pos_embeager_attention_forward)LlamaRotaryEmbedding)MixtralExperts)sliding_window_mask_function   )Gemma4AudioConfigGemma4ConfigGemma4TextConfigGemma4VisionConfigc                   j    \ rS rSr% SrSr\\\\	R                  \	R                  4   4   S-  \S'   Srg)Gemma4ModelOutputWithPastS   aw  
past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
    It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

    Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
    `past_key_values` input) to speed up sequential decoding.
image_hidden_states (`torch.FloatTensor`, *optional*):
    A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
    image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
audio_hidden_states (`torch.FloatTensor`, *optional*):
    A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
    audio_hidden_states of the model produced by the audio encoder and after projecting the last hidden state.
shared_kv_states (`dict`, *optional*):
    Dictionary mapping layer type strings to tuples of (key_states, value_states) tensors.
    Used to pass shared KV states between layers during KV sharing.
Nshared_kv_states __name__
__module____qualname____firstlineno____doc__rA   dictstrtupletorchTensor__annotations____static_attributes__rB       z/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/gemma4/modular_gemma4.pyr?   r?   S   s7    " MQd3ellELL&@ AABTIPrP   r?   c                   j    \ rS rSr% SrSr\\\\	R                  \	R                  4   4   S-  \S'   Srg)Gemma4CausalLMOutputWithPasth   a  
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
    Language modeling loss (for next-token prediction).
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.text_config.vocab_size)`):
    Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
    It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

    Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
    `past_key_values` input) to speed up sequential decoding.
image_hidden_states (`torch.FloatTensor`, *optional*):
    A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
    image_hidden_states of the model produced by the vision encoder after projecting last hidden state.
audio_hidden_states (`torch.FloatTensor`, *optional*):
    A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
    audio_hidden_states of the model produced by the audio encoder and after projecting the last hidden state.
shared_kv_states (`dict`, *optional*):
    Dictionary mapping layer type strings to tuples of (key_states, value_states) tensors.
    Used to pass shared KV states between layers during KV sharing.
NrA   rB   rC   rB   rP   rQ   rS   rS   h   s7    * MQd3ellELL&@ AABTIPrP   rS   c                   j    \ rS rSr% SrSr\\\\	R                  \	R                  4   4   S-  \S'   Srg)Gemma4TextModelOutputWithPast   a!  
BaseModelOutputWithPast extended with shared_kv_states for KV sharing.

Args:
    shared_kv_states (`dict`, *optional*):
        Dictionary mapping layer type strings to tuples of (key_states, value_states) tensors.
        Used to pass shared KV states between layers during KV sharing.
NrA   rB   rC   rB   rP   rQ   rV   rV      s7     MQd3ellELL&@ AABTIPrP   rV   c                   B    \ rS rSr% SrSr\R                  S-  \S'   Sr	g)Gemma4AudioModelOutput   z
attention_mask (`torch.BoolTensor`, *optional*):
    A torch.BoolTensor of shape `(batch_size, num_frames)`. True for valid positions, False for padding.
Nattention_maskrB   )
rD   rE   rF   rG   rH   r[   rL   
BoolTensorrN   rO   rB   rP   rQ   rY   rY      s    
 /3NE$$t+2rP   rY   c                   |   ^  \ rS rSrS\\-  S\S\SS4U 4S jjrS\R                  S\R                  4S	 jr
S
rU =r$ )Gemma4ClippableLinear   configin_featuresout_featuresreturnNc                   > [         TU ]  5         UR                  U l        [        R                  " X#SS9U l        U R                  (       a  U R                  S[        R                  " [        S5      * 5      5        U R                  S[        R                  " [        S5      5      5        U R                  S[        R                  " [        S5      * 5      5        U R                  S[        R                  " [        S5      5      5        g g )NFbias	input_mininf	input_max
output_min
output_max)
super__init__use_clipped_linearsr   Linearlinearregister_bufferrL   tensorfloat)selfr`   ra   rb   	__class__s       rQ   rm   Gemma4ClippableLinear.__init__   s     	#)#=#= iiF##  ellE%L=.IJ  ell5<.HI  u||U5\M/JK  u||E%L/IJ	 $rP   hidden_statesc                    U R                   (       a+  [        R                  " XR                  U R                  5      nU R                  U5      nU R                   (       a+  [        R                  " XR                  U R                  5      nU$ N)rn   rL   clamprg   ri   rp   rj   rk   )rt   rw   s     rQ   forwardGemma4ClippableLinear.forward   sX    ##!KK~~t~~VMM2##!KKXMrP   )rp   rn   )rD   rE   rF   rG   r=   r:   intrm   rL   rM   r{   rO   __classcell__ru   s   @rQ   r^   r^      sY    K"%66K K 	K
 
K 	U\\ 	ell 	 	rP   r^   c                       \ rS rSrSrg)Gemma4RMSNorm   rB   NrD   rE   rF   rG   rO   rB   rP   rQ   r   r          rP   r   c                      ^  \ rS rSr% Sr\R                  \S'   S\4U 4S jjr	\R                  " 5       S\R                  S\R                  4S j5       rS	rU =r$ )
 Gemma4AudioRelPositionalEncoding   zSinusoidal relative positional encoding for the audio encoder.

Produces position embeddings of shape [1, context_size // 2 + 1, hidden_size] with
concatenated [sin..., cos...] layout matching the original Gemma4 convention.
inv_timescalesr`   c                   > [         TU ]  5         UR                  U l        UR                  UR                  -   S-
  UR
                  -   U l        SnSnU R                  S-  n[        R                  " X2-  5      [        US-
  S5      -  nU[        R                  " [        R                  " U5      U* -  5      -  nU R                  SUR                  S5      R                  S5      SS9  g )	Nr9         ?     @r$   r   r   F
persistent)rl   rm   hidden_sizeattention_chunk_sizeattention_context_leftattention_context_rightcontext_sizemathlogmaxrL   exparangerq   	unsqueeze)rt   r`   min_timescalemax_timescalenum_timescaleslog_timescale_incrementr   ru   s          rQ   rm   )Gemma4AudioRelPositionalEncoding.__init__   s    !--''&*G*GG!KfNlNll 	 ))Q."&((=+H"ICP^abPbdeLf"f&5<<3OSjRj3j)kk-~/G/G/J/T/TUV/WdijrP   rw   rc   c                 b   [         R                  " U R                  S-  SSUR                  S9nUS   nX R                  R                  UR                  S9-  n[         R                  " [         R                  " U5      [         R                  " U5      /SS9nUR                  UR                  S9$ )Nr$   device.Ndimdtype)
rL   r   r   r   r   tocatsincosr   )rt   rw   position_idsscaled_time	pos_embeds        rQ   r{   (Gemma4AudioRelPositionalEncoding.forward   s    ||D$5$5$:B=K_K_`#I."%8%8%;%;=CWCW%;%XXIIuyy5uyy7MNTVW	||-"5"5|66rP   )r   r   )rD   rE   rF   rG   rH   rL   rM   rN   r:   rm   no_gradr{   rO   r~   r   s   @rQ   r   r      sS     LL k0 k ]]_7U\\ 7ell 7 7rP   r   c                   f  ^  \ rS rSrSrS\S\4U 4S jjrS\R                  S\R                  4S jr
S\R                  S\R                  4S	 jrS
\R                  S\R                  4S jr SS\R                  S\R                  S\R                  S-  S\\R                  S4   4S jjrSrU =r$ )Gemma4AudioAttention   z3Chunked local attention with relative position biasr`   	layer_idxc                   > [         TU ]  5         Xl        X l        UR                  U l        UR                  UR                  -  U l        UR                  U l	        U R                  S-  [        R                  " S5      -  U l        [        R                  " S[        R                  -   5      [        R                  " S5      -  U l        UR                  U l        UR"                  S-
  U l        UR&                  U l        U R                   U R$                  -   U R(                  -   U l        [-        XR                  U R                  U R                  -  5      U l        [-        XR                  U R                  U R                  -  5      U l        [-        XR                  U R                  U R                  -  5      U l        [-        XR                  UR                  5      U l        [6        R8                  " UR                  U R                  U R                  -  SS9U l        [6        R<                  " [>        R@                  " U R                  5      5      U l!        U RE                  S[>        RF                  " U R
                  5      SS9  g )N      r$   r9   Fre   softcapr   )$rl   rm   r`   r   attention_logit_capattention_logits_soft_capr   num_attention_headshead_dim	num_headsr   r   q_scaleek_scaler   
chunk_sizer   max_past_horizonr   max_future_horizonr   r^   q_projk_projv_projpostr   ro   relative_k_proj	ParameterrL   zerosper_dim_scalerq   rr   rt   r`   r   ru   s      rQ   rm   Gemma4AudioAttention.__init__   s   ")/)C)C&**f.H.HH33t+txx{:xxDFF
+dhhqk9 55 & = = A"("@"@ OOd.C.CCdF]F]]+F4F4FY]YfYfHfg+F4F4FY]YfYfHfg+F4F4FY]YfYfHfg)&2D2DfFXFXY	!yy););T^^dmm=[bgh\\%++dmm*DEYT5S5S(TafgrP   rw   rc   c           	         UR                   u  p#pEX0R                  -   S-
  U R                  -  nX`R                  -  U-
  n[        R                  " USSSSSU45      nUR	                  X&U R                  XE5      R                  5       $ )zSplits a `(batch_size, seq_len, num_heads, head_dim)` tensor into non-overlapping blocks of `chunk_size` along the sequence dim.r9   r   )shaper   Fpadreshape
contiguous)rt   rw   
batch_sizeseq_lenr   r   
num_blocksr   s           rQ   _convert_to_block&Gemma4AudioAttention._convert_to_block   s|    3@3F3F0
Y/!3G
??*W4maAq!S-AB$$ZT__ibmmoorP   c           
      @   UR                   u  p#pE[        R                  " USSSSU R                  U R                  U R
                  -   S-
  45      nUR                  SU R                  U R
                  5      n[        R                  " USS5      nUR                  5       $ )z`Extracts overlapping context windows of `context_size` for every block, strided by `chunk_size`.r   r9   r   r$   )r   r   r   r   r   r   unfoldr   rL   movedimr   )rt   rw   r   r   r   r   s         rQ   _extract_block_context+Gemma4AudioAttention._extract_block_context   s    3@3F3F0
YAq!Q(=(=t?V?VY]YhYh?hkl?lm
 &,,Q0A0A4??SmR;''))rP   xc                     UR                   u  p#pEnU R                  n[        R                  " USUS-   U-
  45      nUR	                  X#XEUS-   -  5      nUSSXW-  24   nUR	                  X#XEU5      $ )zjRelative position shift for blocked attention. See appendix B of https://huggingface.co/papers/1901.02860.r   r9   .N)r   r   r   r   view)rt   r   r   r   r   
block_sizeposition_lengthr   s           rQ   
_rel_shiftGemma4AudioAttention._rel_shift	  s    IJF
z((EE!a)O;<=FF:*LSTDT6UVc.Z.../vvjZ\RRrP   Nposition_embeddingsr[   c                    UR                   u  pEnXEU R                  U R                  4nU R                  U5      R	                  5       R                  U5      nU R                  U5      R	                  5       R                  U5      n	U R                  U5      R	                  5       R                  U5      n
XR                  -  [        R                  " U R                  5      -  nXR                  -  n	U R                  U5      nU R                  U	5      n	U R                  U
5      n
UR                   S   nU R                  U5      nUR                  SU R                  U R                  5      nUR!                  UR"                  S9nUR%                  SSSSS5      nXR%                  SSSSS5      -  nUR'                  X@R                  SU R                  5      nXR%                  SSS5      -  nUR'                  X@R                  XR(                  S5      nU R+                  U5      nUU-   nUU R,                  -  n[.        R0                  " U5      nUU R,                  -  nUb4  UR3                  UR5                  5       U R6                  R8                  5      n[        R:                  " US[.        R<                  S9R!                  U
R"                  5      nUU
R%                  SSSSS5      -  nUR%                  SSSSS5      R'                  XKU R(                  -  S5      nUS S 2S U24   R?                  5       nU RA                  UR!                  U R@                  RB                  RD                  R"                  S95      nUU4$ )	Nr9   r   r   r   r   r$      )r   r   )#r   r   r   r   rs   r   r   r   r   r   softplusr   r   r   r   r   r   r   permuter   r   r   r   rL   tanhmasked_filllogical_notr`   attention_invalid_logits_valuesoftmaxfloat32r   r   rp   weight)rt   rw   r   r[   r   
seq_length_hidden_shapequery_states
key_statesvalue_statesr   relative_key_statesqueries	matrix_acqueries_flat	matrix_bdattn_weightsattn_outputs                      rQ   r{   Gemma4AudioAttention.forward  s    %2$7$7!
"N{{=1779>>|L[[/557<<\J
{{=1779>>|L#ll2QZZ@R@R5SS,,.
--l;00<
22<@!''*
"223FG166r4>>4==Y144<;M;M4N&&q!Q1500Aq!Q??	z>>2t}}U #>#>q!Q#GG	%%j..*oo_ab	OOI.	 9,#dll2zz,/#dll2%'33**,dkk.X.XL yy2U]]KNN|OaOab"\%9%9!Q1a%HH!))!Q1a8@@Z^ZiZiMikmn!![j[.1<<>iiTYY5E5E5L5L5R5R STL((rP   )r   r   r`   r   r   r   r   r   r   r   r   r   r   r   r   r   r   ry   )rD   rE   rF   rG   rH   r:   r}   rm   rL   rM   r   r   r   r\   rK   r{   rO   r~   r   s   @rQ   r   r      s    =h0 hS h4pu|| p p*ELL *U\\ *SELL SU\\ S 37	1)||1) #\\1) ((4/	1)
 
u||T!	"1) 1)rP   r   c                   l   ^  \ rS rSrU 4S jrSS\R                  S\R                  S-  4S jjrSrU =r	$ )	'Gemma4AudioSubSampleConvProjectionLayeriF  c           	         > [         TU ]  5         [        R                  " UUSSSSS9U l        [        R
                  " X#SSS9U l        [        R                  " 5       U l        g )N)r   r   )r$   r$   r9   F)in_channelsout_channelskernel_sizestridepaddingrf   T)epselementwise_affinerf   )	rl   rm   r   Conv2dconv	LayerNormnormReLUact)rt   r   r  norm_epsru   s       rQ   rm   0Gemma4AudioSubSampleConvProjectionLayer.__init__G  sU    II#%
	 LLPT[`a	779rP   Nrw   maskc           
         Ub(  UR                  UR                  S9nXS S 2S S S 2S 4   -  nU R                  UR                  U R                  R                  R                  5      5      nU R                  U R                  UR                  SSSS5      5      R                  SSSS5      R                  5       5      nUb  US S 2S S S24   nX4$ )Nr   r   r$   r   r9   )	r   r   r  r   r   r  r
  r   r   )rt   rw   r  s      rQ   r{   /Gemma4AudioSubSampleConvProjectionLayer.forwardT  s    77-"6"677D)D!T1A,BBM		-"2"24993C3C3I3I"JK=+@+@Aq!+L!M!U!UVWYZ\]_`!a!l!l!no3Q3<D""rP   )r  r  r
  ry   )
rD   rE   rF   rG   rm   rL   rM   r{   rO   r~   r   s   @rQ   r   r   F  s-    #U\\ #9L # #rP   r   c            	          ^  \ rS rSrS\4U 4S jjr S
S\R                  S\R                  S-  S\\R                  \R                  4   4S jjr	S	r
U =r$ )"Gemma4AudioSubSampleConvProjectionib  r`   c                 d  > [         TU ]  5         [        SUR                  S   UR                  S9U l        [        UR                  S   UR                  S   UR                  S9U l        UR                  S   S-  UR                  S   -  n[        R                  " X!R                  SS9U l
        g )Nr9   r   )r   r  r  r   Fre   )rl   rm   r   subsampling_conv_channelsrms_norm_epslayer0layer1r   ro   r   input_proj_linear)rt   r`   proj_input_dimru   s      rQ   rm   +Gemma4AudioSubSampleConvProjection.__init__c  s    =99!<((

 >88;99!<((

 !::1=BfFfFfghFii!#>;M;MTY!ZrP   Ninput_featuresinput_features_maskrc   c                    UR                  S5      nU R                  X25      u  p4U R                  X45      u  p4UR                  u  pVpvUR	                  SSSS5      R                  5       R                  XWS5      nU R                  U5      U4$ )Nr9   r   r$   r   r   )r   r  r  r   r   r   r   r  )rt   r  r  rw   r  r   r   r   s           rQ   r{   *Gemma4AudioSubSampleConvProjection.forwardr  s    
 '003"kk-M"kk->$1$7$7!
w%--aAq9DDFNNzdfg%%m4d::rP   )r  r  r  ry   )rD   rE   rF   rG   r:   rm   rL   rM   rK   r{   rO   r~   r   s   @rQ   r  r  b  s\    [0 [$ 48;; #\\D0; 
u||U\\)	*	; ;rP   r  c                   j   ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ )Gemma4AudioFeedForwardi  r`   c                   > [         TU ]  5         Xl        [        XR                  UR                  S-  5      U l        [        XR                  S-  UR                  5      U l        [        UR                  5      U l        [        UR                  5      U l	        [        UR                     U l        UR                  U l        UR                  U l        g )Nr   )rl   rm   r`   r^   r   ffw_layer_1ffw_layer_2r   pre_layer_normpost_layer_normr
   
hidden_actact_fngradient_clippingresidual_weightpost_layer_scalert   r`   ru   s     rQ   rm   Gemma4AudioFeedForward.__init__  s    09K9KVM_M_bcMcd09K9Ka9OQWQcQcd+F,>,>?,V-?-?@V../!'!9!9 & 6 6rP   rw   rc   c                    [        U R                  [        R                  " U R                  R
                  R                  R                  5      R                  5      nUn[        R                  " X* U5      nU R                  U5      nU R	                  U5      nU R                  U5      nU R                  U5      n[        R                  " X* U5      nU R                  U5      nXR                  -  nX-  nU$ ry   )minr)  rL   finfor#  rp   r   r   r   rz   r%  r(  r$  r&  r+  )rt   rw   r)  residuals       rQ   r{   Gemma4AudioFeedForward.forward  s     6 6DDTDTD[D[DbDbDhDh8i8m8mn M3EGXY++M:((7M2((7M3EGXY,,];...!rP   )r(  r`   r#  r$  r)  r&  r+  r%  rD   rE   rF   rG   r:   rm   rL   rM   r{   rO   r~   r   s   @rQ   r!  r!    s0    70 7U\\ ell  rP   r!  c                   l   ^  \ rS rSr\S 5       rS\R                  S\R                  4U 4S jjrSr	U =r
$ )Gemma4AudioCausalConv1di  c                 n    U R                   S   S-
  U R                  S   -  S-   nXR                  S   -
  $ )Nr   r9   )r  dilationr  )rt   effective_kernel_sizes     rQ   left_pad Gemma4AudioCausalConv1d.left_pad  s<    !%!1!1!!4q!8DMM!<L Lq P${{1~55rP   r   rc   c                 x   > [         R                  R                  XR                  S45      n[        TU ]  U5      $ )Nr   )r   r   r   r9  rl   r{   )rt   r   ru   s     rQ   r{   Gemma4AudioCausalConv1d.forward  s1     MMa--!34wq!!rP   rB   )rD   rE   rF   rG   r   r9  rL   rM   r{   rO   r~   r   s   @rQ   r5  r5    s;     6 6"<<" 
	" "rP   r5  c                   j   ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ )Gemma4AudioLightConv1di  r`   c                   > [         TU ]  5         Xl        [        XR                  UR                  S-  5      U l        [        XR                  UR                  5      U l        [        UR                  UR                  UR                  UR                  SS9U l	        [        UR                  UR                  SS9U l        [        UR                  UR                  SS9U l        [        UR                     U l        UR"                  U l        g )Nr$   F)r   r  r  groupsrf   Tr  
with_scale)rl   rm   r`   r^   r   linear_start
linear_endr5  conv_kernel_sizedepthwise_conv1dr   r  r%  	conv_normr
   r'  r(  r)  r,  s     rQ   rm   Gemma4AudioLightConv1d.__init__  s    1&:L:LfN`N`cdNde/8J8JFL^L^_ 7**++//%%!
 ,F,>,>FDWDWdhi&v'9'9v?R?R_cdV../!'!9!9rP   rw   rc   c                 B   UnU R                  U5      nU R                  U5      n[        R                  R	                  USS9nU R                  UR                  SS5      5      R                  SS5      n[        U R                  [        R                  " U R                  R                  R                  R                  5      R                  5      n[        R                  " X* U5      nU R!                  U5      nU R#                  U5      nU R%                  U5      nX-  nU$ )Nr   r   r9   r$   )r%  rC  r   r   glurF  	transposer/  r)  rL   r0  rp   r   r   r   rz   rG  r(  rD  )rt   rw   r1  r)  s       rQ   r{   Gemma4AudioLightConv1d.forward  s     ++M:))-8))-R)@--m.E.Ea.KLVVWXZ[\   6 6DDUDUD\D\DcDcDiDi8j8n8noM3EGXY}5M26!rP   )r(  r`   rG  rF  r)  rD  rC  r%  r3  r   s   @rQ   r>  r>    s0    :0 :(U\\ ell  rP   r>  c            
          ^  \ rS rSrS\S\4U 4S jjrS\R                  S\R                  S-  S\R                  S	\
\   S
\R                  4
S jrSrU =r$ )Gemma4AudioLayeri  r`   r   c                 l  > [         TU ]  5         Xl        [        U5      U l        [        U5      U l        [        X5      U l        [        U5      U l	        [        UR                  5      U l        [        UR                  5      U l        [        UR                  5      U l        UR                  U l        g ry   )rl   rm   r`   r!  feed_forward1feed_forward2r   	self_attnr>  lconv1dr   r   norm_pre_attnnorm_post_attnnorm_outr)  r   s      rQ   rm   Gemma4AudioLayer.__init__  s    3F;3F;-f@-f5*6+=+=>+F,>,>?%f&8&89!'!9!9rP   rw   r[   Nr   kwargsrc   c                 8   [        U R                  [        R                  " U R                  R
                  R                  5      R                  5      nU R                  U5      nUn[        R                  " X* U5      nU R	                  U5      nU R                  UUUS9u  p[        R                  " X* U5      nU R                  U5      nX-  nU R                  U5      nU R                  U5      n[        R                  " X* U5      nU R                  U5      nU$ )N)rw   r   r[   )r/  r)  rL   r0  rT  r   r   r   rP  rz   rR  rU  rS  rQ  rV  )rt   rw   r[   r   rX  r)  r1  r   s           rQ   r{   Gemma4AudioLayer.forward  s      6 6DDVDVD]D]DcDc8d8h8hi**=9 M3EGXY**=9>>' 3) * 
 M3EGXY++M:!]3**=9M3EGXYm4rP   )	r`   rP  rQ  r)  rS  rV  rU  rT  rR  )rD   rE   rF   rG   r:   r}   rm   rL   rM   r\   r   r   r{   rO   r~   r   s   @rQ   rN  rN    sn    :0 :S : ||  ((4/  #\\	 
 +,  
   rP   rN  c                      ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  S\R                  4S jrS\R                  S\R                  S\R                  S\R                  4S	 jr	S
r
U =r$ )Gemma4VisionPatchEmbedderi$  r`   c                   > [         TU ]  5         Xl        UR                  U l        UR                  U l        UR
                  U l        [        R                  " SU R                  S-  -  U R                  SS9U l        [        R                  " [        R                  " SU R
                  U R                  5      5      U l        g )Nr   r$   Fre   )rl   rm   r`   r   
patch_sizeposition_embedding_sizer   ro   
input_projr   rL   onesposition_embedding_tabler,  s     rQ   rm   "Gemma4VisionPatchEmbedder.__init__%  s    !-- ++'-'E'E$))A(:$:D<L<LSXY(*UZZ4C_C_aeaqaq5r(s%rP   pixel_position_idspadding_positionsrc   c                 B   UR                  SS9n[        R                  " X0R                  S9nUR	                  SSSS5      R                  U R                  5      nX@R                  -  nUR                  SS9n[        R                  " UR                  S5      S	U5      nU$ )
zDPrepare patch positions map for matmul with positon embedding table.r   r/  )num_classesr$   r9   r   r   r           )rz   r   one_hotr_  r   r   rb  sumrL   wherer   )rt   rd  re  clamped_positionsrj  r   s         rQ   _position_embeddings.Gemma4VisionPatchEmbedder._position_embeddings/  s     /444;))-;W;WX//!Q1-001N1NO%(E(EE155!5<#kk*;*E*Eb*I3Pcd""rP   pixel_valuesc                     SUS-
  -  nU R                  UR                  U R                   R                  R                  5      5      nU R	                  X#5      nXE-   $ )Nr$         ?)r`  r   r   r   rn  )rt   rp  rd  re  rw   r   s         rQ   r{   !Gemma4VisionPatchEmbedder.forward<  sU     L3./8N8N8T8T(UV"778J^22rP   )r`   r   r`  r^  r_  rb  )rD   rE   rF   rG   r=   rm   rL   rM   rn  r{   rO   r~   r   s   @rQ   r\  r\  $  sz    t1 t#u|| #X]XdXd #iniuiu #3!LL3>Cll3_d_k_k3	3 3rP   r\  c                   @  ^  \ rS rSrSrS\4U 4S jjrS\R                  S\R                  S\	S\
\R                  \R                  4   4S	 jr SS\R                  S\R                  S\R                  S\	S
-  S\
\R                  \R                  4   4
S jjrSrU =r$ )Gemma4VisionPooleriF  z9Scaling and optional spatial pooling for vision encodingsr`   c                 l   > [         TU ]  5         UR                  U l        U R                  S-  U l        g )Nrr  )rl   rm   r   root_hidden_sizer,  s     rQ   rm   Gemma4VisionPooler.__init__I  s/    !-- $ 0 0# 5rP   rw   rd  lengthrc   c                 z   UR                   S   n[        XC-  S-  5      nUS-  nXc-  U:w  a'  [        SUR                    SU SU< SU< SU S	35      eUR                  S
S9nUS   R	                  SSS9S
   S-   n[
        R                  " XuSS9n	U	S   X-  U	S   -  -   n	[        R                  " U	R                  5       U5      R                  5       U-  n
U
R                  SS5      UR                  5       -  n[
        R                  " U
S
:H  R                  SS95      nUR                  UR                  5      U4$ )z
2D spatial pooling according to patch positions.
Pools the input tokens by averaging patches within a `k^2` grid, where `k` is determined by the ratio between
input and output lengths
r9   rr  r$   zCannot pool z to z: k=z^2 times length=z	 must be .r   rg  .r   r   Tr   keepdimfloor)rounding_mode).r9   r   )r   r}   
ValueErrorrz   r   rL   divr   rj  longrs   rK  r   allr   r   )rt   rw   rd  ry  input_seq_lenk	k_squaredrm  max_xkernel_idxsweightsoutputr  s                rQ   _avg_pool_by_positions)Gemma4VisionPooler._avg_pool_by_positionsN  s`    &++A.(S01qD	.}2234xu!EVviW`an`oopq  /444;!&)--"d-CAFJii 1GL!&)UZ;v;N,NN))K,,.7==?)K""1a(=+>+>+@@  'Q,!3!3!3!:;yy,,-t33rP   Nre  output_lengthc                    XAR                   S   :  a  [        SU SUR                   S    S35      eUR                  UR                  S5      S5      nUR                   S   U:w  a  U R	                  XU5      u  pXR
                  -  nX4$ )Nr9   z*Cannot output more soft tokens (requested z) than there are patches (z9). Change the value of `num_soft_tokens` when processing.r   ri  )r   r  r   r   r  rw  )rt   rw   rd  re  r  s        rQ   r{   Gemma4VisionPooler.forwardi  s     ..q11<]O L"((+,,eg 
 &112C2M2Mb2QSVWq!]2/3/J/J=0,M 	...//rP   )r   rw  ry   )rD   rE   rF   rG   rH   r=   rm   rL   rM   r}   rK   r  r{   rO   r~   r   s   @rQ   ru  ru  F  s    C61 6
4"\\4?D||4UX4	u||U\\)	*4@ %)0||0 "LL0 !<<	0
 Tz0 
u||U\\)	*0 0rP   ru  c                   0   ^  \ rS rSrS\4U 4S jjrSrU =r$ )Gemma4VisionMLPi  r`   c                   > [         TU ]  X5        [        XR                  U R                  5      U l        [        XR                  U R                  5      U l        [        XR                  U R                  5      U l        g ry   )rl   rm   r^   r   intermediate_size	gate_projup_proj	down_projr,  s     rQ   rm   Gemma4VisionMLP.__init__  s^    &.v7G7GI_I_`,V5E5EtG]G]^.v7M7MtO_O_`rP   )r  r  r  )rD   rE   rF   rG   r=   rm   rO   r~   r   s   @rQ   r  r    s    a1 a arP   r  r   r   r   r   unsqueeze_dimrc   c           
         UR                   S   nU R                   S   nSUSU-  -  -  nUS::  a  [        SU SU SU S35      eU/U-  n[        R                  " XSS9n	[        R                  " XSS9n
[        R                  " X(SS9n[	        U5       Vs/ s H  n[        X   X   X   US	9PM     nn[        R                  " USS9$ s  snf )
a#  Applies multidimensional RoPE to inputs.

Args:
    x (`torch.Tensor`): The tensor to embed.
    cos (`torch.Tensor`): The cosine part of the rotary embedding.
    sin (`torch.Tensor`): The sine part of the rotary embedding.
    position_ids (`torch.Tensor`, *optional*):
        If position_ids.ndim + 2 == x.ndim, then this function passes through to `apply_rotary_pos_emb()`.
        Otherwise, position_ids is used to split the inputs, x, into multiple pieces, where each piece is fed to
        `apply_rotary_pos_emb()`, and then concatenated back together.
    unsqueeze_dim (`int`, *optional*, defaults to 1):
        The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
        sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
        that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
        k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
        cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
        the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.

Returns:
  Tensor of shape [B, L, N, H] with RoPE applied.
r   r$   r   zEInvalid configuration: num_rotated_channels_per_dim must be > 0, got z (num_input_channels=z, ndim=)r   )r   r   r   r  )r   r  rL   splitranger4   r   )r   r   r   r   r  ndimnum_input_channelsnum_rotated_channels_per_dimsplit_sizesx_parts	cos_parts	sin_partsr  y_partss                 rQ   apply_multidimensional_roper    s
   8 b!D#$(:q4x(H#I #q(,--BCUBV WF!
 	
 0047Kkk!b1GC"5IC"5I t A 	j'		
    99W"%%s   C
c                       \ rS rSr\   SS\S-  S\R                  S-  S\S-  S\	S\
4   4S jj5       r\R                  " 5       \S	 5       5       rS
rg)Gemma4VisionRotaryEmbeddingi  Nr`   r   r   rc   ztorch.Tensorc           	      "   U R                   S   n[        U SS5      =(       d    U R                  U R                  -  nUS-  nSnSU[        R
                  " SUS[        R                  S9R                  U[        R                  S9U-  -  -  nXv4$ )	aH  
Computes the inverse frequencies according to the original RoPE implementation
Args:
    config ([`~transformers.PreTrainedConfig`]):
        The model configuration.
    device (`torch.device`):
        The device to use for initialization of the inverse frequencies.
    seq_len (`int`, *optional*):
        The current sequence length. Unused for this type of RoPE.
Returns:
    Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
    post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).

rope_thetar   Nr$   r   r   r   )r   r   )	rope_parametersgetattrr   r   rL   r   int64r   rs   )r`   r   r   baser   spatial_dimattention_factorinv_freqs           rQ   compute_default_rope_parameters;Gemma4VisionRotaryEmbedding.compute_default_rope_parameters  s    & %%l3fj$/c63E3EIcIc3c QhQQekkBEEV[`[f[fEgjuuw
 ))rP   c                 $   U R                   S S S 2S 4   R                  5       R                  UR                  S   SS5      R	                  UR
                  5      n[        UR
                  R                  [        5      (       a0  UR
                  R                  S:w  a  UR
                  R                  OSn/ / pe[        S5       H  nUS S 2S S 2U4   nUS S 2S S S 24   R                  5       n	[        USS9   UR                  5       U	R                  5       -  R                  SS5      n
[        R                  " X4SS	9nUR                  5       U R                  -  nUR!                  5       U R                  -  nS S S 5        UR#                  W5        UR#                  W5        M     [        R                  " USS	9R	                  UR$                  S
9n[        R                  " USS	9R	                  UR$                  S
9nX4$ ! , (       d  f       N= f)Nr   r   r9   mpscpur$   F)device_typeenabledr   r   )r  rs   expandr   r   r   
isinstancetyperJ   r  r    rK  rL   r   r   attention_scalingr   appendr   )rt   r   r   inv_freq_expandedr  all_cosall_sinidim_position_idsdim_position_ids_expandedfreqsembr   r   s                 rQ   r{   #Gemma4VisionRotaryEmbedding.forward  s    !MM$4-8>>@GGHZHZ[\H]_acdehhijiqiqr'1!((--'E'E!((--[`J`ahhmmfk rqA+Aq!G4(8D!(D(J(J(L%KG*0025N5T5T5VVaabcefgiiB7ggi$"8"88ggi$"8"88	 H
 NN3NN3  iiR(++!''+:iiR(++!''+:x HGs   6BH
H	rB   NNN)rD   rE   rF   rG   staticmethodr=   rL   r   r}   rK   rs   r  r   r   r{   rO   rB   rP   rQ   r  r    s    ,0&*" *"T) *t# * t * 
~u$	%	 *  *D ]]_  rP   r  c                     ^  \ rS rSrS\S\4U 4S jjr   SS\R                  S\R                  S\R                  S-  S	\R                  S-  S
\
\   S\\R                  \R                  S-  \\R                     S-  4   4S jjrSrU =r$ )Gemma4VisionAttentioni  r`   r   c                 &  > [         TU ]  XU5        U ?U ?U ?SU l        SU l        [        XR                  UR                  U R                  -  5      U l        [        XR                  UR                  U R                  -  5      U l        [        XR                  UR                  U R                  -  5      U l        [        XR                  U R                  -  UR                  5      U l        [!        U R                  UR"                  SS9U l        g )Nr   FrA  )rl   rm   attn_logit_softcappingsliding_window
is_slidingscaling	is_causalr^   r   num_key_value_headsr   r   r   r   r   o_projr   r  v_normr   s      rQ   rm   Gemma4VisionAttention.__init__  s    y1'O+F4F4FHbHbeiererHrs+F4F4FHbHbeiererHrs+F4F4FHbHbeiererHrs+F4N4NQUQ^Q^4^`f`r`rs#DMMv7J7JW\]rP   Nrw   r   r[   r   rX  rc   c                 L   UR                   S S n/ UQSPU R                  P7nUu  pU R                  U5      R                  U5      n
U R	                  U
5      n
[        XX5      n
U
R                  SS5      n
U R                  U5      R                  U5      nU R                  U5      n[        XX5      nUR                  SS5      nU R                  U5      R                  U5      nU R                  U5      nUR                  SS5      n[        R                  " U R                  R                  [        5      nU" U U
UUU4U R                   (       a  U R"                  OSU R$                  S.UD6u  pUR&                  " / UQSP76 R)                  5       nU R+                  U5      nX4$ )Nr   r9   r$   ri  )dropoutr  )r   r   r   r   q_normr  rK  r   k_normr   r  r   get_interfacer`   _attn_implementationr5   trainingattention_dropoutr  r   r   r  )rt   rw   r   r[   r   rX  input_shaper   r   r   r   r   r   attention_interfacer   r   s                   rQ   r{   Gemma4VisionAttention.forward  s    $))#2.88b8$--8&{{=166|D{{<02<cX#--a3[[/44\B
[[,
0#T
))!Q/
{{=166|D{{<0#--a3(?(M(MKK,,.E)
 %8	%
 /3mmD**LL	%
 	%
! "));;;;FFHkk+.((rP   )r  r   r  r   r  r  r   r  )rD   rE   rF   rG   r=   r}   rm   rL   rM   
LongTensorr   r   rK   r{   rO   r~   r   s   @rQ   r  r    s    ^1 ^c ^  -1.204,)||,) #\\,) t+	,)
 &&-,) +,,) 
u||U\\D0%2E2LL	M,) ,)rP   r  c                     ^  \ rS rSrS\S\4U 4S jjr   SS\R                  S\R                  S\R                  S-  S	\R                  S-  S
\
\   S\\R                  \\R                  \R                  4   S-  4   4S jjrSrU =r$ )Gemma4VisionEncoderLayeri<  r`   r   c                 b   > [         TU ]  XU5        [        XS9U l        [	        U5      U l        g Nr`   r   )rl   rm   r  rR  r  mlpr   s      rQ   rm   !Gemma4VisionEncoderLayer.__init__=  s*    y1.fR"6*rP   Nrw   r   r[   r   rX  rc   c                     UnU R                  U5      nU R                  " SUUUUS.UD6u  pU R                  U5      nXa-   nUnU R                  U5      nU R	                  U5      nU R                  U5      nXa-   nU$ )N)rw   r   r[   r   rB   )input_layernormrR  post_attention_layernormpre_feedforward_layernormr  post_feedforward_layernorm)rt   rw   r   r[   r   rX  r1  r   s           rQ   r{    Gemma4VisionEncoderLayer.forwardB  s     !,,];>> 
' 3)%	

 
 55mD 0 66}E/77F 0rP   )r  rR  r  )rD   rE   rF   rG   r=   r}   rm   rL   rM   r  r   r   rK   FloatTensorr{   rO   r~   r   s   @rQ   r  r  <  s    +1 +c + -1.204|| #\\ t+	
 &&- +, 
u  %(9(95;L;L(L"MPT"TT	U rP   r  c                      ^  \ rS rSrS\4U 4S jjr SS\R                  S\R                  S\R                  S-  S\	\
   S	\4
S
 jjrSrU =r$ )Gemma4VisionEncoderia  r`   c           
        > [         TU ]  5         Xl        UR                  U l        [        U5      U l        [        R                  " [        U R                  5       Vs/ s H  n[        XS9PM     sn5      U l        g s  snf r  )rl   rm   r`   num_hidden_layers
num_layersr  
rotary_embr   
ModuleListr  r  layers)rt   r`   r  ru   s      rQ   rm   Gemma4VisionEncoder.__init__b  se     225f=mmKPQUQ`Q`KabKaa%VAKab
bs   A>Ninputs_embedsr[   rd  rX  rc   c                     [        U R                  UUS9nUnU R                  XS5      nU R                  SU R                  R                    H  nU" U4UUUS.UD6nM     [        US9$ )zw
pixel_position_ids (torch.Tensor):
    Patch positions as (x, y) coordinates in the image as [batch, num_patches, 2].
)r`   r  r[   N)r[   r   r   last_hidden_state)r   r`   r  r  r  r   )rt   r  r[   rd  rX  rw   r   decoder_layers           rQ   r{   Gemma4VisionEncoder.forwardk  s     3;;')
 &"oomP "[[)H4;;+H+HIM)-$7/	
 M J 'GGrP   )r`   r  r  r  ry   )rD   rE   rF   rG   r=   rm   rL   rM   r  r   r   r   r{   rO   r~   r   s   @rQ   r  r  a  so    
1 
 7;	H||H H ",,t3	H
 +,H 
!H HrP   r  c                   4   ^  \ rS rSrS\S\4U 4S jjrSrU =r$ )Gemma4TextMLPi  r`   r   c                    > UR                   UR                  -
  nX#s=:  =(       a    S:  Os  nUR                  =(       a    Un[        TU ]  5         UR
                  U(       a  SOS-  U l        g )Nr   r$   r9   )r  num_kv_shared_layersuse_double_wide_mlprl   rm   r  )rt   r`   r   first_kv_shared_layer_idxis_kv_shared_layerr  ru   s         rQ   rm   Gemma4TextMLP.__init__  sa    $*$<$<v?Z?Z$Z!&GGaG$88O=O!'!9!9BUQ[\!]rP   )r  )	rD   rE   rF   rG   r<   r}   rm   rO   r~   r   s   @rQ   r  r    s     ^/ ^C ^ ^rP   r  c                   &    \ rS rSrSS\4S jjrSrg)Gemma4TextRotaryEmbeddingi  Nr`   c                    [         R                  R                  U 5        UR                  U l        UR                  U l        Xl        [        UR                  5      U l        0 U l	        0 U l
        U R                   H  nU R                  R                  U   nUc  M!  US   =nS:w  a
  [        U   nOU R                  nX`R                  U'   XPR                  U'   X#S.nUS:X  a  US:X  a  SUS'   U" U R                  40 UD6u  pU R                  U S3US	S
9  U R                  U S3UR                  5       S	S
9  [!        X S3U	5        M     g )N	rope_typedefault)r   
layer_typefull_attentionproportionalglobal_head_dimhead_dim_key	_inv_freqFr   _original_inv_freq_attention_scaling)r   Modulerm   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenr`   setlayer_typesrope_init_fnsr	  r  r   r  rq   clonesetattr)
rt   r`   r   r  rope_paramsr	  rope_init_fnrope_init_fn_kwargscurr_inv_freqcurr_attention_scalings
             rQ   rm   "Gemma4TextRotaryEmbedding.__init__  sW   
		4 "("@"@$*$B$B!v112SU)+**J++55jAK"(55	)C29=#CC-9z*)2NN:&-3"N--)~2M6G#N34@4dPc4d1M  J<y!9=UZ [  J</A!BMDWDWDYfk lDL(:;=ST) +rP   )r`   r  r  r  r  r	  NN)rD   rE   rF   rG   r<   rm   rO   rB   rP   rQ   r  r    s    U/ U UrP   r  c                   &  ^  \ rS rSrSrS\S\4U 4S jjr SS\R                  S\R                  S	\R                  S-  S
\
\\\R                  \R                  4   4   S\S-  S\\   S\\R                  \R                  S-  4   4S jjrSrU =r$ )Gemma4TextAttentioni  z=Multi-headed attention from 'Attention Is All You Need' paperr`   r   c                    > [         TU ]  5         [        US5      (       a  UR                  U   OS U l        Xl        X l        U R                  S:H  U l        U R                  (       a  UR                  OS U l        U R                  (       d  UR                  (       a  UR                  OUR                  U l
        UR                  =(       a    U R                  (       + U l        U R                  (       a  UR                  OUR                  nUR                  U-  U l        SU l        U R
                  R$                  U l        UR&                  S:g  U l        U R
                  R*                  [-        U R
                  SS5      -
  nX$s=:  =(       a    S:  Os  U l        UR                  S U nU R.                  (       + =(       a6    U[1        U5      S-
  US S S2   R3                  UR                  U   5      -
  :H  U l        [6        R8                  " UR:                  UR                  U R                  -  UR<                  S	9U l        [A        U R                  URB                  S
9U l"        U R.                  (       d  [A        U R                  URB                  S
9U l#        [A        U R                  URB                  SS9U l$        [6        R8                  " UR:                  X0R                  -  UR<                  S	9U l%        U R                  (       d6  [6        R8                  " UR:                  X0R                  -  UR<                  S	9OS U l&        [6        R8                  " UR                  U R                  -  UR:                  UR<                  S	9U l'        g )Nr  sliding_attentionr   r  r  r   r9   r   re   )r   r  FrA  )(rl   rm   hasattrr  r  r`   r   r  r  r  r   attention_k_eq_vuse_alternative_attentionnum_global_key_value_headsr  r   num_key_value_groupsr  r  use_bidirectional_attentionr  r  r  r  lenindexstore_full_length_kvr   ro   r   attention_biasr   r   r  r  r  r  r   r   r  )rt   r`   r   r  r  prev_layersru   s         rQ   rm   Gemma4TextAttention.__init__  s   ;B6=;Y;Y&,,Y7_c"//-@@7;f33D6:oo&J`J`..flfufu)/)@)@)XEX&151O1OF--U[UoUo 	 %+$>$>BU$U!!%!>!>;;uD %)KK$A$AGDKKYoqrDs$s!"+"M"MA"M(()C*CD(,(?(?$? %/IQTU`QadeQehsbDi

%""9-
.R/ E/! ii : :T]] JQWQfQf
 $6;N;NO &&'DMMv?R?RSDK'6;N;N[`aDK))""$7--$GfNcNcDK
 55 		&,,.AMM.QX^XmXmn K ii&&68J8JQWQfQf
rP   Nrw   r   r[   rA   past_key_valuesrX  rc   c                    UR                   S S n/ UQSPU R                  P7nUu  pU R                  U5      R                  U5      nU R	                  U5      n[        XU
SS9nUR                  SS5      nU R                  (       aG  X@R                     u  pUR                  UR                  5      nUR                  UR                  5      nOU R                  U5      R                  U5      nU R                  b   U R                  U5      R                  U5      OUnU R                  U5      n[        XU
SS9nUR                  SS5      nU R                  U5      nUR                  SS5      nUb/  U R                  (       d  UR                  XU R                   5      u  pU R"                  (       a  X4X@R                  '   [$        R&                  " U R(                  R*                  [,        5      nU" U UUUU4U R.                  (       a  U R0                  OSU R2                  U R4                  S.UD6u  nnUR6                  " / UQSP76 R9                  5       nU R;                  U5      nUU4$ )Nr   r$   )r  r9   ri  )r  r  r  )r   r   r   r   r  r4   rK  r  r  r   r   r   r   r  r  updater   r/  r   r  r`   r  r5   r  r  r  r  r   r   r  )rt   rw   r   r[   rA   r3  rX  r  r   r   r   r   r   r   r  r   r   s                    rQ   r{   Gemma4TextAttention.forward  s>    $))#2.88b8$--8&{{=166|D{{<0+LsRST#--a3
 ""'7'H$J#|':':;J'??<+>+>?L]388FJLPKKLc4;;}5::<HisLZ0J-jsRSTJ#--a3J;;|4L'11!Q7L&t/F/F'6'='=jX\XfXf'g$J$$0:0H__-(?(M(MKK,,.E)
 %8
%
 /3mmD**LL..
%
 
%
!\ "));;;;FFHkk+.L((rP   )r  r`   r   r  r  r  r  r   r   r  r+  r  r  r   r  r  r/  r)  r  r   ry   )rD   rE   rF   rG   rH   r<   r}   rm   rL   rM   rI   rJ   rK   r   r   r   r{   rO   r~   r   s   @rQ   r$  r$    s    G/
/ /
C /
n )-=)||=) #\\=) t+	=)
 sE%,,*D$EEF=) =) -.=) 
u||U\\D00	1=) =)rP   r$  c                   0   ^  \ rS rSrS\4U 4S jjrSrU =r$ )Gemma4TextExpertsi/  r`   c                    > [         TU ]  5         UR                  U l        UR                  U l        [
        UR                     U l        g ry   )rl   rm   num_expertsmoe_intermediate_sizeintermediate_dimr
   hidden_activationr(  r,  s     rQ   rm   Gemma4TextExperts.__init__0  s<    !-- & < <V556rP   )r(  r<  r:  )rD   rE   rF   rG   r<   rm   rO   r~   r   s   @rQ   r8  r8  /  s    7/ 7 7rP   r8  c                      ^  \ rS rSrS\4U 4S jjrS\R                  S\\R                  \R                  4   4S jr	Sr
U =r$ )Gemma4TextRouteri7  r`   c                 $  > [         TU ]  5         Xl        UR                  U l        U R                  S-  U l        UR
                  U l        [        U R                  U R                  SS9U l        [        R                  " UR                  UR                  SS9U l        [        R                  " [        R                  " U R                  5      5      U l        [        R                  " [        R                  " UR                  5      5      U l        g )Nr   FrA  re   )rl   rm   r`   r   scalar_root_sizer  r  r   r
  r   ro   r:  projr   rL   ra  scaleper_expert_scaler,  s     rQ   rm   Gemma4TextRouter.__init__8  s    !-- $ 0 0$ 6&&!$"2"2US	IIf00&2D2D5Q	\\%**T-=-=">?
 "UZZ8J8J-K LrP   rw   rc   c                 ^   U R                  U5      nXR                  -  U R                  -  nU R                  U5      n[        R
                  R                  USS9n[        R                  " UU R                  R                  SS9u  pEXDR                  SSS9-  nX@R                  U   -  nX4U4$ )Nr   r   )r  r   Tr}  )r
  rD  rB  rC  r   r   r   rL   topkr`   top_k_expertsrk  rE  )rt   rw   expert_scoresrouter_probabilitiestop_k_weightstop_k_indexs         rQ   r{   Gemma4TextRouter.forwardD  s    		-0%

2T5J5JJ		-0!}}44]4K &+ZZ kk''&
" 	**r4*@@ &(=(=k(JJ#K??rP   )r`   r  r   r
  rE  rC  rB  rD  )rD   rE   rF   rG   r<   rm   rL   rM   rK   r{   rO   r~   r   s   @rQ   r@  r@  7  sD    
M/ 
M@U\\ @eELL%,,<V6W @ @rP   r@  c                   @  ^  \ rS rSrS\\-  S\4U 4S jjr      SS\R                  S\R                  S\
\\\R                  \R                  4   4   S-  S	\R                  S
\R                  S-  S\R                  S-  S\S-  S\R                  4S jjrSrU =r$ )Gemma4TextDecoderLayeri[  r`   r   c                 z  > [         TU ]  X5        [        XS9U l        [	        X5      U l        U R                  S[        R                  " S5      5        UR                  U l	        U R                  (       a  [        UR                     U l        [        R                  " U R                  U R                  SS9U l        [        R                  " U R                  U R                  SS9U l        [%        U R                  UR&                  S9U l        UR*                  U l        U R*                  (       a  [-        U5      U l        [1        U5      U l        [%        U R                  UR&                  S9U l        [%        U R                  UR&                  S9U l        [%        U R                  UR&                  S9U l        g g )Nr  layer_scalarr9   Fre   r  )rl   rm   r$  rR  r  r  rq   rL   ra  hidden_size_per_layer_inputr
   r=  r(  r   ro   r   per_layer_input_gateper_layer_projectionr   r  post_per_layer_input_normenable_moe_blockr@  routerr8  expertspost_feedforward_layernorm_1post_feedforward_layernorm_2pre_feedforward_layernorm_2r   s      rQ   rm   Gemma4TextDecoderLayer.__init__\  sR   +,FP 3^UZZ];+1+M+M(++ !9!9:DK(*		$2B2BDDdDdkp(qD%(*		$2R2RTXTdTdkp(qD%-:4;K;KQWQdQd-eD* & 7 7  *62DK,V4DL0=d>N>NTZTgTg0hD-0=d>N>NTZTgTg0hD-/<T=M=MSYSfSf/gD, !rP   Nrw   per_layer_inputrA   r   r[   r   r3  rc   c           
      (   Un	U R                  U5      nU R                  " SUUUUUUS.UD6u  pU R                  U5      nX-   nUn	U R                  U5      nU R	                  U5      nU R
                  (       a  U R                  U5      nU	R                  SU	R                  S   5      nU R                  U5      u  pnU R                  U5      nU R                  XU5      nUR                  U	R                  5      nU R                  U5      nX-   nU R                  U5      nX-   nU R                  (       aN  Un	U R                  U5      nU R!                  U5      nX-  nU R#                  U5      nU R%                  U5      nX-   nXR&                  -  nU$ )N)rw   r   r[   rA   r   r3  r   rB   )r  rR  r  r  r  rX  r[  r   r   rY  r]  rZ  r\  r  rT  rU  r(  rV  rW  rR  )rt   rw   r_  rA   r   r[   r   r3  rX  r1  r   hidden_states_1hidden_states_flatrL  rM  hidden_states_2s                   rQ   r{   Gemma4TextDecoderLayer.forwardq  s    !,,];>> 
' 3)-%+
 
 55mD 0 66}E/  "??NO "*!1!1"hnnR6H!I,0KK8J,K)Ak">>?QRO"ll?WO-55hnnEO"??PO ,=M77F 0++$H 55mDM KK6M);M 55mDM ::=IM$4M***rP   )r(  rX  rZ  rT  r  rU  rV  r[  r\  rW  r]  rY  rR  )NNNNNN)rD   rE   rF   rG   r<   r=   r}   rm   rL   rM   rI   rJ   rK   r  r   r{   rO   r~   r   s   @rQ   rP  rP  [  s    h/2DD hQT h0 )-PT,0.204(,9||9 9 sE%,,*D$EEFM	9
 #\\9 t+9 &&-9 9 
9 9rP   rP  c                       \ rS rSrSrg)Gemma4TextScaledWordEmbeddingi  rB   Nr   rB   rP   rQ   rf  rf    r   rP   rf  c                   R    \ rS rSr/ SQrSrSr\R                  " 5       S 5       r	Sr
g)Gemma4PreTrainedModeli  )rP  r  rN  )imagetextvideoaudioNc                 z
   [         R                  " U5        [        U[        5      (       a!  [        R
                  " UR                  5        g [        U[        5      (       a  SnSnUR                  S-  n[        R                  " X2-  5      [        US-
  S5      -  nU[        R                  " [        R                  " U5      U* -  5      -  n[        R                  " UR                   UR#                  S5      R#                  S5      5        g [        U[$        5      (       aL  [        R&                  " UR(                  UR*                  5        [        R,                  " UR.                  5        g [        U[0        5      (       a  UR2                  R5                  5        H  u  pxSU0n	US:X  a  UR6                  U   S:X  a  S	U	S
'   U" UR8                  40 U	D6u  p[        R                  " [;        X S35      U
5        [        R                  " [;        X S35      U
5        M     g [        U[<        5      (       a  UR6                  S:w  a  [>        UR6                     OUR@                  nU" UR8                  5      u  p[        R                  " URB                  U5        [        R                  " URD                  U5        g [        U[F        5      (       a,  [        R&                  " URH                  URJ                  5        g [        U[L        5      (       aA  [        R
                  " URN                  5        [        R
                  " URP                  5        g [        U[R        5      (       aW  U R8                  RT                  n[        RV                  " URX                  SUS9  [        RV                  " URZ                  SUS9  g [        U[\        5      (       a!  [        R
                  " UR^                  5        g [        U[`        5      (       a  URb                  (       a  [        R&                  " URd                  [g        S5      * 5        [        R&                  " URh                  [g        S5      5        [        R&                  " URj                  [g        S5      * 5        [        R&                  " URl                  [g        S5      5        g [        U[n        5      (       a]  UR8                  Rp                  (       aA  [        R,                  " URr                  5        [        R
                  " URt                  5        g g g )Nr   r   r$   r9   r   r  r  r  r  r  r  r  r
  ri  )meanstdrh   );r   _init_weightsr  r\  initones_rb  r   r   r   r   r   rL   r   r   copy_r   r   r   	constant_r   r   zeros_r   r  r  itemsr	  r`   r  r  r   r  r  original_inv_freqrf  embed_scalescalar_embed_scaler@  rD  rE  r8  initializer_rangenormal_gate_up_projr  rP  rR  r^   rn   rg   rs   ri   rj   rk   Gemma4VisionModelstandardizestd_bias	std_scale)rt   moduler   r   r   r   r   r  r  r  r  r   rope_fnbuffer_valuero  s                  rQ   rp  #Gemma4PreTrainedModel._init_weights  s   %%f-f788JJv667 @AAM#M#//14N&*hh}/L&MPSTbefTfhiPj&j#*UYYu||N7SWnVn7n-ooNJJv,,n.F.Fq.I.S.STU.VW 455NN6>>6+K+KLKK,,- 9::,2,@,@,F,F,H(
'3Z&@#!11f6F6Fz6RVd6d:K'7#/#UAT#U 

76\+CDmT

76\9K+LM}] -I  ;<< ##y0 $F$4$45;; 
 &fmm4OLJJv5JJv//> =>>NN6--v/H/HI 011JJv||$JJv../ 122++//CLL,,3C@LL))= 677JJv**+ 5666;U;UNN6++eEl];NN6++U5\:NN6,,uU|m<NN6,,eEl; 122v}}7P7PKK(JJv''( 8Q2rP   rB   )rD   rE   rF   rG   _no_split_modulesinput_modalities_can_record_outputsrL   r   rp  rO   rB   rP   rQ   rh  rh    s,    b:
]]_2) 2)rP   rh  zAThe base Gemma 4 language model without a language modeling head.custom_introc                      ^  \ rS rSr% \\S'   \" \SS9\\	S.r
S\4U 4S jjrS\R                  S-  S	\R                  S-  S
\R                  4S jr SS	\R                  S\R                  S-  S
\R                  4S jjr\\\       SS\R&                  S-  S\R                  S-  S\R&                  S-  S\S-  S	\R*                  S-  S\R                  S-  S\S-  S\\   S
\4S jj5       5       5       rSrU =r$ )Gemma4TextModeli  r`   r   )r.  )router_logitsrw   
attentionsc           
        > [         TU ]  U5        [        R                  " [	        UR
                  5       Vs/ s H  n[        X5      PM     sn5      U l        [        U5      U l	        [        U R                  R                  5      U l        UR                  U l        U R                  (       a  [        UR                   UR
                  UR                  -  U R"                  UR                  S-  S9U l        SU l        [        R(                  " UR*                  UR
                  UR                  -  SS9U l        UR*                  S-  U l        [1        UR                  UR2                  S9U l        / U l        [9        U R                  5       HT  u  p4UR:                  R<                  (       d  M"  U R6                  R?                  S Vs/ s H  nS	U S
U 3PM     sn5        MV     g s  snf s  snf )Nrr  )rx  g;f?Fre   r   rS  )r   r   r  r  zlayers.z.self_attn.) rl   rm   r   r  r  r  rP  r  r  r  r  r`   r  unique_layer_typesrT  rf  vocab_size_per_layer_inputpadding_idxembed_tokens_per_layerper_layer_input_scalero   r   per_layer_model_projection per_layer_model_projection_scaler   r  per_layer_projection_norm"_keys_to_ignore_on_load_unexpected	enumeraterR  r  extend)rt   r`   r   r  layernameru   s         rQ   rm   Gemma4TextModel.__init__  s    mmHMfNfNfHghHg9#F6Hgh
 4F;"%dkk&=&=">
 ,2+M+M(++*G11((6+M+MM  ">>C	+D' *3D&.0ii""((6+M+MM/D+
 5;4F4F4LD1-:6;];]cicvcv-wD* 35/!$++.HA11177>>@hi@hwqcTF3@hi /7 i< js   G'	G,
	input_idsNr  rc   c                    U R                   (       d  [        SU R                   35      eUc  [        R                  " 5          USS2SS2SSS24   U R
                  R                  SSSS2SS24   U R                  R                  S-  -  :H  R                  SS9R                  5       SS2S4   n UR                  UR                  SS 5      n SSS5        U R                  U5      R                  " / UR                  QU R                  R                  PU R                   P76 $ ! [         a    [        S5      ef = f! , (       d  f       Nt= f)a  Compute the token-identity component of Per-Layer Embeddings (PLE).

Looks up `input_ids` in `embed_tokens_per_layer` (a scaled embedding that multiplies
by `sqrt(hidden_size_per_layer_input)`) and reshapes the packed output from
`[batch, seq, num_hidden_layers * hidden_size_per_layer_input]` to
`[batch, seq, num_hidden_layers, hidden_size_per_layer_input]`.

If only `inputs_embeds` is provided (no `input_ids`), reverses the main embedding
to recover `input_ids` for the PLE lookup.
z}Attempting to call get_per_layer_inputs() from a model initialized with a config that does not support per-layer embeddings. Nrr  r   r   r$   a)  It seems like you tried to call `forward` from `inputs_embeds` without providing `input_ids`, and that the `inputs_embeds` you provided do not exactly match the embedding weights. Since Gemma4 needs to reverse the embedding to compute another embedding, make sure you provide exact `inputs_embeds`)rT  RuntimeErrorr`   rL   r   embed_tokensr   r   r  nonzeror   r   r  r   r  )rt   r  r  s      rQ   get_per_layer_inputs$Gemma4TextModel.get_per_layer_inputs  sK    //**.++8   &aD!m4,,33D$14DEH_H_adHdde SQSZWYq!t%  )}/B/B2A/F GI !$ **95== 
__
KK))
 ,,
 	
 $ &r  !s   A.D>1D%%D;;D>>
Eper_layer_inputsc                 `   U R                   (       d  [        SU R                   35      eU R                  U5      U R                  -  nUR
                  " / UR                  SS QU R                  R                  PU R                   P76 nU R                  U5      nUc  U$ X2-   U R                  -  $ )aL  Compute the context-aware component of PLE and combine with token-identity.

Projects `inputs_embeds` through `per_layer_model_projection` (Linear), scales by
`1/sqrt(hidden_size)`, reshapes to `[batch, seq, num_layers, ple_dim]`, and normalizes
with `per_layer_projection_norm` (RMSNorm).

If `per_layer_inputs` (the token-identity component from `get_per_layer_inputs()`)
is provided, combines both: `(context_projection + token_identity) * (1/sqrt(2))`.
If `per_layer_inputs` is None (e.g. for multimodal inputs where input_ids are not
available), returns just the context projection.
zAttempting to call project_per_layer_inputs() from a model initialized with a config that does not support per-layer embeddings. Nr   )
rT  r  r`   r  r  r   r   r  r  r  )rt   r  r  rV  s       rQ   project_per_layer_inputs(Gemma4TextModel.project_per_layer_inputsH  s      //226++@ 
  $>>}MPTPuPuu3;;  
  "% 
KK)) 
 ,, 

  $==>RS#''$74;U;UUUrP   r[   r   r3  	use_cacherX  c           
      2   USL USL-  (       a  [        S5      eUb  U R                  U5      nU R                  (       a%  Uc  U R                  X5      nU R	                  XV5      nU(       a  Uc  [        U R                  S9nUcU  Ub  UR                  5       OSn	[        R                  " UR                  S   UR                  S9U	-   nUR                  S5      n[        U=n
[        5      (       d)  U R                  UUUUS.n[        S0 UD6[!        S0 UD6S.n
Un0 nU R"                   H  nU R%                  XU5      X'   M     UR'                  S	0 5      n[)        U R*                  SU R                  R,                   5       H\  u  nnUb  USS2SS2USS24   OSnU" UU4UXR                  R.                  U      XR                  R.                  U      UUS
.UD6nM^     U R1                  U5      n[3        UUUR5                  SS5      (       a  US9$ SS9$ )u9  
per_layer_inputs (`torch.Tensor` of shape `(batch_size, sequence_length, num_hidden_layers, hidden_size_per_layer_input)`, *optional*):
    Pre-computed per-layer input embeddings. When provided, these are used directly instead of being
    computed from `input_ids` via `get_per_layer_inputs()`. This is primarily used by the multimodal
    model (`Gemma4Model`) which pre-computes per-layer inputs from the original `input_ids` *before*
    merging multimodal soft tokens into `inputs_embeds` — at which point the original token ids are
    no longer recoverable.
N:You must specify exactly one of input_ids or inputs_embeds)r`   r   r9   r   r`   r  r[   r3  r   r  r&  rA   )rA   r   r[   r   r3  return_shared_kv_statesF)r  r3  rA   rB   )r  r  rT  r  r  r   r`   get_seq_lengthrL   r   r   r   r   r  rI   r   r   r  r  popr  r  r  r  r
  rV   get)rt   r  r[   r   r3  r  r  r  rX  past_seen_tokenscausal_mask_mappingmask_kwargsrw   r   r  rA   r  r  r_  s                      rQ   r{   Gemma4TextModel.forwardk  sQ   , -t";<YZZ  --i8M++'#'#<#<Y#V #<<]]0*$++>OCRC^==?de <<(;(;A(>}G[G[\_ooL'11!4L ?-FF ++!."0#2 ,K #5"C{"C%F%U%U# & 11J.2oom[e.f+ 2 "::&8"= !*$++6U8U8U*V WA}>N>Z.q!Qz:`dO)	 "2$78O8OPQ8R$S2;;3J3J13MN) /	 	M !X 		-0,++17<UW\1]1]-
 	
 dh
 	
rP   )
r  r  rT  r  r  r  r  r  r  r  ry   )NNNNNNN)rD   rE   rF   rG   r<   rN   r"   r@  rP  r$  r  rm   rL   rM   r  r  r!   r#   r   r  r   r  boolr   r   rV   r{   rO   r~   r   s   @rQ   r  r    s   '(8B/)"/ "H*
ellT.A *
RWR^R^aeRe *
jojvjv *
^ 15!V||!V  ,,-!V 
	!VF   .2.204(,2604!%T
##d*T
 t+T
 &&-	T

 T
 ((4/T
  ,,-T
 $;T
 +,T
 
'T
    T
rP   r  z>The base Gemma 4 language model with a language modeling head.c                   <  ^  \ rS rSrSrS\4U 4S jjr\\        SS\	R                  S-  S\	R                  S-  S\	R                  S-  S	\S-  S
\	R                  S-  S\	R                  S-  S\S-  S\\	R                  -  S\\   S\4S jj5       5       rSrU =r$ )Gemma4ForCausalLMi  modelr`   c                    > [         TU ]  U5        U R                  R                   Vs/ s H  nSU 3PM
     snU l        g s  snf )Nzmodel.)rl   rm   r  r  )rt   r`   r  ru   s      rQ   rm   Gemma4ForCausalLM.__init__  sD      )-

(U(U3
(UfTFO(U3
/ 3
s   ANr  r[   r   r3  r  labelsr  logits_to_keeprX  rc   c	           
      6   U R                   " SUUUUUUS.U	D6n
U
R                  n[        U[        5      (       a  [	        U* S5      OUnU R                  USS2USS24   5      nU R                  R                  bF  XR                  R                  -  n[        R                  " U5      nXR                  R                  -  nSnUb  U R                  " XU R                  40 U	D6n[        UUU
R                  U
R                  U
R                  U
R                   S9$ )a"  
Example:

```python
>>> from transformers import AutoTokenizer, Gemma4ForCausalLM

>>> model = Gemma4ForCausalLM.from_pretrained("google/gemma-2-9b")
>>> tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b")

>>> prompt = "What is your favorite condiment?"
>>> inputs = tokenizer(prompt, return_tensors="pt")

>>> # Generate
>>> generate_ids = model.generate(inputs.input_ids, max_length=30)
>>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
"What is your favorite condiment?"
```)r  r[   r   r3  r  r  N)losslogitsr3  rw   r  rA   rB   )r  r  r  r}   slicelm_headr`   final_logit_softcappingrL   r   loss_function
vocab_sizerS   r3  rw   r  rA   )rt   r  r[   r   r3  r  r  r  r  rX  outputsrw   slice_indicesr  r  s                  rQ   r{   Gemma4ForCausalLM.forward  s   @ 26 2
)%+'2
 2
  118B>SV8W8W~ot4]kmA}a,?@A;;..:kkAAAFZZ'FkkAAAF%%fdooPPD+#33!//))$55
 	
rP   )r  )NNNNNNNr   )rD   rE   rF   rG   base_model_prefixr<   rm   r   r   rL   r  rM   r   r  r  r}   r   r   rS   r{   rO   r~   r   s   @rQ   r  r    s    
/ 
  .2.204(,26*.!%-.<
##d*<
 t+<
 &&-	<

 <
 ((4/<
   4'<
 $;<
 ell*<
 +,<
 
&<
  <
rP   r  c                   8  ^  \ rS rSr% Sr\\S'   SrSr\	\
S.rS\4U 4S jjrS\R                  S	\R                  4S
 jr\\\" SS9 SS\R                  S\R                  S-  S\\   S	\\R                  \R,                  4   4S jj5       5       5       rSrU =r$ )Gemma4AudioModeli  znAn audio encoder based on the [Universal Speech Model](https://huggingface.co/papers/2303.01037) architecture.r`   r  zmodel.audio_towerrw   r  c           	        > [         TU ]  U5        Xl        [        U5      U l        [        U5      U l        [        R                  " [        UR                  5       Vs/ s H  n[        X5      PM     sn5      U l        [        R                  " UR                  UR                  SS9U l        U R#                  5         g s  snf )NTre   )rl   rm   r`   r  subsample_conv_projectionr   rel_pos_encr   r  r  r  rN  r  ro   r   output_proj_dimsoutput_proj	post_initr   s      rQ   rm   Gemma4AudioModel.__init__  s     )KF)S&;FCmmBGH`H`BabBaYf0Bab
 99V%7%79P9PW[\	 cs   B?mask_4drc   c                 ^   UR                   u  p#pCUR                  nU R                  R                  nU R                  R                  S-
  nU R                  R
                  nXF-   S-
  U-  n	X-  n
X-
  n[        R                  " USUSU4SS9nUR                  USXU
5      n[        R                  " XU4SS9n[        R                  " XS9U-  n[        R                  " Xg-   U-   US9nUSS2S4   USSS24   -   nUSSSS2SSS24   R                  USSUS5      nUR                  SU5      $ )z
Convert a standard 4D attention mask `[batch_size, 1, seq_len, seq_len]` to the 5D blocked format
`[batch_size, 1, num_blocks, chunk_size, context_size]` expected by the chunked local attention,
r9   r   F)valuer   Nr   )r   r   r`   r   r   r   r   r   r   rL   r   r  gather)rt   r  r   r   r   r   r   r   r   r   padded_seq_len
pad_amountmask_5dblock_startsoffsets
kv_indicess                   rQ   _convert_4d_mask_to_blocked_5d/Gemma4AudioModel._convert_4d_mask_to_blocked_5d)  s:   
 %,MM!
w[[55
;;==A![[@@*Q.:=
#0#-
%%!ZJ!?uM//*aX%%4F!GuU||J>K,,z<?QQZ`a!!T'*WT1W-==
dAtQ 67>>z1bR\^`a
~~b*--rP   z&Encodes audio features to soft tokens.r  Nr[   rX  c           	         U R                  X5      u  pEU R                  U5      n[        U R                  UU[	        U R                  R
                  S-
  U R                  R                  45      S9nUb  U R                  U5      nU R                  S U R                  R                    H  nU" U4UUS.UD6nM     U R                  U5      n[        XES9$ )Nr9   )r`   r  r[   and_mask_function)r[   r   )r  r[   )r  r  r   r`   r8   r   r   r  r  r  r  rY   )rt   r  r[   rX  rw   output_maskr   encoder_layers           rQ   r{   Gemma4AudioModel.forwardD  s     &*%C%CN%c""..}=2;;'&:33a79\9\]	
 %!@@PN![[)H4;;+H+HIM)-$7 	M J ((7%bbrP   )r`   r  r  r  r  ry   )rD   rE   rF   rG   rH   r:   rN   main_input_namer  rN  r   r  rm   rL   rM   r  r!   r#   r   r   r   rK   r\   r{   rO   r~   r   s   @rQ   r  r    s    x&O+)*
0 .ell .u|| .6  !IJ /3cc t+c +,	c
 
u||U---	.c K   crP   r  c                      ^  \ rS rSrSr\r\\S.r	S\4U 4S jjr
\\\" SS9S\R                  S	\R                   S
\\   S\4S j5       5       5       rSrU =r$ )r}  ig  zThe Gemma 4 Vision Encoder.r  r`   c                   > [         TU ]  U5        [        U5      U l        [	        U5      U l        [        U5      U l        U R                  R                  (       at  U R                  S[        R                  " U R                  R                  5      5        U R                  S[        R                  " U R                  R                  5      5        U R                  5         g )Nr  r  )rl   rm   r\  patch_embedderr  encoderru  poolerr`   r~  rq   rL   emptyr   r  r,  s     rQ   rm   Gemma4VisionModel.__init__p  s     7?*62(0;;""  U[[9P9P-QR  ekk$++:Q:Q.RSrP   z1Encodes image pixels to soft tokens from patches.r  rp  rd  rX  rc   c                    U R                   R                  nUR                  S   XD-  -  nUS:H  R                  SS9nU R	                  XU5      nU R
                  " SUU) US.UD6nU R                  UR                  UUUS9u  pX   n	U R                   R                  (       a  XR                  -
  U R                  -  n	[        U	S9$ )a  
pixel_values (`torch.FloatTensor` or `list[torch.FloatTensor]`):
    The images to encode. Either a single `[batch, channels, height, width]` tensor
    (all images same size) or a list of `[1, channels, height, width]` tensors (different sizes).
pixel_position_ids (`torch.LongTensor` of shape `(batch_size, max_patches, 2)`):
    The patch positions as (x, y) coordinates in the image. Padding patches are indicated by (-1, -1).
r   r   )r  r[   rd  )rw   rd  re  r  r  rB   )r`   pooling_kernel_sizer   r  r  r  r  r  r~  r  r  r   )rt   rp  rd  rX  r  r  re  r  r  rw   pooler_masks              rQ   r{   Gemma4VisionModel.forward|  s      #kk==$**2.3F3\]/25::r:B++LN_` 
'--1
 	
 &*[[ 221/'	 &1 &
" &2;;""*]]:dnnLM&GGrP   )r  r  r  )rD   rE   rF   rG   rH   r=   r`   r  r  r  rm   r!   r#   r   rL   r  r  r   r   r   r{   rO   r~   r   s   @rQ   r}  r}  g  s    %F1+

1 
  !TU&H''&H ",,&H +,	&H
 
!&H V   &HrP   r}  c                   t   ^  \ rS rSrS\\-  S\4U 4S jjrS\R                  S\R                  4S jr
SrU =r$ )	Gemma4MultimodalEmbedderi  multimodal_configtext_configc                    > [         TU ]  X5        U ?U ?U ?U ?U ?U ?[        USUR                  5      U l
        [        U R                  U R                  SS9U l        g )Nr  FrA  )rl   rm   	embeddinghard_embedding_normsoft_embedding_normvocab_offsetr  embedding_post_projection_normr  r   multimodal_hidden_sizer   r  embedding_pre_projection_norm)rt   r  r  ru   s      rQ   rm   !Gemma4MultimodalEmbedder.__init__  sn     	*8N$$O/&-.?ASUfUrUr&s#-:4;V;V\`\d\dqv-w*rP   r  rc   c                 F    U R                  U5      nU R                  U5      $ )a  Embeds token ids or soft tokens for multimodal content into language model space.
Args:
    inputs_embeds: A torch.Tensor containing the soft tokens to embed.
Returns:
    A torch.Tensor of embeddings with shape `[batch_size, seq_len, self.config.text_config.hidden_size]`.
)r  embedding_projection)rt   r  embs_normeds      rQ   r{    Gemma4MultimodalEmbedder.forward  s%     88G((55rP   )r  r  )rD   rE   rF   rG   r:   r=   r<   rm   rL   rM   r{   rO   r~   r   s   @rQ   r  r    sC    x,/AAx &x$6U\\ 6ell 6 6rP   r  token_type_idsimage_group_idsc           
      \   ^ U c  gS[         S[         S[         S[         S[        4
U4S jjnU$ )z
This function adds the correct offsets to the `q_idx` and `kv_idx` as the torch API can only accept lengths,
not start and end indices.
N	batch_idxhead_idxq_idxkv_idxrc   c                    > T	R                   S   nUR                  US-
  S9nUR                  US-
  S9nT	X4   nT	X4   n[        R                  " X$:  US5      n[        R                  " X4:  US5      nXx:H  US:  -  $ )Nr   r9   )r   r   )r   rz   rL   rl  )
r  r  r  r  r   q_idx_clampedkv_idx_clampedq_groupkv_groupr  s
            rQ   
inner_mask0token_type_ids_mask_function.<locals>.inner_mask  s    $**2.
 
Q7*q.9 ")":;"9#<=++e0'2>;;v2HbA#155rP   )r}   r  )r  r  r  s    ` rQ   token_type_ids_mask_functionr    s>     6c 6S 6 6c 6d 6 rP   r`   r  r[   r3  mm_token_type_idsis_first_iterationc                    U R                  5       UUUUS.nUR                  5       n	Ub  US:H  US:H  -  n
[        R                  " U
SSS9nSUS'   X) -  n[        R                  " UR                  5       SS9S-
  n[        R                  " XS5      n[        UR                  UR                  5      U5      U	S	'   [        S0 UD6[        S0 U	D6S
.$ )a  
Overwrites the base `create_masks_for_generate` with `token_type_ids` masking to create the causal mask mapping
for all kinds of forward passes. Gemma4 uses a bidirectional mask for images.

Uses `pixel_values` as an optional input to disambiguate edge cases.
r  r9   r$   r   )shiftsdimsFr|  r   or_mask_functionr  rB   )get_text_configcopyrL   rollcumsumr}   rl  r  r   r   r   r   )r`   r  r[   r3  r   r  r  rX  r  sliding_mask_kwargs	is_visionis_prev_visionnew_vision_startsvision_group_idss                 rQ   create_causal_mask_mappingr     s    " ((*&(*$K &**,$ '!+0AQ0FG	IabA!&v%7 <<(9(=(=(?QG!K ;;yBG2N  !5!568H3
./
 -;{;>UATU rP   z
    The base Gemma 4 model comprising a vision backbone, an audio backbone, and a language model without a
    language modeling head.
    c            "         ^  \ rS rSrS\4U 4S jjrS rS r\\	" SS9 S S	\
R                  S
\
R                  S-  S\\   S\4S jj5       5       r\\	" SS9 S S\
R                  S\
R                  S-  S\\   S\4S jj5       5       r  S!S\
R                  S-  S\
R                  S-  S\\
R&                  \
R&                  \
R&                  4   4S jjr\\\	             S"S\
R                  S-  S	\
R                  S-  S\
R                  S-  S\
R                  S-  S\
R,                  S-  S\
R,                  S-  S\
R                  S-  S\S-  S\
R                  S-  S\
R                  S-  S\S-  S
\
R                  S-  S\
R                  S-  S\\   S\4S jj5       5       5       r\\	" SS9S\
R,                  S\
R,                  S\\   S\\-  4S j5       5       rSrU =r$ )#Gemma4Modeli  r`   c                   > [         TU ]  U5        UR                  b   [        R                  " UR                  5      OS U l        UR                  b   [        UR                  UR                  5      OS U l        UR                  b   [        R                  " UR                  5      OS U l
        UR                  b&  [        UR                  UR                  5      U l        g S U l        g ry   )rl   rm   vision_configr%   from_configvision_towerr  r  embed_visionaudio_configaudio_towerembed_audior,  s     rQ   rm   Gemma4Model.__init__  s     KQK_K_KkI11&2F2FGqu ##/ %V%9%96;M;MN 	
 JPI\I\Ih9001D1DEnr "". %V%8%8&:L:LM 	  	rP   c                 .    U R                   R                  $ ry   language_modelr  rt   s    rQ   get_per_layer_input_embeddings*Gemma4Model.get_per_layer_input_embeddings*  s    ""999rP   c                 $    XR                   l        g ry   r-  rt   r  s     rQ   set_per_layer_input_embeddings*Gemma4Model.set_per_layer_input_embeddings-  s    5:2rP   zOProjects the last hidden state from the vision model into language model space.r  Nrp  image_position_idsrX  rc   c                 p    U R                   " SUUS.UD6nUR                  nU R                  US9Ul        U$ )z
image_position_ids (`torch.LongTensor` of shape `(batch_size, max_patches, 2)`, *optional*):
    The patch positions as (x, y) coordinates in the image. Padding patches are indicated by (-1, -1).
rp  rd  r  rB   )r&  r  r'  pooler_output)rt   rp  r6  rX  vision_outputsr  s         rQ   get_image_featuresGemma4Model.get_image_features0  sT     ** 
%1
 

 +<<'+'8'8GX'8'Y$rP   zQProjects the last hidden state from the vision encoder into language model space.pixel_values_videosvideo_position_idsc                     UR                  SS5      nUR                  SS5      nU R                  " SUUS.UD6nUR                  nU R                  US9Ul        U$ )a  
video_position_ids (`torch.LongTensor` of shape `(num_videos, num_frames, max_patches, 2)`, *optional*):
    2D patch position coordinates from the video processor, with `(-1, -1)` indicating padding.
    Passed through to the vision encoder for positional embedding computation.
r   r9   r8  r9  rB   )flattenr&  r  r'  r:  )rt   r>  r?  rX  r;  r  s         rQ   get_video_featuresGemma4Model.get_video_featuresE  sz     299!Q?/771=** 
,1
 

 +<<'+'8'8GX'8'Y$rP   r  r  c           	         UbJ  XR                   R                  :H  nXR                   R                  :H  nXR                   R                  :H  nGO8UU R	                  5       " [
        R                  " U R                   R                  [
        R                  UR                  S95      :H  R                  S5      nUU R	                  5       " [
        R                  " U R                   R                  [
        R                  UR                  S95      :H  R                  S5      nUU R	                  5       " [
        R                  " U R                   R                  [
        R                  UR                  S95      :H  R                  S5      nX4U4$ )a;  
Obtains mask for multimodal placeholders (replaced by soft tokens) and hard text tokens.

Masks will be obtained from `mm_token_type_ids`, `input_ids`, or `inputs_embeds` as available and in that
precedence order. If passing `input_ids` or `inputs_embeds`, the image mask will be derived using
`config.image_token_id`. Same goes for audio and video masks

Args:
    input_ids: A tensor containing the hard token IDs from the text tokenizer.
    inputs_embeds: A tensor containing the embeddings for all hard text tokens.

Returns:
    image_mask, video_mask, audio_mask
)r   r   r   )
r`   image_token_idvideo_token_idaudio_token_idget_input_embeddingsrL   rr   r  r   r  )rt   r  r  special_image_maskspecial_video_maskspecial_audio_masks         rQ   get_placeholder_mask Gemma4Model.get_placeholder_mask]  sA   &  !*kk.H.H!H!*kk.H.H!H!*kk.H.H!H ,,.LL!;!;5::VcVjVjk c"g  ,,.LL!;!;5::VcVjVjk c"g  ,,.LL!;!;5::VcVjVjk c"g  "7IIIrP   r  r[   r  r   r3  r  r  c                 D   USL U
SL-  (       a  [        S5      eU R                  X5      u  nnnUU-  U-  nSnU
cI  UR                  5       nU R                  R                  R
                  UU'   U R                  5       " U5      n
U R                  R                  5       R                  (       a  U R                  R                  R                  U R                  R                  R
                  SS24   n[        R                  " US   UR                  SSS5      U
5      nU R                  R                  UU5      nOSnUGb  U R!                  X,SS9R"                  nUR%                  U
R&                  U
R(                  5      nUR+                  5       nUR-                  S5      R/                  U
5      R%                  U
R&                  5      n[1        X   R3                  5       UR3                  5       :H  SU S	UR4                  S
    35        U
R7                  UR%                  U
R&                  5      UR%                  U
R&                  5      5      n
UGb  U R9                  X=SS9R"                  nUR%                  U
R&                  U
R(                  5      nUR+                  5       nUR-                  S5      R/                  U
5      R%                  U
R&                  5      n[1        U
U   R3                  5       UR3                  5       :H  SU S	UR4                  S
    35        U
R7                  UR%                  U
R&                  5      UR%                  U
R&                  5      5      n
UGb  UGb  U R;                  XFSS9nUR"                  nUR<                  nUU   nUR+                  5       nUR-                  S5      R/                  U
5      R%                  U
R&                  5      n[1        U
U   R3                  5       UR3                  5       :H  SU S	UR4                  S
   UR4                  S   -   35        U
R7                  UR%                  U
R&                  5      UR%                  U
R&                  5      5      n
UcU  Ub  UR?                  5       OS
n[        R@                  " U
R4                  S   U
R&                  S9U-   nUR-                  S
5      n[C        U=n [D        5      (       dZ  U R                  R                  5       RF                  S:X  a  [I        U R                  U
UUUU	S9n O[K        U R                  U
UUU5      n U R                  " SUU UUU
USS.UD6n![M        U!RN                  U!RP                  U!RR                  U!RT                  Ub  WOSUb  WOSU!RV                  S9$ )  
input_features_mask (`torch.FloatTensor]` of shape `(num_images, seq_length)`):
    The attention mask for the input audio.
image_position_ids (`torch.LongTensor` of shape `(batch_size, max_patches, 2)`, *optional*):
    2D patch position coordinates from the image processor, with `(-1, -1)` indicating padding.
    Passed through to the vision encoder for positional embedding computation.
video_position_ids (`torch.LongTensor` of shape `(num_videos, num_frames, max_patches, 2)`, *optional*):
    2D patch position coordinates from the video processor, with `(-1, -1)` indicating padding.
    Passed through to the vision encoder for positional embedding computation.
Nr  r   r9   r   T)return_dictz6Image features and image tokens do not match, tokens: z, features: r   z6Video features and video tokens do not match, tokens: z6Audio features and audio tokens do not match, tokens: r   vision)r  r[   r3  r   r  )r  r[   r   r3  r  r  rP  )r  r3  rw   r  image_hidden_statesaudio_hidden_statesrA   rB   ),r  rL  r  r`   r  pad_token_idrH  r  rT  r.  r  r   rL   rl  r   r  r<  r:  r   r   r   rk  r   	expand_asr   numelr   masked_scatterrB  get_audio_featuresr[   r  r   r  rI   r,  r   r   r?   r  r3  rw   r  rA   )"rt   r  rp  r>  r  r[   r  r   r3  r  r  r  r6  r?  rX  
image_mask
video_mask
audio_maskmultimodal_maskllm_input_idspad_embeddingllm_inputs_embedsr  image_featuresn_image_tokensvideo_featuresn_video_tokensaudio_outputaudio_featuresaudio_mask_from_encodern_audio_tokensr  r  r  s"                                     rQ   r{   Gemma4Model.forward  s   < -t";<YZZ-1-F-Fy-`*
J
$z1J>  %OO-M-1[[-D-D-Q-QM/* 557FM;;&&(DD //<<CCDKKD[D[DhDhjkDklM %OI,FHZHZ[\^_acHdfs t#22GGWhi# #!44\cg4hvvN+..}/C/C]EXEXYN (^^-N#--b1;;MJMMmNbNbcJ")//1^5I5I5KKHHX Y"((+,. *88m223^5F5F}G[G[5\M *!44#T 5 m  ,..}/C/C]EXEXYN (^^-N#--b1;;MJMMmNbNbcJ"j)//1^5I5I5KKHHX Y"((+,. *88m223^5F5F}G[G[5\M
 %*=*I22>dh2iL)77N&2&A&A#
 ,,CDN'^^-N#--b1;;MJMMmNbNbcJ"j)//1^5I5I5KKHHX Y"((+n.B.B1.EEFH *88m223^5F5F}G[G[5\M
 CRC^==?de <<(;(;A(>}G[G[\_ooL'11!4L?-FF{{**,HHHT&@KK"/#1$3!-&7'# '@KK!"# '# %% 	
-.%+'	
 	
 )%77#33!//))2>2JPT2@2LRV$55
 	
rP   zPProjects the last hidden state from the audio encoder into language model space.c                     U R                   c  [        S5      eU R                   " X4SS0UD6nU R                  UR                  S9Ul        U$ )a  
input_features (`torch.FloatTensor]` of shape `(num_images, seq_length, num_features)`):
    The tensors corresponding to the input audio.
input_features_mask (`torch.FloatTensor]` of shape `(num_images, seq_length)`):
    The attention mask for the input audio.
zAudio features were requested, but the model was initialized without an audio_config. Cannot process audio without an audio tower and audio embedder.rP  Tr9  )r)  r  r*  r  r:  )rt   r  r  rX  audio_outputss        rQ   rX  Gemma4Model.get_audio_features(  sc     #R 
 ((iZ^ibhi&*&6&6]EdEd&6&e#rP   )r)  r*  r'  r&  ry   r"  )NNNNNNNNNNNNN)rD   rE   rF   rG   r;   rm   r0  r4  r   r   rL   r  r  r   r   r   r<  rB  rK   r\   rL  r!   rM   r   r  r?   r{   rY   rX  rO   r~   r   s   @rQ   r"  r"    s   
| 
:; !rs 7;'' ",,t3 +,	
 
$ t & !tu 7;".. ",,t3 +,	
 
$ v 0 .226+J##d*+J ((4/+J 
u!1!153C3CC	D	+JZ   .2158<37.23704(,5926!%6:6:Y
##d*Y
 ''$.Y
 #..5	Y

 ))D0Y
 t+Y
 #\\D0Y
 &&-Y
 Y
 !++d2Y
 ((4/Y
 $;Y
 ",,t3Y
 ",,t3Y
 +,Y
  
#!Y
    Y
v !st #\\ +,	
 
'	' u rP   r"  z
    The base Gemma 4 model comprising a vision backbone, an audio backbone, a language model, and a language modeling
    head.
    c            #       .  ^  \ rS rSrSrS rS r               SS\R                  S-  S\R                  S-  S\R                  S-  S	\R                  S-  S
\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\S-  S\R                  S-  S\R                  S-  S\R                  S-  S\S-  S\\R                  -  S\\   S\4"S jjr\ SS\R                  S\R                  S-  S\\   4S jj5       r\  S S\S\R                  S
\R                  S-  S\S-  S\R                  S-  S\R                  S-  S\S-  S\4S jj5       r             S!U 4S jjrSrU =r$ )"Gemma4ForConditionalGenerationiB  r  c                 6    U R                   R                  5       $ ry   )r  r0  r/  s    rQ   r0  =Gemma4ForConditionalGeneration.get_per_layer_input_embeddingsK  s    zz88::rP   c                 :    U R                   R                  U5        g ry   )r  r4  r3  s     rQ   r4  =Gemma4ForConditionalGeneration.set_per_layer_input_embeddingsN  s    

11%8rP   Nr  rp  r>  r  r[   r  r   r6  r?  r3  r  r  r  r  r  rX  rc   c                 x   U R                   " SUUUUUUUU
UUUUUU	SS.UD6nUR                  n[        U[        5      (       a  [	        U* S5      OUnU R                  USS2USS24   5      nU R                  R                  5       R                  =nb   UU-  n[        R                  " U5      nUU-  nSnUb6  U R                  " UXR                  R                  5       R                  40 UD6n[        UUUR                  UR                  UR                   UR"                  UR$                  UR&                  S9$ )rO  T)r  rp  r>  r  r[   r  r   r3  r  r  r  r  r6  r?  rP  N)r  r  r3  rw   r  rR  rS  rA   rB   )r  r  r  r}   r  r  r`   r  r  rL   r   r  r  rS   r3  rw   r  rR  rS  rA   )rt   r  rp  r>  r  r[   r  r   r6  r?  r3  r  r  r  r  r  rX  r  rw   r  r  r  r  s                          rQ   r{   &Gemma4ForConditionalGeneration.forwardQ  sU   : ** 
% 3)) 3%+/'11
  !
&  118B>SV8W8W~ot4]kmA}a,?@A'+{{'B'B'D'\'\\#i55FZZ'F55F%%ffkk6Q6Q6S6^6^ibhiD+#33!//)) ' ; ; ' ; ;$55	
 		
rP   c                 <    U R                   R                  " X40 UD6$ )a  
image_position_ids (`torch.LongTensor` of shape `(batch_size, max_patches, 2)`, *optional*):
    2D patch position coordinates from the image processor, with `(-1, -1)` indicating padding.
    Passed through to the vision encoder for positional embedding computation.
)r  r<  )rt   rp  r6  rX  s       rQ   r<  1Gemma4ForConditionalGeneration.get_image_features  s     zz,,\XQWXXrP   r`   r  c           
          [        U R                  5       SS 5      S:X  a>  [        U UUUUU4SU0UR                  5        VV	s0 s H  u  pUS:w  d  M  X_M     sn	nD6$ [	        XX#U40 UD6$ s  sn	nf )Nr,  rQ  r  rp  )r  r  r   rv  r   )
r`   r  r[   r3  r   r  r  rX  r  vs
             rQ   r   8Gemma4ForConditionalGeneration.create_masks_for_generate  s     6))+-JDQU]]-!	 $6	 %+LLNJNDAa>6I414NJ	 	 -~X^ 	 Ks   A,A,c                    > [         TU ]  " U4UUUUUUU
US.UD6nU(       d  U(       d  UUS'   UUS'   UUS'   U	US'   U$ S US'   U$ )N)r3  r  r[   r   r  r  r  r  rp  r>  r  r  r  )rl   prepare_inputs_for_generation)rt   r  r3  r  r   rp  r>  r  r[   r  r  r  r  r  r  rX  model_inputsru   s                    rQ   rz  <Gemma4ForConditionalGeneration.prepare_inputs_for_generation  s    & w<
+')%))1
 
 Y+7L(2EL./-;L)*2EL./
  15L,-rP   rB   )NNNNNNNNNNNNNNr   ry   )NF)NNNNNNNNNTNNF)rD   rE   rF   rG   r  r0  r4  rL   r  r  rM   r   r  r}   r   r   rS   r{   r   r<  r  r   rI   r   rz  rO   r~   r   s   @rQ   rm  rm  B  s     ;9
 .2158<37.237046:6:(,5926*.!%-.!F
##d*F
 ''$.F
 #..5	F

 ))D0F
 t+F
 #\\D0F
 &&-F
 ",,t3F
 ",,t3F
 F
 !++d2F
 ((4/F
   4'F
 $;F
  ell*!F
" +,#F
$ 
&%F
P  7;Y''Y ",,t3Y +,	Y Y  26*/ || t+ 	
 llT) !<<$. !4K 
 >    * *rP   rm  )r  r  rm  r"  rh  r  r}  )r$   r"  )r   collections.abcr   dataclassesr   	functoolsr   rL   r   torch.nnr   r    r	   rq  activationsr
   cache_utilsr   r   configuration_utilsr   masking_utilsr   r   r   r   modeling_flash_attention_utilsr   modeling_outputsr   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   r   utils.genericr    r!   utils.output_capturingr"   r#   auto.modeling_autor%   gemma3.modeling_gemma3r&   r'   r(   r)   r*   r+   r,   gemma3n.modeling_gemma3nr-   r.   r/   r0   r1   r2   r3   r4   r5   llama.modeling_llamar6   mixtral.modeling_mixtralr7   0moonshine_streaming.modeling_moonshine_streamingr8   configuration_gemma4r:   r;   r<   r=   
get_loggerrD   loggerr?   rS   rV   rY   r  r^   r   r   r   r   r  r!  Conv1dr5  r>  rN  r\  ru  r  rM   r}   r  r  r  r  r  r  r  r$  r8  r@  rP  rf  rh  r  r  r  r}  r  r  r  rI   r   r"  rm  __all__rB   rP   rQ   <module>r     sx    $ ! %   $ & ! . 3  C S K F &  H E *  
 
 
 8 5 [ g g  
		H	%Q : Q*Q#@ Q2 
Q$; 
Q 
Q 
37 3  3BII :	N 	7ryy 7>i)299 i)X#bii #8; ;< RYY  H"bii "B&RYY &R0ryy 0l3		 3D80 80vai a 5&||5&	5& 
5& ,,	5&
 5& \\5&p:"6 :z:)O :)|"1 "J)H")) )H^^I ^U 5 UDq)")) q)h7 7!@ryy !@HO/ Od	$A 	8)2 8)v `aR
o R
 bR
j ]^H
) H
 _H
VSc, Scl>H- >HB68 6>LL4'\\D( _H .2&*,,<<, LL4', T\	,
 ,,%, ||d*, t, 
,^ e, eeP	 f%D ffRrP   