
    Z j.                    B   S SK r S SKJrJr  S SKJr  S SKJr  S SKrS SK	J
r
  S SKJ
s  Jr  SSKJr  SSKJr  SSKJrJr  SS	KJr  SS
KJrJr  SSKJr  SSKJrJrJ r   SSK!J"r"J#r#  SSK$J%r%J&r&  SSK'J(r(  SSK)J*r*J+r+J,r,J-r-J.r.J/r/  SSK0J1r1J2r2  SSK3J4r4  SSK5J6r6  SSK7J8r8J9r9J:r:J;r;  \." 5       (       a  S SK<J=r=  \,\ " S S\5      5       5       r>\\," SS9 " S S\5      5       5       r?\," SS9\ " S S \*5      5       5       r@ " S! S"\
R                  5      rB " S# S$\
R                  5      rC " S% S&\
R                  5      rD " S' S(\
R                  5      rE " S) S*\
R                  5      rF " S+ S,\
R                  5      rG " S- S.\
R                  5      rH " S/ S0\
R                  5      rI " S1 S2\
R                  5      rJ " S3 S4\
R                  5      rK " S5 S6\
R                  5      rM " S7 S8\
R                  5      rN " S9 S:\
R                  5      rO " S; S<\
R                  5      rPS= rQS>\R                  S?\SS@\R                  4SA jrT   SiSB\
R                  SC\R                  SD\R                  SE\R                  SF\R                  S-  SG\U\S-  SH\US-  SI\US-  S@\V\R                  \R                  4   4SJ jjrWSjSK\R                  SL\R                  SM\R                  SN\S4SO jjrX " SP SQ\
R                  5      rY " SR SS\5      rZ\, " ST SU\&5      5       r[ " SV SW\[5      r\ " SX SY\
R                  5      r]\," SZS9 " S[ S\\[5      5       r^\," S]S9 " S^ S_\[\5      5       r_ " S` Sa\
R                  5      r`\," SbS9 " Sc Sd\[5      5       ra\," SeS9 " Sf Sg\[\5      5       rb/ ShQrcg)k    N)CallableSequence)	dataclass)Optional   )initialization)ACT2FN)CacheDynamicCache)GenerationMixin)create_causal_mask!create_sliding_window_causal_mask)GradientCheckpointingLayer)BaseModelOutputWithPastBaseModelOutputWithPoolingCausalLMOutputWithPast)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)ModelOutputTransformersKwargsauto_docstringcan_return_tupleis_accelerate_availabletorch_compilable_check)maybe_autocastmerge_with_config_defaults)capture_outputs   )	AutoModel   )Gemma3nAudioConfigGemma3nConfigGemma3nTextConfigGemma3nVisionConfig)add_hook_to_modulec                   B    \ rS rSr% SrSr\R                  S-  \S'   Sr	g)Gemma3nAudioEncoderModelOutput:   zm
audio_mel_mask (`torch.BoolTensor`, *optional*):
    A torch.BoolTensor of shape `(batch_size, num_frames)`
Naudio_mel_mask )
__name__
__module____qualname____firstlineno____doc__r,   torch
BoolTensor__annotations____static_attributes__r-       }/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/gemma3n/modeling_gemma3n.pyr*   r*   :   s    
 /3NE$$t+2r7   r*   zL
    Base class for Gemma3n outputs, with hidden states and attentions.
    custom_introc                   j    \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Sr
g)Gemma3nModelOutputWithPastE   a  
past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
    It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

    Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
    `past_key_values` input) to speed up sequential decoding.
image_hidden_states (`torch.FloatTensor`, *optional*):
    A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
    image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
audio_hidden_states (`torch.FloatTensor`, *optional*):
    A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
    audio_hidden_states of the model produced by the audio encoder and after projecting the last hidden state.
Nimage_hidden_statesaudio_hidden_statesr-   )r.   r/   r0   r1   r2   r>   r3   FloatTensorr5   r?   r6   r-   r7   r8   r<   r<   E   s5     59**T1848**T18r7   r<   zS
    Base class for Gemma3n causal language model (or autoregressive) outputs.
    c                   *   \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Sr
\S-  \S'   Sr\\R                     S-  \S'   Sr\\R                     S-  \S'   Sr\R                  S-  \S	'   Sr\R                  S-  \S
'   Srg)Gemma3nCausalLMOutputWithPast_   a
  
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
    Language modeling loss (for next-token prediction).
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.text_config.vocab_size)`):
    Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
    It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

    Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
    `past_key_values` input) to speed up sequential decoding.
image_hidden_states (`torch.FloatTensor`, *optional*):
    A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
    image_hidden_states of the model produced by the vision encoder after projecting last hidden state.
audio_hidden_states (`torch.FloatTensor`, *optional*):
    A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
    audio_hidden_states of the model produced by the audio encoder and after projecting the last hidden state.
Nlosslogitspast_key_valueshidden_states
attentionsr>   r?   r-   )r.   r/   r0   r1   r2   rD   r3   r@   r5   rE   rF   r
   rG   tuplerH   r>   r?   r6   r-   r7   r8   rB   rB   _   s    $ &*D%

d
")'+FE$+$(OUT\(59M5**+d2926Je''(4/648**T1848**T18r7   rB   c                      ^  \ rS rSrSS\S\S\4U 4S jjjrS\R                  4S jr
S\R                  S\R                  4S	 jrS
rU =r$ )Gemma3nRMSNorm   dimeps
with_scalec                    > [         TU ]  5         X l        X0l        U R                  (       a/  [        R
                  " [        R                  " U5      SS9U l        g g )NT)requires_grad)	super__init__rN   rO   nn	Parameterr3   onesweight)selfrM   rN   rO   	__class__s       r8   rS   Gemma3nRMSNorm.__init__   s>    $??,,uzz#dKDK r7   rG   c                     UR                  S5      R                  SSS9U R                  -   nU[        R                   " US5      -  $ )Nr!   T)keepdim      )powmeanrN   r3   )rX   rG   mean_squareds      r8   _normGemma3nRMSNorm._norm   sA    $((+00T0BTXXMuyyt<<<r7   returnc                     U R                  UR                  5       5      nU R                  (       a  X R                  R                  5       -  nUR	                  U5      $ N)rb   floatrO   rW   type_as)rX   rG   normed_outputs      r8   forwardGemma3nRMSNorm.forward   sF    

=#6#6#89??)KK,=,=,??M$$]33r7   )rN   rW   rO   )gư>T)r.   r/   r0   r1   intrg   boolrS   r3   Tensorrb   rj   r6   __classcell__rY   s   @r8   rK   rK      sW    LC Le L L L=5<< =
4U\\ 4ell 4 4r7   rK   c                   &  ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  S\R                  4S jr	S\R                  S	\
S
\
S\
S\
S\
S\
S\R                  4S jrS\R                  S\R                  S\R                  4S jrSrU =r$ )%Gemma3nAudioRelativePositionEmbedding   configc                 L  > [         TU ]  5         Xl        U R                  R                  U l        U R                  R
                  U l        U R                  U R                  -  U l        [        SU R                  R                  S-
  5      U l
        U R                  R                  U l        [        R                  " U R                  U R                  U R                  -  SS9U l        SnSnU R                  S-  n[         R"                  " [%        U5      [%        U5      -  5      [        US-
  S5      -  nU[&        R(                  " [&        R*                  " U5      U* -  5      -  nU R-                  SUR%                  5       R/                  S5      R/                  S5      SS	9  g )
Nr   r#   Fbias      ?     @r!   inv_timescales
persistent)rR   rS   rt   conf_num_attention_heads	num_headshidden_sizechannelshead_dimmaxconf_attention_context_leftmax_backwardconf_attention_context_rightmax_forwardrT   Linearpos_projmathlogrg   r3   exparangeregister_buffer	unsqueeze)rX   rt   min_timescalemax_timescalenum_timescaleslog_timescale_incrementrz   rY   s          r8   rS   .Gemma3nAudioRelativePositionEmbedding.__init__   sJ   ==//74;;#J#JQ#NO;;CC		$--$--1OV[\!+"&((5+?%BV+V"WZ]^lop^prsZt"t&5<<3OSjRj3j)kk  ",,Q/99!< 	 	
r7   positiondtyperd   c                 H   UR                  5       R                  S5      nXR                  R                  UR                  [
        R                  S9-  n[
        R                  " [
        R                  " U5      [
        R                  " U5      /SS9nUR                  U5      $ )Nr\   devicer   rM   )rg   r   rz   tor   r3   float32catsincostype)rX   r   r   scaled_timetiming_signals        r8   _get_timing_signal_1d_pos?Gemma3nAudioRelativePositionEmbedding._get_timing_signal_1d_pos   s{    >>#--b1!4!4!7!7xV[VcVc!7!dd		599[#9599[;Q"RXZ[!!%((r7   term_bd_before_shift
batch_sizer~   num_query_blocksquery_block_sizekey_context_sizemax_span_plus_1c                     US-   U-
  nSU4n	[         R                  R                  X5      n
U
R                  UUUXVS-   -  45      nUSS2SS2SS2SXV-  24   nUR                  UUUUU45      nU$ )a"  Performs the relative shift.

Args:
  term_bd_before_shift: Tensor of shape [B, N, U, W, F_span]. batch_size
    (B), num_heads (N), num_query_blocks (U), query_block_size (W),
    key_context_size (C = W+L+R), max_span_plus_1 (F_span = L+R+1).

Returns:
  Tensor of shape [B, N, U, W, C].
r#   r   N)rT   
functionalpadreshape)rX   r   r   r~   r   r   r   r   pad_amount_last_dimpadding_tupleterm_bd_paddedterm_bd_reshapedterm_bd_slicedterm_bd_shifteds                 r8   _relative_shift5Gemma3nAudioRelativePositionEmbedding._relative_shift   s    4  0!3F /0**+?O
 *11  q$89	
 *!Q3X5E5X3X*XY )00   
 r7   querieskeysc           	      >   UR                   u  p4pVnUR                   u    p  n[        R                  " U R                  U R                  * S-
  SUR
                  S9R                  S5      n
U
R                   S   nU R                  XR                  S9nU R                  U5      nUR                  SXR                  U R                  5      R                  S5      nUR                  SSSSS5      nUR                  SSSSS5      n[        R                  " UU5      nUR                  SSSSS5      nUR                  SSS5      nUR                  X6XE-  U5      n[        R                  " UU5      nUR                  UUUUU5      nU R!                  UUUUUU	U5      nUU-   $ )	Nr#   r\   r   r   r   r   r!      )shaper3   r   r   r   r   r   r   r   r   r   r~   r   squeezepermutematmulr   )rX   r   r   r   r   r   r~   r   _r   pos_indicesr   sin_emb_timing_signalprojected_sin_embsin_emb	queries_pkeys_p_tterm_ac
q_permuted
s_permuted
q_reshapedterm_bd_unshifed_matmulterm_bd_unshifedr   s                           r8   rj   -Gemma3nAudioRelativePositionEmbedding.forward   s    OVmmK
&68'+zz$11 ll4#4#4t7G7G6G!6KRX_XfXfgqq
 &++A. $ > >}} !? !

 !MM*?@#++APTP]P]^ff
 OOAq!Q2	<<1aA.,,y(3 __Q1a3
 __Q1-
  ''
?O?bdlm

 #(,,z:"F 3::
 ..
 ((r7   )r   rt   r   r   r   r~   r   )r.   r/   r0   r1   r$   rS   r3   rn   r   r   rl   r   rj   r6   ro   rp   s   @r8   rr   rr      s    
1 
.)%,, )u{{ )W\WcWc );#ll; ; 	;
 ; ; ; ; 
;zL)u|| L)5<< L)ELL L) L)r7   rr   c                   >  ^  \ rS rSrS\4U 4S jjrS rS\R                  S\	S\	S\R                  4S	 jr
S
\R                  S\R                  4S jrS
\R                  S\R                  4S jrS
\R                  S\R                  S\R                  4S jrSrU =r$ )Gemma3nAudioAttentioniD  rt   c                   > [         TU ]  5         Xl        U R                  R                  U l        U R                  R
                  U l        U R
                  U R                  -  U l        U R                  R                  U l        U R                  R                  U l
        [        SU R                  R                  S-
  5      U l        U R                  R                  U l        U R                  U R                  -   U R                  -   U l        [#        U5      U l        [&        R(                  " [*        R,                  " U R                  45      5      U l        [&        R0                  " U R
                  U R                  U R                  -  SS9U l        [&        R0                  " U R
                  U R                  U R                  -  SS9U l        [&        R0                  " U R
                  U R                  U R                  -  SS9U l        U R                  S-  nS[*        R&                  R8                  R;                  [*        R<                  " S5      5      -  nU R?                  SX#-  RA                  5       RC                  5       SS	9  U RE                  5       nU R?                  S
USS	9  U R?                  S[*        R<                  " U R                  5      RG                  5       SS	9  g )Nr   r#   Frv   r^   rx           q_scaler{   local_causal_valid_masksoftcap)$rR   rS   rt   r}   r~   r   r   conf_attention_chunk_size
chunk_sizer   max_future_horizonr   r   max_past_horizonconf_attention_logit_capattention_logits_soft_capcontext_sizerr   relative_position_embeddingrT   rU   r3   zerosper_dim_scaler   q_projk_projv_projr   softplustensorr   clonedetachcreate_local_causal_valid_maskrg   )rX   rt   r   r_softplus_0r   rY   s        r8   rS   Gemma3nAudioAttention.__init__E  s%   ==;;22((DNN:++??"&++"J"J #At{{'N'NQR'R S)-)M)M& OOd.C.CCdF]F]]+PQW+X(\\%++t}}6F*GHii 0 0$..4==2PW\]ii 0 0$..4==2PW\]ii 0 0$..4==2PW\]--%UXX0099%,,s:KLLY)?(F(F(H(O(O(Q^cd"&"E"E"G68O\abLL778>>@ 	 	
r7   c                    [         R                  " [         R                  " U R                  U R                  4[         R
                  S9SS9R                  n[         R                  " [         R                  " U R                  U R                  4[         R
                  S9U R                  U R                  -   S9n[         R                  " U R                  U R                  4[         R
                  S9nX1-  U-  nU$ )Nr   r   )diagonal)	r3   trilrV   r   r   rm   Tr   r   )rX   lower_causal_maskupper_causal_maskr   s       r8   r   4Gemma3nAudioAttention.create_local_causal_valid_maskg  s    !JJJJ))4??;5::N
 ! 	 "JJJJ):):;5::N**T-D-DD
 #(**doot?P?P-QY^YcYc"d"9"MPa"a&&r7   xpad_left	pad_rightrd   c                     UR                   tpEnUR                  XB/UQ75      nUR                  XC/UQ75      n[        R                  " XqU/SS9nU$ )Nr#   r   )r   	new_zerosr3   r   )	rX   r   r   r   batchr   
tail_shapeleftrights	            r8   	_pad_dim1Gemma3nAudioAttention._pad_dim1t  sV     !:{{E9j9:U;
;<IIt&A.r7   rG   c                 "   UR                   nUSS u  p4X@R                  -   S-
  U R                  -  nXPR                  -  U-
  =nS:  a  U R                  USU5      nX5U R                  4USS -   nUR                  U5      R	                  5       nU$ )a  Turns a sequence to non overlapping blocks.

Args:
    hidden_states: a tensor of [batch, time, ...].

Returns:
    A tensor of [batch, num_blocks, block_size, ...], with necessary
    paddings,
    where output[:, i, ...] are x[:, i*block_size:(i+1)*block_size, ...].
Nr!   r#   r   )r   r   r   r   
contiguous)rX   rG   r   bt
num_blockspadding_lenpermute_dimss           r8   _convert_to_block'Gemma3nAudioAttention._convert_to_block{  s     ##Ray//)A-$//A
%7!;;Kq@ NN=![IMt7%)C%--l;FFHr7   c                 R   U R                   nU R                  U R                  -   S-
  nU R                  XU5      nU R                  nU R                  nUR                  SXES9nUR                  S:  a&  UR                  S:  a  [        R                  " USSS9nUR                  5       $ )a  Extracts temporal context for every block.

Args:
    hidden_states: a tensor of [batch, time, ...].

Returns:
    A tensor of [batch, num_blocks, context_size, ...], with necessary
    paddings,
    where context_size = block_size + left_context + right_context,
    and output[:, i, ...] are x[:, start-left_context:end+right_context,
    ...],
    start = i * block_size, end = (i + 1) * block_size.
r#   )	dimensionsizestepr!   r   r\   )sourcedestination)
r   r   r   r   r   unfoldndimr3   movedimr   )rX   rG   r   r   	frame_len
frame_step
x_unfoldeds          r8   _extract_block_context,Gemma3nAudioAttention._extract_block_context  s     (( ++doo=A	}	J%%	__
 #))AI)W
 !joo&9 z"!LJ$$&&r7   maskc                    / UR                   S S QU R                  PU R                  P7nU R                  U5      R	                  U5      R                  5       nU R                  U5      R	                  U5      R                  5       nU R                  U5      R	                  U5      R                  5       n[        R                  R                  R                  U R                  5      nSSSU R                  4nUR                  U5      n	X@R                  -  U	-  nUR                   S S u  pU R                  U5      nU R!                  U5      nU R!                  U5      nUR                   S   nU) nU R!                  U5      nUR"                  S:X  aI  UR                   S   UR                   S   -  U R$                  :X  a  UR	                  XU R$                  5      nUR                   U
UU R$                  4:w  a,  ['        SUR                    SU
 SU SU R$                   S	3	5      eUR)                  S5      R)                  S
5      nU R*                  R)                  S5      R)                  S5      R)                  S5      n[        R,                  " UUR/                  UR0                  5      5      nU R3                  X5      nU R4                  R/                  UR0                  5      nUU-  n[        R6                  " U5      nUU-  n[        R8                  " UU[        R:                  " UR<                  5      R>                  5      n[        R                  R                  RA                  US[        RB                  S9R/                  UR<                  S9nUR                   u  nnnnnUR                   S   nURE                  SSSSS5      R	                  SUU5      nURE                  SSSSS5      R	                  SUU5      n[        RF                  " UU5      n U R	                  UUUUU5      RE                  SSSSS5      n!U!R	                  U
XRH                  -  U R                  U R                  45      n!U!S S 2S U24   n!U!$ )Nr\   r#   r!   r   r   z%Shape of extracted_valid_mask_blocks z	 is not (z, z) after potential reshape.r   rM   r   r   )%r   r~   r   r   r   r   r   r   r3   rT   r   r   r   viewr   r  r  r  r   
ValueErrorr   r   logical_andr   r   r   r   tanhwherefinfor   minsoftmaxr   r   bmmr   )"rX   rG   r  	qkv_shapequery_states
key_statesvalue_statesper_dim_scale_spbroadcast_shapeper_dim_scale_sp_broadcastr   q_timequery_blocks
key_blocksvalue_blocksr   original_valid_maskextracted_valid_mask_blockscondition_from_input_validitycondition_from_causalityfinal_condition_for_whererE   softcap_valprobabilitiesb_dimn_dimu_dimw_dimc_dimh_dimprob_bunv_bun
result_bmmcontext_vectorss"                                     r8   rj   Gemma3nAudioAttention.forward  sI   Nm))#2.NNN	{{=199)DOOQ[[/77	BMMO
{{=199)DOOQ 88..778J8JKaDMM2%5%:%:?%K"#ll25OO)//3
--l;00<
22<@'--a0  $e '+&A&ABU&V# (,,1+11!47R7X7XYZ7[[_c_p_pp*E*M*Md.?.?+' ',,1
 

 /556i
| L$%R(9(9'::TV  )D(M(Ma(P(Z(Z[](^% $(#?#?#I#I!#L#V#VWX#Y#c#cde#f 
 %*$5$5)$''(E(L(LM%
! 11,K lloofmm4+%F#+% 6FLL@Y@]@]^++33F%--3X[[bnbtbt[u -:,?,?)ueUE""2& ((Aq!Q7??E5Q$$Q1a3;;BuMYYx/
$,,UE5%OWWXY[\^_abdef)11 ??2	
 *!WfW*5r7   )r   r   rt   r   r   r   r   r   r   r~   r   r   r   r   )r.   r/   r0   r1   r$   rS   r   r3   rn   rl   r   r  r  r4   rj   r6   ro   rp   s   @r8   r   r   D  s     
1  
D'5<< 3 3 5<< u||  ,.'ELL .'U\\ .'`dU\\ d9I9I dell d dr7   r   c                      ^  \ rS rSrSr SS\S\\   S\4U 4S jjjrS\	R                  S\	R                  4S	 jrS
rU =r$ )Gemma3nAudioCumulativeGroupNormi(  a  Applies Group Normalization cumulatively over the time dimension.

This layer normalizes the input by calculating the mean and variance
cumulatively over the time dimension (dim 1). The statistics are computed
over all feature dimensions (specified by `feature_dims` and `num_channels`)
for elements marked as valid by the optional `mask`.

If a `mask` is provided (True for valid, False for invalid/padded),
invalid time steps do not contribute to the statistics calculation, and
their corresponding output values are zeroed out.

Scale and bias, if enabled, are applied per-channel (last dimension).
This behavior is similar to JAX's `GroupNormalization` with `num_groups=1`
and `cumulative=True`.
num_channelsfeature_dimsrN   c           	        > [         TU ]  5         Xl        [        U5      U l        X0l        [        R                  " [        R                  " U5      5      U l
        [        [        SS[        U R                  5      -   S-   5      5      U l        g )Nr!   r#   )rR   rS   r>  rI   r?  rN   rT   rU   r3   rV   rW   rangelenreduction_axes)rX   r>  r?  rN   rY   s       r8   rS   (Gemma3nAudioCumulativeGroupNorm.__init__9  sn     	(!,/ ll5::l#;< $E!QT5F5F1G-G!-K$LMr7   rG   rd   c                    U R                   U R                  4-   nUR                  SS U:w  a  [        SUR                  SS  SU 35      eUR                  n[
        R                  nUR                  U5      n[
        R                  " XTS9n[
        R                  " XPR                  SS9n[
        R                  " USS	9n[
        R                  " X`R                  SS9n	[
        R                  " U	SS	9n
[
        R                  " U
S
S9nX-  nX\-
  R                  S5      n[
        R                  " XR                  SS9n[
        R                  " USS	9nX-  nX\-
  [
        R                  " UU R                  -   5      -  nU R                   R                  U5      nS/UR#                  5       S-
  -  U R                  /-   nUUR%                  U5      -  nUU-  nUR                  U5      $ )zApplies cumulative group norm, optionally using a mask.

Args:
  hidden_states: Input tensor, shape [B, T, *feature_dims, C].

Returns:
  Normalized tensor with the same shape as x.
r!   NzInput tensor shape suffix z> does not match expected suffix (feature_dims + num_channels) r   TrM   r]   r#   r   rx   )r  )r?  r>  r   r  r   r3   r   r   	ones_likesumrC  cumsumclampr_   rsqrtrN   rW   rM   r  )rX   rG   expected_input_suffixinput_dtype
calc_dtypex_calc	mask_calcsum_values_at_tcum_sum_valueselements_in_group_at_tcum_count_elementssafe_cum_count_elementscum_meansquared_diff_from_meansum_sq_diff_at_tcum_sum_sq_diffcum_variancenormalized_xscalescale_view_shapefinal_outputs                        r8   rj   'Gemma3nAudioCumulativeGroupNorm.forwardK  s    !% 1 1T5F5F4H Hqr"&;;,]-@-@-D,E F99N8OQ 
 $))]]
!!*- OOF=	  ))F0C0CTRo1= "'9:M:MW[!\"\\*@aH"'++.@c"J ";
 #)"3!8!8!; 99%;ATAT^bc  ,,'7Q? '@ )U[[9P-QQ z*3-"3"3"5"9:d>O>O=PP#ejj1A&BB $i/{++r7   )rN   r?  r>  rC  rW   )gMbP?)r.   r/   r0   r1   r2   rl   r   rg   rS   r3   rn   rj   r6   ro   rp   s   @r8   r=  r=  (  s`    ( 	NN smN 	N N$G,U\\ G,ell G, G,r7   r=  c                      ^  \ rS rSrSr SS\S\S\S\\\\\4   4U 4S jjjrS\	R                  S	\	R                  4S
 jrSrU =r$ )Gemma3nAudioSSCPConvBlocki  zA single convolution block for the SubSampleConvProjection.

This block consists of a 2D convolution, followed by CumulativeGroupNorm,
and a ReLU activation. It handles manual padding for the convolution.
rt   idxinput_freq_dimmanual_paddingc           	      6  > [         TU ]  5         Xl        X@l        US:X  a  SOU R                  R                  US-
     nU R                  R                  U   nU R                  R
                  U   u  pxU R                  R                  U   u  p[        R                  " UUUU4X4SSS9U l	        X0R                  S   -   U R                  S   -   nX-
  U
-  S-   n[        UU4U R                  R                  S9U l        [        R                  " 5       U l        g )Nr   r#   )r   r   F)in_channelsout_channelskernel_sizestridepaddingrw   )r>  r?  rN   )rR   rS   rt   rd  sscp_conv_channel_sizesscp_conv_kernel_sizesscp_conv_stride_sizerT   Conv2dconvr=  sscp_conv_group_norm_epsnormReLU
activation)rX   rt   rb  rc  rd  rf  rg  kernel_hkernel_wstride_hstride_wf_in_padded
f_out_convrY   s                r8   rS   "Gemma3nAudioSSCPConvBlock.__init__  s    	, !8a)K)KCRSG)T{{99#>![[>>sC![[>>sCII#% '

	 %':':1'==@S@STU@VV!,9A=
3%$44
	 '')r7   audio_encodingsrd   c                    [         R                  " XR                  SSS9R                  U R                  R
                  R                  5      nU R	                  U5      nUR                  SSSS5      R                  5       nU R                  U5      nUR                  SSSS5      R                  5       nU R                  U5      $ )Nconstantr   )modevaluer   r!   r   r#   )Fr   rd  r   ro  rW   r   r   r   rq  rs  )rX   r{  audio_encodings_paddedaudio_encodings_conv
x_for_normx_normedaudio_encodings_normeds          r8   rj   !Gemma3nAudioSSCPConvBlock.forward  s     "#8K8KR\dg!h!k!kII"""

  $yy)?@ *11!Q1=HHJ
99Z(!)!1!1!Q1!=!H!H!J566r7   )rs  rt   ro  rd  rq  ))r   r   r   r   )r.   r/   r0   r1   r2   r$   rl   rI   rS   r3   rn   rj   r6   ro   rp   s   @r8   ra  ra    sm     5A)$")$ )$ 	)$
 c3S01)$ )$V7u|| 7 7 7r7   ra  c                   j   ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ )#Gemma3nAudioSubSampleConvProjectioni  rt   c                 Z  > [         TU ]  5         Xl        UR                  n/ n/ n[	        S5       Hk  nUR
                  U   u  pgUR                  U   u  pSn
US-
  nSnSnUUU
U4nUR                  U5        X,-   U-   nX-
  U	-  S-   nUR                  U5        UnMm     [        SUR                  UUS   S9U l	        [        SUS   UUS   S9U l
        UR                  S   nUS   nUU-  U l        [        R                  " U R                  U R                  R                  SS9U l        g )Nr!   r   r#   )rb  rc  rt   rd  r\   Frv   )rR   rS   rt   input_feat_sizerA  rl  rm  appendra  conv_0conv_1rk  input_proj_in_featuresrT   r   r   input_proj_linear)rX   rt   current_f_for_block_inputcalculated_block_paddingcalculated_f_out_dimsirt  ru  rv  rw  	pad_t_toppad_t_bottom
pad_f_leftpad_f_rightmanual_padding_tuplerx  f_out_after_convfinal_c_outfinal_f_outrY   s                      r8   rS   ,Gemma3nAudioSubSampleConvProjection.__init__  sr   $*$:$:!#%  "qA!'!=!=a!@H!'!=!=a!@H I#a<L JK 	$  %++,@A 4@;NK + 68CaG!(()9:(8%= @ 0!113A6	
 0033A6	
 33B7+B/&1K&?#!#4+F+FH_H_fk!lr7   r{  rd   c                    UR                  S5      nU R                  U5      nU R                  U5      nUR                  u  pEpgUR	                  SSSS5      R                  5       nUR                  XFXu-  5      n	U R                  U	5      n
U
$ )Nr#   r   r!   r   )r   r  r  r   r   r   r  r  )rX   r{  audio_encodings_reshapedr   r   c_outt_outf_out
x_permutedoutput_flattenedoutputs              r8   rj   +Gemma3nAudioSubSampleConvProjection.forward  s     $3#<#<Q#? KK01KKN!"%YYq!Q*557
%??1U]C''(89r7   )rt   r  r  r  r  r.   r/   r0   r1   r$   rS   r3   rn   rj   r6   ro   rp   s   @r8   r  r    s3    7m1 7mru||   r7   r  c                      ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  S\R                  4S jr	Sr
U =r$ )	Gemma3nAudioConformerAttentioni#  rt   c                   > [         TU ]  5         Xl        U R                  R                  U l        U R                  S[        R                  " U R                  R                  5      SS9  [        U R                  R                  5      U l
        [        U5      U l        [        R                  " U R                  U R                  R                  SS9U l        [        U R                  R                  5      U l        g )Ngradient_clippingFr{   rv   )rR   rS   rt   r   post_in_featuresr   r3   r   r  rK   pre_attn_normr   attnrT   r   post	post_normrX   rt   rY   s     r8   rS   'Gemma3nAudioConformerAttention.__init__$  s     $ 7 70%,,t{{?\?\2]jop+DKK,C,CD)&1	IId33T[[5L5LSXY	'(?(?@r7   r{  r,   rd   c                    Un[         R                  " XR                  * U R                  5      nU R                  U5      nU R	                  XB5      nUR
                  u  pgpUR                  XgX-  5      n
U R                  U
5      n[         R                  " XR                  * U R                  5      nX0R                  U5      -   $ rf   )	r3   rJ  r  r  r  r   r   r  r  )rX   r{  r,   audio_encodings_input_to_attnaudio_encodings_normaudio_encodings_attn_outr   r   r~   r   r  s              r8   rj   &Gemma3nAudioConformerAttention.forward.  s    (7%++o8N8N7NPTPfPfg#11/B#'99-A#R  %=$B$B!i#;#C#CA)J^#_ ))$<=++o8N8N7NPTPfPfg,~~o/NNNr7   )r  rt   r  r  r  r  r.   r/   r0   r1   r$   rS   r3   rn   r4   rj   r6   ro   rp   s   @r8   r  r  #  sG    A1 AOu|| OUEUEU OZ_ZfZf O Or7   r  c                   j   ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ ) Gemma3nAudioConformerFeedForwardi?  rt   c                 ~  > [         TU ]  5         Xl        U R                  S[        R
                  " U R                  R                  5      SS9  [        U R                  R                  5      U l	        [        R                  " U R                  R                  U R                  R                  S-  SS9U l        [        R                  " U R                  R                  S-  U R                  R                  SS9U l        [        U R                  R                  5      U l        U R                  R                  U l        g )Nr  Fr{   r   rv   )rR   rS   rt   r   r3   r   r  rK   r   pre_layer_normrT   r   ffw_layer_1ffw_layer_2post_layer_normconf_residual_weightpost_layer_scaler  s     r8   rS   )Gemma3nAudioConformerFeedForward.__init__@  s    0%,,t{{?\?\2]jop,T[[-D-DE99T[[%<%<dkk>U>UXY>Y`ef99T[[%<%<q%@$++BYBY`ef-dkk.E.EF $ @ @r7   r{  rd   c                    Un[         R                  " XR                  * U R                  5      nU R                  U5      nU R	                  U5      n[
        R                  R                  U5      nU R                  U5      n[         R                  " XR                  * U R                  5      nU R                  U5      nX!U R                  -  -   $ rf   )r3   rJ  r  r  r  rT   r   silur  r  r  )rX   r{  residuals      r8   rj   (Gemma3nAudioConformerFeedForward.forwardL  s    "++o8N8N7NPTPfPfg--o>(,(8(8(I--,,_=(,(8(8(I++o8N8N7NPTPfPfg..?T-B-BBCCr7   )rt   r  r  r  r  r  r  rp   s   @r8   r  r  ?  s6    
A1 
A	Du|| 	D 	D 	Dr7   r  c                   j   ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ ) Gemma3nAudioConformerLightConv1diX  rt   c           
        > [         TU ]  5         Xl        [        U R                  R                  U R                  R
                  S9U l        [        R                  " U R                  R                  U R                  R                  S-  SS9U l	        [        R                  " U R                  R                  U R                  R                  U R                  R                  SSU R                  R                  SS9U l        U R                  S[        R                  " U R                  R                   5      SS	9  [        U R                  R                  U R                  R
                  S9U l        [        R                  " U R                  R                  U R                  R                  SS9U l        U R                  R                  S-
  U l        g )
NrN   r!   Frv   r#   r   )rf  rg  rh  ri  rj  groupsrw   r  r{   )rR   rS   rt   rK   r   rms_norm_epsr  rT   r   linear_startConv1dconf_conv_kernel_sizedepthwise_conv1dr   r3   r   r  	conv_norm
linear_endcausal_paddingr  s     r8   rS   )Gemma3nAudioConformerLightConv1d.__init__Y  sB   ,T[[-D-D$++JbJbcIIdkk&=&=t{{?V?VYZ?Zafg "		//0099;;**!
 	0%,,t{{?\?\2]jop'(?(?T[[E]E]^))DKK$;$;T[[=T=T[`a"kk??!Cr7   r{  rd   c                 2   UnU R                  U5      nU R                  U5      n[        R                  R                  R                  USS9nUR                  SSS5      n[        R                  " X0R                  S45      nU R                  U5      nUR                  SSS5      n[        R                  " XR                  * U R                  5      nU R                  U5      n[        R                  R                  U5      nU R                  U5      nX-   nU$ )Nr\   r   r   r!   r#   )r  r  r3   rT   r   glur   r  r   r  r  rJ  r  r  r  r  )rX   r{  audio_encodings_residualaudio_encodings_permutedaudio_encodings_permuted_paddedr  s         r8   rj   (Gemma3nAudioConformerLightConv1d.forwardn  s    #2 --o>++O<((--11/r1J#2#:#:1a#C *+%%0HK^K^`aJb*c'//0OP)11!Q:++o8N8N7NPTPfPfg..9--,,_=///: ;r7   )r  rt   r  r  r  r  r  r  rp   s   @r8   r  r  X  s2    D1 D*u||   r7   r  c                      ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  S\R                  4S jr	Sr
U =r$ )	Gemma3nAudioConformerBlocki  rt   c                   > [         TU ]  5         Xl        [        U R                  5      U l        [        U R                  5      U l        [        U R                  5      U l        [        U R                  5      U l	        U R                  S[        R                  " U R                  R                  5      SS9  [        U R                  R                  5      U l        g )Nr  Fr{   )rR   rS   rt   r  ffw_layer_startr  	attentionr  lconv1dffw_layer_endr   r3   r   r  rK   r   rq  r  s     r8   rS   #Gemma3nAudioConformerBlock.__init__  s    ?L7D7D=dkkJ0%,,t{{?\?\2]jop"4;;#:#:;	r7   r{  r,   rd   c                 f   U R                  U5      nU R                  X5      nU) nXR                  S5      R                  UR                  5      -  nU R                  U5      nU R                  U5      n[        R                  " XR                  * U R                  5      nU R                  U5      nU$ )Nr\   )r  r  r   r   r   r  r  r3   rJ  r  rq  )rX   r{  r,   validity_mask_for_lconvaudio_encodings_for_lconv_inputr  s         r8   rj   "Gemma3nAudioConformerBlock.forward  s    ..?..I#1/*9<]<]^`<a<d<d!!=
 +
' ,,'FG,,_=++o8N8N7NPTPfPfg?+r7   )r  rt   r  r  r  rq  r  rp   s   @r8   r  r    s@    	<1 	<u|| UEUEU Z_ZfZf  r7   r  c            	       l   ^  \ rS rSrSrSS\S\S\S\4U 4S jjjrS\R                  4U 4S	 jjr
S
rU =r$ )Gemma3nTextScaledWordEmbeddingi  zT
This module overrides nn.Embeddings' forward by multiplying with embeddings scale.
num_embeddingsembedding_dimpadding_idxembed_scalec                 |   > [         TU ]  XU5        X@l        U R                  S[        R
                  " U5      SS9  g )Nr  Fr{   )rR   rS   scalar_embed_scaler   r3   r   )rX   r  r  r  r  rY   s        r8   rS   'Gemma3nTextScaledWordEmbedding.__init__  s7    D"-]ELL,ERWXr7   	input_idsc                    > [         TU ]  U5      U R                  R                  U R                  R
                  5      -  $ rf   )rR   rj   r  r   rW   r   )rX   r  rY   s     r8   rj   &Gemma3nTextScaledWordEmbedding.forward  s2    wy)D,<,<,?,?@Q@Q,RRRr7   )r  )rx   )r.   r/   r0   r1   r2   rl   rg   rS   r3   rn   rj   r6   ro   rp   s   @r8   r  r    sM    Ys Y3 YS Y_d Y Y
S S Sr7   r  c                   n   ^  \ rS rSrSrS\4U 4S jjrS\R                  S\R                  4S jr	Sr
U =r$ )	Gemma3nTextLaurelBlocki  z Learned Augmented Residual Layerrt   c                   > [         TU ]  5         Xl        [        R                  " U R                  R
                  U R                  R                  SS9U l        [        R                  " U R                  R                  U R                  R
                  SS9U l        [        U R                  R
                  U R                  R                  S9U l        g )NFrv   r  )rR   rS   rt   rT   r   r   laurel_ranklinear_leftlinear_rightrK   r  post_laurel_normr  s     r8   rS   Gemma3nTextLaurelBlock.__init__  s    99T[[%<%<dkk>U>U\abIIdkk&=&=t{{?V?V]bc .t{{/F/FDKKLdLd er7   rG   rd   c                 p    U R                  U5      nU R                  U5      nU R                  U5      nX-   $ rf   )r  r  r  )rX   rG   laurel_hidden_statesnormed_laurel_hidden_statess       r8   rj   Gemma3nTextLaurelBlock.forward  s@    -1-=-=m-L-1->->?S-T&*&;&;<P&Q#::r7   )rt   r  r  r  )r.   r/   r0   r1   r2   r&   rS   r3   rn   rj   r6   ro   rp   s   @r8   r  r    s5    *f0 f;U\\ ;ell ; ;r7   r  c                      ^  \ rS rSrSS\S\4U 4S jjjrS\R                  S\R                  4S jr	S\R                  S\R                  4S	 jr
S
rU =r$ )Gemma3nTextMLPi  rt   	layer_idxc                   > [         TU ]  5         Xl        UR                  U l        UR                  U   U l        [
        R                  " U R                  U R                  SS9U l        [
        R                  " U R                  U R                  SS9U l        [
        R                  " U R                  U R                  SS9U l	        [        UR                     U l        UR                  U   U l        g )NFrv   )rR   rS   rt   r   intermediate_sizerT   r   	gate_projup_proj	down_projr	   hidden_activationact_fnactivation_sparsity_patternactivation_sparsityrX   rt   r   rY   s      r8   rS   Gemma3nTextMLP.__init__  s    !--!'!9!9)!D4#3#3T5K5KRWXyy!1!143I3IPUV4#9#94;K;KRWXV556#)#E#Ei#P r7   rG   rd   c                     U R                  U5      nU R                  S:  a  U R                  U5      nU R                  U5      nU R	                  U5      nU R                  X4-  5      nU$ )Nr   )r  r	  _gaussian_topkr  r  r  )rX   rG   r  activationsr  r  s         r8   rj   Gemma3nTextMLP.forward  sa    NN=1	##c)++I6Ikk),,,}-NN;#89	r7   inputsc                    [         R                  " U R                  [         R                  UR                  S9n[         R
                  R                  R                  SS5      nUR                  U5      nUR                  UR                  5      n[         R                  " USSS9n[         R                  " USSSS9nXVU-  -   n[        R                  R                  X-
  5      $ )	Nr   r   r   r#   r\   TrF  F)rM   r]   unbiased)r3   r   r	  r   r   distributionsnormalNormalicdfr   r   r`   stdrT   r   relu)rX   r  target_sparsity_tensornormal_diststd_multiplierinputs_mean
inputs_stdcutoff_xs           r8   r  Gemma3nTextMLP._gaussian_topk  s    !&d.F.Femmdjdqdq!r ))00771='2'7'78N'O',,V\\:jjR>YYv2teL
n!<<}}!!&"344r7   )r  r	  rt   r  r  r   r  r  )r   )r.   r/   r0   r1   r&   rl   rS   r3   rn   rj   r  r6   ro   rp   s   @r8   r  r    s[    	Q0 	QS 	Q 	QU\\ ell 5U\\ 5ell 5 5r7   r  c                   n  ^  \ rS rSrSrS\4U 4S jjrS\R                  S\R                  4S jr	S\R                  S\R                  4S	 jr
S
\R                  S\R                  S\R                  4S jrS\R                  S\R                  4S jrS\R                  S\R                  4S jrSrU =r$ )Gemma3nTextAltUpi  a  Alternating Updates (AltUp)

The AltUp module wraps transformer layers. The `predict` step modifies the
input to the transformer layer, and the `correct` step propagates the output
of the transformer layer to the sparsely updated dimensions.

See more in the research paper:

https://proceedings.neurips.cc/paper_files/paper/2023/file/f2059277ac6ce66e7e5543001afa8bb5-Paper-Conference.pdf
rt   c                 2  > [         TU ]  5         Xl        [        R                  " [
        R                  " U R                  R                  5      5      U l        [        R                  " U R                  R                  U R                  R                  SS9U l        [        R                  " U R                  R                  U R                  R                  S-  SS9U l        [        R                  " U R                  R                  U R                  R                  SS9U l        [        U R                  R                  U R                  R                  S9U l        U R#                  S[
        R$                  " U R                  R                  S-  5      SS9  g )NFrv   r!   r  router_input_scale      r{   )rR   rS   rt   rT   rU   r3   r   r   correct_output_scaler   altup_num_inputscorrection_coefsprediction_coefsmodality_routerrK   r  router_normr   r   r  s     r8   rS   Gemma3nTextAltUp.__init__  s   $&LLT[[=T=T1U$V! "		$++*F*FHdHdkp q "		$++*F*FHdHdfgHgns t!yy)@)@$++B^B^ejk)$++*A*At{{G_G_`15<<@W@WY]@]3^kpqr7   r   rd   c                     U R                  U5      U R                  -  nU R                  U5      n[        R                  " UR                  5       5      R                  U5      $ rf   )r+  r$  r*  r3   r  rg   rh   )rX   r   router_inputsrouteds       r8   compute_router_modalities*Gemma3nTextAltUp.compute_router_modalities  sM    ((+d.E.EE%%m4zz&,,.)11!44r7   rG   c                    U R                  XR                  R                     5      nU R                  (       ap  U R                  R                  bY  U R
                  R                  R                  R                  U R                  R                  * U R                  R                  5        U R                  U5      R                  " / UR                  SS QU R                  R                  PU R                  R                  P76 R                  SSSS5      n[        R                  " UR                  SSSS5      U5      nUR                  SSSS5      nXA-  nUR                  5       R!                  U5      $ )a  Predicts the output of a layer using a trainable map.

Args:
    hidden_states: A 4D tensor of shape `[num_altup_inputs, batch_size, num_tokens, hidden_size]` derived by
        stacking the input embeddings and preprocessing the last `num_altup_inputs - 1` matrices.

Returns:
    A 4D tensor of shape `[num_altup_inputs, batch_size, num_tokens, hidden_size]` containing the predictions.
Nr\   r   r#   r   r!   )r0  rt   altup_active_idxtrainingaltup_coef_clipr)  rW   dataclamp_r   r   r'  r   r3   r   r   rh   )rX   rG   
modalities	all_coefspredictionss        r8   predictGemma3nTextAltUp.predict  s?    33M++B^B^4_`
==T[[88D!!((--44dkk6Q6Q5QSWS^S^SnSno !!*-W i &&s+i-1[[-I-IiKO;;KgKgiWQ1a  	 ll=#8#8Aq!#DiP!))!Q15$%%'//>>r7   r:  	activatedc                    U R                  U5      nX!U R                  R                     -
  nUR                  U R                  R                  SSS5      nU R
                  (       a  U R                  R                  b{  U R                  R                  R                  U R                  R                  * U R                  R                  5      n[        R                  R                  R                  X5SS9S-   nOU R                  U5      S-   nUR                  SSS5      R                  S5      n[        R                   " XF5      nXq-  nUR#                  5       R%                  U5      $ )a  Corrects the predictions relative to the

Args:
    predictions: A 4D tensor of shape `[num_altup_inputs, batch_size, num_tokens, hidden_size]` derived by
        stacking the input embeddings and preprocessing the last `num_altup_inputs - 1` matrices.
    activated: A 3D tensor of shape `[batch_size, num_tokens, hidden_size]` containing the activated inputs.

Returns:
    A 4D tensor of shape `[num_altup_inputs, batch_size, num_tokens, hidden_size]` correcting the original
        predictions relative to the activated input embeddings.
r#   Nrv   rx   r!   r   r\   )r0  rt   r3  repeatr'  r4  r5  r(  rW   rJ  r3   rT   r   linearr   r   mulr   rh   )rX   r:  r=  r8  
innovationrW   r9  	correcteds           r8   correctGemma3nTextAltUp.correct  s$    33I>
T[[-I-I!JJ
&&t{{'C'CQ1M
==T[[88D**11779T9T8TVZVaVaVqVqrF++22:D2QTWWI--j9C?I
 %%aA.88<	IIj4	 	##%--i88r7   rC  c                 p    UR                  U R                  5      U R                  -  R                  U5      $ )z
This is only defined as the `forward` so that accelerate hooks can move correctly `correct_output_scale`
(which is a nn.Parameter, not a Module) between devices when offloading. It is otherwise only used in
`scale_corrected_output`
)rh   r&  rX   rC  s     r8   rj   Gemma3nTextAltUp.forward:  s2     !!$";";<t?X?XXaabkllr7   c                 $    U R                  U5      $ )zMScales the provided 3D tensor of shape [batch_size, num_tokens, hidden_size].)rj   rG  s     r8   scale_corrected_output'Gemma3nTextAltUp.scale_corrected_outputB  s    ||I&&r7   )rt   r&  r(  r*  r)  r+  )r.   r/   r0   r1   r2   r&   rS   r3   rn   r0  r;  rD  rj   rJ  r6   ro   rp   s   @r8   r"  r"    s    	r0 r55<< 5ELL 5
?U\\ ?ell ?895<< 9ELL 9U\\ 9>m m%,, m' ' ' 'r7   r"  c                     U SSU R                   S   S-  24   nU SU R                   S   S-  S24   n[        R                  " U* U4SS9$ )z*Rotates half the hidden dims of the input..Nr\   r!   r   )r   r3   r   )r   x1x2s      r8   rotate_halfrO  G  sZ    	
3"!''"+"""	#B	
3q ""	#B99rc2YB''r7   rG   n_reprd   c                     U R                   u  p#pEUS:X  a  U $ U SS2SS2SSS2SS24   R                  X#XU5      n U R                  X#U-  XE5      $ )z
This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
r#   N)r   expandr   )rG   rP  r   num_key_value_headsslenr   s         r8   	repeat_kvrU  N  s_    
 2?1D1D.Ez!!Qa"23::5W\dlmM  e(CTTTr7   modulequerykeyr  attention_maskdropoutscalingr   c                 j   Uc  U R                   S-  n[        X R                  5      n	[        X0R                  5      n
[        R                  " XR                  SS5      5      U-  nUb  X-  n[        R                  " U5      nX-  nUb  X-   n[        R                  R                  US[        R                  S9R                  UR                  5      n[        R                  R                  XU R                  S9n[        R                  " X5      nUR                  SS5      R                  5       nX4$ )Nr^   r!   r   r\   r  )pr4  r#   )r   rU  num_key_value_groupsr3   r   	transposer  rT   r   r  r   r   r   rZ  r4  r   )rV  rW  rX  r  rY  rZ  r[  r   kwargsr!  r"  attn_weightsattn_outputs                r8   eager_attention_forwardrc  Z  s    //4'3 ; ;<JU$?$?@L<<';';Aq'ABWLL#-zz,/#-!#4 ==((2U]](SVVW\WbWbcL==((6??([L,,|:K''1-88:K$$r7   r   r   r   unsqueeze_dimc                 l    UR                  U5      nUR                  U5      nX-  [        U 5      U-  -   $ )a$  Applies Rotary Position Embedding to the query and key tensors.

Args:
    x (`torch.Tensor`): The tensor to embed.
    cos (`torch.Tensor`): The cosine part of the rotary embedding.
    sin (`torch.Tensor`): The sine part of the rotary embedding.
    unsqueeze_dim (`int`, *optional*, defaults to 1):
        The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
        sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
        that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
        k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
        cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
        the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
Returns:
    `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
)r   rO  )r   r   r   rd  s       r8   apply_rotary_pos_embrf  |  s6    " --
&C
--
&CGA,--r7   c                   *  ^  \ rS rSrS\S\4U 4S jjr  SS\R                  S\R                  S\R                  S-  S	\	S-  S
\
\\\R                  \R                  4   4   S-  S\\   S\\R                  \R                  S-  4   4S jjrSrU =r$ )Gemma3nTextAttentioni  rt   r   c                   > [         TU ]  5         Xl        X l        [	        US5      (       a  UR
                  U   OS U l        U R                  S:H  U l        U R                  (       a  UR                  OS U l        [        USUR                  UR                  -  5      U l        UR                  UR                  -  U l        SU l        U R                  R                   U l        SU l        U R                  R$                  U R                  R&                  -
  nX#s=:  =(       a    S:  Os  U l        UR
                  S U nU R(                  (       a@  [+        U5      S-
  US S S2   R-                  UR
                  U   5      -
  U l        S	U l        OBS U l        U[+        U5      S-
  US S S2   R-                  UR
                  U   5      -
  :H  U l        [2        R4                  " UR                  UR                  U R                  -  UR6                  S
9U l        [;        UR                  UR<                  S9U l        U R(                  (       d  [2        R4                  " UR                  UR                  U R                  -  UR6                  S
9U l         [2        R4                  " UR                  UR                  U R                  -  UR6                  S
9U l!        [;        UR                  UR<                  S9U l"        [;        UR                  UR<                  S	S9U l#        [2        R4                  " UR                  U R                  -  UR                  UR6                  S
9U l$        g )Nlayer_typessliding_attentionr   rx   Tr   r#   r\   Frv   )rM   rN   )rM   rN   rO   )%rR   rS   rt   r   hasattrrj  
layer_type
is_slidingsliding_windowgetattrr   num_attention_headsr   rS  r^  r[  attention_dropout	is_causalnum_hidden_layersnum_kv_shared_layersis_kv_shared_layerrB  indexkv_shared_layer_indexstore_full_length_kvrT   r   attention_biasr   rK   r  q_normr   r   k_normv_normo_proj)rX   rt   r   first_kv_shared_layer_idxprev_layersrY   s        r8   rS   Gemma3nTextAttention.__init__  s   ";B6=;Y;Y&,,Y7_c//-@@7;f33D
F4F4F&JdJd4de$*$>$>&B\B\$\!!%!>!>$(KK$A$ADKKDdDd$d!"+"L"L1"L(()C*CD""),[)9A)=DbD@Q@W@WX^XjXjktXu@v)vD&(-D%)-D&(1S5E5IKX\Z\X\L]LcLc""9-M 6 )D% ii : :T]] JQWQfQf
 %f>Q>QR &&))""F$>$>$NU[UjUjDK ))""F$>$>$NU[UjUjDK )V__&BUBUVDK(V__&BUBUbghDKii&&68J8JQWQfQf
r7   NrG   position_embeddingsrY  rF   shared_kv_statesr`  rd   c                    UR                   S S n/ UQSPU R                  R                  P7nUu  pU R                  U5      R	                  U5      nU R                  U5      n[        XU
SS9nUR                  SS5      nU R                  (       aG  XPR                     u  pUR                  UR                  5      nUR                  UR                  5      nOU R                  U5      R	                  U5      nU R                  U5      n[        XU
SS9nUR                  SS5      nU R                  U5      R	                  U5      nU R                  U5      nUR                  SS5      nUb/  U R                  (       d  UR!                  XU R"                  5      u  pU R$                  (       a  X4XPR"                  '   [&        R(                  " U R                  R*                  [,        5      nU" U UUUU4U R.                  (       a  U R0                  OSU R2                  U R4                  S.UD6u  nnUR6                  " / UQSP76 R9                  5       nU R;                  U5      nUU4$ )Nr\   r!   )rd  r#   r   )rZ  r[  ro  )r   rt   r   r   r  r{  rf  r_  rv  rx  r   r   r   r|  r   r}  updater   ry  r   get_interface_attn_implementationrc  r4  rr  r[  ro  r   r   r~  )rX   rG   r  rY  rF   r  r`  input_shapehidden_shaper   r   r   r!  r"  attention_interfacerb  ra  s                    r8   rj   Gemma3nTextAttention.forward  s:    $))#2.??b?$++*>*>?&{{=166|D{{<0+LsRST#--a3
 ""'78R8R'S$J#|':':;J'??<+>+>?L]388FJZ0J-jsRSTJ#--a3J;;}5::<HL;;|4L'11!Q7L&t/F/F'6'='=jX\XfXf'g$J$$/9/G^^,(?(M(MKK,,.E)
 %8
%
 /3mmD**LL..
%
 
%
!\ "));;;;FFHkk+.L((r7   )rr  rt   r   rs  rv  rn  r|  r   rx  r   rm  r^  r~  r{  r   r[  ro  ry  r}  r   NN)r.   r/   r0   r1   r&   rl   rS   r3   rn   r
   dictrI   r   r   rj   r6   ro   rp   s   @r8   rh  rh    s    .
0 .
S .
j )-PT;)||;) #\\;) t+	;)
 ;) sE%,,*D$EEFM;) +,;) 
u||U\\D00	1;) ;)r7   rh  c                     ^  \ rS rSrS\S\4U 4S jjr      SS\R                  S\R                  S\R                  S	\	\\
\R                  \R                  4   4   S-  S
\R                  S-  S\R                  S-  S\S-  S\\   S\
\R                  \
\R                  \R                  4   S-  4   4S jjrSrU =r$ )Gemma3nTextDecoderLayeri  rt   r   c                 D  > [         TU ]  5         Xl        UR                  U l        X l        [        X5      U l        [        XS9U l        [        U R                  UR                  S9U l        [        U R                  UR                  S9U l        [        U R                  UR                  S9U l        [        U R                  UR                  S9U l        UR                  U l        [         UR"                     U l        ['        U5      U l        [+        U5      U l        [.        R0                  " U R                  U R                  SS9U l        [.        R0                  " U R                  U R                  SS9U l        [        U R                  UR                  S9U l        g )N)r   r  Frv   )rR   rS   rt   r   r   rh  	self_attnr  mlprK   r  input_layernormpost_attention_layernormpre_feedforward_layernormpost_feedforward_layernormhidden_size_per_layer_inputr	   r  r  r"  altupr  laurelrT   r   per_layer_input_gateper_layer_projectionpost_per_layer_input_normr
  s      r8   rS    Gemma3nTextDecoderLayer.__init__  sB   !--"-f@!&>-d.>.>FDWDWX(6t7G7GVM`M`(a%)78H8HfNaNa)b&*89I9IvObOb*c'+1+M+M(V556%f-
,V4$&IId.>.>@`@`gl$m!$&IId.N.NPTP`P`gl$m!)78H8HfNaNa)b&r7   NrG   r  per_layer_inputr  rY  position_idsrF   r`  rd   c           
      z   U R                   R                  U5      n	XR                  R                     n
U R	                  U
5      nU R                  U5      nU R                  " SUUUUUUS.UD6u  pU R                  U5      nX-   nX-   [        R                  " S5      -  nU R                  U5      nU R                  U5      nU R                  U5      nUU-   nU R                   R                  U	U5      nUU R                  R                     R                  5       nU R                  R                  (       a  U R                   R!                  U5      nU R#                  U5      nU R%                  U5      n[&        R(                  " UU5      nU R+                  U5      nU R-                  U5      nUSS === U-  sss& U$ )N)rG   rY  r  r  r  rF   r!   r#   r-   )r  r;  rt   r3  r  r  r  r  r   sqrtr  r  r  rD  r   altup_correct_scalerJ  r  r  r3   multiplyr  r  )rX   rG   r  r  r  rY  r  rF   r`  r:  active_predictionactive_prediction_normedlaurel_outputr  r   
attn_gatedattn_laurel	attn_normattn_ffwattn_ffw_normattn_ffw_laurel_gatedcorrected_predictionsfirst_predictions                          r8   rj   Gemma3nTextDecoderLayer.forward  s    jj((7'(D(DE#'#7#78I#J $<=.. 
2)-% 3+
 
 ,,T2&-
!1TYYq\A22;?	88I&77A +m ; $

 2 2;@U V01M1MNTTV;;**#zz@@AQR  445EF;;'78 >>*:OL  445EF99:JKab!%55!$$r7   )r  r  rt   r   r  r  r  r   r  r  r  r  r  r  r  r  )NNNNNN)r.   r/   r0   r1   r&   rl   rS   r3   rn   r  rI   
LongTensorr
   r   r   r@   rj   r6   ro   rp   s   @r8   r  r    s   c0 cS c0 -1(,PT.204(,3%||3% #\\3% 	3%
 sE%,,*D$EEFM3% t+3% &&-3% 3% +,3% 
u||U5#4#4e6G6G#GH4OO	P3% 3%r7   r  c            	         ^  \ rS rSr% \\S'   SrSrS/rSS/r	Sr
SrSrSrSr\\S.rS	r\R(                  " 5       U 4S
 j5       rS rS r   SS\S-  S\S-  S\S\R6                  4U 4S jjjr   SS\S-  S\S-  S\4S jjrSrU =r$ )Gemma3nPreTrainedModeliM  rt   modelTr  rF   r  )rG   rH   )imagetextaudioc                   > [         TU ]  U5        [        U[        5      (       a"  [        R
                  " UR                  5        GO[        U[        5      (       a  [        R                  " UR                  5        UR                  S-  nS[        R                  R                  R                  [        R                  " S5      5      -  n[        R                   " UR"                  X#-  5        [        R$                  " UR&                  UR(                  5        [        R                   " UR*                  UR-                  5       5        GO[        U[.        5      (       a-  [        R$                  " UR0                  UR2                  5        GO[        U[4        5      (       aZ  [        R                  " UR6                  5        [        R$                  " UR8                  U R:                  R<                  S-  5        GO[        U[>        5      (       a  Su  pEUR@                  S-  n[B        RD                  " [G        U5      [G        U5      -  5      [I        US-
  S5      -  nU[        RJ                  " [        RL                  " U5      U* -  5      -  n[        R                   " URN                  URG                  5       RQ                  S5      RQ                  S5      5        GO2[        U[R        5      (       ag  [        R$                  " URT                  U R<                  S-  5        [        R$                  " URV                  S[B        RX                  " S	5      -  5        O[        U[Z        5      (       a  UR\                   H  n	UR^                  n
UR`                  U	   S
:w  a  [b        UR`                  U	      n
U
" UR:                  U	S9u  p[        R                   " [e        X S35      U5        [        R                   " [e        X S35      U5        M     [g        US5      (       a6  [        R$                  " URh                  U R:                  Rh                  5        g g )Nr^   rx   r   r%  )rx   ry   r!   r#   r          @defaultrm  	_inv_freq_original_inv_freqr  )5rR   _init_weights
isinstancer=  initones_rW   r   zeros_r   r   r3   rT   r   r   r   copy_r   	constant_r   r   r   r   r  r  r  r"  r&  r$  rt   r   rr   r   r   r   rg   r   r   r   rz   r   Gemma3nTextModelper_layer_projection_scaleper_layer_input_scaler  Gemma3nRotaryEmbeddingrj  compute_default_rope_parameters	rope_typer   rp  rl  r  )rX   rV  r   r   r   r   r   r   rz   rm  rope_init_fncurr_inv_freqr   rY   s                r8   r  $Gemma3nPreTrainedModel._init_weights`  s	   f%f=>>JJv}}% 566KK,,-oot+G!4!4!=!=ell3>O!PPLJJv~~w'=>NN6>>6+K+KLJJv55v7\7\7^_ >??NN6--v/H/HI 011KK334NN644dkk6M6Mt6ST EFF+5(M#__1N&*hhu]/CeMFZ/Z&[^a"A_ '# +UYYu||N7SWnVn7n-ooNJJv,,n.B.B.D.N.Nq.Q.[.[\].^_ 011NN6<<d>N>NPT>TUNN677TYYs^9KL 677$00
%EE##J/9<#6v7G7G
7S#TL#/*#U 

76\+CDmT

76\9K+LM}] 1 6.//NN633T[[5R5RS 0r7   c                 .    U R                   R                  $ rf   
base_modelembed_tokens_per_layerrX   s    r8   get_per_layer_input_embeddings5Gemma3nPreTrainedModel.get_per_layer_input_embeddings  s    555r7   c                 $    XR                   l        g rf   r  rX   r  s     r8   set_per_layer_input_embeddings5Gemma3nPreTrainedModel.set_per_layer_input_embeddings  s    16.r7   Nnew_num_tokenspad_to_multiple_ofmean_resizingrd   c                 J   > [         TU ]  UUUS9nU R                  XU5        U$ )N)r  r  r  )rR   resize_token_embeddings_resize_per_layer_embeddings)rX   r  r  r  inputs_embedsrY   s        r8   r  .Gemma3nPreTrainedModel.resize_token_embeddings  s:     7)1' 8 

 	)).m\r7   c                    U R                   U R                  R                  5       l        U R                  R                  5       R                  (       a  U R                  5       nU R                  XAX#5      n[        US5      (       a  UR                  n[        XV5        UR                  UR                  R                  5        U R                  U5        g g )N_hf_hook)
vocab_sizert   get_text_configvocab_size_per_layer_inputr  r  _get_resized_embeddingsrl  r  r(   requires_grad_rW   rQ   r  )rX   r  r  r  r  new_embeddings_per_layerhooks          r8   r  3Gemma3nPreTrainedModel._resize_per_layer_embeddings  s     DH??##%@;;&&(DD%)%H%H%J"'+'C'C&8J($ -z::-66"#;B$334J4Q4Q4_4_`//0HI Er7   r-   )NNT) r.   r/   r0   r1   r%   r5   base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraph_supports_attention_backendr  rh  _can_record_outputsinput_modalitiesr3   no_gradr  r  r  rl   rm   rT   	Embeddingr  r  r6   ro   rp   s   @r8   r  r  M  s   &*#23#46H"IN!"&0* 2
]]_%T %TN67
 &*)-"	d
  $J 	
 
   &*)-"	Jd
J  $JJ 	J Jr7   r  c                      ^  \ rS rSr% Sr\\S'   SrSrS\4U 4S jjr	\
\S\R                  S\R                  S\\   S	\\-  4S
 j5       5       rSrU =r$ )Gemma3nAudioEncoderi  zp
An audio encoder based on the [Universal Speech Model](https://huggingface.co/papers/2303.01037) architecture.
rt   	audio_melr  c                 
  > [         TU ]  U5        Xl        [        U5      U l        [
        R                  " [        UR                  5       Vs/ s H  n[        U5      PM     sn5      U l
        U R                  5         g s  snf rf   )rR   rS   rt   r  subsample_conv_projectionrT   
ModuleListrA  conf_num_hidden_layersr  	conformer	post_init)rX   rt   r   rY   s      r8   rS   Gemma3nAudioEncoder.__init__  si     )LV)T&9>v?\?\9]^9]A'/9]^
 	 _s   B r,   r`  rd   c                 >   U R                  U5      nUR                  S   nSn[        [        U R                  R
                  5      5       H!  nX`R                  R
                  U   S   -  nM#     [        R                  " XRR                  S9U-  n[        R                  " XR                  S   S-
  S9nUR                  S:  a?  UR                  S:X  a/  UR                  S5      R                  UR                  S   S5      nOcUR                  UR                  :X  aI  UR                  S   S:X  a6  UR                  S   S:w  a#  XXR                  S   :X  a  UR                  S5      n[        R                  " USU5      n	U R                   H  n
U
" XI5      nM     U R                  R                  S:  a@  USS2SSU R                  R                  24   nU	SS2SSU R                  R                  24   n	UR!                  U	R                  S5      S5      n[#        UU	S9$ )	ad  Encodes a batch of MELs.

Args:
    audio_mel: a torch.Tensor of shape [batch, num_frames, num_channels,
      mel_bins].

Returns:
    audio_encodings: a torch.Tensor of shape
        `[batch_size, self.config.audio_soft_tokens_per_image,
        self.config.audio_config.hidden_size]`
    audio_mel_mask: a torch.BoolTensor of shape [batch, num_frames].
r#   r   r   )r   r\   Nr   )last_hidden_stater,   )r  r   rA  rB  rt   rm  r3   r   r   rJ  r  r   rR  gatherr  conf_reduction_factormasked_fillr*   )rX   r  r,   r`  r{  t_subtime_stride_productstride_pair_idxindicescurrent_maskblocks              r8   rj   Gemma3nAudioEncoder.forward  s   " 88C  %%a($S)J)J%KLO;;#D#D_#UVW#XX  M ,,u-B-BCFYY++g+?+?+BQ+FG "w||q'8''*11.2F2Fq2I2NG7<</$$Q'1,a A%q)) ''*G||NAw?^^E#OBO $ ;;,,q0-a1UDKK4U4U1U.UVO'+Odkk.O.O+O(OPL)55l6L6LR6PRUV--'
 	
r7   )rt   r  r  )r.   r/   r0   r1   r2   r$   r5   main_input_namer  rS   r   r    r3   rn   r4   r   r   rI   r*   rj   r6   ro   rp   s   @r8   r  r    sz     !O1   8
8
7<7G7G8
SYZlSm8
	/	/8
   8
r7   r  c                      ^  \ rS rSr% \R
                  \S'   S\4U 4S jjr\	    SS\S-  S\
S   S\S-  S	\S-  S
\S\4   4
S jj5       r\R                   " 5       \SS j5       5       rSrU =r$ )r  i   inv_freqrt   c                 f  > [         TU ]  5         UR                  U l        UR                  U l        Xl        [        [        UR                  5      5      U l        0 U l	        U R                   H  nU R
                  R                  U   nUc  M!  US   U R                  U'   U R                  nU R                  U   S:w  a  [        U R                  U      nU" U R
                  US9u  pVU R                  U S3USS9  U R                  U S3UR                  5       SS9  [        X S3U5        M     g )	Nr  r  r  r  Fr{   r  _attention_scaling)rR   rS   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenrt   listsetrj  r  rope_parametersr  r   r   r   setattr)rX   rt   rm  rope_paramsr  r  curr_attention_scalingrY   s          r8   rS   Gemma3nRotaryEmbedding.__init__  s(   "("@"@$*$B$B!F$6$6 78**J++55jAK")4[)ADNN:&%)%I%IL~~j)Y624>>*3MN4@Yc4d1M  J<y!9=UZ [  J</A!BMDWDWDYfk lDL(:;=ST +r7   Nr   ztorch.deviceseq_lenrm  rd   ztorch.Tensorc           	         U R                   U   S   n[        U SS5      =(       d    U R                  U R                  -  nSnSU[        R
                  " SUS[        R                  S9R                  U[        R                  S9U-  -  -  nXv4$ )	a  
Computes the inverse frequencies according to the original RoPE implementation
Args:
    config ([`~transformers.PreTrainedConfig`]):
        The model configuration.
    device (`torch.device`):
        The device to use for initialization of the inverse frequencies.
    seq_len (`int`, *optional*):
        The current sequence length. Unused for this type of RoPE.
    layer_type (`str`, *optional*):
        The current layer type if the model has different RoPE parameters per type.
        Should not be used unless `config.layer_types is not None`

Returns:
    Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
    post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).

rope_thetar   Nrx   r   r!   r   r   )	r  rp  r   rq  r3   r   int64r   rg   )rt   r   r  rm  baserM   attention_factorr  s           r8   r  6Gemma3nRotaryEmbedding.compute_default_rope_parameters  s    2 %%j1,?fj$/c63E3EIcIc3c U\\!S!5;;?BB&X]XcXcBdgjjk
 ))r7   c                 H   [        X S35      n[        X S35      nUS S S 2S 4   R                  5       R                  UR                  S   SS5      R	                  UR
                  5      nUS S 2S S S 24   R                  5       n[        UR
                  R                  [        5      (       a0  UR
                  R                  S:w  a  UR
                  R                  OSn[        USS	9   UR                  5       UR                  5       -  R                  SS
5      n	[        R                  " X4SS9n
U
R                  5       U-  nU
R                  5       U-  nS S S 5        WR	                  UR                  S9WR	                  UR                  S94$ ! , (       d  f       N@= f)Nr  r  r   r\   r#   mpscpuF)device_typeenabledr!   r   r   )rp  rg   rR  r   r   r   r  r   strr   r_  r3   r   r   r   r   )rX   r   r  rm  r  attention_scalinginv_freq_expandedposition_ids_expandedr&  freqsembr   r   s                r8   rj   Gemma3nRotaryEmbedding.forward<  sd    4<y!9:#DL8J*KL$T1d]399;BB<CUCUVWCXZ\^_`ccdedldlm ,QaZ 8 > > @'1!((--'E'E!((--[`J`ahhmmfkUC&,,.1F1L1L1NNYYZ[]^_E))UN3C'')//C'')//C	 D vvAGGv$cff177f&;;; DCs   +A.F
F!)rt   rj  r  r  r  NNNNrf   )r.   r/   r0   r1   r3   rn   r5   r&   rS   staticmethodr   rl   r(  rI   rg   r  r  r   rj   r6   ro   rp   s   @r8   r  r     s    llU0 U* +/+/"!%	!*!D(!*(!* t!* $J	!*
 
~u$	%!* !*F ]]_<  <r7   r  zBThe base Gemma 3n language model without a language modeling head.c                     ^  \ rS rSr% \\S'   SrS\4U 4S jjr\\	" SS9\
       SS\R                  S-  S	\R                  S-  S
\R                  S-  S\R                  S-  S\S-  S\R                  S-  S\S-  S\\   S\4S jj5       5       5       rS\R                  S\R                  4S jr SS\R                  S	\R                  S-  S\R                  4S jjrSrU =r$ )r  iO  rt   )r  c                 Z  > [         TU ]  U5        UR                  U l        UR                  U l        [        UR                  UR                  U R                  U R                  R                  S-  S9U l        [        R                  " [        UR                  5       Vs/ s H  n[        X5      PM     sn5      U l        [        UR                  UR                   S9U l        [%        U5      U l        SU l        UR                  U l        UR*                  U l        [        UR,                  UR                  UR*                  -  U R                  UR*                  S-  S9U l        [        R0                  " U R                  UR                  UR*                  -  SS9U l        [        UR*                  UR                   S9U l        [        R                  " [        SU R                  R6                  5       Vs/ s H-  n[        R0                  " U R                  U R                  SS9PM/     sn5      U l        [        R                  " [        SU R                  R6                  5       Vs/ s H-  n[        R0                  " U R                  U R                  SS9PM/     sn5      U l        U R=                  S[>        R@                  " U R                  S-  5      SS	9  U R=                  S
[>        RB                  " [>        R@                  " S5      5      SS	9  / U l"        [G        U R                  5       HT  u  pEURH                  RJ                  (       d  M"  U RD                  RM                  S Vs/ s H  nSU SU 3PM     sn5        MV     U RO                  5         g s  snf s  snf s  snf s  snf )N      ?)r  r  Frv   r#   r  r^   r{   r  r  )r   r   r|  r}  zlayers.z.self_attn.)(rR   rS   pad_token_idr  r  r  r   rt   embed_tokensrT   r  rA  rt  r  layersrK   r  rq  r  
rotary_embgradient_checkpointingr  r  r  r   per_layer_model_projectionper_layer_projection_normr'  altup_projectionsaltup_unembed_projectionsr   r3   r   rK  "_keys_to_ignore_on_load_unexpected	enumerater  rv  extendr  )rX   rt   r   r   r  layernamerY   s          r8   rS   Gemma3nTextModel.__init__T  s    !.. ++ ;v1143C3CQUQ\Q\QhQhjmQm
 mmINvOgOgIhiIhI$V7Ihi
 #6#5#56;N;NO	08&+#!--+1+M+M(&D--$$v'I'II::C?	'
# +-))$$v'I'II+
' *88Z8Z`f`s`s)t&!#PUVWY]YdYdYuYuPvwPv1RYYt'')9)9FPvw"
 *,PUVWY]YdYdYuYuPvwPv1RYYt'')9)9FPvw*
& 	95<<HXHXZ^H^;_lqr4ekk%,,sBS6Tafg 35/!$++.HA11177>>@hi@hwqcTF3@hi / 	[ j4 x x js    N74N)4N#+N(
F)tie_last_hidden_statesNr  per_layer_inputsrY  r  rF   r  	use_cacher`  rd   c           	      v   USL USL-  (       a  [        S5      eUb"  U R                  U5      nU R                  U5      nU R                  Xb5      nU(       a  Uc  [	        U R
                  S9nUcU  Ub  UR                  5       OSn	[        R                  " UR                  S   UR                  S9U	-   nUR                  S5      n[        U=n
[        5      (       d)  U R
                  UUUUS.n[        S0 UD6[        S0 UD6S.n
Un[        R                   " US	-  S
SS9S-  n[        R"                  " S5      nU/n[%        SU R
                  R&                  5       H  nU R(                  US-
     " U5      nUR+                  UR,                  UR                  S9n[        R                   " US	-  S
SS9n[        R.                  " [        R0                  " UUR+                  UR                  5      5      5      nUU-  U-  nUR3                  U5        M     [        R4                  " USS9n0 n[7        U R
                  R8                  5       H  nU R;                  UUU5      UU'   M     0 n[=        U R>                  SU R
                  R@                   5       HZ  u  nnXR
                  R8                  U      nUSS2SS2USS24   nU" UUU R
                  R8                  U      U4UUUUS.UD6nM\     [        R                   " US   S	-  S
SS9S-  nUS   /n[%        SU R
                  R&                  5       H  nU RB                  US-
     " UU   5      nUR+                  UR,                  UR                  S9n[        R                   " US	-  S
SS9n[        R.                  " [        R0                  " UUR+                  UR                  5      5      5      nUU-  U-  nUR3                  U5        M     [        R4                  " U5      n[        R                   " USS9nU RE                  U5      n[G        UUS9$ )z
per_layer_inputs (torch.Tensor, *optional*, defaults to None):
    Pre-computed per-layer embeddings. If None, they are derived from input_ids if provided.
N:You must specify exactly one of input_ids or inputs_embedsrt   r   r#   r   )rt   r  rY  rF   r  )full_attentionrk  r!   r\   TrF  r3  gh㈵>r  r   )r  rY  r  rF   )r  rF   r-   )$r  r5  get_per_layer_inputsproject_per_layer_inputsr   rt   get_seq_lengthr3   r   r   r   r   r  r  r   r   r`   r   rA  r'  r;  r   r   r  maximumr  stackr  rj  r7  r>  r6  rt  r<  rq  r   )rX   r  rD  rY  r  rF   r  rE  r`  past_seen_tokenscausal_mask_mappingmask_kwargshidden_states_0target_magnitudeepsilon_tensortemp_hidden_statesr  
altup_projcurrent_hidden_statenew_magnituderG   r  rm  r  decoder_layercausal_maskr  altup_unemb_projs                               r8   rj   Gemma3nTextModel.forward  s/   $ -t";<YZZ  --i8M#88C88Y0*$++>OCRC^==?de <<(;(;A(>}G[G[\_ooL'11!4L ?-FF ++!."0#2 ,K #5"C{"C%F%U%U# ( !::oq&8b$OSVVd+-.q$++667A//A6GJ#-==7L7LUeUlUl=#m !JJ';Q'>BPTUM!JJu}}]NDUDUVfVmVmDn'opM#7:J#J]#Z %%&:; 8 $6A> dkk556J.2oom\[e.f
+ 7  )$++6U8U8U*V WA}-kk.E.Ea.HIK.q!Qz:O)#DKK$;$;A$>?	 "2*) /	 	M	 !X  !::mA&6!&;TRVYY+A./q$++667A-1-K-KAPQE-RS`abSc-d#3#6#6_=R=R[k[r[r#6#s !JJ';Q'>BPTUM!JJu}}]NDUDUVfVmVmDn'opM#7:J#J]#Z %%&:; 8 $67

=a8		-0&++
 	
r7   c                     U R                  U5      R                  " / UR                  QU R                  R                  PU R
                  P76 $ rf   )r  r   r   rt   rt  r  )rX   r  s     r8   rJ  %Gemma3nTextModel.get_per_layer_inputs  sN    **95== 
__
KK))
 ,,
 	
r7   c                    U R                  U5      nX0R                  R                  UR                  UR                  S9-  nUR
                  " / UR                  S S QU R                  R                  PU R                  P76 nU R                  U5      nUc  U$ UR                  UR                  :w  a   USS U R                  R                  2S S 24   nX2-   U R                  R                  UR                  UR                  S9-  $ )Nr  r\   .)r9  r  r   r   r   r   r   rt   rt  r  r:  r  )rX   r  rD  r  s       r8   rK  )Gemma3nTextModel.project_per_layer_inputs  s&   
 .2-L-L]-[ ? ? B B%%.B.I.I !C !
 	
  4;;  
  "% 
KK)) 
 ,, 

  $==>RS#''%%)9)?)??/5Tt{{7T7T5TVW0WX$74;U;U;X;X%%.B.I.I <Y <
 
 	
r7   )r=  r;  r<  r5  r  r8  r   r  r6  rq  r  r9  r:  r7  r  )NNNNNNNrf   )r.   r/   r0   r1   r&   r5   r  rS   r   r    r   r3   r  rn   r
   r@   rm   r   r   r   rj   rJ  rK  r6   ro   rp   s   @r8   r  r  O  s[    70 7r  E2 .204.204(,26!%k
##d*k
  ,,-k
 t+	k

 &&-k
 k
 ((4/k
 $;k
 +,k
 
!k
  3  k
Z
e.>.> 
5<< 
 15
||
  ,,-
 
	
 
r7   r  z?The base Gemma 3n language model with a language modeling head.c                   d  ^  \ rS rSr% SS0rSS0rSS/S/40r\\S'   S\4U 4S	 jjr	\
\        SS\R                  S
-  S\R                  S
-  S\R                  S
-  S\S
-  S\R                   S
-  S\R                  S
-  S\S
-  S\\R                  -  S\\   S\4S jj5       5       rSrU =r$ )Gemma3nForCausalLMi   lm_head.weightzmodel.embed_tokens.weightlm_headcolwise_gather_outputrG   rE   rt   c                 L  > [         TU ]  U5        [        U5      U l        UR                  U l        [
        R                  " UR                  UR                  SS9U l        U R                  R                   Vs/ s H  nSU 3PM
     snU l	        U R                  5         g s  snf NFrv   zmodel.)rR   rS   r  r  r  rT   r   r   rd  r=  r  rX   rt   rA  rY   s      r8   rS   Gemma3nForCausalLM.__init__'  s     %f-
 ++yy!3!3V5F5FUS )-

(U(U3
(UfTFO(U3
/
 	3
s   9B!Nr  rY  r  rF   r  labelsrE  logits_to_keepr`  rd   c	           
          U R                   " SUUUUUUS.U	D6n
U
R                  n[        U[        5      (       a  [	        U* S5      OUnU R                  USS2USS24   5      nU R                  R                  bF  XR                  R                  -  n[        R                  " U5      nXR                  R                  -  nSnUb  U R                  " XU R                  40 U	D6n[        UUU
R                  U
R                  U
R                  S9$ )a$  
Example:

```python
>>> from transformers import AutoTokenizer, Gemma3nForCausalLM

>>> model = Gemma3nForCausalLM.from_pretrained("google/gemma-2-9b")
>>> tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b")

>>> prompt = "What is your favorite condiment?"
>>> inputs = tokenizer(prompt, return_tensors="pt")

>>> # Generate
>>> generate_ids = model.generate(inputs.input_ids, max_length=30)
>>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
"What is your favorite condiment?"
```)r  rY  r  rF   r  rE  N)rD   rE   rF   rG   rH   r-   )r  r  r  rl   slicerd  rt   final_logit_softcappingr3   r  loss_functionr  r   rF   rG   rH   )rX   r  rY  r  rF   r  rj  rE  rk  r`  outputsrG   slice_indicesrE   rD   s                  r8   rj   Gemma3nForCausalLM.forward4  s   @ ,0:: ,
)%+',
 ,
  118B>SV8W8W~ot4]kmA}a,?@A;;..:kkAAAFZZ'FkkAAAF%%fdooPPD%#33!//))
 	
r7   )r=  rd  r  r  )NNNNNNNr   )r.   r/   r0   r1   _tied_weights_keys_tp_plan_pp_planr&   r5   rS   r   r   r3   r  rn   r
   r@   rm   rl   r   r   r   rj   r6   ro   rp   s   @r8   rb  rb     s%   *,GH23H_-z:;H0   .2.204(,26*.!%-.;
##d*;
 t+;
 &&-	;

 ;
 ((4/;
   4';
 $;;
 ell*;
 +,;
 
 ;
  ;
r7   rb  c                      ^  \ rS rSrSrS\\-  S\4U 4S jjr  SS\	R                  S-  S\	R                  S-  S	\	R                  4S
 jjrSrU =r$ )Gemma3nMultimodalEmbedderit  zQEmbeds token ids or soft tokens for multimodal content into language model space.multimodal_configtext_configc                 ^  > [         TU ]  5         UR                  U l        UR                  U l        UR                  U l        UR                  U l        UR                  U l        [        R                  " U R                  U R                  5      U l        [        U R                  U R
                  S9U l        [        U R                  U R
                  S9U l        [        R                  " U R                  U R                  SS9U l        [        U R                  U R
                  SS9U l        g )Nr  Frv   )rN   rO   )rR   rS   r   multimodal_hidden_sizer  rN   vocab_offsetr  text_hidden_sizerT   r  	embeddingrK   hard_embedding_normsoft_embedding_normr   embedding_projectionembedding_post_projection_norm)rX   rx  ry  rY   s      r8   rS   "Gemma3nMultimodalEmbedder.__init__w  s    
 	&7&C&C#$11-::+66 + 7 7doot7R7RS#1$2M2MSWS[S[#\ #1$2M2MSWS[S[#\ $&IId.I.I4K`K`gl$m!.<T=R=RX\X`X`mr.s+r7   Nr  r  rd   c                     USL USL-  (       a  [        S5      eUb  U R                  U5      nO.U R                  XR                  -
  5      nU R	                  U5      nU R                  U5      nU R                  U5      $ )a  Embeds token ids or soft tokens for multimodal content into language model space.

Args:
    input_ids: A torch.LongTensor containing the token ids to embed. Values should be in the range
        `[vocab_offset, vocab_offset + vocab_size)`.
    inputs_embeds: A torch.Tensor containing the soft tokens to embed.

Returns:
    A torch.Tensor of embeddings with  shape `[batch_size, seq_len, self.config.text_config.hidden_size]`.
NrG  )r  r  r~  r|  r  r  r  )rX   r  r  emb_normhard_embemb_norm_projs         r8   rj   !Gemma3nMultimodalEmbedder.forward  s     -t";<YZZ$//>H~~i2C2C&CDH//9H11(;22=AAr7   )
r~  r  r  rN   r  r{  r  r}  r|  r  r  )r.   r/   r0   r1   r2   r$   r'   r&   rS   r3   r  rn   rj   r6   ro   rp   s   @r8   rw  rw  t  sq    [t-0CCt 't* .2-1B##d*B ||d*B 
	B Br7   rw  z
    The base Gemma 3n model comprising a vision backbone, an audio backbone, and a language model without a
    language modeling head.
    c                     ^  \ rS rSrSrS\4U 4S jjrS rS r\	\
" SS9S	\R                  S
\\   S\\-  4S j5       5       r    S"S\R$                  S-  S\R                  S-  S\R                  S-  S\R                  S-  4S jjr\	           S#S\R$                  S-  S	\R                  S-  S\R                  S-  S\R(                  S-  S\R(                  S-  S\R$                  S-  S\S-  S\R$                  S-  S\R                  S-  S\R$                  S-  S\S-  S\\   S\4S jj5       rS rS r\	\
" SS9S\R(                  S\R(                  S
\\   S\\-  4S  j5       5       rS!rU =r$ )$Gemma3nModeli  Frt   c                   > [         TU ]  U5        [        R                  " UR                  S9U l        UR                  R                  U l        [        R                  " UR                  S9nX l        UR                  R                  U l	        [        R                  " UR                  5      U l        [        UR                  UR                  5      U l        [        UR                  UR                  5      U l        U R                  R                   Vs/ s H  nSU 3PM
     snU l        U R!                  5         g s  snf )NrH  zlanguage_model.)rR   rS   r"   from_configvision_configvision_towerry  r  language_modelr  audio_configaudio_towerrw  embed_visionembed_audior=  r  )rX   rt   r  rA  rY   s       r8   rS   Gemma3nModel.__init__  s     %119M9MN ,,77"..f6H6HI,*0*<*<*W*W'$001D1DE5f6J6JFL^L^_4V5H5H&J\J\] 261D1D1g1g3
1godV$1g3
/ 	3
s   D=c                 6    U R                   R                  5       $ rf   )r  get_input_embeddingsr  s    r8   r  !Gemma3nModel.get_input_embeddings  s    ""7799r7   c                 :    U R                   R                  U5        g rf   )r  set_input_embeddingsr  s     r8   r  !Gemma3nModel.set_input_embeddings  s    007r7   zOProjects the last hidden state from the vision model into language model space.r9   pixel_valuesr`  rd   c                    U R                   " S	USSS.UD6nUR                  nUR                  UR                  S   U R                  R
                  R                  U R                  R                  5      R                  SSS5      nX@R                  R
                  R                  S-  -  nU R                  US9Ul
        U$ )
NFT)r  
do_poolingreturn_dictr   r!   r#   r3  r  r-   )r  r  r   r   rt   r  r   vision_soft_tokens_per_imager   r  pooler_output)rX   r  r`  vision_outputsr  s        r8   get_image_featuresGemma3nModel.get_image_features  s     **sQVdhslrs*<< .55##A&KK%%11KK44
 '!Q
	 	 	[[66BBCGG'+'8'8GX'8'Y$r7   Nr  r  image_featuresaudio_featuresc           	         Uc  X R                  5       " [        R                  " U R                  R                  [        R
                  UR                  S95      :H  nUR                  S5      nUU R                  5       " [        R                  " U R                  R                  [        R
                  UR                  S95      :H  R                  S5      nO0XR                  R                  :H  nXR                  R                  :H  nUR                  5       nUR                  S5      R                  U5      R                  UR                  5      nUbP  [        X%   R                  5       UR                  5       :H  SU SUR                  S   UR                  S   -   35        UR                  5       nUR                  S5      R                  U5      R                  UR                  5      nUbP  [        X&   R                  5       UR                  5       :H  SU SUR                  S   UR                  S   -   35        XV4$ )z
Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
equal to the length of multimodal features. If the lengths are different, an error is raised.
r  r\   z6Image features and image tokens do not match, tokens: z, features: r   r#   z6Audio features and audio tokens do not match, tokens: )r  r3   r   rt   image_token_idlongr   allaudio_token_idrH  r   	expand_asr   r   numelr   )	rX   r  r  r  r  special_image_maskspecial_audio_maskn_image_tokensn_audio_tokenss	            r8   get_placeholder_mask!Gemma3nModel.get_placeholder_mask  sN    !.2K2K2MT[[77uzzR_RfRfg3 " "4!7!7!;,,.LL!;!;5::VcVjVjk c"g  "+kk.H.H!H!*kk.H.H!H+//1/99"=GGVYYZgZnZno%"1779^=Q=Q=SSHHXXdeseyeyz{e|  @N  @T  @T  UV  @W  fW  eX  Y
 ,//1/99"=GGVYYZgZnZno%"1779^=Q=Q=SSHHXXdeseyeyz{e|  @N  @T  @T  UV  @W  fW  eX  Y
 "55r7   input_featuresrY  input_features_maskr  rF   token_type_idsrj  rE  	lm_kwargsc                 T	   USL U	SL-  (       a  [        S5      eUGbz  U R                  5       " U5      n	[        R                  " US:  XR                  :  5      n[        R
                  " X[        R                  " U5      5      nU R                  R                  U5      n[        R                  " XR                  R                  :  XR                  R                  :  5      nU R                  R                  U R                  R                  -   S-
  n[        R
                  " UUU5      R                  U	R                  5      nU R                  US9nUR                  U	R                  U	R                  5      nUR!                  S5      R#                  U	5      n[        R
                  " UUU	5      n	XR                  R                  :  nU R                  R                  U R                  R                  -   S-
  n[        R
                  " UUU5      R                  U	R                  5      nU R                  US9nUR                  U	R                  U	R                  5      nUR!                  S5      R#                  U	5      n[        R
                  " UUU	5      n	OSnUbe  U R%                  USS9R&                  nUR                  U	R                  U	R                  5      nU R)                  XUS	9u  nnU	R+                  UU5      n	UGb>  UGb:  U R-                  X5) SS9nUR&                  nUR.                  n[        R0                  " U R                  S-
  //[        R2                  UR                  S
9nU R                  US9n [        R
                  " UR!                  S5      U U5      nUR4                  u  n!n"n#U R6                  R8                  U"-
  n$U R;                  U!U$U#5      n%[        R<                  " UU%4SS9nUR                  U	R                  U	R                  5      nU R)                  XUS9u  nn&U	R+                  U&U5      n	U R                  " SSUUUUU	USS.UD6n'[?        U'R@                  U(       a  U'RB                  OSU'RD                  U'RF                  Ub  WOSUb  WS9$ SS9$ )a  
input_features_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
    Attention mask for `input_features` where non-zero values mark valid audio frames.
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.

Example:

```python
>>> from PIL import Image
>>> import httpx
>>> from io import BytesIO
>>> from transformers import AutoProcessor, Gemma3nForConditionalGeneration

>>> model = Gemma3nForConditionalGeneration.from_pretrained("google/gemma3n2-3b-mix-224")
>>> processor = AutoProcessor.from_pretrained("google/gemma3n2-3b-mix-224")

>>> prompt = "Where is the cat standing?"
>>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
>>> with httpx.stream("GET", url) as response:
...     image = Image.open(BytesIO(response.read()))

>>> inputs = processor(images=image, text=prompt,  return_tensors="pt")

>>> # Generate
>>> generate_ids = model.generate(**inputs,)
>>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
"Where is the cat standing?\nsnow"
```
NrG  r   r#   )r  r\   T)r  )r  r  r  r   )r  r  )r  rD  rY  r  rF   r  rE  r  )r  rF   rG   rH   r>   r?   r-   )$r  r  r3   r  r  r  
zeros_liker  rJ  r  r|  r  r  r   r   r   r   r  r  r  r  masked_scatterget_audio_featuresr,   r   r  r   rt   audio_soft_tokens_per_imagerR  r   r<   r  rF   rG   rH   )(rX   r  r  r  rY  r  r  rF   r  r  rj  rE  r  per_layer_inputs_maskper_layer_inputs_tokensrD  vision_maskdummy_vision_token_idvision_input_idsvision_embedsexpanded_vision_mask
audio_maskdummy_audio_token_idaudio_input_idsaudio_embedsexpanded_audio_maskr  r  r   audio_outputsr  audio_padding_toksaudio_padding_embsaudio_batch_sizeaudio_seq_lenaudio_embed_dimextra_padding_tokensextra_padding_featuresr  rp  s(                                           r8   rj   Gemma3nModel.forward
  s   ` -t";<YZZ  557	BM %*$5$5i1niRqRqFq$r!&+kk2GTYTdTdenTo&p##22GGH_`  ++..;;;YIYIYIfIf=fK %)$5$5$B$BTEVEVEaEa$ade$e!${{;	CXY\\]j]q]qr --8H-IM),,]-A-A=CVCVWM#.#8#8#<#F#F}#U !KK(<m][M #&6&6&C&CCJ#'#3#3#@#@4CSCSC^C^#^ab#b #kk*iAUVYYZgZnZnoO++o+FL'??=+?+?ATATUL","6"6r":"D"D]"S!KK(;\=YM# #!44\t4TbbN+..}/C/C]EXEXYN$($=$=~ %> %! *889K^\M %*=*I 33NDXfj3kM*88N&55J "'!0C/D.EUZZ`n`u`u!v!%!1!1<N!1!O"[[)=)=b)ACUWefN?M?S?S<m_#';;#J#J]#Z %7%>%>?OQegv%w""YY8N'OUVWN+..}/C/C]EXEXYN$($=$=~ %> %!A! *889K^\M%% 

-)%+'

 

 *%777@G33d!//))2>2JPT2@2L
 	
 SW
 	
r7   c                 .    U R                   R                  $ rf   r  r  r  s    r8   r  +Gemma3nModel.get_per_layer_input_embeddings  s    ""999r7   c                 $    XR                   l        g rf   r  r  s     r8   r  +Gemma3nModel.set_per_layer_input_embeddings  s    5:2r7   zPProjects the last hidden state from the audio encoder into language model space.c                 n    U R                   " X4SS0UD6nU R                  UR                  S9nXTl        U$ )a  
input_features (`torch.FloatTensor]` of shape `(num_images, seq_length, num_features)`):
    The tensors corresponding to the input audio.
input_features_mask (`torch.FloatTensor]` of shape `(num_images, seq_length)`):
    The attention mask for the input audio.
r  Tr  )r  r  r  r  )rX   r  r  r`  r  r  s         r8   r  Gemma3nModel.get_audio_features  sN     9=8H8H9
=A9
EK9
 ''m6U6U'V&2#r7   )r=  r  r  r  r  r  r  r  r/  )NNNNNNNNNNN)r.   r/   r0   r1   accepts_loss_kwargsr%   rS   r  r  r   r   r3   r@   r   r   rI   r   r  r  r  rn   r
   rm   r<   rj   r  r  r*   r  r6   ro   rp   s   @r8   r  r    ss     } $:8 !rs'' +, 
+	+	 t , .2263737*6##d**6 ((4/*6 ))D0	*6
 ))D0*6X  .21537.23704(,2626*.!%F
##d*F
 ''$.F
 ))D0	F

 t+F
 #\\D0F
 &&-F
 F
 ((4/F
 ((4/F
   4'F
 $;F
 ./F
 
$F
 F
P:; !st #\\ +,	
 
/	/ u r7   r  z
    The base Gemma 3n model comprising a vision backbone, an audio backbone, a language model, and a language modeling
    head.
    c                   .  ^  \ rS rSrSS0rSrS\4U 4S jjr\S\	R                  S\\   4S	 j5       r\\            SS\	R                  S
-  S\	R                  S
-  S\	R                  S
-  S\	R                   S
-  S\	R                   S
-  S\	R                  S
-  S\S
-  S\	R                  S
-  S\	R                  S
-  S\	R                  S
-  S\S
-  S\\	R                   -  S\\   S\4S jj5       5       r            SU 4S jjrS rS rSrU =r$ )Gemma3nForConditionalGenerationi  rc  z(model.language_model.embed_tokens.weightFrt   c                 R  > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  R                  UR                  R                  SS9U l	        U R                  R                   Vs/ s H  nSU 3PM
     snU l
        U R                  5         g s  snf rg  )rR   rS   r  r  rT   r   ry  r   r  rd  r=  r  rh  s      r8   rS   (Gemma3nForConditionalGeneration.__init__  s     !&)
yy!3!3!?!?ASASA^A^ejk )-

(U(U3
(UfTFO(U3
/ 	3
s   <B$r  r`  c                 <    U R                   R                  " U40 UD6$ rf   )r  r  )rX   r  r`  s      r8   r  2Gemma3nForConditionalGeneration.get_image_features  s    zz,,\DVDDr7   Nr  r  rY  r  r  rF   r  r  rj  rE  rk  r  rd   c                 \   U R                   " SUUUUUUUUU	U
USS.UD6nUR                  n[        U[        5      (       a  [	        U* S5      OUnU R                  USS2USS24   5      nU R                  R                  5       R                  =nb   UU-  n[        R                  " U5      nUU-  nSnU
b6  U R                  " UXR                  R                  5       R                  40 UD6n[        UUUR                  UR                  UR                   UR"                  UR$                  S9$ )a<  
input_features_mask (torch.Tensor, *optional*, defaults to None):
    The attention mask for the input audio.
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are
    ignored (masked), the loss is only computed for the tokens with labels in
    `[0, ..., config.text_config.vocab_size]`.

Example:

```python
>>> from PIL import Image
>>> import httpx
>>> from io import BytesIO
>>> from transformers import AutoProcessor, Gemma3ForConditionalGeneration

>>> model = Gemma3ForConditionalGeneration.from_pretrained("google/gemma-3-4b-it")
>>> processor = AutoProcessor.from_pretrained("google/gemma-3-4b-it")

>>> messages = [
...     {
...         "role": "system",
...         "content": [
...             {"type": "text", "text": "You are a helpful assistant."}
...         ]
...     },
...     {
...         "role": "user", "content": [
...             {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"},
...             {"type": "text", "text": "Where is the cat standing?"},
...         ]
...     },
... ]

>>> inputs = processor.apply_chat_template(
...     messages,
...     tokenizer=True,
...     return_dict=True,
...     return_tensors="pt",
...     add_generation_prompt=True
... )
>>> # Generate
>>> generate_ids = model.generate(**inputs)
>>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
"user\nYou are a helpful assistant.\n\n\n\n\n\nWhere is the cat standing?\nmodel\nBased on the image, the cat is standing in a snowy area, likely outdoors. It appears to"
```
T)r  r  r  rY  r  r  rF   r  r  rj  rE  r  N)rD   rE   rF   rG   rH   r>   r?   r-   )r  r  r  rl   rm  rd  rt   r  rn  r3   r  ro  r  rB   rF   rG   rH   r>   r?   )rX   r  r  r  rY  r  r  rF   r  r  rj  rE  rk  r  rp  rG   rq  rE   rn  rD   s                       r8   rj   'Gemma3nForConditionalGeneration.forward  sD   D ** 
%)) 3%+)'
 
   118B>SV8W8W~ot4]kmA}a,?@A'+{{'B'B'D'\'\\#i55FZZ'F55F%%ffkk6Q6Q6S6^6^lbklD,#33!//)) ' ; ; ' ; ;
 	
r7   c                 p   > [         TU ]  " U4UUUUU
UU	US.UD6nU(       d  U
(       d  X_S'   XoS'   XS'   U$ )N)rF   r  rY  r  rE  rk  r  is_first_iterationr  r  r  )rR   prepare_inputs_for_generation)rX   r  rF   r  r  r  r  rY  r  r  rE  rk  rj  r  r`  model_inputsrY   s                   r8   r  =Gemma3nForConditionalGeneration.prepare_inputs_for_generation1	  se    $ w<
+')%))1
 
  Y+7(-;)*2E./r7   c                 6    U R                   R                  5       $ rf   )r  r  r  s    r8   r  >Gemma3nForConditionalGeneration.get_per_layer_input_embeddingsZ	  s    zz88::r7   c                 :    U R                   R                  U5        g rf   )r  r  r  s     r8   r  >Gemma3nForConditionalGeneration.set_per_layer_input_embeddings]	  s    

11%8r7   )r=  rd  r  )NNNNNNNNNNNr   )NNNNNNNNTNNF)r.   r/   r0   r1   rs  r  r%   rS   r   r3   r@   r   r   r  r   r  rn   r
   rm   rl   rB   rj   r  r  r  r6   ro   rp   s   @r8   r  r    s    +,VW}  Eu/@/@ EFSeLf E E  .21537.23704(,2626*.!%-.e
##d*e
 ''$.e
 ))D0	e

 t+e
 #\\D0e
 &&-e
 e
 ((4/e
 ((4/e
   4'e
 $;e
 ell*e
 ./e
 
'e
  e
T   'R;9 9r7   r  )r  rb  r  r  r  r  )r   NN)r#   )dr   collections.abcr   r   dataclassesr   typingr   r3   torch.nnrT   torch.nn.functionalr   r   r   r  r  r	   cache_utilsr
   r   
generationr   masking_utilsr   r   modeling_layersr   modeling_outputsr   r   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   r   utils.genericr   r   utils.output_capturingr    autor"   configuration_gemma3nr$   r%   r&   r'   accelerate.hooksr(   r*   r<   rB   ModulerK   rr   r   r=  ra  r  r  r  r  r  r  r  r  r  r"  rO  rn   rl   rU  rg   rI   rc  rf  rh  r  r  r  r  r  rb  rw  r  r  __all__r-   r7   r8   <module>r     s  *  . !      & ! . ) R 9 k k K F &  H 5  l l 3 
3%? 3  3 
9!8 9 9( 
 9K 9 9:4RYY 40g)BII g)TaBII aHj,bii j,ZB7		 B7JF")) FRORYY O8Dryy D2(ryy (V 6SR\\ S;RYY ;$#5RYY #5L`'ryy `'F(	UU\\ 	U# 	U%,, 	U$   %II%<<% 
% <<	%
 LL4'% S[% T\% T\% 5<<%&%D.ELL .u|| .%,, ._b .,l)299 l)^I%8 I%X ^J_ ^J ^JBN
0 N
bL<RYY L<^ abM
- M
 cM
` ^_P
/ P
 `P
f/B		 /Bd A) AAH h9&<o h9h9Vr7   