
    Z j             	          S SK r S SKJrJr  S SKJr  S SKJr  S SKrS SK	J
r
  S SKJ
s  Jr  S SKJr  SSKJr  SSKJr  SS	KJrJr  SS
KJr  SSKJrJr  SSKJrJr  SSK J!r!  SSK"J#r#J$r$  SSK%J&r&  SSK'J(r(J)r)J*r*J+r+J,r,J-r-  SSK.J/r/  SSK0J1r1  SSK2J3r3  SSK4J5r5J6r6J7r7J8r8  SSK9J:r:  SSK;J<r<J=r=J>r>J?r?J@r@  SSKAJBrBJCrCJDrDJErE  SSKFJGrG  \+" 5       (       a  S SKHJIrI  \,R                  " \K5      rL\)" SS9\ " S S\:5      5       5       rM\)" SS9\ " S S \5      5       5       rN\)" SS9\ " S! S"\G5      5       5       rO\)" SS9\ " S# S$\5      5       5       rP\)\ " S% S&\5      5       5       rQ " S' S(\E5      rR " S) S*\B5      rS " S+ S,\
R                  5      rU " S- S.\
R                  5      rV " S/ S0\
R                  5      rW " S1 S2\
R                  5      rX " S3 S4\
R                  5      rY " S5 S6\
R                  5      rZ " S7 S8\
R                  5      r[ " S9 S:\
R                  5      r\ " S; S<\
R                  5      r] " S= S>\
R                  5      r^ " S? S@\@5      r_ " SA SB\
R                  5      r` " SC SD\55      ra " SE SF\
R                  5      rbSfSG\R                  SH\R                  SI\R                  SJ\d4SK jjre " SL SM\
R                  5      rf " SN SO\<5      rg " SP SQ\65      rh " SR SS\h5      ri " ST SU\>5      rj\)" SVSW9 " SX SY\?5      5       rk\)" SZSW9 " S[ S\\=5      5       rl " S] S^\
R                  5      rm\)" S_SW9 " S` Sa\D5      5       rn\)" SbSW9 " Sc Sd\C5      5       ro/ SeQrpg)g    N)CallableSequence)	dataclass)Any)strict   )initialization)ACT2FN)CacheDynamicCache)PreTrainedConfig)create_causal_mask!create_sliding_window_causal_mask)BaseModelOutputWithPastBaseModelOutputWithPooling)ROPE_INIT_FUNCTIONS)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tupleis_accelerate_availableloggingtorch_compilable_check)merge_with_config_defaults)capture_outputs   )	AutoModel)	Gemma2MLPGemma2PreTrainedModeleager_attention_forwardrotate_half)Gemma3TextConfig)Gemma3DecoderLayerGemma3ForCausalLMGemma3RotaryEmbeddingGemma3TextModelGemma3TextScaledWordEmbedding)PaliGemmaCausalLMOutputWithPast!PaliGemmaForConditionalGenerationPaliGemmaModelPaligemmaModelOutputWithPast)TimmWrapperConfig)add_hook_to_modulezgoogle/gemma-3n-E4B)
checkpointc                      \ rS rSr% SrSrSSSSSSSSSSS.
rSS	S
.rSr\	\
S'   Sr\	\
S'   Sr\	\
S'   Sr\	\
S'   Sr\	\\	   -  \
S'   Sr\	\
S'   Sr\	\
S'   Sr\	\
S'   Sr\	\
S'   Sr\\   S-  \
S'   Sr\\
S '   S!r\	\
S"'   S#r\\
S$'   S%r\\
S&'   S'r\	\
S('   S)r\	\
S*'   S+r\	\
S,'   Sr\\\   -  S-  \
S-'   \ " 5       r!\ " 5       r"\ " 5       r#S. r$S/ r%S0r&g)1Gemma3nTextConfigJ   a  
vocab_size_per_layer_input (`int`, *optional*, defaults to 262144):
    Vocabulary size of the per-layer text embeddings that augment the standard embeddings.
hidden_size_per_layer_input (`int`, *optional*, defaults to 256):
    Dimension of the hidden representations for per-layer embeddings.
altup_active_idx (`int`, *optional*, defaults to 0):
    The index of the prediction from which AltUp will compute additional predictions or correct the active prediction.
altup_coef_clip (`float`, *optional*, defaults to 120.0):
    The maximum amplitude of an AltUp prediction or correction coefficient weight.
altup_correct_scale (`bool`, *optional*, defaults to `True`):
    If True, apply the `AltUp.correct_output_scale` to the corrected prediction at `altup_active_idx`.
altup_num_inputs (`int`, *optional*, defaults to 4):
    The number of predictions that AltUp should make given the input sequence.
num_kv_shared_layers (`int`, *optional*, defaults to 15):
    The number of layers that share KV cache values. During the forward pass, the last `num_kv_shared_layers`
    layers in the model "share" the KV values in that each local and global layer in this range uses the KV
    cache values computed for the last local or global layer, respectively, before entering this range. The
    value should be a multiple of the attention pattern size (see `layer_types` parameter).
laurel_rank (`int`, *optional*, defaults to 64):
    The intermediate size for the linear projections in the Learned Augmented Residual Layer.
activation_sparsity_pattern (`Sequence[float]`, *optional*):
    The sparsity factor used to extract the top-k activations for a given layer. The provided Sequence must
    explicitly provide a sparsity value for each layer in the model. By default, the first 10 layers are
    sparse with a sparsity factor of 0.95 and the rest are dense.

```python
>>> from transformers import Gemma3nTextModel, Gemma3nTextConfig

>>> # Initializing a Gemma3nText gemma3n_text-E4B style configuration
>>> configuration = Gemma3nTextConfig()

>>> # Initializing a model from the gemma3n_text-E4B style configuration
>>> model = Gemma3nTextModel(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
```
gemma3n_textcolwisereplicated_with_grad_allreducerowwise)
zlayers.*.self_attn.q_projzlayers.*.self_attn.k_projzlayers.*.self_attn.v_projzlayers.*.self_attn.q_normzlayers.*.self_attn.k_normzlayers.*.self_attn.v_normzlayers.*.self_attn.o_projzlayers.*.mlp.gate_projzlayers.*.mlp.up_projzlayers.*.mlp.down_projg    .A     @)globallocali  
vocab_size   vocab_size_per_layer_input   hidden_size   hidden_size_per_layer_inputi @  intermediate_size#   num_hidden_layersr   num_key_value_headsi   max_position_embeddingsi   sliding_windowNlayer_typesg      >@final_logit_softcappingr   altup_active_idxg      ^@altup_coef_clipTaltup_correct_scale   altup_num_inputs   num_kv_shared_layers@   laurel_rankactivation_sparsity_patternc                    [        U R                  [        5      (       a@  [        U R                  5      =o R                  :w  a  [        SU R                   SU S35      e[        U R                  [        5      (       d  U R                  /U R                  -  U l        U R                  c8  [        U R                  5       Vs/ s H  o3S-   S-  S:X  a  SOSPM     snU l        U R                  c3  U R                  S	:  a  S	OSnS
/U-  S/U R                  U-
  -  -   U l        [        U R                  5      =oPR                  :w  a  [        SU R                   SU S35      e[        R                  " S0 UD6  g s  snf )Nzjintermediate_size must have an explicit intermediate size for every layer or one for all layers. Expected z values but got .      r   full_attentionsliding_attention
   gffffff?        zeactivation_sparsity_pattern must have an explicit activation sparsity value for every layer.Expected  )
isinstancerB   r   lenrD   
ValueErrorrH   rangerS   r   __post_init__)selfkwargsintsize_leninum_sparse_layerslen_asps         |/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/gemma3n/modular_gemma3n.pyra   Gemma3nTextConfig.__post_init__   s   t--x88 #D$:$: ;;@V@VV 2233CK=PQS  D22H==&*&<&<%=@V@V%VD"#W\]a]s]sWt WtRSUaK1$4 :MMWt D ++3&*&<&<r&Aq04v8I/ISE&&)::M 0D, 4;;<<GAWAWW 2233CG9AO 
 	&&00! s   E:c                    UR                  SS 5      nSS0SS0S.nU R                  b  U R                  OUU l        Ub  U R                  S   R                  U5        U R                  R                  S5      c  SS0U R                  S'   U R                  S   R	                  SUR                  SU R
                  S   5      5        U R                  R                  S5      c  SS0U R                  S'   U R                  S   R	                  SUR                  S	U R
                  S
   5      5        U R                  5         U$ )Nrope_scaling	rope_typedefault)rY   rX   rX   
rope_thetar9   rY   rope_local_base_freqr:   )poprope_parametersupdateget
setdefaultdefault_thetastandardize_rope_params)rb   rc   rk   default_rope_paramss       rh   convert_rope_params_to_dict-Gemma3nTextConfig.convert_rope_params_to_dict   sI   zz.$7
 #.y!9*I6
 8<7K7K7Wt33]p#  !1299,G ##$45=6A95MD  !12-.99&**\43E3Eh3OP	
 ##$78@9Di8PD  !4501<<&**%;T=O=OPW=XY	

 	$$&    )rS   rB   rH   rq   )'__name__
__module____qualname____firstlineno____doc__
model_typebase_model_tp_planru   r;   int__annotations__r=   r?   rA   rB   listrD   rE   rF   rG   rH   strrI   floatrJ   rK   rL   boolrN   rP   rR   rS   AttributeErrorattn_logit_softcappinguse_bidirectional_attentionquery_pre_attn_scalarra   rx   __static_attributes__r\   rz   rh   r2   r2   J   sE   %N  J%.%.%.%E%E%E%."+ )"+  +X>MJ&--K'**)/sT#Y/s  #)S)NC$(KcT!(%)U)c"OU" $$c "#"K>Be!4t!;B+-"0"2*,1>rz   r2   c                      \ rS rSr% SrSrSr\\S'   Sr	\\S'   Sr
\\S'   S	r\\S
'   Sr\\S'   Sr\\S'   Sr\\S'   Sr\\S'   Sr\\S'   Sr\\S'   Sr\\S'   Sr\\S'   Sr\\S'   Sr\\S'   Sr\\S'   S r\\   \\\4   -  \S!'   S"r\\S#'   S$r\\\\\4   \\\4   4   -  \S%'   S&r\\\\\4   \\\4   4   -  \S''   S(rg))*Gemma3nAudioConfig   a  
vocab_offset (`int`, *optional*, defaults to 262272):
    Offset between the tokenizer vocab index for the token ids embedded by `Gemma3nMultimodalEmbedder` and the
    0-indexed `Gemma3nMultimodalEmbedder.embedding` table.
input_feat_size (`int`, *optional*, defaults to 128):
    The number of channels in each mel-spectrogram frame.
gradient_clipping (`float`, *optional*, defaults to 10000000000.0):
    Clipping value used to stabilize extremely large gradient values.
conf_attention_chunk_size (`int`, *optional*, defaults to 12):
    The sub-sequence size for local attention processing inside the Conformer ("conf") section of the
    Universal Speech Model.
conf_attention_context_left (`int`, *optional*, defaults to 13):
    The left context size of the local attention inside the Conformer ("conf") section of the
    Universal Speech Model.
conf_attention_context_right (`int`, *optional*, defaults to 0):
    The right context size of the local attention inside the Conformer ("conf") section of the
    Universal Speech Model.
conf_attention_logit_cap (`float`, *optional*, defaults to 50.0):
    Logit cap applied during local attention inside the Conformer ("conf") section of the
    Universal Speech Model.
conf_num_attention_heads (`int`, *optional*, defaults to 8):
    The number of attention heads in local attention inside the Conformer ("conf") section of the
    Universal Speech Model.
conf_num_hidden_layers (`int`, *optional*, defaults to 12):
    The number of layers that use local attention inside the Conformer ("conf") section of the
    Universal Speech Model.
conf_conv_kernel_size (`int`, *optional*, defaults to 5):
    Convolution kernel size for the conformer block inside the Conformer ("conf") section of the
    Universal Speech Model.
conf_reduction_factor (`int`, *optional*, defaults to 4):
    Reduction factor used in the conformer block inside the Conformer ("conf") section of the
    Universal Speech Model.
conf_residual_weight (`float`, *optional*, defaults to 0.5):
    Residual connection weight inside the Conformer ("conf") section of the
    Universal Speech Model.
sscp_conv_channel_size (`tuple(int, int)`, *optional*, defaults to `(128, 32)`):
    The channel sizes for the first and second convolutional layers in the Sub-sample Convolution Projection
    ("sscp") section of the Universal Speech Model.
sscp_conv_group_norm_eps (`float`, *optional*, defaults to 0.001):
    Epsilon used in group normalization in the subsample convolution projection in the Sub-sample Convolution
    Projection ("sscp") section of the Universal Speech Model.
sscp_conv_kernel_size (`tuple(tuple(int, int), tuple(int, int))`, *optional*, defaults to `((3, 3), (3, 3))`):
    Kernel sizes of the two convolutional layers in the subsample convolution projection  in the Sub-sample
    Convolution Projection ("sscp") section of the Universal Speech Model. The kernel sizes are specified as a
    tuple of height and width for each layer, where the height corresponds to the time dimension and the width
    corresponds to the frequency dimension.
sscp_conv_stride_size (`tuple(tuple(int, int), tuple(int, int))`, *optional*, defaults to `((2, 2), (2, 2))`):
    Stride sizes of the two convolutional layers in the subsample convolution projection in the Sub-sample
    Convolution Projection ("sscp") section of the Universal Speech Model. The stride sizes are specified as a
    tuple of height and width for each layer, where the height corresponds to the time dimension and the width
    corresponds to the frequency dimension.

Example:

```python
>>> from transformers import Gemma3nAudioConfig, Gemma3nAudioEncoder

>>> # Initializing a Gemma3nAudioEncoder gemma3n_audio-E4B-style configuration
>>> configuration = Gemma3nAudioConfig()

>>> # Initializing a model from the gemma3n_audio-E4B style configuration
>>> model = Gemma3nAudioEncoder(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
```
gemma3n_audio   r;     vocab_offsetinput_feat_sizei   r?   ư>rms_norm_epsg    _Bgradient_clipping   conf_attention_chunk_size   conf_attention_context_leftr   conf_attention_context_rightg      I@conf_attention_logit_cap   conf_num_attention_headsconf_num_hidden_layersrW   conf_conv_kernel_sizerM   conf_reduction_factor      ?conf_residual_weight)r       sscp_conv_channel_sizeMbP?sscp_conv_group_norm_eps)r   r   r   sscp_conv_kernel_size)r   r   r   sscp_conv_stride_sizer\   N)r{   r|   r}   r~   r   r   r;   r   r   r   r   r?   r   r   r   r   r   r   r   r   r   r   r   r   r   r   tupler   r   r   r   r\   rz   rh   r   r      s5   BH !JJ%L#%OSKL%/u/%'s''))() #)&*e*$%c%"$C$!"3"!"3""%%%:CDIc3h7C&*e*M4%c3hsCx(H"II M4%c3hsCx(H"II rz   r   c                       \ rS rSr% SrSrSr\\S'   Sr	\
\S'   Sr\\S	'   S
r\\S'   Sr\\S'   Sr\\S'   Sr\\S'   Sr\S-  \S'   Srg)Gemma3nVisionConfigi:  a_  
architecture (`str`, *optional*, defaults to `"resnet50"`):
    The timm architecture to load.
do_pooling (`bool`, *optional*, defaults to `True`):
    Whether to do pooling for the last_hidden_state in `TimmWrapperModel` or not.
model_args (`dict[str, Any]`, *optional*):
    Additional keyword arguments to pass to the `timm.create_model` function. e.g. `model_args={"depth": 3}`
    for `timm/vit_base_patch32_clip_448.laion2b_ft_in12k_in1k` to create a model with 3 blocks. Defaults to `None`.
vocab_offset (`int`, *optional*, defaults to 262144):
    Offset between the tokenizer vocab index for the token ids embedded by `Gemma3nMultimodalEmbedder` and the
    0-indexed `Gemma3nMultimodalEmbedder.embedding` table.

Example:
```python
>>> from transformers import Gemma3nVisionConfig, TimmWrapper

>>> # Initializing a TimmWrapper gemma3n_vision-E4B-style configuration
>>> configuration = Gemma3nVisionConfig()

>>> # Initializing a gemma3n_vision-E4B-style TimmWrapper from the configuration
>>> model = TimmWrapper(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
```
gemma3n_vision{Gz?initializer_rangeF
do_poolingmobilenetv5_300m_encarchitecturer>   r?   r   r;   r<   r   r   r   N
model_argsr\   )r{   r|   r}   r~   r   r   r   r   r   r   r   r   r   r?   r   r;   r   r   r   dictr   r\   rz   rh   r   r   :  sc    6 "J#u#J.L#.KJL#L%"Jt"rz   r   c                   r  ^  \ rS rSr% SrSr\\\S.r	Sr
\\\\4   -  S-  \S'   Sr\\\\4   -  S-  \S'   Sr\\\\4   -  S-  \S'   S	r\S-  \S
'   Sr\S-  \S'   Sr\S-  \S'   Sr\S-  \S'   Sr\S-  \S'   Sr\S-  \S'   Sr\S-  \S'   Sr\S-  \S'   Sr\S-  \S'   Sr\S-  \S'   U 4S jrSrU =r $ )Gemma3nConfigid  av  
audio_soft_tokens_per_image (`int`, *optional*, defaults to 188):
    The number of soft tokens per audio clip.
vision_soft_tokens_per_image (`int`, *optional*, defaults to 256):
    The number of soft tokens per image.
boi_token_id (`int`, *optional*, defaults to 255999):
    The begin-of-image token index to wrap the image prompt.
eoi_token_id (`int`, *optional*, defaults to 262144):
    The end-of-image token index to wrap the image prompt.
boa_token_id (`int`, *optional*, defaults to 256000):
    The begin-of-audio token index to wrap the audio prompt.
eoa_token_id (`int`, *optional*, defaults to 262272):
    The end-of-audio token index to wrap the audio prompt.

Example:

```python
>>> from transformers import Gemma3nForConditionalGeneration, Gemma3nConfig, Gemma3nTextConfig

>>> # Initializing a MobileNet vision config, which is loaded from TIMM
>>> vision_config = Gemma3nVisionConfig()

>>> # Initializing a Gemma3n Audio config
>>> audio_config = Gemma3nAudioConfig()

>>> # Initializing a Gemma3n Text config
>>> text_config = Gemma3nTextConfig()

>>> # Initializing a Gemma3n gemma-3-4b style configuration
>>> configuration = Gemma3nConfig(text_config, vision_config, audio_config)

>>> # Initializing a model from the gemma-3-4b style configuration
>>> model = Gemma3nTextConfig(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
```gemma3n)text_configvision_configaudio_configNr   r   r      audio_soft_tokens_per_imager@   vision_soft_tokens_per_imagei boi_token_idr<   eoi_token_idi  image_token_idi  boa_token_idr   eoa_token_idi  audio_token_idr   r   Ttie_word_embeddingsc                   > U R                   c%  [        5       U l         [        R                  S5        O9[	        U R                   [
        5      (       a  [        S0 U R                   D6U l         [	        U R                  [
        5      (       a  [        S0 U R                  D6U l        O1U R                  c$  [        5       U l        [        R                  S5        [	        U R                  [
        5      (       a  [        S0 U R                  D6U l        O1U R                  c$  [        5       U l        [        R                  S5        [        TU ],  " S0 UD6  g )NzAtext_config is None, using default Gemma3nTextConfig text config.zGvision_config is None, using default Gemma3nVisionConfig vision config.z7audio_config is None. Using default Gemma3nAudioConfig.r\   )r   r2   loggerinfor]   r   r   r   r   r   superra   )rb   rc   	__class__s     rh   ra   Gemma3nConfig.__post_init__  s    #02DKK[\(($//0D43C3CDDd(($//!4!Jt7I7I!JD'!4!6DKKabd''.. 2 GT5F5F GD& 2 4DKKQR''rz   )r   r   r   )!r{   r|   r}   r~   r   r   r2   r   r   sub_configsr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   ra   r   __classcell__r   s   @rh   r   r   d  s   $L J(,*K >BK"T#s(^3d:AAEM&c3h7$>E?CL$tCH~5<C.1t1/2 #*2&L#*&&L#*&!(NC$J(&L#*&&L#*&!(NC$J(&*ut|*'++( (rz   r   c                   B    \ rS rSr% SrSr\R                  S-  \S'   Sr	g)Gemma3nAudioEncoderModelOutputi  zm
audio_mel_mask (`torch.BoolTensor`, *optional*):
    A torch.BoolTensor of shape `(batch_size, num_frames)`
Naudio_mel_maskr\   )
r{   r|   r}   r~   r   r   torch
BoolTensorr   r   r\   rz   rh   r   r     s    
 /3NE$$t+2rz   r   c                   B    \ rS rSr% SrSr\R                  S-  \S'   Sr	g)Gemma3nModelOutputWithPasti  a  
past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
    It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

    Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
    `past_key_values` input) to speed up sequential decoding.
image_hidden_states (`torch.FloatTensor`, *optional*):
    A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
    image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
audio_hidden_states (`torch.FloatTensor`, *optional*):
    A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
    audio_hidden_states of the model produced by the audio encoder and after projecting the last hidden state.
Naudio_hidden_statesr\   
r{   r|   r}   r~   r   r   r   FloatTensorr   r   r\   rz   rh   r   r     s     59**T18rz   r   c                   B    \ rS rSr% SrSr\R                  S-  \S'   Sr	g)Gemma3nCausalLMOutputWithPasti  a
  
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
    Language modeling loss (for next-token prediction).
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.text_config.vocab_size)`):
    Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
    It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

    Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
    `past_key_values` input) to speed up sequential decoding.
image_hidden_states (`torch.FloatTensor`, *optional*):
    A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
    image_hidden_states of the model produced by the vision encoder after projecting last hidden state.
audio_hidden_states (`torch.FloatTensor`, *optional*):
    A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
    audio_hidden_states of the model produced by the audio encoder and after projecting the last hidden state.
Nr   r\   r   r\   rz   rh   r   r     s    $ 59**T18rz   r   c                      ^  \ rS rSrSS\S\S\4U 4S jjjrS\R                  4S jr
S\R                  S\R                  4S	 jrS
rU =r$ )Gemma3nRMSNormi  dimeps
with_scalec                    > [         TU ]  5         X l        X0l        U R                  (       a/  [        R
                  " [        R                  " U5      SS9U l        g g )NT)requires_grad)	r   __init__r   r   nn	Parameterr   onesweight)rb   r   r   r   r   s       rh   r   Gemma3nRMSNorm.__init__  s>    $??,,uzz#dKDK rz   hidden_statesc                     UR                  S5      R                  SSS9U R                  -   nU[        R                   " US5      -  $ )Nr   T)keepdim      )powmeanr   r   )rb   r   mean_squareds      rh   _normGemma3nRMSNorm._norm  sA    $((+00T0BTXXMuyyt<<<rz   returnc                     U R                  UR                  5       5      nU R                  (       a  X R                  R                  5       -  nUR	                  U5      $ N)r   r   r   r   type_as)rb   r   normed_outputs      rh   forwardGemma3nRMSNorm.forward  sF    

=#6#6#89??)KK,=,=,??M$$]33rz   )r   r   r   )r   T)r{   r|   r}   r~   r   r   r   r   r   Tensorr   r   r   r   r   s   @rh   r   r     sW    LC Le L L L=5<< =
4U\\ 4ell 4 4rz   r   c                   &  ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  S\R                  4S jr	S\R                  S	\
S
\
S\
S\
S\
S\
S\R                  4S jrS\R                  S\R                  S\R                  4S jrSrU =r$ )%Gemma3nAudioRelativePositionEmbeddingi  configc                 L  > [         TU ]  5         Xl        U R                  R                  U l        U R                  R
                  U l        U R                  U R                  -  U l        [        SU R                  R                  S-
  5      U l
        U R                  R                  U l        [        R                  " U R                  U R                  U R                  -  SS9U l        SnSnU R                  S-  n[         R"                  " [%        U5      [%        U5      -  5      [        US-
  S5      -  nU[&        R(                  " [&        R*                  " U5      U* -  5      -  nU R-                  SUR%                  5       R/                  S5      R/                  S5      SS	9  g )
Nr   rV   Fbias      ?r8   r   inv_timescales
persistent)r   r   r   r   	num_headsr?   channelshead_dimmaxr   max_backwardr   max_forwardr   Linearpos_projmathlogr   r   exparangeregister_buffer	unsqueeze)rb   r   min_timescalemax_timescalenum_timescaleslog_timescale_incrementr  r   s          rh   r   .Gemma3nAudioRelativePositionEmbedding.__init__  sJ   ==//74;;#J#JQ#NO;;CC		$--$--1OV[\!+"&((5+?%BV+V"WZ]^lop^prsZt"t&5<<3OSjRj3j)kk  ",,Q/99!< 	 	
rz   positiondtyper   c                 H   UR                  5       R                  S5      nXR                  R                  UR                  [
        R                  S9-  n[
        R                  " [
        R                  " U5      [
        R                  " U5      /SS9nUR                  U5      $ )Nr   )devicer  r   )r   r  r  tor  r   float32catsincostype)rb   r  r  scaled_timetiming_signals        rh   _get_timing_signal_1d_pos?Gemma3nAudioRelativePositionEmbedding._get_timing_signal_1d_pos  s{    >>#--b1!4!4!7!7xV[VcVc!7!dd		599[#9599[;Q"RXZ[!!%((rz   term_bd_before_shift
batch_sizer  num_query_blocksquery_block_sizekey_context_sizemax_span_plus_1c                     US-   U-
  nSU4n	[         R                  R                  X5      n
U
R                  UUUXVS-   -  45      nUSS2SS2SS2SXV-  24   nUR                  UUUUU45      nU$ )a"  Performs the relative shift.

Args:
  term_bd_before_shift: Tensor of shape [B, N, U, W, F_span]. batch_size
    (B), num_heads (N), num_query_blocks (U), query_block_size (W),
    key_context_size (C = W+L+R), max_span_plus_1 (F_span = L+R+1).

Returns:
  Tensor of shape [B, N, U, W, C].
rV   r   N)r   
functionalpadreshape)rb   r(  r)  r  r*  r+  r,  r-  pad_amount_last_dimpadding_tupleterm_bd_paddedterm_bd_reshapedterm_bd_slicedterm_bd_shifteds                 rh   _relative_shift5Gemma3nAudioRelativePositionEmbedding._relative_shift!  s    4  0!3F /0**+?O
 *11  q$89	
 *!Q3X5E5X3X*XY )00   
 rz   querieskeysc           	      >   UR                   u  p4pVnUR                   u    p  n[        R                  " U R                  U R                  * S-
  SUR
                  S9R                  S5      n
U
R                   S   nU R                  XR                  S9nU R                  U5      nUR                  SXR                  U R                  5      R                  S5      nUR                  SSSSS5      nUR                  SSSSS5      n[        R                  " UU5      nUR                  SSSSS5      nUR                  SSS5      nUR                  X6XE-  U5      n[        R                  " UU5      nUR                  UUUUU5      nU R!                  UUUUUU	U5      nUU-   $ )	NrV   r   r  r   r  r   r   rM   )shaper   r  r
  r  r  r  r&  r  r  r1  r  r  squeezepermutematmulr8  )rb   r:  r;  r)  r*  r+  r  r  _r,  pos_indicesr-  sin_emb_timing_signalprojected_sin_embsin_emb	queries_pkeys_p_tterm_ac
q_permuted
s_permuted
q_reshapedterm_bd_unshifed_matmulterm_bd_unshifedr7  s                           rh   r   -Gemma3nAudioRelativePositionEmbedding.forward^  s    OVmmK
&68'+zz$11 ll4#4#4t7G7G6G!6KRX_XfXfgqq
 &++A. $ > >}} !? !

 !MM*?@#++APTP]P]^ff
 OOAq!Q2	<<1aA.,,y(3 __Q1a3
 __Q1-
  ''
?O?bdlm

 #(,,z:"F 3::
 ..
 ((rz   )r  r   r  r
  r  r  r  )r{   r|   r}   r~   r   r   r   r   r  r&  r   r8  r   r   r   r   s   @rh   r   r     s    
1 
.)%,, )u{{ )W\WcWc );#ll; ; 	;
 ; ; ; ; 
;zL)u|| L)5<< L)ELL L) L)rz   r   c                   >  ^  \ rS rSrS\4U 4S jjrS rS\R                  S\	S\	S\R                  4S	 jr
S
\R                  S\R                  4S jrS
\R                  S\R                  4S jrS
\R                  S\R                  S\R                  4S jrSrU =r$ )Gemma3nAudioAttentioni  r   c                   > [         TU ]  5         Xl        U R                  R                  U l        U R                  R
                  U l        U R
                  U R                  -  U l        U R                  R                  U l        U R                  R                  U l
        [        SU R                  R                  S-
  5      U l        U R                  R                  U l        U R                  U R                  -   U R                  -   U l        [#        U5      U l        [&        R(                  " [*        R,                  " U R                  45      5      U l        [&        R0                  " U R
                  U R                  U R                  -  SS9U l        [&        R0                  " U R
                  U R                  U R                  -  SS9U l        [&        R0                  " U R
                  U R                  U R                  -  SS9U l        U R                  S-  nS[*        R&                  R8                  R;                  [*        R<                  " S5      5      -  nU R?                  SX#-  RA                  5       RC                  5       SS	9  U RE                  5       nU R?                  S
USS	9  U R?                  S[*        R<                  " U R                  5      RG                  5       SS	9  g )Nr   rV   Fr   r   r  r[   q_scaler  local_causal_valid_masksoftcap)$r   r   r   r   r  r?   r  r   
chunk_sizer   max_future_horizonr	  r   max_past_horizonr   attention_logits_soft_capcontext_sizer   relative_position_embeddingr   r   r   zerosper_dim_scaler  q_projk_projv_projr/  softplustensorr  clonedetachcreate_local_causal_valid_maskr   )rb   r   rT  r_softplus_0rU  r   s        rh   r   Gemma3nAudioAttention.__init__  s%   ==;;22((DNN:++??"&++"J"J #At{{'N'NQR'R S)-)M)M& OOd.C.CCdF]F]]+PQW+X(\\%++t}}6F*GHii 0 0$..4==2PW\]ii 0 0$..4==2PW\]ii 0 0$..4==2PW\]--%UXX0099%,,s:KLLY)?(F(F(H(O(O(Q^cd"&"E"E"G68O\abLL778>>@ 	 	
rz   c                    [         R                  " [         R                  " U R                  U R                  4[         R
                  S9SS9R                  n[         R                  " [         R                  " U R                  U R                  4[         R
                  S9U R                  U R                  -   S9n[         R                  " U R                  U R                  4[         R
                  S9nX1-  U-  nU$ )Nr>  r   )diagonal)	r   trilr   r[  rW  r   TrY  rX  )rb   lower_causal_maskupper_causal_maskrU  s       rh   rf  4Gemma3nAudioAttention.create_local_causal_valid_mask  s    !JJJJ))4??;5::N
 ! 	 "JJJJ):):;5::N**T-D-DD
 #(**doot?P?P-QY^YcYc"d"9"MPa"a&&rz   xpad_left	pad_rightr   c                     UR                   tpEnUR                  XB/UQ75      nUR                  XC/UQ75      n[        R                  " XqU/SS9nU$ )NrV   r  )r?  	new_zerosr   r   )	rb   rp  rq  rr  batchrC  
tail_shapeleftrights	            rh   	_pad_dim1Gemma3nAudioAttention._pad_dim1  sV     !:{{E9j9:U;
;<IIt&A.rz   r   c                 "   UR                   nUSS u  p4X@R                  -   S-
  U R                  -  nXPR                  -  U-
  =nS:  a  U R                  USU5      nX5U R                  4USS -   nUR                  U5      R	                  5       nU$ )a  Turns a sequence to non overlapping blocks.

Args:
    hidden_states: a tensor of [batch, time, ...].

Returns:
    A tensor of [batch, num_blocks, block_size, ...], with necessary
    paddings,
    where output[:, i, ...] are x[:, i*block_size:(i+1)*block_size, ...].
Nr   rV   r   )r?  rW  ry  r1  
contiguous)rb   r   r?  bt
num_blockspadding_lenpermute_dimss           rh   _convert_to_block'Gemma3nAudioAttention._convert_to_block  s     ##Ray//)A-$//A
%7!;;Kq@ NN=![IMt7%)C%--l;FFHrz   c                 R   U R                   nU R                  U R                  -   S-
  nU R                  XU5      nU R                  nU R                  nUR                  SXES9nUR                  S:  a&  UR                  S:  a  [        R                  " USSS9nUR                  5       $ )a  Extracts temporal context for every block.

Args:
    hidden_states: a tensor of [batch, time, ...].

Returns:
    A tensor of [batch, num_blocks, context_size, ...], with necessary
    paddings,
    where context_size = block_size + left_context + right_context,
    and output[:, i, ...] are x[:, start-left_context:end+right_context,
    ...],
    start = i * block_size, end = (i + 1) * block_size.
rV   )	dimensionsizestepr   r   r   )sourcedestination)
rY  rX  rW  ry  r[  unfoldndimr   movedimr|  )rb   r   rq  rr  	frame_len
frame_step
x_unfoldeds          rh   _extract_block_context,Gemma3nAudioAttention._extract_block_context  s     (( ++doo=A	}	J%%	__
 #))AI)W
 !joo&9 z"!LJ$$&&rz   maskc                    / UR                   S S QU R                  PU R                  P7nU R                  U5      R	                  U5      R                  5       nU R                  U5      R	                  U5      R                  5       nU R                  U5      R	                  U5      R                  5       n[        R                  R                  R                  U R                  5      nSSSU R                  4nUR                  U5      n	X@R                  -  U	-  nUR                   S S u  pU R                  U5      nU R!                  U5      nU R!                  U5      nUR                   S   nU) nU R!                  U5      nUR"                  S:X  aI  UR                   S   UR                   S   -  U R$                  :X  a  UR	                  XU R$                  5      nUR                   U
UU R$                  4:w  a,  ['        SUR                    SU
 SU SU R$                   S	3	5      eUR)                  S5      R)                  S
5      nU R*                  R)                  S5      R)                  S5      R)                  S5      n[        R,                  " UUR/                  UR0                  5      5      nU R3                  X5      nU R4                  R/                  UR0                  5      nUU-  n[        R6                  " U5      nUU-  n[        R8                  " UU[        R:                  " UR<                  5      R>                  5      n[        R                  R                  RA                  US[        RB                  S9R/                  UR<                  S9nUR                   u  nnnnnUR                   S   nURE                  SSSSS5      R	                  SUU5      nURE                  SSSSS5      R	                  SUU5      n[        RF                  " UU5      n U R	                  UUUUU5      RE                  SSSSS5      n!U!R	                  U
XRH                  -  U R                  U R                  45      n!U!S S 2S U24   n!U!$ )Nr   rV   r   rM   r   z%Shape of extracted_valid_mask_blocks z	 is not (z, z) after potential reshape.r   )r   r  r>  )%r?  r  r  r_  r1  r|  r`  ra  r   r   r/  rb  r^  viewrT  r  r  r  r[  r_   r  rU  logical_andr  r  r\  rV  tanhwherefinfor  minsoftmaxr  rA  bmmrW  )"rb   r   r  	qkv_shapequery_states
key_statesvalue_statesper_dim_scale_spbroadcast_shapeper_dim_scale_sp_broadcastr)  q_timequery_blocks
key_blocksvalue_blocksr*  original_valid_maskextracted_valid_mask_blockscondition_from_input_validitycondition_from_causalityfinal_condition_for_wherelogitssoftcap_valprobabilitiesb_dimn_dimu_dimw_dimc_dimh_dimprob_bunv_bun
result_bmmcontext_vectorss"                                     rh   r   Gemma3nAudioAttention.forward*  sI   Nm))#2.NNN	{{=199)DOOQ[[/77	BMMO
{{=199)DOOQ 88..778J8JKaDMM2%5%:%:?%K"#ll25OO)//3
--l;00<
22<@'--a0  $e '+&A&ABU&V# (,,1+11!47R7X7XYZ7[[_c_p_pp*E*M*Md.?.?+' ',,1
 

 /556i
| L$%R(9(9'::TV  )D(M(Ma(P(Z(Z[](^% $(#?#?#I#I!#L#V#VWX#Y#c#cde#f 
 %*$5$5)$''(E(L(LM%
! 11,K lloofmm4+%F#+% 6FLL@Y@]@]^++33F%--3X[[bnbtbt[u -:,?,?)ueUE""2& ((Aq!Q7??E5Q$$Q1a3;;BuMYYx/
$,,UE5%OWWXY[\^_abdef)11 ??2	
 *!WfW*5rz   )rZ  rW  r   r[  r  r?   r`  rX  rY  r  r^  r_  r\  ra  )r{   r|   r}   r~   r   r   rf  r   r   r   ry  r  r  r   r   r   r   r   s   @rh   rR  rR    s     
1  
D'5<< 3 3 5<< u||  ,.'ELL .'U\\ .'`dU\\ d9I9I dell d drz   rR  c                      ^  \ rS rSrSr SS\S\\   S\4U 4S jjjrS\	R                  S\	R                  4S	 jrS
rU =r$ )Gemma3nAudioCumulativeGroupNormi  a  Applies Group Normalization cumulatively over the time dimension.

This layer normalizes the input by calculating the mean and variance
cumulatively over the time dimension (dim 1). The statistics are computed
over all feature dimensions (specified by `feature_dims` and `num_channels`)
for elements marked as valid by the optional `mask`.

If a `mask` is provided (True for valid, False for invalid/padded),
invalid time steps do not contribute to the statistics calculation, and
their corresponding output values are zeroed out.

Scale and bias, if enabled, are applied per-channel (last dimension).
This behavior is similar to JAX's `GroupNormalization` with `num_groups=1`
and `cumulative=True`.
num_channelsfeature_dimsr   c           	        > [         TU ]  5         Xl        [        U5      U l        X0l        [        R                  " [        R                  " U5      5      U l
        [        [        SS[        U R                  5      -   S-   5      5      U l        g )Nr   rV   )r   r   r  r   r  r   r   r   r   r   r   r`   r^   reduction_axes)rb   r  r  r   r   s       rh   r   (Gemma3nAudioCumulativeGroupNorm.__init__  sn     	(!,/ ll5::l#;< $E!QT5F5F1G-G!-K$LMrz   r   r   c                    U R                   U R                  4-   nUR                  SS U:w  a  [        SUR                  SS  SU 35      eUR                  n[
        R                  nUR                  U5      n[
        R                  " XTS9n[
        R                  " XPR                  SS9n[
        R                  " USS	9n[
        R                  " X`R                  SS9n	[
        R                  " U	SS	9n
[
        R                  " U
S
S9nX-  nX\-
  R                  S5      n[
        R                  " XR                  SS9n[
        R                  " USS	9nX-  nX\-
  [
        R                  " UU R                  -   5      -  nU R                   R                  U5      nS/UR#                  5       S-
  -  U R                  /-   nUUR%                  U5      -  nUU-  nUR                  U5      $ )zApplies cumulative group norm, optionally using a mask.

Args:
  hidden_states: Input tensor, shape [B, T, *feature_dims, C].

Returns:
  Normalized tensor with the same shape as x.
r   NzInput tensor shape suffix z> does not match expected suffix (feature_dims + num_channels) r>  Tr   r   rV   r  r  )r  )r  r  r?  r_   r  r   r  r  	ones_likesumr  cumsumclampr   rsqrtr   r   r   r  )rb   r   expected_input_suffixinput_dtype
calc_dtypex_calc	mask_calcsum_values_at_tcum_sum_valueselements_in_group_at_tcum_count_elementssafe_cum_count_elementscum_meansquared_diff_from_meansum_sq_diff_at_tcum_sum_sq_diffcum_variancenormalized_xscalescale_view_shapefinal_outputs                        rh   r   'Gemma3nAudioCumulativeGroupNorm.forward  s    !% 1 1T5F5F4H Hqr"&;;,]-@-@-D,E F99N8OQ 
 $))]]
!!*- OOF=	  ))F0C0CTRo1= "'9:M:MW[!\"\\*@aH"'++.@c"J ";
 #)"3!8!8!; 99%;ATAT^bc  ,,'7Q? '@ )U[[9P-QQ z*3-"3"3"5"9:d>O>O=PP#ejj1A&BB $i/{++rz   )r   r  r  r  r   )r   )r{   r|   r}   r~   r   r   r   r   r   r   r   r   r   r   r   s   @rh   r  r    s`    ( 	NN smN 	N N$G,U\\ G,ell G, G,rz   r  c                      ^  \ rS rSrSr SS\S\S\S\\\\\4   4U 4S jjjrS\	R                  S	\	R                  4S
 jrSrU =r$ )Gemma3nAudioSSCPConvBlocki  zA single convolution block for the SubSampleConvProjection.

This block consists of a 2D convolution, followed by CumulativeGroupNorm,
and a ReLU activation. It handles manual padding for the convolution.
r   idxinput_freq_dimmanual_paddingc           	      6  > [         TU ]  5         Xl        X@l        US:X  a  SOU R                  R                  US-
     nU R                  R                  U   nU R                  R
                  U   u  pxU R                  R                  U   u  p[        R                  " UUUU4X4SSS9U l	        X0R                  S   -   U R                  S   -   nX-
  U
-  S-   n[        UU4U R                  R                  S9U l        [        R                  " 5       U l        g )Nr   rV   )r   r   F)in_channelsout_channelskernel_sizestridepaddingr  )r  r  r   )r   r   r   r  r   r   r   r   Conv2dconvr  r   normReLU
activation)rb   r   r  r  r  r  r  kernel_hkernel_wstride_hstride_wf_in_padded
f_out_convr   s                rh   r   "Gemma3nAudioSSCPConvBlock.__init__  s    	, !8a)K)KCRSG)T{{99#>![[>>sC![[>>sCII#% '

	 %':':1'==@S@STU@VV!,9A=
3%$44
	 '')rz   audio_encodingsr   c                    [         R                  " XR                  SSS9R                  U R                  R
                  R                  5      nU R	                  U5      nUR                  SSSS5      R                  5       nU R                  U5      nUR                  SSSS5      R                  5       nU R                  U5      $ )Nconstantr[   )modevaluer   r   r   rV   )Fr0  r  r  r  r   r  rA  r|  r  r  )rb   r  audio_encodings_paddedaudio_encodings_conv
x_for_normx_normedaudio_encodings_normeds          rh   r   !Gemma3nAudioSSCPConvBlock.forward0  s     "#8K8KR\dg!h!k!kII"""

  $yy)?@ *11!Q1=HHJ
99Z(!)!1!1!Q1!=!H!H!J566rz   )r  r   r  r  r  ))r   r   r   r   )r{   r|   r}   r~   r   r   r   r   r   r   r   r   r   r   r   s   @rh   r  r    sm     5A)$")$ )$ 	)$
 c3S01)$ )$V7u|| 7 7 7rz   r  c                   j   ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ )#Gemma3nAudioSubSampleConvProjectioniC  r   c                 Z  > [         TU ]  5         Xl        UR                  n/ n/ n[	        S5       Hk  nUR
                  U   u  pgUR                  U   u  pSn
US-
  nSnSnUUU
U4nUR                  U5        X,-   U-   nX-
  U	-  S-   nUR                  U5        UnMm     [        SUR                  UUS   S9U l	        [        SUS   UUS   S9U l
        UR                  S   nUS   nUU-  U l        [        R                  " U R                  U R                  R                  SS9U l        g )Nr   r   rV   )r  r  r   r  r   Fr   )r   r   r   r   r`   r   r   appendr  conv_0conv_1r   input_proj_in_featuresr   r  r?   input_proj_linear)rb   r   current_f_for_block_inputcalculated_block_paddingcalculated_f_out_dimsre   r  r  r  r  	pad_t_toppad_t_bottom
pad_f_leftpad_f_rightmanual_padding_tupler  f_out_after_convfinal_c_outfinal_f_outr   s                      rh   r   ,Gemma3nAudioSubSampleConvProjection.__init__D  sr   $*$:$:!#%  "qA!'!=!=a!@H!'!=!=a!@H I#a<L JK 	$  %++,@A 4@;NK + 68CaG!(()9:(8%= @ 0!113A6	
 0033A6	
 33B7+B/&1K&?#!#4+F+FH_H_fk!lrz   r  r   c                    UR                  S5      nU R                  U5      nU R                  U5      nUR                  u  pEpgUR	                  SSSS5      R                  5       nUR                  XFXu-  5      n	U R                  U	5      n
U
$ )NrV   r   r   r   )r  r  r  r?  rA  r|  r  r  )rb   r  audio_encodings_reshapedrp  r}  c_outt_outf_out
x_permutedoutput_flattenedoutputs              rh   r   +Gemma3nAudioSubSampleConvProjection.forward}  s     $3#<#<Q#? KK01KKN!"%YYq!Q*557
%??1U]C''(89rz   )r   r  r  r  r  r{   r|   r}   r~   r   r   r   r   r   r   r   r   s   @rh   r  r  C  s3    7m1 7mru||   rz   r  c                      ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  S\R                  4S jr	Sr
U =r$ )	Gemma3nAudioConformerAttentioni  r   c                   > [         TU ]  5         Xl        U R                  R                  U l        U R                  S[        R                  " U R                  R                  5      SS9  [        U R                  R                  5      U l
        [        U5      U l        [        R                  " U R                  U R                  R                  SS9U l        [        U R                  R                  5      U l        g )Nr   Fr  r   )r   r   r   r?   post_in_featuresr  r   rc  r   r   pre_attn_normrR  attnr   r  post	post_normrb   r   r   s     rh   r   'Gemma3nAudioConformerAttention.__init__  s     $ 7 70%,,t{{?\?\2]jop+DKK,C,CD)&1	IId33T[[5L5LSXY	'(?(?@rz   r  r   r   c                    Un[         R                  " XR                  * U R                  5      nU R                  U5      nU R	                  XB5      nUR
                  u  pgpUR                  XgX-  5      n
U R                  U
5      n[         R                  " XR                  * U R                  5      nX0R                  U5      -   $ r   )	r   r  r   r"  r#  r?  r1  r$  r%  )rb   r  r   audio_encodings_input_to_attnaudio_encodings_normaudio_encodings_attn_outr}  r~  r  r  r  s              rh   r   &Gemma3nAudioConformerAttention.forward  s    (7%++o8N8N7NPTPfPfg#11/B#'99-A#R  %=$B$B!i#;#C#CA)J^#_ ))$<=++o8N8N7NPTPfPfg,~~o/NNNrz   )r#  r   r$  r!  r%  r"  r{   r|   r}   r~   r   r   r   r   r   r   r   r   r   s   @rh   r  r    sG    A1 AOu|| OUEUEU OZ_ZfZf O Orz   r  c                   j   ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ ) Gemma3nAudioConformerFeedForwardi  r   c                 ~  > [         TU ]  5         Xl        U R                  S[        R
                  " U R                  R                  5      SS9  [        U R                  R                  5      U l	        [        R                  " U R                  R                  U R                  R                  S-  SS9U l        [        R                  " U R                  R                  S-  U R                  R                  SS9U l        [        U R                  R                  5      U l        U R                  R                  U l        g )Nr   Fr  rM   r   )r   r   r   r  r   rc  r   r   r?   pre_layer_normr   r  ffw_layer_1ffw_layer_2post_layer_normr   post_layer_scaler&  s     rh   r   )Gemma3nAudioConformerFeedForward.__init__  s    0%,,t{{?\?\2]jop,T[[-D-DE99T[[%<%<dkk>U>UXY>Y`ef99T[[%<%<q%@$++BYBY`ef-dkk.E.EF $ @ @rz   r  r   c                    Un[         R                  " XR                  * U R                  5      nU R                  U5      nU R	                  U5      n[
        R                  R                  U5      nU R                  U5      n[         R                  " XR                  * U R                  5      nU R                  U5      nX!U R                  -  -   $ r   )r   r  r   r1  r2  r   r/  silur3  r4  r5  )rb   r  residuals      rh   r   (Gemma3nAudioConformerFeedForward.forward  s    "++o8N8N7NPTPfPfg--o>(,(8(8(I--,,_=(,(8(8(I++o8N8N7NPTPfPfg..?T-B-BBCCrz   )r   r2  r3  r4  r5  r1  r  r   s   @rh   r/  r/    s6    
A1 
A	Du|| 	D 	D 	Drz   r/  c                   j   ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ ) Gemma3nAudioConformerLightConv1di  r   c           
        > [         TU ]  5         Xl        [        U R                  R                  U R                  R
                  S9U l        [        R                  " U R                  R                  U R                  R                  S-  SS9U l	        [        R                  " U R                  R                  U R                  R                  U R                  R                  SSU R                  R                  SS9U l        U R                  S[        R                  " U R                  R                   5      SS	9  [        U R                  R                  U R                  R
                  S9U l        [        R                  " U R                  R                  U R                  R                  SS9U l        U R                  R                  S-
  U l        g )
Nr   r   Fr   rV   r   )r  r  r  r  r  groupsr  r   r  )r   r   r   r   r?   r   r1  r   r  linear_startConv1dr   depthwise_conv1dr  r   rc  r   	conv_norm
linear_endcausal_paddingr&  s     rh   r   )Gemma3nAudioConformerLightConv1d.__init__  sB   ,T[[-D-D$++JbJbcIIdkk&=&=t{{?V?VYZ?Zafg "		//0099;;**!
 	0%,,t{{?\?\2]jop'(?(?T[[E]E]^))DKK$;$;T[[=T=T[`a"kk??!Crz   r  r   c                 2   UnU R                  U5      nU R                  U5      n[        R                  R                  R                  USS9nUR                  SSS5      n[        R                  " X0R                  S45      nU R                  U5      nUR                  SSS5      n[        R                  " XR                  * U R                  5      nU R                  U5      n[        R                  R                  U5      nU R                  U5      nX-   nU$ )Nr   r  r   r   rV   )r1  r@  r   r   r/  glurA  r  r0  rE  rB  r  r   rC  r8  rD  )rb   r  audio_encodings_residualaudio_encodings_permutedaudio_encodings_permuted_paddedr  s         rh   r   (Gemma3nAudioConformerLightConv1d.forward  s    #2 --o>++O<((--11/r1J#2#:#:1a#C *+%%0HK^K^`aJb*c'//0OP)11!Q:++o8N8N7NPTPfPfg..9--,,_=///: ;rz   )rE  r   rC  rB  rD  r@  r1  r  r   s   @rh   r<  r<    s2    D1 D*u||   rz   r<  c                      ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  S\R                  4S jr	Sr
U =r$ )	Gemma3nAudioConformerBlocki  r   c                   > [         TU ]  5         Xl        [        U R                  5      U l        [        U R                  5      U l        [        U R                  5      U l        [        U R                  5      U l	        U R                  S[        R                  " U R                  R                  5      SS9  [        U R                  R                  5      U l        g )Nr   Fr  )r   r   r   r/  ffw_layer_startr  	attentionr<  lconv1dffw_layer_endr  r   rc  r   r   r?   r  r&  s     rh   r   #Gemma3nAudioConformerBlock.__init__  s    ?L7D7D=dkkJ0%,,t{{?\?\2]jop"4;;#:#:;	rz   r  r   r   c                 f   U R                  U5      nU R                  X5      nU) nXR                  S5      R                  UR                  5      -  nU R                  U5      nU R                  U5      n[        R                  " XR                  * U R                  5      nU R                  U5      nU$ )Nr   )rP  rQ  r  r  r  rR  rS  r   r  r   r  )rb   r  r   validity_mask_for_lconvaudio_encodings_for_lconv_inputr  s         rh   r   "Gemma3nAudioConformerBlock.forward  s    ..?..I#1/*9<]<]^`<a<d<d!!=
 +
' ,,'FG,,_=++o8N8N7NPTPfPfg?+rz   )rQ  r   rS  rP  rR  r  r-  r   s   @rh   rN  rN    s@    	<1 	<u|| UEUEU Z_ZfZf  rz   rN  c                       \ rS rSrSrg)Gemma3nTextScaledWordEmbeddingi
  r\   Nr{   r|   r}   r~   r   r\   rz   rh   rZ  rZ  
      rz   rZ  c                   n   ^  \ rS rSrSrS\4U 4S jjrS\R                  S\R                  4S jr	Sr
U =r$ )	Gemma3nTextLaurelBlocki  z Learned Augmented Residual Layerr   c                   > [         TU ]  5         Xl        [        R                  " U R                  R
                  U R                  R                  SS9U l        [        R                  " U R                  R                  U R                  R
                  SS9U l        [        U R                  R
                  U R                  R                  S9U l        g )NFr   r>  )r   r   r   r   r  r?   rR   linear_leftlinear_rightr   r   post_laurel_normr&  s     rh   r   Gemma3nTextLaurelBlock.__init__  s    99T[[%<%<dkk>U>U\abIIdkk&=&=t{{?V?V]bc .t{{/F/FDKKLdLd erz   r   r   c                 p    U R                  U5      nU R                  U5      nU R                  U5      nX-   $ r   )r`  ra  rb  )rb   r   laurel_hidden_statesnormed_laurel_hidden_statess       rh   r   Gemma3nTextLaurelBlock.forward  s@    -1-=-=m-L-1->->?S-T&*&;&;<P&Q#::rz   )r   r`  ra  rb  )r{   r|   r}   r~   r   r2   r   r   r   r   r   r   r   s   @rh   r^  r^    s5    *f0 f;U\\ ;ell ; ;rz   r^  c                      ^  \ rS rSrSS\S\4U 4S jjjrS\R                  S\R                  4S jr	S\R                  S\R                  4S	 jr
S
rU =r$ )Gemma3nTextMLPi   r   	layer_idxc                 t   > [         TU ]  U5        UR                  U   U l        UR                  U   U l        g r   )r   r   rB   rS   activation_sparsityrb   r   rj  r   s      rh   r   Gemma3nTextMLP.__init__!  s6     !'!9!9)!D#)#E#Ei#P rz   r   r   c                     U R                  U5      nU R                  S:  a  U R                  U5      nU R                  U5      nU R	                  U5      nU R                  X4-  5      nU$ )Nr[   )	gate_projrl  _gaussian_topkact_fnup_proj	down_proj)rb   r   rp  activationsrs  rt  s         rh   r   Gemma3nTextMLP.forward&  sa    NN=1	##c)++I6Ikk),,,}-NN;#89	rz   inputsc                    [         R                  " U R                  [         R                  UR                  S9n[         R
                  R                  R                  SS5      nUR                  U5      nUR                  UR                  5      n[         R                  " USSS9n[         R                  " USSSS9nXVU-  -   n[        R                  R                  X-
  5      $ )	Nr  r  r   rV   r   Tr  F)r   r   unbiased)r   rc  rl  r  r  distributionsnormalNormalicdfr#  r  r   stdr   r/  relu)rb   rw  target_sparsity_tensornormal_diststd_multiplierinputs_mean
inputs_stdcutoff_xs           rh   rq  Gemma3nTextMLP._gaussian_topk/  s    !&d.F.Femmdjdqdq!r ))00771='2'7'78N'O',,V\\:jjR>YYv2teL
n!<<}}!!&"344rz   )rl  rB   )r   )r{   r|   r}   r~   r2   r   r   r   r   r   rq  r   r   r   s   @rh   ri  ri     s[    Q0 QS Q Q
U\\ ell 5U\\ 5ell 5 5rz   ri  c                   n  ^  \ rS rSrSrS\4U 4S jjrS\R                  S\R                  4S jr	S\R                  S\R                  4S	 jr
S
\R                  S\R                  S\R                  4S jrS\R                  S\R                  4S jrS\R                  S\R                  4S jrSrU =r$ )Gemma3nTextAltUpi@  a  Alternating Updates (AltUp)

The AltUp module wraps transformer layers. The `predict` step modifies the
input to the transformer layer, and the `correct` step propagates the output
of the transformer layer to the sparsely updated dimensions.

See more in the research paper:

https://proceedings.neurips.cc/paper_files/paper/2023/file/f2059277ac6ce66e7e5543001afa8bb5-Paper-Conference.pdf
r   c                 2  > [         TU ]  5         Xl        [        R                  " [
        R                  " U R                  R                  5      5      U l        [        R                  " U R                  R                  U R                  R                  SS9U l        [        R                  " U R                  R                  U R                  R                  S-  SS9U l        [        R                  " U R                  R                  U R                  R                  SS9U l        [        U R                  R                  U R                  R                  S9U l        U R#                  S[
        R$                  " U R                  R                  S-  5      SS9  g )NFr   r   r>  router_input_scale      r  )r   r   r   r   r   r   r]  r?   correct_output_scaler  rN   correction_coefsprediction_coefsmodality_routerr   r   router_normr  rc  r&  s     rh   r   Gemma3nTextAltUp.__init__L  s   $&LLT[[=T=T1U$V! "		$++*F*FHdHdkp q "		$++*F*FHdHdfgHgns t!yy)@)@$++B^B^ejk)$++*A*At{{G_G_`15<<@W@WY]@]3^kpqrz   rp  r   c                     U R                  U5      U R                  -  nU R                  U5      n[        R                  " UR                  5       5      R                  U5      $ r   )r  r  r  r   r  r   r   )rb   rp  router_inputsrouteds       rh   compute_router_modalities*Gemma3nTextAltUp.compute_router_modalitiesV  sM    ((+d.E.EE%%m4zz&,,.)11!44rz   r   c                    U R                  XR                  R                     5      nU R                  (       ap  U R                  R                  bY  U R
                  R                  R                  R                  U R                  R                  * U R                  R                  5        U R                  U5      R                  " / UR                  SS QU R                  R                  PU R                  R                  P76 R                  SSSS5      n[        R                  " UR                  SSSS5      U5      nUR                  SSSS5      nXA-  nUR                  5       R!                  U5      $ )a  Predicts the output of a layer using a trainable map.

Args:
    hidden_states: A 4D tensor of shape `[num_altup_inputs, batch_size, num_tokens, hidden_size]` derived by
        stacking the input embeddings and preprocessing the last `num_altup_inputs - 1` matrices.

Returns:
    A 4D tensor of shape `[num_altup_inputs, batch_size, num_tokens, hidden_size]` containing the predictions.
Nr   r   rV   r   r   )r  r   rJ   trainingrK   r  r   dataclamp_r1  r?  rN   rA  r   rB  r|  r   )rb   r   
modalities	all_coefspredictionss        rh   predictGemma3nTextAltUp.predict[  s?    33M++B^B^4_`
==T[[88D!!((--44dkk6Q6Q5QSWS^S^SnSno !!*-W i &&s+i-1[[-I-IiKO;;KgKgiWQ1a  	 ll=#8#8Aq!#DiP!))!Q15$%%'//>>rz   r  	activatedc                    U R                  U5      nX!U R                  R                     -
  nUR                  U R                  R                  SSS5      nU R
                  (       a  U R                  R                  b{  U R                  R                  R                  U R                  R                  * U R                  R                  5      n[        R                  R                  R                  X5SS9S-   nOU R                  U5      S-   nUR                  SSS5      R                  S5      n[        R                   " XF5      nXq-  nUR#                  5       R%                  U5      $ )a  Corrects the predictions relative to the

Args:
    predictions: A 4D tensor of shape `[num_altup_inputs, batch_size, num_tokens, hidden_size]` derived by
        stacking the input embeddings and preprocessing the last `num_altup_inputs - 1` matrices.
    activated: A 3D tensor of shape `[batch_size, num_tokens, hidden_size]` containing the activated inputs.

Returns:
    A 4D tensor of shape `[num_altup_inputs, batch_size, num_tokens, hidden_size]` correcting the original
        predictions relative to the activated input embeddings.
rV   Nr   r  r   r   r   )r  r   rJ   repeatrN   r  rK   r  r   r  r   r   r/  linearrA  r  mulr|  r   )rb   r  r  r  
innovationr   r  	correcteds           rh   correctGemma3nTextAltUp.correctw  s$    33I>
T[[-I-I!JJ
&&t{{'C'CQ1M
==T[[88D**11779T9T8TVZVaVaVqVqrF++22:D2QTWWI--j9C?I
 %%aA.88<	IIj4	 	##%--i88rz   r  c                 p    UR                  U R                  5      U R                  -  R                  U5      $ )z
This is only defined as the `forward` so that accelerate hooks can move correctly `correct_output_scale`
(which is a nn.Parameter, not a Module) between devices when offloading. It is otherwise only used in
`scale_corrected_output`
)r   r  rb   r  s     rh   r   Gemma3nTextAltUp.forward  s2     !!$";";<t?X?XXaabkllrz   c                 $    U R                  U5      $ )zMScales the provided 3D tensor of shape [batch_size, num_tokens, hidden_size].)r   r  s     rh   scale_corrected_output'Gemma3nTextAltUp.scale_corrected_output  s    ||I&&rz   )r   r  r  r  r  r  )r{   r|   r}   r~   r   r2   r   r   r   r  r  r  r   r  r   r   r   s   @rh   r  r  @  s    	r0 r55<< 5ELL 5
?U\\ ?ell ?895<< 9ELL 9U\\ 9>m m%,, m' ' ' 'rz   r  rp  r"  r!  unsqueeze_dimc                 l    UR                  U5      nUR                  U5      nX-  [        U 5      U-  -   $ )a$  Applies Rotary Position Embedding to the query and key tensors.

Args:
    x (`torch.Tensor`): The tensor to embed.
    cos (`torch.Tensor`): The cosine part of the rotary embedding.
    sin (`torch.Tensor`): The sine part of the rotary embedding.
    unsqueeze_dim (`int`, *optional*, defaults to 1):
        The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
        sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
        that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
        k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
        cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
        the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
Returns:
    `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
)r  r#   )rp  r"  r!  r  s       rh   apply_rotary_pos_embr    s6    " --
&C
--
&CGA,--rz   c                   *  ^  \ rS rSrS\S\4U 4S jjr  SS\R                  S\R                  S\R                  S-  S	\	S-  S
\
\\\R                  \R                  4   4   S-  S\\   S\\R                  \R                  S-  4   4S jjrSrU =r$ )Gemma3nTextAttentioni  r   rj  c                   > [         TU ]  5         Xl        X l        [	        US5      (       a  UR
                  U   OS U l        U R                  S:H  U l        U R                  (       a  UR                  OS U l        [        USUR                  UR                  -  5      U l        UR                  UR                  -  U l        SU l        U R                  R                   U l        SU l        U R                  R$                  U R                  R&                  -
  nX#s=:  =(       a    S:  Os  U l        UR
                  S U nU R(                  (       a@  [+        U5      S-
  US S S2   R-                  UR
                  U   5      -
  U l        S	U l        OBS U l        U[+        U5      S-
  US S S2   R-                  UR
                  U   5      -
  :H  U l        [2        R4                  " UR                  UR                  U R                  -  UR6                  S
9U l        [;        UR                  UR<                  S9U l        U R(                  (       d  [2        R4                  " UR                  UR                  U R                  -  UR6                  S
9U l         [2        R4                  " UR                  UR                  U R                  -  UR6                  S
9U l!        [;        UR                  UR<                  S9U l"        [;        UR                  UR<                  S	S9U l#        [2        R4                  " UR                  U R                  -  UR                  UR6                  S
9U l$        g )NrH   rY   r  r  Tr   rV   r   Fr   )r   r   )r   r   r   )%r   r   r   rj  hasattrrH   
layer_type
is_slidingrG   getattrr?   num_attention_headsr  rE   num_key_value_groupsscalingattention_dropout	is_causalrD   rP   is_kv_shared_layerr^   indexkv_shared_layer_indexstore_full_length_kvr   r  attention_biasr_  r   r   q_normr`  ra  k_normv_normo_proj)rb   r   rj  first_kv_shared_layer_idxprev_layersr   s        rh   r   Gemma3nTextAttention.__init__  s   ";B6=;Y;Y&,,Y7_c//-@@7;f33D
F4F4F&JdJd4de$*$>$>&B\B\$\!!%!>!>$(KK$A$ADKKDdDd$d!"+"L"L1"L(()C*CD""),[)9A)=DbD@Q@W@WX^XjXjktXu@v)vD&(-D%)-D&(1S5E5IKX\Z\X\L]LcLc""9-M 6 )D% ii : :T]] JQWQfQf
 %f>Q>QR &&))""F$>$>$NU[UjUjDK ))""F$>$>$NU[UjUjDK )V__&BUBUVDK(V__&BUBUbghDKii&&68J8JQWQfQf
rz   Nr   position_embeddingsattention_maskpast_key_valuesshared_kv_statesrc   r   c                    UR                   S S n/ UQSPU R                  R                  P7nUu  pU R                  U5      R	                  U5      nU R                  U5      n[        XU
SS9nUR                  SS5      nU R                  (       aG  XPR                     u  pUR                  UR                  5      nUR                  UR                  5      nOU R                  U5      R	                  U5      nU R                  U5      n[        XU
SS9nUR                  SS5      nU R                  U5      R	                  U5      nU R                  U5      nUR                  SS5      nUb/  U R                  (       d  UR!                  XU R"                  5      u  pU R$                  (       a  X4XPR"                  '   [&        R(                  " U R                  R*                  [,        5      nU" U UUUU4U R.                  (       a  U R0                  OSU R2                  U R4                  S.UD6u  nnUR6                  " / UQSP76 R9                  5       nU R;                  U5      nUU4$ )Nr   r   )r  rV   r[   )dropoutr  rG   )r?  r   r  r_  r  r  r  	transposer  r  r  r  r`  r  ra  r  rr   rj  r  r   get_interface_attn_implementationr"   r  r  r  rG   r1  r|  r  )rb   r   r  r  r  r  rc   input_shapehidden_shaper"  r!  r  r  r  attention_interfaceattn_outputattn_weightss                    rh   r   Gemma3nTextAttention.forward  s:    $))#2.??b?$++*>*>?&{{=166|D{{<0+LsRST#--a3
 ""'78R8R'S$J#|':':;J'??<+>+>?L]388FJZ0J-jsRSTJ#--a3J;;}5::<HL;;|4L'11!Q7L&t/F/F'6'='=jX\XfXf'g$J$$/9/G^^,(?(M(MKK,,.E)
 %8
%
 /3mmD**LL..
%
 
%
!\ "));;;;FFHkk+.L((rz   )r  r   r  r  r  r  r  r`  r  rj  r  r  r  r  r_  r  rG   r  r  ra  NN)r{   r|   r}   r~   r2   r   r   r   r   r   r   r   r   r   r   r   r   r   s   @rh   r  r    s    .
0 .
S .
j )-PT;)||;) #\\;) t+	;)
 ;) sE%,,*D$EEFM;) +,;) 
u||U\\D00	1;) ;)rz   r  c                     ^  \ rS rSrS\S\4U 4S jjr      SS\R                  S\R                  S\R                  S	\	\\
\R                  \R                  4   4   S-  S
\R                  S-  S\R                  S-  S\S-  S\\   S\
\R                  \
\R                  \R                  4   S-  4   4S jjrSrU =r$ )Gemma3nTextDecoderLayeri(  r   rj  c                   > [         TU ]  X5        [        XS9U l        UR                  U l        [
        UR                     U l        [        U5      U l	        [        U5      U l        [        X5      U l        [        R                  " U R                   U R                  SS9U l        [        R                  " U R                  U R                   SS9U l        ['        U R                   UR(                  S9U l        g )N)rj  Fr   r>  )r   r   ri  mlprA   r
   hidden_activationrr  r  altupr^  laurelr  	self_attnr   r  r?   per_layer_input_gateper_layer_projectionr   r   post_per_layer_input_normrm  s      rh   r    Gemma3nTextDecoderLayer.__init__)  s    +!&>+1+M+M(V556%f-
,V4-f@$&IId.>.>@`@`gl$m!$&IId.N.NPTP`P`gl$m!)78H8HfNaNa)b&rz   Nr   r  per_layer_inputr  r  position_idsr  rc   r   c           
      z   U R                   R                  U5      n	XR                  R                     n
U R	                  U
5      nU R                  U5      nU R                  " SUUUUUUS.UD6u  pU R                  U5      nX-   nX-   [        R                  " S5      -  nU R                  U5      nU R                  U5      nU R                  U5      nUU-   nU R                   R                  U	U5      nUU R                  R                     R                  5       nU R                  R                  (       a  U R                   R!                  U5      nU R#                  U5      nU R%                  U5      n[&        R(                  " UU5      nU R+                  U5      nU R-                  U5      nUSS === U-  sss& U$ )N)r   r  r  r  r  r  r   rV   r\   )r  r  r   rJ   input_layernormr  r  post_attention_layernormr  sqrtpre_feedforward_layernormr  post_feedforward_layernormr  rd  rL   r  r  rr  r   multiplyr  r  )rb   r   r  r  r  r  r  r  rc   r  active_predictionactive_prediction_normedlaurel_outputr#  rC  
attn_gatedattn_laurel	attn_normattn_ffwattn_ffw_normattn_ffw_laurel_gatedcorrected_predictionsfirst_predictions                          rh   r   Gemma3nTextDecoderLayer.forward7  s    jj((7'(D(DE#'#7#78I#J $<=.. 
2)-% 3+
 
 ,,T2&-
!1TYYq\A22;?	88I&77A +m ; $

 2 2;@U V01M1MNTTV;;**#zz@@AQR  445EF;;'78 >>*:OL  445EF99:JKab!%55!$$rz   )	rr  r  rA   r  r  r  r  r  r  )NNNNNN)r{   r|   r}   r~   r2   r   r   r   r   r   r   
LongTensorr   r   r   r   r   r   r   r   s   @rh   r  r  (  s   c0 cS c" -1(,PT.204(,3%||3% #\\3% 	3%
 sE%,,*D$EEFM3% t+3% &&-3% 3% +,3% 
u||U5#4#4e6G6G#GH4OO	P3% 3%rz   r  c            	          ^  \ rS rSr% \\S'   SrSS/rS/r\	\
S.r\R                  " 5       S 5       rS	 rS
 r   SS\S-  S\S-  S\S\R(                  4U 4S jjjr   SS\S-  S\S-  S\4S jjrSrU =r$ )Gemma3nPreTrainedModelim  r   )imagetextaudior  r  r  )r   
attentionsc                    [         R                  " X5        [        U[        5      (       a"  [        R
                  " UR                  5        GO[        U[        5      (       a  [        R                  " UR                  5        UR                  S-  nS[        R                  R                  R                  [        R                  " S5      5      -  n[        R                   " UR"                  X#-  5        [        R$                  " UR&                  UR(                  5        [        R                   " UR*                  UR-                  5       5        GO[        U[.        5      (       a-  [        R$                  " UR0                  UR2                  5        GO[        U[4        5      (       aZ  [        R                  " UR6                  5        [        R$                  " UR8                  U R:                  R<                  S-  5        GO[        U[>        5      (       a  Su  pEUR@                  S-  n[B        RD                  " [G        U5      [G        U5      -  5      [I        US-
  S5      -  nU[        RJ                  " [        RL                  " U5      U* -  5      -  n[        R                   " URN                  URG                  5       RQ                  S5      RQ                  S5      5        GO2[        U[R        5      (       ag  [        R$                  " URT                  U R<                  S-  5        [        R$                  " URV                  S[B        RX                  " S	5      -  5        O[        U[Z        5      (       a  UR\                   H  n	UR^                  n
UR`                  U	   S
:w  a  [b        UR`                  U	      n
U
" UR:                  U	S9u  p[        R                   " [e        X S35      U5        [        R                   " [e        X S35      U5        M     [g        US5      (       a6  [        R$                  " URh                  U R:                  Rh                  5        g g )Nr   r  r[   r  )r  r8   r   rV   r          @rm   )r  	_inv_freq_original_inv_freqr   )5r   _init_weightsr]   r  initones_r   rR  zeros_r^  r  r   r   r/  rb  rc  copy_rT  	constant_rV  rZ  rU  rf  rZ  embed_scalescalar_embed_scaler  r  r  r   r?   r   r  r  r  r   r	  r  r  r  r  Gemma3nTextModelper_layer_projection_scaleper_layer_input_scaler  Gemma3nRotaryEmbeddingrH   compute_default_rope_parametersrl   r   r  r  r   )rb   modulerT  rg  r  r  r  r  r  r  rope_init_fncurr_inv_freqrC  s                rh   r  $Gemma3nPreTrainedModel._init_weightsw  s	   %%d3f=>>JJv}}% 566KK,,-oot+G!4!4!=!=ell3>O!PPLJJv~~w'=>NN6>>6+K+KLJJv55v7\7\7^_ >??NN6--v/H/HI 011KK334NN644dkk6M6Mt6ST EFF+5(M#__1N&*hhu]/CeMFZ/Z&[^a"A_ '# +UYYu||N7SWnVn7n-ooNJJv,,n.B.B.D.N.Nq.Q.[.[\].^_ 011NN6<<d>N>NPT>TUNN677TYYs^9KL 677$00
%EE##J/9<#6v7G7G
7S#TL#/*#U 

76\+CDmT

76\9K+LM}] 1 6.//NN633T[[5R5RS 0rz   c                 .    U R                   R                  $ r   
base_modelembed_tokens_per_layerrb   s    rh   get_per_layer_input_embeddings5Gemma3nPreTrainedModel.get_per_layer_input_embeddings  s    555rz   c                 $    XR                   l        g r   r  rb   r  s     rh   set_per_layer_input_embeddings5Gemma3nPreTrainedModel.set_per_layer_input_embeddings  s    16.rz   Nnew_num_tokenspad_to_multiple_ofmean_resizingr   c                 J   > [         TU ]  UUUS9nU R                  XU5        U$ )N)r#  r$  r%  )r   resize_token_embeddings_resize_per_layer_embeddings)rb   r#  r$  r%  inputs_embedsr   s        rh   r'  .Gemma3nPreTrainedModel.resize_token_embeddings  s:     7)1' 8 

 	)).m\rz   c                    U R                   U R                  R                  5       l        U R                  R                  5       R                  (       a  U R                  5       nU R                  XAX#5      n[        US5      (       a  UR                  n[        XV5        UR                  UR                  R                  5        U R                  U5        g g )N_hf_hook)r;   r   get_text_configr=   rA   r  _get_resized_embeddingsr  r,  r/   requires_grad_r   r   r!  )rb   r#  r$  r%  r  new_embeddings_per_layerhooks          rh   r(  3Gemma3nPreTrainedModel._resize_per_layer_embeddings  s     DH??##%@;;&&(DD%)%H%H%J"'+'C'C&8J($ -z::-66"#;B$334J4Q4Q4_4_`//0HI Erz   r\   )NNT)r{   r|   r}   r~   r   r   input_modalities_skip_keys_device_placement_no_split_modulesr  r  _can_record_outputsr   no_gradr  r  r!  r   r   r   	Embeddingr'  r(  r   r   r   s   @rh   r  r  m  s    1#46H"I230*
 ]]_%T %TN67
 &*)-"	d
  $J 	
 
   &*)-"	Jd
J  $JJ 	J Jrz   r  c                      ^  \ rS rSr% Sr\\S'   SrSrS\4U 4S jjr	\
\S\R                  S\R                  S\\   S	\\-  4S
 j5       5       rSrU =r$ )Gemma3nAudioEncoderi  zp
An audio encoder based on the [Universal Speech Model](https://huggingface.co/papers/2303.01037) architecture.
r   	audio_melr  c                 
  > [         TU ]  U5        Xl        [        U5      U l        [
        R                  " [        UR                  5       Vs/ s H  n[        U5      PM     sn5      U l
        U R                  5         g s  snf r   )r   r   r   r  subsample_conv_projectionr   
ModuleListr`   r   rN  	conformer	post_init)rb   r   rC  r   s      rh   r   Gemma3nAudioEncoder.__init__  si     )LV)T&9>v?\?\9]^9]A'/9]^
 	 _s   B r   rc   r   c                 >   U R                  U5      nUR                  S   nSn[        [        U R                  R
                  5      5       H!  nX`R                  R
                  U   S   -  nM#     [        R                  " XRR                  S9U-  n[        R                  " XR                  S   S-
  S9nUR                  S:  a?  UR                  S:X  a/  UR                  S5      R                  UR                  S   S5      nOcUR                  UR                  :X  aI  UR                  S   S:X  a6  UR                  S   S:w  a#  XXR                  S   :X  a  UR                  S5      n[        R                  " USU5      n	U R                   H  n
U
" XI5      nM     U R                  R                  S:  a@  USS2SSU R                  R                  24   nU	SS2SSU R                  R                  24   n	UR!                  U	R                  S5      S5      n[#        UU	S9$ )	ad  Encodes a batch of MELs.

Args:
    audio_mel: a torch.Tensor of shape [batch, num_frames, num_channels,
      mel_bins].

Returns:
    audio_encodings: a torch.Tensor of shape
        `[batch_size, self.config.audio_soft_tokens_per_image,
        self.config.audio_config.hidden_size]`
    audio_mel_mask: a torch.BoolTensor of shape [batch, num_frames].
rV   r   r=  )r	  r   Nr[   )last_hidden_stater   )r=  r?  r`   r^   r   r   r   r  r  r  r  r  expandgatherr?  r   masked_fillr   )rb   r;  r   rc   r  t_subtime_stride_productstride_pair_idxindicescurrent_maskblocks              rh   r   Gemma3nAudioEncoder.forward  s   " 88C  %%a($S)J)J%KLO;;#D#D_#UVW#XX  M ,,u-B-BCFYY++g+?+?+BQ+FG "w||q'8''*11.2F2Fq2I2NG7<</$$Q'1,a A%q)) ''*G||NAw?^^E#OBO $ ;;,,q0-a1UDKK4U4U1U.UVO'+Odkk.O.O+O(OPL)55l6L6LR6PRUV--'
 	
rz   )r   r?  r=  )r{   r|   r}   r~   r   r   r   main_input_namer3  r   r   r   r   r   r   r   r   r   r   r   r   r   r   s   @rh   r:  r:    sz     !O1   8
8
7<7G7G8
SYZlSm8
	/	/8
   8
rz   r:  c                       \ rS rSrSrg)r  i  r\   Nr[  r\   rz   rh   r  r    r\  rz   r  zBThe base Gemma 3n language model without a language modeling head.custom_introc                     ^  \ rS rSr% \\S'   S\4U 4S jjrS\R                  S\R                  4S jr
 SS\R                  S	\R                  S-  S\R                  4S
 jjr\\" SS9\       SS\R                  S-  S	\R                  S-  S\R                  S-  S\R                  S-  S\S-  S\R                   S-  S\S-  S\\   S\4S jj5       5       5       rSrU =r$ )r  i  r   c                 <  > [         TU ]  U5        UR                  U l        UR                  U l        [	        UR
                  UR                  UR                  -  U R                  UR                  S-  S9U l        [        R                  " U R                  UR                  UR                  -  SS9U l        [        UR                  UR                  S9U l        [        R                  " [!        UR                  5       Vs/ s H  n[#        X5      PM     sn5      U l        [        UR                  UR                  S9U l        [        R                  " [!        SU R(                  R*                  5       Vs/ s H-  n[        R                  " U R                  U R                  SS9PM/     sn5      U l        [        R                  " [!        SU R(                  R*                  5       Vs/ s H-  n[        R                  " U R                  U R                  SS9PM/     sn5      U l        U R1                  S[2        R4                  " U R                  S-  5      SS	9  U R1                  S
[2        R6                  " [2        R4                  " S5      5      SS	9  / U l        [;        U R$                  5       HT  u  pEUR<                  R>                  (       d  M"  U R8                  RA                  S Vs/ s H  nSU SU 3PM     sn5        MV     g s  snf s  snf s  snf s  snf )Nr   )r  Fr   r>  rV   r  r   r  r  r  )r`  ra  r  r  zlayers.z.self_attn.)!r   r   r?   rA   rZ  r=   rD   padding_idxr  r   r  per_layer_model_projectionr   r   per_layer_projection_normr>  r`   r  layersr  r   rN   altup_projectionsaltup_unembed_projectionsr  r   rc  r  "_keys_to_ignore_on_load_unexpected	enumerater  r  extend)rb   r   rj  rC  re   layernamer   s          rh   r   Gemma3nTextModel.__init__  s    !--+1+M+M(&D--$$v'I'II::C?	'
# +-))$$v'I'II+
' *88Z8Z`f`s`s)t&mmINvOgOgIhiIhI$V7Ihi
 #6#5#56;N;NO	!#PUVWY]YdYdYuYuPvwPv1RYYt'')9)9FPvw"
 *,PUVWY]YdYdYuYuPvwPv1RYYt'')9)9FPvw*
& 	95<<HXHXZ^H^;_lqr4ekk%,,sBS6Tafg 35/!$++.HA11177>>@hi@hwqcTF3@hi /% j x x js   L
84L*4L,L
	input_idsr   c                     U R                  U5      R                  " / UR                  QU R                  R                  PU R
                  P76 $ r   )r  r1  r?  r   rD   rA   )rb   r`  s     rh   get_per_layer_inputs%Gemma3nTextModel.get_per_layer_inputsL  sN    **95== 
__
KK))
 ,,
 	
rz   Nr)  per_layer_inputsc                    U R                  U5      nX0R                  R                  UR                  UR                  S9-  nUR
                  " / UR                  S S QU R                  R                  PU R                  P76 nU R                  U5      nUc  U$ UR                  UR                  :w  a   USS U R                  R                  2S S 24   nX2-   U R                  R                  UR                  UR                  S9-  $ )Nry  r   .)rU  r  r  r  r  r1  r?  r   rD   rA   rV  r  )rb   r)  rd  r  s       rh   project_per_layer_inputs)Gemma3nTextModel.project_per_layer_inputsS  s&   
 .2-L-L]-[ ? ? B B%%.B.I.I !C !
 	
  4;;  
  "% 
KK)) 
 ,, 

  $==>RS#''%%)9)?)??/5Tt{{7T7T5TVW0WX$74;U;U;X;X%%.B.I.I <Y <
 
 	
rz   F)tie_last_hidden_statesr  r  r  	use_cacherc   c           	      v   USL USL-  (       a  [        S5      eUb"  U R                  U5      nU R                  U5      nU R                  Xb5      nU(       a  Uc  [	        U R
                  S9nUcU  Ub  UR                  5       OSn	[        R                  " UR                  S   UR                  S9U	-   nUR                  S5      n[        U=n
[        5      (       d)  U R
                  UUUUS.n[        S0 UD6[        S0 UD6S.n
Un[        R                   " US	-  S
SS9S-  n[        R"                  " S5      nU/n[%        SU R
                  R&                  5       H  nU R(                  US-
     " U5      nUR+                  UR,                  UR                  S9n[        R                   " US	-  S
SS9n[        R.                  " [        R0                  " UUR+                  UR                  5      5      5      nUU-  U-  nUR3                  U5        M     [        R4                  " USS9n0 n[7        U R
                  R8                  5       H  nU R;                  UUU5      UU'   M     0 n[=        U R>                  SU R
                  R@                   5       HZ  u  nnXR
                  R8                  U      nUSS2SS2USS24   nU" UUU R
                  R8                  U      U4UUUUS.UD6nM\     [        R                   " US   S	-  S
SS9S-  nUS   /n[%        SU R
                  R&                  5       H  nU RB                  US-
     " UU   5      nUR+                  UR,                  UR                  S9n[        R                   " US	-  S
SS9n[        R.                  " [        R0                  " UUR+                  UR                  5      5      5      nUU-  U-  nUR3                  U5        M     [        R4                  " U5      n[        R                   " USS9nU RE                  U5      n[G        UUS9$ )z
per_layer_inputs (torch.Tensor, *optional*, defaults to None):
    Pre-computed per-layer embeddings. If None, they are derived from input_ids if provided.
N:You must specify exactly one of input_ids or inputs_embeds)r   r   rV   r=  )r   r)  r  r  r  )rX   rY   r   r   Tr  r   gh㈵>ry  r  )r  r  r  r  )rC  r  r\   )$r_   embed_tokensrb  rf  r   r   get_seq_lengthr   r  r?  r  r  r]   r   r   r   r   rc  r`   rN   rX  r  r  r  maximumr  stacksetrH   
rotary_embr[  rW  rD   rY  r  r   )rb   r`  rd  r  r  r  r)  ri  rc   past_seen_tokenscausal_mask_mappingmask_kwargshidden_states_0target_magnitudeepsilon_tensortemp_hidden_statesre   
altup_projcurrent_hidden_statenew_magnituder   r  r  r  decoder_layercausal_maskr  altup_unemb_projs                               rh   r   Gemma3nTextModel.forwardo  s/   $ -t";<YZZ  --i8M#88C88Y0*$++>OCRC^==?de <<(;(;A(>}G[G[\_ooL'11!4L ?-FF ++!."0#2 ,K #5"C{"C%F%U%U# ( !::oq&8b$OSVVd+-.q$++667A//A6GJ#-==7L7LUeUlUl=#m !JJ';Q'>BPTUM!JJu}}]NDUDUVfVmVmDn'opM#7:J#J]#Z %%&:; 8 $6A> dkk556J.2oom\[e.f
+ 7  )$++6U8U8U*V WA}-kk.E.Ea.HIK.q!Qz:O)#DKK$;$;A$>?	 "2*) /	 	M	 !X  !::mA&6!&;TRVYY+A./q$++667A-1-K-KAPQE-RS`abSc-d#3#6#6_=R=R[k[r[r#6#s !JJ';Q'>BPTUM!JJu}}]NDUDUVfVmVmDn'opM#7:J#J]#Z %%&:; 8 $67

=a8		-0&++
 	
rz   )
rZ  rX  rY  r  r?   rA   rW  r  rU  rV  r   )NNNNNNN)r{   r|   r}   r~   r2   r   r   r   r  r   rb  rf  r   r   r   r   r   r   r   r   r   r   r   r   r   s   @rh   r  r    sO   +0 +Z
e.>.> 
5<< 
 15
||
  ,,-
 
	
8  E2 .204.204(,26!%k
##d*k
  ,,-k
 t+	k

 &&-k
 k
 ((4/k
 $;k
 +,k
 
!k
  3  k
rz   r  z?The base Gemma 3n language model with a language modeling head.c                   0   ^  \ rS rSrS\4U 4S jjrSrU =r$ )Gemma3nForCausalLMi  r   c                    > [         TU ]  U5        U R                  R                   Vs/ s H  nSU 3PM
     snU l        g s  snf Nzmodel.r   r   modelrZ  rb   r   r^  r   s      rh   r   Gemma3nForCausalLM.__init__  D      )-

(U(U3
(UfTFO(U3
/ 3
   ArZ  )r{   r|   r}   r~   r2   r   r   r   r   s   @rh   r  r    s    
0 
 
rz   r  c                      ^  \ rS rSrSrS\\-  S\4U 4S jjr  SS\	R                  S-  S\	R                  S-  S	\	R                  4S
 jjrSrU =r$ )Gemma3nMultimodalEmbedderi  zQEmbeds token ids or soft tokens for multimodal content into language model space.multimodal_configr   c                 ^  > [         TU ]  5         UR                  U l        UR                  U l        UR                  U l        UR                  U l        UR                  U l        [        R                  " U R                  U R                  5      U l        [        U R                  U R
                  S9U l        [        U R                  U R
                  S9U l        [        R                  " U R                  U R                  SS9U l        [        U R                  U R
                  SS9U l        g )Nr>  Fr   )r   r   )r   r   r?   multimodal_hidden_sizer   r   r   r;   text_hidden_sizer   r8  	embeddingr   hard_embedding_normsoft_embedding_normr  embedding_projectionembedding_post_projection_norm)rb   r  r   r   s      rh   r   "Gemma3nMultimodalEmbedder.__init__  s    
 	&7&C&C#$11-::+66 + 7 7doot7R7RS#1$2M2MSWS[S[#\ #1$2M2MSWS[S[#\ $&IId.I.I4K`K`gl$m!.<T=R=RX\X`X`mr.s+rz   Nr`  r)  r   c                     USL USL-  (       a  [        S5      eUb  U R                  U5      nO.U R                  XR                  -
  5      nU R	                  U5      nU R                  U5      nU R                  U5      $ )a  Embeds token ids or soft tokens for multimodal content into language model space.

Args:
    input_ids: A torch.LongTensor containing the token ids to embed. Values should be in the range
        `[vocab_offset, vocab_offset + vocab_size)`.
    inputs_embeds: A torch.Tensor containing the soft tokens to embed.

Returns:
    A torch.Tensor of embeddings with  shape `[batch_size, seq_len, self.config.text_config.hidden_size]`.
Nrk  )r_   r  r  r   r  r  r  )rb   r`  r)  emb_normhard_embemb_norm_projs         rh   r   !Gemma3nMultimodalEmbedder.forward   s     -t";<YZZ$//>H~~i2C2C&CDH//9H11(;22=AArz   )
r  r  r  r   r  r  r  r  r   r;   r  )r{   r|   r}   r~   r   r   r   r2   r   r   r  r   r   r   r   r   s   @rh   r  r    sq    [t-0CCt 't* .2-1B##d*B ||d*B 
	B Brz   r  z
    The base Gemma 3n model comprising a vision backbone, an audio backbone, and a language model without a
    language modeling head.
    c                     ^  \ rS rSrS\4U 4S jjrS rS r\\	" SS9S\
R                  S	\\   S
\\-  4S j5       5       r    SS\
R"                  S-  S\
R                  S-  S\
R                  S-  S\
R                  S-  4S jjr\           S S\
R"                  S-  S\
R                  S-  S\
R                  S-  S\
R&                  S-  S\
R&                  S-  S\
R"                  S-  S\S-  S\
R"                  S-  S\
R                  S-  S\
R"                  S-  S\S-  S\\   S
\4S jj5       r\\	" SS9S\
R&                  S\
R&                  S	\\   S
\\-  4S j5       5       rSrU =r$ )!Gemma3nModeli  r   c                   > [         TU ]  U5        U ?U ?UR                  R
                  U l        [        R                  " UR                  5      U l	        [        UR                  UR                  5      U l        [        UR                  UR                  5      U l        U R                  R                   Vs/ s H  nSU 3PM
     snU l        g s  snf )Nzlanguage_model.)r   r   multi_modal_projectortext_config_dtyper   r=   r   from_configr   audio_towerr  r   embed_visionembed_audiolanguage_modelrZ  r  s      rh   r   Gemma3nModel.__init__#  s     &"*0*<*<*W*W'$001D1DE5f6J6JFL^L^_4V5H5H&J\J\] 261D1D1g1g3
1godV$1g3
/ 3
s   7Cc                 .    U R                   R                  $ r   r  r  r  s    rh   r  +Gemma3nModel.get_per_layer_input_embeddings1  s    ""999rz   c                 $    XR                   l        g r   r  r   s     rh   r!  +Gemma3nModel.set_per_layer_input_embeddings4  s    5:2rz   zOProjects the last hidden state from the vision model into language model space.rP  pixel_valuesrc   r   c                    U R                   " S	USSS.UD6nUR                  nUR                  UR                  S   U R                  R
                  R                  U R                  R                  5      R                  SSS5      nX@R                  R
                  R                  S-  -  nU R                  US9Ul
        U$ )
NFT)r  r   return_dictr   r   rV   r   r)  r\   )vision_towerrC  r1  r?  r   r   r?   r   rA  r  pooler_output)rb   r  rc   vision_outputsrC  s        rh   get_image_featuresGemma3nModel.get_image_features7  s     **sQVdhslrs*<< .55##A&KK%%11KK44
 '!Q
	 	 	[[66BBCGG'+'8'8GX'8'Y$rz   Nr`  r)  image_featuresaudio_featuresc           	         Uc  X R                  5       " [        R                  " U R                  R                  [        R
                  UR                  S95      :H  nUR                  S5      nUU R                  5       " [        R                  " U R                  R                  [        R
                  UR                  S95      :H  R                  S5      nO0XR                  R                  :H  nXR                  R                  :H  nUR                  5       nUR                  S5      R                  U5      R                  UR                  5      nUbP  [        X%   R                  5       UR                  5       :H  SU SUR                  S   UR                  S   -   35        UR                  5       nUR                  S5      R                  U5      R                  UR                  5      nUbP  [        X&   R                  5       UR                  5       :H  SU SUR                  S   UR                  S   -   35        XV4$ )z
Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
equal to the length of multimodal features. If the lengths are different, an error is raised.
ry  r   z6Image features and image tokens do not match, tokens: z, features: r   rV   z6Audio features and audio tokens do not match, tokens: )get_input_embeddingsr   rc  r   r   longr  allr   r  r  	expand_asr  r   numelr?  )	rb   r`  r)  r  r  special_image_maskspecial_audio_maskn_image_tokensn_audio_tokenss	            rh   get_placeholder_mask!Gemma3nModel.get_placeholder_maskM  sN    !.2K2K2MT[[77uzzR_RfRfg3 " "4!7!7!;,,.LL!;!;5::VcVjVjk c"g  "+kk.H.H!H!*kk.H.H!H+//1/99"=GGVYYZgZnZno%"1779^=Q=Q=SSHHXXdeseyeyz{e|  @N  @T  @T  UV  @W  fW  eX  Y
 ,//1/99"=GGVYYZgZnZno%"1779^=Q=Q=SSHHXXdeseyeyz{e|  @N  @T  @T  UV  @W  fW  eX  Y
 "55rz   input_featuresr  input_features_maskr  r  token_type_idslabelsri  	lm_kwargsc                 T	   USL U	SL-  (       a  [        S5      eUGbz  U R                  5       " U5      n	[        R                  " US:  XR                  :  5      n[        R
                  " X[        R                  " U5      5      nU R                  R                  U5      n[        R                  " XR                  R                  :  XR                  R                  :  5      nU R                  R                  U R                  R                  -   S-
  n[        R
                  " UUU5      R                  U	R                  5      nU R                  US9nUR                  U	R                  U	R                  5      nUR!                  S5      R#                  U	5      n[        R
                  " UUU	5      n	XR                  R                  :  nU R                  R                  U R                  R                  -   S-
  n[        R
                  " UUU5      R                  U	R                  5      nU R                  US9nUR                  U	R                  U	R                  5      nUR!                  S5      R#                  U	5      n[        R
                  " UUU	5      n	OSnUbe  U R%                  USS9R&                  nUR                  U	R                  U	R                  5      nU R)                  XUS	9u  nnU	R+                  UU5      n	UGb>  UGb:  U R-                  X5) SS9nUR&                  nUR.                  n[        R0                  " U R                  S-
  //[        R2                  UR                  S
9nU R                  US9n [        R
                  " UR!                  S5      U U5      nUR4                  u  n!n"n#U R6                  R8                  U"-
  n$U R;                  U!U$U#5      n%[        R<                  " UU%4SS9nUR                  U	R                  U	R                  5      nU R)                  XUS9u  nn&U	R+                  U&U5      n	U R                  " SSUUUUU	USS.UD6n'[?        U'R@                  U(       a  U'RB                  OSU'RD                  U'RF                  Ub  WOSUb  WS9$ SS9$ )a  
input_features_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
    Attention mask for `input_features` where non-zero values mark valid audio frames.
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.

Example:

```python
>>> from PIL import Image
>>> import httpx
>>> from io import BytesIO
>>> from transformers import AutoProcessor, Gemma3nForConditionalGeneration

>>> model = Gemma3nForConditionalGeneration.from_pretrained("google/gemma3n2-3b-mix-224")
>>> processor = AutoProcessor.from_pretrained("google/gemma3n2-3b-mix-224")

>>> prompt = "Where is the cat standing?"
>>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
>>> with httpx.stream("GET", url) as response:
...     image = Image.open(BytesIO(response.read()))

>>> inputs = processor(images=image, text=prompt,  return_tensors="pt")

>>> # Generate
>>> generate_ids = model.generate(**inputs,)
>>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
"Where is the cat standing?\nsnow"
```
Nrk  r   rV   )r`  r   T)r  )r)  r  ry  r  )r)  r  )r`  rd  r  r  r  r)  ri  r  )rC  r  r   r  image_hidden_statesr   r\   )$r_   r  r   r  r=   r  
zeros_liker  rb  r  r   r  r;   r  r  r  r  r  r  r  r  masked_scatterget_audio_featuresr   rc  r  r?  r   r   rD  r   r   rC  r  r   r  )(rb   r`  r  r  r  r  r  r  r  r)  r  ri  r  per_layer_inputs_maskper_layer_inputs_tokensrd  vision_maskdummy_vision_token_idvision_input_idsvision_embedsexpanded_vision_mask
audio_maskdummy_audio_token_idaudio_input_idsaudio_embedsexpanded_audio_maskr  r  rC  audio_outputsr  audio_padding_toksaudio_padding_embsaudio_batch_sizeaudio_seq_lenaudio_embed_dimextra_padding_tokensextra_padding_featuresr  outputss(                                           rh   r   Gemma3nModel.forwardy  s   ` -t";<YZZ  557	BM %*$5$5i1niRqRqFq$r!&+kk2GTYTdTdenTo&p##22GGH_`  ++..;;;YIYIYIfIf=fK %)$5$5$B$BTEVEVEaEa$ade$e!${{;	CXY\\]j]q]qr --8H-IM),,]-A-A=CVCVWM#.#8#8#<#F#F}#U !KK(<m][M #&6&6&C&CCJ#'#3#3#@#@4CSCSC^C^#^ab#b #kk*iAUVYYZgZnZnoO++o+FL'??=+?+?ATATUL","6"6r":"D"D]"S!KK(;\=YM# #!44\t4TbbN+..}/C/C]EXEXYN$($=$=~ %> %! *889K^\M %*=*I 33NDXfj3kM*88N&55J "'!0C/D.EUZZ`n`u`u!v!%!1!1<N!1!O"[[)=)=b)ACUWefN?M?S?S<m_#';;#J#J]#Z %7%>%>?OQegv%w""YY8N'OUVWN+..}/C/C]EXEXYN$($=$=~ %> %!A! *889K^\M%% 

-)%+'

 

 *%777@G33d!//))2>2JPT2@2L
 	
 SW
 	
rz   zPProjects the last hidden state from the audio encoder into language model space.c                 n    U R                   " X4SS0UD6nU R                  UR                  S9nXTl        U$ )a  
input_features (`torch.FloatTensor]` of shape `(num_images, seq_length, num_features)`):
    The tensors corresponding to the input audio.
input_features_mask (`torch.FloatTensor]` of shape `(num_images, seq_length)`):
    The attention mask for the input audio.
r  Tr  )r  r  rC  r  )rb   r  r  rc   r  r  s         rh   r  Gemma3nModel.get_audio_features	  sN     9=8H8H9
=A9
EK9
 ''m6U6U'V&2#rz   )rZ  r  r  r  r=   )NNNN)NNNNNNNNNNN)r{   r|   r}   r~   r   r   r  r!  r   r   r   r   r   r   r   r   r  r  r  r   r   r   r   r   r   r  r   r   r   s   @rh   r  r    sa   
} 
:; !rs'' +, 
+	+	 t , .2263737*6##d**6 ((4/*6 ))D0	*6
 ))D0*6X  .21537.23704(,2626*.!%F
##d*F
 ''$.F
 ))D0	F

 t+F
 #\\D0F
 &&-F
 F
 ((4/F
 ((4/F
   4'F
 $;F
 ./F
 
$F
 F
P !st #\\ +,	
 
/	/ u rz   r  z
    The base Gemma 3n model comprising a vision backbone, an audio backbone, a language model, and a language modeling
    head.
    c                     ^  \ rS rSrSrS\4U 4S jjrS rS r\	\
            SS\R                  S-  S	\R                  S-  S
\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\S-  S\R                  S-  S\R                  S-  S\R                  S-  S\S-  S\\R                  -  S\\   S\4S jj5       5       r            SU 4S jjrS rSrU =r$ )Gemma3nForConditionalGenerationi	  Fr   c                    > [         TU ]  U5        U R                  R                   Vs/ s H  nSU 3PM
     snU l        g s  snf r  r  r  s      rh   r   (Gemma3nForConditionalGeneration.__init__"	  r  r  c                 6    U R                   R                  5       $ r   )r  r  r  s    rh   r  >Gemma3nForConditionalGeneration.get_per_layer_input_embeddings)	  s    zz88::rz   c                 :    U R                   R                  U5        g r   )r  r!  r   s     rh   r!  >Gemma3nForConditionalGeneration.set_per_layer_input_embeddings,	  s    

11%8rz   Nr`  r  r  r  r  r  r  r  r)  r  ri  logits_to_keepr  r   c                 \   U R                   " SUUUUUUUUU	U
USS.UD6nUR                  n[        U[        5      (       a  [	        U* S5      OUnU R                  USS2USS24   5      nU R                  R                  5       R                  =nb   UU-  n[        R                  " U5      nUU-  nSnU
b6  U R                  " UXR                  R                  5       R                  40 UD6n[        UUUR                  UR                  UR                   UR"                  UR$                  S9$ )a<  
input_features_mask (torch.Tensor, *optional*, defaults to None):
    The attention mask for the input audio.
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are
    ignored (masked), the loss is only computed for the tokens with labels in
    `[0, ..., config.text_config.vocab_size]`.

Example:

```python
>>> from PIL import Image
>>> import httpx
>>> from io import BytesIO
>>> from transformers import AutoProcessor, Gemma3ForConditionalGeneration

>>> model = Gemma3ForConditionalGeneration.from_pretrained("google/gemma-3-4b-it")
>>> processor = AutoProcessor.from_pretrained("google/gemma-3-4b-it")

>>> messages = [
...     {
...         "role": "system",
...         "content": [
...             {"type": "text", "text": "You are a helpful assistant."}
...         ]
...     },
...     {
...         "role": "user", "content": [
...             {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"},
...             {"type": "text", "text": "Where is the cat standing?"},
...         ]
...     },
... ]

>>> inputs = processor.apply_chat_template(
...     messages,
...     tokenizer=True,
...     return_dict=True,
...     return_tensors="pt",
...     add_generation_prompt=True
... )
>>> # Generate
>>> generate_ids = model.generate(**inputs)
>>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
"user\nYou are a helpful assistant.\n\n\n\n\n\nWhere is the cat standing?\nmodel\nBased on the image, the cat is standing in a snowy area, likely outdoors. It appears to"
```
T)r`  r  r  r  r  r  r  r  r)  r  ri  r  N)lossr  r  r   r  r  r   r\   )r  rC  r]   r   slicelm_headr   r-  rI   r   r  loss_functionr;   r   r  r   r  r  r   )rb   r`  r  r  r  r  r  r  r  r)  r  ri  r  r  r  r   slice_indicesr  rI   r  s                       rh   r   'Gemma3nForConditionalGeneration.forward/	  sD   D ** 
%)) 3%+)'
 
   118B>SV8W8W~ot4]kmA}a,?@A'+{{'B'B'D'\'\\#i55FZZ'F55F%%ffkk6Q6Q6S6^6^lbklD,#33!//)) ' ; ; ' ; ;
 	
rz   c                 p   > [         TU ]  " U4UUUUU
UU	US.UD6nU(       d  U
(       d  X_S'   XoS'   XS'   U$ )N)r  r)  r  r  ri  r  r  is_first_iterationr  r  r  )r   prepare_inputs_for_generation)rb   r`  r  r)  r  r  r  r  r  r  ri  r  r  r  rc   model_inputsr   s                   rh   r  =Gemma3nForConditionalGeneration.prepare_inputs_for_generation	  se    $ w<
+')%))1
 
  Y+7(-;)*2E./rz   c                     [        S5      e)Nz7Do not inherit create_masks_for_generate from PaliGemma)r   )rb   super_kwargss     rh   create_masks_for_generate9Gemma3nForConditionalGeneration.create_masks_for_generate	  s    VWWrz   r  )NNNNNNNNNNNr   )NNNNNNNNTNNF)r{   r|   r}   r~   accepts_loss_kwargsr   r   r  r!  r   r   r   r  r   r   r   r   r   r   r   r   r   r  r  r   r   r   s   @rh   r  r  	  s     
} 
;9  .21537.23704(,2626*.!%-.e
##d*e
 ''$.e
 ))D0	e

 t+e
 #\\D0e
 &&-e
 e
 ((4/e
 ((4/e
   4'e
 $;e
 ell*e
 ./e
 
'e
  e
T   'RX Xrz   r  )
r   r:  r   r  r  r  r  r2   r  r   )rV   )qr  collections.abcr   r   dataclassesr   typingr   r   torch.nnr   torch.nn.functionalr/  r  huggingface_hub.dataclassesr    r	   r  ru  r
   cache_utilsr   r   configuration_utilsr   masking_utilsr   r   modeling_outputsr   r   modeling_rope_utilsr   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   r   utils.genericr   utils.output_capturingr   autor   gemma2.modeling_gemma2r    r!   r"   r#   gemma3.configuration_gemma3r$   gemma3.modeling_gemma3r%   r&   r'   r(   r)   paligemma.modeling_paligemmar*   r+   r,   r-   'timm_wrapper.configuration_timm_wrapperr.   accelerate.hooksr/   
get_loggerr{   r   r2   r   r   r   r   r   r   Moduler   r   rR  r  r  r  r  r/  r<  rN  rZ  r^  ri  r  r   r   r  r  r  r  r:  r  r  r  r  r  r  __all__r\   rz   rh   <module>r     s    . !      . & ! . 3 R S 6 F &  8 5   ;   H 3			H	% 01G( G  2GT 01_) _  2_D 01%#+ %#  2%#P 01O($ O(  2O(d 
3%? 3  39!= 9$9$C 9,4RYY 40g)BII g)TaBII aHj,bii j,ZB7		 B7JF")) FRORYY O8Dryy D2(ryy (V <	%B 	;RYY ;$5Y 5@`'ryy `'F.ELL .u|| .%,, ._b .,l)299 l)^B%0 B%JVJ2 VJrN
0 N
b	2 	 abA
 A
 cA
H ^_
* 
 `
/B		 /Bd t> ttn cX&G cXcXLrz   