
    Z j                        S SK Jr  S SKJr  S SKrS SKJr  S SKJr  SSK	J
r  SSKJrJrJr  SSKJr  SS	KJr  SS
KJrJrJrJr  SSKJr  SSKJr  SSKJrJrJ r J!r!J"r"J#r#  SSK$J%r%J&r&  SSK'J(r(  SSK)J*r*J+r+J,r,J-r-  SSK.J/r/  SSK0J1r1J2r2  SSK3J4r4  SSK5J6r6J7r7J8r8J9r9J:r:J;r;  \-Rx                  " \=5      r>\+" SS9\ " S S\45      5       5       r?\+" SS9\ " S S\5      5       5       r@ " S S\95      rA " S S\75      rB " S  S!\:5      rC " S" S#\65      rD " S$ S%\65      rE " S& S'\5      rF " S( S)\5      rG " S* S+\R                  5      rI " S, S-\R                  5      rJ\+ " S. S/\85      5       rKS0\R                  S-  S1\R                  S2\NS-  S3\R                  4S4 jrO " S5 S6\K5      rP " S7 S8\K5      rQ\+ " S9 S:\K5      5       rR\+ " S; S<\K5      5       rS " S= S>\K\5      rT\+ " S? S@\K5      5       rU\+ " SA SB\K5      5       rV/ SCQrWg)D    )Callable)AnyN)strict   )initialization)CacheDynamicCacheEncoderDecoderCache)PreTrainedConfig)GenerationMixin)create_bidirectional_mask(create_bidirectional_sliding_window_maskcreate_causal_mask!create_sliding_window_causal_mask)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutputSequenceClassifierOutputTokenClassifierOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuplelogging)merge_with_config_defaults)OutputRecordercapture_outputs   )Gemma2Config)Gemma2Attention	Gemma2MLPGemma2PreTrainedModelGemma2RMSNormGemma2RotaryEmbeddingeager_attention_forwardzgoogle/t5_gemma_module-7b)
checkpointc                   6    \ rS rSr% SrSr\\S'   \" 5       r	Sr
g)T5GemmaModuleConfigB   a  
query_pre_attn_scalar (`float`, *optional*, defaults to 256):
    scaling factor used on the attention scores
final_logit_softcapping (`float`, *optional*, defaults to 30.0):
    scaling factor when applying tanh softcapping on the logits.
attn_logit_softcapping (`float`, *optional*, defaults to 50.0):
    scaling factor when applying tanh softcapping on the attention scores.

```python
>>> from transformers import T5GemmaModuleModel, T5GemmaModuleConfig
>>> # Initializing a T5GemmaModule t5_gemma_module-7b style configuration
>>> configuration = T5GemmaModuleConfig()
>>> # Initializing a model from the t5_gemma_module-7b style configuration
>>> model = T5GemmaModuleModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```F
is_decoder N)__name__
__module____qualname____firstlineno____doc__r/   bool__annotations__AttributeErroruse_bidirectional_attention__static_attributes__r0       |/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/t5gemma/modular_t5gemma.pyr-   r-   B   s    $ J"0"2r;   r-   c                      ^  \ rS rSr% SrSrS/r\\S.rSr	\\
\\4   -  S-  \S'   Sr\\
\\4   -  S-  \S'   S	r\\S
'   Sr\\-  \S'   Sr\\-  \S'   Sr\\-  \S'   S	r\\S'   Sr\\S'   U 4S jrSrU =r$ )T5GemmaConfig[   a  
encoder (`Union[T5GemmaModuleConfig, dict]`, optional, *optional*):
    Configuration for the encoder.
decoder (`Union[T5GemmaModuleConfig, dict]`, optional, *optional*):
    Configuration for the decoder.

Example:

```python
>>> from transformers import T5GemmaConfig, T5GemmaModel
>>> t5gemma_config = T5GemmaConfig.from_pretrained("google/t5gemma-2b-2b-prefixlm-it")
>>> model = T5GemmaModel(t5gemma_config)
```t5gemmapast_key_values)encoderdecoderNrB   rC   Tis_encoder_decoder        dropout_rateclassifier_dropout_rateattention_dropouttie_word_embeddingsi  
vocab_sizec                   > [        U R                  [        5      (       a  [        S0 U R                  D6U l        OU R                  c  [        5       U l        [        U R                  [        5      (       a  [        S0 U R                  D6U l        OU R                  c  [        5       U l        SU R                  l        U R                  U R                  l        U R                  U R                  l        SU R                  l        SU R                  l        U R                  U R                  l        U R                  U R                  l        U R                  R                  U R                  l
        UR                  SU R                  R                  5      U l        S H"  nX!;  d  M
  [        U R                  U5      X'   M$     [        TU ]<  " S0 UD6  g )NFTinitializer_range)bos_token_idpad_token_ideos_token_idr0   )
isinstancerB   dictr-   rC   r/   rF   rH   	use_cachehidden_sizecross_attention_hidden_sizepoprL   getattrsuper__post_init__)selfkwargsspecial_token_key	__class__s      r<   rX   T5GemmaConfig.__post_init__y   sK   dllD)).>>DL\\!.0DLdllD)).>>DL\\!.0DL"'$($5$5!)-)?)?&"&!%$($5$5!)-)?)?&37<<3K3K0!',?A_A_!`!Q .,3DLLBS,T) "R 	''r;   )rC   rB   rL   )r1   r2   r3   r4   r5   
model_typekeys_to_ignore_at_inferencer-   sub_configsrB   rQ   r   r7   rC   rD   r6   rF   intfloatrG   rH   rI   rJ   rX   r:   __classcell__r\   s   @r<   r>   r>   [   s     J#4"51>QRK;?G 4S>1D8?;?G 4S>1D8?## #L#+#+.S5[.%(us{( $$J( (r;   r>   c                       \ rS rSrSrg)T5GemmaRMSNorm   r0   Nr1   r2   r3   r4   r:   r0   r;   r<   rf   rf          r;   rf   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )
T5GemmaMLP   c                 n   > [         TU ]  U5        [        R                  " UR                  5      U l        g N)rW   __init__nnDropoutrF   dropoutrY   configr\   s     r<   ro   T5GemmaMLP.__init__   s&     zz&"5"56r;   c                     U R                  U R                  U5      5      U R                  U5      -  nU R                  U5      nU R	                  U5      nU$ rn   )act_fn	gate_projup_projrr   	down_proj)rY   xhidden_statesrz   s       r<   forwardT5GemmaMLP.forward   sH    DNN1$56aH]3NN=1	r;   )rr   )r1   r2   r3   r4   ro   r}   r:   rc   rd   s   @r<   rk   rk      s    7 r;   rk   c                       \ rS rSrSrg)T5GemmaRotaryEmbedding   r0   Nrh   r0   r;   r<   r   r      ri   r;   r   c                   4   ^  \ rS rSrS\S\4U 4S jjrSrU =r$ )T5GemmaSelfAttention   rt   	layer_idxc                 F   > [         TU ]  X5        UR                  U l        g rn   )rW   ro   r/   	is_causalrY   rt   r   r\   s      r<   ro   T5GemmaSelfAttention.__init__   s    +**r;   )r   )	r1   r2   r3   r4   r-   ra   ro   r:   rc   rd   s   @r<   r   r      s    +2 +s + +r;   r   c                   
  ^  \ rS rSrS\S\4U 4S jjr SS\R                  S\R                  S-  S\R                  S-  S	\	S-  S
\
\   S\\R                  \R                  S-  \\R                     S-  4   4S jjrSrU =r$ )T5GemmaCrossAttention   rt   r   c                   > [         TU ]  X5        U ?U ?SU l        UR
                  c  [        S5      e[        R                  " UR
                  UR                  U R                  -  UR                  S9U l        [        R                  " UR
                  UR                  U R                  -  UR                  S9U l        g )NFzBCross-attention needs cross_attention_hidden_size to be specified.bias)rW   ro   sliding_window
layer_typer   rT   
ValueErrorrp   Linearnum_key_value_headshead_dimattention_biask_projv_projr   s      r<   ro   T5GemmaCrossAttention.__init__   s    +O--5abbii..0J0JT]]0Zagavav
 ii..0J0JT]]0Zagavav
r;   Nr|   attention_maskencoder_hidden_statesrA   rZ   returnc                    Uc  [        S5      eUR                  S S n/ UQSPU R                  P7nU R                  U5      R	                  U5      R                  SS5      nUb1  UR                  R                  U R                  5      n	UR                  n
Ub  W	(       d  UR                  S S n/ UQSPU R                  P7nU R                  U5      R	                  U5      R                  SS5      nU R                  U5      R	                  U5      R                  SS5      nUb7  W
R                  XU R                  5      u  pSUR                  U R                  '   OFW
R                  U R                     R                  nU
R                  U R                     R                  n[         R"                  " U R$                  R&                  [(        5      nU" U UUUU4U R*                  (       a  U R,                  OSU R.                  S U R0                  S.UD6u  nnUR2                  " / UQSP76 R5                  5       nU R7                  U5      nUU4$ )Nz5Encoder hidden state is required for cross attention.   r#   TrE   )rr   scalingr   softcap)r   shaper   q_projview	transpose
is_updatedgetr   cross_attention_cacher   r   updatelayerskeysvaluesr   get_interfacert   _attn_implementationr*   trainingrH   r   attn_logit_softcappingreshape
contiguouso_proj)rY   r|   r   r   rA   rZ   input_shapehidden_shapequery_statesr   curr_past_key_valuesencoder_input_shapeencoder_hidden_shape
key_statesvalue_statesattention_interfaceattn_outputattn_weightss                     r<   r}   T5GemmaCrossAttention.forward   s9    !(TUU#))#2.88b8$--8{{=166|DNNqRST&(3377GJ#2#H#H "*"7"="=cr"B#L%8#L"#Ldmm#L %:;@@AUV``abdefJ;;'<=BBCWXbbcdfghL*+?+F+Fzaeaoao+p(
=A**4>>:-44T^^DIIJ/66t~~FMML(?(M(MKK,,.E)
 %8%
 /3mmD**LL//%
 %
!\ "));;;;FFHkk+.L((r;   )r   r   r   rn   )r1   r2   r3   r4   r-   ra   ro   torchTensorr   r   r   tupler}   r:   rc   rd   s   @r<   r   r      s    
2 
s 
* )-3)||3) t+3)  %||d2	3)
 3) -.3) 
u||U\\D0%2E2LL	M3) 3)r;   r   c                      ^  \ rS rSrSrS\4U 4S jjr   SS\R                  S\	\R                  \R                  4   S-  S\R                  S-  S	\R                  S-  S
\	\R                  4   4
S jjrSrU =r$ )T5GemmaEncoderLayer   zEncoder sub-layer.r   c                 $  > [         TU ]  5         UR                  U l        Xl        X l        UR
                  U   U l        [        UUS9U l        [        UR                  UR                  S9U l        [        UR                  UR                  S9U l        [        U5      U l        [        UR                  UR                  S9U l        [        UR                  UR                  S9U l        ["        R$                  " UR&                  5      U l        g N)rt   r   eps)rW   ro   rS   rt   r   layer_typesattention_typer   	self_attnrf   rms_norm_epspre_self_attn_layernormpost_self_attn_layernormrk   mlppre_feedforward_layernormpost_feedforward_layernormrp   rq   rF   rr   r   s      r<   ro   T5GemmaEncoderLayer.__init__   s    !--"$00;-
 (6f6H6HfNaNa'b$(6v7I7IvObOb(c%f%)78J8JPVPcPc)d&*89K9KQWQdQd*e'zz&"5"56r;   Nr|   position_embeddingsr   position_idsr   c           	      8   UnU R                  U5      nU R                  " SUUUUS S.UD6u  pU R                  U5      nX`R                  U5      -   nUnU R	                  U5      nU R                  U5      nU R                  U5      nX`R                  U5      -   nU$ )N)r|   r   r   r   rA   r0   )r   r   r   rr   r   r   r   )rY   r|   r   r   r   rZ   residual_s           r<   r}   T5GemmaEncoderLayer.forward  s     !44]C>> 
' 3)% 
 
 55mD <<#>> 66}E/77F <<#>>r;   )r   rt   rr   rS   r   r   r   r   r   r   r   )NNN)r1   r2   r3   r4   r5   ra   ro   r   r   r   
LongTensorFloatTensorr}   r:   rc   rd   s   @r<   r   r      s    7# 7. IM.204|| #5<<#=>E t+	
 &&- 
u  !	" r;   r   c                   H  ^  \ rS rSrSrS\4U 4S jjr       SS\R                  S\	\R                  \R                  4   S-  S\R                  S-  S	\R                  S-  S
\S-  S\S-  S\R                  S-  S\R                  S-  S\R                  4S jjrSrU =r$ )T5GemmaDecoderLayeri-  z2Decoder sub-layer: an extra cross-attention layer.r   c                   > [         TU ]  5         UR                  U l        Xl        X l        UR
                  U   U l        [        UUS9U l        [        UR                  UR                  S9U l        [        UR                  UR                  S9U l        [        U5      U l        [        UR                  UR                  S9U l        [        UR                  UR                  S9U l        ["        R$                  " UR&                  5      U l        [+        XS9U l        [        UR                  UR                  S9U l        [        UR                  UR                  S9U l        g r   )rW   ro   rS   rt   r   r   r   r   r   rf   r   r   r   rk   r   r   r   rp   rq   rF   rr   r   
cross_attnpre_cross_attn_layernormpost_cross_attn_layernormr   s      r<   ro   T5GemmaDecoderLayer.__init__0  s    !--"$00;-
 (6f6H6HfNaNa'b$(6v7I7IvObOb(c%f%)78J8JPVPcPc)d&*89K9KQWQdQd*e'zz&"5"56/vS(6v7I7IvObOb(c%)78J8JPVPcPc)d&r;   Nr|   r   r   r   rA   rR   r   encoder_attention_maskr   c	           
         Un
U R                  U5      nU R                  " SUUUUUb  UR                  OS US.U	D6u  pU R                  U5      nXR	                  U5      -   nUn
U R                  U5      nU R                  " SUUUUUS.U	D6u  pU R                  U5      nXR	                  U5      -   nUn
U R                  U5      nU R                  U5      nU R                  U5      nXR	                  U5      -   nU$ )N)r|   r   r   r   rA   rR   )r|   r   r   rA   rR   r0   )r   r   self_attention_cacher   rr   r   r   r   r   r   r   )rY   r|   r   r   r   rA   rR   r   r   rZ   r   r   s               r<   r}   T5GemmaDecoderLayer.forwardG  s-    !44]C>> 
' 3)%DSD_O@@ei
 
 55mD <<#>> 55mD?? 
'"71+
 
 66}E <<#>> 66}E/77F <<#>>r;   )r   rt   r   rr   rS   r   r   r   r   r   r   r   r   r   )NNNNFNN)r1   r2   r3   r4   r5   ra   ro   r   r   r   r   r
   r6   r   r}   r:   rc   rd   s   @r<   r   r   -  s    <e# e4 IM.2046:!&596:,||, #5<<#=>E, t+	,
 &&-, -t3, $;,  %||d2, !&t 3, 
		, ,r;   r   c                   z   ^  \ rS rSrSrSS\S\S\4U 4S jjjrS\R                  S\R                  4S	 jr
S
rU =r$ )T5GemmaClassificationHeadiv  z-Head for sentence-level classification tasks.rS   
num_labelsrG   c                    > [         TU ]  5         [        R                  " US9U l        [        R
                  " X5      U l        g )N)p)rW   ro   rp   rq   rr   r   out_proj)rY   rS   r   rG   r\   s       r<   ro   "T5GemmaClassificationHead.__init__y  s/    zz$;<		+:r;   r|   r   c                 J    U R                  U5      nU R                  U5      nU$ rn   rr   r   )rY   r|   s     r<   r}   !T5GemmaClassificationHead.forward~  s$    ]3m4r;   r   )rE   )r1   r2   r3   r4   r5   ra   rb   ro   r   r   r}   r:   rc   rd   s   @r<   r   r   v  sF    7;C ;S ;SX ; ;
U\\ ell  r;   r   c                   z   ^  \ rS rSrSrSS\S\S\4U 4S jjjrS\R                  S\R                  4S	 jr
S
rU =r$ )T5GemmaLMHeadi  z.Head for language modeling (generation) tasks.rS   rJ   r   c                 V   > [         TU ]  5         [        R                  " XUS9U l        g )Nr   )rW   ro   rp   r   r   )rY   rS   rJ   r   r\   s       r<   ro   T5GemmaLMHead.__init__  s     		+Er;   r|   r   c                 (    U R                  U5      nU$ rn   r   )rY   r|   logitss      r<   r}   T5GemmaLMHead.forward  s    }-r;   r   )F)r1   r2   r3   r4   r5   ra   r6   ro   r   r   r}   r:   rc   rd   s   @r<   r   r     sJ    8FC FS F F FU\\ ell  r;   r   c            	           \ rS rSr% \\S'   SrSrSS/r\	\
" \SSS	9\
" \SS
S	9\
" \SS
S	9/S.r\R                  " 5       S 5       rS rSrg)T5GemmaPreTrainedModeli  rt   modelTr   r   r   r   )index
layer_namer   )r|   
attentionsc                 f   [         R                  " X5        U R                  R                  n[	        U[
        5      (       a  UR                  R                  R                  S   S-  n[        R                  " UR                  R                  SX#-  S9  [        UR                  S5      (       aC  UR                  R                  b+  [        R                  " UR                  R                  5        g g g [	        U[        5      (       ao  U R                  R                  (       dS  UR                  R                  R                  S   S-  n[        R                  " UR                  R                  SX#-  S9  g g SUR                   R"                  ;   a!  [        R                  " UR                  5        g g )Nr   g      rE   )meanstdr   RMSNorm)r   _init_weightsrt   rL   rP   r   r   weightr   initnormal_hasattrr   zeros_r   rI   r\   r1   )rY   moduler  scales       r<   r  $T5GemmaPreTrainedModel._init_weights  s.    	%%d3kk++f788OO**003t;ELL//cs{Kv//FOO4H4H4TFOO001 5U/..;;22..44Q74?V__33#3;O 3 &**333KK& 4r;   c                 b   U R                   R                  R                  nU R                   R                  R                  nUc  [	        S5      eUR                  UR                  5      nUSSS24   R                  5       USSS24'   X$S'   Uc  [	        S5      eUR                  US:H  U5        U$ )	z
Shifts input_ids to the right, prepends the decoder_start_token_id, and handles
pad_token_id replacement for labels that were -100.
This is a common preparation step for decoder inputs in sequence-to-sequence models.
Nz:self.model.config.decoder.bos_token_id has to be defined. .r   r   ).r   z9self.model.config.decoder.pad_token_id has to be defined.i)	rt   rC   rM   rN   r   	new_zerosr   clonemasked_fill_)rY   	input_idsdecoder_start_token_idrN   shifted_input_idss        r<   _shift_right#T5GemmaPreTrainedModel._shift_right  s     "&!4!4!A!A{{**77!)YZZ &//	@%.sCRCx%8%>%>%@#qr'"$:&!XYY 	&&'8D'@,O  r;   r0   N)r1   r2   r3   r4   r>   r7   base_model_prefixsupports_gradient_checkpointing_no_split_modulesr   r!   r   r   _can_record_outputsr   no_gradr  r  r:   r0   r;   r<   r   r     sv    &*#.0EF,/q[Q/q\R0lS
 ]]_' '"!r;   r   	token_idsr|   rN   r   c                    U b<  Uc  [        S5      eX:g  R                  UR                  [        R                  5      nU$ [        R
                  " UR                  S   UR                  S   4UR                  [        R                  S9nU$ )z%Construct the default attention mask.z3`pad_token_id` is required for padding information.r   r   devicedtype)r   tor  r   longonesr   )r  r|   rN   r   s       r<   make_default_2d_attention_maskr"    s     RSS#3778L8LejjY
    #]%8%8%;<]EYEYafakak
 r;   c                      ^  \ rS rSr\\S.rU 4S jr\\	    SS\
R                  S-  S\
R                  S-  S\
R                  S-  S\
R                  S-  S	\\   S
\\-  4S jj5       5       rSrU =r$ )T5GemmaEncoderi  )r   r|   c           	      R  > [         TU ]  U5        UR                  U l        UR                  U l        [
        R                  " UR                  UR                  U R                  5      U l        [        UR                  UR                  S9U l        SU l        [
        R                  " [        UR                  5       Vs/ s H  n[!        X5      PM     sn5      U l        [
        R$                  " UR&                  5      U l        [+        US9U l        U R/                  5         g s  snf Nr   Frt   )rW   ro   rN   padding_idxrJ   rp   	EmbeddingrS   embed_tokensrf   r   normgradient_checkpointing
ModuleListrangenum_hidden_layersr   r   rq   rF   rr   r   
rotary_emb	post_initr   s      r<   ro   T5GemmaEncoder.__init__       !.. ++LL):):F<N<NPTP`P`a"6#5#56;N;NO	&+#mmEJ6KcKcEdeEd	 3Ede
 zz&"5"560? 	 f   ?D$Nr  r   r   inputs_embedsrZ   r   c                    US L US L-  (       a  [        S5      eUR                  SS 5        Uc  U R                  U5      nUc=  [        R                  " UR
                  S   UR                  S9nUR                  S5      nUc   [        XU R                  R                  5      n[        U=n[        5      (       d'  U R                  UUS.n[        S0 UD6[        S0 UD6S.nUn[        R                  " U R                  R                   S-  UR"                  S	9n	X-  nU R%                  U5      nU R'                  X5      n
[)        U R*                  S U R                  R,                   5       H*  u  pU" UU
X`R                  R.                  U      U40 UD6nM,     U R1                  U5      nU R%                  U5      n[3        US
9$ )N:You must specify exactly one of input_ids or inputs_embedsrA   r   r  r   )rt   r5  r   full_attentionsliding_attention      ?r  )last_hidden_stater0   )r   rU   r*  r   aranger   r  	unsqueezer"  rt   rN   rP   rQ   r   r   tensorrS   r  rr   r0  	enumerater   r/  r   r+  r   )rY   r  r   r   r5  rZ   self_attn_mask_mappingmask_kwargsr|   
normalizerr   ilayer_modules                r<   r}   T5GemmaEncoder.forward  s    -t";<YZZ 	

$d+  --i8M <<(;(;A(>}G[G[\L'11!4L!;IVZVaVaVnVnoNNB0DII++!."0K #<"Jk"J%M%\P[%\&"
 &\\$++"9"93">mFYFYZ
%2]3"oomJ(5Tt{{7T7T)UVOA(#&{{'>'>q'AB	
 M  W 		-0]3+
 	
r;   rr   r*  r,  r   r+  r(  r0  rJ   NNNN)r1   r2   r3   r4   r   r   r  ro   r    r"   r   r   r   r   r   r   r   r   r}   r:   rc   rd   s   @r<   r$  r$    s    *,
$   .2.204266
##d*6
 t+6
 &&-	6

 ((4/6
 +,6
 
	 6
   6
r;   r$  c                   V  ^  \ rS rSr\" \SS9\" \SS9\S.rU 4S jr	\
\        SS\R                  S-  S\R                  S-  S	\R                  S-  S
\S-  S\R                   S-  S\S-  S\R                  S-  S\R                  S-  S\\   S\\-  4S jj5       5       rSrU =r$ )T5GemmaDecoderi0  r   )r   )r   cross_attentionsr|   c           	      R  > [         TU ]  U5        UR                  U l        UR                  U l        [
        R                  " UR                  UR                  U R                  5      U l        [        UR                  UR                  S9U l        SU l        [
        R                  " [        UR                  5       Vs/ s H  n[!        X5      PM     sn5      U l        [
        R$                  " UR&                  5      U l        [+        US9U l        U R/                  5         g s  snf r&  )rW   ro   rN   r(  rJ   rp   r)  rS   r*  rf   r   r+  r,  r-  r.  r/  r   r   rq   rF   rr   r   r0  r1  r   s      r<   ro   T5GemmaDecoder.__init__7  r3  r4  Nr  r   r   rA   r5  rR   r   r   rZ   r   c	                    US L US L-  (       a  [        S5      eUc  [        S5      eUc  U R                  U5      nU R                  (       d/  U(       a(  Uc%  [        [	        U R
                  S9[	        5       5      nUcU  Ub  UR                  5       OSn
[        R                  " UR                  S   UR                  S9U
-   nUR                  S5      nUc#  Uc   [        XU R
                  R                  5      n[        U=n[        5      (       d8  U R
                  UUUb  UR                   OS US.n[#        S0 UD6[%        S0 UD6S.n[        U=n[        5      (       d  S	['        U R
                  UUUS
90nUn[        R(                  " U R
                  R*                  S-  UR,                  S9nX-  nU R/                  U5      nU R1                  X5      n[3        U R4                  S U R
                  R6                   5       H2  u  nnU" UUXR
                  R8                  U      UUUUUS	   40 U	D6nM4     U R;                  U5      nU R/                  U5      n[=        UUS9$ )Nr7  z0`encoder_hidden_states` must be given in decoderr'  r   r   r8  )rt   r5  r   rA   r   r9  r:  )rt   r5  r   r   r<  r=  )r>  rA   r0   )r   r*  r   r
   r	   rt   get_seq_lengthr   r?  r   r  r@  r"  rN   rP   rQ   r   r   r   r   rA  rS   r  rr   r0  rB  r   r/  r   r+  r   )rY   r  r   r   rA   r5  rR   r   r   rZ   past_seen_tokensrC  rD  cross_attn_mask_mappingr|   rE  r   rF  rG  s                      r<   r}   T5GemmaDecoder.forwardI  so    -t";<YZZ (OPP  --i8M}}/F 2,dkk2RT`TbcOCRC^==?de <<(;(;A(>}G[G[\_ooL'11!4L!o&=;IVZVaVaVnVnoNNB0DII++!."0KZKf?#G#Glp ,K #5"C{"C%F%U%U&"
 5KK1TRR ";;;"/#9*?	#'# &\\$++"9"93">mFYFYZ
%2]3"oomJ(5Tt{{7T7T)UVOA|(#&{{'>'>q'AB%'(89
 
M  W 		-0]38++
 	
r;   rI  )NNNNNNNN)r1   r2   r3   r4   r!   r   r   r   r  ro   r    r"   r   r   r   r
   r   r6   r   r   r   r   r}   r:   rc   rd   s   @r<   rL  rL  0  s   $%9C*+@J,$   .2.2046:26!%596:P
##d*P
 t+P
 &&-	P

 -t3P
 ((4/P
 $;P
  %||d2P
 !&t 3P
 +,P
 
:	:P
   P
r;   rL  c                     ^  \ rS rSrS\4U 4S jjrS rS r\\	           SS\
R                  S-  S\
R                  S-  S	\
R                  S-  S
\
R                  S-  S\
R                  S-  S\
R                  S-  S\S-  S\S-  S\
R                   S-  S\
R                   S-  S\S-  S\\   S\4S jj5       5       rSrU =r$ )T5GemmaModeli  rt   c                    > [         TU ]  U5        UR                  (       d  [        S5      e[	        UR
                  5      U l        [        UR                  5      U l        U R                  5         g )NzVT5GemmaModel only support encoder-decoder modeling. Use `T5GemmaEncoderModel` instead.)	rW   ro   rD   r   r$  rB   rL  rC   r1  rs   s     r<   ro   T5GemmaModel.__init__  sO     ((uvv%fnn5%fnn5r;   c                 6    U R                   R                  5       $ rn   rB   get_input_embeddingsrY   s    r<   r[  !T5GemmaModel.get_input_embeddings      ||0022r;   c                 8    U R                   R                  U5      $ rn   rB   set_input_embeddingsrY   new_embeddingss     r<   ra  !T5GemmaModel.set_input_embeddings      ||00@@r;   Nr  r   r   decoder_input_idsdecoder_attention_maskdecoder_position_idsencoder_outputsrA   r5  decoder_inputs_embedsrR   rZ   r   c                    Uc  U R                   " SUUUU	S.UD6nUR                  nU R                  " SUUUU
UUUUS.UD6n[        UR                  UR                  UR                  SS5      (       a  UR                  OUR                  4UR                  UR                  UR                  UR                  UR                  S9$ )a8  
decoder_position_ids (`torch.LongTensor` of shape `(batch_size, decoder_sequence_length)`, *optional*):
    Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the range `[0,
    config.decoder.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)
r  r   r   r5  )r  r   r   r5  rA   r   r   rR   output_hidden_statesF)r>  rA   decoder_hidden_statesdecoder_attentionsrM  encoder_last_hidden_stater   encoder_attentionsr0   )	rB   r>  rC   r   rA   r   r|   r   rM  )rY   r  r   r   rf  rg  rh  ri  rA   r5  rj  rR   rZ   r   decoder_outputss                  r<   r}   T5GemmaModel.forward  s    , ""ll #-)+	
 O !0 A A,, 

'1-/+"7#1

 

 "-??+;;zz0%88 #2"?"?!335.99,==&5&G&G"1"?"?.99
 	
r;   )rC   rB   )NNNNNNNNNNN)r1   r2   r3   r4   r>   ro   r[  ra  r   r   r   r   r   
BoolTensorr   r
   r   r6   r   r   r   r}   r:   rc   rd   s   @r<   rV  rV    sA   	} 	3A  .2370459:>8<266:-159!%6
##d*6
 ))D06
 &&-	6

 !++d26
 !& 0 04 76
 $..56
 )4/6
 -t36
 ||d*6
  %||d26
 $;6
 +,6
 
6
  6
r;   rV  c                      ^  \ rS rSrS\4U 4S jjrS rS r\\	    SS\
R                  S-  S\
R                  S-  S	\
R                  S-  S
\
R                  S-  S\\   S\4S jj5       5       rSrU =r$ )T5GemmaEncoderModeli  rt   c                    > [         TU ]  U5        UR                  (       a  [        S5      e[	        UR
                  5      U l        U R                  5         g )NzQT5GemmaEncoderModel only supports encoder-only model. Use `T5GemmaModel` instead.)rW   ro   rD   r   r$  rB   r1  rs   s     r<   ro   T5GemmaEncoderModel.__init__  s?     $$pqq%fnn5r;   c                 6    U R                   R                  5       $ rn   rZ  r\  s    r<   r[  (T5GemmaEncoderModel.get_input_embeddings  r^  r;   c                 8    U R                   R                  U5      $ rn   r`  rb  s     r<   ra  (T5GemmaEncoderModel.set_input_embeddings  re  r;   Nr  r   r   r5  rZ   r   c                 4    U R                   " SUUUUS.UD6nU$ )Nrl  r0   rB   )rY   r  r   r   r5  rZ   ri  s          r<   r}   T5GemmaEncoderModel.forward  s5     ,, 
)%'	

 
 r;   r~  rJ  )r1   r2   r3   r4   r>   ro   r[  ra  r   r   r   r   r   r   r   r   r   r}   r:   rc   rd   s   @r<   rv  rv    s    } 3A  .23704-1##d* ))D0 &&-	
 ||d* +, 
  r;   rv  c            "       2  ^  \ rS rSrSS0rSS0rSS/S/40rS\4U 4S	 jjrS
 r	S r
\\             SS\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\R                   S-  S\R                  S-  S\S-  S\S-  S\R                  S-  S\R                  S-  S\R                  S-  S\S-  S\\R*                  -  S\\   S\\R                     \-  4S jj5       5       rS\R*                  4S jrSrU =r$ ) T5GemmaForConditionalGenerationi  zlm_head.out_proj.weightz!model.decoder.embed_tokens.weightzlm_head.out_projcolwise_gather_outputr|   r   rt   c                   > SUl         [        TU ]	  U5        [        U5      U l        UR
                  R                  U l        [        UR
                  R                  U R                  5      U l	        SU l
        U R                  5         g )NTForMaskedLM)rD   rW   ro   rV  r   rC   rJ   r   rS   lm_head	loss_typer1  rs   s     r<   ro   (T5GemmaForConditionalGeneration.__init__  sb    $(! !&)
 ..33$V^^%?%?Q&r;   c                 2   XR                   l        U R                  R                  (       al  UR                  U R
                  R                  R                  l        UR                  R                  S   U R
                  R                  R                  l	        g g )Nr   )
r  r   rt   rI   r  r   rC   r*  r   num_embeddingsrb  s     r<   set_output_embeddings5T5GemmaForConditionalGeneration.set_output_embeddings!  sh     . ;;**5C5J5JDJJ++2=K=R=R=X=XYZ=[DJJ++: +r;   c                 .    U R                   R                  $ rn   )r  r   r\  s    r<   get_output_embeddings5T5GemmaForConditionalGeneration.get_output_embeddings*  s    ||$$$r;   Nr  r   r   rf  rg  rh  ri  rA   r5  rj  labelsrR   logits_to_keeprZ   r   c                    Ub  Uc  U
c  U R                  U5      nU R                  " SUUUUUUUUU	U
US.UD6nUR                  n[        U[        5      (       a  [        U* S5      OUnU R                  USS2USS24   5      nU R                  5       R                  nUR                  b4  UUR                  -  n[        R                  " U5      nUUR                  -  nSnUb  U R                  " UXR                  40 UD6n[        UUUR                  UR                   UR"                  UR$                  UR&                  UR(                  UR*                  S9	$ )a  
decoder_position_ids (`torch.LongTensor` of shape `(batch_size, decoder_sequence_length)`, *optional*):
    Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the range `[0,
    config.decoder.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
N)r  r   r   rf  rg  rh  ri  rA   r5  rj  rR   )	lossr   rA   rn  ro  rM  rp  r   rq  r0   )r  r   r>  rP   ra   slicer  get_decoderrt   final_logit_softcappingr   tanhloss_functionrJ   r   rA   rn  ro  rM  rp  r   rq  )rY   r  r   r   rf  rg  rh  ri  rA   r5  rj  r  rR   r  rZ   rr  r|   slice_indicesr   decoder_configr  s                        r<   r}   'T5GemmaForConditionalGeneration.forward-  sp   : "3";@U@] $ 1 1& 9.2jj /
)%/#9!5++'"7/
 /
 (998B>SV8W8W~ot4]kmA}a,?@A))+2211=nDDDFZZ'FnDDDF%%ffooPPD+;;"1"G"G.AA,==&5&O&O"1"G"G.AA

 
	
r;   c                 $    U R                  U5      $ rn   )r  )rY   r  s     r<   %prepare_decoder_input_ids_from_labelsET5GemmaForConditionalGeneration.prepare_decoder_input_ids_from_labelsx  s      ((r;   )r  r  r   rJ   )NNNNNNNNNNNNr   )r1   r2   r3   r4   _tied_weights_keys_tp_plan_pp_planr>   ro   r  r  r   r   r   r   r   rt  r   r
   r6   ra   r   r   r   r   r   r}   r  r:   rc   rd   s   @r<   r  r    s   35XY"$;<H"o%6
$CDH	} 	\%  .2370459:>8<266:26:>*.!%-.G
##d*G
 ))D0G
 &&-	G

 !++d2G
 !& 0 04 7G
 $..5G
 )4/G
 -t3G
 ((4/G
  %0047G
   4'G
 $;G
 ell*G
 +,G
  
u  	!O	3!G
  G
R)ELL ) )r;   r  c                     ^  \ rS rSrSS\S\S-  4U 4S jjjrS rS r\	\
          SS\R                  S-  S	\R                  S-  S
\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\S-  S\R                  S-  S\R                  S-  S\R                  S-  S\\   S\4S jj5       5       rSrU =r$ ) T5GemmaForSequenceClassificationi|  Nrt   rD   c                   > Ub  X!l         [        TU ]	  U5        UR                  U l        UR                   (       a  [	        U5      U l        O[        U5      U l        UR                  R                  nUR                   (       a  UR                  R                  n[        USS5      n[        X0R                  U5      U l        U R                  5         g)z
is_encoder_decoder (`Optional`, *optional*):
    Whether use encoder_decoder for sequence classification. When set to False, only encoder is used.
NrG   皙?rD   rW   ro   r   rV  r   rv  rB   rS   rC   rV   r   scorer1  rY   rt   rD   rS   classifier_dropoutr\   s        r<   ro   )T5GemmaForSequenceClassification.__init__~  s    
 )(:%  ++$$%f-DJ,V4DJnn00$$ ..44K$V-FL.{OOM_`
r;   c                 6    U R                   R                  5       $ rn   r   r[  r\  s    r<   r[  5T5GemmaForSequenceClassification.get_input_embeddings      zz..00r;   c                 :    U R                   R                  U5        g rn   r   ra  rY   values     r<   ra  5T5GemmaForSequenceClassification.set_input_embeddings      

''.r;   r  r   r   rf  rg  rh  ri  r5  rj  r  rZ   r   c                    U R                   R                  (       a)  Uc&  Ub#  [        SU R                  R                   S35      eU R                   R                  (       a%  Uc"  U	c  Uc  [        S5      eU R                  U5      nU R                   R                  (       aB  U R                  " U4UUUUUUUU	SS.	UD6nUR                  nUR                  nUR                  nO;U R                  " U4UUUS.UD6nUR                  nUR                  nUR                  nU R                  U5      nUb  UR                  S   nOUR                  S   nU R                   R                  c  US	:w  a  [        S
5      eU R                   R                  c  SnGOUb  XR                   R                  :g  R!                  UR"                  [$        R&                  5      n[$        R(                  " UR                  S   UR"                  [$        R&                  S9nUU-  R+                  S5      nU R                   R                  (       a*  US	-  n[$        R,                  " UUR                  S   S	-
  S9nO.Sn[.        R1                  U R                  R                   S35        U[$        R(                  " UUR"                  S9U4   nSnU
b  U R3                  UU
UU R                   S9n[5        UUUUS9$ )  
decoder_position_ids (`torch.LongTensor` of shape `(batch_size, decoder_sequence_length)`, *optional*):
    Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the range `[0,
    config.decoder.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
N8Passing input embeddings is currently not supported for  in encoder-decoder mode.If no `decoder_input_ids` or `decoder_inputs_embeds` are passed, `input_ids` cannot be `None`. Please pass either `input_ids` or `decoder_input_ids` or `decoder_inputs_embeds`.F	r   r   rf  rg  rh  ri  r5  rj  rR   r   r   r5  r   r   z=Cannot handle batch sizes > 1 if no padding token is defined.r   r  )maxz will not detect padding tokens in `inputs_embeds`. Results may be unexpected if using padding tokens in conjunction with `inputs_embeds.`r8  )r   r  pooled_logitsrt   r  r   r|   r   )rt   rD   NotImplementedErrorr\   r1   r   r  r   r>  rn  ro  r|   r   r  r   rN   r  r  r   int32r?  argmaxclamploggerwarning_oncer  r   )rY   r  r   r   rf  rg  rh  ri  r5  rj  r  rZ   outputsr>  r|   r   r   
batch_sizelast_non_pad_tokennon_pad_masktoken_indicesr  r  s                          r<   r}   (T5GemmaForSequenceClassification.forward  s   2 ;;))y/@]E^%J4>>KbKbJcc|} 
 ;;))/@/HMbMj  U 
 !% 1 1) <;;))*.**+-)"3'=%9 /+&;+ +G !( 9 9#99M 33J'+zz(-)+	(
 (G !( 9 9#11M ++J-. "+J&,,Q/J;;##+
a\]];;##+!#"%)A)AAEEfmmUZU`U`aL!LL)<V]]Z_ZeZefM"/,">!F!Fr!J{{--"a'"%*[[1CIZI`I`acIdghIh%i"!#>>**+ ,Z Z
 u||Jv}}MOaab%%VFR_hlhshs%tD' '!	
 	
r;   r   r   r  rn   
NNNNNNNNNN)r1   r2   r3   r4   r>   r6   ro   r[  ra  r   r   r   r   r   r   r   r   r   r   r}   r:   rc   rd   s   @r<   r  r  |  sS   } $+  .1/  .2.204596:8<2626:>*.i
##d*i
 t+i
 &&-	i

 !++d2i
 !&t 3i
 $..5i
 )4/i
 ((4/i
  %0047i
   4'i
 +,i
 
"i
  i
r;   r  c                     ^  \ rS rSrSS\S\S-  4U 4S jjjrS rS r\	\
          SS\R                  S-  S	\R                  S-  S
\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\S-  S\R                  S-  S\R                  S-  S\R                  S-  S\\   S\4S jj5       5       rSrU =r$ )T5GemmaForTokenClassificationi	  Nrt   rD   c                   > Ub  X!l         [        TU ]	  U5        UR                  U l        UR                   (       a  [	        U5      U l        O[        U5      U l        UR                  R                  nUR                   (       a  UR                  R                  n[        USS5      n[        X0R                  U5      U l        U R                  5         g)z
is_encoder_decoder (`Optional`, *optional*):
    Whether use encoder_decoder for token classification. When set to False, only encoder is used.
NrG   r  r  r  s        r<   ro   &T5GemmaForTokenClassification.__init__  s    
 )(:%  ++$$%f-DJ,V4DJnn00$$ ..44K$V-FL.{OOM_`
r;   c                 6    U R                   R                  5       $ rn   r  r\  s    r<   r[  2T5GemmaForTokenClassification.get_input_embeddings#  r  r;   c                 :    U R                   R                  U5        g rn   r  r  s     r<   ra  2T5GemmaForTokenClassification.set_input_embeddings&  r  r;   r  r   r   rf  rg  rh  ri  r5  rj  r  rZ   r   c                    U R                   R                  (       a)  Uc&  Ub#  [        SU R                  R                   S35      eU R                   R                  (       a%  Uc"  U	c  Uc  [        S5      eU R                  U5      nU R                   R                  (       aB  U R                  " U4UUUUUUUU	SS.	UD6nUR                  nUR                  nUR                  nO;U R                  " U4UUUS.UD6nUR                  nUR                  nUR                  nU R                  U5      nSnU
b  U R                  UXR                   5      n[        UUUUS9$ )	r  Nr  r  r  Fr  r  r  )rt   rD   r  r\   r1   r   r  r   r>  rn  ro  r|   r   r  r  r   )rY   r  r   r   rf  rg  rh  ri  r5  rj  r  rZ   r  r>  r|   r   r   r  s                     r<   r}   %T5GemmaForTokenClassification.forward)  s   4 ;;))y/@]E^%J4>>KbKbJcc|}  ;;))/@/HMbMj  U 
 !% 1 1) <;;))*.**+-)"3'=%9 /+&;+ +G !( 9 9#99M 33J'+zz(-)+	(
 (G !( 9 9#11M ++J-.%%ffkkBD$'!	
 	
r;   r  rn   r  )r1   r2   r3   r4   r>   r6   ro   r[  ra  r   r   r   r   r   r   r   r   r   r   r}   r:   rc   rd   s   @r<   r  r  	  sS   } $+  01/  .2.204596:8<2626:>*.N
##d*N
 t+N
 &&-	N

 !++d2N
 !&t 3N
 $..5N
 )4/N
 ((4/N
  %0047N
   4'N
 +,N
 
N
  N
r;   r  )r>   r-   r  rV  rv  r   r  r  )Xcollections.abcr   typingr   r   torch.nnrp   huggingface_hub.dataclassesr    r   r  cache_utilsr   r	   r
   configuration_utilsr   
generationr   masking_utilsr   r   r   r   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   r   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   utils.genericr    utils.output_capturingr!   r"   gemma2.configuration_gemma2r$   gemma2.modeling_gemma2r%   r&   r'   r(   r)   r*   
get_loggerr1   r  r-   r>   rf   rk   r   r   r   r   r   Moduler   r   r   r   r   ra   r"  r$  rL  rV  rv  r  r  r  __all__r0   r;   r<   <module>r     sa   %    . & C C 3 )  C 9  G &  8 E 6  
		H	% 673, 3  83. 677($ 7(  87(t	] 		 		2 	+? +D)O D)N14 1hF4 FR		 	BII 	 8!2 8! 8!v$&<< * \\	"P
+ P
fk
+ k
\ J
) J
 J
Z !0 ! !Hh)&<o h)V I
'= I
 I
X o
$: o
 o
d	r;   