
    Z j                     v   S SK Jr  S SKJr  S SKrS SKJr  SSKJr	  SSK
Jr  SSKJrJrJr  SSKJr  SS	KJrJr  SS
KJrJrJrJr  SSKJr  SSKJr  SSKJrJ r J!r!J"r"J#r#J$r$  SSK%J&r&J'r'  SSK(J)r)J*r*  SSK+J,r,  SSK-J.r.J/r/J0r0J1r1  SSK2J3r3J4r4  SSK5J6r6J7r7  SSK8J9r9J:r:  \1Rv                  " \<5      r= " S S\R|                  5      r? " S S\R|                  5      r@ " S S\R|                  5      rAS rB\" S5      SLS j5       rCS\R                  S \ES!\R                  4S" jrF   SMS#\R|                  S$\R                  S%\R                  S&\R                  S'\R                  S-  S(\G\E-  S)\GS-  S*\GS-  S!\H\R                  \R                  4   4S+ jjrI\" \C5       " S, S-\R|                  5      5       rJ\" \C5       " S. S/\R|                  5      5       rK " S0 S1\5      rL " S2 S3\5      rM " S4 S5\R|                  5      rN " S6 S7\R|                  5      rO\/ " S8 S9\*5      5       rPS:\R                  S-  S\R                  S;\ES-  S!\R                  4S< jrR " S= S>\P5      rS " S? S@\P5      rT\/ " SA SB\P5      5       rU\/ " SC SD\P5      5       rV " SE SF\P\5      rW\/ " SG SH\P5      5       rX\/ " SI SJ\P5      5       rY/ SKQrZg)N    )Callable)OptionalN   )initialization)ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)use_kernel_func_from_hubuse_kernelized_func)create_bidirectional_mask(create_bidirectional_sliding_window_maskcreate_causal_mask!create_sliding_window_causal_mask)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutputSequenceClassifierOutputTokenClassifierOutput)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuplelogging)maybe_autocastmerge_with_config_defaults)OutputRecordercapture_outputs   )T5GemmaConfigT5GemmaModuleConfigc                   J   ^  \ rS rSrS	S\S\4U 4S jjjrS rS rS r	Sr
U =r$ )
T5GemmaRMSNorm<   dimepsc                    > [         TU ]  5         X l        [        R                  " [
        R                  " U5      5      U l        g N)super__init__r.   nn	Parametertorchzerosweight)selfr-   r.   	__class__s      }/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/t5gemma/modeling_t5gemma.pyr2   T5GemmaRMSNorm.__init__=   s,    ll5;;s#34    c                     U[         R                  " UR                  S5      R                  SSS9U R                  -   5      -  $ )N   T)keepdim)r5   rsqrtpowmeanr.   )r8   xs     r:   _normT5GemmaRMSNorm._normB   s4    5;;quuQx}}R}>IJJJr<   c                     U R                  UR                  5       5      nUSU R                  R                  5       -   -  nUR                  U5      $ )N      ?)rE   floatr7   type_as)r8   rD   outputs      r:   forwardT5GemmaRMSNorm.forwardE   sC    AGGI& 3!2!2!445~~a  r<   c                 ^    [        U R                  R                  5       SU R                   3$ )Nz, eps=)tupler7   shaper.   r8   s    r:   
extra_reprT5GemmaRMSNorm.extra_reprL   s'    ))*+6$((<<r<   )r.   r7   )gư>)__name__
__module____qualname____firstlineno__intrI   r2   rE   rL   rR   __static_attributes____classcell__r9   s   @r:   r+   r+   <   s0    5C 5e 5 5
K!= =r<   r+   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )
T5GemmaMLPP   c                   > [         TU ]  5         Xl        UR                  U l        UR                  U l        [
        R                  " U R                  U R                  SS9U l        [
        R                  " U R                  U R                  SS9U l        [
        R                  " U R                  U R                  SS9U l	        [        UR                     U l        [
        R                  " UR                  5      U l        g )NFbias)r1   r2   confighidden_sizeintermediate_sizer3   Linear	gate_projup_proj	down_projr   hidden_activationact_fnDropoutdropout_ratedropoutr8   rb   r9   s     r:   r2   T5GemmaMLP.__init__Q   s    !--!'!9!94#3#3T5K5KRWXyy!1!143I3IPUV4#9#94;K;KRWXV556zz&"5"56r<   c                     U R                  U R                  U5      5      U R                  U5      -  nU R                  U5      nU R	                  U5      nU$ r0   )rj   rf   rg   rm   rh   )r8   rD   hidden_statesrh   s       r:   rL   T5GemmaMLP.forward\   sH    DNN1$56aH]3NN=1	r<   )rj   rb   rh   rm   rf   rc   rd   rg   )rT   rU   rV   rW   r2   rL   rY   rZ   r[   s   @r:   r]   r]   P   s    	7 r<   r]   c                      ^  \ rS rSr% \R
                  \S'   SS\4U 4S jjjr\	   SS\S-  S\
S   S\S-  S	\S
\4   4S jj5       r\R                  " 5       \S 5       5       rSrU =r$ )T5GemmaRotaryEmbeddingc   inv_freqNrb   c                   > [         TU ]  5         UR                  U l        UR                  U l        Xl        U R
                  R                  S   U l        U R                  nU R                  S:w  a  [        U R                     nU" U R
                  U5      u  o@l
        U R                  SUSS9  U R                  SUR                  5       SS9  g )N	rope_typedefaultrv   F)
persistentoriginal_inv_freq)r1   r2   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenrb   rope_parametersrx   compute_default_rope_parametersr   attention_scalingregister_bufferclone)r8   rb   devicerope_init_fnrv   r9   s        r:   r2   T5GemmaRotaryEmbedding.__init__f   s    "("@"@$*$B$B!44[A!%!E!E>>Y&.t~~>L+7V+L((ZeD0(..2BuUr<   r   ztorch.deviceseq_lenreturnztorch.Tensorc           	         U R                   S   n[        U SS5      =(       d    U R                  U R                  -  nSnSU[        R
                  " SUS[        R                  S9R                  U[        R                  S9U-  -  -  nXe4$ )	aH  
Computes the inverse frequencies according to the original RoPE implementation
Args:
    config ([`~transformers.PreTrainedConfig`]):
        The model configuration.
    device (`torch.device`):
        The device to use for initialization of the inverse frequencies.
    seq_len (`int`, *optional*):
        The current sequence length. Unused for this type of RoPE.
Returns:
    Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
    post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).

rope_thetahead_dimNrH   r   r>   dtyper   r   )	r   getattrrc   num_attention_headsr5   arangeint64torI   )rb   r   r   baser-   attention_factorrv   s          r:   r   6T5GemmaRotaryEmbedding.compute_default_rope_parametersv   s    & %%l3fj$/c63E3EIcIc3c U\\!S!5;;?BB&X]XcXcBdgjjk
 ))r<   c                 L   U R                   S S S 2S 4   R                  5       R                  UR                  S   SS5      R	                  UR
                  5      nUS S 2S S S 24   R                  5       n[        UR
                  R                  [        5      (       a0  UR
                  R                  S:w  a  UR
                  R                  OSn[        USS9   UR                  5       UR                  5       -  R                  SS5      n[        R                  " Xf4SS	9nUR                  5       U R                  -  nUR                  5       U R                  -  n	S S S 5        WR	                  UR                   S
9W	R	                  UR                   S
94$ ! , (       d  f       N@= f)Nr   r?   r'   mpscpuF)device_typeenabledr>   r-   r   )rv   rI   expandrP   r   r   
isinstancetypestrr#   	transposer5   catcosr   sinr   )
r8   rD   position_idsinv_freq_expandedposition_ids_expandedr   freqsembr   r   s
             r:   rL   T5GemmaRotaryEmbedding.forward   sN    !MM$4-8>>@GGHZHZ[\H]_acdehhijiqiqr ,QaZ 8 > > @'1!((--'E'E!((--[`J`ahhmmfkUC&,,.1F1L1L1NNYYZ[]^_E))UN3C'')d444C'')d444C	 D vvAGGv$cff177f&;;; DCs   BF
F#)r   rb   r}   r~   rx   r0   NNN)rT   rU   rV   rW   r5   Tensor__annotations__r(   r2   staticmethodr   rX   rO   rI   r   no_gradr   rL   rY   rZ   r[   s   @r:   rt   rt   c   s    llV} V V  '++/"*$*(* t* 
~u$	%	* *: ]]_<  <r<   rt   c                     U SSU R                   S   S-  24   nU SU R                   S   S-  S24   n[        R                  " U* U4SS9$ )z*Rotates half the hidden dims of the input..Nr?   r>   r   )rP   r5   r   )rD   x1x2s      r:   rotate_halfr      sZ    	
3"!''"+"""	#B	
3q ""	#B99rc2YB''r<   rotary_pos_embc                     UR                  U5      nUR                  U5      nX-  [        U 5      U-  -   nX-  [        U5      U-  -   nXV4$ )aI  Applies Rotary Position Embedding to the query and key tensors.

Args:
    q (`torch.Tensor`): The query tensor.
    k (`torch.Tensor`): The key tensor.
    cos (`torch.Tensor`): The cosine part of the rotary embedding.
    sin (`torch.Tensor`): The sine part of the rotary embedding.
    unsqueeze_dim (`int`, *optional*, defaults to 1):
        The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
        sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
        that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
        k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
        cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
        the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
Returns:
    `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
)	unsqueezer   )qkr   r   unsqueeze_dimq_embedk_embeds          r:   apply_rotary_pos_embr      sS    & --
&C
--
&Cw;q>C/0Gw;q>C/0Gr<   rq   n_repr   c                     U R                   u  p#pEUS:X  a  U $ U SS2SS2SSS2SS24   R                  X#XU5      n U R                  X#U-  XE5      $ )z
This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
r'   N)rP   r   reshape)rq   r   batchnum_key_value_headsslenr   s         r:   	repeat_kvr      s_    
 2?1D1D.Ez!!Qa"23::5W\dlmM  e(CTTTr<   modulequerykeyvalueattention_maskrm   scalingsoftcapc                 j   Uc  U R                   S-  n[        X R                  5      n	[        X0R                  5      n
[        R                  " XR                  SS5      5      U-  nUb  X-  n[        R                  " U5      nX-  nUb  X-   n[        R                  R                  US[        R                  S9R                  UR                  5      n[        R                  R                  XU R                  S9n[        R                  " X5      nUR                  SS5      R                  5       nX4$ )N      r>   r   r?   )r-   r   )ptrainingr'   )r   r   num_key_value_groupsr5   matmulr   tanhr3   
functionalsoftmaxfloat32r   r   rm   r   
contiguous)r   r   r   r   r   rm   r   r   kwargs
key_statesvalue_statesattn_weightsattn_outputs                r:   eager_attention_forwardr      s    //4'3 ; ;<JU$?$?@L<<';';Aq'ABWLL#-zz,/#-!#4 ==((2U]](SVVW\WbWbcL==((6??([L,,|:K''1-88:K$$r<   c                   0  ^  \ rS rSrSrS\S\4U 4S jjr   SS\R                  S\
\R                  \R                  4   S-  S	\R                  S-  S
\S-  S\\   S\
\R                  \R                  S-  \
\R                     S-  4   4S jjrSrU =r$ )T5GemmaSelfAttention   =Multi-headed attention from 'Attention Is All You Need' paperrb   	layer_idxc                 N  > [         TU ]  5         [        US5      (       a  UR                  U   OS U l        Xl        X l        [        USUR                  UR                  -  5      U l
        UR                  UR                  -  U l        UR                  S-  U l        U R
                  R                  U l        UR                   U l        [$        R&                  " UR                  UR                  U R                  -  UR(                  S9U l        [$        R&                  " UR                  UR                  U R                  -  UR(                  S9U l        [$        R&                  " UR                  UR                  U R                  -  UR(                  S9U l        [$        R&                  " UR                  U R                  -  UR                  UR(                  S9U l        U R
                  R2                  U l        U R                  S:X  a  UR4                  U l        g S U l        g )Nlayer_typesr   r   r`   sliding_attention)r1   r2   hasattrr   
layer_typerb   r   r   rc   r   r   r   r   query_pre_attn_scalarr   attention_dropout
is_decoder	is_causalr3   re   attention_biasq_projk_projv_projo_projattn_logit_softcappingsliding_windowr8   rb   r   r9   s      r:   r2   T5GemmaSelfAttention.__init__   s   ;B6=;Y;Y&,,Y7_c"
F4F4F&JdJd4de$*$>$>&B\B\$\!33T9!%!>!>**ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii&&68J8JQWQfQf
 '+kk&H&H#7;J]7]f33cgr<   Nrq   position_embeddingsr   past_key_valuesr   r   c                 4   UR                   S S n/ UQSPU R                  P7nU R                  U5      R                  U5      R	                  SS5      nU R                  U5      R                  U5      R	                  SS5      n	U R                  U5      R                  U5      R	                  SS5      n
Uu  p[        XX5      u  pUb  UR                  XU R                  5      u  p[        R                  " U R                  R                  [        5      nU" U UU	U
U4U R                  (       a  U R                   OSU R"                  U R$                  U R&                  S.UD6u  pUR(                  " / UQSP76 R+                  5       nU R-                  U5      nX4$ )Nr?   r'   r>           rm   r   r   r   )rP   r   r   viewr   r   r   r   updater   r   get_interfacerb   _attn_implementationr   r   r   r   r   r   r   r   r   )r8   rq   r   r   r   r   input_shapehidden_shapequery_statesr   r   r   r   attention_interfacer   r   s                   r:   rL   T5GemmaSelfAttention.forward  s    $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST&#7RU#[ &'6'='=jX\XfXf'g$J(?(M(MKK,,.E)
 %8%
 /3mmD**LL..//%
 %
! "));;;;FFHkk+.((r<   )r   r   rb   r   r   r   r   r   r   r   r   r   r   r   r   )rT   rU   rV   rW   __doc__r)   rX   r2   r5   r   rO   r   r   r   rL   rY   rZ   r[   s   @r:   r   r      s    Gh2 hs h< IM.2(,()||() #5<<#=>E() t+	()
 () -.() 
u||U\\D0%2E2LL	M() ()r<   r   c                     ^  \ rS rSrSrS\S\4U 4S jjr SS\R                  S\R                  S-  S	\R                  S-  S
\
S-  S\\   S\\R                  \R                  S-  \\R                     S-  4   4S jjrSrU =r$ )T5GemmaCrossAttentioni=  r   rb   r   c                   > [         TU ]  5         Xl        X l        [	        USUR
                  UR                  -  5      U l        UR                  UR                  -  U l	        UR                  S-  U l        U R                  R                  U l        SU l        [        R                  " UR
                  UR                  U R                  -  UR                   S9U l        [        R                  " UR$                  UR                  U R                  -  UR                   S9U l        [        R                  " UR$                  UR                  U R                  -  UR                   S9U l        [        R                  " UR                  U R                  -  UR
                  UR                   S9U l        U R                  R,                  U l        UR$                  c  [/        S5      eg )Nr   r   Fr`   zBCross-attention needs cross_attention_hidden_size to be specified.)r1   r2   rb   r   r   rc   r   r   r   r   r   r   r   r   r3   re   r   r   cross_attention_hidden_sizer   r   r   r   
ValueErrorr   s      r:   r2   T5GemmaCrossAttention.__init__A  s   "
F4F4F&JdJd4de$*$>$>&B\B\$\!33T9!%!>!>ii : :T]] JQWQfQf
 ii..0J0JT]]0Zagavav
 ii..0J0JT]]0Zagavav
 ii&&68J8JQWQfQf
 '+kk&H&H#--5abb 6r<   Nrq   r   encoder_hidden_statesr   r   r   c                    Uc  [        S5      eUR                  S S n/ UQSPU R                  P7nU R                  U5      R	                  U5      R                  SS5      nUb1  UR                  R                  U R                  5      n	UR                  n
Ub  W	(       d  UR                  S S n/ UQSPU R                  P7nU R                  U5      R	                  U5      R                  SS5      nU R                  U5      R	                  U5      R                  SS5      nUb7  W
R                  XU R                  5      u  pSUR                  U R                  '   OFW
R                  U R                     R                  nU
R                  U R                     R                  n[         R"                  " U R$                  R&                  [(        5      nU" U UUUU4U R*                  (       a  U R,                  OSU R.                  S U R0                  S.UD6u  nnUR2                  " / UQSP76 R5                  5       nU R7                  U5      nUU4$ )Nz5Encoder hidden state is required for cross attention.r?   r'   r>   Tr   r   )r  rP   r   r   r   r   
is_updatedgetr   cross_attention_cacher   r   r   layerskeysvaluesr   r   rb   r   r   r   r   r   r   r   r   r   )r8   rq   r   r  r   r   r   r   r   r
  curr_past_key_valuesencoder_input_shapeencoder_hidden_shaper   r   r   r   r   s                     r:   rL   T5GemmaCrossAttention.forward]  s9    !(TUU#))#2.88b8$--8{{=166|DNNqRST&(3377GJ#2#H#H "*"7"="=cr"B#L%8#L"#Ldmm#L %:;@@AUV``abdefJ;;'<=BBCWXbbcdfghL*+?+F+Fzaeaoao+p(
=A**4>>:-44T^^DIIJ/66t~~FMML(?(M(MKK,,.E)
 %8%
 /3mmD**LL//%
 %
!\ "));;;;FFHkk+.L((r<   )r   r   rb   r   r   r   r   r   r   r   r   r   r0   )rT   rU   rV   rW   r  r)   rX   r2   r5   r   r   r   r   rO   rL   rY   rZ   r[   s   @r:   r  r  =  s    Gc2 cs cB )-3)||3) t+3)  %||d2	3)
 3) -.3) 
u||U\\D0%2E2LL	M3) 3)r<   r  c                      ^  \ rS rSrSrS\4U 4S jjr   SS\R                  S\	\R                  \R                  4   S-  S\R                  S-  S	\R                  S-  S
\	\R                  4   4
S jjrSrU =r$ )T5GemmaEncoderLayeri  zEncoder sub-layer.r   c                 $  > [         TU ]  5         UR                  U l        Xl        X l        UR
                  U   U l        [        UUS9U l        [        UR                  UR                  S9U l        [        UR                  UR                  S9U l        [        U5      U l        [        UR                  UR                  S9U l        [        UR                  UR                  S9U l        ["        R$                  " UR&                  5      U l        g N)rb   r   r.   )r1   r2   rc   rb   r   r   attention_typer   	self_attnr+   rms_norm_epspre_self_attn_layernormpost_self_attn_layernormr]   mlppre_feedforward_layernormpost_feedforward_layernormr3   rk   rl   rm   r   s      r:   r2   T5GemmaEncoderLayer.__init__  s    !--"$00;-
 (6f6H6HfNaNa'b$(6v7I7IvObOb(c%f%)78J8JPVPcPc)d&*89K9KQWQdQd*e'zz&"5"56r<   Nrq   r   r   r   r   c           	      8   UnU R                  U5      nU R                  " SUUUUS S.UD6u  pU R                  U5      nX`R                  U5      -   nUnU R	                  U5      nU R                  U5      nU R                  U5      nX`R                  U5      -   nU$ )N)rq   r   r   r   r    )r  r  r  rm   r  r  r   )r8   rq   r   r   r   r   residual_s           r:   rL   T5GemmaEncoderLayer.forward  s     !44]C>> 
' 3)% 
 
 55mD <<#>> 66}E/77F <<#>>r<   )r  rb   rm   rc   r   r  r   r  r  r  r  r   )rT   rU   rV   rW   r  rX   r2   r5   r   rO   
LongTensorFloatTensorrL   rY   rZ   r[   s   @r:   r  r    s    7# 7. IM.204|| #5<<#=>E t+	
 &&- 
u  !	" r<   r  c                   H  ^  \ rS rSrSrS\4U 4S jjr       SS\R                  S\	\R                  \R                  4   S-  S\R                  S-  S	\R                  S-  S
\S-  S\S-  S\R                  S-  S\R                  S-  S\R                  4S jjrSrU =r$ )T5GemmaDecoderLayeri  z2Decoder sub-layer: an extra cross-attention layer.r   c                   > [         TU ]  5         UR                  U l        Xl        X l        UR
                  U   U l        [        UUS9U l        [        UR                  UR                  S9U l        [        UR                  UR                  S9U l        [        U5      U l        [        UR                  UR                  S9U l        [        UR                  UR                  S9U l        ["        R$                  " UR&                  5      U l        [+        XS9U l        [        UR                  UR                  S9U l        [        UR                  UR                  S9U l        g r  )r1   r2   rc   rb   r   r   r  r   r  r+   r  r  r  r]   r  r  r   r3   rk   rl   rm   r  
cross_attnpre_cross_attn_layernormpost_cross_attn_layernormr   s      r:   r2   T5GemmaDecoderLayer.__init__  s    !--"$00;-
 (6f6H6HfNaNa'b$(6v7I7IvObOb(c%f%)78J8JPVPcPc)d&*89K9KQWQdQd*e'zz&"5"56/vS(6v7I7IvObOb(c%)78J8JPVPcPc)d&r<   Nrq   r   r   r   r   	use_cacher  encoder_attention_maskr   c	           
         Un
U R                  U5      nU R                  " SUUUUUb  UR                  OS US.U	D6u  pU R                  U5      nXR	                  U5      -   nUn
U R                  U5      nU R                  " SUUUUUS.U	D6u  pU R                  U5      nXR	                  U5      -   nUn
U R                  U5      nU R                  U5      nU R                  U5      nXR	                  U5      -   nU$ )N)rq   r   r   r   r   r0  )rq   r  r   r   r0  r#  )r  r  self_attention_cacher  rm   r-  r,  r.  r  r  r   )r8   rq   r   r   r   r   r0  r  r1  r   r$  r%  s               r:   rL   T5GemmaDecoderLayer.forward  s-    !44]C>> 
' 3)%DSD_O@@ei
 
 55mD <<#>> 55mD?? 
'"71+
 
 66}E <<#>> 66}E/77F <<#>>r<   )r  rb   r,  rm   rc   r   r  r.  r   r  r-  r  r  r  )NNNNFNN)rT   rU   rV   rW   r  rX   r2   r5   r   rO   r'  r
   boolr(  rL   rY   rZ   r[   s   @r:   r*  r*    s    <e# e4 IM.2046:!&596:,||, #5<<#=>E, t+	,
 &&-, -t3, $;,  %||d2, !&t 3, 
		, ,r<   r*  c                   z   ^  \ rS rSrSrSS\S\S\4U 4S jjjrS\R                  S\R                  4S	 jr
S
rU =r$ )T5GemmaClassificationHeadi  z-Head for sentence-level classification tasks.rc   
num_labelsclassifier_dropout_ratec                    > [         TU ]  5         [        R                  " US9U l        [        R
                  " X5      U l        g )N)r   )r1   r2   r3   rk   rm   re   out_proj)r8   rc   r8  r9  r9   s       r:   r2   "T5GemmaClassificationHead.__init__  s/    zz$;<		+:r<   rq   r   c                 J    U R                  U5      nU R                  U5      nU$ r0   rm   r;  )r8   rq   s     r:   rL   !T5GemmaClassificationHead.forward  s$    ]3m4r<   r>  )r   )rT   rU   rV   rW   r  rX   rI   r2   r5   r   rL   rY   rZ   r[   s   @r:   r7  r7    sF    7;C ;S ;SX ; ;
U\\ ell  r<   r7  c                   z   ^  \ rS rSrSrSS\S\S\4U 4S jjjrS\R                  S\R                  4S	 jr
S
rU =r$ )T5GemmaLMHeadi  z.Head for language modeling (generation) tasks.rc   
vocab_sizera   c                 V   > [         TU ]  5         [        R                  " XUS9U l        g )Nr`   )r1   r2   r3   re   r;  )r8   rc   rB  ra   r9   s       r:   r2   T5GemmaLMHead.__init__!  s     		+Er<   rq   r   c                 (    U R                  U5      nU$ r0   r;  )r8   rq   logitss      r:   rL   T5GemmaLMHead.forward%  s    }-r<   rF  )F)rT   rU   rV   rW   r  rX   r5  r2   r5   r   rL   rY   rZ   r[   s   @r:   rA  rA    sJ    8FC FS F F FU\\ ell  r<   rA  c            	          ^  \ rS rSr% \\S'   SrSrSS/rS/r	Sr
SrSrSrSr\\" \SS	S
9\" \SSS
9\" \SSS
9/S.r\R*                  " 5       U 4S j5       rS rSrU =r$ )T5GemmaPreTrainedModeli*  rb   modelTr  r*  r   r'   r  )index
layer_namer,  )rq   
attentionsc                 Z  > [         TU ]  U5        U R                  R                  n[	        U[
        5      (       a  UR                  R                  R                  S   S-  n[        R                  " UR                  R                  SX#-  S9  [        UR                  S5      (       aC  UR                  R                  b+  [        R                  " UR                  R                  5        g g g [	        U[        5      (       ao  U R                  R                  (       dS  UR                  R                  R                  S   S-  n[        R                  " UR                  R                  SX#-  S9  g g SUR                   R"                  ;   a!  [        R                  " UR                  5        g g )Nr   r   r   )rC   stdra   RMSNorm)r1   _init_weightsrb   initializer_ranger   r7  r;  r7   rP   initnormal_r   ra   zeros_rA  tie_word_embeddingsr9   rT   )r8   r   rP  scaler9   s       r:   rR  $T5GemmaPreTrainedModel._init_weights@  s.    	f%kk++f788OO**003t;ELL//cs{Kv//FOO4H4H4TFOO001 5U/..;;22..44Q74?V__33#3;O 3 &**333KK& 4r<   c                 b   U R                   R                  R                  nU R                   R                  R                  nUc  [	        S5      eUR                  UR                  5      nUSSS24   R                  5       USSS24'   X$S'   Uc  [	        S5      eUR                  US:H  U5        U$ )	z
Shifts input_ids to the right, prepends the decoder_start_token_id, and handles
pad_token_id replacement for labels that were -100.
This is a common preparation step for decoder inputs in sequence-to-sequence models.
Nz:self.model.config.decoder.bos_token_id has to be defined. .r?   r'   ).r   z9self.model.config.decoder.pad_token_id has to be defined.i)	rb   decoderbos_token_idpad_token_idr  	new_zerosrP   r   masked_fill_)r8   	input_idsdecoder_start_token_idr]  shifted_input_idss        r:   _shift_right#T5GemmaPreTrainedModel._shift_rightR  s     "&!4!4!A!A{{**77!)YZZ &//	@%.sCRCx%8%>%>%@#qr'"$:&!XYY 	&&'8D'@,O  r<   r#  )rT   rU   rV   rW   r(   r   base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraph_supports_attention_backendr*  r%   r   r  _can_record_outputsr5   r   rR  rc  rY   rZ   r[   s   @r:   rJ  rJ  *  s    &*#.0EF#4"5N!"&,/q[Q/q\R0lS
 ]]_' '"! !r<   rJ  	token_idsr]  c                    U b<  Uc  [        S5      eX:g  R                  UR                  [        R                  5      nU$ [        R
                  " UR                  S   UR                  S   4UR                  [        R                  S9nU$ )z%Construct the default attention mask.z3`pad_token_id` is required for padding information.r   r'   r   )r  r   r   r5   longonesrP   )ro  rq   r]  r   s       r:   make_default_2d_attention_maskrs  m  s     RSS#3778L8LejjY
    #]%8%8%;<]EYEYafakak
 r<   c                      ^  \ rS rSr\\S.rU 4S jr\\	    SS\
R                  S-  S\
R                  S-  S\
R                  S-  S\
R                  S-  S	\\   S
\\-  4S jj5       5       rSrU =r$ )T5GemmaEncoderi~  )rN  rq   c           	      R  > [         TU ]  U5        UR                  U l        UR                  U l        [
        R                  " UR                  UR                  U R                  5      U l        [        UR                  UR                  S9U l        SU l        [
        R                  " [        UR                  5       Vs/ s H  n[!        X5      PM     sn5      U l        [
        R$                  " UR&                  5      U l        [+        US9U l        U R/                  5         g s  snf Nr  Frb   )r1   r2   r]  padding_idxrB  r3   	Embeddingrc   embed_tokensr+   r  normgradient_checkpointing
ModuleListrangenum_hidden_layersr  r  rk   rl   rm   rt   
rotary_emb	post_initr   s      r:   r2   T5GemmaEncoder.__init__       !.. ++LL):):F<N<NPTP`P`a"6#5#56;N;NO	&+#mmEJ6KcKcEdeEd	 3Ede
 zz&"5"560? 	 f   ?D$Nr`  r   r   inputs_embedsr   r   c                    US L US L-  (       a  [        S5      eUR                  SS 5        Uc  U R                  U5      nUc=  [        R                  " UR
                  S   UR                  S9nUR                  S5      nUc   [        XU R                  R                  5      n[        U=n[        5      (       d'  U R                  UUS.n[        S0 UD6[        S0 UD6S.nUn[        R                  " U R                  R                   S-  UR"                  S	9n	X-  nU R%                  U5      nU R'                  X5      n
[)        U R*                  S U R                  R,                   5       H*  u  pU" UU
X`R                  R.                  U      U40 UD6nM,     U R1                  U5      nU R%                  U5      n[3        US
9$ )N:You must specify exactly one of input_ids or inputs_embedsr   r'   r   r   )rb   r  r   full_attentionr         ?r   )last_hidden_stater#  )r  popr{  r5   r   rP   r   r   rs  rb   r]  r   dictr   r   tensorrc   r   rm   r  	enumerater  r  r   r|  r   )r8   r`  r   r   r  r   self_attn_mask_mappingmask_kwargsrq   
normalizerr   ilayer_modules                r:   rL   T5GemmaEncoder.forward  s    -t";<YZZ 	

$d+  --i8M <<(;(;A(>}G[G[\L'11!4L!;IVZVaVaVnVnoNNB0DII++!."0K #<"Jk"J%M%\P[%\&"
 &\\$++"9"93">mFYFYZ
%2]3"oomJ(5Tt{{7T7T)UVOA(#&{{'>'>q'AB	
 M  W 		-0]3+
 	
r<   rm   r{  r}  r  r|  ry  r  rB  NNNN)rT   rU   rV   rW   r   r  rn  r2   r$   r&   r5   r'  r   r(  r   r   rO   r   rL   rY   rZ   r[   s   @r:   ru  ru  ~  s    *,
$   .2.204266
##d*6
 t+6
 &&-	6

 ((4/6
 +,6
 
	 6
   6
r<   ru  c                   V  ^  \ rS rSr\" \SS9\" \SS9\S.rU 4S jr	\
\        SS\R                  S-  S\R                  S-  S	\R                  S-  S
\S-  S\R                   S-  S\S-  S\R                  S-  S\R                  S-  S\\   S\\-  4S jj5       5       rSrU =r$ )T5GemmaDecoderi  r'   )rL  )rN  cross_attentionsrq   c           	      R  > [         TU ]  U5        UR                  U l        UR                  U l        [
        R                  " UR                  UR                  U R                  5      U l        [        UR                  UR                  S9U l        SU l        [
        R                  " [        UR                  5       Vs/ s H  n[!        X5      PM     sn5      U l        [
        R$                  " UR&                  5      U l        [+        US9U l        U R/                  5         g s  snf rw  )r1   r2   r]  ry  rB  r3   rz  rc   r{  r+   r  r|  r}  r~  r  r  r*  r  rk   rl   rm   rt   r  r  r   s      r:   r2   T5GemmaDecoder.__init__  r  r  Nr`  r   r   r   r  r0  r  r1  r   r   c	                    US L US L-  (       a  [        S5      eUc  [        S5      eUc  U R                  U5      nU R                  (       d/  U(       a(  Uc%  [        [	        U R
                  S9[	        5       5      nUcU  Ub  UR                  5       OSn
[        R                  " UR                  S   UR                  S9U
-   nUR                  S5      nUc#  Uc   [        XU R
                  R                  5      n[        U=n[        5      (       d8  U R
                  UUUb  UR                   OS US.n[#        S0 UD6[%        S0 UD6S.n[        U=n[        5      (       d  S	['        U R
                  UUUS
90nUn[        R(                  " U R
                  R*                  S-  UR,                  S9nX-  nU R/                  U5      nU R1                  X5      n[3        U R4                  S U R
                  R6                   5       H2  u  nnU" UUXR
                  R8                  U      UUUUUS	   40 U	D6nM4     U R;                  U5      nU R/                  U5      n[=        UUS9$ )Nr  z0`encoder_hidden_states` must be given in decoderrx  r   r'   r  )rb   r  r   r   r   r  r  )rb   r  r   r  r  r   )r  r   r#  )r  r{  r   r
   r	   rb   get_seq_lengthr5   r   rP   r   r   rs  r]  r   r  r3  r   r   r   r  rc   r   rm   r  r  r  r  r   r|  r   )r8   r`  r   r   r   r  r0  r  r1  r   past_seen_tokensr  r  cross_attn_mask_mappingrq   r  r   r  r  s                      r:   rL   T5GemmaDecoder.forward  so    -t";<YZZ (OPP  --i8M}}/F 2,dkk2RT`TbcOCRC^==?de <<(;(;A(>}G[G[\_ooL'11!4L!o&=;IVZVaVaVnVnoNNB0DII++!."0KZKf?#G#Glp ,K #5"C{"C%F%U%U&"
 5KK1TRR ";;;"/#9*?	#'# &\\$++"9"93">mFYFYZ
%2]3"oomJ(5Tt{{7T7T)UVOA|(#&{{'>'>q'AB%'(89
 
M  W 		-0]38++
 	
r<   r  )NNNNNNNN)rT   rU   rV   rW   r%   r   r  r*  rn  r2   r$   r&   r5   r'  r   r
   r(  r5  r   r   rO   r   rL   rY   rZ   r[   s   @r:   r  r    s   $%9C*+@J,$   .2.2046:26!%596:P
##d*P
 t+P
 &&-	P

 -t3P
 ((4/P
 $;P
  %||d2P
 !&t 3P
 +,P
 
:	:P
   P
r<   r  c                     ^  \ rS rSrS\4U 4S jjrS rS r\\	           SS\
R                  S-  S\
R                  S-  S	\
R                  S-  S
\
R                  S-  S\
R                  S-  S\
R                  S-  S\S-  S\S-  S\
R                   S-  S\
R                   S-  S\S-  S\\   S\4S jj5       5       rSrU =r$ )T5GemmaModeli?  rb   c                    > [         TU ]  U5        UR                  (       d  [        S5      e[	        UR
                  5      U l        [        UR                  5      U l        U R                  5         g )NzVT5GemmaModel only support encoder-decoder modeling. Use `T5GemmaEncoderModel` instead.)	r1   r2   is_encoder_decoderr  ru  encoderr  r[  r  rn   s     r:   r2   T5GemmaModel.__init__A  sO     ((uvv%fnn5%fnn5r<   c                 6    U R                   R                  5       $ r0   r  get_input_embeddingsrQ   s    r:   r  !T5GemmaModel.get_input_embeddingsL      ||0022r<   c                 8    U R                   R                  U5      $ r0   r  set_input_embeddingsr8   new_embeddingss     r:   r  !T5GemmaModel.set_input_embeddingsO      ||00@@r<   Nr`  r   r   decoder_input_idsdecoder_attention_maskdecoder_position_idsencoder_outputsr   r  decoder_inputs_embedsr0  r   r   c                    Uc  U R                   " SUUUU	S.UD6nUR                  nU R                  " SUUUU
UUUUS.UD6n[        UR                  UR                  UR                  SS5      (       a  UR                  OUR                  4UR                  UR                  UR                  UR                  UR                  S9$ )a8  
decoder_position_ids (`torch.LongTensor` of shape `(batch_size, decoder_sequence_length)`, *optional*):
    Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the range `[0,
    config.decoder.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)
r`  r   r   r  )r`  r   r   r  r   r  r1  r0  output_hidden_statesF)r  r   decoder_hidden_statesdecoder_attentionsr  encoder_last_hidden_stater  encoder_attentionsr#  )	r  r  r[  r   r   r  rq   rN  r  )r8   r`  r   r   r  r  r  r  r   r  r  r0  r   r  decoder_outputss                  r:   rL   T5GemmaModel.forwardR  s    , ""ll #-)+	
 O !0 A A,, 

'1-/+"7#1

 

 "-??+;;zz0%88 #2"?"?!335.99,==&5&G&G"1"?"?.99
 	
r<   )r[  r  )NNNNNNNNNNN)rT   rU   rV   rW   r(   r2   r  r  r!   r    r5   r'  r(  
BoolTensorr   r
   r   r5  r   r   r   rL   rY   rZ   r[   s   @r:   r  r  ?  sA   	} 	3A  .2370459:>8<266:-159!%6
##d*6
 ))D06
 &&-	6

 !++d26
 !& 0 04 76
 $..56
 )4/6
 -t36
 ||d*6
  %||d26
 $;6
 +,6
 
6
  6
r<   r  c                      ^  \ rS rSrS\4U 4S jjrS rS r\\	    SS\
R                  S-  S\
R                  S-  S	\
R                  S-  S
\
R                  S-  S\\   S\4S jj5       5       rSrU =r$ )T5GemmaEncoderModeli  rb   c                    > [         TU ]  U5        UR                  (       a  [        S5      e[	        UR
                  5      U l        U R                  5         g )NzQT5GemmaEncoderModel only supports encoder-only model. Use `T5GemmaModel` instead.)r1   r2   r  r  ru  r  r  rn   s     r:   r2   T5GemmaEncoderModel.__init__  s?     $$pqq%fnn5r<   c                 6    U R                   R                  5       $ r0   r  rQ   s    r:   r  (T5GemmaEncoderModel.get_input_embeddings  r  r<   c                 8    U R                   R                  U5      $ r0   r  r  s     r:   r  (T5GemmaEncoderModel.set_input_embeddings  r  r<   Nr`  r   r   r  r   r   c                 4    U R                   " SUUUUS.UD6nU$ )Nr  r#  r  )r8   r`  r   r   r  r   r  s          r:   rL   T5GemmaEncoderModel.forward  s5     ,, 
)%'	

 
 r<   r  r  )rT   rU   rV   rW   r(   r2   r  r  r!   r    r5   r'  r(  r   r   r   r   rL   rY   rZ   r[   s   @r:   r  r    s    } 3A  .23704-1##d* ))D0 &&-	
 ||d* +, 
  r<   r  c            "       2  ^  \ rS rSrSS0rSS0rSS/S/40rS\4U 4S	 jjrS
 r	S r
\\             SS\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\R                   S-  S\R                  S-  S\S-  S\S-  S\R                  S-  S\R                  S-  S\R                  S-  S\S-  S\\R*                  -  S\\   S\\R                     \-  4S jj5       5       rS\R*                  4S jrSrU =r$ ) T5GemmaForConditionalGenerationi  zlm_head.out_proj.weightz!model.decoder.embed_tokens.weightzlm_head.out_projcolwise_gather_outputrq   rG  rb   c                   > SUl         [        TU ]	  U5        [        U5      U l        UR
                  R                  U l        [        UR
                  R                  U R                  5      U l	        SU l
        U R                  5         g )NTForMaskedLM)r  r1   r2   r  rK  r[  rB  rA  rc   lm_head	loss_typer  rn   s     r:   r2   (T5GemmaForConditionalGeneration.__init__  sb    $(! !&)
 ..33$V^^%?%?Q&r<   c                 2   XR                   l        U R                  R                  (       al  UR                  U R
                  R                  R                  l        UR                  R                  S   U R
                  R                  R                  l	        g g )Nr   )
r  r;  rb   rW  r7   rK  r[  r{  rP   num_embeddingsr  s     r:   set_output_embeddings5T5GemmaForConditionalGeneration.set_output_embeddings  sh     . ;;**5C5J5JDJJ++2=K=R=R=X=XYZ=[DJJ++: +r<   c                 .    U R                   R                  $ r0   )r  r;  rQ   s    r:   get_output_embeddings5T5GemmaForConditionalGeneration.get_output_embeddings  s    ||$$$r<   Nr`  r   r   r  r  r  r  r   r  r  labelsr0  logits_to_keepr   r   c                    Ub  Uc  U
c  U R                  U5      nU R                  " SUUUUUUUUU	U
US.UD6nUR                  n[        U[        5      (       a  [        U* S5      OUnU R                  USS2USS24   5      nU R                  5       R                  nUR                  b4  UUR                  -  n[        R                  " U5      nUUR                  -  nSnUb  U R                  " UXR                  40 UD6n[        UUUR                  UR                   UR"                  UR$                  UR&                  UR(                  UR*                  S9	$ )a  
decoder_position_ids (`torch.LongTensor` of shape `(batch_size, decoder_sequence_length)`, *optional*):
    Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the range `[0,
    config.decoder.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
N)r`  r   r   r  r  r  r  r   r  r  r0  )	lossrG  r   r  r  r  r  r  r  r#  )rc  rK  r  r   rX   slicer  get_decoderrb   final_logit_softcappingr5   r   loss_functionrB  r   r   r  r  r  r  r  r  )r8   r`  r   r   r  r  r  r  r   r  r  r  r0  r  r   r  rq   slice_indicesrG  decoder_configr  s                        r:   rL   'T5GemmaForConditionalGeneration.forward  sp   : "3";@U@] $ 1 1& 9.2jj /
)%/#9!5++'"7/
 /
 (998B>SV8W8W~ot4]kmA}a,?@A))+2211=nDDDFZZ'FnDDDF%%ffooPPD+;;"1"G"G.AA,==&5&O&O"1"G"G.AA

 
	
r<   c                 $    U R                  U5      $ r0   )rc  )r8   r  s     r:   %prepare_decoder_input_ids_from_labelsET5GemmaForConditionalGeneration.prepare_decoder_input_ids_from_labels  s      ((r<   )r  r  rK  rB  )NNNNNNNNNNNNr   )rT   rU   rV   rW   _tied_weights_keys_tp_plan_pp_planr(   r2   r  r  r!   r    r5   r'  r(  r  r   r
   r5  rX   r   r   r   rO   r   rL   r  rY   rZ   r[   s   @r:   r  r    s   35XY"$;<H"o%6
$CDH	} 	\%  .2370459:>8<266:26:>*.!%-.G
##d*G
 ))D0G
 &&-	G

 !++d2G
 !& 0 04 7G
 $..5G
 )4/G
 -t3G
 ((4/G
  %0047G
   4'G
 $;G
 ell*G
 +,G
  
u  	!O	3!G
  G
R)ELL ) )r<   r  c                     ^  \ rS rSrSS\S\S-  4U 4S jjjrS rS r\	\
          SS\R                  S-  S	\R                  S-  S
\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\S-  S\R                  S-  S\R                  S-  S\R                  S-  S\\   S\4S jj5       5       rSrU =r$ ) T5GemmaForSequenceClassificationi  Nrb   r  c                   > Ub  X!l         [        TU ]	  U5        UR                  U l        UR                   (       a  [	        U5      U l        O[        U5      U l        UR                  R                  nUR                   (       a  UR                  R                  n[        USS5      n[        X0R                  U5      U l        U R                  5         g)z
is_encoder_decoder (`Optional`, *optional*):
    Whether use encoder_decoder for sequence classification. When set to False, only encoder is used.
Nr9  皙?r  r1   r2   r8  r  rK  r  r  rc   r[  r   r7  scorer  r8   rb   r  rc   classifier_dropoutr9   s        r:   r2   )T5GemmaForSequenceClassification.__init__  s    
 )(:%  ++$$%f-DJ,V4DJnn00$$ ..44K$V-FL.{OOM_`
r<   c                 6    U R                   R                  5       $ r0   rK  r  rQ   s    r:   r  5T5GemmaForSequenceClassification.get_input_embeddings6      zz..00r<   c                 :    U R                   R                  U5        g r0   rK  r  r8   r   s     r:   r  5T5GemmaForSequenceClassification.set_input_embeddings9      

''.r<   r`  r   r   r  r  r  r  r  r  r  r   r   c                    U R                   R                  (       a)  Uc&  Ub#  [        SU R                  R                   S35      eU R                   R                  (       a%  Uc"  U	c  Uc  [        S5      eU R                  U5      nU R                   R                  (       aB  U R                  " U4UUUUUUUU	SS.	UD6nUR                  nUR                  nUR                  nO;U R                  " U4UUUS.UD6nUR                  nUR                  nUR                  nU R                  U5      nUb  UR                  S   nOUR                  S   nU R                   R                  c  US	:w  a  [        S
5      eU R                   R                  c  SnGOUb  XR                   R                  :g  R!                  UR"                  [$        R&                  5      n[$        R(                  " UR                  S   UR"                  [$        R&                  S9nUU-  R+                  S5      nU R                   R                  (       a*  US	-  n[$        R,                  " UUR                  S   S	-
  S9nO.Sn[.        R1                  U R                  R                   S35        U[$        R(                  " UUR"                  S9U4   nSnU
b  U R3                  UU
UU R                   S9n[5        UUUUS9$ )  
decoder_position_ids (`torch.LongTensor` of shape `(batch_size, decoder_sequence_length)`, *optional*):
    Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the range `[0,
    config.decoder.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
N8Passing input embeddings is currently not supported for  in encoder-decoder mode.If no `decoder_input_ids` or `decoder_inputs_embeds` are passed, `input_ids` cannot be `None`. Please pass either `input_ids` or `decoder_input_ids` or `decoder_inputs_embeds`.F	r   r   r  r  r  r  r  r  r0  r   r   r  r   r'   z=Cannot handle batch sizes > 1 if no padding token is defined.r?   r   )maxz will not detect padding tokens in `inputs_embeds`. Results may be unexpected if using padding tokens in conjunction with `inputs_embeds.`r  )rG  r  pooled_logitsrb   r  rG  rq   rN  )rb   r  NotImplementedErrorr9   rT   r  rc  rK  r  r  r  rq   rN  r  rP   r]  r   r   r5   int32r   argmaxclamploggerwarning_oncer  r   )r8   r`  r   r   r  r  r  r  r  r  r  r   outputsr  rq   rN  rG  
batch_sizelast_non_pad_tokennon_pad_masktoken_indicesr
  r  s                          r:   rL   (T5GemmaForSequenceClassification.forward<  s   2 ;;))y/@]E^%J4>>KbKbJcc|} 
 ;;))/@/HMbMj  U 
 !% 1 1) <;;))*.**+-)"3'=%9 /+&;+ +G !( 9 9#99M 33J'+zz(-)+	(
 (G !( 9 9#11M ++J-. "+J&,,Q/J;;##+
a\]];;##+!#"%)A)AAEEfmmUZU`U`aL!LL)<V]]Z_ZeZefM"/,">!F!Fr!J{{--"a'"%*[[1CIZI`I`acIdghIh%i"!#>>**+ ,Z Z
 u||Jv}}MOaab%%VFR_hlhshs%tD' '!	
 	
r<   rK  r8  r  r0   
NNNNNNNNNN)rT   rU   rV   rW   r(   r5  r2   r  r  r!   r    r5   r'  r   r   r(  r   r   r   rL   rY   rZ   r[   s   @r:   r  r    sS   } $+  .1/  .2.204596:8<2626:>*.i
##d*i
 t+i
 &&-	i

 !++d2i
 !&t 3i
 $..5i
 )4/i
 ((4/i
  %0047i
   4'i
 +,i
 
"i
  i
r<   r  c                     ^  \ rS rSrSS\S\S-  4U 4S jjjrS rS r\	\
          SS\R                  S-  S	\R                  S-  S
\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\S-  S\R                  S-  S\R                  S-  S\R                  S-  S\\   S\4S jj5       5       rSrU =r$ )T5GemmaForTokenClassificationi  Nrb   r  c                   > Ub  X!l         [        TU ]	  U5        UR                  U l        UR                   (       a  [	        U5      U l        O[        U5      U l        UR                  R                  nUR                   (       a  UR                  R                  n[        USS5      n[        X0R                  U5      U l        U R                  5         g)z
is_encoder_decoder (`Optional`, *optional*):
    Whether use encoder_decoder for token classification. When set to False, only encoder is used.
Nr9  r  r  r  s        r:   r2   &T5GemmaForTokenClassification.__init__  s    
 )(:%  ++$$%f-DJ,V4DJnn00$$ ..44K$V-FL.{OOM_`
r<   c                 6    U R                   R                  5       $ r0   r  rQ   s    r:   r  2T5GemmaForTokenClassification.get_input_embeddings  r  r<   c                 :    U R                   R                  U5        g r0   r  r  s     r:   r  2T5GemmaForTokenClassification.set_input_embeddings  r  r<   r`  r   r   r  r  r  r  r  r  r  r   r   c                    U R                   R                  (       a)  Uc&  Ub#  [        SU R                  R                   S35      eU R                   R                  (       a%  Uc"  U	c  Uc  [        S5      eU R                  U5      nU R                   R                  (       aB  U R                  " U4UUUUUUUU	SS.	UD6nUR                  nUR                  nUR                  nO;U R                  " U4UUUS.UD6nUR                  nUR                  nUR                  nU R                  U5      nSnU
b  U R                  UXR                   5      n[        UUUUS9$ )	r  Nr  r  r  Fr  r  r  )rb   r  r  r9   rT   r  rc  rK  r  r  r  rq   rN  r  r  r   )r8   r`  r   r   r  r  r  r  r  r  r  r   r  r  rq   rN  rG  r  s                     r:   rL   %T5GemmaForTokenClassification.forward  s   4 ;;))y/@]E^%J4>>KbKbJcc|}  ;;))/@/HMbMj  U 
 !% 1 1) <;;))*.**+-)"3'=%9 /+&;+ +G !( 9 9#99M 33J'+zz(-)+	(
 (G !( 9 9#11M ++J-.%%ffkkBD$'!	
 	
r<   r  r0   r  )rT   rU   rV   rW   r(   r5  r2   r  r  r!   r    r5   r'  r   r   r(  r   r   r   rL   rY   rZ   r[   s   @r:   r  r    sS   } $+  01/  .2.204596:8<2626:>*.N
##d*N
 t+N
 &&-	N

 !++d2N
 !&t 3N
 $..5N
 )4/N
 ((4/N
  %0047N
   4'N
 +,N
 
N
  N
r<   r  )r  r  r  rJ  r  r  )r'   )r   NN)[collections.abcr   typingr   r5   torch.nnr3    r   rT  activationsr   cache_utilsr   r	   r
   
generationr   integrationsr   r   masking_utilsr   r   r   r   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   r   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r    r!   r"   utils.genericr#   r$   utils.output_capturingr%   r&   configuration_t5gemmar(   r)   
get_loggerrT   r  Moduler+   r]   rt   r   r   r   rX   r   rI   rO   r   r   r  r  r*  r7  rA  rJ  r'  rs  ru  r  r  r  r  r  r  __all__r#  r<   r:   <module>r:     s%  * %    & ! C C ) I  C 9  L F & R R G E E 
		H	%=RYY =( &><RYY ><B( *+ ,2	UU\\ 	U# 	U%,, 	U$   %II%<<% 
% <<	%
 LL4'% S[% T\% T\% 5<<%&%D )*F)299 F) +F)R )*R)BII R) +R)j14 1hF4 FR		 	BII 	 ?!_ ?! ?!D$&<< * \\	"P
+ P
fk
+ k
\ J
) J
 J
Z !0 ! !Hh)&<o h)V I
'= I
 I
X o
$: o
 o
dr<   