
    Z j                        S r SSKrSSKrSSKrSSKJr  SSKJrJrJr  SSK	J
r  SSKJr  SSKJrJrJr  SS	KJr  SS
KJrJr  SSKJr  SSKJrJrJrJrJrJrJ r   SSK!J"r"  SSK#J$r$J%r%J&r&J'r'J(r(  SSK)J*r*  \'RV                  " \,5      r- " S S\R\                  5      r/ " S S\R\                  5      r0 " S S\R\                  5      r1 " S S\R\                  5      r2 " S S\R\                  5      r3 " S S\R\                  5      r4 " S S\R\                  5      r5 " S S \5      r6 " S! S"\R\                  5      r7\& " S# S$\"5      5       r8 " S% S&\85      r9\& " S' S(\85      5       r:\&" S)S*9 " S+ S,\8\5      5       r;\& " S- S.\85      5       r<\&" S/S*9 " S0 S1\85      5       r=\& " S2 S3\85      5       r>\& " S4 S5\85      5       r?/ S6Qr@g)7zPyTorch T5 model.    N)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )initialization)ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)create_bidirectional_maskcreate_causal_mask)GradientCheckpointingLayer)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutput#Seq2SeqQuestionAnsweringModelOutputSeq2SeqSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel)DUMMY_INPUTS
DUMMY_MASKauto_docstringloggingtorch_compilable_check   )T5Configc                   2   ^  \ rS rSrSU 4S jjrS rSrU =r$ )T5LayerNorm.   c                    > [         TU ]  5         [        R                  " [        R
                  " U5      5      U l        X l        g)zS
Construct a layernorm module in the T5 style. No bias and no subtraction of mean.
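        Concretely this is RMSNorm (https://huggingface.co/papers/1910.07467): the input is scaled by
        1/sqrt(mean(x**2) + eps) and multiplied by a learned weight, with the variance accumulated in float32.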
N)super__init__r   	Parametertorchonesweightvariance_epsilon)selfhidden_sizeeps	__class__s      s/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/t5/modeling_t5.pyr%   T5LayerNorm.__init__/   s/     	ll5::k#:; #    c                    UR                  [        R                  5      R                  S5      R	                  SSS9nU[        R
                  " X R                  -   5      -  nU R                  R                  [        R                  [        R                  4;   a%  UR                  U R                  R                  5      nU R                  U-  $ )N   T)keepdim)tor'   float32powmeanrsqrtr*   r)   dtypefloat16bfloat16)r+   hidden_statesvariances      r/   forwardT5LayerNorm.forward7   s     !##EMM266q9>>r4>P%H?T?T4T(UU ;; ??),,T[[->->?M{{]**r1   )r*   r)   )gư>)__name__
__module____qualname____firstlineno__r%   r@   __static_attributes____classcell__r.   s   @r/   r!   r!   .   s    $+ +r1   r!   c                   6   ^  \ rS rSrS\4U 4S jjrS rSrU =r$ )T5DenseActDenseG   configc                 X  > [         TU ]  5         [        R                  " UR                  UR
                  SS9U l        [        R                  " UR
                  UR                  SS9U l        [        R                  " UR                  5      U l
        [        UR                     U l        g NFbias)r$   r%   r   Lineard_modeld_ffwiwoDropoutdropout_ratedropoutr	   dense_act_fnactr+   rL   r.   s     r/   r%   T5DenseActDense.__init__H   sn    ))FNNFKKeD))FKKeDzz&"5"56&--.r1   c                    U R                  U5      nU R                  U5      nU R                  U5      n[        U R                  R
                  [        R                  5      (       a  UR                  U R                  R
                  R                  :w  aa  U R                  R
                  R                  [        R                  :w  a/  UR                  U R                  R
                  R                  5      nU R	                  U5      nU$ N)rT   rZ   rX   
isinstancerU   r)   r'   Tensorr;   int8r6   r+   r>   s     r/   r@   T5DenseActDense.forwardO   s    ./]3tww~~u||44##tww~~';';;$$

2),,TWW^^-A-ABM.r1   )rZ   rX   rT   rU   	rB   rC   rD   rE   r   r%   r@   rF   rG   rH   s   @r/   rJ   rJ   G   s    /x / r1   rJ   c                   6   ^  \ rS rSrS\4U 4S jjrS rSrU =r$ )T5DenseGatedActDense]   rL   c                   > [         TU ]  5         [        R                  " UR                  UR
                  SS9U l        [        R                  " UR                  UR
                  SS9U l        [        R                  " UR
                  UR                  SS9U l        [        R                  " UR                  5      U l        [        UR                     U l        g rN   )r$   r%   r   rQ   rR   rS   wi_0wi_1rU   rV   rW   rX   r	   rY   rZ   r[   s     r/   r%   T5DenseGatedActDense.__init__^   s    IIfnnfkkF	IIfnnfkkF	))FKKeDzz&"5"56&--.r1   c                 8   U R                  U R                  U5      5      nU R                  U5      nX#-  nU R                  U5      n[	        U R
                  R                  [        R                  5      (       a  UR                  U R
                  R                  R                  :w  aa  U R
                  R                  R                  [        R                  :w  a/  UR                  U R
                  R                  R                  5      nU R                  U5      nU$ r^   )rZ   ri   rj   rX   r_   rU   r)   r'   r`   r;   ra   r6   )r+   r>   hidden_geluhidden_linears       r/   r@   T5DenseGatedActDense.forwardf   s    hhtyy78		-0#3]3 tww~~u||44##tww~~';';;$$

2),,TWW^^-A-ABM.r1   )rZ   rX   ri   rj   rU   rd   rH   s   @r/   rf   rf   ]   s    /x / r1   rf   c                   6   ^  \ rS rSrS\4U 4S jjrS rSrU =r$ )	T5LayerFFz   rL   c                   > [         TU ]  5         UR                  (       a  [        U5      U l        O[        U5      U l        [        UR                  UR                  S9U l	        [        R                  " UR                  5      U l        g )Nr-   )r$   r%   is_gated_actrf   DenseReluDenserJ   r!   rR   layer_norm_epsilon
layer_normr   rV   rW   rX   r[   s     r/   r%   T5LayerFF.__init__{   s_    "6v">D"1&"9D%fnn&:S:STzz&"5"56r1   c                 p    U R                  U5      nU R                  U5      nXR                  U5      -   nU$ r^   )rx   rv   rX   )r+   r>   forwarded_statess      r/   r@   T5LayerFF.forward   s;    ??=9../?@%5E(FFr1   )rv   rX   rx   rd   rH   s   @r/   rq   rq   z   s    7x 7 r1   rq   c                   t   ^  \ rS rSr  S
S\S\S-  4U 4S jjjr\SS j5       rSS jr	     SS jr
S	rU =r$ )T5Attention   NrL   	layer_idxc                   > [         TU ]  5         UR                  U l        X l        UR                  U l        UR
                  U l        UR                  U l        UR                  U l        UR                  U l
        UR                  U l        U R                  U R                  -  U l        X0l        Uc>  U R                  (       a-  [        R!                  SU R"                  R$                   S35        [&        R(                  " U R                  U R                  SS9U l        [&        R(                  " U R                  U R                  SS9U l        [&        R(                  " U R                  U R                  SS9U l        [&        R(                  " U R                  U R                  SS9U l        U R                  (       a0  [&        R2                  " U R                  U R                  5      U l        SU l        g )NzInstantiating a decoder z without passing `layer_idx` is not recommended and will to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` when creating this class.FrO   )r$   r%   
is_decoderhas_relative_attention_biasrelative_attention_num_bucketsrelative_attention_max_distancerR   d_kvkey_value_proj_dim	num_headsn_headsrW   rX   	inner_dimr   loggerwarning_oncer.   rB   r   rQ   qkvo	Embeddingrelative_attention_biasgradient_checkpointingr+   rL   r   r   r.   s       r/   r%   T5Attention.__init__   se    	 +++F(.4.S.S+/5/U/U,~~"(++''**(?(??"*4>>+B+B*C D, , 4<<eD4<<eD4<<eD4>>4<<eD+++-<<8[8[]a]i]i+jD(&+#r1   c                 b   SnU(       aC  US-  nX@S:  R                  [        R                  5      U-  -  n[        R                  " U 5      n O,[        R                  " U [        R
                  " U 5      5      * n US-  nX:  nU[        R                  " U R                  5       U-  5      [        R                  " X5-  5      -  X%-
  -  R                  [        R                  5      -   n[        R                  " U[        R                  " XrS-
  5      5      nU[        R                  " X`U5      -  nU$ )aR  
Adapted from Mesh Tensorflow:
https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593

Translate relative position to a bucket number for relative attention. The relative position is defined as
memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to
position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for
small absolute relative_position and larger buckets for larger absolute relative_positions. All relative
positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket.
This should allow for more graceful generalization to longer sequences than the model has been trained on

Args:
    relative_position: an int32 Tensor
    bidirectional: a boolean - whether the attention is bidirectional
    num_buckets: an integer
    max_distance: an integer

Returns:
    a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets)
r   r3   r   )r6   r'   longabsmin
zeros_likelogfloatmath	full_likewhere)relative_positionbidirectionalnum_bucketsmax_distancerelative_buckets	max_exactis_smallrelative_position_if_larges           r/   _relative_position_bucket%T5Attention._relative_position_bucket   s   , AKQ!6 : :5:: F TT %		*; <!&+<e>N>NO`>a!b b  1$	$0 &/II'--/);<hh|/01&( "UZZ.	&"
 &+YY&8RbcTc(d&
" 	EKKE_``r1   c                    Uc   U R                   R                  R                  n[        R                  " U[        R
                  US9SS2S4   U-   n[        R                  " U[        R
                  US9SSS24   nXe-
  nU R                  UU R                  (       + U R                  U R                  S9nU R                  U5      n	U	R                  / SQ5      R                  S5      n	U	$ )z%Compute binned relative position biasN)r;   device)r   r   r   )r3   r   r   r   )r   r)   r   r'   aranger   r   r   r   r   permute	unsqueeze)
r+   query_length
key_lengthr   past_seen_tokenscontext_positionmemory_positionr   relative_position_bucketvaluess
             r/   compute_biasT5Attention.compute_bias   s    >1188??F <<EJJvVWXZ^W^_brr,,zFSTXZ[T[\+>#'#A#A#.;;==	 $B $
  --.FG	*44Q7r1   c                 :   UR                   SS n/ UQSPU R                  P7n	Ub  UR                  U R                  5      OSn
[	        U
[
        R                  5      (       a  U
R                  5       OU
n
USLnU R                  U5      R                  U	5      R                  SS5      nSn[	        U[        5      (       aF  UR                  R                  U R                  5      nU(       a  UR                  nOUR                  nOUnU(       a  UOUnU(       aQ  UbN  U(       aG  UR                   U R                     R"                  nUR                   U R                     R$                  nO/ UR                   SS QSPU R                  P7nU R'                  U5      R                  U5      R                  SS5      nU R)                  U5      R                  U5      R                  SS5      nUbU  UR+                  UUU R                  5      u  nnU(       a.  [	        U[        5      (       a  SUR                  U R                  '   [
        R,                  " UUR                  SS5      5      nUc  UR                   S	   nU R.                  (       dh  [
        R0                  " SUR                   S   US   U4UR2                  UR4                  S
9nU R6                  (       a  U R8                  (       a  SUl        OU R=                  US   UUR2                  U
S9nUb#  USS2SS2SS2SUR                   S	   24   nUU-   nUnUU-  n[>        R@                  RC                  URE                  5       SS9RG                  U5      n[>        R@                  RI                  UU RH                  U R8                  S9n[
        R,                  " UU5      nUR                  SS5      RK                  5       nURL                  " / UQSP76 nU RO                  U5      nUU4nU(       a  UU4-   nU$ )zp
        Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states).
        """
        # Input is (batch_size, seq_length, dim)
        # Mask is (batch_size, 1, 1, key_length) or (batch_size, 1, seq_length, key_length) (causal decoder)
        input_shape = hidden_states.shape[:-1]
        hidden_shape = (*input_shape, -1, self.key_value_proj_dim)

        past_seen_tokens = past_key_values.get_seq_length(self.layer_idx) if past_key_values is not None else 0
        past_seen_tokens = past_seen_tokens.clone() if isinstance(past_seen_tokens, torch.Tensor) else past_seen_tokens

        is_cross_attention = key_value_states is not None

        query_states = self.q(hidden_states).view(hidden_shape).transpose(1, 2)

        is_updated = False
        if isinstance(past_key_values, EncoderDecoderCache):
            is_updated = past_key_values.is_updated.get(self.layer_idx)
            if is_cross_attention:
                curr_past_key_values = past_key_values.cross_attention_cache
            else:
                curr_past_key_values = past_key_values.self_attention_cache
        else:
            curr_past_key_values = past_key_values

        current_states = key_value_states if is_cross_attention else hidden_states
        if is_cross_attention and past_key_values is not None and is_updated:
            # reuse k, v from the cache: cross-attention keys/values are identical at every decoding step
            key_states = curr_past_key_values.layers[self.layer_idx].keys
            value_states = curr_past_key_values.layers[self.layer_idx].values
        else:
            kv_shape = (*current_states.shape[:-1], -1, self.key_value_proj_dim)
            key_states = self.k(current_states).view(kv_shape).transpose(1, 2)
            value_states = self.v(current_states).view(kv_shape).transpose(1, 2)

            if past_key_values is not None:
                # save all key/value states to the cache for fast auto-regressive generation
                key_states, value_states = curr_past_key_values.update(key_states, value_states, self.layer_idx)
                # set a flag so this layer's cross-attention cache is reused in subsequent calls
                if is_cross_attention and isinstance(past_key_values, EncoderDecoderCache):
                    past_key_values.is_updated[self.layer_idx] = True

        # equivalent of torch.einsum("bnqd,bnkd->bnqk", query_states, key_states)
        scores = torch.matmul(query_states, key_states.transpose(3, 2))

        if position_bias is None:
            key_length = key_states.shape[-2]
            if not self.has_relative_attention_bias:
                position_bias = torch.zeros(
                    (1, self.n_heads, input_shape[1], key_length), device=scores.device, dtype=scores.dtype
                )
                if self.gradient_checkpointing and self.training:
                    position_bias.requires_grad = True
            else:
                position_bias = self.compute_bias(
                    input_shape[1], key_length, device=scores.device, past_seen_tokens=past_seen_tokens
                )

            if mask is not None:
                causal_mask = mask[:, :, :, : key_states.shape[-2]]
                position_bias = position_bias + causal_mask

        position_bias_masked = position_bias
        scores += position_bias_masked

        attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as(scores)
        attn_weights = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)

        attn_output = torch.matmul(attn_weights, value_states)
        attn_output = attn_output.transpose(1, 2).contiguous()
        attn_output = attn_output.reshape(*input_shape, -1)
        attn_output = self.o(attn_output)

        outputs = (attn_output, position_bias)
        if output_attentions:
            outputs = outputs + (attn_weights,)
        return outputs


class T5LayerSelfAttention(nn.Module):
    def __init__(self, config, has_relative_attention_bias=False, layer_idx: int | None = None):
        super().__init__()
        self.SelfAttention = T5Attention(
            config, has_relative_attention_bias=has_relative_attention_bias, layer_idx=layer_idx
        )
        self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
        self.dropout = nn.Dropout(config.dropout_rate)

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        position_bias=None,
        past_key_values=None,
        use_cache=False,
        output_attentions=False,
        **kwargs,
    ):
        normed_hidden_states = self.layer_norm(hidden_states)
        attention_output = self.SelfAttention(
            normed_hidden_states,
            mask=attention_mask,
            position_bias=position_bias,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
        )
        hidden_states = hidden_states + self.dropout(attention_output[0])
        outputs = (hidden_states,) + attention_output[1:]
        return outputs


class T5LayerCrossAttention(nn.Module):
    def __init__(self, config, layer_idx: int | None = None):
        super().__init__()
        self.EncDecAttention = T5Attention(config, has_relative_attention_bias=False, layer_idx=layer_idx)
        self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
        self.dropout = nn.Dropout(config.dropout_rate)

    def forward(
        self,
        hidden_states,
        key_value_states,
        attention_mask=None,
        position_bias=None,
        past_key_values=None,
        use_cache=False,
        output_attentions=False,
        **kwargs,
    ):
        normed_hidden_states = self.layer_norm(hidden_states)
        attention_output = self.EncDecAttention(
            normed_hidden_states,
            mask=attention_mask,
            key_value_states=key_value_states,
            position_bias=position_bias,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
        )
        layer_output = hidden_states + self.dropout(attention_output[0])
        outputs = (layer_output,) + attention_output[1:]
        return outputs


class T5Block(GradientCheckpointingLayer):
    def __init__(self, config, has_relative_attention_bias=False, layer_idx: int | None = None):
        super().__init__()
        self.is_decoder = config.is_decoder
        self.layer = nn.ModuleList()
        self.layer.append(
            T5LayerSelfAttention(config, has_relative_attention_bias=has_relative_attention_bias, layer_idx=layer_idx)
        )
        if self.is_decoder:
            self.layer.append(T5LayerCrossAttention(config, layer_idx=layer_idx))
        self.layer.append(T5LayerFF(config))

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        position_bias=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        encoder_decoder_position_bias=None,
        past_key_values=None,
        use_cache=False,
        output_attentions=False,
        return_dict=True,
        **kwargs,
    ):
        self_attention_outputs = self.layer[0](
            hidden_states,
            attention_mask=attention_mask,
            position_bias=position_bias,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
        )
        hidden_states = self_attention_outputs[0]
        attention_outputs = self_attention_outputs[1:]  # Keep self-attention outputs and relative position weights

        # clamp inf values to enable fp16 training
        if hidden_states.dtype == torch.float16:
            clamp_value = torch.where(
                torch.isinf(hidden_states).any(),
                torch.finfo(hidden_states.dtype).max - 1000,
                torch.finfo(hidden_states.dtype).max,
            )
            hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)

        do_cross_attention = self.is_decoder and encoder_hidden_states is not None
        if do_cross_attention:
            cross_attention_outputs = self.layer[1](
                hidden_states,
                key_value_states=encoder_hidden_states,
                attention_mask=encoder_attention_mask,
                position_bias=encoder_decoder_position_bias,
                past_key_values=past_key_values,
                use_cache=use_cache,
                output_attentions=output_attentions,
            )
            hidden_states = cross_attention_outputs[0]

            # clamp inf values to enable fp16 training
            if hidden_states.dtype == torch.float16:
                clamp_value = torch.where(
                    torch.isinf(hidden_states).any(),
                    torch.finfo(hidden_states.dtype).max - 1000,
                    torch.finfo(hidden_states.dtype).max,
                )
                hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)

            # Keep cross-attention outputs and relative position weights
            attention_outputs = attention_outputs + cross_attention_outputs[1:]

        # Apply Feed Forward layer
        hidden_states = self.layer[-1](hidden_states)

        # clamp inf values to enable fp16 training
        if hidden_states.dtype == torch.float16:
            clamp_value = torch.where(
                torch.isinf(hidden_states).any(),
                torch.finfo(hidden_states.dtype).max - 1000,
                torch.finfo(hidden_states.dtype).max,
            )
            hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)

        outputs = (hidden_states,)
        return outputs + attention_outputs


class T5ClassificationHead(nn.Module):
    """Head for sentence-level classification tasks."""

    def __init__(self, config: T5Config):
        super().__init__()
        self.dense = nn.Linear(config.d_model, config.d_model)
        self.dropout = nn.Dropout(p=config.classifier_dropout)
        self.out_proj = nn.Linear(config.d_model, config.num_labels)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.dense(hidden_states)
        hidden_states = torch.tanh(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.out_proj(hidden_states)
        return hidden_states


@auto_docstring
class T5PreTrainedModel(PreTrainedModel):
    config: T5Config
    base_model_prefix = "transformer"
    supports_gradient_checkpointing = True
    _can_compile_fullgraph = True
    _no_split_modules = ["T5Block"]
    _keep_in_fp32_modules = ["wo"]

    @property
    def dummy_inputs(self):
        input_ids = torch.tensor(DUMMY_INPUTS)
        input_mask = torch.tensor(DUMMY_MASK)
        dummy_inputs = {
            "decoder_input_ids": input_ids,
            "input_ids": input_ids,
            "decoder_attention_mask": input_mask,
        }
        return dummy_inputs

    @torch.no_grad()
    def _init_weights(self, module):
        """Initialize the weights"""
        factor = self.config.initializer_factor  # Used for testing weights initialization
        if isinstance(module, T5LayerNorm):
            init.constant_(module.weight, factor * 1.0)
        elif isinstance(module, (T5Model, T5ForConditionalGeneration, T5EncoderModel, T5ForQuestionAnswering)):
            # Mesh TensorFlow embeddings initialization
            init.normal_(module.shared.weight, mean=0.0, std=factor * 1.0)
            if hasattr(module, "lm_head") and not self.config.tie_word_embeddings:
                init.normal_(module.lm_head.weight, mean=0.0, std=factor * 1.0)
            if hasattr(module, "qa_outputs"):
                init.normal_(module.qa_outputs.weight, mean=0.0, std=factor * ((self.config.d_model) ** -0.5))
                init.zeros_(module.qa_outputs.bias)
        elif isinstance(module, T5ForTokenClassification):
            if hasattr(module, "classifier"):
                init.normal_(module.classifier.weight, mean=0.0, std=factor * 1.0)
                init.zeros_(module.classifier.bias)
        elif isinstance(module, T5ClassificationHead):
            init.normal_(module.dense.weight, mean=0.0, std=factor * ((self.config.d_model) ** -0.5))
            if hasattr(module.dense, "bias") and module.dense.bias is not None:
                init.zeros_(module.dense.bias)
            init.normal_(module.out_proj.weight, mean=0.0, std=factor * ((self.config.d_model) ** -0.5))
            if hasattr(module.out_proj, "bias") and module.out_proj.bias is not None:
                init.zeros_(module.out_proj.bias)
        elif isinstance(module, T5DenseActDense):
            # Mesh TensorFlow FF initialization
            init.normal_(module.wi.weight, mean=0.0, std=factor * ((self.config.d_model) ** -0.5))
            if hasattr(module.wi, "bias") and module.wi.bias is not None:
                init.zeros_(module.wi.bias)
            init.normal_(module.wo.weight, mean=0.0, std=factor * ((self.config.d_ff) ** -0.5))
            if hasattr(module.wo, "bias") and module.wo.bias is not None:
                init.zeros_(module.wo.bias)
        elif isinstance(module, T5DenseGatedActDense):
            init.normal_(module.wi_0.weight, mean=0.0, std=factor * ((self.config.d_model) ** -0.5))
            if hasattr(module.wi_0, "bias") and module.wi_0.bias is not None:
                init.zeros_(module.wi_0.bias)
            init.normal_(module.wi_1.weight, mean=0.0, std=factor * ((self.config.d_model) ** -0.5))
            if hasattr(module.wi_1, "bias") and module.wi_1.bias is not None:
                init.zeros_(module.wi_1.bias)
            init.normal_(module.wo.weight, mean=0.0, std=factor * ((self.config.d_ff) ** -0.5))
            if hasattr(module.wo, "bias") and module.wo.bias is not None:
                init.zeros_(module.wo.bias)
        elif isinstance(module, T5Attention):
            # Mesh TensorFlow attention initialization to avoid scaling before softmax
            d_model = self.config.d_model
            key_value_proj_dim = self.config.d_kv
            n_heads = self.config.num_heads
            init.normal_(module.q.weight, mean=0.0, std=factor * ((d_model * key_value_proj_dim) ** -0.5))
            init.normal_(module.k.weight, mean=0.0, std=factor * (d_model**-0.5))
            init.normal_(module.v.weight, mean=0.0, std=factor * (d_model**-0.5))
            init.normal_(module.o.weight, mean=0.0, std=factor * ((n_heads * key_value_proj_dim) ** -0.5))
            if module.has_relative_attention_bias:
                init.normal_(module.relative_attention_bias.weight, mean=0.0, std=factor * (d_model**-0.5))

    def _shift_right(self, input_ids):
        decoder_start_token_id = self.config.decoder_start_token_id
        pad_token_id = self.config.pad_token_id

        if decoder_start_token_id is None:
            raise ValueError(
                "self.model.config.decoder_start_token_id has to be defined. In T5 it is usually set to the "
                "pad_token_id. See T5 docs for more information."
            )

        # shift inputs to the right
        shifted_input_ids = input_ids.new_zeros(input_ids.shape)
        shifted_input_ids[..., 1:] = input_ids[..., :-1].clone()
        shifted_input_ids[..., 0] = decoder_start_token_id

        if pad_token_id is None:
            raise ValueError("self.model.config.pad_token_id has to be defined.")
        # replace possible -100 values in labels by `pad_token_id`
        shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)

        return shifted_input_ids


class T5Stack(T5PreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.embed_tokens = nn.Embedding(config.vocab_size, config.d_model)
        self.is_decoder = config.is_decoder

        self.block = nn.ModuleList(
            [T5Block(config, has_relative_attention_bias=bool(i == 0), layer_idx=i) for i in range(config.num_layers)]
        )
        self.final_layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
        self.dropout = nn.Dropout(config.dropout_rate)

        # Initialize weights and apply final processing
        self.post_init()
        self.gradient_checkpointing = False

    def set_input_embeddings(self, new_embeddings):
        self.embed_tokens = new_embeddings

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        inputs_embeds=None,
        past_key_values=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        **kwargs,
    ):
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is not None and inputs_embeds is not None:
            err_msg_prefix = "decoder_" if self.is_decoder else ""
            raise ValueError(
                f"You cannot specify both {err_msg_prefix}input_ids and {err_msg_prefix}inputs_embeds at the same time"
            )
        elif input_ids is not None:
            input_shape = input_ids.size()
            input_ids = input_ids.view(-1, input_shape[-1])
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
        else:
            err_msg_prefix = "decoder_" if self.is_decoder else ""
            raise ValueError(f"You have to specify either {err_msg_prefix}input_ids or {err_msg_prefix}inputs_embeds")

        if self.gradient_checkpointing and self.training and use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
            )
            use_cache = False

        if inputs_embeds is None:
            if self.embed_tokens is None:
                raise ValueError("You have to initialize the model with valid token embeddings")
            inputs_embeds = self.embed_tokens(input_ids)

        batch_size, seq_length = input_shape

        if use_cache is True and not self.is_decoder:
            raise ValueError(f"`use_cache` can only be set to `True` if {self} is used as a decoder")

        if self.is_decoder and use_cache and past_key_values is None:
            if self.config.is_encoder_decoder:
                past_key_values = EncoderDecoderCache(
                    DynamicCache(config=self.config), DynamicCache(config=self.config)
                )
            else:
                past_key_values = DynamicCache(config=self.config)
        elif not self.is_decoder:
            past_key_values = None

        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
        cache_position = torch.arange(past_seen_tokens, past_seen_tokens + seq_length, device=inputs_embeds.device)

        if self.config.is_decoder:
            causal_mask = create_causal_mask(
                config=self.config,
                input_embeds=inputs_embeds,
                attention_mask=attention_mask,
                cache_position=cache_position,
                past_key_values=past_key_values.self_attention_cache
                if isinstance(past_key_values, EncoderDecoderCache)
                else past_key_values,
            )
        else:
            causal_mask = create_bidirectional_mask(
                config=self.config,
                input_embeds=inputs_embeds,
                attention_mask=attention_mask,
            )

        encoder_extended_attention_mask = None
        if self.is_decoder and encoder_hidden_states is not None:
            encoder_extended_attention_mask = create_bidirectional_mask(
                config=self.config,
                input_embeds=encoder_hidden_states,
                attention_mask=encoder_attention_mask,
            )

        all_hidden_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None
        all_cross_attentions = () if (output_attentions and self.is_decoder) else None
        position_bias = None
        encoder_decoder_position_bias = None

        hidden_states = self.dropout(inputs_embeds)

        for layer_module in self.block:
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_outputs = layer_module(
                hidden_states,
                attention_mask=causal_mask,
                position_bias=position_bias,
                encoder_hidden_states=encoder_hidden_states,
                encoder_attention_mask=encoder_extended_attention_mask,
                encoder_decoder_position_bias=encoder_decoder_position_bias,
                past_key_values=past_key_values,
                use_cache=use_cache,
                output_attentions=output_attentions,
                return_dict=return_dict,
            )

            hidden_states = layer_outputs[0]
            # We share the position biases between the layers - the first layer stores them
            position_bias = layer_outputs[1]
            if self.is_decoder and encoder_hidden_states is not None:
                encoder_decoder_position_bias = layer_outputs[3 if output_attentions else 2]

            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[2],)
                if self.is_decoder:
                    all_cross_attentions = all_cross_attentions + (layer_outputs[4],)

        hidden_states = self.final_layer_norm(hidden_states)
        hidden_states = self.dropout(hidden_states)

        # Add last layer
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(
                v
                for v in [
                    hidden_states,
                    past_key_values,
                    all_hidden_states,
                    all_attentions,
                    all_cross_attentions,
                ]
                if v is not None
            )
        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values,
            hidden_states=all_hidden_states,
            attentions=all_attentions,
            cross_attentions=all_cross_attentions,
        )


@auto_docstring
class T5Model(T5PreTrainedModel):
    _keys_to_ignore_on_load_unexpected = [
        "decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight",
    ]
    _tied_weights_keys = {
        "encoder.embed_tokens.weight": "shared.weight",
        "decoder.embed_tokens.weight": "shared.weight",
    }

    def __init__(self, config: T5Config):
        super().__init__(config)
        self.shared = nn.Embedding(config.vocab_size, config.d_model)

        encoder_config = copy.deepcopy(config)
        encoder_config.is_decoder = False
        encoder_config.use_cache = False
        encoder_config.is_encoder_decoder = False
        self.encoder = T5Stack(encoder_config)

        decoder_config = copy.deepcopy(config)
        decoder_config.is_decoder = True
        decoder_config.is_encoder_decoder = False
        decoder_config.num_layers = config.num_decoder_layers
        self.decoder = T5Stack(decoder_config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.shared

    def set_input_embeddings(self, new_embeddings):
        self.shared = new_embeddings
        self.encoder.set_input_embeddings(new_embeddings)
        self.decoder.set_input_embeddings(new_embeddings)

    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        attention_mask: torch.FloatTensor | None = None,
        decoder_input_ids: torch.LongTensor | None = None,
        decoder_attention_mask: torch.BoolTensor | None = None,
        encoder_outputs: tuple[tuple[torch.FloatTensor]] | None = None,
        past_key_values: Cache | None = None,
        inputs_embeds: torch.Tensor | None = None,
        decoder_inputs_embeds: torch.Tensor | None = None,
        use_cache: bool | None = None,
        output_attentions: bool | None = None,
        output_hidden_states: bool | None = None,
        return_dict: bool | None = None,
    ) -> tuple[torch.FloatTensor] | Seq2SeqModelOutput:
        r"""
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
    Indices of input sequence tokens in the vocabulary. T5 is a model with relative position embeddings so you
    should be able to pad the inputs on both the right and the left.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for detail.

    [What are input IDs?](../glossary#input-ids)

    To know more on how to prepare `input_ids` for pretraining take a look a [T5 Training](./t5#training).
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Indices of decoder input sequence tokens in the vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are decoder input IDs?](../glossary#decoder-input-ids)

    T5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
    is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

    To know more on how to prepare `decoder_input_ids` for pretraining take a look at [T5
    Training](./t5#training).
decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
    be used by default.

Example:

```python
>>> from transformers import AutoTokenizer, T5Model

>>> tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-small")
>>> model = T5Model.from_pretrained("google-t5/t5-small")
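>>> # t5-small pairs a 6-block encoder with a 6-block decoder (d_model=512, 8 attention heads)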

>>> input_ids = tokenizer(
...     "Studies have been shown that owning a dog is good for you", return_tensors="pt"
... ).input_ids  # Batch size 1
>>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids  # Batch size 1

>>> # preprocess: Prepend decoder_input_ids with start token which is pad token for T5Model.
>>> # This is not needed for torch's T5ForConditionalGeneration as it does this internally using labels arg.
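>>> # (in the released T5 checkpoints decoder_start_token_id == pad_token_id == 0)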
>>> decoder_input_ids = model._shift_right(decoder_input_ids)

>>> # forward pass
>>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
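>>> # the encoder output is computed once; the decoder attends to it via cross-attention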
>>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
>>> last_hidden_states = outputs.last_hidden_state
```"""
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Encode if needed (training, first prediction pass)
        if encoder_outputs is None:
            encoder_outputs = self.encoder(
                input_ids=input_ids,
                attention_mask=attention_mask,
                inputs_embeds=inputs_embeds,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
            encoder_outputs = BaseModelOutput(
                last_hidden_state=encoder_outputs[0],
                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
            )

        hidden_states = encoder_outputs[0]

        # Decode
        decoder_outputs = self.decoder(
            input_ids=decoder_input_ids,
            attention_mask=decoder_attention_mask,
            inputs_embeds=decoder_inputs_embeds,
            past_key_values=past_key_values,
            encoder_hidden_states=hidden_states,
            encoder_attention_mask=attention_mask,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        if not return_dict:
            return decoder_outputs + encoder_outputs

        return Seq2SeqModelOutput(
            last_hidden_state=decoder_outputs.last_hidden_state,
            past_key_values=decoder_outputs.past_key_values,
            decoder_hidden_states=decoder_outputs.hidden_states,
            decoder_attentions=decoder_outputs.attentions,
            cross_attentions=decoder_outputs.cross_attentions,
            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
            encoder_hidden_states=encoder_outputs.hidden_states,
            encoder_attentions=encoder_outputs.attentions,
        )


@auto_docstring(
    custom_intro="""
    T5 Model with a `language modeling` head on top.
    """
)
class T5ForConditionalGeneration(T5PreTrainedModel, GenerationMixin):
    _keys_to_ignore_on_load_unexpected = [
        "decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight",
    ]
    _tied_weights_keys = {
        "lm_head.weight": "shared.weight",
        "encoder.embed_tokens.weight": "shared.weight",
        "decoder.embed_tokens.weight": "shared.weight",
    }

    def __init__(self, config: T5Config):
        super().__init__(config)
        self.model_dim = config.d_model

        self.shared = nn.Embedding(config.vocab_size, config.d_model)

        encoder_config = copy.deepcopy(config)
        encoder_config.is_decoder = False
        encoder_config.use_cache = False
        encoder_config.is_encoder_decoder = False
        self.encoder = T5Stack(encoder_config)

        decoder_config = copy.deepcopy(config)
        decoder_config.is_decoder = True
        decoder_config.is_encoder_decoder = False
        decoder_config.num_layers = config.num_decoder_layers
        self.decoder = T5Stack(decoder_config)

        self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.shared

    def set_input_embeddings(self, new_embeddings):
        self.shared = new_embeddings
        self.encoder.set_input_embeddings(new_embeddings)
        self.decoder.set_input_embeddings(new_embeddings)

    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        attention_mask: torch.FloatTensor | None = None,
        decoder_input_ids: torch.LongTensor | None = None,
        decoder_attention_mask: torch.BoolTensor | None = None,
        encoder_outputs: tuple[tuple[torch.Tensor]] | None = None,
        past_key_values: Cache | None = None,
        inputs_embeds: torch.FloatTensor | None = None,
        decoder_inputs_embeds: torch.FloatTensor | None = None,
        labels: torch.LongTensor | None = None,
        use_cache: bool | None = None,
        output_attentions: bool | None = None,
        output_hidden_states: bool | None = None,
        return_dict: bool | None = None,
    ) -> tuple[torch.FloatTensor] | Seq2SeqLMOutput:
        r"""
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
    Indices of input sequence tokens in the vocabulary. T5 is a model with relative position embeddings so you
    should be able to pad the inputs on both the right and the left.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for detail.

    [What are input IDs?](../glossary#input-ids)

    To know more on how to prepare `input_ids` for pretraining take a look a [T5 Training](./t5#training).
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Indices of decoder input sequence tokens in the vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are decoder input IDs?](../glossary#decoder-input-ids)

    T5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
    is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

    To know more on how to prepare `decoder_input_ids` for pretraining take a look at [T5
    Training](./t5#training).
decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
    be used by default.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[-100, 0, ...,
    config.vocab_size - 1]`. All labels set to `-100` are ignored (masked), the loss is only computed for
    labels in `[0, ..., config.vocab_size]`

Examples:

```python
>>> from transformers import AutoTokenizer, T5ForConditionalGeneration

>>> tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-small")
>>> model = T5ForConditionalGeneration.from_pretrained("google-t5/t5-small")

>>> # training
>>> input_ids = tokenizer("The <extra_id_0> walks in <extra_id_1> park", return_tensors="pt").input_ids
>>> labels = tokenizer("<extra_id_0> cute dog <extra_id_1> the <extra_id_2>", return_tensors="pt").input_ids
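>>> # <extra_id_N> sentinels mark the corrupted spans; the labels spell out what each sentinel hides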
>>> outputs = model(input_ids=input_ids, labels=labels)
>>> loss = outputs.loss
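>>> # label positions set to -100 are ignored by the cross-entropy loss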
>>> logits = outputs.logits

>>> # inference
>>> input_ids = tokenizer(
...     "summarize: studies have shown that owning a dog is good for you", return_tensors="pt"
... ).input_ids  # Batch size 1
>>> outputs = model.generate(input_ids)
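>>> # generate() starts the decoder from decoder_start_token_id and stops at the </s> token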
>>> print(tokenizer.decode(outputs[0], skip_special_tokens=True))
>>> # studies have shown that owning a dog is good for you.
```"""
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Encode if needed (training, first prediction pass)
        if encoder_outputs is None:
            encoder_outputs = self.encoder(
                input_ids=input_ids,
                attention_mask=attention_mask,
                inputs_embeds=inputs_embeds,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
            encoder_outputs = BaseModelOutput(
                last_hidden_state=encoder_outputs[0],
                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
            )

        hidden_states = encoder_outputs[0]

        if labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None:
            # get decoder inputs from shifting lm labels to the right
            decoder_input_ids = self._shift_right(labels)

        # Decode
        decoder_outputs = self.decoder(
            input_ids=decoder_input_ids,
            attention_mask=decoder_attention_mask,
            inputs_embeds=decoder_inputs_embeds,
            past_key_values=past_key_values,
            encoder_hidden_states=hidden_states,
            encoder_attention_mask=attention_mask,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = decoder_outputs[0]

        if self.config.tie_word_embeddings:
            # Rescale output before projecting on vocab
            # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586
            sequence_output = sequence_output * (self.model_dim**-0.5)

        lm_logits = self.lm_head(sequence_output)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss(ignore_index=-100)
            # move labels to the correct device to enable model parallelism
            labels = labels.to(lm_logits.device)
            loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)), labels.view(-1))

        if not return_dict:
            output = (lm_logits,) + decoder_outputs[1:] + encoder_outputs
            return ((loss,) + output) if loss is not None else output

        return Seq2SeqLMOutput(
            loss=loss,
            logits=lm_logits,
            past_key_values=decoder_outputs.past_key_values,
            decoder_hidden_states=decoder_outputs.hidden_states,
            decoder_attentions=decoder_outputs.attentions,
            cross_attentions=decoder_outputs.cross_attentions,
            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
            encoder_hidden_states=encoder_outputs.hidden_states,
            encoder_attentions=encoder_outputs.attentions,
        )

    def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
        return self._shift_right(labels)


@auto_docstring
class T5EncoderModel(T5PreTrainedModel):
    _tied_weights_keys = {"encoder.embed_tokens.weight": "shared.weight"}
    _keys_to_ignore_on_load_unexpected = ["decoder"]

    def __init__(self, config: T5Config):
        super().__init__(config)
        self.shared = nn.Embedding(config.vocab_size, config.d_model)

        encoder_config = config
        encoder_config.use_cache = False
        encoder_config.is_encoder_decoder = False
        self.encoder = T5Stack(encoder_config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.shared

    def set_input_embeddings(self, new_embeddings):
        self.shared = new_embeddings
        self.encoder.set_input_embeddings(new_embeddings)

    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        attention_mask: torch.FloatTensor | None = None,
        inputs_embeds: torch.FloatTensor | None = None,
        output_attentions: bool | None = None,
        output_hidden_states: bool | None = None,
        return_dict: bool | None = None,
    ) -> tuple[torch.FloatTensor] | BaseModelOutput:
        r"""
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
    Indices of input sequence tokens in the vocabulary. T5 is a model with relative position embeddings so you
    should be able to pad the inputs on both the right and the left.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for detail.

    To know more on how to prepare `input_ids` for pretraining take a look a [T5 Training](./t5#training).

Example:

```python
>>> from transformers import AutoTokenizer, T5EncoderModel

>>> tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-small")
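>>> # the encoder alone maps token ids to d_model-dimensional states: (batch, seq_len, 512) for t5-small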
>>> model = T5EncoderModel.from_pretrained("google-t5/t5-small")
>>> input_ids = tokenizer(
...     "Studies have been shown that owning a dog is good for you", return_tensors="pt"
... ).input_ids  # Batch size 1
>>> outputs = model(input_ids=input_ids)
>>> last_hidden_states = outputs.last_hidden_state
```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        encoder_outputs = self.encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        return encoder_outputs


@auto_docstring(
    custom_intro="""
    T5 model with a sequence classification head on top (a linear layer on top of the pooled output) e.g. for GLUE
    tasks.
    """
)
class T5ForSequenceClassification(T5PreTrainedModel):
    _keys_to_ignore_on_load_unexpected = [
        "decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight",
    ]

    def __init__(self, config: T5Config):
        super().__init__(config)
        self.transformer = T5Model(config)
        self.classification_head = T5ClassificationHead(config)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        attention_mask: torch.Tensor | None = None,
        decoder_input_ids: torch.LongTensor | None = None,
        decoder_attention_mask: torch.LongTensor | None = None,
        encoder_outputs: list[torch.FloatTensor] | None = None,
        inputs_embeds: torch.FloatTensor | None = None,
        decoder_inputs_embeds: torch.FloatTensor | None = None,
        labels: torch.LongTensor | None = None,
        use_cache: bool | None = None,
        output_attentions: bool | None = None,
        output_hidden_states: bool | None = None,
        return_dict: bool | None = None,
    ) -> tuple | Seq2SeqSequenceClassifierOutput:
        r"""
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
    Indices of input sequence tokens in the vocabulary. T5 is a model with relative position embeddings so you
    should be able to pad the inputs on both the right and the left.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for detail.

    [What are input IDs?](../glossary#input-ids)

    To know more on how to prepare `input_ids` for pretraining take a look a [T5 Training](./t5#training).
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Indices of decoder input sequence tokens in the vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are decoder input IDs?](../glossary#decoder-input-ids)

    T5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
    is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

    To know more on how to prepare `decoder_input_ids` for pretraining take a look at [T5
    Training](./t5#training).
decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
    be used by default.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
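
    The classification head pools the decoder hidden state at the final `eos_token_id` position, so every
    sequence in the batch must contain the same number of <eos> tokens.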
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        if labels is not None:
            use_cache = False

        if input_ids is None and inputs_embeds is not None:
            raise NotImplementedError(
                f"Passing input embeddings is currently not supported for {self.__class__.__name__}"
            )

        # Unlike most encoder-decoder models, T5 creates `decoder_input_ids` from `input_ids`
        # when no decoder inputs are provided.
        if decoder_input_ids is None and decoder_inputs_embeds is None:
            if input_ids is None:
                raise ValueError(
                    "If no `decoder_input_ids` or `decoder_inputs_embeds` are passed, `input_ids` cannot be `None`. "
                    "Please pass either `input_ids` or `decoder_input_ids` or `decoder_inputs_embeds`."
                )
            decoder_input_ids = self._shift_right(input_ids)

        outputs = self.transformer(
            input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            encoder_outputs=encoder_outputs,
            inputs_embeds=inputs_embeds,
            decoder_inputs_embeds=decoder_inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = outputs[0]

        # Pool the decoder states at the <eos> positions; every example must contain the
        # same number of <eos> tokens for the gather below to be well defined.
        eos_mask = input_ids.eq(self.config.eos_token_id).to(sequence_output.device)

        if len(torch.unique_consecutive(eos_mask.sum(1))) > 1:
            raise ValueError("All examples must have the same number of <eos> tokens.")
        batch_size, _, hidden_size = sequence_output.shape
        sentence_representation = sequence_output[eos_mask, :].view(batch_size, -1, hidden_size)[:, -1, :]
        logits = self.classification_head(sentence_representation)

        loss = None
        if labels is not None:
            labels = labels.to(logits.device)
            if self.config.problem_type is None:
                if self.config.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.config.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.config.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            output = (logits,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return Seq2SeqSequenceClassifierOutput(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            decoder_hidden_states=outputs.decoder_hidden_states,
            decoder_attentions=outputs.decoder_attentions,
            cross_attentions=outputs.cross_attentions,
            encoder_last_hidden_state=outputs.encoder_last_hidden_state,
            encoder_hidden_states=outputs.encoder_hidden_states,
            encoder_attentions=outputs.encoder_attentions,
        )
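# --- Usage sketch (editorial addition, not part of the upstream module) ------------
# A minimal example of sequence classification with the class above. The helper name
# `_demo_sequence_classification` and the "t5-small" checkpoint are assumptions for
# illustration; with a base checkpoint the classification head is freshly initialized,
# so meaningful predictions require fine-tuning first.
def _demo_sequence_classification():
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("t5-small")
    model = T5ForSequenceClassification.from_pretrained("t5-small", num_labels=2)
    # The T5 tokenizer appends a single </s> (eos) token, which is where the model
    # pools the sentence representation.
    inputs = tokenizer("This film was surprisingly good.", return_tensors="pt")
    # `decoder_input_ids` are derived internally by shifting `input_ids` right,
    # so a plain encoder-style call is enough for classification.
    outputs = model(**inputs)
    return outputs.logits.argmax(dim=-1).item()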
@auto_docstring
class T5ForTokenClassification(T5PreTrainedModel):
    def __init__(self, config: T5Config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.transformer = T5EncoderModel(config)
        self.dropout = nn.Dropout(config.classifier_dropout)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        attention_mask: torch.Tensor | None = None,
        inputs_embeds: torch.Tensor | None = None,
        labels: torch.Tensor | None = None,
        output_attentions: bool | None = None,
        output_hidden_states: bool | None = None,
        return_dict: bool | None = None,
    ) -> tuple[torch.Tensor] | TokenClassifierOutput:
        r"""
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
    Indices of input sequence tokens in the vocabulary. T5 is a model with relative position embeddings so you
    should be able to pad the inputs on both the right and the left.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are input IDs?](../glossary#input-ids)

    To know more on how to prepare `input_ids` for pretraining take a look at [T5 Training](./t5#training).
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.transformer(
            input_ids,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        hidden_states = outputs[0]
        hidden_states = self.dropout(hidden_states)
        logits = self.classifier(hidden_states)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits, outputs[2:-1])
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
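# --- Usage sketch (editorial addition, not part of the upstream module) ------------
# Token-level tagging with the encoder-only head above. The helper name
# `_demo_token_classification` and the "t5-small" checkpoint are assumptions for
# illustration; the classifier weights are freshly initialized, so real use requires
# fine-tuning first.
def _demo_token_classification():
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("t5-small")
    model = T5ForTokenClassification.from_pretrained("t5-small", num_labels=9)
    inputs = tokenizer("HuggingFace is based in New York City", return_tensors="pt")
    outputs = model(**inputs)
    # Logits have shape (batch_size, sequence_length, num_labels): one label per token.
    return outputs.logits.argmax(dim=-1)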
@auto_docstring
class T5ForQuestionAnswering(T5PreTrainedModel):
    _keys_to_ignore_on_load_unexpected = ["decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight"]
    _tied_weights_keys = {
        "encoder.embed_tokens.weight": "shared.weight",
        "decoder.embed_tokens.weight": "shared.weight",
    }

    def __init__(self, config: T5Config):
        super().__init__(config)
        self.model_dim = config.d_model

        self.shared = nn.Embedding(config.vocab_size, config.d_model)

        encoder_config = copy.deepcopy(config)
        encoder_config.is_decoder = False
        encoder_config.use_cache = False
        encoder_config.is_encoder_decoder = False
        self.encoder = T5Stack(encoder_config)

        decoder_config = copy.deepcopy(config)
        decoder_config.is_decoder = True
        decoder_config.is_encoder_decoder = False
        decoder_config.num_layers = config.num_decoder_layers
        self.decoder = T5Stack(decoder_config)

        self.num_labels = config.num_labels
        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.shared

    def set_input_embeddings(self, new_embeddings):
        self.shared = new_embeddings
        self.encoder.set_input_embeddings(new_embeddings)
        self.decoder.set_input_embeddings(new_embeddings)

    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        attention_mask: torch.FloatTensor | None = None,
        decoder_input_ids: torch.LongTensor | None = None,
        decoder_attention_mask: torch.BoolTensor | None = None,
        start_positions: torch.LongTensor | None = None,
        end_positions: torch.LongTensor | None = None,
        encoder_outputs: tuple[tuple[torch.Tensor]] | None = None,
        inputs_embeds: torch.FloatTensor | None = None,
        decoder_inputs_embeds: torch.FloatTensor | None = None,
        use_cache: bool | None = None,
        output_attentions: bool | None = None,
        output_hidden_states: bool | None = None,
        return_dict: bool | None = None,
    ) -> tuple[torch.FloatTensor] | Seq2SeqQuestionAnsweringModelOutput:
        r"""
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
    Indices of input sequence tokens in the vocabulary. T5 is a model with relative position embeddings so you
    should be able to pad the inputs on both the right and the left.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are input IDs?](../glossary#input-ids)

    To know more on how to prepare `input_ids` for pretraining take a look at [T5 Training](./t5#training).
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Indices of decoder input sequence tokens in the vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are decoder input IDs?](../glossary#decoder-input-ids)

    T5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
    is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

    To know more on how to prepare `decoder_input_ids` for pretraining take a look at [T5
    Training](./t5#training).
decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
    be used by default.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        if start_positions is not None and end_positions is not None:
            use_cache = False

        # As above, T5 creates `decoder_input_ids` from `input_ids` when no decoder
        # inputs are provided.
        if decoder_input_ids is None and decoder_inputs_embeds is None:
            if input_ids is None:
                raise ValueError(
                    "If no `decoder_input_ids` or `decoder_inputs_embeds` are passed, `input_ids` cannot be `None`. "
                    "Please pass either `input_ids` or `decoder_input_ids` or `decoder_inputs_embeds`."
                )
            decoder_input_ids = self._shift_right(input_ids)

        # Encode if needed (training, first prediction pass).
        if encoder_outputs is None:
            encoder_outputs = self.encoder(
                input_ids=input_ids,
                attention_mask=attention_mask,
                inputs_embeds=inputs_embeds,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
            encoder_outputs = BaseModelOutput(
                last_hidden_state=encoder_outputs[0],
                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
            )

        hidden_states = encoder_outputs[0]

        # Decode.
        decoder_outputs = self.decoder(
            input_ids=decoder_input_ids,
            attention_mask=decoder_attention_mask,
            encoder_hidden_states=hidden_states,
            encoder_attention_mask=attention_mask,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = decoder_outputs[0]

        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1).contiguous()
        end_logits = end_logits.squeeze(-1).contiguous()

        total_loss = None
        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, split adds a dimension.
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1).to(start_logits.device)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1).to(end_logits.device)
            # Sometimes the start/end positions lie outside the model inputs; clamp them
            # to `ignored_index` and have the loss ignore those terms.
            ignored_index = start_logits.size(1)
            start_positions = start_positions.clamp(0, ignored_index)
            end_positions = end_positions.clamp(0, ignored_index)

            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2

        if not return_dict:
            output = (start_logits, end_logits) + decoder_outputs[1:] + encoder_outputs
            return ((total_loss,) + output) if total_loss is not None else output

        return Seq2SeqQuestionAnsweringModelOutput(
            loss=total_loss,
            start_logits=start_logits,
            end_logits=end_logits,
            past_key_values=decoder_outputs.past_key_values,
            decoder_hidden_states=decoder_outputs.hidden_states,
            decoder_attentions=decoder_outputs.attentions,
            cross_attentions=decoder_outputs.cross_attentions,
            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
            encoder_hidden_states=encoder_outputs.hidden_states,
            encoder_attentions=encoder_outputs.attentions,
        )


__all__ = [
    "T5EncoderModel",
    "T5ForConditionalGeneration",
    "T5Model",
    "T5PreTrainedModel",
    "T5ForQuestionAnswering",
    "T5ForSequenceClassification",
    "T5ForTokenClassification",
]