
    Z j~!                       S r SSKrSSKrSSKrSSKJr  SSKJrJrJr  SSK	J
r  SSKJr  SSKJrJrJr  SS	KJr  SS
KJr  SSKJr  SSKJrJrJrJrJrJrJr  SSK J!r!  SSK"J#r#J$r$J%r%J&r&J'r'J(r(  SSK)J*r*  \'RV                  " \,5      r- " S S\R\                  5      r/ " S S\R\                  5      r0 " S S\R\                  5      r1 " S S\R\                  5      r2 " S S\R\                  5      r3 " S S\R\                  5      r4 " S S\R\                  5      r5 " S S \5      r6 " S! S"\R\                  5      r7\% " S# S$\!5      5       r8 " S% S&\85      r9\% " S' S(\85      5       r:\%" S)S*9 " S+ S,\8\5      5       r;\% " S- S.\85      5       r<\%" S/S*9 " S0 S1\85      5       r=\% " S2 S3\85      5       r>\% " S4 S5\85      5       r?/ S6Qr@g)7zPyTorch UMT5 model.    N)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )initialization)ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)create_causal_mask)GradientCheckpointingLayer)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutput#Seq2SeqQuestionAnsweringModelOutputSeq2SeqSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel)DUMMY_INPUTS
DUMMY_MASKauto_docstringis_torchdynamo_compilingloggingtorch_compilable_check   )
UMT5Configc                   2   ^  \ rS rSrSU 4S jjrS rSrU =r$ )UMT5LayerNorm6   c                    > [         TU ]  5         [        R                  " [        R
                  " U5      5      U l        X l        g)zU
Construct a layernorm module in the UMT5 style. No bias and no subtraction of mean.
N)super__init__r   	Parametertorchonesweightvariance_epsilon)selfhidden_sizeeps	__class__s      w/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/umt5/modeling_umt5.pyr%   UMT5LayerNorm.__init__7   s/     	ll5::k#:; #    c                    UR                  [        R                  5      R                  S5      R	                  SSS9nU[        R
                  " X R                  -   5      -  nU R                  R                  [        R                  [        R                  4;   a%  UR                  U R                  R                  5      nU R                  U-  $ )N   T)keepdim)tor'   float32powmeanrsqrtr*   r)   dtypefloat16bfloat16)r+   hidden_statesvariances      r/   forwardUMT5LayerNorm.forward?   s     !##EMM266q9>>r4>P%H?T?T4T(UU ;; ??),,T[[->->?M{{]**r1   )r*   r)   )gư>)__name__
__module____qualname____firstlineno__r%   r@   __static_attributes____classcell__r.   s   @r/   r!   r!   6   s    $+ +r1   r!   c                   6   ^  \ rS rSrS\4U 4S jjrS rSrU =r$ )UMT5DenseActDenseP   configc                 X  > [         TU ]  5         [        R                  " UR                  UR
                  SS9U l        [        R                  " UR
                  UR                  SS9U l        [        R                  " UR                  5      U l
        [        UR                     U l        g NFbias)r$   r%   r   Lineard_modeld_ffwiwoDropoutdropout_ratedropoutr	   dense_act_fnactr+   rL   r.   s     r/   r%   UMT5DenseActDense.__init__Q   sn    ))FNNFKKeD))FKKeDzz&"5"56&--.r1   c                    U R                  U5      nU R                  U5      nU R                  U5      n[        U R                  R
                  [        R                  5      (       a  UR                  U R                  R
                  R                  :w  aa  U R                  R
                  R                  [        R                  :w  a/  UR                  U R                  R
                  R                  5      nU R	                  U5      nU$ N)rT   rZ   rX   
isinstancerU   r)   r'   Tensorr;   int8r6   r+   r>   s     r/   r@   UMT5DenseActDense.forwardX   s    ./]3tww~~u||44##tww~~';';;$$

2),,TWW^^-A-ABM.r1   )rZ   rX   rT   rU   	rB   rC   rD   rE   r   r%   r@   rF   rG   rH   s   @r/   rJ   rJ   P   s    /z / r1   rJ   c                   6   ^  \ rS rSrS\4U 4S jjrS rSrU =r$ )UMT5DenseGatedActDenseg   rL   c                   > [         TU ]  5         [        R                  " UR                  UR
                  SS9U l        [        R                  " UR                  UR
                  SS9U l        [        R                  " UR
                  UR                  SS9U l        [        R                  " UR                  5      U l        [        UR                     U l        g rN   )r$   r%   r   rQ   rR   rS   wi_0wi_1rU   rV   rW   rX   r	   rY   rZ   r[   s     r/   r%   UMT5DenseGatedActDense.__init__h   s    IIfnnfkkF	IIfnnfkkF	))FKKeDzz&"5"56&--.r1   c                 8   U R                  U R                  U5      5      nU R                  U5      nX#-  nU R                  U5      n[	        U R
                  R                  [        R                  5      (       a  UR                  U R
                  R                  R                  :w  aa  U R
                  R                  R                  [        R                  :w  a/  UR                  U R
                  R                  R                  5      nU R                  U5      nU$ r^   )rZ   ri   rj   rX   r_   rU   r)   r'   r`   r;   ra   r6   )r+   r>   hidden_geluhidden_linears       r/   r@   UMT5DenseGatedActDense.forwardp   s    hhtyy78		-0#3]3 tww~~u||44##tww~~';';;$$

2),,TWW^^-A-ABM.r1   )rZ   rX   ri   rj   rU   rd   rH   s   @r/   rf   rf   g   s    /z / r1   rf   c                   6   ^  \ rS rSrS\4U 4S jjrS rSrU =r$ )UMT5LayerFF   rL   c                   > [         TU ]  5         UR                  (       a  [        U5      U l        O[        U5      U l        [        UR                  UR                  S9U l	        [        R                  " UR                  5      U l        g )Nr-   )r$   r%   is_gated_actrf   DenseReluDenserJ   r!   rR   layer_norm_epsilon
layer_normr   rV   rW   rX   r[   s     r/   r%   UMT5LayerFF.__init__   s_    "8"@D"3F";D'F<U<UVzz&"5"56r1   c                 p    U R                  U5      nU R                  U5      nXR                  U5      -   nU$ r^   )rx   rv   rX   )r+   r>   forwarded_statess      r/   r@   UMT5LayerFF.forward   s;    ??=9../?@%5E(FFr1   )rv   rX   rx   rd   rH   s   @r/   rq   rq      s    7z 7 r1   rq   c            
          ^  \ rS rSrSrSS\S-  4U 4S jjjrS\R                  S\R                  4S jr	S	 r
SS
 jr   SS\R                  S\R                  S-  S\S-  S\R                  S-  4S jjrSrU =r$ )UMT5Attention   z/
T5's attention using relative_attention_bias.
N	layer_idxc                   > [         TU ]  5         UR                  U l        X l        UR                  U l        UR
                  U l        UR                  U l        UR                  U l        UR                  U l
        UR                  U l        U R                  U R                  -  U l        X0l        Uc>  U R                  (       a-  [        R!                  SU R"                  R$                   S35        [&        R(                  " U R                  U R                  SS9U l        [&        R(                  " U R                  U R                  SS9U l        [&        R(                  " U R                  U R                  SS9U l        [&        R(                  " U R                  U R                  SS9U l        U R                  (       a1  [&        R2                  " U R                  U R                  5      U l        g g )NzInstantiating a decoder z without passing `layer_idx` is not recommended and will to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` when creating this class.FrO   )r$   r%   
is_decoderhas_relative_attention_biasrelative_attention_num_bucketsrelative_attention_max_distancerR   d_kvkey_value_proj_dim	num_headsn_headsrW   rX   	inner_dimr   loggerwarning_oncer.   rB   r   rQ   qkvo	Embeddingrelative_attention_bias)r+   rL   r   r   r.   s       r/   r%   UMT5Attention.__init__   s`    +++F(.4.S.S+/5/U/U,~~"(++''**(?(??"*4>>+B+B*C D, , 4<<eD4<<eD4<<eD4>>4<<eD+++-<<8[8[]a]i]i+jD( ,r1   
projectionreturnc                     UR                  5       S S U R                  U R                  4-   nUR                  U5      R	                  SSSS5      nU$ )Nr4   r   r3   r   r   )sizer   r   viewpermute)r+   r   new_projection_shapenew_projections       r/   _shapeUMT5Attention._shape   sQ    )0"5tG^G^8__#)=>FFq!QPQRr1   c                    SnU R                   nU R                  nU R                  (       dC  US-  nX!S:  R                  [        R
                  5      U-  -  n[        R                  " U5      nO,[        R                  " U[        R                  " U5      5      * nUS-  nX:  n[        R                  " UR                  5       U-  5      [        R                  " XE-  5      -  nXsU-
  -  nXWR                  [        R
                  5      -   n[        R                  " U[        R                  " XS-
  5      5      nU[        R                  " XaU5      -  nU$ )aR  
Adapted from Mesh Tensorflow:
https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593

Translate relative position to a bucket number for relative attention. The relative position is defined as
memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to
position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for
small absolute relative_position and larger buckets for larger absolute relative_positions. All relative
positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket.
This should allow for more graceful generalization to longer sequences than the model has been trained on

Args:
    relative_position: an int32 Tensor
    bidirectional: a boolean - whether the attention is bidirectional
    num_buckets: an integer
    max_distance: an integer

Returns:
    a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets)
r   r3   r   )r   r   r   r6   r'   longabsmin
zeros_likelogfloatmath	full_likewhere)	r+   relative_positionrelative_bucketsnum_bucketsmax_distance	max_exactis_small	log_ratiorelative_position_if_larges	            r/   _relative_position_bucket'UMT5Attention._relative_position_bucket   s/   * 99;;AKQ!6 : :5:: F TT %		*; <!&+<e>N>NO`>a!b b  1$	$0 II/557)CDtxxP\PhGii	y!89	%.ejj1I%I"%*YY&8RbcTc(d&
" 	EKKE_``r1   c                    Uc   U R                   R                  R                  n[        R                  " U[        R
                  US9SS2S4   U-   n[        R                  " U[        R
                  US9SSS24   nXe-
  nU R                  U5      nU R                  U5      n	U	R                  / SQ5      R                  S5      n	U	$ )z%Compute binned relative position biasN)r;   device)r3   r   r   r   )	r   r)   r   r'   aranger   r   r   	unsqueeze)
r+   query_length
key_lengthr   past_seen_tokenscontext_positionmemory_positionr   relative_position_bucketvaluess
             r/   compute_biasUMT5Attention.compute_bias   s    >1188??F <<EJJvVWXZ^W^_brr,,zFSTXZ[T[\+>#'#A#ABS#T --.FG	*44Q7r1   r>   encoder_hidden_statespast_key_valuesattention_maskc                    UR                   S S u  pgUb  UR                  U R                  5      OSn[        U[        R
                  5      (       a  UR                  5       OUnUS Ln	U R                  U5      n
U
R                  USU R                  U R                  5      R                  SS5      n
SnUb[  [        U[        5      (       aF  UR                  R                  U R                  5      nU	(       a  UR                  nOUR                   nOUnU	(       a  UOUnU	(       aQ  UbN  U(       aG  UR"                  U R                     R$                  nUR"                  U R                     R&                  nOU R)                  U5      nU R+                  U5      nUR                  USU R                  U R                  5      R                  SS5      nUR                  USU R                  U R                  5      R                  SS5      nUbS  UR-                  XU R                  5      u  pU	(       a.  [        U[        5      (       a  SUR                  U R                  '   [        R.                  " XR                  SS5      5      nUR                   S   nU R0                  (       d9  [        R2                  " SU R                  UU4UR4                  UR6                  S	9nOU R9                  UUUR4                  US
9nUb  UU-   nUnUU-  n[:        R<                  R?                  URA                  5       SS9RC                  U5      n[:        R<                  RE                  UU RD                  U RF                  S9n[        R.                  " UU5      nUR                  SS5      RI                  5       nUR                  XgS5      nU RK                  U5      nUU4$ )Nr3   r   r4   r   FTr   )r   r;   )r   r   dim)ptraining)&shapeget_seq_lengthr   r_   r'   r`   cloner   r   r   r   	transposer   
is_updatedgetcross_attention_cacheself_attention_cachelayerskeysr   r   r   updatematmulr   zerosr   r;   r   r   
functionalsoftmaxr   type_asrX   r   
contiguousr   )r+   r>   r   r   r   kwargs
batch_size
seq_lengthr   is_cross_attentionquery_statesr   curr_past_key_valuescurrent_states
key_statesvalue_statesscoresr   position_biasposition_bias_maskedattn_weightsattn_outputs                         r/   r@   UMT5Attention.forward   sU    "/!4!4Ra!8
M\Mh?99$..Ino7ABRTYT`T`7a7a+113gw 3$>vvm,#((RtG^G^_iijkmno 
&:oGZ+[+[(3377GJ!'6'L'L$'6'K'K$#2 2D.-/"=*-44T^^DIIJ/66t~~FMML/J66.1L#RtG^G^_iijkmnoJ',,ZT\\4KbKbcmmnoqrsL*+?+F+Fzaeaoao+p(
%*_FY*Z*ZAEO..t~~> l,@,@A,FG%%b)
//!KKDLL*j9&--W]WcWcM !--Jv}}O_ . M %)N:M,&& }},,V\\^,DLLVT}},,\T\\TXTaTa,bll<>!++Aq1<<>!&&zrBff[)L((r1   )rR   rX   r   r   r   r   r   r   r   r   r   r   r   r   r   )FN)Nr   NNN)rB   rC   rD   rE   __doc__intr%   r'   r`   r   r   r   r
   r@   rF   rG   rH   s   @r/   r~   r~      s    kSSWZ k k6 %,, - ^
 6:(,.2N)||N)  %||d2N) 	N)
 t+N) N)r1   r~   c                   H   ^  \ rS rSrSS\S-  4U 4S jjjr  SS jrSrU =r$ )	UMT5LayerSelfAttentioniI  Nr   c                    > [         TU ]  5         [        USUS9U l        [	        UR
                  UR                  S9U l        [        R                  " UR                  5      U l        g )NTr   r   rt   )r$   r%   r~   SelfAttentionr!   rR   rw   rx   r   rV   rW   rX   r+   rL   r   r.   s      r/   r%   UMT5LayerSelfAttention.__init__J  sN    *6t_hi'F<U<UVzz&"5"56r1   c                     U R                  U5      nU R                  UUUS9nXR                  US   5      -   nU4USS  -   nU$ )Nr   r   r   r   )rx   r   rX   )r+   r>   r   r   r   normed_hidden_statesattention_outputoutputss           r/   r@   UMT5LayerSelfAttention.forwardP  se      $}=-- )+ . 

 &5Ea5H(II "%5ab%99r1   )r   rX   rx   r^   )NN	rB   rC   rD   rE   r   r%   r@   rF   rG   rH   s   @r/   r   r   I  s*    7#* 7 7 	 r1   r   c                   J   ^  \ rS rSrSS\S-  4U 4S jjjr   SS jrSrU =r$ )	UMT5LayerCrossAttentionib  Nr   c                    > [         TU ]  5         [        USUS9U l        [	        UR
                  UR                  S9U l        [        R                  " UR                  5      U l        g )NFr   rt   )r$   r%   r~   EncDecAttentionr!   rR   rw   rx   r   rV   rW   rX   r   s      r/   r%    UMT5LayerCrossAttention.__init__c  sO    ,VQVbkl'F<U<UVzz&"5"56r1   c                     U R                  U5      nU R                  UUUUS9nXR                  US   5      -   nU4USS  -   n	U	$ )Nr   r   r   r   r   )rx   r   rX   )
r+   r>   r   r   r   r   r   r   layer_outputr   s
             r/   r@   UMT5LayerCrossAttention.forwardi  sg      $}=// "7)+	 0 
 %||4DQ4G'HH/$4QR$88r1   )r   rX   rx   r^   r   r   rH   s   @r/   r   r   b  s-    7#* 7 7 # r1   r   c                   P   ^  \ rS rSrSS\S-  4U 4S jjjr      SS jrSrU =r$ )		UMT5Blocki}  Nr   c                 j  > [         TU ]  5         UR                  U l        [        R                  " 5       U l        U R
                  R                  [        XS95        U R                  (       a"  U R
                  R                  [        XS95        U R
                  R                  [        U5      5        g )Nr   )
r$   r%   r   r   
ModuleListlayerappendr   r   rq   r   s      r/   r%   UMT5Block.__init__~  sv     ++]]_


0MN??JJ5fRS

+f-.r1   c                    U R                   S   " UUUS9u  pUR                  [        R                  :X  a}  [        R                  " UR                  5      R
                  n
[        R                  " [        R                  " U5      R                  5       U
S-
  U
5      n[        R                  " X* US9nS nU R                  =(       a    US LnU(       a  U R                   S   " UUUUS9u  pUR                  [        R                  :X  a}  [        R                  " UR                  5      R
                  n
[        R                  " [        R                  " U5      R                  5       U
S-
  U
5      n[        R                  " X* US9nU R                   S   " U5      nUR                  [        R                  :X  a}  [        R                  " UR                  5      R
                  n
[        R                  " [        R                  " U5      R                  5       U
S-
  U
5      n[        R                  " X* US9nU4nU(       a  XU4-  nU$ )Nr   r   i  )r   maxr   r   r4   )r  r;   r'   r<   finfor  r   isinfanyclampr   )r+   r>   r   r   encoder_attention_maskr   	use_cacheoutput_attentionsr   self_attn_weights	max_dtypeclamp_valuecross_attn_weightsdo_cross_attentionr   s                  r/   r@   UMT5Block.forward  s    ,0::a=)+,
( %--/M$7$78<<I++ekk-&@&D&D&F	TXHXZcdK!KK<[YM "!__R1Fd1R04

1&;5 /	1-M ""emm3!KK(;(;<@@	#kk%++m*D*H*H*JIX\L\^gh %M|Q\ ] 

2}5 %--/M$7$78<<I++ekk-&@&D&D&F	TXHXZcdK!KK<[YM "+=>>Gr1   )r   r  r^   )NNNNFFr   rH   s   @r/   r  r  }  s6    /#* / / "#5 5r1   r  c                   n   ^  \ rS rSrSrS\4U 4S jjrS\R                  S\R                  4S jr	Sr
U =r$ )	UMT5ClassificationHeadi  z-Head for sentence-level classification tasks.rL   c                 (  > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " UR                  S9U l        [        R                  " UR                  UR                  5      U l
        g )N)r   )r$   r%   r   rQ   rR   denserV   classifier_dropoutrX   
num_labelsout_projr[   s     r/   r%   UMT5ClassificationHead.__init__  sZ    YYv~~v~~>
zzF$=$=>		&..&2C2CDr1   r>   r   c                     U R                  U5      nU R                  U5      n[        R                  " U5      nU R                  U5      nU R	                  U5      nU$ r^   )rX   r  r'   tanhr  rb   s     r/   r@   UMT5ClassificationHead.forward  sN    ]3

=1

=1]3m4r1   )r  rX   r  )rB   rC   rD   rE   r   r   r%   r'   r`   r@   rF   rG   rH   s   @r/   r  r    s4    7Ez EU\\ ell  r1   r  c                   |    \ rS rSr% \\S'   SrSrSrS/r	S/r
\S 5       r\R                  " 5       S 5       rS	 rS
rg)UMT5PreTrainedModeli  rL   transformerTr  rU   c                 z    [         R                  " [        5      n[         R                  " [        5      nUUUS.nU$ )N)decoder_input_ids	input_idsdecoder_attention_mask)r'   tensorr   r   )r+   r)  
input_maskdummy_inputss       r/   r-   UMT5PreTrainedModel.dummy_inputs  s6    LL.	\\*-
!*"&0

 r1   c                    U R                   R                  n[        U[        5      (       a%  [        R
                  " UR                  US-  5        g	[        U[        [        [        [        45      (       Ga  [        R                  " UR                  R                  SUS-  S9  [        US5      (       aH  U R                   R                  (       d-  [        R                  " UR                  R                  SUS-  S9  [        US5      (       an  [        R                  " UR                   R                  SX R                   R"                  S-  -  S9  [        R$                  " UR                   R&                  5        g	g	[        U[(        5      (       aj  [        US5      (       aX  [        R                  " UR*                  R                  SUS-  S9  [        R$                  " UR*                  R&                  5        g	g	[        U[,        5      (       GaA  [        R                  " UR.                  R                  SX R                   R"                  S-  -  S9  [        UR.                  S5      (       aA  UR.                  R&                  b*  [        R$                  " UR.                  R&                  5        [        R                  " UR0                  R                  SX R                   R"                  S-  -  S9  [        UR0                  S5      (       aC  UR0                  R&                  b+  [        R$                  " UR0                  R&                  5        g	g	g	[        U[2        5      (       GaA  [        R                  " UR4                  R                  SX R                   R"                  S-  -  S9  [        UR4                  S5      (       aA  UR4                  R&                  b*  [        R$                  " UR4                  R&                  5        [        R                  " UR6                  R                  SX R                   R8                  S-  -  S9  [        UR6                  S5      (       aC  UR6                  R&                  b+  [        R$                  " UR6                  R&                  5        g	g	g	[        U[:        5      (       Ga  [        R                  " UR<                  R                  SX R                   R"                  S-  -  S9  [        UR<                  S5      (       aA  UR<                  R&                  b*  [        R$                  " UR<                  R&                  5        [        R                  " UR>                  R                  SX R                   R"                  S-  -  S9  [        UR>                  S5      (       aA  UR>                  R&                  b*  [        R$                  " UR>                  R&                  5        [        R                  " UR6                  R                  SX R                   R8                  S-  -  S9  [        UR6                  S5      (       aC  UR6                  R&                  b+  [        R$                  " UR6                  R&                  5        g	g	g	[        U[@        5      (       GaF  U R                   R"                  nU R                   RB                  nU R                   RD                  n[        R                  " URF                  R                  SX#U-  S-  -  S9  [        R                  " URH                  R                  SX#S-  -  S9  [        R                  " URJ                  R                  SX#S-  -  S9  [        R                  " URL                  R                  SX%U-  S-  -  S9  URN                  (       a0  [        R                  " URP                  R                  SX#S-  -  S9  g	g	g	)
zInitialize the weights      ?g        )r9   stdlm_head
qa_outputs      
classifierrP   N))rL   initializer_factorr_   r!   init	constant_r)   	UMT5ModelUMT5ForConditionalGenerationUMT5EncoderModelUMT5ForQuestionAnsweringnormal_sharedhasattrtie_word_embeddingsr2  r3  rR   zeros_rP   UMT5ForTokenClassificationr5  r  r  r  rJ   rT   rU   rS   rf   ri   rj   r~   r   r   r   r   r   r   r   r   )r+   modulefactorrR   r   r   s         r/   _init_weights!UMT5PreTrainedModel._init_weights  sW    //fm,,NN6==&3,7, (	
 
 LL--CVc\Jvy))$++2Q2QV^^22&3,Ov|,,V..55CVP[P[PcPchlOlEmnF--223 -  :;;v|,,V..55CVc\RF--223 -  677LL,,3F{{GZGZ_cFc<dev||V,,1B1B1NFLL--.LL//cv++J]J]bfIf?ghv//FOO4H4H4TFOO001 5U/ 122 LL))KKDWDW\`C`9abvyy&))fiinn.HFIINN+LL))KKDTDTY]C]9^_vyy&))fiinn.HFIINN+ /I) 677LL++#6kkFYFY^bEb;cdv{{F++0@0@0LFKK,,-LL++#6kkFYFY^bEb;cdv{{F++0@0@0LFKK,,-LL))KKDTDTY]C]9^_vyy&))fiinn.HFIINN+ /I).. kk))G!%!1!1kk++GLLsM_C_dhBh8ijLLs4-8PQLLs4-8PQLLsM_C_dhBh8ij11V;;BBRXim\mRno 2 /r1   c                 :   U R                   R                  nU R                   R                  nUc  [        S5      eUR	                  UR
                  5      nUSS S24   R                  5       USSS 24'   X$S'   Uc  [        S5      eUR                  US:H  U5        U$ )Nzself.model.config.decoder_start_token_id has to be defined. In UMT5 it is usually set to the pad_token_id. See UMT5 docs for more information..r4   r   ).r   z1self.model.config.pad_token_id has to be defined.)rL   decoder_start_token_idpad_token_id
ValueError	new_zerosr   r   masked_fill_)r+   r)  rI  rJ  shifted_input_idss        r/   _shift_right UMT5PreTrainedModel._shift_right+  s    !%!C!C{{//!)6 
 &//	@%.sCRCx%8%>%>%@#qr'"$:&!PQQ&&'8D'@,O  r1    N)rB   rC   rD   rE   r   __annotations__base_model_prefixsupports_gradient_checkpointing_can_compile_fullgraph_no_split_modules_keep_in_fp32_modulespropertyr-  r'   no_gradrE  rO  rF   rQ  r1   r/   r%  r%    s`    %&*#!$!F  ]]_@p @pD!r1   r%  c                   L   ^  \ rS rSrU 4S jrS r          SS jrSrU =r$ )	UMT5StackiA  c           
        > [         TU ]  U5        [        R                  " UR                  UR
                  5      U l        UR                  U l        [        R                  " [        UR                  5       Vs/ s H  n[        XS9PM     sn5      U l        [        UR
                  UR                  S9U l        [        R                   " UR"                  5      U l        SU l        U R)                  5         g s  snf )Nr  rt   F)r$   r%   r   r   
vocab_sizerR   embed_tokensr   r  range
num_layersr  blockr!   rw   final_layer_normrV   rW   rX   gradient_checkpointing	post_init)r+   rL   ir.   s      r/   r%   UMT5Stack.__init__B  s     LL):):FNNK ++]]ERXRcRcLd#eLdqIf$BLd#ef
 -fnn&B[B[ \zz&"5"56 ',# $fs   9C8c                     Xl         g r^   )r^  r+   new_embeddingss     r/   set_input_embeddingsUMT5Stack.set_input_embeddingsN  s    *r1   c                    Ub  UOU R                   R                  nUb  UOU R                   R                  nU	b  U	OU R                   R                  n	U
b  U
OU R                   R                  n
Ub*  Ub'  U R
                  (       a  SOSn[        SU SU S35      eUb&  UR                  5       nUR                  SUS   5      nO>Ub  UR                  5       S S nO'U R
                  (       a  SOSn[        SU SU S	35      eU R                  (       a/  U R                  (       a  U(       a  [        R                  S
5        SnUc)  U R                  c  [        S5      eU R                  U5      nUu  pUSL a   U R
                  (       d  [        SU  S35      eU R
                  (       ah  U(       a`  Uc]  U R                   R                  (       a/  [        [!        U R                   S9[!        U R                   S95      nO'[!        U R                   S9nOU R
                  (       d  S nUb  UR#                  5       OSnUc4  [%        5       (       d%  UU-   n[&        R(                  " UUUR*                  S9nU R
                  (       a  [-        U R                   UUUS9nO\UbW  US S 2S S S S 24   nUR/                  UR0                  S9nSU-
  [&        R2                  " UR0                  5      R4                  -  nOS nU R
                  (       aO  UbL  UR                  5       u  nnnUU4nUc  [&        R(                  " UUR*                  S9nU R7                  U5      nOS nU	(       a  SOS nU(       a  SOS nU(       a  U R
                  (       a  SOS nU R9                  U5      n[;        U R<                  5       HR  u  nnU	(       a  UU4-   nU" UUUUUUUS9nUS   nU(       d  M-  UUS   4-  nU R
                  (       d  MI  UUS   4-  nMT     U R?                  U5      nU R9                  U5      nU	(       a  UU4-   nU
(       d  [A        S UUUUU4 5       5      $ [C        UUUUUS9$ )Ndecoder_ zYou cannot specify both zinput_ids and zinputs_embeds at the same timer4   zYou have to specify either zinput_ids or inputs_embedszZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...Fz<You have to initialize the model with valid token embeddingsTz)`use_cache` can only be set to `True` if z is used as a decoder)rL   r   )r   )rL   ro  r   r   )r;   r0  rQ  )r  r   r  r  r   r3   c              3   0   #    U  H  nUc  M  Uv   M     g 7fr^   rQ  ).0r   s     r/   	<genexpr>$UMT5Stack.forward.<locals>.<genexpr>  s"      
A  s   	)last_hidden_stater   r>   
attentionscross_attentions)"rL   r  r  output_hidden_statesreturn_dictr   rK  r   r   rc  r   r   r   r^  is_encoder_decoderr   r   r   r   r'   r(   r   r   r6   r;   r  r   invert_attention_maskrX   	enumeratera  rb  tupler   )r+   r)  r   r   r  ro  r   r  r  rw  rx  r   err_msg_prefixinput_shaper   r   past_key_values_lengthmask_seq_lengthcausal_maskencoder_batch_sizeencoder_sequence_length_encoder_hidden_shapeencoder_extended_attention_maskall_hidden_statesall_attentionsall_cross_attentionsr>   re  layer_modulelayer_outputss                                  r/   r@   UMT5Stack.forwardQ  sY    "+!6IDKK<Q<Q	1B1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++BYBY ]%>+/??ZN*>*:.HXXvw  "#..*K!r;r?;I&',,.s3K+/??ZN:>:J-XfWggtuvv&&4==##p "	   ( !_`` --i8M!,
?? #LTFRg!hii ??_4;;11&9$DKK8,dkk:Z'O '3$++&FO #OETE`!?!?!Afg!*B*D*D4zAO"ZZ
OML`L`aN??,{{+- /	K '(D$)9:K%..}/B/B.CK,M<O<O0P0T0TTKK ??4@=R=W=W=Y: 7$68O#P %-).4HQ^QeQe)f&.2.H.HI_.`+.2+"6BD0d%64??rPT]3(4OA|#$58H$H!(%'F /#"3M *!,M  =#3"55???(]1-=,??('  5* --m<]3   1]4D D 
 "#%"(
 
 
 9+++%1
 	
r1   )ra  rX   r^  rb  rc  r   )
NNNNNNNNNN)	rB   rC   rD   rE   r%   rj  r@   rF   rG   rH   s   @r/   r[  r[  A  s9    
+
 "#!V
 V
r1   r[  c                     ^  \ rS rSr% SrSr\\S'   SSS.rU 4S jr	S r
S	 r\            SS\R                  S
-  S\R                  S
-  S\R                  S
-  S\R                   S
-  S\\\R                        S
-  S\S
-  S\R&                  S
-  S\R&                  S
-  S\S
-  S\S
-  S\S
-  S\S
-  S\\R                     \-  4S jj5       rSrU =r$ )r9  i  a?  
Examples:

```python
>>> from transformers import UMT5Model, AutoTokenizer

>>> model = UMT5Model.from_pretrained("google/umt5-small")
>>> tokenizer = AutoTokenizer.from_pretrained("google/umt5-small")
>>> noisy_text = "UN Offizier sagt, dass weiter <extra_id_0> werden muss in Syrien."
>>> label = "<extra_id_0> verhandelt"
>>> inputs = tokenizer(inputs, return_tensors="pt")
>>> labels = tokenizer(label=label, return_tensors="pt")

>>> outputs = model(input_ids=inputs["input_ids"], decoder_input_ids=labels["input_ids"])
>>> hidden_states = outputs.last_hidden_state
```umt5rL   shared.weightencoder.embed_tokens.weightdecoder.embed_tokens.weightc                   > [         TU ]  U5        [        R                  " UR                  UR
                  5      U l        [        R                  " U5      nSUl	        SUl
        [        U5      U l        [        R                  " U5      nSUl	        UR                  Ul        [        U5      U l        U R!                  5         g NFT)r$   r%   r   r   r]  rR   r>  copydeepcopyr   r  r[  encodernum_decoder_layersr`  decoderrd  r+   rL   encoder_configdecoder_configr.   s       r/   r%   UMT5Model.__init__  s     ll6#4#4fnnEv.$)!#(  0v.$(!$*$=$=! 0 	r1   c                     U R                   $ r^   r>  r+   s    r/   get_input_embeddingsUMT5Model.get_input_embeddings      {{r1   c                 |    Xl         U R                  R                  U5        U R                  R                  U5        g r^   r>  r  rj  r  rh  s     r/   rj  UMT5Model.set_input_embeddings  +    $)).9)).9r1   Nr)  r   r(  r*  encoder_outputsr   ro  decoder_inputs_embedsr  r  rw  rx  r   c                 R   U	b  U	OU R                   R                  n	Ub  UOU R                   R                  nUc  U R                  UUUU
UUS9nORU(       aK  [	        U[
        5      (       d6  [        US   [        U5      S:  a  US   OS[        U5      S:  a  US   OSS9nUS   nU R                  UUUUUUU	U
UUS9
nU(       d  X-   $ [        UR                  UR                  UR                  UR                  UR                  UR                  UR                  UR                  S9$ )	aP	  
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
    Indices of input sequence tokens in the vocabulary. UMT5 is a model with relative position embeddings so
    you should be able to pad the inputs on both the right and the left.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for detail.

    [What are input IDs?](../glossary#input-ids)

    To know more on how to prepare `input_ids` for pretraining take a look a [UMT5 Training](./umt5#training).
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Indices of decoder input sequence tokens in the vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are decoder input IDs?](../glossary#decoder-input-ids)

    UMT5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
    is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

    To know more on how to prepare `decoder_input_ids` for pretraining take a look at [UMT5
    Training](./umt5#training).
decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
    be used by default.

Example:

```python
>>> from transformers import AutoTokenizer, UMT5Model

>>> tokenizer = AutoTokenizer.from_pretrained("google/umt5-small")
>>> model = UMT5Model.from_pretrained("google/umt5-small")

>>> input_ids = tokenizer(
...     "Studies have been shown that owning a dog is good for you", return_tensors="pt"
... ).input_ids  # Batch size 1
>>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids  # Batch size 1

>>> # preprocess: Prepend decoder_input_ids with start token which is pad token for UMT5Model.
>>> # This is not needed for torch's UMT5ForConditionalGeneration as it does this internally using labels arg.
>>> decoder_input_ids = model._shift_right(decoder_input_ids)

>>> # forward pass
>>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
>>> last_hidden_states = outputs.last_hidden_state
```Nr)  r   ro  r  rw  rx  r   r   r3   rt  r>   ru  
r)  r   ro  r   r   r  r  r  rw  rx  )rt  r   decoder_hidden_statesdecoder_attentionsrv  encoder_last_hidden_stater   encoder_attentions)rL   r  rx  r  r_   r   lenr  r   rt  r   r>   ru  rv  )r+   r)  r   r(  r*  r  r   ro  r  r  r  rw  rx  r   r>   decoder_outputss                   r/   r@   UMT5Model.forward  sP   D "+!6IDKK<Q<Q	%0%<k$++BYBY ""ll#-+"3%9' + O O_!M!M-"1!"4474H14Loa0RV14_1E1I?1-tO (* ,,'1/+"/#1/!5# ' 
 "44!-??+;;"1"?"?.99,==&5&G&G"1"?"?.99	
 		
r1   )r  r  r>  NNNNNNNNNNNN)rB   rC   rD   rE   r   
model_typer   rR  _tied_weights_keysr%   r  rj  r   r'   
LongTensorFloatTensor
BoolTensorr|  r
   r`   boolr   r@   rF   rG   rH   s   @r/   r9  r9    ss   " J'6'6
$:
  .23759:>BF(,-159!%)-,0#'q
##d*q
 ))D0q
 !++d2	q

 !& 0 04 7q
 uU%6%6784?q
 q
 ||d*q
  %||d2q
 $;q
  $;q
 #Tkq
 D[q
 
u  	!$6	6q
 q
r1   r9  z<
    UMT5 Model with a `language modeling` head on top.
    )custom_introc                     ^  \ rS rSrSrSrSSSS.rU 4S jrS rS r	\
             SS
\R                  S	-  S\R                  S	-  S\R                  S	-  S\R                  S	-  S\\\R                         S	-  S\S	-  S\R                  S	-  S\R                  S	-  S\R                  S	-  S\S	-  S\S	-  S\S	-  S\S	-  S\\R                     \-  4S jj5       rS\R                   4S jrSrU =r$ )r:  i  a  
Examples:

```python
>>> from transformers import UMT5ForConditionalGeneration, AutoTokenizer

>>> model = UMT5ForConditionalGeneration.from_pretrained("google/umt5-small")
>>> tokenizer = AutoTokenizer.from_pretrained("google/umt5-small")
>>> article = "UN Offizier sagt, dass weiter verhandelt werden muss in Syrien."
>>> summary = "Weiter Verhandlung in Syrien."
>>> inputs = tokenizer(article, text_target=summary, return_tensors="pt")

>>> outputs = model(**inputs)
>>> loss = outputs.loss
```r  r  )r  r  zlm_head.weightc                   > [         TU ]  U5        UR                  U l        [        R
                  " UR                  UR                  5      U l        [        R                  " U5      nSUl
        SUl        [        U5      U l        [        R                  " U5      nSUl
        UR                  Ul        [        U5      U l        [        R"                  " UR                  UR                  SS9U l        U R'                  5         g )NFTrO   )r$   r%   rR   	model_dimr   r   r]  r>  r  r  r   r  r[  r  r  r`  r  rQ   r2  rd  r  s       r/   r%   %UMT5ForConditionalGeneration.__init__  s     ll6#4#4fnnEv.$)!#(  0v.$(!$*$=$=! 0yy1B1BO 	r1   c                     U R                   $ r^   r  r  s    r/   r  1UMT5ForConditionalGeneration.get_input_embeddings  r  r1   c                 |    Xl         U R                  R                  U5        U R                  R                  U5        g r^   r  rh  s     r/   rj  1UMT5ForConditionalGeneration.set_input_embeddings  r  r1   Nr)  r   r(  r*  r  r   ro  r  labelsr  r  rw  rx  r   c                    U
b  U
OU R                   R                  n
Ub  UOU R                   R                  nUc  U R                  UUUUUUS9nORU(       aK  [	        U[
        5      (       d6  [        US   [        U5      S:  a  US   OS[        U5      S:  a  US   OSS9nUS   nU	b  Uc  Uc  U R                  U	5      nU R                  UUUUUUU
UUUS9
nUS   nU R                   R                  (       a  UU R                  S-  -  nU R                  U5      nSnU	b[  [        S	S
9nU	R                  UR                  5      n	U" UR                  SUR!                  S5      5      U	R                  S5      5      nU(       d  U4USS -   U-   nUb  U4U-   $ U$ [#        UUUR$                  UR&                  UR(                  UR*                  UR,                  UR&                  UR(                  S9	$ )aP
  
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
    Indices of input sequence tokens in the vocabulary. UMT5 is a model with relative position embeddings so
    you should be able to pad the inputs on both the right and the left.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for detail.

    [What are input IDs?](../glossary#input-ids)

    To know more on how to prepare `input_ids` for pretraining take a look a [UMT5 Training](./umt5#training).
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Indices of decoder input sequence tokens in the vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are decoder input IDs?](../glossary#decoder-input-ids)

    UMT5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
    is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

    To know more on how to prepare `decoder_input_ids` for pretraining take a look at [UMT5
    Training](./umt5#training).
decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
    be used by default.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[-100, 0, ...,
    config.vocab_size - 1]`. All labels set to `-100` are ignored (masked), the loss is only computed for
    labels in `[0, ..., config.vocab_size]`

Examples:

```python
>>> from transformers import AutoTokenizer, UMT5ForConditionalGeneration

>>> tokenizer = AutoTokenizer.from_pretrained("google/umt5-small")
>>> model = UMT5ForConditionalGeneration.from_pretrained("google/umt5-small")

>>> # training
>>> input_ids = tokenizer("The <extra_id_0> walks in <extra_id_1> park", return_tensors="pt").input_ids
>>> labels = tokenizer("<extra_id_0> cute dog <extra_id_1> the <extra_id_2>", return_tensors="pt").input_ids
>>> outputs = model(input_ids=input_ids, labels=labels)
>>> loss = outputs.loss
>>> logits = outputs.logits

>>> # inference
>>> input_ids = tokenizer("Studies have shown that <extra_id_0> good for you", return_tensors="pt").input_ids
>>> outputs = model.generate(input_ids)
>>> tokenizer.decode(outputs[0], skip_special_tokens=True)
```Nr  r   r   r3   r  r  r4  rH  ignore_indexr4   	losslogitsr   r  r  rv  r  r   r  )rL   r  rx  r  r_   r   r  rO  r  r@  r  r2  r   r6   r   r   r   r   r   r>   ru  rv  rt  )r+   r)  r   r(  r*  r  r   ro  r  r  r  r  rw  rx  r   r>   r  sequence_output	lm_logitsr  loss_fctoutputs                         r/   r@   $UMT5ForConditionalGeneration.forward  s"   L "+!6IDKK<Q<Q	%0%<k$++BYBY ""ll#-+"3%9' + O O_!M!M-"1!"4474H14Loa0RV14_1E1I?1-tO (*"3";@U@] $ 1 1& 9 ,,'1/+"/#1/!5# ' 
 *!,;;** .1EFOLL1	'T:HYYy//0FINN2y~~b/ABFKKPROTD\OAB$77/IF)-)9TGf$EvE+;;"1"?"?.99,==&5&G&G"1"?"?.99

 
	
r1   c                 $    U R                  U5      $ r^   )rO  )r+   r  s     r/   %prepare_decoder_input_ids_from_labelsBUMT5ForConditionalGeneration.prepare_decoder_input_ids_from_labels`  s      ((r1   )r  r  r2  r  r>  NNNNNNNNNNNNN)rB   rC   rD   rE   r   r  r  r%   r  rj  r   r'   r  r  r  r|  r`   r
   r  r   r@   r  rF   rG   rH   s   @r/   r:  r:    s     J'6'6),:
  .23759:>=A(,26:>*.!%)-,0#'L
##d*L
 ))D0L
 !++d2	L

 !& 0 04 7L
 uU\\23d:L
 L
 ((4/L
  %0047L
   4'L
 $;L
  $;L
 #TkL
 D[L
  
u  	!O	3!L
 L
^)ELL ) )r1   r:  c                     ^  \ rS rSrSrSrSS0rU 4S jrS rS r	\
      SS
\R                  S	-  S\R                  S	-  S\R                  S	-  S\S	-  S\S	-  S\S	-  S\\R                     \-  4S jj5       rSrU =r$ )r;  id  a  
Examples:

```python
>>> from transformers import UMT5EncoderModel, AutoTokenizer

>>> model = UMT5EncoderModel.from_pretrained("google/umt5-small")
>>> tokenizer = AutoTokenizer.from_pretrained("google/umt5-small")
>>> article = "UN Offizier sagt, dass weiter verhandelt werden muss in Syrien."
>>> input_ids = tokenizer(article, return_tensors="pt").input_ids
>>> outputs = model(input_ids)
>>> hidden_state = outputs.last_hidden_state
```r  r  r  c                   > [         TU ]  U5        [        R                  " UR                  UR
                  5      U l        [        R                  " U5      nSUl	        SUl
        [        U5      U l        U R                  5         g )NF)r$   r%   r   r   r]  rR   r>  r  r  r  ry  r[  r  rd  )r+   rL   r  r.   s      r/   r%   UMT5EncoderModel.__init__z  sb     ll6#4#4fnnEv.#( ,1) 0 	r1   c                     U R                   $ r^   r  r  s    r/   r  %UMT5EncoderModel.get_input_embeddings  r  r1   c                 F    Xl         U R                  R                  U5        g r^   )r>  r  rj  rh  s     r/   rj  %UMT5EncoderModel.set_input_embeddings  s    $)).9r1   Nr)  r   ro  r  rw  rx  r   c           	      d    Ub  UOU R                   R                  nU R                  UUUUUUS9nU$ )a  
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
    Indices of input sequence tokens in the vocabulary. UMT5 is a model with relative position embeddings so you
    should be able to pad the inputs on both the right and the left.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for detail.

    To know more on how to prepare `input_ids` for pretraining take a look a [UMT5 Training](./umt5#training).

Example:

```python
>>> from transformers import AutoTokenizer, UMT5EncoderModel

>>> tokenizer = AutoTokenizer.from_pretrained("google/umt5-small")
>>> model = UMT5EncoderModel.from_pretrained("google/umt5-small")
>>> input_ids = tokenizer(
...     "Studies have been shown that owning a dog is good for you", return_tensors="pt"
... ).input_ids  # Batch size 1
>>> outputs = model(input_ids=input_ids)
>>> last_hidden_states = outputs.last_hidden_state
```r  )rL   rx  r  )	r+   r)  r   ro  r  rw  rx  r   r  s	            r/   r@   UMT5EncoderModel.forward  sH    F &1%<k$++BYBY,,)'/!5# ' 
 r1   )r  r>  )NNNNNN)rB   rC   rD   rE   r   r  r  r%   r  rj  r   r'   r  r  r  r|  r   r@   rF   rG   rH   s   @r/   r;  r;  d  s     J 	&
:  .23726)-,0#',##d*, ))D0, ((4/	,
  $;, #Tk, D[, 
u  	!O	3, ,r1   r;  z
    UMT5 model with a sequence classification/head on top (a linear layer on top of the pooled output) e.g. for GLUE
    tasks.
    c                     ^  \ rS rSrS/rS\4U 4S jjr\            SS\R                  S-  S\R                  S-  S\R                  S-  S	\R                  S-  S
\\R                     S-  S\R                  S-  S\R                  S-  S\R                  S-  S\S-  S\S-  S\S-  S\S-  S\\-  4S jj5       rSrU =r$ )UMT5ForSequenceClassificationi  Fdecoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weightrL   c                    > [         TU ]  U5        [        U5      U l        [	        U5      U l        U R                  5         g r^   )r$   r%   r9  r&  r  classification_headrd  r[   s     r/   r%   &UMT5ForSequenceClassification.__init__  s6     $V,#9&#A  	r1   Nr)  r   r(  r*  r  ro  r  r  r  r  rw  rx  r   c                 4   Ub  UOU R                   R                  nUb  Sn	Uc%  Ub"  [        SU R                  R                   35      eUc"  Uc  Uc  [        S5      eU R                  U5      nU R                  UUUUUUUU	U
UUS9nUS   nUR                  U R                   R                  5      R                  UR                  5      n[        [        R                  " UR                  S5      5      R!                  5       S:H  S5        UR"                  u  nnnUUSS24   R%                  US	U5      SS2S	SS24   nU R'                  U5      nSnUGb  UR                  UR                  5      nU R                   R(                  c  U R                   R*                  S:X  a  S
U R                   l        OyU R                   R*                  S:  aN  UR,                  [        R.                  :X  d  UR,                  [        R0                  :X  a  SU R                   l        OSU R                   l        U R                   R(                  S
:X  aT  [3        5       nU R                   R*                  S:X  a&  U" UR5                  5       UR5                  5       5      nOU" UU5      nOU R                   R(                  S:X  aG  [7        5       nU" UR%                  S	U R                   R*                  5      UR%                  S	5      5      nO-U R                   R(                  S:X  a  [9        5       nU" UU5      nU(       d  U4USS -   nUb  U4U-   $ U$ [;        UUUR<                  UR>                  UR@                  URB                  URD                  URF                  URH                  S9	$ )a  
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
    Indices of input sequence tokens in the vocabulary. UMT5 is a model with relative position embeddings so
    you should be able to pad the inputs on both the right and the left.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for detail.

    [What are input IDs?](../glossary#input-ids)

    To know more on how to prepare `input_ids` for pretraining take a look a [UMT5 Training](./umt5#training).
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Indices of decoder input sequence tokens in the vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are decoder input IDs?](../glossary#decoder-input-ids)

    UMT5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
    is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

    To know more on how to prepare `decoder_input_ids` for pretraining take a look at [UMT5
    Training](./umt5#training).
decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
    be used by default.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
NFz8Passing input embeddings is currently not supported for If no `decoder_input_ids` or `decoder_inputs_embeds` are passed, `input_ids` cannot be `None`. Please pass either `input_ids` or `decoder_input_ids` or `decoder_inputs_embeds`.)
r   r(  r*  r  ro  r  r  r  rw  rx  r   r   z7All examples must have the same number of <eos> tokens.r4   
regressionsingle_label_classificationmulti_label_classificationr  )%rL   rx  NotImplementedErrorr.   rB   rK  rO  r&  eqeos_token_idr6   r   r   r'   unique_consecutivesumnumelr   r   r  problem_typer  r;   r   r   r   squeezer   r   r   r   r  r  rv  r  r   r  )r+   r)  r   r(  r*  r  ro  r  r  r  r  rw  rx  r   r   r  eos_maskr   r  r,   sentence_representationr  r  r  r  s                            r/   r@   %UMT5ForSequenceClassification.forward  sN   ` &1%<k$++BYBYI!:%J4>>KbKbJcd  $)>)F  U 
 !% 1 1) <"")/#9+'"7/!5# # 
 "!*<< 8 89<<_=S=ST$$X\\!_5;;=BE	
 &5%:%:"
A{"1(A+">"C"CJPRT_"`abdfhiai"j))*ABYYv}}-F{{''/;;))Q./;DKK,[[++a/V\\UZZ5OSYS_S_chclclSl/LDKK,/KDKK,{{''<7"9;;))Q.#FNN$4fnn6FGD#FF3D))-JJ+-B0F0F GUWY))-II,./Y,F)-)9TGf$EvE.#33")"?"?&99$55&-&G&G")"?"?&99

 
	
r1   )r  r&  r  )rB   rC   rD   rE   "_keys_to_ignore_on_load_unexpectedr   r%   r   r'   r  r`   listr  r  r|  r   r@   rF   rG   rH   s   @r/   r  r    sU    +s)s&z   .2.259:>:>26:>*.!%)-,0#'A
##d*A
 t+A
 !++d2	A

 !& 0 04 7A
 e//047A
 ((4/A
  %0047A
   4'A
 $;A
  $;A
 #TkA
 D[A
 
0	0A
 A
r1   r  c                     ^  \ rS rSrS/rS\4U 4S jjr\       SS\R                  S-  S\R                  S-  S\R                  S-  S	\R                  S-  S
\
S-  S\
S-  S\
S-  S\\R                     \-  4S jj5       rSrU =r$ )rB  iW  r  rL   c                 0  > [         TU ]  U5        UR                  U l        [        U5      U l        [
        R                  " UR                  5      U l        [
        R                  " UR                  UR                  5      U l        U R                  5         g r^   )r$   r%   r  r;  r&  r   rV   r  rX   rQ   r,   r5  rd  r[   s     r/   r%   #UMT5ForTokenClassification.__init__\  sj      +++F3zz&";";<))F$6$68I8IJ 	r1   Nr)  r   ro  r  r  rw  rx  r   c           	         Ub  UOU R                   R                  nU R                  UUUUUUS9n	U	S   n
U R                  U
5      n
U R	                  U
5      nSnUb<  [        5       nU" UR                  SU R                  5      UR                  S5      5      nU(       d  XSS 4nUb  U4U-   $ U$ [        UUU	R                  U	R                  S9$ )a  
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
    Indices of input sequence tokens in the vocabulary. UMT5 is a model with relative position embeddings so you
    should be able to pad the inputs on both the right and the left.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for detail.

    [What are input IDs?](../glossary#input-ids)

    To know more on how to prepare `input_ids` for pretraining take a look a [UMT5 Training](./umt5#training).
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
N)r   ro  r  rw  rx  r   r4   r3   )r  r  r>   ru  )rL   rx  r&  rX   r5  r   r   r  r   r>   ru  )r+   r)  r   ro  r  r  rw  rx  r   r   r>   r  r  r  r  s                  r/   r@   "UMT5ForTokenClassification.forwardg  s    6 &1%<k$++BYBY"")'/!5# # 
  
]3/')HFKKDOO<fkk"oNDam,F)-)9TGf$EvE$!//))	
 	
r1   )r5  rX   r  r&  )NNNNNNN)rB   rC   rD   rE   r  r   r%   r   r'   r`   r  r|  r   r@   rF   rG   rH   s   @r/   rB  rB  W  s    *r)s&	z 	  *..2-1&*)-,0#'6
<<$&6
 t+6
 ||d*	6

 t#6
  $;6
 #Tk6
 D[6
 
u||	4	46
 6
r1   rB  c                     ^  \ rS rSrSSS.rU 4S jrS rS r\             SS\	R                  S-  S	\	R                  S-  S
\	R                  S-  S\	R                  S-  S\\\	R                        S-  S\	R                  S-  S\	R                  S-  S\	R                  S-  S\	R                  S-  S\S-  S\S-  S\S-  S\S-  S\\	R                     \-  4S jj5       rSrU =r$ )r<  i  r  r  c                 ,  > [         TU ]  U5        UR                  U l        [        R
                  " UR                  UR                  5      U l        [        R                  " U5      nSUl
        SUl        [        U5      U l        [        R                  " U5      nSUl
        UR                  Ul        [        U5      U l        UR"                  U l        [        R$                  " UR                  UR"                  5      U l        U R)                  5         g r  )r$   r%   rR   r  r   r   r]  r>  r  r  r   r  r[  r  r  r`  r  r  rQ   r3  rd  r  s       r/   r%   !UMT5ForQuestionAnswering.__init__  s     ll6#4#4fnnEv.$)!#(  0v.$(!$*$=$=! 0 ++))FNNF4E4EF 	r1   c                     U R                   $ r^   r  r  s    r/   r  -UMT5ForQuestionAnswering.get_input_embeddings  r  r1   c                 |    Xl         U R                  R                  U5        U R                  R                  U5        g r^   r  rh  s     r/   rj  -UMT5ForQuestionAnswering.set_input_embeddings  r  r1   Nr)  r   r(  r*  r  start_positionsend_positionsro  r  r  r  rw  rx  r   c                    Ub  UOU R                   R                  nU
b  U
OU R                   R                  n
Ub  Ub  Sn
Uc"  U	c  Uc  [        S5      eU R	                  U5      nU
b  U
OU R                   R                  n
Ub  UOU R                   R                  nUc  U R                  UUUUUUS9nORU(       aK  [        U[        5      (       d6  [        US   [        U5      S:  a  US   OS[        U5      S:  a  US   OSS9nUS   nU R                  UUU	SUUU
UUUS	9
nUS   nU R                  U5      nUR                  SS
S9u  nnUR                  S
5      R                  5       nUR                  S
5      R                  5       nSnUb  Ub  [        UR                  5       5      S:  a*  UR                  S
5      R                  UR                   5      n[        UR                  5       5      S:  a*  UR                  S
5      R                  UR                   5      nUR                  S5      nUR#                  SU5      nUR#                  SU5      n[%        US9nU" UU5      nU" UU5      nUU-   S-  nU(       d  UU4USS -   U-   nUb  U4U-   $ U$ ['        UUUUR(                  UR*                  UR,                  UR.                  UR0                  UR*                  UR,                  S9
$ )a  
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
    Indices of input sequence tokens in the vocabulary. UMT5 is a model with relative position embeddings so
    you should be able to pad the inputs on both the right and the left.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for detail.

    [What are input IDs?](../glossary#input-ids)

    To know more on how to prepare `input_ids` for pretraining take a look a [UMT5 Training](./umt5#training).
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Indices of decoder input sequence tokens in the vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are decoder input IDs?](../glossary#decoder-input-ids)

    UMT5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
    is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

    To know more on how to prepare `decoder_input_ids` for pretraining take a look at [UMT5
    Training](./umt5#training).
decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
    be used by default.
NFr  r  r   r   r3   r  r  r4   r   r  )
r  start_logits
end_logitsr   r  r  rv  r  r   r  )rL   rx  r  rK  rO  r  r_   r   r  r  r3  splitr  r   r   r6   r   r  r   r   r   r>   ru  rv  rt  )r+   r)  r   r(  r*  r  r  r  ro  r  r  r  rw  rx  r   r>   r  r  r  r  r  
total_lossignored_indexr  
start_lossend_lossr  s                              r/   r@    UMT5ForQuestionAnswering.forward  s3   \ &1%<k$++BYBY!*!6IDKK<Q<Q	&=+DI
 $)>)F  U 
 !% 1 1) <!*!6IDKK<Q<Q	%0%<k$++BYBY ""ll#-+"3%9' + O O_!M!M-"1!"4474H14Loa0RV14_1E1I?1-tO (* ,,'1/ "/#1/!5# ' 
 *!,1#)<<r<#: j#++B/::<''+668

&=+D?'')*Q."1"9"9""="@"@ATAT"U=%%'(1, - 5 5b 9 < <Z=N=N O(--a0M-33A}EO)//=AM']CH!,@J
M:H$x/14J"J//!"2EEWF/9/EZMF*Q6Q2%!+;;"1"?"?.99,==&5&G&G"1"?"?.99
 	
r1   )r  r  r  r  r3  r>  r  )rB   rC   rD   rE   r  r%   r  rj  r   r'   r  r  r  r|  r`   r  r   r@   rF   rG   rH   s   @r/   r<  r<    s    (7'6
.:
  .23759:>=A371526:>!%)-,0#'I
##d*I
 ))D0I
 !++d2	I

 !& 0 04 7I
 uU\\23d:I
 ))D0I
 ''$.I
 ((4/I
  %0047I
 $;I
  $;I
 #TkI
 D[I
  
u  	!$G	G!I
 I
r1   r<  )r;  r:  r<  r  rB  r9  r%  )Ar   r  r   r'   r   torch.nnr   r   r   rn  r   r7  activationsr	   cache_utilsr
   r   r   
generationr   masking_utilsr   modeling_layersr   modeling_outputsr   r   r   r   r   r   r   modeling_utilsr   utilsr   r   r   r   r   r   configuration_umt5r   
get_loggerrB   r   Moduler!   rJ   rf   rq   r~   r   r   r  r  r%  r[  r9  r:  r;  r  rB  r<  __all__rQ  r1   r/   <module>r     s        A A & ! C C ) / 9   .  + 
		H	%+BII +4		 .RYY <")) $o)BII o)dRYY 2bii 6@* @HRYY $ j!/ j! j!Zf
# f
R f
# f
 f
R 
H)#6 H)
H)V X* X Xv N
$7 N
N
b G
!4 G
 G
T p
2 p
 p
fr1   