
    Z j:I                       S r SSKrSSKrSSKJr  SSKrSSKJr  SSKJr  SSK	J
r  SSKJr  SS	KJrJrJr  SS
KJr  SSKJr  SSKJr  SSKJrJrJrJr  SSKJr  SSKJ r J!r!J"r"J#r#J$r$  SSK%J&r&  \$RN                  " \(5      r)SNS\RT                  S\+S\+S\+S\RT                  4
S jjr,S\RT                  S\+S\+S\RT                  4S jr-SNS\RT                  S\+S\+S\+S\RT                  4
S jjr.S\+S\RT                  4S jr/S\RT                  S\+S\RT                  4S jr0S\RT                  S\+S \Rb                  S\RT                  4S! jr2S\RT                  S"\+S\3\RT                  \RT                  4   4S# jr4S\RT                  S"\+S\RT                  4S$ jr5S%\RT                  S&\RT                  S'\+S\RT                  4S( jr6 " S) S*\Rn                  5      r8 " S+ S,\Rn                  5      r9 " S- S.\Rn                  5      r: " S/ S0\Rn                  5      r; " S1 S2\Rn                  5      r< " S3 S4\Rn                  5      r= " S5 S6\Rn                  5      r> " S7 S8\Rn                  5      r? " S9 S:\Rn                  5      r@ " S; S<\Rn                  5      rA " S= S>\Rn                  5      rB " S? S@\5      rC\" " SA SB\5      5       rD " SC SD\D5      rE\" " SE SF\D5      5       rF\"" SGSH9 " SI SJ\D\5      5       rG\" " SK SL\D5      5       rH/ SMQrIg)OzPyTorch LongT5 model.    N)Any)nn)CrossEntropyLoss   )initialization)ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)create_causal_mask)GradientCheckpointingLayer)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutput)PreTrainedModel)DUMMY_INPUTS
DUMMY_MASKauto_docstringis_torchdynamo_compilinglogging   )LongT5Configx	block_lendim	pad_valuereturnc                 l   U R                   U   * U-  n[        U R                   5      (       d?  [        U R                   5      nXR==   U-  ss'   [        R                  " XPR
                  S9$ S/U R                  -  nSU4Xb'   [        USSS2   S5      n[        R                  R                  XSUS9n U $ )	zHPad a tensor so that a sequence length will be a multiple of `block_len`dtyper   r   r   N constantpadmodevalue)shapealllisttorchzerosr"   ndimsumr   
functionalr(   )r   r   r   r   pad_len	new_shaper(   s          {/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/longt5/modeling_longt5.py_pad_to_multipler6   5   s    wws|mi'Gqww<<M	'!{{9GG44(QVV
C7|CH
c$B$i
C
!:YGAH    c                 4   U R                   U   U-  S:w  a  [        XUSS9n U R                   U   U-  nU R                   SU X14-   U R                   US-   S -   nSU;   a)  [        R                  " X@R                  U R
                  S9$ U R                  U5      $ )zSplit an input tensor into blocks of a given `block_len` along the given `dim`. If the dimension length
is not a multiple of `block_len`, it will be padded first with selected `pad_value`.
r   )r   Nr   r"   device)r+   r6   r.   emptyr"   r:   reshape)r   r   r   
num_blocksoutput_shapes        r5   _split_into_blocksr?   E   s    
 	wws|i1$Q3!<*J774C=J#::QWWcAg[=QQLL{{<wwqxxHH99\""r7   	block_dimsequence_dimc                    U R                   U   nS/U R                  -  nSXQ'   [        USSS2   S5      n[        R                  R                  XSUS9n / n[        S5       HK  n[        S	S5      /U R                  -  n[        XwU-   5      X'   [        U5      nUR                  X   5        MM     [        R                  " XbS
9$ )zConcatenate three consecutive blocks for each input block for local attentiont.

For more information, see: https://huggingface.co/papers/2112.07916.
r#   )r   r   Nr$   r%   r&   r'   r   r   r   )r+   r0   r1   r   r2   r(   rangeslicetupleappendr.   cat)	r   r@   rA   r   r=   r(   blocks_listiindicess	            r5   _concatenate_3_blocksrL   T   s    
 #J(QVV
CCN
c$B$i
C
!:YGA&(K1X D>"QVV+"1*n5.1:&  99[33r7   c                     [         R                  " SU -  [         R                  S9nXU *  nUR                  S5      UR                  S5      -
  nU$ )z:Makes 3-blocked relative position ids for local attention.r   r!   r   r   )r.   arangeint32	unsqueeze)r   position_idscenter_position_idsrelative_position_idss       r5   "_make_3block_relative_position_idsrT   m   sP    <<IU[[AL&)<(22158K8U8UVW8XX  r7   local_attention_maskc                     [        U5      n[        R                  " U5      U:  nUSSSS2SS24   nUR                  U R                  5      n[        R
                  " X5      $ )znMask local attention mask to enforce that tokens are not allowed to attend tokens farther than ``local_radius.N)rT   r.   abstor:   logical_and)rU   r   rS   locality_masks       r5   _mask_local_attention_maskr[   v   s]    >yIII34y@M!$a"23M!$$%9%@%@AM1AAr7   attention_maskr:   c                     [        XSS9n[        USSS9nUR                  S5      nUR                  S5      n[        R                  " X45      n[        XQ5      nUR                  S5      R                  U5      $ )z;Prepare attention mask to be applied for a local attention.r   rC      r@   rA   r$   )r?   rL   rP   r.   rY   r[   rX   )r\   r   r:   _blocked_attention_mask_3blocked_attention_maskrU   s         r5   _get_local_attention_maskrc      s     1PQR45LXYhij5??C7AA"E ,,-D_56JV))!,//77r7   global_block_sizec                   ^^ U R                   SS u  nmS[        R                  S[        R                  4UU4S jjn[        R                  " X R                  S9T-  n[        R
                  " USS9U-
  n[        R                  " U S	:g  S
S5      R                  U R                  5      n[        R                  " XT-   S
-
  5      R                  U R                  5      n[        R                  " SUR                  UR                  S9n[        R                  " Xg:  Xg5      nX`-  U S-
  -   nU" U5      nTT-  nUS:  a@  [        R                  " USS9R                  R                  US5      R                  SS5      n	O+[        R                  " USUR                  UR                  S9n	[        R
                  " [        R                   " X(5      SS9S-
  n
U
R#                  U R                  5      n
[        R                  " X:*  SS5      n
UR                  [        R$                  5      U
R                  [        R$                  5      4$ )a  Obtain the "fixed block" global id corresponding to each input token.

This implementation is a simplified version of the original Flaxformr implementation adopted from:
https://github.com/google/flaxformer/blob/main/flaxformer/architectures/longt5/long_attention.py.

In our scenario, as we use this strategy only for a decoder, orphan tokens, i.e. those tokens which do not make for
the whole fixed block, are assigned to the preceding block.

Padding tokens from the original sequence are represented by -1.
Nr^   	block_idsr   c                 X  > [         R                  " T5      T-  TS-
  :H  nUR                  U R                  5      n[         R                  " XS:  5      nUR                  S5      R                  S5      R                  U R                  5      S-
  n[         R                  " X:  X5      n U $ )Nr   r   r$   )
r.   rN   rX   r:   rY   r1   rP   typer"   where)rf   
block_endstrue_block_endsfull_blocksrd   seq_lens       r5   handle_orphan_tokens:_make_global_fixed_block_ids.<locals>.handle_orphan_tokens   s    ll7+.??DUXYDYY
]]9#3#34
++JQG%))"-77;@@QTUUKK	 7P	r7   r:   r   )axis              ?g     @r$   r9   r   rC   )r+   r.   Tensor	ones_liker:   cumsumri   rh   r"   floortensormaxvaluesrepeat	transposer/   onesrX   int)r\   rd   
batch_sizern   fixed_block_maskmaskglobal_block_ids_global_block_ids_lower_boundnum_globals_sequence_block_ids_maxglobal_segment_idsrm   s    `         @r5   _make_global_fixed_block_idsr      s    )..r2J    ~>S>STWhh||$41=@PP;;~,c7;@@AUAUVD{{4#:S#@AFF~G[G[\$)LL;K;Q;QZjZqZq$r!{{8:J )9nq>PQ+,<=..KQ"')),<""E"L"L"S"ST_ab"c"m"mnoqr"s"'++!1!7!7@P@W@W#
 ejj&IrRUVV+..~/D/DE%7%RTUWXY  +-?-D-DUYY-OOOr7   c                     [        X5      u  p#UR                  S   n[        R                  " XBR                  S9nXRS   -
  nUR                  [        R                  5      $ )zBCreate the relative position tensor for local -> global attention.r$   rp   .N)r   r+   r.   rN   r:   rh   int64)r\   rd   rf   r   global_seq_lenglobal_positionsside_relative_positions          r5    _make_side_relative_position_idsr      sW    $@$c!I'--b1N||N;K;KL-)0DD!&&u{{33r7   hidden_statesrf   r   c           	      r   UR                  US:  [        R                  " X!R                  UR                  S95      n[
        R                  R                  UR                  [        R                  5      US-   5      SS2SS2SS24   n[        R                  " SXR                  U R                  5      5      $ )zFCompute individual block aggregates by summing over individual blocks.r   r9   r   Nr$   z...nd,...ng->...gd)ri   r.   rx   r"   r:   r   r2   one_hotrh   r   einsum)r   rf   r   one_hot_block_idss       r5   _create_global_aggregatesr      s    
 Q^??S\ScScdI --innU[[.I>\]K]^_`bcehfheh_hi<<,m=S=STaTgTg=hiir7   c                   2   ^  \ rS rSrSU 4S jjrS rSrU =r$ )LongT5LayerNorm   c                    > [         TU ]  5         [        R                  " [        R
                  " U5      5      U l        X l        g)zW
Construct a layernorm module in the LongT5 style. No bias and no subtraction of mean.
N)super__init__r   	Parameterr.   r}   weightvariance_epsilon)selfhidden_sizeeps	__class__s      r5   r   LongT5LayerNorm.__init__   s/     	ll5::k#:; #r7   c                    UR                  [        R                  5      R                  S5      R	                  SSS9nU[        R
                  " X R                  -   5      -  nU R                  R                  [        R                  [        R                  4;   a%  UR                  U R                  R                  5      nU R                  U-  $ )Nr^   r$   T)keepdim)rX   r.   float32powmeanrsqrtr   r   r"   float16bfloat16)r   r   variances      r5   forwardLongT5LayerNorm.forward   s     !##EMM266q9>>r4>P%H?T?T4T(UU ;; ??),,T[[->->?M{{]**r7   )r   r   )gư>)__name__
__module____qualname____firstlineno__r   r   __static_attributes____classcell__r   s   @r5   r   r      s    $+ +r7   r   c                   6   ^  \ rS rSrS\4U 4S jjrS rSrU =r$ )LongT5DenseActDense   configc                 X  > [         TU ]  5         [        R                  " UR                  UR
                  SS9U l        [        R                  " UR
                  UR                  SS9U l        [        R                  " UR                  5      U l
        [        UR                     U l        g NFbias)r   r   r   Lineard_modeld_ffwiwoDropoutdropout_ratedropoutr   dense_act_fnactr   r   r   s     r5   r   LongT5DenseActDense.__init__   sn    ))FNNFKKeD))FKKeDzz&"5"56&--.r7   c                    U R                  U5      nU R                  U5      nU R                  U5      n[        U R                  R
                  [        R                  5      (       a  UR                  U R                  R
                  R                  :w  aa  U R                  R
                  R                  [        R                  :w  a/  UR                  U R                  R
                  R                  5      nU R	                  U5      nU$ N)r   r   r   
isinstancer   r   r.   rt   r"   int8rX   )r   r   s     r5   r   LongT5DenseActDense.forward   s    ./]3tww~~u||44##tww~~';';;$$

2),,TWW^^-A-ABM.r7   )r   r   r   r   	r   r   r   r   r   r   r   r   r   r   s   @r5   r   r      s    /| / r7   r   c                   6   ^  \ rS rSrS\4U 4S jjrS rSrU =r$ )LongT5DenseGatedActDensei  r   c                   > [         TU ]  5         [        R                  " UR                  UR
                  SS9U l        [        R                  " UR                  UR
                  SS9U l        [        R                  " UR
                  UR                  SS9U l        [        R                  " UR                  5      U l        [        UR                     U l        g r   )r   r   r   r   r   r   wi_0wi_1r   r   r   r   r   r   r   r   s     r5   r   !LongT5DenseGatedActDense.__init__  s    IIfnnfkkF	IIfnnfkkF	))FKKeDzz&"5"56&--.r7   c                     U R                  U R                  U5      5      nU R                  U5      nX#-  nU R                  U5      nU R	                  U5      nU$ r   )r   r   r   r   r   )r   r   hidden_geluhidden_linears       r5   r    LongT5DenseGatedActDense.forward  sQ    hhtyy78		-0#3]3.r7   )r   r   r   r   r   r   r   s   @r5   r   r     s    /| / r7   r   c                   6   ^  \ rS rSrS\4U 4S jjrS rSrU =r$ )LongT5LayerFFi  r   c                   > [         TU ]  5         UR                  (       a  [        U5      U l        O[        U5      U l        [        UR                  UR                  S9U l	        [        R                  " UR                  5      U l        g )Nr   )r   r   is_gated_actr   DenseReluDenser   r   r   layer_norm_epsilon
layer_normr   r   r   r   r   s     r5   r   LongT5LayerFF.__init__  s_    ":6"BD"5f"=D)&..f>W>WXzz&"5"56r7   c                 p    U R                  U5      nU R                  U5      nXR                  U5      -   nU$ r   )r   r   r   )r   r   forwarded_statess      r5   r   LongT5LayerFF.forward$  s;    ??=9../?@%5E(FFr7   )r   r   r   r   r   s   @r5   r   r     s    7| 7 r7   r   c                   t   ^  \ rS rSr  S
S\S\S-  4U 4S jjjr\SS j5       rSS jr	     SS jr
S	rU =r$ )LongT5Attentioni,  Nr   	layer_idxc                   > [         TU ]  5         UR                  U l        X l        UR                  U l        UR
                  U l        UR                  U l        UR                  U l        UR                  U l
        UR                  U l        U R                  U R                  -  U l        X0l        Uc>  U R                  (       a-  [        R!                  SU R"                  R$                   S35        [&        R(                  " U R                  U R                  SS9U l        [&        R(                  " U R                  U R                  SS9U l        [&        R(                  " U R                  U R                  SS9U l        [&        R(                  " U R                  U R                  SS9U l        U R                  (       a0  [&        R2                  " U R                  U R                  5      U l        SU l        g )NzInstantiating a decoder z without passing `layer_idx` is not recommended and will to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` when creating this class.Fr   )r   r   
is_decoderhas_relative_attention_biasrelative_attention_num_bucketsrelative_attention_max_distancer   d_kvkey_value_proj_dim	num_headsn_headsr   r   	inner_dimr   loggerwarning_oncer   r   r   r   qkvo	Embeddingrelative_attention_biasgradient_checkpointingr   r   r   r   r   s       r5   r   LongT5Attention.__init__-  se    	 +++F(.4.S.S+/5/U/U,~~"(++''**(?(??"*4>>+B+B*C D, , 4<<eD4<<eD4<<eD4>>4<<eD+++-<<8[8[]a]i]i+jD(&+#r7   c                 b   SnU(       aC  US-  nX@S:  R                  [        R                  5      U-  -  n[        R                  " U 5      n O,[        R                  " U [        R
                  " U 5      5      * n US-  nX:  nU[        R                  " U R                  5       U-  5      [        R                  " X5-  5      -  X%-
  -  R                  [        R                  5      -   n[        R                  " U[        R                  " XrS-
  5      5      nU[        R                  " X`U5      -  nU$ aR  
Adapted from Mesh Tensorflow:
https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593

Translate relative position to a bucket number for relative attention. The relative position is defined as
memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to
position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for
small absolute relative_position and larger buckets for larger absolute relative_positions. All relative
positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket.
This should allow for more graceful generalization to longer sequences than the model has been trained on

Args:
    relative_position: an int32 Tensor
    bidirectional: a boolean - whether the attention is bidirectional
    num_buckets: an integer
    max_distance: an integer

Returns:
    a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets)
r   r^   r   rX   r.   longrW   min
zeros_likelogfloatmath	full_likeri   relative_positionbidirectionalnum_bucketsmax_distancerelative_buckets	max_exactis_smallrelative_position_if_larges           r5   _relative_position_bucket)LongT5Attention._relative_position_bucketO  s   , AKQ!6 : :5:: F TT %		*; <!&+<e>N>NO`>a!b b  1$	$0 &/II'--/);<hh|/01&( "UZZ.	&"
 &+YY&8RbcTc(d&
" 	EKKE_``r7   c                    Uc   U R                   R                  R                  n[        R                  " U[        R
                  US9SS2S4   U-   n[        R                  " U[        R
                  US9SSS24   nXe-
  nU R                  UU R                  (       + U R                  U R                  S9nU R                  U5      n	U	R                  / SQ5      R                  S5      n	U	$ )%Compute binned relative position biasNr9   r	  r
  r  r^   r   r   r   )r   r   r:   r.   rN   r   r  r   r   r   permuterP   )
r   query_length
key_lengthr:   past_seen_tokenscontext_positionmemory_positionr  relative_position_bucketrz   s
             r5   compute_biasLongT5Attention.compute_bias  s    >1188??F <<EJJvVWXZ^W^_brr,,zFSTXZ[T[\+>#'#A#A#.;;==	 $B $
  --.FG	*44Q7r7   c                 :   UR                   SS n/ UQSPU R                  P7n	Ub  UR                  U R                  5      OSn
[	        U
[
        R                  5      (       a  U
R                  5       OU
n
USLnU R                  U5      R                  U	5      R                  SS5      nSn[	        U[        5      (       aF  UR                  R                  U R                  5      nU(       a  UR                  nOUR                  nOUnU(       a  UOUnU(       aQ  UbN  U(       aG  UR                   U R                     R"                  nUR                   U R                     R$                  nO/ UR                   SS QSPU R                  P7nU R'                  U5      R                  U5      R                  SS5      nU R)                  U5      R                  U5      R                  SS5      nUbU  UR+                  UUU R                  5      u  nnU(       a.  [	        U[        5      (       a  SUR                  U R                  '   [
        R,                  " UUR                  SS5      5      nUc  UR                   S	   nU R.                  (       dh  [
        R0                  " SUR                   S   US   U4UR2                  UR4                  S
9nU R6                  (       a  U R8                  (       a  SUl        OU R=                  US   UUR2                  U
S9nUb#  USS2SS2SS2SUR                   S	   24   nUU-   nUnUU-  n[>        R@                  RC                  URE                  5       SS9RG                  U5      n[>        R@                  RI                  UU RH                  U R8                  S9n[
        R,                  " UU5      nUR                  SS5      RK                  5       nURL                  " / UQSP76 nU RO                  U5      nUU4nU(       a  UU4-   nU$ )zp
Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states).
Nr$   r   r   r^   FTr   r`   r:   r"   )r:   r  rC   ptraining)(r+   r   get_seq_lengthr   r   r.   rt   cloner   viewr|   r   
is_updatedgetcross_attention_cacheself_attention_cachelayerskeysrz   r   r   updatematmulr   r/   r:   r"   r   r#  requires_gradr  r   r2   softmaxr  type_asr   
contiguousr<   r   )r   r   r   key_value_statesposition_biaspast_key_valuesoutput_attentionskwargsinput_shapehidden_shaper  is_cross_attentionquery_statesr'  curr_past_key_valuescurrent_states
key_statesvalue_stateskv_shapescoresr  causal_maskposition_bias_maskedattn_weightsattn_outputoutputss                             r5   r   LongT5Attention.forward  s    $))#2.BBbB$*A*ABM\Mh?99$..Ino7ABRTYT`T`7a7a+113gw .T9vvm,11,?II!QO 
o':;;(3377GJ!'6'L'L$'6'K'K$#2 -?)]/"=*-44T^^DIIJ/66t~~FMMLP--cr2PBP8O8OPH/44X>HHANJ66.166x@JJ1aPL*+?+F+FzS_aeaoao+p(
L%*_FY*Z*ZAEO..t~~> lJ,@,@A,FG #))"-J33 %**1-{1~zJSYS`S`hnhtht! ..4==26M/ $ 1 1NJv}}Wg !2 ! "1a,Bj.>.>r.B,B#BC - ;,&& }},,V\\^,DLLVT}},,\T\\TXTaTa,bll<>!++Aq1<<>!));;;;ff[)./Gr7   )r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   FNT       )Nr   )NNNNF)r   r   r   r   r   r~   r   staticmethodr  r  r   r   r   r   s   @r5   r   r   ,  sc     %* $	 , , :	 ,  ,D -  - ^( [ [r7   r   c                   n   ^  \ rS rSrSS\S\SS4U 4S jjjr\SS j5       rS\	4S	 jr
   SS
 jrSrU =r$ )LongT5LocalAttentioni  r   r   r   Nc                   > [         TU ]  5         UR                  U l        X l        UR                  U l        UR
                  U l        UR                  U l        UR                  U l        UR                  U l
        UR                  U l        U R                  S-   U l        UR                  U l        U R                  U R                  -  U l        [         R"                  " U R                  U R                  SS9U l        [         R"                  " U R                  U R                  SS9U l        [         R"                  " U R                  U R                  SS9U l        [         R"                  " U R                  U R                  SS9U l        U R                  (       a0  [         R,                  " U R                  U R                  5      U l        SU l        g )Nr   Fr   )r   r   r   r   r   r   r   r   r   r   r   local_radiusr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   s      r5   r   LongT5LocalAttention.__init__  sE    +++F(.4.S.S+/5/U/U,~~"(++''"//**Q.**(?(??4<<eD4<<eD4<<eD4>>4<<eD+++-<<8[8[]a]i]i+jD(&+#r7   c                 b   SnU(       aC  US-  nX@S:  R                  [        R                  5      U-  -  n[        R                  " U 5      n O,[        R                  " U [        R
                  " U 5      5      * n US-  nX:  nU[        R                  " U R                  5       U-  5      [        R                  " X5-  5      -  X%-
  -  R                  [        R                  5      -   n[        R                  " U[        R                  " XrS-
  5      5      nU[        R                  " X`U5      -  nU$ r   r   r  s           r5   r  .LongT5LocalAttention._relative_position_bucket     . AKQ!6 : :5:: F TT %		*; <!&+<e>N>NO`>a!b b  1$	$0 &/II'--/);<hh|/01&( "UZZ.	&"
 &+YY&8RbcTc(d&
" 	EKKE_``r7   block_lengthc                    U R                   R                  R                  R                  S:w  a   U R                   R                  R                  OSn[        R
                  " SU-  [        R                  US9nX1U*  nUSSS24   USS2S4   -
  nU R                  UU R                  (       + U R                  U R                  S9nU R                  U5      nUR                  / SQ5      R                  S5      R                  S5      nU$ r  metaNr   r9   r  r  r   r   r   r:   rh   r.   rN   r   r  r   r   r   r  rP   r   rV  target_devicer  r  r  r  rz   s           r5   r  !LongT5LocalAttention.compute_bias8       ++2299>>&H ((//66 	
  ,,q<'7uzzR_`*F ,D!G47G47PP#'#A#A#.;;==	 $B $
  --.FG	*44Q7AA!Dr7   c                   ^ ^ UR                   S S u  mnUU 4S jnUU 4S jnU" T R                  U5      5      nU" T R                  U5      5      n	U" T R                  U5      5      n
[	        UT R
                  SS9n[	        U	T R
                  SS9n	[	        U
T R
                  SS9n
[        U	SSS9n	[        U
SSS9n
[        R                  " SX5      nUc  T R                  (       dz  [        R                  " SST R                  T R
                  ST R
                  -  4UR                  UR                  S	9nT R                  (       a  T R                  (       a  S
Ul        OT R#                  T R
                  5      nUb/  [        R$                  " US:  SS5      nX2R'                  SS5      -   nX-  n[(        R*                  R-                  UR/                  5       SS9R1                  U5      n[(        R*                  R3                  UT R2                  T R                  S9nUR5                  U
R                  5      nU" [        R                  " SX5      5      nUS S 2S U2S S 24   nT R7                  U5      nUU4nU(       a  X4-   nU$ )Nr^   c                 T   > U R                  TSTR                  TR                  5      $ 
projectionr$   r&  r   r   statesr   r   s    r5   r+   +LongT5LocalAttention.forward.<locals>.shapeY  "    ;;z2t||T=T=TUUr7   c                 Z   > U R                  5       R                  TSTR                  5      $ r<   r$   r2  r&  r   rd  s    r5   unshape-LongT5LocalAttention.forward.<locals>.unshape]  %    $$&++JDNNKKr7   r   rC   r_   ...qhd,...khd->...hqkr   r   Tr   rr       _r$   r!  ...hqk,...khd->...qhd)r+   r   r   r   r?   r   rL   r.   r   r   r/   r   r:   r"   r   r#  r/  r  ri   r|   r   r2   r0  r  r1  r   rh   r   )r   r   r   r4  r6  
seq_lengthr+   rk  r;  r>  r?  rA  rD  rE  rF  r   s   `              @r5   r   LongT5LocalAttention.forwardP  sG    "/!4!4Ra!8
J	V	L
 TVVM23466-01
TVVM23 *,AN'
DNNJ
),AN +:QRS
,\QUVW #\
  33 %4<<T^^9KLU[UbUbjpjvjv! ..4==26M/ $ 1 1$.. A{{4!8S%8 -q!0D D}},,V\\^,DLLVT}},,\T\\TXTaTa,b#((););<ell+BL_`!![j[!"34ff[) 

 /Gr7   )r   r   r   r   r   r   r   r   r   rP  r   r   r   r   r   r   r   FrI  NNF)r   r   r   r   r   boolr   rL  r  r~   r  r   r   r   r   s   @r5   rN  rN    s[    ,| ,$ ,[_ , ,0 -  - ^ 6 G Gr7   rN  c                      ^  \ rS rSrSS\S\SS4U 4S jjjr\SS j5       rS\	4S	 jr
S
\R                  S\R                  S\R                  4S jr   SS jrSrU =r$ )LongT5TransientGlobalAttentioni  r   r   r   Nc                   > [         TU ]  5         UR                  U l        X l        UR                  U l        UR
                  U l        UR                  U l        UR                  U l        UR                  U l
        UR                  U l        U R                  S-   U l        UR                  U l        UR                  U l        U R                  U R                  -  U l        ["        R$                  " U R                  U R                   SS9U l        ["        R$                  " U R                  U R                   SS9U l        ["        R$                  " U R                  U R                   SS9U l        ["        R$                  " U R                   U R                  SS9U l        U R                  (       a0  ["        R.                  " U R                  U R                  5      U l        U R                  (       a0  ["        R.                  " U R                  U R                  5      U l        [5        UR                  UR6                  S9U l        g )Nr   Fr   r   )r   r   r   r   r   r   r   r   r   r   r   rP  r   rd   r   r   r   r   r   r   r   r   r   r   r   global_relative_attention_biasr   r   global_input_layer_normrQ  s      r5   r   'LongT5TransientGlobalAttention.__init__  s    +++F(.4.S.S+/5/U/U,~~"(++''"//**Q.!'!9!9**(?(??4<<eD4<<eD4<<eD4>>4<<eD+++-<<8[8[]a]i]i+jD( ++24,,t?b?bdhdpdp2qD/'6v~~6KdKd'e$r7   c                 b   SnU(       aC  US-  nX@S:  R                  [        R                  5      U-  -  n[        R                  " U 5      n O,[        R                  " U [        R
                  " U 5      5      * n US-  nX:  nU[        R                  " U R                  5       U-  5      [        R                  " X5-  5      -  X%-
  -  R                  [        R                  5      -   n[        R                  " U[        R                  " XrS-
  5      5      nU[        R                  " X`U5      -  nU$ r   r   r  s           r5   r  8LongT5TransientGlobalAttention._relative_position_bucket  rU  r7   rV  c                    U R                   R                  R                  R                  S:w  a   U R                   R                  R                  OSn[        R
                  " SU-  [        R                  US9nX1U*  nUSSS24   USS2S4   -
  nU R                  UU R                  (       + U R                  U R                  S9nU R                  U5      nUR                  / SQ5      R                  S5      R                  S5      nU$ rX  rZ  r[  s           r5   r  +LongT5TransientGlobalAttention.compute_bias  r^  r7   r   r   c                 x   [         R                  " US   US S 2S S S 24   5      S S 2S S4   n[         R                  " US:  SS5      n[        XR                  5      nU R                  UU R                  (       + U R                  U R                  S9nU R                  U5      nUR                  / SQ5      nXG-   nU$ )Nr   .r   rr   ro  r  )r   r   r   r^   )r.   eqri   r   rd   r  r   r   r   ry  r  )r   r   r   side_attention_maskattention_side_biasr   side_relative_position_bucket	side_biass           r5   compute_side_bias0LongT5TransientGlobalAttention.compute_side_bias   s    #hhtI8J1dTU:8VWXY[_adXde#kk*=*A3N!A$H^H^!_(,(F(F"#.;;==	 )G )
% 778UV	 %%l3	1=""r7   c                 ,	  ^ ^ UR                   S S u  mnUU 4S jnUU 4S jn[        Ub  UO"[        R                  " UR                   S S 5      T R                  5      u  pU	R                   S   n
[        XU
5      nT R                  U5      nU" T R                  U5      5      nU" T R                  U5      5      nU" T R                  U5      5      nU" T R                  U5      5      nU" T R                  U5      5      n[        UT R                  SS9n[        UT R                  SS9n[        UT R                  SS9n[        USSS9n[        USSS9nS/UR                  S-   -  nUR                   S   US'   UR                  S5      R                  U5      nUR                  S5      R                  U5      n[        R                   " X/SS9n[        R                   " UU/SS9n[        R"                  " SX5      nUb=  [%        UT R                  UR&                  5      n[        R(                  " US	:  S
S5      nOS nUGct  T R*                  (       dz  [        R,                  " SST R.                  T R                  ST R                  -  4UR&                  UR0                  S9nT R2                  (       a  T R4                  (       a  SUl        OT R9                  T R                  5      nUb  UUR;                  SS5      -   nUR=                  UR0                  5      nUc  [        R                  " TU5      nT R?                  X)5      n[        UT R                  SS9R;                  SS5      nUR=                  UR0                  5      RA                  UR&                  5      n[        R                   " UU/SS9nUU-  n[B        RD                  RG                  URI                  5       SS9RK                  U5      n[B        RD                  RM                  UT RL                  T R4                  S9nUR=                  UR0                  5      nU" [        R"                  " SUU5      5      nUS S 2S U2S S 24   nT RO                  U5      nUU4nU(       a  UU4-   nU$ )Nr^   c                 T   > U R                  TSTR                  TR                  5      $ ra  rc  rd  s    r5   r+   5LongT5TransientGlobalAttention.forward.<locals>.shape  rg  r7   c                 Z   > U R                  5       R                  TSTR                  5      $ ri  rj  rd  s    r5   rk  7LongT5TransientGlobalAttention.forward.<locals>.unshape"  rm  r7   r$   r   rC   r_   rn  r   rr   ro  r   r   Tr`   r!  rp  )(r+   r   r.   r}   rd   r   rz  r   r   r   r?   r   rL   r0   rP   r{   rH   r   rc   r:   ri   r   r/   r   r"   r   r#  r/  r  r|   rh   r  rX   r   r2   r0  r  r1  r   r   )r   r   r   r4  r6  rq  r+   rk  rf   r   _global_seq_lenglobal_inputsr;  r>  r?  side_key_statesside_value_statesrepsrA  rU   side_position_biasrD  rE  rF  r   s   `                       @r5   r   &LongT5TransientGlobalAttention.forward  s%    "/!4!4Ra!8
J	V	L )E$D%**]5H5H"5M*N"")
%	
 -22261-O\44]C TVVM23466-01
TVVM23} 56!$&&"78 *,AN'
DNNJ
),AN +:QRS
,\QUVW so**Q./""1%Q)33A6==dC-77:AA$G YY
<!D
yy,0A!BJ 5|P#<T4>>S`SgSg#h #(;;/Ca/Ge#T #'  33 %4<<T^^9KL!== ,,!
 ..4==26M/ $ 1 1$.. A#/ -0D0N0NqRS0T T)..v||<M |zz*j9!%!7!7!Q!34F\^!_!i!ijkmn!o!3!8!8!F!I!I&--!X!II}6H&IrRM-}},,V\\^,DLLVT}},,\T\\TXTaTa,b#((););<ell+BLR^_`!![j[!"34ff[)./Gr7   )r   r   r   rd   rz  ry  r   r   r   r   r   rP  r   r   r   r   r   r   r   rs  rI  rt  )r   r   r   r   r   ru  r   rL  r  r~   r  r.   rt   r  r   r   r   r   s   @r5   rw  rw    s    f| f$ f[_ f f8 -  - ^ 0#ell # #Y^YeYe #0 q qr7   rw  c                   N   ^  \ rS rSrSS\S-  4U 4S jjjr     SS jrSrU =r$ )	LongT5LayerSelfAttentioni  Nr   c                    > [         TU ]  5         [        XUS9U l        [	        UR
                  UR                  S9U l        [        R                  " UR                  5      U l        g )Nr   r   r   )r   r   r   SelfAttentionr   r   r   r   r   r   r   r   r   s       r5   r   !LongT5LayerSelfAttention.__init__  sQ    ,W`
 *&..f>W>WXzz&"5"56r7   c           	          U R                  U5      nU R                  UUUUUUS9n	XR                  U	S   5      -   nU4U	SS  -   n
U
$ )N)r   r4  r5  	use_cacher6  r   r   )r   r  r   )r   r   r\   r4  r5  r  r6  r7  normed_hidden_statesattention_outputrF  s              r5   r    LongT5LayerSelfAttention.forward  sn      $}=-- '+/ . 
 &5Ea5H(II "%5ab%99r7   )r  r   r   rH  )NNNFF	r   r   r   r   r~   r   r   r   r   r   s   @r5   r  r    s4    7SSWZ 7 7  r7   r  c                   V   ^  \ rS rSrSrS	S\S-  4U 4S jjjr   S
S\4S jjrSr	U =r
$ )LongT5LayerLocalSelfAttentioni  z$Local self attention used in encoderNr   c                    > [         TU ]  5         [        XS9U l        [	        UR
                  UR                  S9U l        [        R                  " UR                  5      U l        g N)r   r   )r   r   rN  LocalSelfAttentionr   r   r   r   r   r   r   r   r   s       r5   r   &LongT5LayerLocalSelfAttention.__init__  sI    "6v"w)&..f>W>WXzz&"5"56r7   r7  c                     U R                  U5      nU R                  UUUUS9nXR                  US   5      -   nU4USS  -   nU$ N)r   r4  r6  r   r   )r   r  r   	r   r   r\   r4  r6  r7  r  r  rF  s	            r5   r   %LongT5LayerLocalSelfAttention.forward  sh      $}=22 '/	 3 
 &5Ea5H(II "%5ab%99r7   )r  r   r   rH  rt  r   r   r   r   __doc__r~   r   r   r   r   r   r   s   @r5   r  r    s;    .7SSWZ 7 7   r7   r  c                   V   ^  \ rS rSrSrS	S\S-  4U 4S jjjr   S
S\4S jjrSr	U =r
$ )'LongT5LayerTransientGlobalSelfAttentioni  z/Transient-Global self attention used in encoderNr   c                    > [         TU ]  5         [        XS9U l        [	        UR
                  UR                  S9U l        [        R                  " UR                  5      U l        g r  )r   r   rw  TransientGlobalSelfAttentionr   r   r   r   r   r   r   r   r   s       r5   r   0LongT5LayerTransientGlobalSelfAttention.__init__  sN    ,J-
) *&..f>W>WXzz&"5"56r7   r7  c                     U R                  U5      nU R                  UUUUS9nXR                  US   5      -   nU4USS  -   nU$ r  )r   r  r   r  s	            r5   r   /LongT5LayerTransientGlobalSelfAttention.forward  sh      $}=<< '/	 = 
 &5Ea5H(II "%5ab%99r7   )r  r   r   rH  rt  r  r   s   @r5   r  r    s;    97SSWZ 7 7   r7   r  c                   L   ^  \ rS rSrSS\S-  4U 4S jjjr    SS jrSrU =r$ )	LongT5LayerCrossAttention  Nr   c                    > [         TU ]  5         [        USUS9U l        [	        UR
                  UR                  S9U l        [        R                  " UR                  5      U l        g )NFr  r   )r   r   r   EncDecAttentionr   r   r   r   r   r   r   r   )r   r   r   r   s      r5   r   "LongT5LayerCrossAttention.__init__  sO    .vSXdmn)&..f>W>WXzz&"5"56r7   c           	          U R                  U5      nU R                  UUUUUUS9n	XR                  U	S   5      -   n
U
4U	SS  -   nU$ )N)r   r3  r4  r5  r6  r   r   )r   r  r   )r   r   r3  r\   r4  r5  r6  r7  r  r  layer_outputrF  s               r5   r   !LongT5LayerCrossAttention.forward  sm      $}=// -'+/ 0 
 %||4DQ4G'HH/$4QR$88r7   )r  r   r   r   )NNNFr  r   s   @r5   r  r    s0    7#* 7 7  r7   r  c                   V   ^  \ rS rSrSS\S-  4U 4S jjjr         SS jrSrU =r$ )	LongT5Blocki  Nr   c                 $  > [         TU ]  5         UR                  U l        UR                  (       a  [        nOGUR                  S:X  a  [
        nO0UR                  S:X  a  [        nO[        SUR                   S35      e[        R                  " 5       U l
        U R                  R                  U" XUS95        U R                  (       a"  U R                  R                  [        XS95        U R                  R                  [        U5      5        g )Nlocalztransient-globalzjFor encoder attention mechanism, either `local` or `transient-global` attention type is expected, but got .r  )r   )r   r   r   r  encoder_attention_typer  r  
ValueErrorr   
ModuleListlayerrG   r  r   )r   r   r   r   attention_layerr   s        r5   r   LongT5Block.__init__  s     ++6O**g5;O**.@@EO!889<  ]]_


Fgpq	
 ??JJ7TU

-/0r7   c           	      >   U R                   S   " UUUUUU	S9nUS   nUSS  nUR                  [        R                  :X  al  [        R                  " U5      R                  5       (       aC  [        R                  " UR                  5      R                  S-
  n[        R                  " X* US9nU R                  =(       a    US LnU(       a  U R                   S   " UUUUUU	S9nUS   nUR                  [        R                  :X  al  [        R                  " U5      R                  5       (       aC  [        R                  " UR                  5      R                  S-
  n[        R                  " X* US9nUUSS  -   nU R                   S   " U5      nUR                  [        R                  :X  al  [        R                  " U5      R                  5       (       aC  [        R                  " UR                  5      R                  S-
  n[        R                  " X* US9nU4U-   $ )Nr   )r\   r4  r5  r  r6  r   r  )r  ry   )r3  r\   r4  r5  r6  r$   )
r  r"   r.   r   isinfanyfinfory   clampr   )r   r   r\   r4  encoder_hidden_statesencoder_attention_maskencoder_decoder_position_biasr5  r  r6  return_dictr7  self_attention_outputsattention_outputsclamp_valuedo_cross_attentioncross_attention_outputss                    r5   r   LongT5Block.forward  s    "&A)'+/"
 /q12126 %--/EKK4N4R4R4T4T++m&9&9:>>EK!KK<[YM!__R1Fd1R&*jjm!65; /"3'# 4A6M ""emm3M8R8V8V8X8X#kk-*=*=>BBTI %M|Q\ ] !24KAB4O O 

2}5 %--/EKK4N4R4R4T4T++m&9&9:>>EK!KK<[YM 00	
r7   )r   r  rH  )	NNNNNNFFTr  r   s   @r5   r  r    s@    1SSWZ 1 14 "#&*<
 <
r7   r  c                   v    \ rS rSr% \\S'   SrSrS/rSr	\
S 5       r\R                  " 5       S 5       rS	 rS
rg)LongT5PreTrainedModeli^  r   transformerTr  Fc                 z    [         R                  " [        5      n[         R                  " [        5      nUUUS.nU$ )N)decoder_input_ids	input_idsdecoder_attention_mask)r.   rx   r   r   )r   r  
input_maskdummy_inputss       r5   r  "LongT5PreTrainedModel.dummy_inputsg  s8     LL.	\\*-
!*"&0

 r7   c                    U R                   R                  n[        U[        5      (       a%  [        R
                  " UR                  US-  5        g[        U[        [        [        45      (       a  [        R                  " UR                  R                  SUS-  S9  [        US5      (       aJ  U R                   R                  (       d.  [        R                  " UR                  R                  SUS-  S9  ggg[        U[        5      (       GaA  [        R                  " UR                   R                  SX R                   R"                  S-  -  S9  [        UR                   S5      (       aA  UR                   R$                  b*  [        R&                  " UR                   R$                  5        [        R                  " UR(                  R                  SX R                   R*                  S-  -  S9  [        UR(                  S5      (       aC  UR(                  R$                  b+  [        R&                  " UR(                  R$                  5        ggg[        U[,        5      (       Ga  [        R                  " UR.                  R                  SX R                   R"                  S-  -  S9  [        UR.                  S5      (       aA  UR.                  R$                  b*  [        R&                  " UR.                  R$                  5        [        R                  " UR0                  R                  SX R                   R"                  S-  -  S9  [        UR0                  S5      (       aA  UR0                  R$                  b*  [        R&                  " UR0                  R$                  5        [        R                  " UR(                  R                  SX R                   R*                  S-  -  S9  [        UR(                  S5      (       aC  UR(                  R$                  b+  [        R&                  " UR(                  R$                  5        ggg[        U[2        [4        [6        45      (       Ga  U R                   R"                  nU R                   R8                  nU R                   R:                  n[        R                  " UR<                  R                  SX#U-  S-  -  S9  [        R                  " UR>                  R                  SX#S-  -  S9  [        R                  " UR@                  R                  SX#S-  -  S9  [        R                  " URB                  R                  SX%U-  S-  -  S9  URD                  (       au  [        R                  " URF                  R                  SX#S-  -  S9  [        U[6        5      (       a0  [        R                  " URH                  R                  SX#S-  -  S9  gggg)zInitialize the weightsrs   rr   )r   stdlm_head      r   N)%r   initializer_factorr   r   init	constant_r   LongT5ModelLongT5ForConditionalGenerationLongT5EncoderModelnormal_sharedhasattrtie_word_embeddingsr  r   r   r   r   zeros_r   r   r   r   r   r   rN  rw  r   r   r   r   r   r   r   r   ry  )r   modulefactorr   r   r   s         r5   _init_weights#LongT5PreTrainedModel._init_weightss  s    //fo..NN6==&3,7.LN` abbLL--CVc\Jvy))$++2Q2QV^^22&3,O 3R) 344LL))KKDWDW\`C`9abvyy&))fiinn.HFIINN+LL))KKDTDTY]C]9^_vyy&))fiinn.HFIINN+ /I) 899LL++#6kkFYFY^bEb;cdv{{F++0@0@0LFKK,,-LL++#6kkFYFY^bEb;cdv{{F++0@0@0LFKK,,-LL))KKDTDTY]C]9^_vyy&))fiinn.HFIINN+ /I)2FHf ghhkk))G!%!1!1kk++GLLsM_C_dhBh8ijLLs4-8PQLLs4-8PQLLsM_C_dhBh8ij11V;;BBRXim\mRnof&DEELL==DD3TZko^oTp F 2 ir7   c                 :   U R                   R                  nU R                   R                  nUc  [        S5      eUR	                  UR
                  5      nUSS S24   R                  5       USSS 24'   X$S'   Uc  [        S5      eUR                  US:H  U5        U$ )Nzself.model.config.decoder_start_token_id has to be defined. In LongT5 it is usually set to the pad_token_id. See LongT5 docs for more information..r$   r   ).r   z1self.model.config.pad_token_id has to be defined.)r   decoder_start_token_idpad_token_idr  	new_zerosr+   r%  masked_fill_)r   r  r  r  shifted_input_idss        r5   _shift_right"LongT5PreTrainedModel._shift_right  s    !%!C!C{{//!)8 
 &//	@%.sCRCx%8%>%>%@#qr'"$:&!PQQ&&'8D'@,O  r7   r%   N)r   r   r   r   r   __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modules_can_compile_fullgraphpropertyr  r.   no_gradr  r  r   r%   r7   r5   r  r  ^  sT    %&*#&"  ]]_' 'T!r7   r  c                   L   ^  \ rS rSrU 4S jrS r          SS jrSrU =r$ )LongT5Stacki  c                 `  > [         TU ]  U5        [        R                  " UR                  UR
                  5      U l        UR                  U l        UR                  U l        U R                  S-   U l	        [        R                  " [        UR                  5       Vs/ s H  n[        U[        US:H  5      US9PM     sn5      U l        [!        UR
                  UR"                  S9U l        [        R&                  " UR(                  5      U l        SU l        U R/                  5         g s  snf )Nr   r   r  r   F)r   r   r   r   
vocab_sizer   embed_tokensr   rP  r   r  rD   
num_layersr  ru  blockr   r   final_layer_normr   r   r   r   	post_init)r   r   rJ   r   s      r5   r   LongT5Stack.__init__  s     LL):):FNNK ++"//**Q.]] v0011A FQ!VXYZ1

 !0FD]D] ^zz&"5"56&+# 	s   !D+c                     Xl         g r   )r  r   new_embeddingss     r5   set_input_embeddings LongT5Stack.set_input_embeddings  s    *r7   c                    Ub  UOU R                   R                  nUb  UOU R                   R                  nU	b  U	OU R                   R                  n	U
b  U
OU R                   R                  n
Ub*  Ub'  U R
                  (       a  SOSn[        SU SU S35      eUb&  UR                  5       nUR                  SUS   5      nO>Ub  UR                  5       S S nO'U R
                  (       a  SOSn[        SU SU S	35      eU R                  (       a/  U R                  (       a  U(       a  [        R                  S
5        SnUc%  U R                  c   S5       eU R                  U5      nUu  pU R
                  (       ah  U(       a`  Uc]  U R                   R                  (       a/  [        [!        U R                   S9[!        U R                   S95      nO'[!        U R                   S9nOU R
                  (       d  S nUb  UR#                  5       OSnUc4  [%        5       (       d%  UU-   n[&        R(                  " UUUR*                  S9nU R
                  (       a  [-        U R                   UUUS9nO=U R                   R.                  S:X  a!  [1        X R2                  UR*                  5      nOUnU R
                  (       aO  UbL  UR                  5       u  nnnUU4nUc  [&        R(                  " UUR*                  S9nU R5                  U5      nOS nU	(       a  SOS nU(       a  SOS nU(       a  U R
                  (       a  SOS nS nS nU R7                  U5      n[9        U R:                  5       H|  u  nnU	(       a  UU4-   nU" UUUUUUUUUU
S9
n U S   nU S   nU R
                  (       a  Ub  U U(       a  SOS   nU(       d  MW  UU S   4-   nU R
                  (       d  Ms  UU S   4-   nM~     U R=                  U5      nU R7                  U5      nU	(       a  UU4-   nU
(       d  [?        S UUUUU4 5       5      $ [A        UUUUUS9$ )Ndecoder_ zYou cannot specify both zinput_ids and zinputs_embeds at the same timer$   zYou have to specify either zinput_ids or inputs_embedszZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...Fz<You have to initialize the model with valid token embeddings)r   r   rp   )r   r  r\   r5  r  r%   )r5  r  r6  r  r   r   r^      c              3   0   #    U  H  nUc  M  Uv   M     g 7fr   r%   ).0r   s     r5   	<genexpr>&LongT5Stack.forward.<locals>.<genexpr>]  s"      
A  s   	)last_hidden_stater5  r   
attentionscross_attentions)!r   r  r6  output_hidden_statesr  r   r  sizer&  r   r#  r   r   r  is_encoder_decoderr   r
   r$  r   r.   r}   r:   r   r  rc   r   invert_attention_maskr   	enumerater	  r
  rF   r   )!r   r  r\   r  r  r  r5  r  r6  r  r  r7  err_msg_prefixr8  r   rq  past_key_values_lengthmask_seq_lengthrB  encoder_batch_sizeencoder_sequence_length_encoder_hidden_shapeencoder_extended_attention_maskall_hidden_statesall_attentionsall_cross_attentionsr4  r  r   rJ   layer_modulelayer_outputss!                                    r5   r   LongT5Stack.forward  sI    "+!6IDKK<Q<Q	1B1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++BYBY ]%>+/??ZN*>*:.HXXvw  "#..*K!r;r?;I&',,.s3K+/??ZN:>:J-XfWggtuvv&&4==##p "	 $$0p2pp0 --i8M!,
??_4;;11&9$DKK8,dkk:Z'O '3$++&FO #OETE`!?!?!Afg!*B*D*D4zAO"ZZ
OML`L`aN??,{{+- /	K [[//7:3NNNTaThThiK(K ??4@=R=W=W=Y: 7$68O#P %-).4HQ^QeQe)f&.2.H.HI_.`+.2+"6BD0d&7DOOrRV(,%]3(4OA|#$58H$H!(%/- /#"3'M  *!,M
 *!,M#8#D0=CTaZ[0\-  !/=3C2E!E???+?=QRCSBU+U(A  5D --m<]3   1]4D D 
 "#%"(
 
 
 9+++%1
 	
r7   )r	  r   r   r  r
  r   r   rP  )
NNNNNNNNNN)	r   r   r   r   r   r  r   r   r   r   s   @r5   r  r    s9    0+
 "#!^
 ^
r7   r  c                     ^  \ rS rSrS/rSSS.rS\4U 4S jjrS rS r	\
            SS
\R                  S	-  S\R                  S	-  S\R                  S	-  S\R                  S	-  S\\\R                        S	-  S\S	-  S\R"                  S	-  S\R"                  S	-  S\S	-  S\S	-  S\S	-  S\S	-  S\\R                     \-  4S jj5       rSrU =r$ )r  iq  Fdecoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weightshared.weight)encoder.embed_tokens.weightdecoder.embed_tokens.weightr   c                   > [         TU ]  U5        [        R                  " UR                  UR
                  5      U l        [        R                  " U5      nSUl	        SUl
        [        U5      U l        [        R                  " U5      nSUl	        UR                  Ul        [        U5      U l        U R!                  5         g )NFT)r   r   r   r   r  r   r  copydeepcopyr   r  r  encodernum_decoder_layersr  decoderr  r   r   encoder_configdecoder_configr   s       r5   r   LongT5Model.__init__{  s     ll6#4#4fnnEv.$)!#( ">2v.$(!$*$=$=!">2 	r7   c                     U R                   $ r   r  r   s    r5   get_input_embeddings LongT5Model.get_input_embeddings      {{r7   c                 |    Xl         U R                  R                  U5        U R                  R                  U5        g r   r  r9  r  r;  r  s     r5   r   LongT5Model.set_input_embeddings  +    $)).9)).9r7   Nr  r\   r  r  encoder_outputsr5  r  decoder_inputs_embedsr  r6  r  r  r   c                 R   U	b  U	OU R                   R                  n	Ub  UOU R                   R                  nUc  U R                  UUUU
UUS9nORU(       aK  [	        U[
        5      (       d6  [        US   [        U5      S:  a  US   OS[        U5      S:  a  US   OSS9nUS   nU R                  UUUUUUU	U
UUS9
nU(       d  X-   $ [        UR                  UR                  UR                  UR                  UR                  UR                  UR                  UR                  S9$ )	a  
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
    Indices of input sequence tokens in the vocabulary. LongT5 is a model with relative position embeddings so
    you should be able to pad the inputs on both the right and the left.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for detail.

    [What are input IDs?](../glossary#input-ids)

    To know more on how to prepare `input_ids` for pretraining take a look a [LONGT5
    Training](./longt5#training).
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Indices of decoder input sequence tokens in the vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are decoder input IDs?](../glossary#decoder-input-ids)

    LONGT5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If
    `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
    `past_key_values`).

    To know more on how to prepare `decoder_input_ids` for pretraining take a look at [LONGT5
    Training](./longt5#training).
decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
    be used by default.

Example:

```python
>>> from transformers import AutoTokenizer, LongT5Model

>>> tokenizer = AutoTokenizer.from_pretrained("google/long-t5-local-base")
>>> model = LongT5Model.from_pretrained("google/long-t5-local-base")

>>> # Let's try a very long encoder input.
>>> input_ids = tokenizer(
...     100 * "Studies have been shown that owning a dog is good for you", return_tensors="pt"
... ).input_ids  # Batch size 1

>>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids  # Batch size 1

>>> # forward pass
>>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
>>> last_hidden_states = outputs.last_hidden_state
```Nr  r\   r  r6  r  r  r   r   r^   r  r   r  
r  r\   r  r5  r  r  r  r6  r  r  )r  r5  decoder_hidden_statesdecoder_attentionsr  encoder_last_hidden_stater  encoder_attentions)r   r  r  r9  r   r   lenr;  r   r  r5  r   r  r  )r   r  r\   r  r  rJ  r5  r  rK  r  r6  r  r  r7  r   decoder_outputss                   r5   r   LongT5Model.forward  sP   D "+!6IDKK<Q<Q	%0%<k$++BYBY ""ll#-+"3%9' + O O_!M!M-"1!"4474H14Loa0RV14_1E1I?1-tO (* ,,'1/+"/#1/!5# ' 
 "44!-??+;;"1"?"?.99,==&5&G&G"1"?"?.99	
 		
r7   )r;  r9  r  )NNNNNNNNNNNN)r   r   r   r   "_keys_to_ignore_on_load_unexpected_tied_weights_keysr   r   rC  r  r   r.   
LongTensorFloatTensor
BoolTensorrF   r	   rt   ru  r   r   r   r   r   s   @r5   r  r  q  sx    	R*& (7'6
| ":
  .23759:>BF(,-159!%)-,0#'q
##d*q
 ))D0q
 !++d2	q

 !& 0 04 7q
 uU%6%6784?q
 q
 ||d*q
  %||d2q
 $;q
  $;q
 #Tkq
 D[q
 
u  	!$6	6q
 q
r7   r  z>
    LONGT5 Model with a `language modeling` head on top.
    )custom_introc                     ^  \ rS rSrS/rSSSS.rS\4U 4S jjrS rS r	\
             SS
\R                  S	-  S\R                  S	-  S\R                  S	-  S\R                  S	-  S\\\R                         S	-  S\S	-  S\R                  S	-  S\R                  S	-  S\R                  S	-  S\S	-  S\S	-  S\S	-  S\S	-  S\\R                     \-  4S jj5       rS\R                   4S jrSrU =r$ )r  i	  r2  r3  )r4  r5  zlm_head.weightr   c                   > [         TU ]  U5        UR                  U l        [        R
                  " UR                  UR                  5      U l        [        R                  " U5      nSUl
        SUl        [        U5      U l        [        R                  " U5      nSUl
        UR                  Ul        [        U5      U l        [        R"                  " UR                  UR                  SS9U l        U R'                  5         g )NFTr   )r   r   r   	model_dimr   r   r  r  r7  r8  r   r  r  r9  r:  r  r;  r   r  r  r<  s       r5   r   'LongT5ForConditionalGeneration.__init__  s     ll6#4#4fnnEv.$)!#( ">2v.$(!$*$=$=!">2yy1B1BO 	r7   c                     U R                   $ r   rA  rB  s    r5   rC  3LongT5ForConditionalGeneration.get_input_embeddings-  rE  r7   c                 |    Xl         U R                  R                  U5        U R                  R                  U5        g r   rG  r  s     r5   r  3LongT5ForConditionalGeneration.set_input_embeddings0  rI  r7   Nr  r\   r  r  rJ  r5  r  rK  labelsr  r6  r  r  r   c                    U
b  U
OU R                   R                  n
Ub  UOU R                   R                  nUc  U R                  UUUUUUS9nORU(       aK  [	        U[
        5      (       d6  [        US   [        U5      S:  a  US   OS[        U5      S:  a  US   OSS9nUS   nU	b  Uc  Uc  U R                  U	5      nU R                  UUUUUUU
UUUS9
nUS   nU R                   R                  (       a  UU R                  S-  -  nU R                  U5      nSnU	b[  [        S	S
9nU	R                  UR                  5      n	U" UR                  SUR!                  S5      5      U	R                  S5      5      nU(       d  U4USS -   U-   nUb  U4U-   $ U$ [#        UUUR$                  UR&                  UR(                  UR*                  UR,                  UR&                  UR(                  S9	$ )a	  
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
    Indices of input sequence tokens in the vocabulary. LongT5 is a model with relative position embeddings so
    you should be able to pad the inputs on both the right and the left.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for detail.

    [What are input IDs?](../glossary#input-ids)

    To know more on how to prepare `input_ids` for pretraining take a look a [LONGT5
    Training](./longt5#training).
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Indices of decoder input sequence tokens in the vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are decoder input IDs?](../glossary#decoder-input-ids)

    LONGT5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If
    `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
    `past_key_values`).

    To know more on how to prepare `decoder_input_ids` for pretraining take a look at [LONGT5
    Training](./longt5#training).
decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
    be used by default.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[-100, 0, ...,
    config.vocab_size - 1]`. All labels set to `-100` are ignored (masked), the loss is only computed for
    labels in `[0, ..., config.vocab_size]`

Examples:

```python
>>> from transformers import AutoTokenizer, LongT5ForConditionalGeneration

>>> tokenizer = AutoTokenizer.from_pretrained("Stancld/longt5-tglobal-large-16384-pubmed-3k_steps")
>>> model = LongT5ForConditionalGeneration.from_pretrained(
...     "Stancld/longt5-tglobal-large-16384-pubmed-3k_steps"
... )

>>> # Let's try a very long input.
>>> inputs = tokenizer(100 * "studies have shown that owning a dog is good for you ", return_tensors="pt")
>>> input_ids = inputs.input_ids

>>> outputs = model.generate(input_ids)
>>> print(tokenizer.decode(outputs[0], skip_special_tokens=True))
abstractthe aim of this article is to provide an overview of the literature on the role of dog
```NrM  r   r   r^   rN  rO  r  r  )ignore_indexr$   )	losslogitsr5  rP  rQ  r  rR  r  rS  )r   r  r  r9  r   r   rT  r  r;  r  r_  r  r   rX   r:   r&  r  r   r5  r   r  r  r  )r   r  r\   r  r  rJ  r5  r  rK  re  r  r6  r  r  r7  r   rU  sequence_output	lm_logitsrh  loss_fctoutputs                         r5   r   &LongT5ForConditionalGeneration.forward5  s    L "+!6IDKK<Q<Q	%0%<k$++BYBY ""ll#-+"3%9' + O O_!M!M-"1!"4474H14Loa0RV14_1E1I?1-tO (*"3";@U@] $ 1 1& 9 ,,'1/+"/#1/!5# ' 
 *!,;;**-1EFOLL1	'T:HYYy//0FINN2y~~b/ABFKKPROTD\OAB$77/IF)-)9TGf$EvE+;;"1"?"?.99,==&5&G&G"1"?"?.99

 
	
r7   c                 $    U R                  U5      $ r   )r  )r   re  s     r5   %prepare_decoder_input_ids_from_labelsDLongT5ForConditionalGeneration.prepare_decoder_input_ids_from_labels  s      ((r7   )r;  r9  r  r_  r  )NNNNNNNNNNNNN)r   r   r   r   rW  rX  r   r   rC  r  r   r.   rY  rZ  r[  rF   rt   r	   ru  r   r   rp  r   r   r   s   @r5   r  r  	  s    	R*& (7'6)| *:
  .23759:>=A(,26:>*.!%)-,0#'J
##d*J
 ))D0J
 !++d2	J

 !& 0 04 7J
 uU\\23d:J
 J
 ((4/J
  %0047J
   4'J
 $;J
  $;J
 #TkJ
 D[J
  
u  	!O	3!J
 J
X)ELL ) )r7   r  c                   
  ^  \ rS rSrSS0rS/rS\4U 4S jjrS rS r	\
      SS
\R                  S	-  S\R                  S	-  S\R                  S	-  S\S	-  S\S	-  S\S	-  S\\R                     \-  4S jj5       rSrU =r$ )r  i  r4  r3  r;  r   c                    > [         TU ]  U5        [        R                  " UR                  UR
                  5      U l        [        R                  " U5      nSUl	        [        U5      U l        U R                  5         g )NF)r   r   r   r   r  r   r  r7  r8  r  r  r9  r  )r   r   r=  r   s      r5   r   LongT5EncoderModel.__init__  sZ     ll6#4#4fnnEv.#( ">2 	r7   c                     U R                   $ r   rA  rB  s    r5   rC  'LongT5EncoderModel.get_input_embeddings  rE  r7   c                 F    Xl         U R                  R                  U5        g r   )r  r9  r  r  s     r5   r  'LongT5EncoderModel.set_input_embeddings  s    $)).9r7   Nr  r\   r  r6  r  r  r   c           	      d    Ub  UOU R                   R                  nU R                  UUUUUUS9nU$ )a  
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
    Indices of input sequence tokens in the vocabulary. LongT5 is a model with relative position embeddings so
    you should be able to pad the inputs on both the right and the left.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for detail.

    To know more on how to prepare `input_ids` for pretraining take a look a [LONGT5
    Training](./longt5#training).

Example:

```python
>>> from transformers import AutoTokenizer, LongT5ForConditionalGeneration

>>> tokenizer = AutoTokenizer.from_pretrained("google/long-t5-local-base")
>>> model = LongT5EncoderModel.from_pretrained("google/long-t5-local-base")
>>> input_ids = tokenizer(
...     100 * "Studies have been shown that owning a dog is good for you ", return_tensors="pt"
... ).input_ids  # Batch size 1
>>> outputs = model(input_ids=input_ids)
>>> last_hidden_states = outputs.last_hidden_state
```rM  )r   r  r9  )	r   r  r\   r  r6  r  r  r7  rJ  s	            r5   r   LongT5EncoderModel.forward  sH    F &1%<k$++BYBY,,)'/!5# ' 
 r7   )r9  r  )NNNNNN)r   r   r   r   rX  rW  r   r   rC  r  r   r.   rY  rZ  ru  rF   r   r   r   r   r   s   @r5   r  r    s     	& +5&	| 	:  .23726)-,0#'-##d*- ))D0- ((4/	-
  $;- #Tk- D[- 
u  	!O	3- -r7   r  )r  r  r  r  )r   )Jr  r7  r  typingr   r.   r   torch.nnr   r  r   r  activationsr   cache_utilsr	   r
   r   
generationr   masking_utilsr   modeling_layersr   modeling_outputsr   r   r   r   modeling_utilsr   utilsr   r   r   r   r   configuration_longt5r   
get_loggerr   r   rt   r~   r6   r?   rL   rT   r[   r:   rc   rF   r   r   r   Moduler   r   r   r   r   rN  rw  r  r  r  r  r  r  r  r  r  r  __all__r%   r7   r5   <module>r     sf         % & ! C C ) / 9  .  / 
		H	%  3 3 W\WcWc  #%,, #3 #S #U\\ #4U\\ 4c 4 4Y\ 4ejeqeq 42!# !%,, !BU\\ Bc BV[VbVb B8ell 8s 8TYT`T` 8ejeqeq 8 .PLL.P58.P
5<<%&.Pb4U\\ 4VY 4^c^j^j 4	j<<	j,1LL	jJM	j
\\	j+bii +4")) ,ryy &BII &bii Di299 iXlRYY l`ryy BBII :bii @		 >T
, T
n R!O R! R!jz
' z
z T
' T
 T
n 
u)%:O u)
u)p F. F FR kr7   