
    Z j                     j   S r SSKrSSKrSSKJr  SSKrSSKrSSKJr  SSK	J
r
  SSKJr  SSKJr  SS	KJrJrJr  SS
KJr  SSKJrJr  SSKJr  SSKJr  SSKJrJrJ r J!r!J"r"  SSK#J$r$J%r%  SSK&J'r'  SSK(J)r)J*r*J+r+J,r,J-r-  SSK.J/r/  SSK0J1r1J2r2  SSK3J4r4  \-Rj                  " \65      r7S\Rp                  S\9S\94S jr: " S S\Rv                  5      r<  S<S\Rz                  S\Rp                  S\Rp                  S\Rp                  S \Rp                  S-  S!\>S-  S"\>S#\'\)   4S$ jjr? " S% S&\Rz                  5      r@ " S' S(\5      rA " S) S*\5      rB\* " S+ S,\%5      5       rC " S- S.\C5      rD " S/ S0\C5      rE\* " S1 S2\C5      5       rF\*" S3S49 " S5 S6\C\5      5       rG " S7 S8\C5      rH " S9 S:\C\5      rI/ S;QrJg)=zPyTorch PEGASUS model.    N)Callable)nn)CrossEntropyLoss   )initialization)ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)create_bidirectional_maskcreate_causal_mask)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentions!CausalLMOutputWithCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tupleis_torchdynamo_compilinglogging)merge_with_config_defaults)OutputRecordercapture_outputs   )PegasusConfig	input_idspad_token_iddecoder_start_token_idc                     U R                  U R                  5      nU SS2SS24   R                  5       USS2SS24'   X#SS2S4'   Uc  [        S5      eUR	                  US:H  U5        U$ )z)
Shift input ids one token to the right.
Nr!   r   z1self.model.config.pad_token_id has to be defined.i)	new_zerosshapeclone
ValueErrormasked_fill_)r#   r$   r%   shifted_input_idss       }/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/pegasus/modeling_pegasus.pyshift_tokens_rightr/   9   sz     "++IOO<(CRC0668ae4adLMM""#4#<lK    c            
          ^  \ rS rSrSrSS\S\S\S-  SS4U 4S jjjrS	 r\R                  " 5        SS
\R                  S\S\R                  S-  S\R                  4U 4S jjj5       rSrU =r$ )$PegasusSinusoidalPositionalEmbeddingJ   zDThis module produces sinusoidal positional embeddings of any length.Nnum_positionsembedding_dimpadding_idxreturnc                 "   > [         TU ]  XSS9  g )NT)_freeze)super__init__)selfr4   r5   r6   	__class__s       r.   r;   -PegasusSinusoidalPositionalEmbedding.__init__M   s    tDr0   c                    U R                   R                  u  p[        R                  " [	        U5       VVs/ s H?  n[	        U5       Vs/ s H%  oC[        R
                  " SSUS-  -  U-  5      -  PM'     snPMA     snn5      n[        R                  " XU R                   R                  SS9nUS-  S:X  a  US-  OUS-  S-   n[        R                  " [        R                  " USS2SSS24   5      5      USS2SU24'   [        R                  " [        R                  " USS2SSS24   5      5      USS2US24'   U$ s  snf s  snnf )z
Identical to the XLM create_sinusoidal_embeddings except features are not interleaved. The cos features are in
the 2nd half of the vector. [dim // 2:]
i'     F)dtyperequires_gradr   r!   N)weightr)   nparrayrangepowertorchemptyrA   FloatTensorsincos)r<   n_posdimposjposition_encoutsentinels           r.   create_weight2PegasusSinusoidalPositionalEmbedding.create_weightP   s   
 [[&&
xxX]^cXdeXdQTsLABHHUAaL3$677LXde
 kk%DKK,=,=US"Qw!|3!8#(a"..rvvl1add76K/LMAqzM!--bff\!QTT'5J.KLAxyL
 Mes   E

,E6E
E
input_ids_shapepast_key_values_lengthposition_idsc                    > UcB  USS u  pE[         R                  " X"U-   [         R                  U R                  R                  S9n[
        TU ]  U5      $ )z3`input_ids_shape` is expected to be [bsz x seqlen].Nr@   )rA   device)rH   arangelongrC   rZ   r:   forward)r<   rV   rW   rX   bszseq_lenr=   s         r.   r]   ,PegasusSinusoidalPositionalEmbedding.forward_   sX    
 *2A.LC <<&(HPUPZPZcgcncncucuL w|,,r0    N)r   N)__name__
__module____qualname____firstlineno____doc__intr;   rT   rH   no_gradSizeTensorr]   __static_attributes____classcell__r=   s   @r.   r2   r2   J   s    NEc E# ECRVJ Ebf E E ]]_pt	-$zz	-CF	-Z_ZfZfimZm	-		- 	-r0   r2   modulequerykeyvalueattention_maskscalingdropoutkwargsc                    Uc  UR                  S5      S-  n[        R                  " XR                  SS5      5      U-  nUb  X-   n[        R
                  R                  USS9n[        R
                  R                  XU R                  S9n[        R                  " X5      n	U	R                  SS5      R                  5       n	X4$ )Nr'         r@   r   rN   ptrainingr!   )
sizerH   matmul	transposer   
functionalsoftmaxru   r|   
contiguous)
ro   rp   rq   rr   rs   rt   ru   rv   attn_weightsattn_outputs
             r.   eager_attention_forwardr   m   s     **R.D( <<}}Q':;gEL!#4==((2(>L==((6??([L,,|3K''1-88:K$$r0   c                   $  ^  \ rS rSrSr      SS\S\S\S\S\S	\S
\S-  S\S-  4U 4S jjjr	   SS\
R                  S\
R                  S-  S\S-  S\
R                  S-  S\\   S\\
R                  \
R                  S-  4   4S jjrSrU =r$ )PegasusAttention   z=Multi-headed attention from 'Attention Is All You Need' paperN	embed_dim	num_headsru   
is_decoderbias	is_causalconfig	layer_idxc	                 t  > [         T	U ]  5         Xl        X l        X0l        X-  U l        Xpl        U R
                  U-  U R                  :w  a  [        SU R                   SU S35      eU R
                  S-  U l        X@l	        X`l
        Xl        Uc>  U R                  (       a-  [        R                  SU R                  R                   S35        [         R"                  " XUS9U l        [         R"                  " XUS9U l        [         R"                  " XUS9U l        [         R"                  " XUS9U l        g )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).rx   zInstantiating a decoder z without passing `layer_idx` is not recommended and will lead to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` when creating this class.r   )r:   r;   r   r   ru   head_dimr   r+   rt   r   r   r   loggerwarning_oncer=   rc   r   Lineark_projv_projq_projout_proj)
r<   r   r   ru   r   r   r   r   r   r=   s
            r.   r;   PegasusAttention.__init__   s    	""!.MMI%$..8MdnnM]$YKr3  }}d*$""*4>>+B+B*C D, , ii	4@ii	4@ii	4@		)TBr0   hidden_stateskey_value_statespast_key_valuesrs   rv   r7   c                 ,   USLnUR                   SS n/ UQSPU R                  P7nU R                  U5      R                  U5      R	                  SS5      n	Sn
Ub]  [        U[        5      (       aF  UR                  R                  U R                  5      n
U(       a  UR                  nOUR                  nOUnU(       a  UOUnU(       aQ  UbN  U
(       aG  WR                  U R                     R                  nUR                  U R                     R                  nOU R                  U5      nU R!                  U5      n/ UR                   SS QSPU R                  P7nUR                  U5      R	                  SS5      nUR                  U5      R	                  SS5      nUbS  WR#                  XU R                  5      u  pU(       a.  [        U[        5      (       a  SUR                  U R                  '   [$        R&                  " U R(                  R*                  [,        5      nU" U U	UUU4U R.                  (       d  SOU R0                  U R2                  S.UD6u  nnUR4                  " / UQSP76 R7                  5       nU R9                  U5      nUU4$ )	z#Input shape: Batch x Time x ChannelNr'   r!   r@   FT        )ru   rt   )r)   r   r   viewr   
isinstancer   
is_updatedgetr   cross_attention_cacheself_attention_cachelayerskeysvaluesr   r   updater   get_interfacer   _attn_implementationr   r|   ru   rt   reshaper   r   )r<   r   r   r   rs   rv   is_cross_attentioninput_shapehidden_shapequery_statesr   curr_past_key_valuescurrent_states
key_statesvalue_stateskv_shapeattention_interfacer   r   s                      r.   r]   PegasusAttention.forward   sc    .T9 $))#2.88b8$--8 {{=166|DNNqRST
&/+>??,77;;DNNK
%+:+P+P(+:+O+O('6$-?)]/"=*-44T^^DIIJ/66t~~FMML^4J;;~6LF--cr2FBFFH#2<<QBJ',,X6@@AFL*+?+F+Fzaeaoao+p(
%*_FY*Z*ZAEO..t~~>(?(M(MKK,,.E)
 %8	%
  $}}C$,,LL	%
 	%
!\ "));;;;FFHmmK0L((r0   )r   ru   r   r   r   r   r   r   r   r   r   rt   r   )r   FTFNNNNN)rc   rd   re   rf   rg   rh   floatboolr"   r;   rH   rk   r	   r   r   tupler]   rl   rm   rn   s   @r.   r   r      s   G  '+ $%C%C %C 	%C
 %C %C %C $%C :%C %CT 15(,.2H)||H)  ,,-H) 	H)
 t+H) -.H) 
u||U\\D00	1H) H)r0   r   c                      ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  S\\	   S\R                  4S jr
S	rU =r$ )
PegasusEncoderLayer   r   c                 j  > [         TU ]  5         UR                  U l        [	        U R                  UR
                  UR                  US9U l        [        R                  " U R                  5      U l
        UR                  U l        [        UR                     U l        UR                  U l        [        R                   " U R                  UR"                  5      U l        [        R                   " UR"                  U R                  5      U l        [        R                  " U R                  5      U l        g )N)r   r   ru   r   )r:   r;   d_modelr   r   encoder_attention_headsattention_dropout	self_attnr   	LayerNormself_attn_layer_normru   r   activation_functionactivation_fnactivation_dropoutr   encoder_ffn_dimfc1fc2final_layer_normr<   r   r=   s     r.   r;   PegasusEncoderLayer.__init__  s    )nn44,,	
 %'LL$@!~~#F$>$>?"(";";99T^^V-C-CD99V33T^^D "T^^ <r0   r   rs   rv   r7   c                    UnU R                  U5      nU R                  " SUUS.UD6u  p[        R                  R	                  XR                  U R
                  S9nXA-   nUnU R                  U5      nU R                  U R                  U5      5      n[        R                  R	                  XR                  U R
                  S9nU R                  U5      n[        R                  R	                  XR                  U R
                  S9nXA-   nUR                  [        R                  :X  aC  [        R                  " UR                  5      R                  S-
  n[        R                   " X* US9nU$ )a  
Args:
    hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
    attention_mask (`torch.FloatTensor`): attention mask of size
        `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
)r   rs   rz   i  )minmaxra   )r   r   r   r   ru   r|   r   r   r   r   r   rA   rH   float16finfor   clamp)r<   r   rs   rv   residual_clamp_values          r.   r]   PegasusEncoderLayer.forward  sD    !11-@>> 
')
 

 --m||VZVcVc-d 0 --m<**488M+BC--m?V?Vaeanan-o/--m||VZVcVc-d 0%--/++m&9&9:>>EK!KK<[YMr0   )	r   r   ru   r   r   r   r   r   r   )rc   rd   re   rf   r"   r;   rH   rk   r   r   r]   rl   rm   rn   s   @r.   r   r      sQ    =} =$"||" " +,	"
 
" "r0   r   c                      ^  \ rS rSrSS\S\S-  4U 4S jjjr     SS\R                  S\R                  S-  S\R                  S-  S	\R                  S-  S
\	S-  S\
S-  S\\   S\R                  4S jjrSrU =r$ )PegasusDecoderLayeri9  Nr   r   c           
        > [         TU ]  5         UR                  U l        [	        U R                  UR
                  UR                  SSUUS9U l        UR                  U l        [        UR                     U l        UR                  U l        [        R                  " U R                  5      U l        [	        U R                  UR
                  UR                  SUUS9U l        [        R                  " U R                  5      U l        [        R$                  " U R                  UR&                  5      U l        [        R$                  " UR&                  U R                  5      U l        [        R                  " U R                  5      U l        g )NT)r   r   ru   r   r   r   r   )ru   r   r   r   )r:   r;   r   r   r   decoder_attention_headsr   r   ru   r   r   r   r   r   r   r   encoder_attnencoder_attn_layer_normr   decoder_ffn_dimr   r   r   )r<   r   r   r=   s      r.   r;   PegasusDecoderLayer.__init__:  s    )nn44,,
 ~~#F$>$>?"(";";$&LL$@!,NN**,,
 (*||DNN'C$99T^^V-C-CD99V33T^^D "T^^ <r0   r   rs   encoder_hidden_statesencoder_attention_maskr   	use_cacherv   r7   c                    UnU R                  U5      nU R                  " SUUUS.UD6u  p[        R                  R	                  XR                  U R
                  S9nX-   nUbb  UnU R                  U5      nU R                  " SUUUUS.UD6u  p[        R                  R	                  XR                  U R
                  S9nX-   nUnU R                  U5      nU R                  U R                  U5      5      n[        R                  R	                  XR                  U R
                  S9nU R                  U5      n[        R                  R	                  XR                  U R
                  S9nX-   nU$ )a  
Args:
    hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
    attention_mask (`torch.FloatTensor`): attention mask of size
        `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
    encoder_hidden_states (`torch.FloatTensor`):
        cross attention input to the layer of shape `(batch, seq_len, embed_dim)`
    encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size
        `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
    past_key_values (`Cache`): cached past key and value projection states
)r   r   rs   rz   )r   r   rs   r   ra   )r   r   r   r   ru   r|   r   r   r   r   r   r   r   )
r<   r   rs   r   r   r   r   rv   r   r   s
             r.   r]   PegasusDecoderLayer.forwardY  s   * !11-@  >> 
'+)
 	
 --m||VZVcVc-d 0 !,$H 88GM#00  +!65 /	 
  M MM11-<<Z^ZgZg1hM$4M !--m<**488M+BC--m?V?Vaeanan-o/--m||VZVcVc-d 0r0   )r   r   ru   r   r   r   r   r   r   r   r   rb   )NNNNT)rc   rd   re   rf   r"   rh   r;   rH   rk   r	   r   r   r   r]   rl   rm   rn   s   @r.   r   r   9  s    =} =t = =D /3596:(,!%:||: t+:  %||d2	:
 !&t 3: : $;: +,: 
: :r0   r   c                   t   ^  \ rS rSr% \\S'   SrSrSrSr	Sr
Sr\R                  " 5       U 4S j5       rSrU =r$ )PegasusPreTrainedModeli  r   modelTc                   > [         TU ]  U5        [        U[        5      (       a0  [        R
                  " UR                  UR                  5       5        g [        U[        5      (       a!  [        R                  " UR                  5        g g rb   )r:   _init_weightsr   r2   initcopy_rC   rT   PegasusForConditionalGenerationzeros_final_logits_bias)r<   ro   r=   s     r.   r   $PegasusPreTrainedModel._init_weights  s_    f%fBCCJJv}}f&:&:&<= ?@@KK001 Ar0   ra   )rc   rd   re   rf   r"   __annotations__base_model_prefixsupports_gradient_checkpointing_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraphrH   ri   r   rl   rm   rn   s   @r.   r   r     sD    &*#N!
]]_2 2r0   r   c                      ^  \ rS rSrSr\\S.rS\4U 4S jjr	S\
4S jrS\R                  4S	 jr\\\   SS
\\   S\4S jj5       5       5       rSrU =r$ )PegasusEncoderi  z
Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
[`PegasusEncoderLayer`].

Args:
    config: PegasusConfig
    embed_tokens (nn.Embedding): output embedding
)r   
attentionsr   c                   > [         TU ]  U5        UR                  U l        UR                  U l        UR
                  nUR                  U l        UR                  U l	        UR                  (       a  [        R                  " U5      OSU l        [        R                  " UR                   X R                  5      U l        [%        UR                  UU R                  5      U l        [        R(                  " [+        UR,                  5       Vs/ s H  n[/        U5      PM     sn5      U l        [        R2                  " UR
                  5      U l        SU l        U R9                  5         g s  snf )N      ?F)r:   r;   ru   encoder_layerdrop	layerdropr   r$   r6   max_position_embeddingsmax_source_positionsscale_embeddingmathsqrtembed_scaler   	Embedding
vocab_sizeembed_tokensr2   embed_positions
ModuleListrF   encoder_layersr   r   r   
layer_normgradient_checkpointing	post_init)r<   r   r   r   r=   s       r.   r;   PegasusEncoder.__init__  s
    ~~11NN	!..$*$B$B!393I3I499Y/sLL):):IGWGWXC** 

 mm%PVPePeJf$gJfQ%8%@Jf$gh,,v~~6&+# %hs   E*new_num_position_embeddingsc                    [         R                  SU S35        XR                  l        [	        U R                  R                  U R                  R
                  U R                  5      U l        [        R                  " U R                  R                  U R                  R                  5       5        U R                  R                  U R                  5        g  
Resizes position embeddings matrix of the model if `new_num_position_embeddings !=
config.max_position_embeddings`.

Arguments:
    new_num_position_embeddings (`int`):
        The number of new position embeddings. If position embeddings are learned, increasing the size will add
        newly initialized vectors at the end, whereas reducing the size will remove vectors from the end. If
        position embeddings are not learned (*e.g.* sinusoidal position embeddings), increasing the size will
        add correct vectors at the end following the position encoding algorithm, whereas reducing the size
        will remove vectors from the end.
z(Setting `config.max_position_embeddings=z`...Nr   infor   r   r2   r   r6   r  r   r   rC   rT   torZ   r<   r  s     r.   resize_position_embeddings)PegasusEncoder.resize_position_embeddings       	>?Z>[[_`a.I+CKK//KK 

 	

4''..0D0D0R0R0TU,r0   r7   c                     U R                   $ z(
Returns the position embeddings matrix
r  r<   s    r.   get_position_embeddings&PegasusEncoder.get_position_embeddings       ###r0   rv   c                 V   US L US L-  (       a  [        S5      eUc  U R                  U5      U R                  -  nUR                  S S nU R	                  U5      nX6-   n[
        R                  R                  XpR                  U R                  S9n[        U R                  UUS9n[        U R                  5       HR  u  pSn
U R                  (       a'  [        R                  " / 5      nXR                  :  a  Sn
U
(       a  MH  U	" UU40 UD6nMT     U R!                  U5      n[#        US9$ )Nz:You must specify exactly one of input_ids or inputs_embedsr'   rz   )r   inputs_embedsrs   FT)last_hidden_state)r+   r  r  r)   r  r   r   ru   r|   r   r   	enumerater   rH   randr   r  r   )r<   r#   rs   r"  rv   r   	embed_posr   idxencoder_layerto_dropdropout_probabilitys               r.   r]   PegasusEncoder.forward  s(    -t";<YZZ  --i84;K;KKM#))#2.((5	%1--m||VZVcVc-d2;;')
 #,DKK"8CG}}&+jjn#&7"G7 -!"! ! #9 6+
 	
r0   )
ru   r  r  r  r  r  r   r   r   r6   r   )rc   rd   re   rf   rg   r   r   _can_record_outputsr"   r;   rh   r  r   r  r  r   r    r   r   r   r   r]   rl   rm   rn   s   @r.   r   r     s     -&
} 2-c -0$ $   	-

 +,-
 
-
    -
r0   r   c                      ^  \ rS rSrSr\\" \SSS9\" \SSS9S.rS\	4U 4S	 jjr
S
\4S jrS\R                  4S jr\\\       SS\\   S\4S jj5       5       5       rSrU =r$ )PegasusDecoderi"  z
Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`PegasusDecoderLayer`]

Args:
    config: PegasusConfig
    embed_tokens (nn.Embedding): output embedding
r!   r   )index
layer_namer   )r   r   cross_attentionsr   c           
         > [         TU ]  U5        UR                  U l        UR                  U l        UR
                  U l        UR                  U l        UR                  (       a   [        R                  " UR                  5      OSU l        [        R                  " UR                   UR                  U R                  5      U l        [%        UR                  UR                  U R                  5      U l        [        R(                  " [+        UR,                  5       Vs/ s H  n[/        XS9PM     sn5      U l        [        R2                  " UR                  5      U l        SU l        U R9                  5         g s  snf )Nr   )r   F)r:   r;   ru   decoder_layerdropr   r$   r6   r   max_target_positionsr  r  r  r   r  r   r  r  r  r2   r  r	  rF   decoder_layersr   r   r   r  r  r  )r<   r   ir=   s      r.   r;   PegasusDecoder.__init__1  s    ~~11!..$*$B$B!8>8N8N499V^^4TWLL):):FNNDL\L\]C**NN 

 mmW\]c]r]rWs$tWsRS%8%MWs$tu,,v~~6&+# %us   E;r  c                    [         R                  SU S35        XR                  l        [	        U R                  R                  U R                  R
                  U R                  5      U l        [        R                  " U R                  R                  U R                  R                  5       5        U R                  R                  U R                  5        gr  r  r  s     r.   r  )PegasusDecoder.resize_position_embeddingsG  r  r0   r7   c                     U R                   $ r  r  r  s    r.   r  &PegasusDecoder.get_position_embeddings_  r   r0   rv   c                 ~   US L US L-  (       a  [        S5      eUc  U R                  U5      nX`R                  -  nU(       ab  Uc_  Uc  U R                  R                  (       a.  [        [        U R                  S9[        U R                  S95      O[        U R                  S9nUR                  5       S S u  pUb  UR                  5       OSn[        R                  " XR                  S9U-   nUc2  [        5       (       d#  X-   n[        R                  " XUR                  S9n[        U[
        5      (       a  UR                  OUn[!        U R                  UUUS9n[#        U R                  UUUS9nU R%                  X4XS9nUU-   n[&        R(                  R+                  UU R*                  U R,                  S	9n[/        U R0                  5       HN  u  nnU R,                  (       a(  [        R2                  " / 5      nUU R4                  :  a  M?  U" UUU4UUUS
.UD6nMP     U R7                  U5      n[9        UUS9$ )NzTYou cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time)r   r'   r   rZ   )r   r"  rs   r   )r   r"  rs   r   )rX   rz   )r   r   r   )r#  r   )r+   r  r  r   is_encoder_decoderr   r
   r}   get_seq_lengthrH   r[   rZ   r   onesr   r   r   r   r  r   r   ru   r|   r$  r   r%  r   r  r   )r<   r#   rs   r   r   r   r"  r   rv   
batch_size
seq_lengthrW   rX   mask_seq_lengthself_attn_cachecausal_mask	positionsr   r'  decoder_layerr*  s                        r.   r]   PegasusDecoder.forwarde  sC    -t";<stt  --i8M &(8(88 0 )48V8V $L$DlZ^ZeZeFfg!5  "/!3!3!5cr!:
ETE`!?!?!Afg||J7K7KLOee!*B*D*D4AO"ZZ
ML`L`aN /+>?? 00  	 );;')+	
 ";;;'1"7	"
 ((*)ACY(u	%	1--mt||VZVcVc-d"+DKK"8C}}&+jjn#&7)% (> /# M #9" 68++
 	
r0   )
ru   r  r  r  r  r  r   r   r4  r6   )NNNNNNN)rc   rd   re   rf   rg   r   r   r   r,  r"   r;   rh   r  r   r  r  r   r    r   r   r   r   r]   rl   rm   rn   s   @r.   r.  r.  "  s     -$%5Q;W*+;1Q_`} ,-c -0$ $   "#S
 +,S
 
3S
    S
r0   r.  c                     ^  \ rS rSrSSS.rS\4U 4S jjrS rS rS\	4S	 jr
S
\\R                     4S jr\\         SS\R$                  S-  S\R$                  S-  S\R$                  S-  S\R$                  S-  S\\R&                     S-  S\S-  S\R$                  S-  S\R$                  S-  S\S-  S\\   S
\\-  4S jj5       5       rSrU =r$ )PegasusModeli  zshared.weight)zdecoder.embed_tokens.weightzencoder.embed_tokens.weightr   c                    > [         TU ]  U5        UR                  UR                  p2[        R
                  " X1R                  U5      U l        [        U5      U l	        [        U5      U l        U R                  5         g rb   )r:   r;   r$   r  r   r  r   sharedr   encoderr.  decoderr  )r<   r   r6   r  r=   s       r.   r;   PegasusModel.__init__  s]     "("5"5v7H7HZll:~~{K%f-%f- 	r0   c                     U R                   $ rb   )rL  r  s    r.   get_input_embeddings!PegasusModel.get_input_embeddings  s    {{r0   c                 |    Xl         U R                   U R                  l        U R                   U R                  l        g rb   )rL  rM  r  rN  r<   rr   s     r.   set_input_embeddings!PegasusModel.set_input_embeddings  s'    $(KK!$(KK!r0   r  c                     XR                   l        U R                  R                  U5        U R                  R                  U5        gr  N)r   r   rM  r  rN  r  s     r.   r  'PegasusModel.resize_position_embeddings  s5     /J+//0KL//0KLr0   r7   c                 j    U R                   R                  5       U R                  R                  5       4$ r  )rM  r  rN  r  s    r.   r  $PegasusModel.get_position_embeddings  s)     4468\8\8^__r0   Nr#   rs   decoder_input_idsdecoder_attention_maskencoder_outputsr   r"  decoder_inputs_embedsr   rv   c
                    Uc  U R                   " S	UUUS.U
D6nOK[        U[        5      (       d6  [        US   [        U5      S:  a  US   OS[        U5      S:  a  US   OSS9nU R                  " S	UUUS   UUUU	S.U
D6n[        UR                  UR                  UR                  UR                  UR                  UR                  UR                  UR                  S9$ )
a  
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Indices of decoder input sequence tokens in the vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are decoder input IDs?](../glossary#decoder-input-ids)

    Pegasus uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If
    `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
    `past_key_values`).
decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
    be used by default.

Example:

```python
>>> from transformers import AutoTokenizer, PegasusModel

>>> tokenizer = AutoTokenizer.from_pretrained("google/pegasus-large")
>>> model = PegasusModel.from_pretrained("google/pegasus-large")

>>> inputs = tokenizer("Studies have been shown that owning a dog is good for you", return_tensors="pt")
>>> decoder_inputs = tokenizer("Studies show that", return_tensors="pt")
>>> outputs = model(input_ids=inputs.input_ids, decoder_input_ids=decoder_inputs.input_ids)

>>> last_hidden_states = outputs.last_hidden_state
>>> list(last_hidden_states.shape)
[1, 4, 1024]
```N)r#   rs   r"  r   r!   r@   )r#  r   r   r#   rs   r   r   r   r"  r   )r#  r   decoder_hidden_statesdecoder_attentionsr1  encoder_last_hidden_stater   encoder_attentionsra   )rM  r   r   lenrN  r   r#  r   r   r   r1  )r<   r#   rs   r\  r]  r^  r   r"  r_  r   rv   decoder_outputss               r.   r]   PegasusModel.forward  s   ` ""ll #-+ 	O O_==-"1!"4474H14Loa0RV14_1E1I?1-tO ,, 	
'1"1!"4#1+/	
 	
 "-??+;;"1"?"?.99,==&5&G&G"1"?"?.99	
 		
r0   )rN  rM  rL  )	NNNNNNNNN)rc   rd   re   rf   _tied_weights_keysr"   r;   rQ  rU  rh   r  r   r   r  r  r   r   rH   rk   rJ   r	   r   r   r   r   r]   rl   rm   rn   s   @r.   rJ  rJ    sZ    (7'6

} 
0
Mc M"`r||)< `  *..2156:;?(,-159!%R
<<$&R
 t+R
 !<<$.	R

 !&t 3R
 u001D8R
 R
 ||d*R
  %||d2R
 $;R
 +,R
 
#	#R
  R
r0   rJ  zY
    The PEGASUS Model with a language modeling head. Can be used for summarization.
    )custom_introc                   @  ^  \ rS rSrSrS/rSS0rS\4U 4S jjr S S	\	S
\	S-  S\
S\R                  4U 4S jjjrS	\	SS4S jrS\	4S jrS\\R                     4S jr\\          S!S\R*                  S-  S\R*                  S-  S\R*                  S-  S\R*                  S-  S\\R,                     S-  S\S-  S\R*                  S-  S\R*                  S-  S\R*                  S-  S\
S-  S\\   S\\-  4S jj5       5       rS\R*                  4S jrSrU =r$ )"r   iG  r   r   lm_head.weightzmodel.shared.weightr   c                 v  > [         TU ]  U5        [        U5      U l        U R	                  S[
        R                  " SU R                  R                  R                  45      5        [        R                  " UR                  U R                  R                  R                  SS9U l        U R                  5         g )Nr   r!   Fr   )r:   r;   rJ  r   register_bufferrH   zerosrL  num_embeddingsr   r   r   lm_headr  r   s     r.   r;   (PegasusForConditionalGeneration.__init__S  s     !&)
0%++q$**BSBSBbBb>c2deyy1B1B1Q1QX]^ 	r0   Nnew_num_tokenspad_to_multiple_ofmean_resizingr7   c                 x   > [         TU ]  XU5      nU R                  UR                  R                  S   5        U$ )Nr   )r:   resize_token_embeddings_resize_final_logits_biasrC   r)   )r<   rs  rt  ru  new_embeddingsr=   s        r.   rw  7PegasusForConditionalGeneration.resize_token_embeddings\  s<     8]jk&&~'<'<'B'B1'EFr0   c                 ,   U R                   R                  S   nX::  a  U R                   S S 2S U24   nON[        R                  " SX-
  4U R                   R                  S9n[        R
                  " U R                   U/SS9nU R                  SU5        g )Nr'   r!   r=  ry   r   )r   r)   rH   ro  rZ   catrn  )r<   rs  old_num_tokensnew_bias
extra_biass        r.   rx  9PegasusForConditionalGeneration._resize_final_logits_biasc  s    //55b9+--a..@AHa)H%IRVRhRhRoRopJyy$"8"8*!E1MH0(;r0   r  c                     XR                   l        U R                  R                  R	                  U5        U R                  R
                  R	                  U5        grX  )r   r   r   rM  r  rN  r  s     r.   r  :PegasusForConditionalGeneration.resize_position_embeddingsl  sA     /J+

556QR

556QRr0   c                     U R                   R                  R                  5       U R                   R                  R                  5       4$ r  )r   rM  r  rN  r  s    r.   r  7PegasusForConditionalGeneration.get_position_embeddings}  s5     

""::<djj>P>P>h>h>jkkr0   r#   rs   r\  r]  r^  r   r"  r_  labelsr   rv   c                    U	bX  U
(       a  [         R                  S5        Sn
Uc7  Uc4  [        XR                  R                  U R                  R
                  5      nU R                  " U4UUUUUUUU
S.UD6nU R                  UR                  5      U R                  -   nSnU	bF  [        5       nU" UR                  SU R                  R                  5      U	R                  S5      5      n[        UUUR                  UR                  UR                   UR"                  UR$                  UR&                  UR(                  S9	$ )a  
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Indices of decoder input sequence tokens in the vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are decoder input IDs?](../glossary#decoder-input-ids)

    Pegasus uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If
    `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
    `past_key_values`).
decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
    be used by default.
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

Example Summarization:

```python
>>> from transformers import AutoTokenizer, PegasusForConditionalGeneration

>>> model = PegasusForConditionalGeneration.from_pretrained("google/pegasus-xsum")
>>> tokenizer = AutoTokenizer.from_pretrained("google/pegasus-xsum")

>>> ARTICLE_TO_SUMMARIZE = (
...     "PG&E stated it scheduled the blackouts in response to forecasts for high winds "
...     "amid dry conditions. The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were "
...     "scheduled to be affected by the shutoffs which were expected to last through at least midday tomorrow."
... )
>>> inputs = tokenizer(ARTICLE_TO_SUMMARIZE, max_length=1024, return_tensors="pt")

>>> # Generate Summary
>>> summary_ids = model.generate(inputs["input_ids"])
>>> tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
"California's largest electricity provider has turned off power to hundreds of thousands of customers."
```
NzJThe `use_cache` argument is changed to `False` since `labels` is provided.F)rs   r\  r^  r]  r   r"  r_  r   r'   )	losslogitsr   rb  rc  r1  rd  r   re  )r   warningr/   r   r$   r%   r   rq  r#  r   r   r   r  r   r   rb  rc  r1  rd  r   re  )r<   r#   rs   r\  r]  r^  r   r"  r_  r  r   rv   outputs	lm_logitsmasked_lm_lossloss_fcts                   r.   r]   'PegasusForConditionalGeneration.forward  s>   r klI (-B-J$6KK44dkk6X6X%! '+jj'
)/+#9+'"7'
 '
 LL!:!:;d>T>TT	')H%innR9O9O&PRXR]R]^`RabN#33")"?"?&99$55&-&G&G")"?"?&99

 
	
r0   c                 j    [        XR                  R                  U R                  R                  5      $ rb   )r/   r   r$   r%   )r<   r  s     r.   %prepare_decoder_input_ids_from_labelsEPegasusForConditionalGeneration.prepare_decoder_input_ids_from_labels  s#    !&++*B*BDKKDfDfggr0   rq  r   )NT)
NNNNNNNNNN)rc   rd   re   rf   r   _keys_to_ignore_on_load_missingri  r"   r;   rh   r   r   r  rw  rx  r  r   r  r   r   rH   rk   rJ   r	   r   r   r   r]   r  rl   rm   rn   s   @r.   r   r   G  s     ':&;#/}  ae!7:TzY]	 < < <Sc S"lr||)< l  *..2156:;?(,-159&*!%]
<<$&]
 t+]
 !<<$.	]

 !&t 3]
 u001D8]
 ]
 ||d*]
  %||d2]
 t#]
 $;]
 +,]
 
	 ]
  ]
~hELL h hr0   r   c                   2   ^  \ rS rSrSrU 4S jrS rSrU =r$ )PegasusDecoderWrapperi  z
This wrapper class is a helper class to correctly load pretrained checkpoints when the causal language model is
used in combination with the [`EncoderDecoderModel`] framework.
c                 d   > [         TU ]  U5        [        U5      U l        U R	                  5         g rb   )r:   r;   r.  rN  r  r   s     r.   r;   PegasusDecoderWrapper.__init__  s&     %f-r0   c                 &    U R                   " U0 UD6$ rb   rN  )r<   argsrv   s      r.   r]   PegasusDecoderWrapper.forward  s    ||T,V,,r0   r  )	rc   rd   re   rf   rg   r;   r]   rl   rm   rn   s   @r.   r  r    s    

- -r0   r  c                     ^  \ rS rSrSS0rU 4S jrS rS rS\R                  4S jr
S	\4S
 jr\\         SS\R                   S-  S\R"                  S-  S\R$                  S-  S\R$                  S-  S\S-  S\R$                  S-  S\R                   S-  S\S-  S\\R"                  -  S\\   S\\-  4S jj5       5       rSrU =r$ )PegasusForCausalLMi  rl  z!model.decoder.embed_tokens.weightc                 
  > [         R                  " U5      nSUl        SUl        [        TU ]  U5        [        U5      U l        [        R                  " UR                  UR                  SS9U l        U R                  5         g )NTFr   )copydeepcopyr   r>  r:   r;   r  r   r   r   hidden_sizer  rq  r  r   s     r.   r;   PegasusForCausalLM.__init__  sf    v& $)! *62
yy!3!3V5F5FUS 	r0   c                 B    U R                   R                  R                  $ rb   r   rN  r  r  s    r.   rQ  'PegasusForCausalLM.get_input_embeddings	  s    zz!!...r0   c                 8    XR                   R                  l        g rb   r  rT  s     r.   rU  'PegasusForCausalLM.set_input_embeddings  s    */

'r0   r7   c                 J    U R                   R                  R                  5       $ r  )r   rN  r  r  s    r.   r  *PegasusForCausalLM.get_position_embeddings  s     zz!!99;;r0   r  c                 n    XR                   l        U R                  R                  R	                  U5        grX  )r   r   r   rN  r  r  s     r.   r  -PegasusForCausalLM.resize_position_embeddings  s(     /J+

556QRr0   Nr#   rs   r   r   r   r"  r  r   logits_to_keeprv   c
                 
   U R                   R                  " SUUUUUUUS.U
D6nUS   n[        U	[        5      (       a  [	        U	* S5      OU	nU R                  USS2USS24   5      nSnUba  UR                  UR                  5      n[        5       nU" UR                  SU R                  R                  5      UR                  S5      5      n[        UUUR                  UR                  UR                  UR                   S9$ )a  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

Example:

```python
>>> from transformers import AutoTokenizer, PegasusForCausalLM

>>> tokenizer = AutoTokenizer.from_pretrained("google/pegasus-large")
>>> model = PegasusForCausalLM.from_pretrained("google/pegasus-large")
>>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder."
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
>>> outputs = model(**inputs)

>>> logits = outputs.logits
>>> expected_shape = [1, inputs.input_ids.shape[-1], model.config.vocab_size]
>>> list(logits.shape) == expected_shape
True
```ra  r   Nr'   )r  r  r   r   r   r1  ra   )r   rN  r   rh   slicerq  r  rZ   r   r   r   r  r   r   r   r   r1  )r<   r#   rs   r   r   r   r"  r  r   r  rv   r  r   slice_indicesr  r  r  s                    r.   r]   PegasusForCausalLM.forward%  s   N >BZZ=O=O 	>
)"7#9+'	>
 	>
  
8B>SV8W8W~ot4]kmA}a,?@AYYv}}-F')HFKKDKK,B,BCV[[QS_UD0#33!//))$55
 	
r0   r  )	NNNNNNNNr   )rc   rd   re   rf   ri  r;   rQ  rU  r   r  r  rh   r  r   r   rH   
LongTensorrk   rJ   r	   r   r   r   r   r   r]   rl   rm   rn   s   @r.   r  r    sM   =
/0< <Sc S   .2.2:>;?(,26*.!%-.A
##d*A
 t+A
  %0047	A

 !& 1 1D 8A
 A
 ((4/A
   4'A
 $;A
 ell*A
 +,A
 
2	2A
  A
r0   r  )r  r   rJ  r   )Nr   )Krg   r  r  collections.abcr   numpyrD   rH   r   torch.nnr    r   r   activationsr   cache_utilsr	   r
   r   
generationr   masking_utilsr   r   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   utils.genericr   utils.output_capturingr   r    configuration_pegasusr"   
get_loggerrc   r   rk   rh   r/   r  r2   Moduler   r   r   r   r   r   r   r.  rJ  r   r  r  __all__ra   r0   r.   <module>r     s      $    % & ! C C ) J B 9  G &  8 E 0 
		H	%%,, c [^ "-2<< -R !%II%<<% 
% <<	%
 LL4'% T\% % '(%:r)ryy r)l54 5rZ4 Zz 2_ 2 2$v
+ v
rY
+ Y
x E
) E
 E
P 
Yh&<o Yh
Yhz-2 -q
/ q
h nr0   