
    Z jh                        S SK r S SKJr  S SKrS SKJr  S SKJrJrJr  SSK	J
r  SSKJr  SSKJrJrJr  SS	KJr  SS
KJrJr  SSKJr  SSKJr  SSKJrJrJrJrJ r J!r!  SSK"J#r#J$r$  SSK%J&r&  SSK'J(r(J)r)J*r*J+r+J,r,J-r-  SSK.J/r/  SSK0J1r1J2r2  SSK3J4r4  \,Rj                  " \65      r7 " S S\Rp                  5      r9\) " S S\$5      5       r: " S S\Rp                  5      r;  SBS\Rx                  S\Rz                  S\Rz                  S\Rz                  S\Rz                  S-  S \>S-  S!\>S"\&\(   4S# jjr? " S$ S%\Rx                  5      r@ " S& S'\5      rA " S( S)\:5      rB " S* S+\5      rC " S, S-\:5      rDS.\Rz                  S/\E4S0 jrF\) " S1 S2\:5      5       rG\)" S3S49 " S5 S6\:\5      5       rH " S7 S8\Rx                  5      rI\)" S9S49 " S: S;\:5      5       rJ " S< S=\:5      rK\)" S>S49 " S? S@\:\5      5       rL/ SAQrMg)C    N)Callable)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )initialization)ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)create_bidirectional_maskcreate_causal_mask)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentions!CausalLMOutputWithCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutputSeq2SeqSequenceClassifierOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tupleis_torchdynamo_compilingloggingtorch_compilable_check)merge_with_config_defaults)OutputRecordercapture_outputs   )PLBartConfigc            
       r   ^  \ rS rSrSrSS\S\S\S\S-  4U 4S jjjrS	\R                  4U 4S
 jjr
SrU =r$ )PLBartScaledWordEmbedding=   zT
This module overrides nn.Embeddings' forward by multiplying with embeddings scale.
num_embeddingsembedding_dimpadding_idxembed_scaleNc                 2   > [         TU ]  XU5        X@l        g N)super__init__r-   )selfr*   r+   r,   r-   	__class__s        {/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/plbart/modeling_plbart.pyr1   "PLBartScaledWordEmbedding.__init__B   s    D&    	input_idsc                 <   > [         TU ]  U5      U R                  -  $ r/   )r0   forwardr-   )r2   r7   r3   s     r4   r9   !PLBartScaledWordEmbedding.forwardF   s    wy)D,<,<<<r6   r-   )      ?__name__
__module____qualname____firstlineno____doc__intfloatr1   torchTensorr9   __static_attributes____classcell__r3   s   @r4   r(   r(   =   sJ    's '3 'S '_dgk_k ' '= = =r6   r(   c                   P   ^  \ rS rSr% \\S'   SrSrSS/rSr	Sr
SrU 4S jrSrU =r$ )	PLBartPreTrainedModelJ   configmodelTPLBartDecoderLayerPLBartEncoderLayerc                    > [         TU ]  U5        [        U[        5      (       a!  [        R
                  " UR                  5        g g r/   )r0   _init_weights
isinstancePLBartForConditionalGenerationinitzeros_final_logits_bias)r2   moduler3   s     r4   rR   #PLBartPreTrainedModel._init_weightsT   s5    f%f<==KK001 >r6    )r>   r?   r@   rA   r&   __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modules_supports_flash_attn_supports_sdpa_supports_flex_attnrR   rG   rH   rI   s   @r4   rK   rK   J   s<    &*#-/CDN2 2r6   rK   c                      ^  \ rS rSrSrS\S\4U 4S jjr SS\R                  S\S	\R                  S-  4U 4S
 jjjr	Sr
U =r$ ) PLBartLearnedPositionalEmbeddingZ   zF
This module learns positional embeddings up to a fixed maximum size.
r*   r+   c                 L   > SU l         [        TU ]	  XR                   -   U5        g )N   )offsetr0   r1   )r2   r*   r+   r3   s      r4   r1   )PLBartLearnedPositionalEmbedding.__init___   s"     ++5}Er6   Nr7   past_key_values_lengthposition_idsc                   > Uc]  UR                   SS u  pE[        R                  " X"U-   [        R                  U R                  R
                  S9R                  US5      nOUR                  S5      n[        TU ]%  X0R                  -   5      $ )z3`input_ids' shape is expected to be [bsz x seqlen].Nrf   )dtypedevicer   )shaperE   arangelongweightrm   expand	unsqueezer0   r9   rg   )r2   r7   ri   rj   bszseq_lenr3   s         r4   r9   (PLBartLearnedPositionalEmbedding.forwarde   s    
 $??2A.LC <<&(HPUPZPZcgcncncucufS"o  (11!4Lw|kk9::r6   )rg   )r   N)r>   r?   r@   rA   rB   rC   r1   rE   rF   r9   rG   rH   rI   s   @r4   rc   rc   Z   sW    Fs F3 F mq;;?B;V[VbVbeiVi; ;r6   rc   rX   querykeyvalueattention_maskscalingdropoutkwargsc                    Uc  UR                  S5      S-  n[        R                  " XR                  SS5      5      U-  nUb  X-   n[        R
                  R                  USS9n[        R
                  R                  XU R                  S9n[        R                  " X5      n	U	R                  SS5      R                  5       n	X4$ )Nrn         rf   r   dimptrainingr%   )
sizerE   matmul	transposer   
functionalsoftmaxr}   r   
contiguous)
rX   rx   ry   rz   r{   r|   r}   r~   attn_weightsattn_outputs
             r4   eager_attention_forwardr   u   s     **R.D( <<}}Q':;gEL!#4==((2(>L==((6??([L,,|3K''1-88:K$$r6   c                   $  ^  \ rS rSrSr      SS\S\S\S\S\S	\S
\S-  S\S-  4U 4S jjjr	   SS\
R                  S\
R                  S-  S\S-  S\
R                  S-  S\\   S\\
R                  \
R                  S-  4   4S jjrSrU =r$ )PLBartAttention   z=Multi-headed attention from 'Attention Is All You Need' paperN	embed_dim	num_headsr}   
is_decoderbias	is_causalrM   	layer_idxc	                 t  > [         T	U ]  5         Xl        X l        X0l        X-  U l        Xpl        U R
                  U-  U R                  :w  a  [        SU R                   SU S35      eU R
                  S-  U l        X@l	        X`l
        Xl        Uc>  U R                  (       a-  [        R                  SU R                  R                   S35        [         R"                  " XUS9U l        [         R"                  " XUS9U l        [         R"                  " XUS9U l        [         R"                  " XUS9U l        g )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).r   zInstantiating a decoder z without passing `layer_idx` is not recommended and will lead to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` when creating this class.r   )r0   r1   r   r   r}   head_dimrM   
ValueErrorr|   r   r   r   loggerwarning_oncer3   r>   r   Lineark_projv_projq_projout_proj)
r2   r   r   r}   r   r   r   rM   r   r3   s
            r4   r1   PLBartAttention.__init__   s    	""!.MMI%$..8MdnnM]$YKr3  }}d*$""*4>>+B+B*C D, , ii	4@ii	4@ii	4@		)TBr6   hidden_stateskey_value_statespast_key_valuesr{   r~   returnc                 ,   USLnUR                   SS n/ UQSPU R                  P7nU R                  U5      R                  U5      R	                  SS5      n	Sn
Ub]  [        U[        5      (       aF  UR                  R                  U R                  5      n
U(       a  UR                  nOUR                  nOUnU(       a  UOUnU(       aQ  UbN  U
(       aG  WR                  U R                     R                  nUR                  U R                     R                  nOU R                  U5      nU R!                  U5      n/ UR                   SS QSPU R                  P7nUR                  U5      R	                  SS5      nUR                  U5      R	                  SS5      nUbS  WR#                  XU R                  5      u  pU(       a.  [        U[        5      (       a  SUR                  U R                  '   [$        R&                  " U R(                  R*                  [,        5      nU" U U	UUU4U R.                  (       d  SOU R0                  U R2                  S.UD6u  nnUR4                  " / UQSP76 R7                  5       nU R9                  U5      nUU4$ )	z#Input shape: Batch x Time x ChannelNrn   r%   rf   FT        )r}   r|   )ro   r   r   viewr   rS   r   
is_updatedgetr   cross_attention_cacheself_attention_cachelayerskeysvaluesr   r   updater   get_interfacerM   _attn_implementationr   r   r}   r|   reshaper   r   )r2   r   r   r   r{   r~   is_cross_attentioninput_shapehidden_shapequery_statesr   curr_past_key_valuescurrent_states
key_statesvalue_stateskv_shapeattention_interfacer   r   s                      r4   r9   PLBartAttention.forward   sc    .T9 $))#2.88b8$--8 {{=166|DNNqRST
&/+>??,77;;DNNK
%+:+P+P(+:+O+O('6$-?)]/"=*-44T^^DIIJ/66t~~FMML^4J;;~6LF--cr2FBFFH#2<<QBJ',,X6@@AFL*+?+F+Fzaeaoao+p(
%*_FY*Z*ZAEO..t~~>(?(M(MKK,,.E)
 %8	%
  $}}C$,,LL	%
 	%
!\ "));;;;FFHmmK0L((r6   )rM   r}   r   r   r   r   r   r   r   r   r   r|   r   )r   FTFNNNNN)r>   r?   r@   rA   rB   rC   rD   boolr&   r1   rE   rF   r   r   r   tupler9   rG   rH   rI   s   @r4   r   r      s   G  &* $%C%C %C 	%C
 %C %C %C t#%C :%C %CT 15(,.2H)||H)  ,,-H) 	H)
 t+H) -.H) 
u||U\\D00	1H) H)r6   r   c                      ^  \ rS rSrSS\S\S-  4U 4S jjjrS\R                  S\R                  S\	\
   S	\R                  4S
 jrSrU =r$ )rP   i  NrM   r   c                 l  > [         TU ]  5         UR                  U l        [	        U R                  UR
                  UR                  UUS9U l        [        R                  " U R                  5      U l
        UR                  U l        [        UR                     U l        UR                  U l        [        R                   " U R                  UR"                  5      U l        [        R                   " UR"                  U R                  5      U l        [        R                  " U R                  5      U l        g )N)r   r   r}   rM   r   )r0   r1   d_modelr   r   encoder_attention_headsattention_dropout	self_attnr   	LayerNormself_attn_layer_normr}   r
   activation_functionactivation_fnactivation_dropoutr   encoder_ffn_dimfc1fc2final_layer_normr2   rM   r   r3   s      r4   r1   PLBartEncoderLayer.__init__  s    (nn44,,
 %'LL$@!~~#F$>$>?"(";";99T^^V-C-CD99V33T^^D "T^^ <r6   r   r{   r~   r   c                 2   UnU R                   " U4SU0UD6u  p[        R                  R                  XR                  U R                  S9nXA-   nU R                  U5      nUnU R                  U R                  U5      5      n[        R                  R                  XR                  U R                  S9nU R                  U5      n[        R                  R                  XR                  U R                  S9nXA-   nU R                  U5      nUR                  [        R                  :X  al  [        R                  " U5      R                  5       (       dC  [        R                   " UR                  5      R"                  S-
  n[        R$                  " X* US9nU$ )Nr{   r   i  )minmax)r   r   r   r}   r   r   r   r   r   r   r   rl   rE   float16isfiniteallfinfor   clamp)r2   r   r{   r~   residual_clamp_values          r4   r9   PLBartEncoderLayer.forward  s[    !>>
)
 

 --m||VZVcVc-d 011-@ **488M+BC--m?V?Vaeanan-o/--m||VZVcVc-d 0--m<%--/}8U8Y8Y8[8[++m&9&9:>>EK!KK<[YMr6   )	r   r   r}   r   r   r   r   r   r   r/   )r>   r?   r@   rA   r&   rC   r1   rE   FloatTensorr   r   rF   r9   rG   rH   rI   s   @r4   rP   rP     se    =| =d
 = =&(( )) +,	
 
 r6   rP   c                      ^  \ rS rSrSr\\S.rS\4U 4S jjr	\
\\   SS\R                  S-  S\R                  S-  S	\R                   S-  S
\\   S\4
S jj5       5       5       rSrU =r$ )PLBartEncoderi9  z
Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
[`PLBartEncoderLayer`].

Args:
    config: PLBartConfig
    embed_tokens (nn.Embedding): output embedding
)r   
attentionsrM   c           
        > [         TU ]  U5        UR                  U l        UR                  U l        UR
                  nUR                  U l        UR                  U l	        UR                  (       a  [        R                  " U5      OSn[        UR                  X R                  US9U l        [!        UR                  U5      U l        [$        R&                  " [)        UR*                  5       Vs/ s H  n[-        XS9PM     sn5      U l        [$        R0                  " U5      U l        SU l        U R7                  5         g s  snf Nr<   r;   )r   F)r0   r1   r}   encoder_layerdrop	layerdropr   pad_token_idr,   max_position_embeddingsmax_source_positionsscale_embeddingmathsqrtr(   
vocab_sizeembed_tokensrc   embed_positionsr   
ModuleListrangeencoder_layersrP   r   r   layernorm_embeddinggradient_checkpointing	post_init)r2   rM   r   r-   ir3   s        r4   r1   PLBartEncoder.__init__H  s    ~~11NN	!..$*$B$B!.4.D.Ddii	*#5y*:*:
  @** 
 mmV[\b\q\qVr$sVrQR%7%LVr$st#%<<	#: &+# %ts   0ENr7   r{   inputs_embedsr~   r   c                 h   US L US L-  (       a  [        S5      eUc  U R                  U5      nU R                  US S 2S S 2S4   5      nUR                  UR                  5      nX5-   nU R                  U5      n[        R                  R                  X`R                  U R                  S9n[        U R                  UUS9n[        U R                  5       HR  u  pxSn	U R                  (       a'  [        R                  " / 5      n
XR                   :  a  Sn	U	(       a  MH  U" UU40 UD6nMT     [#        US9$ )Nz:You must specify exactly one of input_ids or inputs_embedsrn   r   )rM   r   r{   FT)last_hidden_state)r   r   r   torm   r   r   r   r}   r   r   rM   	enumerater   rE   randr   r   )r2   r7   r{   r   r~   	embed_posr   idxencoder_layerto_dropdropout_probabilitys              r4   r9   PLBartEncoder.forwardb  s,    -t";<YZZ  --i8M((q!Rx)@A	LL!5!56	%100?--m||VZVcVc-d2;;')

 #,DKK"8CG}}&+jjn#&7"G7 -!"! ! #9 +
 	
r6   )	r}   r   r   r   r   r   r   r   r,   r   )r>   r?   r@   rA   rB   rP   r   _can_record_outputsr&   r1   r"   r$   r   rE   
LongTensorrF   r   r   r   r   r9   rG   rH   rI   s   @r4   r   r   9  s     ,%
| 4   .2.226	*
##d**
 t+*
 ((4/	*

 +,*
 
*
    *
r6   r   c                      ^  \ rS rSrSS\S\S-  4U 4S jjjr     SS\R                  S\R                  S-  S\R                  S-  S	\R                  S-  S
\	S-  S\
S-  S\\   S\R                  4S jjrSrU =r$ )rO   i  NrM   r   c           
        > [         TU ]  5         UR                  U l        [	        U R                  UR
                  UR                  SSUUS9U l        UR                  U l        [        UR                     U l        UR                  U l        [        R                  " U R                  5      U l        [	        U R                  UR
                  UR                  SUUS9U l        [        R                  " U R                  5      U l        [        R$                  " U R                  UR&                  5      U l        [        R$                  " UR&                  U R                  5      U l        [        R                  " U R                  5      U l        g )NT)r   r   r}   r   r   rM   r   )r}   r   rM   r   )r0   r1   r   r   r   decoder_attention_headsr   r   r}   r
   r   r   r   r   r   r   encoder_attnencoder_attn_layer_normr   decoder_ffn_dimr   r   r   r   s      r4   r1   PLBartDecoderLayer.__init__  s    (nn44,,
 ~~#F$>$>?"(";";$&LL$@!+NN**,,
 (*||DNN'C$99T^^V-C-CD99V33T^^D "T^^ <r6   r   r{   encoder_hidden_statesencoder_attention_maskr   	use_cacher~   r   c                    UnU R                   " U4UUS.UD6u  p[        R                  R                  XR                  U R                  S9nX-   nU R                  U5      nUbb  UnU R                  " U4UUUS.UD6u  p[        R                  R                  XR                  U R                  S9nX-   nU R                  U5      nUnU R                  U R                  U5      5      n[        R                  R                  XR                  U R                  S9nU R                  U5      n[        R                  R                  XR                  U R                  S9nX-   nU R                  U5      nU$ )N)r   r{   r   )r   r{   r   )r   r   r   r}   r   r   r  r  r   r   r   r   r   )
r2   r   r{   r  r  r   r  r~   r   r   s
             r4   r9   PLBartDecoderLayer.forward  s    !  >>
+)
 	
 --m||VZVcVc-d 011-@ !,$H#00 !65 /	 
  M MM11-<<Z^ZgZg1hM$4M 88GM !**488M+BC--m?V?Vaeanan-o/--m||VZVcVc-d 0--m<r6   )r   r   r}   r   r  r  r   r   r   r   r   r/   )NNNNT)r>   r?   r@   rA   r&   rC   r1   rE   rF   r   r   r   r   r9   rG   rH   rI   s   @r4   rO   rO     s    =| =d
 = =D /3596:(,!%/||/ t+/  %||d2	/
 !&t 3/ / $;/ +,/ 
/ /r6   rO   c                   J  ^  \ rS rSrSr\\" \SSS9\" \SSS9S.rS\	4U 4S	 jjr
\\\       SS\R                  S
-  S\R                   S
-  S\R"                  S
-  S\R                  S
-  S\S
-  S\R"                  S
-  S\S
-  S\\   S\4S jj5       5       5       rSrU =r$ )PLBartDecoderi  z
Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`PLBartDecoderLayer`]

Args:
    config: PLBartConfig
    embed_tokens (nn.Embedding): output embedding
r%   r   )index
layer_namer  )r   r   cross_attentionsrM   c           
        > [         TU ]  U5        UR                  U l        UR                  U l        UR
                  U l        UR                  U l        UR                  (       a   [        R                  " UR                  5      OSn[        UR                  UR                  U R                  US9U l        [!        UR                  UR                  5      U l        [$        R&                  " [)        UR*                  5       Vs/ s H  n[-        XS9PM     sn5      U l        [$        R0                  " UR                  5      U l        SU l        U R7                  5         g s  snf r   )r0   r1   r}   decoder_layerdropr   r   r,   r   max_target_positionsr   r   r   r   r(   r   r   rc   r   r   r   r   decoder_layersrO   r   r   r   r   r   )r2   rM   r-   r   r3   s       r4   r1   PLBartDecoder.__init__  s
    ~~11!..$*$B$B!393I3Idii/s5v~~t/?/?[
  @**NN 
 mmV[\b\q\qVr$sVrQR%7%LVr$st#%<<#? &+# %ts   ENr7   r{   r  r  r   r   r  r~   r   c                    US L US L-  (       a  [        S5      eUc  U R                  U5      nU(       ab  Uc_  Uc  U R                  R                  (       a.  [	        [        U R                  S9[        U R                  S95      O[        U R                  S9nUR                  5       S S u  pUb  UR                  5       OSn[        R                  " XR                  S9U-   nUc2  [        5       (       d#  X-   n[        R                  " XUR                  S9n[        U[        5      (       a  UR                  OUn[        U R                  UUUS9n[!        U R                  UUUS9nU R#                  [$        XS9nUR'                  UR                  5      nXo-   nU R)                  U5      n[*        R,                  R/                  UU R.                  U R0                  S	9n[3        U R4                  5       HN  u  nnU R0                  (       a(  [        R6                  " / 5      nUU R8                  :  a  M?  U" UUU4UUUS
.UD6nMP     [;        UUS9$ )NzJYou must specify exactly one of decoder_input_ids or decoder_inputs_embeds)rM   rn   r   rm   )rM   r   r{   r   )rM   r   r{   r  )rj   r   )r  r   r  )r   r   )r   r   rM   is_encoder_decoderr   r   r   get_seq_lengthrE   rp   rm   r   onesrS   r   r   r   r   inputr   r   r   r   r}   r   r   r   r   r   r   )r2   r7   r{   r  r  r   r   r  r~   
batch_size
seq_lengthri   rj   mask_seq_lengthself_attn_cache	positionsr   r  decoder_layerr  s                       r4   r9   PLBartDecoder.forward  sD    -t";<ijj  --i8M 0 )48V8V $L$DlZ^ZeZeFfg!5  "/!3!3!5cr!:
ETE`!?!?!Afg||J7K7KLOee!*B*D*D4AO"ZZ
ML`L`aN /+>?? 00  	 ,;;')+	
 ";;;'1"7	"
 ((0F(b	LL!5!56	%100?--mt||VZVcVc-d"+DKK"8C}}&+jjn#&7)% (> /# M #9" 9++
 	
r6   )	r}   r   r   r   r   r   r   r  r,   )NNNNNNN)r>   r?   r@   rA   rB   rO   r#   r   r  r&   r1   r"   r$   r   rE   r  rF   r   r   r   r   r   r   r9   rG   rH   rI   s   @r4   r  r    s    ,$_A+V*?!P^_| 0   .2.2:>:>(,26!%R
##d*R
 t+R
  %0047	R

 !& 0 04 7R
 R
 ((4/R
 $;R
 +,R
 
3R
    R
r6   r  r7   r   c                 `   U R                  5       nUc  [        S5      eUR                  US:H  U5        UR                  U5      R	                  SS9S-
  R                  S5      nUR                  SU5      R                  5       nUSS2SS24   R                  5       USS2SS24'   XBSS2S4'   U$ )z
Shift input ids one token to the right, and wrap the last non pad token (the <LID> token) Note that PLBart does not
have a single `decoder_start_token_id` in contrast to other Bart-like models.
Nz1self.model.config.pad_token_id has to be defined.ir%   r   rn   r   )cloner   masked_fill_nesumrt   gathersqueeze)r7   r   prev_output_tokensindex_of_eosdecoder_start_tokenss        r4   shift_tokens_rightr5  c  s    
 #*LMM##$6$$>M&)),7;;;BQFQQRTUL-44QEMMO 21crc6 : @ @ Bq!"u3q!tr6   c                     ^  \ rS rSrSSS.rS\4U 4S jjrS rS r\	\
\         SS	\R                  S-  S
\R                  S-  S\R                  S-  S\R                  S-  S\\R                      S-  S\S-  S\R                   S-  S\R                   S-  S\S-  S\\   S\\R                     \-  4S jj5       5       5       rSrU =r$ )PLBartModeliw  zshared.weight)zencoder.embed_tokens.weightzdecoder.embed_tokens.weightrM   c                 J  > [         TU ]  U5        UR                  UR                  p2UR                  (       a   [
        R                  " UR                  5      OSn[        X1R                  X$S9U l	        [        U5      U l        [        U5      U l        U R                  5         g )Nr<   r;   )r0   r1   r   r   r   r   r   r   r(   sharedr   encoderr  decoderr   )r2   rM   r,   r   r-   r3   s        r4   r1   PLBartModel.__init__~  ss     "("5"5v7H7HZ393I3Idii/s/
NNKq$V,$V,r6   c                     U R                   $ r/   )r9  r2   s    r4   get_input_embeddings PLBartModel.get_input_embeddings  s    {{r6   c                 |    Xl         U R                   U R                  l        U R                   U R                  l        g r/   )r9  r:  r   r;  r2   rz   s     r4   set_input_embeddings PLBartModel.set_input_embeddings  s'    $(KK!$(KK!r6   Nr7   r{   decoder_input_idsdecoder_attention_maskencoder_outputsr   r   decoder_inputs_embedsr  r~   r   c
                    Uc"  Uc  [        XR                  R                  5      nUc  U R                  " S	UUUS.U
D6nOK[	        U[
        5      (       d6  [        US   [        U5      S:  a  US   OS[        U5      S:  a  US   OSS9nU R                  " S	UUUS   UUUU	S.U
D6n[        UR                  UR                  UR                  UR                  UR                  UR                  UR                  UR                  S9$ )
a  
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Indices of decoder input sequence tokens in the vocabulary.

    Indices can be obtained using [`AutoTokenizer`] or [`PLBartMultiTokenizer`] depending on the checkpoint.
    See [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details.

    [What are decoder input IDs?](../glossary#decoder-input-ids)

    PLBart uses a specific language id token as the starting token for `decoder_input_ids` generation that
    varies according to source and target language, *e.g.* 50003 for *en_XX*, and 50001 for *java*. If
    `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
    `past_key_values`).

    For translation and summarization training, `decoder_input_ids` should be provided. If no
    `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
    for denoising pre-training following the paper.
decoder_attention_mask (:
    obj:*torch.LongTensor* of shape `(batch_size, target_sequence_length)`, *optional*):
    Default behavior:
    generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also be used by default.
N)r7   r{   r   r   r%   rf   )r   r   r   r7   r{   r  r  r   r   r  )r   r   decoder_hidden_statesdecoder_attentionsr  encoder_last_hidden_stater  encoder_attentionsrZ   )r5  rM   r   r:  rS   r   lenr;  r   r   r   r   r   r  )r2   r7   r{   rE  rF  rG  r   r   rH  r  r~   decoder_outputss               r4   r9   PLBartModel.forward  s0   P $)>)F 29kk>V>V W"/3|| 0#-+0 	0O O_==-"1!"4474H14Loa0RV14_1E1I?1-tO ,, 	
'1"1!"4#1+/	
 	
 "-??+;;"1"?"?.99,==&5&G&G"1"?"?.99	
 		
r6   )r;  r:  r9  	NNNNNNNNN)r>   r?   r@   rA   _tied_weights_keysr&   r1   r?  rC  r"   r$   r   rE   r  rF   listr   r   r   r   r   r   r   r9   rG   rH   rI   s   @r4   r7  r7  w  sP    (7'6

| 
0
   .226596::>(,26:>!%J
##d*J
 ((4/J
 !++d2	J

 !&t 3J
 e//047J
 J
 ((4/J
  %0047J
 $;J
 +,J
 
u||	1	1J
    J
r6   r7  zv
    The PLBART Model with a language modeling head. Can be used for code-to-text, text-to-code and code-to-code.
    )custom_introc                   .  ^  \ rS rSrSrS/rSS0rS\4U 4S jjr SS	\	S
\	S-  S\
S\R                  4U 4S jjjrS	\	SS4S jr\\\          SS\R&                  S-  S\R&                  S-  S\R&                  S-  S\R(                  S-  S\\R,                     S-  S\S-  S\R,                  S-  S\R,                  S-  S\R(                  S-  S\
S-  S\\   S\\R(                     \-  4S jj5       5       5       rS\R(                  4S jrSrU =r$ )rT   i  rN   rW   lm_head.weightzmodel.shared.weightrM   c                 v  > [         TU ]  U5        [        U5      U l        U R	                  S[
        R                  " SU R                  R                  R                  45      5        [        R                  " UR                  U R                  R                  R                  SS9U l        U R                  5         g )NrW   r%   Fr   )r0   r1   r7  rN   register_bufferrE   zerosr9  r*   r   r   r   lm_headr   r2   rM   r3   s     r4   r1   'PLBartForConditionalGeneration.__init__  s~      (
0%++q$**BSBSBbBb>c2deyy1B1B1Q1QX]^r6   Nnew_num_tokenspad_to_multiple_ofmean_resizingr   c                 x   > [         TU ]  XU5      nU R                  UR                  R                  S   5        U$ )Nr   )r0   resize_token_embeddings_resize_final_logits_biasrr   ro   )r2   r^  r_  r`  new_embeddingsr3   s        r4   rb  6PLBartForConditionalGeneration.resize_token_embeddings  s<     8]jk&&~'<'<'B'B1'EFr6   c                 ,   U R                   R                  S   nX::  a  U R                   S S 2S U24   nON[        R                  " SX-
  4U R                   R                  S9n[        R
                  " U R                   U/SS9nU R                  SU5        g )Nrn   r%   r  r   rW   )rW   ro   rE   rZ  rm   catrY  )r2   r^  old_num_tokensnew_bias
extra_biass        r4   rc  8PLBartForConditionalGeneration._resize_final_logits_bias  s    //55b9+--a..@AHa)H%IRVRhRhRoRopJyy$"8"8*!E1MH0(;r6   r7   r{   rE  rF  rG  r   r   rH  labelsr  r~   c                 R   U	b%  Uc"  Uc  [        XR                  R                  5      nU R                  " U4UUUUUUUU
S.UD6nU R	                  UR
                  5      nXR                  R                  UR                  5      -   nSnU	bF  [        5       nU" UR                  SU R                  R                  5      U	R                  S5      5      n[        UUUR                  UR                  UR                  UR                   UR"                  UR$                  UR&                  S9	$ )a	  
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Indices of decoder input sequence tokens in the vocabulary.

    Indices can be obtained using [`AutoTokenizer`] or [`PLBartMultiTokenizer`] depending on the checkpoint.
    See [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details.

    [What are decoder input IDs?](../glossary#decoder-input-ids)

    PLBart uses a specific language id token as the starting token for `decoder_input_ids` generation that
    varies according to source and target language, *e.g.* 50003 for *en_XX*, and 50001 for *java*. If
    `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
    `past_key_values`).

    For translation and summarization training, `decoder_input_ids` should be provided. If no
    `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
    for denoising pre-training following the paper.
decoder_attention_mask (:
    obj:*torch.LongTensor* of shape `(batch_size, target_sequence_length)`, *optional*):
    Default behavior:
    generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also be used by default.
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

Example Mask-filling:

```python
>>> from transformers import AutoTokenizer, PLBartForConditionalGeneration

>>> model = PLBartForConditionalGeneration.from_pretrained("uclanlp/plbart-base")
>>> tokenizer = AutoTokenizer.from_pretrained("uclanlp/plbart-base")

>>> # en_XX is the language symbol id <LID> for English
>>> TXT = "<s> Is 0 the <mask> Fibonacci number ? </s> en_XX"
>>> input_ids = tokenizer([TXT], add_special_tokens=False, return_tensors="pt").input_ids

>>> logits = model(input_ids).logits
>>> masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item()
>>> probs = logits[0, masked_index].softmax(dim=0)
>>> values, predictions = probs.topk(5)

>>> tokenizer.decode(predictions).split()
['first', 'same', 'highest', 'result', 'number']
```
N)r{   rE  rG  rF  r   r   rH  r  rn   	losslogitsr   rK  rL  r  rM  r  rN  )r5  rM   r   rN   r[  r   rW   r   rm   r   r   r   r   r   rK  rL  r  rM  r  rN  )r2   r7   r{   rE  rF  rG  r   r   rH  rl  r  r~   outputs	lm_logitsmasked_lm_lossloss_fcts                   r4   r9   &PLBartForConditionalGeneration.forward  s,   @  (-B-J$6v{{?W?W$X!&*jj'
)/+#9+'"7'
 '
 LL!:!:;	 6 6 9 9):J:J KK	')H%innR9O9O&PRXR]R]^`RabN#33")"?"?&99$55&-&G&G")"?"?&99

 
	
r6   c                 @    [        XR                  R                  5      $ r/   )r5  rM   r   )r2   rl  s     r4   %prepare_decoder_input_ids_from_labelsDPLBartForConditionalGeneration.prepare_decoder_input_ids_from_labelsj  s    !&++*B*BCCr6   r[  rN   )NT)
NNNNNNNNNN) r>   r?   r@   rA   r\   _keys_to_ignore_on_load_missingrS  r&   r1   rC   r   r   	Embeddingrb  rc  r"   r$   r   rE   r  rF   rT  r   r   r   r   r   r   r9   rw  rG   rH   rI   s   @r4   rT   rT     s     ':&;#/|  ae!7:TzY]	 < < <   .226596::>(,26:>&*!%_
##d*_
 ((4/_
 !++d2	_

 !&t 3_
 e//047_
 _
 ((4/_
  %0047_
 t#_
 $;_
 +,_
 
u||		._
    _
BDELL D Dr6   rT   c                   z   ^  \ rS rSrSrS\S\S\S\4U 4S jjrS\R                  S	\R                  4S
 jr
SrU =r$ )PLBartClassificationHeadin  z-Head for sentence-level classification tasks.	input_dim	inner_dimnum_classespooler_dropoutc                    > [         TU ]  5         [        R                  " X5      U l        [        R
                  " US9U l        [        R                  " X#5      U l        g )N)r   )r0   r1   r   r   denseDropoutr}   r   )r2   r~  r  r  r  r3   s        r4   r1   !PLBartClassificationHead.__init__q  s@     	YYy4
zzN3		)9r6   r   r   c                     U R                  U5      nU R                  U5      n[        R                  " U5      nU R                  U5      nU R	                  U5      nU$ r/   )r}   r  rE   tanhr   )r2   r   s     r4   r9    PLBartClassificationHead.forward}  sN    ]3

=1

=1]3m4r6   )r  r}   r   r=   rI   s   @r4   r}  r}  n  sQ    7
:
: 
: 	
:
 
:U\\ ell  r6   r}  z
    PLBart model with a sequence classification/head on top (a linear layer on top of the pooled output) e.g.
    for GLUE tasks.
    c                   x  ^  \ rS rSrS\4U 4S jjr\\         SS\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S	\\R                     S-  S
\R                  S-  S\R                  S-  S\R                  S-  S\S-  S\\   S\\-  4S jj5       5       rSrU =r$ )PLBartForSequenceClassificationi  rM   c                    > [         TU ]  " U40 UD6  [        U5      U l        [	        UR
                  UR
                  UR                  UR                  5      U l        U R                  5         g r/   )
r0   r1   r7  rN   r}  r   
num_labelsclassifier_dropoutclassification_headr   )r2   rM   r~   r3   s      r4   r1   (PLBartForSequenceClassification.__init__  sZ    *6* (
#;NNNN%%	$
  	r6   Nr7   r{   rE  rF  rG  r   rH  rl  r  r~   r   c
                    Ub  Sn	Uc%  Ub"  [        SU R                  R                   35      eU R                  " U4UUUUUUU	S.U
D6nUS   nUR	                  U R
                  R                  5      R                  UR                  5      n[        [        R                  " UR                  S5      5      R                  5       S:H  S5        XSS24   R                  UR                  S5      SUR                  S5      5      SS2SSS24   nU R!                  U5      nSnUGb  UR                  UR                  5      nU R
                  R"                  c  U R
                  R$                  S:X  a  S	U R
                  l        OyU R
                  R$                  S:  aN  UR&                  [        R(                  :X  d  UR&                  [        R*                  :X  a  S
U R
                  l        OSU R
                  l        U R
                  R"                  S	:X  aS  [-        5       nU R
                  R$                  S:X  a&  U" UR/                  5       UR/                  5       5      nOU" X5      nOU R
                  R"                  S
:X  aG  [1        5       nU" UR                  SU R
                  R$                  5      UR                  S5      5      nO,U R
                  R"                  S:X  a  [3        5       nU" X5      n[5        UUUR6                  UR8                  UR:                  UR<                  UR>                  UR@                  URB                  S9	$ )a	  
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Indices of decoder input sequence tokens in the vocabulary.

    Indices can be obtained using [`AutoTokenizer`] or [`PLBartMultiTokenizer`] depending on the checkpoint.
    See [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details.

    [What are decoder input IDs?](../glossary#decoder-input-ids)

    PLBart uses a specific language id token as the starting token for `decoder_input_ids` generation that
    varies according to source and target language, *e.g.* 50003 for *en_XX*, and 50001 for *java*. If
    `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
    `past_key_values`).

    For translation and summarization training, `decoder_input_ids` should be provided. If no
    `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
    for denoising pre-training following the paper.
decoder_attention_mask (:
    obj:*torch.LongTensor* of shape `(batch_size, target_sequence_length)`, *optional*):
    Default behavior:
    generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also be used by default.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
NFz8Passing input embeddings is currently not supported for )r{   rE  rF  rG  r   rH  r  r   r%   z7All examples must have the same number of <eos> tokens.rn   
regressionsingle_label_classificationmulti_label_classificationrn  )"NotImplementedErrorr3   r>   rN   eqrM   eos_token_idr   rm   r!   rE   unique_consecutiver/  numelr   r   r  problem_typer  rl   rq   rC   r   r1  r   r   r   r   rK  rL  r  rM  r  rN  )r2   r7   r{   rE  rF  rG  r   rH  rl  r  r~   rq  r   eos_masksentence_representationrp  ro  rt  s                     r4   r9   'PLBartForSequenceClassification.forward  s   P I!:%J4>>KbKbJcd  '+jj
'
)/#9+'"7
'
 
'
  
<< 8 89<<]=Q=QR$$X\\!_5;;=BE	
 #0!"<"A"A-BTBTUVBWY[]j]o]opr]s"tr1H#
 ))*ABYYv}}-F{{''/;;))Q./;DKK,[[++a/V\\UZZ5OSYS_S_chclclSl/LDKK,/KDKK,{{''<7"9;;))Q.#FNN$4fnn6FGD#F3D))-JJ+-B0F0F GUWY))-II,./.#33")"?"?&99$55&-&G&G")"?"?&99

 
	
r6   )r  rN   rR  )r>   r?   r@   rA   r&   r1   r   r   rE   r  rF   rT  r   r   r   r   r   r   r9   rG   rH   rI   s   @r4   r  r    s,   |   .2.259:>:>26:>*.!%h
##d*h
 t+h
 !++d2	h

 !& 0 04 7h
 e//047h
 ((4/h
  %0047h
   4'h
 $;h
 +,h
 
0	0h
  h
r6   r  c                   2   ^  \ rS rSrSrU 4S jrS rSrU =r$ )PLBartDecoderWrapperi  z
This wrapper class is a helper class to correctly load pretrained checkpoints when the causal language model is
used in combination with the [`EncoderDecoderModel`] framework.
c                 d   > [         TU ]  U5        [        U5      U l        U R	                  5         g r/   )r0   r1   r  r;  r   r\  s     r4   r1   PLBartDecoderWrapper.__init__  s&     $V,r6   c                 &    U R                   " U0 UD6$ r/   r;  )r2   argsr~   s      r4   r9   PLBartDecoderWrapper.forward  s    ||T,V,,r6   r  )	r>   r?   r@   rA   rB   r1   r9   rG   rH   rI   s   @r4   r  r    s    

- -r6   r  zw
    PLBART decoder with a language modeling head on top (linear layer with weights tied to the input embeddings).
    c                   j  ^  \ rS rSrSS0rU 4S jrS rS r\\	         SS\
R                  S-  S	\
R                  S-  S
\
R                  S-  S\
R                  S-  S\S-  S\
R                  S-  S\
R                  S-  S\S-  S\\
R                  -  S\\   S\\-  4S jj5       5       rSrU =r$ )PLBartForCausalLMi  rW  z!model.decoder.embed_tokens.weightc                    > SUl         SUl        [        TU ]  U5        [	        U5      U l        [        R                  " UR                  UR                  SS9U l
        U R                  5         g )NTFr   )r   r   r0   r1   r  rN   r   r   hidden_sizer   r[  r   r\  s     r4   r1   PLBartForCausalLM.__init__   sX     $)! )&1
yy!3!3V5F5FUS 	r6   c                 B    U R                   R                  R                  $ r/   rN   r;  r   r>  s    r4   r?  &PLBartForCausalLM.get_input_embeddings+  s    zz!!...r6   c                 8    XR                   R                  l        g r/   r  rB  s     r4   rC  &PLBartForCausalLM.set_input_embeddings.  s    */

'r6   Nr7   r{   r  r  r   r   rl  r  logits_to_keepr~   r   c
                 
   U R                   R                  " SUUUUUUUS.U
D6nUS   n[        U	[        5      (       a  [	        U	* S5      OU	nU R                  USS2USS24   5      nSnUba  UR                  UR                  5      n[        5       nU" UR                  SU R                  R                  5      UR                  S5      5      n[        UUUR                  UR                  UR                  UR                   S9$ )a  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

Example:

```python
>>> from transformers import AutoTokenizer, PLBartForCausalLM

>>> tokenizer = AutoTokenizer.from_pretrained("uclanlp/plbart-base")
>>> model = PLBartForCausalLM.from_pretrained("uclanlp/plbart-base")
>>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder."
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
>>> outputs = model(**inputs)

>>> logits = outputs.logits
>>> expected_shape = [1, inputs.input_ids.shape[-1], model.config.vocab_size]
>>> list(logits.shape) == expected_shape
True
```rJ  r   Nrn   )ro  rp  r   r   r   r  rZ   )rN   r;  rS   rC   slicer[  r   rm   r   r   rM   r   r   r   r   r   r  )r2   r7   r{   r  r  r   r   rl  r  r  r~   rq  r   slice_indicesrp  ro  rt  s                    r4   r9   PLBartForCausalLM.forward1  s   L >BZZ=O=O 	>
)"7#9+'	>
 	>
  
8B>SV8W8W~ot4]kmA}a,?@AYYv}}-F')HFKKDKK,B,BCV[[QS_UD0#33!//))$55
 	
r6   ry  )	NNNNNNNNr   )r>   r?   r@   rA   rS  r1   r?  rC  r   r   rE   r  rF   r   r   r   rC   r   r   r   r   r9   rG   rH   rI   s   @r4   r  r    s1    	=	/0  .2.2:>;?(,26*.!%-.A
##d*A
 t+A
  %0047	A

 !& 1 1D 8A
 A
 ((4/A
   4'A
 $;A
 ell*A
 +,A
 
2	2A
  A
r6   r  )r  rT   r  r7  rK   )Nr   )Nr   collections.abcr   rE   r   torch.nnr   r   r    r	   rU   activationsr
   cache_utilsr   r   r   
generationr   masking_utilsr   r   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   r   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r    r!   utils.genericr"   utils.output_capturingr#   r$   configuration_plbartr&   
get_loggerr>   r   r{  r(   rK   rc   ModulerF   rD   r   r   rP   r   rO   r  rC   r5  r7  rT   r}  r  r  r  __all__rZ   r6   r4   <module>r     sT  *  $   A A & ! C C ) J B 9  G &  8 E . 
		H	%
= 
= 2O 2 2;r|| ;B !%II%<<% 
% <<	%
 LL4'% T\% % '(%8r)bii r)j03 0fV
) V
rO3 Od|
) |
~%,, c ( g
' g
 g
T 
DD%:O DD
DDNryy 0 x
&; x
x
v-0 - 
Y
- Y

Y
xr6   