
    Z jC                        S r SSKrSSKrSSKJr  SSKJr  SSKJr  SSK	J
r
  SSKJr  SS	KJrJrJr  SS
KJr  SSKJr  SSKJrJrJr  SSKJr  SSKJr  SSKJrJrJ r J!r!J"r"  SSK#J$r$  SSK%J&r&  SSK'J(r(   " S S\"5      r)\ " S S\5      5       r* " S S\ 5      r+ " S S\5      r,\ " S S\*5      5       r-\" SS 9 " S! S"\*\5      5       r. " S# S$\5      r/ " S% S&\$5      r0 " S' S(\!5      r1/ S)Qr2g)*zPyTorch PLBART model.    N)nn)CrossEntropyLoss   )initialization)Cache)GenerationMixin)BaseModelOutputSeq2SeqLMOutputSeq2SeqModelOutput)PreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuple)merge_with_config_defaults)capture_outputs   )BartClassificationHeadBartDecoderBartEncoderBartForCausalLMBartScaledWordEmbedding)'BigBirdPegasusForSequenceClassification)shift_tokens_right   )PLBartConfigc                       \ rS rSrSrg)PLBartScaledWordEmbedding/    N__name__
__module____qualname____firstlineno____static_attributes__r        z/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/plbart/modular_plbart.pyr   r   /       r'   r   c                   P   ^  \ rS rSr% \\S'   SrSrSS/rSr	Sr
SrU 4S jrSrU =r$ )	PLBartPreTrainedModel3   configmodelTPLBartDecoderLayerPLBartEncoderLayerc                    > [         TU ]  U5        [        U[        5      (       a!  [        R
                  " UR                  5        g g N)super_init_weights
isinstancePLBartForConditionalGenerationinitzeros_final_logits_bias)selfmodule	__class__s     r(   r4   #PLBartPreTrainedModel._init_weights=   s5    f%f<==KK001 >r'   r    )r"   r#   r$   r%   r   __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modules_supports_flash_attn_supports_sdpa_supports_flex_attnr4   r&   __classcell__r<   s   @r(   r+   r+   3   s<    &*#-/CDN2 2r'   r+   c                       \ rS rSrSrg)PLBartEncoderC   r    Nr!   r    r'   r(   rH   rH   C   r)   r'   rH   c                       \ rS rSrSrg)PLBartDecoderG   r    Nr!   r    r'   r(   rK   rK   G   r)   r'   rK   c                     ^  \ rS rSrSSS.rS\4U 4S jjrS rS r\	\
\         SS	\R                  S-  S
\R                  S-  S\R                  S-  S\R                  S-  S\\R                      S-  S\S-  S\R                   S-  S\R                   S-  S\S-  S\\   S\\R                     \-  4S jj5       5       5       rSrU =r$ )PLBartModelK   zshared.weight)zencoder.embed_tokens.weightzdecoder.embed_tokens.weightr-   c                 J  > [         TU ]  U5        UR                  UR                  p2UR                  (       a   [
        R                  " UR                  5      OSn[        X1R                  X$S9U l	        [        U5      U l        [        U5      U l        U R                  5         g )Ng      ?)embed_scale)r3   __init__pad_token_id
vocab_sizescale_embeddingmathsqrtd_modelr   sharedrH   encoderrK   decoder	post_init)r:   r-   padding_idxrT   rQ   r<   s        r(   rR   PLBartModel.__init__R   ss     "("5"5v7H7HZ393I3Idii/s/
NNKq$V,$V,r'   c                     U R                   $ r2   )rY   )r:   s    r(   get_input_embeddings PLBartModel.get_input_embeddings^   s    {{r'   c                 |    Xl         U R                   U R                  l        U R                   U R                  l        g r2   )rY   rZ   embed_tokensr[   )r:   values     r(   set_input_embeddings PLBartModel.set_input_embeddingsa   s'    $(KK!$(KK!r'   N	input_idsattention_maskdecoder_input_idsdecoder_attention_maskencoder_outputspast_key_valuesinputs_embedsdecoder_inputs_embeds	use_cachekwargsreturnc
                    Uc"  Uc  [        XR                  R                  5      nUc  U R                  " S	UUUS.U
D6nOK[	        U[
        5      (       d6  [        US   [        U5      S:  a  US   OS[        U5      S:  a  US   OSS9nU R                  " S	UUUS   UUUU	S.U
D6n[        UR                  UR                  UR                  UR                  UR                  UR                  UR                  UR                  S9$ )
a  
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Indices of decoder input sequence tokens in the vocabulary.

    Indices can be obtained using [`AutoTokenizer`] or [`PLBartMultiTokenizer`] depending on the checkpoint.
    See [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details.

    [What are decoder input IDs?](../glossary#decoder-input-ids)

    PLBart uses a specific language id token as the starting token for `decoder_input_ids` generation that
    varies according to source and target language, *e.g.* 50003 for *en_XX*, and 50001 for *java*. If
    `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
    `past_key_values`).

    For translation and summarization training, `decoder_input_ids` should be provided. If no
    `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
    for denoising pre-training following the paper.
decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will
    also be used by default.
        """
        # Unlike most seq2seq models, PLBart builds `decoder_input_ids` from `input_ids` when neither
        # decoder ids nor decoder embeddings are given: the language id token at the end of the source
        # sequence is rotated to the front, e.g. [A, B, </s>, en_XX] -> [en_XX, A, B, </s>].
        if decoder_input_ids is None and decoder_inputs_embeds is None:
            decoder_input_ids = shift_tokens_right(input_ids, self.config.pad_token_id)

        if encoder_outputs is None:
            encoder_outputs = self.encoder(
                input_ids=input_ids,
                attention_mask=attention_mask,
                inputs_embeds=inputs_embeds,
                **kwargs,
            )
        # Wrap legacy tuple-style encoder outputs so the attribute access below works.
        elif not isinstance(encoder_outputs, BaseModelOutput):
            encoder_outputs = BaseModelOutput(
                last_hidden_state=encoder_outputs[0],
                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
            )

        decoder_outputs = self.decoder(
            input_ids=decoder_input_ids,
            attention_mask=decoder_attention_mask,
            encoder_hidden_states=encoder_outputs[0],
            encoder_attention_mask=attention_mask,
            past_key_values=past_key_values,
            inputs_embeds=decoder_inputs_embeds,
            use_cache=use_cache,
            **kwargs,
        )

        return Seq2SeqModelOutput(
            last_hidden_state=decoder_outputs.last_hidden_state,
            past_key_values=decoder_outputs.past_key_values,
            decoder_hidden_states=decoder_outputs.hidden_states,
            decoder_attentions=decoder_outputs.attentions,
            cross_attentions=decoder_outputs.cross_attentions,
            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
            encoder_hidden_states=encoder_outputs.hidden_states,
            encoder_attentions=encoder_outputs.attentions,
        )


@auto_docstring(
    custom_intro="""
    The PLBART Model with a language modeling head. Can be used for code-to-text, text-to-code and code-to-code.
    """
)
class PLBartForConditionalGeneration(PLBartPreTrainedModel, GenerationMixin):
    base_model_prefix = "model"
    _keys_to_ignore_on_load_missing = ["final_logits_bias"]
    _tied_weights_keys = {"lm_head.weight": "model.shared.weight"}

    def __init__(self, config: PLBartConfig):
        super().__init__(config)

        self.model = PLBartModel(config)
        self.register_buffer("final_logits_bias", torch.zeros((1, self.model.shared.num_embeddings)))
        self.lm_head = nn.Linear(config.d_model, self.model.shared.num_embeddings, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def resize_token_embeddings(
        self, new_num_tokens: int, pad_to_multiple_of: int | None = None, mean_resizing: bool = True
    ) -> nn.Embedding:
        new_embeddings = super().resize_token_embeddings(new_num_tokens, pad_to_multiple_of, mean_resizing)
        self._resize_final_logits_bias(new_embeddings.weight.shape[0])
        return new_embeddings

    def _resize_final_logits_bias(self, new_num_tokens: int) -> None:
        old_num_tokens = self.final_logits_bias.shape[-1]
        if new_num_tokens <= old_num_tokens:
            new_bias = self.final_logits_bias[:, :new_num_tokens]
        else:
            extra_bias = torch.zeros((1, new_num_tokens - old_num_tokens), device=self.final_logits_bias.device)
            new_bias = torch.cat([self.final_logits_bias, extra_bias], dim=1)
        self.register_buffer("final_logits_bias", new_bias)

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        attention_mask: torch.Tensor | None = None,
        decoder_input_ids: torch.LongTensor | None = None,
        decoder_attention_mask: torch.LongTensor | None = None,
        encoder_outputs: list[torch.FloatTensor] | None = None,
        past_key_values: Cache | None = None,
        inputs_embeds: torch.FloatTensor | None = None,
        decoder_inputs_embeds: torch.FloatTensor | None = None,
        labels: torch.LongTensor | None = None,
        use_cache: bool | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple[torch.Tensor] | Seq2SeqLMOutput:
        r"""
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Indices of decoder input sequence tokens in the vocabulary.

    Indices can be obtained using [`AutoTokenizer`] or [`PLBartMultiTokenizer`] depending on the checkpoint.
    See [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details.

    [What are decoder input IDs?](../glossary#decoder-input-ids)

    PLBart uses a specific language id token as the starting token for `decoder_input_ids` generation that
    varies according to source and target language, *e.g.* 50003 for *en_XX*, and 50001 for *java*. If
    `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
    `past_key_values`).

    For translation and summarization training, `decoder_input_ids` should be provided. If no
    `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
    for denoising pre-training following the paper.
decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will
    also be used by default.
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

Example Mask-filling:

```python
>>> from transformers import AutoTokenizer, PLBartForConditionalGeneration

>>> model = PLBartForConditionalGeneration.from_pretrained("uclanlp/plbart-base")
>>> tokenizer = AutoTokenizer.from_pretrained("uclanlp/plbart-base")

>>> # en_XX is the language symbol id <LID> for English
>>> TXT = "<s> Is 0 the <mask> Fibonacci number ? </s> en_XX"
>>> input_ids = tokenizer([TXT], add_special_tokens=False, return_tensors="pt").input_ids

>>> logits = model(input_ids).logits
>>> masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item()
>>> probs = logits[0, masked_index].softmax(dim=0)
>>> values, predictions = probs.topk(5)

>>> tokenizer.decode(predictions).split()
['first', 'same', 'highest', 'result', 'number']
```
        """
        # When only `labels` are provided, derive `decoder_input_ids` from them (teacher forcing),
        # mirroring what `PLBartModel.forward` does for `input_ids`.
        if labels is not None:
            if decoder_input_ids is None and decoder_inputs_embeds is None:
                decoder_input_ids = shift_tokens_right(labels, self.config.pad_token_id)

        outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            encoder_outputs=encoder_outputs,
            decoder_attention_mask=decoder_attention_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            decoder_inputs_embeds=decoder_inputs_embeds,
            use_cache=use_cache,
            **kwargs,
        )
        lm_logits = self.lm_head(outputs.last_hidden_state)
        lm_logits = lm_logits + self.final_logits_bias.to(lm_logits.device)

        masked_lm_loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1))

        return Seq2SeqLMOutput(
            loss=masked_lm_loss,
            logits=lm_logits,
            past_key_values=outputs.past_key_values,
            decoder_hidden_states=outputs.decoder_hidden_states,
            decoder_attentions=outputs.decoder_attentions,
            cross_attentions=outputs.cross_attentions,
            encoder_last_hidden_state=outputs.encoder_last_hidden_state,
            encoder_hidden_states=outputs.encoder_hidden_states,
            encoder_attentions=outputs.encoder_attentions,
        )

    def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
        return shift_tokens_right(labels, self.config.pad_token_id)


class PLBartClassificationHead(BartClassificationHead):
    pass


class PLBartForSequenceClassification(BigBirdPegasusForSequenceClassification):
    def forward(self, **super_kwargs):
        r"""
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Indices of decoder input sequence tokens in the vocabulary.

    Indices can be obtained using [`AutoTokenizer`] or [`PLBartMultiTokenizer`] depending on the checkpoint.
    See [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details.

    [What are decoder input IDs?](../glossary#decoder-input-ids)

    PLBart uses a specific language id token as the starting token for `decoder_input_ids` generation that
    varies according to source and target language, *e.g.* 50003 for *en_XX*, and 50001 for *java*. If
    `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
    `past_key_values`).

    For translation and summarization training, `decoder_input_ids` should be provided. If no
    `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
    for denoising pre-training following the paper.
decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will
    also be used by default.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        super().forward(**super_kwargs)


class PLBartForCausalLM(BartForCausalLM):
    @can_return_tuple
    @auto_docstring
    def forward(self, **super_kwargs):
        r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

Example:

```python
>>> from transformers import AutoTokenizer, PLBartForCausalLM

>>> tokenizer = AutoTokenizer.from_pretrained("uclanlp/plbart-base")
>>> model = PLBartForCausalLM.from_pretrained("uclanlp/plbart-base")
>>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder."
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
>>> outputs = model(**inputs)

>>> logits = outputs.logits
>>> expected_shape = [1, inputs.input_ids.shape[-1], model.config.vocab_size]
>>> list(logits.shape) == expected_shape
True
```
        """
        super().forward(**super_kwargs)


__all__ = [
    "PLBartForCausalLM",
    "PLBartForConditionalGeneration",
    "PLBartForSequenceClassification",
    "PLBartModel",
    "PLBartPreTrainedModel",
]