
    Z j                     $   S r SSKrSSKJr  SSKrSSKJr  SSKJrJrJ	r	  SSK
Jr  SSKJr  SS	KJrJrJr  SS
KJr  SSKJrJr  SSKJr  SSKJr  SSKJrJrJrJ r J!r!J"r"J#r#  SSK$J%r%J&r&  SSK'J(r(  SSK)J*r*J+r+J,r,J-r-J.r.J/r/J0r0  SSK1J2r2  SSK3J4r4J5r5  SSK6J7r7  \-" 5       (       a   \/Rp                  " \95      r:S\Rv                  S\<4S jr= " S S\R|                  5      r? " S S\R|                  5      r@  SDS\R                  S\Rv                  S\Rv                  S \Rv                  S!\Rv                  S-  S"\BS-  S#\BS$\(\*   4S% jjrC " S& S'\R                  5      rD " S( S)\5      rE " S* S+\5      rF " S, S-\R                  5      rG\+ " S. S/\&5      5       rH " S0 S1\H5      rI " S2 S3\H5      rJ\+ " S4 S5\H5      5       rK\+" S6S79 " S8 S9\H\5      5       rL\+" S:S79 " S; S<\H5      5       rM\+ " S= S>\H5      5       rN " S? S@\H5      rO " SA SB\H\5      rP/ SCQrQg)EzPyTorch MBART model.    N)Callable)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )initialization)ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)create_bidirectional_maskcreate_causal_mask)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentions!CausalLMOutputWithCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutput#Seq2SeqQuestionAnsweringModelOutputSeq2SeqSequenceClassifierOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tupleis_torch_flex_attn_availableis_torchdynamo_compilingloggingtorch_compilable_check)merge_with_config_defaults)OutputRecordercapture_outputs   )MBartConfig	input_idspad_token_idc                 `   U R                  5       nUc  [        S5      eUR                  US:H  U5        UR                  U5      R	                  SS9S-
  R                  S5      nUR                  SU5      R                  5       nUSS2SS24   R                  5       USS2SS24'   XBSS2S4'   U$ )z
Shift input ids one token to the right, and wrap the last non pad token (the <LID> token) Note that MBart does not
have a single `decoder_start_token_id` in contrast to other Bart-like models.
Nz1self.model.config.pad_token_id has to be defined.ir'   dimr   )clone
ValueErrormasked_fill_nesum	unsqueezegathersqueeze)r)   r*   prev_output_tokensindex_of_eosdecoder_start_tokenss        y/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/mbart/modeling_mbart.pyshift_tokens_rightr;   @   s    
 #*LMM##$6$$>M&)),7;;;BQFQQRTUL-44QEMMO 21crc6 : @ @ Bq!"u3q!t    c                      ^  \ rS rSrSrS\S\4U 4S jjr SS\R                  S\S	\R                  S-  4U 4S
 jjjr	Sr
U =r$ )MBartLearnedPositionalEmbeddingU   zF
This module learns positional embeddings up to a fixed maximum size.
num_embeddingsembedding_dimc                 L   > SU l         [        TU ]	  XR                   -   U5        g N   )offsetsuper__init__)selfr@   rA   	__class__s      r:   rG   (MBartLearnedPositionalEmbedding.__init__Z   s"     ++5}Er<   Nr)   past_key_values_lengthposition_idsc                   > Uc]  UR                   SS u  pE[        R                  " X"U-   [        R                  U R                  R
                  S9R                  US5      nOUR                  S5      n[        TU ]%  X0R                  -   5      $ )z3`input_ids' shape is expected to be [bsz x seqlen].NrD   )dtypedevicer.   r   )shapetorcharangelongweightrO   expandr4   rF   forwardrE   )rH   r)   rK   rL   bszseq_lenrI   s         r:   rV   'MBartLearnedPositionalEmbedding.forward`   s    
 $??2A.LC <<&(HPUPZPZcgcncncucufS"o  (11!4Lw|kk9::r<   )rE   )r   N)__name__
__module____qualname____firstlineno____doc__intrG   rQ   TensorrV   __static_attributes____classcell__rI   s   @r:   r>   r>   U   sW    Fs F3 F mq;;?B;V[VbVbeiVi; ;r<   r>   c            
       r   ^  \ rS rSrSrSS\S\S\S\S-  4U 4S jjjrS	\R                  4U 4S
 jjr
SrU =r$ )MBartScaledWordEmbeddingq   zT
This module overrides nn.Embeddings' forward by multiplying with embeddings scale.
r@   rA   padding_idxembed_scaleNc                 2   > [         TU ]  XU5        X@l        g N)rF   rG   rh   )rH   r@   rA   rg   rh   rI   s        r:   rG   !MBartScaledWordEmbedding.__init__v   s    D&r<   r)   c                 <   > [         TU ]  U5      U R                  -  $ rj   )rF   rV   rh   )rH   r)   rI   s     r:   rV    MBartScaledWordEmbedding.forwardz   s    wy)D,<,<<<r<   rh   )      ?rZ   r[   r\   r]   r^   r_   floatrG   rQ   r`   rV   ra   rb   rc   s   @r:   re   re   q   sJ    's '3 'S '_dgk_k ' '= = =r<   re   modulequerykeyvalueattention_maskscalingdropoutkwargsc                    Uc  UR                  S5      S-  n[        R                  " XR                  SS5      5      U-  nUb  X-   n[        R
                  R                  USS9n[        R
                  R                  XU R                  S9n[        R                  " X5      n	U	R                  SS5      R                  5       n	X4$ )Nr.         rD   r   r,   ptrainingr'   )
sizerQ   matmul	transposer   
functionalsoftmaxrx   r~   
contiguous)
rr   rs   rt   ru   rv   rw   rx   ry   attn_weightsattn_outputs
             r:   eager_attention_forwardr      s     **R.D( <<}}Q':;gEL!#4==((2(>L==((6??([L,,|3K''1-88:K$$r<   c                   $  ^  \ rS rSrSr      SS\S\S\S\S\S	\S
\S-  S\S-  4U 4S jjjr	   SS\
R                  S\
R                  S-  S\S-  S\
R                  S-  S\\   S\\
R                  \
R                  S-  4   4S jjrSrU =r$ )MBartAttention   z=Multi-headed attention from 'Attention Is All You Need' paperN	embed_dim	num_headsrx   
is_decoderbias	is_causalconfig	layer_idxc	                 t  > [         T	U ]  5         Xl        X l        X0l        X-  U l        Xpl        U R
                  U-  U R                  :w  a  [        SU R                   SU S35      eU R
                  S-  U l        X@l	        X`l
        Xl        Uc>  U R                  (       a-  [        R                  SU R                  R                   S35        [         R"                  " XUS9U l        [         R"                  " XUS9U l        [         R"                  " XUS9U l        [         R"                  " XUS9U l        g )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).r{   zInstantiating a decoder z without passing `layer_idx` is not recommended and will lead to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` when creating this class.r   )rF   rG   r   r   rx   head_dimr   r0   rw   r   r   r   loggerwarning_oncerI   rZ   r   Lineark_projv_projq_projout_proj)
rH   r   r   rx   r   r   r   r   r   rI   s
            r:   rG   MBartAttention.__init__   s    	""!.MMI%$..8MdnnM]$YKr3  }}d*$""*4>>+B+B*C D, , ii	4@ii	4@ii	4@		)TBr<   hidden_stateskey_value_statespast_key_valuesrv   ry   returnc                 ,   USLnUR                   SS n/ UQSPU R                  P7nU R                  U5      R                  U5      R	                  SS5      n	Sn
Ub]  [        U[        5      (       aF  UR                  R                  U R                  5      n
U(       a  UR                  nOUR                  nOUnU(       a  UOUnU(       aQ  UbN  U
(       aG  WR                  U R                     R                  nUR                  U R                     R                  nOU R                  U5      nU R!                  U5      n/ UR                   SS QSPU R                  P7nUR                  U5      R	                  SS5      nUR                  U5      R	                  SS5      nUbS  WR#                  XU R                  5      u  pU(       a.  [        U[        5      (       a  SUR                  U R                  '   [$        R&                  " U R(                  R*                  [,        5      nU" U U	UUU4U R.                  (       d  SOU R0                  U R2                  S.UD6u  nnUR4                  " / UQSP76 R7                  5       nU R9                  U5      nUU4$ )	z#Input shape: Batch x Time x ChannelNr.   r'   rD   FT        )rx   rw   )rP   r   r   viewr   
isinstancer   
is_updatedgetr   cross_attention_cacheself_attention_cachelayerskeysvaluesr   r   updater   get_interfacer   _attn_implementationr   r~   rx   rw   reshaper   r   )rH   r   r   r   rv   ry   is_cross_attentioninput_shapehidden_shapequery_statesr   curr_past_key_valuescurrent_states
key_statesvalue_stateskv_shapeattention_interfacer   r   s                      r:   rV   MBartAttention.forward   sc    .T9 $))#2.88b8$--8 {{=166|DNNqRST
&/+>??,77;;DNNK
%+:+P+P(+:+O+O('6$-?)]/"=*-44T^^DIIJ/66t~~FMML^4J;;~6LF--cr2FBFFH#2<<QBJ',,X6@@AFL*+?+F+Fzaeaoao+p(
%*_FY*Z*ZAEO..t~~>(?(M(MKK,,.E)
 %8	%
  $}}C$,,LL	%
 	%
!\ "));;;;FFHmmK0L((r<   )r   rx   r   r   r   r   r   r   r   r   r   rw   r   )r   FTFNNNNN)rZ   r[   r\   r]   r^   r_   rq   boolr(   rG   rQ   r`   r   r   r   tuplerV   ra   rb   rc   s   @r:   r   r      s   G  %) $%C%C %C 	%C
 %C %C %C d"%C :%C %CT 15(,.2H)||H)  ,,-H) 	H)
 t+H) -.H) 
u||U\\D00	1H) H)r<   r   c                      ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  S\\	   S\R                  4S jr
S	rU =r$ )
MBartEncoderLayeri  r   c                 j  > [         TU ]  5         UR                  U l        [	        U R                  UR
                  UR                  US9U l        [        R                  " U R                  5      U l
        UR                  U l        [        UR                     U l        UR                  U l        [        R                   " U R                  UR"                  5      U l        [        R                   " UR"                  U R                  5      U l        [        R                  " U R                  5      U l        g )N)r   r   rx   r   )rF   rG   d_modelr   r   encoder_attention_headsattention_dropout	self_attnr   	LayerNormself_attn_layer_normrx   r
   activation_functionactivation_fnactivation_dropoutr   encoder_ffn_dimfc1fc2final_layer_normrH   r   rI   s     r:   rG   MBartEncoderLayer.__init__  s    'nn44,,	
 %'LL$@!~~#F$>$>?"(";";99T^^V-C-CD99V33T^^D "T^^ <r<   r   rv   ry   r   c                    UnU R                  U5      nU R                  " SUUS.UD6u  p[        R                  R	                  XR                  U R
                  S9nXA-   nUnU R                  U5      nU R                  U R                  U5      5      n[        R                  R	                  XR                  U R
                  S9nU R                  U5      n[        R                  R	                  XR                  U R
                  S9nXA-   nUR                  [        R                  :X  aC  [        R                  " UR                  5      R                  S-
  n[        R                   " X* US9nU$ )a  
Args:
    hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
    attention_mask (`torch.FloatTensor`): attention mask of size
        `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
)r   rv   r|   i  )minmax )r   r   r   r   rx   r~   r   r   r   r   r   rN   rQ   float16finfor   clamp)rH   r   rv   ry   residual_clamp_values          r:   rV   MBartEncoderLayer.forward$  sD    !11-@>> 
')
 

 --m||VZVcVc-d 0 --m<**488M+BC--m?V?Vaeanan-o/--m||VZVcVc-d 0%--/++m&9&9:>>EK!KK<[YMr<   )	r   r   rx   r   r   r   r   r   r   )rZ   r[   r\   r]   r(   rG   rQ   r`   r   r   rV   ra   rb   rc   s   @r:   r   r     sQ    ={ =$"||" " +,	"
 
" "r<   r   c                      ^  \ rS rSrSS\S\S-  4U 4S jjjr     SS\R                  S\R                  S-  S\R                  S-  S	\R                  S-  S
\	S-  S\
S-  S\\   S\R                  4S jjrSrU =r$ )MBartDecoderLayeriI  Nr   r   c           
        > [         TU ]  5         UR                  U l        [	        U R                  UR
                  UR                  SSUUS9U l        UR                  U l        [        UR                     U l        UR                  U l        [        R                  " U R                  5      U l        [	        U R                  UR
                  UR                  SUUS9U l        [        R                  " U R                  5      U l        [        R$                  " U R                  UR&                  5      U l        [        R$                  " UR&                  U R                  5      U l        [        R                  " U R                  5      U l        g )NT)r   r   rx   r   r   r   r   )rx   r   r   r   )rF   rG   r   r   r   decoder_attention_headsr   r   rx   r
   r   r   r   r   r   r   encoder_attnencoder_attn_layer_normr   decoder_ffn_dimr   r   r   )rH   r   r   rI   s      r:   rG   MBartDecoderLayer.__init__J  s    'nn44,,
 ~~#F$>$>?"(";";$&LL$@!*NN**,,
 (*||DNN'C$99T^^V-C-CD99V33T^^D "T^^ <r<   r   rv   encoder_hidden_statesencoder_attention_maskr   	use_cachery   r   c                    UnU R                  U5      nU R                  " SUUUS.UD6u  p[        R                  R	                  XR                  U R
                  S9nX-   nUbb  UnU R                  U5      nU R                  " SUUUUS.UD6u  p[        R                  R	                  XR                  U R
                  S9nX-   nUnU R                  U5      nU R                  U R                  U5      5      n[        R                  R	                  XR                  U R
                  S9nU R                  U5      n[        R                  R	                  XR                  U R
                  S9nX-   nU$ )a  
Args:
    hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
    attention_mask (`torch.FloatTensor`): attention mask of size
        `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
    encoder_hidden_states (`torch.FloatTensor`):
        cross attention input to the layer of shape `(batch, seq_len, embed_dim)`
    encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size
        `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
    past_key_values (`Cache`): cached past key and value projection states
)r   r   rv   r|   )r   r   rv   r   r   )r   r   r   r   rx   r~   r   r   r   r   r   r   r   )
rH   r   rv   r   r   r   r   ry   r   r   s
             r:   rV   MBartDecoderLayer.forwardi  s   * !11-@  >> 
'+)
 	
 --m||VZVcVc-d 0 !,$H 88GM#00  +!65 /	 
  M MM11-<<Z^ZgZg1hM$4M !--m<**488M+BC--m?V?Vaeanan-o/--m||VZVcVc-d 0r<   )r   r   rx   r   r   r   r   r   r   r   r   rj   )NNNNT)rZ   r[   r\   r]   r(   r_   rG   rQ   r`   r   r   r   r   rV   ra   rb   rc   s   @r:   r   r   I  s    ={ =sTz = =D /3596:(,!%:||: t+:  %||d2	:
 !&t 3: : $;: +,: 
: :r<   r   c                   z   ^  \ rS rSrSrS\S\S\S\4U 4S jjrS\R                  S	\R                  4S
 jr
SrU =r$ )MBartClassificationHeadi  z-Head for sentence-level classification tasks.	input_dim	inner_dimnum_classespooler_dropoutc                    > [         TU ]  5         [        R                  " X5      U l        [        R
                  " US9U l        [        R                  " X#5      U l        g )N)r}   )rF   rG   r   r   denseDropoutrx   r   )rH   r   r   r   r   rI   s        r:   rG    MBartClassificationHead.__init__  s@     	YYy4
zzN3		)9r<   r   r   c                     U R                  U5      nU R                  U5      n[        R                  " U5      nU R                  U5      nU R	                  U5      nU$ rj   )rx   r   rQ   tanhr   )rH   r   s     r:   rV   MBartClassificationHead.forward  sN    ]3

=1

=1]3m4r<   )r   rx   r   rp   rc   s   @r:   r   r     sQ    7
:
: 
: 	
:
 
:U\\ ell  r<   r   c                   d   ^  \ rS rSr% \\S'   SrSr/ SQrSr	Sr
SrSrU 4S jr\S 5       rSrU =r$ )	MBartPreTrainedModeli  r   modelT)r   r   r   c                    > [         TU ]  U5        [        U[        5      (       a!  [        R
                  " UR                  5        g g rj   )rF   _init_weightsr   MBartForConditionalGenerationinitzeros_final_logits_bias)rH   rr   rI   s     r:   r   "MBartPreTrainedModel._init_weights  s5    f%f;<<KK001 =r<   c                     U R                   R                  n[        R                  " / SQSSSSU//U R                  S9nUR                  U5      US.nU$ )N)r      
      rD   r         rD   rO   )rv   r)   )r   r*   rQ   tensorrO   r2   )rH   	pad_tokenr)   dummy_inputss       r:   r  !MBartPreTrainedModel.dummy_inputs  sW    KK,,	LL"2Q2q)4L!MVZVaVab	'll95"
 r<   r   )rZ   r[   r\   r]   r(   __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modules_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraphr   propertyr  ra   rb   rc   s   @r:   r   r     sK    &*#TN!2
  r<   r   c                      ^  \ rS rSrSr\\" \SSS9S.rS\	4U 4S jjr
S	 r\\   SS\R                  S
-  S\R                   S
-  S\R"                  S
-  S\\   S\\-  4
S jj5       5       rSrU =r$ )MBartEncoderi  z
Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
[`MBartEncoderLayer`].

Args:
    config: MBartConfig
    embed_tokens (nn.Embedding): output embedding
r'   r   index
layer_name)r   
attentionsr   c                   > [         TU ]  U5        UR                  U l        UR                  U l        UR
                  nUR                  U l        UR                  U l	        UR                  (       a  [        R                  " U5      OSn[        UR                  X R                  US9U l        [!        UR                  U5      U l        [$        R&                  " [)        UR*                  5       Vs/ s H  n[-        U5      PM     sn5      U l        Xl        [$        R2                  " U5      U l        [$        R2                  " UR
                  5      U l        SU l        U R;                  5         g s  snf )Nro   rn   F)rF   rG   rx   encoder_layerdrop	layerdropr   r*   rg   max_position_embeddingsmax_source_positionsscale_embeddingmathsqrtre   
vocab_sizeembed_tokensr>   embed_positionsr   
ModuleListrangeencoder_layersr   r   r   r   layernorm_embedding
layer_normgradient_checkpointing	post_init)rH   r   r   rh   r   rI   s        r:   rG   MBartEncoder.__init__  s    ~~11NN	!..$*$B$B!.4.D.Ddii	*#4y*:*:
  ?** 
 mmfNcNcHd$eHd1%6v%>Hd$ef#%<<	#: ,,v~~6&+# %fs   0E/c                     U R                   (       a.  [        U R                  SS5      (       a  U R                  5         g g g )Nr)  F)r  getattrr   gradient_checkpointing_enablerH   s    r:   ._backward_compatibility_gradient_checkpointing;MBartEncoder._backward_compatibility_gradient_checkpointing  s4    //GDKKIach4i4i..0 5j/r<   Nr)   rv   inputs_embedsry   r   c                 x   USL USL-  (       a  [        S5      eUc  U R                  U5      nU R                  US   5      nX5R                  UR                  5      -   nU R                  U5      n[        R                  R                  X`R                  U R                  S9n[        U R                  UUS9n[        U R                  5       HR  u  pxSn	U R                  (       a'  [        R                  " / 5      n
XR                   :  a  Sn	U	(       a  MH  U" UU40 UD6nMT     U R#                  U5      n[%        US9$ )	a  
Args:
    input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
        Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
        provide it.

        Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
        [`PreTrainedTokenizer.__call__`] for details.

        [What are input IDs?](../glossary#input-ids)
    attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
        Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

        - 1 for tokens that are **not masked**,
        - 0 for tokens that are **masked**.

        [What are attention masks?](../glossary#attention-mask)
    inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
        Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
        This is useful if you want more control over how to convert `input_ids` indices into associated vectors
        than the model's internal embedding lookup matrix.
Nz:You must specify exactly one of input_ids or inputs_embeds).r.   r|   )r   r2  rv   FT)last_hidden_state)r0   r"  r#  torO   r'  r   r   rx   r~   r   r   	enumerater   rQ   randr  r(  r   )rH   r)   rv   r2  ry   	embed_posr   idxencoder_layerto_dropdropout_probabilitys              r:   rV   MBartEncoder.forward
  s*   > -t";<YZZ  --i8M((w)?@	%]5I5I(JJ00?--m||VZVcVc-d2;;')
 #,DKK"8CG}}&+jjn#&7"G7 -!"! ! #9 6??r<   )r   rx   r#  r"  r)  r(  r  r'  r   r  rg   r   )rZ   r[   r\   r]   r^   r   r%   r   _can_record_outputsr(   rG   r0  r$   r&   rQ   
LongTensorr`   FloatTensorr   r   r   r   rV   ra   rb   rc   s   @r:   r  r    s     +$^1U
{ 81
   .2.226	@@##d*@@ t+@@ ((4/	@@
 +,@@ 
	 @@   @@r<   r  c                   F  ^  \ rS rSrSr\\" \SSS9\" \SSS9S.rS\	4U 4S	 jjr
\\       SS\R                  S
-  S\R                  S
-  S\R                   S
-  S\R                  S
-  S\S
-  S\R                   S
-  S\S
-  S\\   S\\-  4S jj5       5       rSrU =r$ )MBartDecoderiO  z
Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`MBartDecoderLayer`]

Args:
    config: MBartConfig
    embed_tokens (nn.Embedding): output embedding
r'   r   r  r   )r   r  cross_attentionsr   c           
        > [         TU ]  U5        UR                  U l        UR                  U l        UR
                  U l        UR                  U l        UR                  (       a   [        R                  " UR                  5      OSn[        UR                  UR                  U R                  US9U l        [!        UR                  UR                  5      U l        [$        R&                  " [)        UR*                  5       Vs/ s H  n[-        XS9PM     sn5      U l        Xl        [$        R2                  " UR                  5      U l        [$        R2                  " UR                  5      U l        SU l        U R;                  5         g s  snf )Nro   rn   )r   F)rF   rG   rx   decoder_layerdropr  r*   rg   r  max_target_positionsr  r  r   r   re   r!  r"  r>   r#  r   r$  r%  decoder_layersr   r   r   r   r'  r(  r)  r*  )rH   r   rh   irI   s       r:   rG   MBartDecoder.__init__^  s"    ~~11!..$*$B$B!393I3Idii/s4v~~t/?/?[
  ?**NN 
 mmUZ[a[p[pUq$rUqPQ%6v%KUq$rs#%<<#? ,,v~~6&+# %ss   F
Nr)   rv   r   r   r   r2  r   ry   r   c                    USL USL-  (       a  [        S5      eUc  U R                  U5      nU(       ab  Uc_  Uc  U R                  R                  (       a.  [	        [        U R                  S9[        U R                  S95      O[        U R                  S9nUR                  5       SS u  pUb  UR                  5       OSn[        R                  " XR                  S9U-   nUc2  [        5       (       d#  X-   n[        R                  " XUR                  S9n[        U[        5      (       a  UR                  OUn[        U R                  UUUS9n[!        U R                  UUUS9nU R#                  [$        XS	9nXlR'                  UR                  5      -   nU R)                  U5      n[*        R,                  R/                  UU R.                  U R0                  S
9n[3        U R4                  5       HN  u  nnU R0                  (       a(  [        R6                  " / 5      nUU R8                  :  a  M?  U" UUU4UUUS.UD6nMP     U R;                  U5      n[=        UUS9$ )a 
  
Args:
    input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
        Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
        provide it.

        Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
        [`PreTrainedTokenizer.__call__`] for details.

        [What are input IDs?](../glossary#input-ids)
    attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
        Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

        - 1 for tokens that are **not masked**,
        - 0 for tokens that are **masked**.

        [What are attention masks?](../glossary#attention-mask)
    encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
        Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
        of the decoder.
    encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
        Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
        selected in `[0, 1]`:

        - 1 for tokens that are **not masked**,
        - 0 for tokens that are **masked**.

        [What are attention masks?](../glossary#attention-mask)
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
        cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.

        If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
        that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
        all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
    inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
        Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
        This is useful if you want more control over how to convert `input_ids` indices into associated vectors
        than the model's internal embedding lookup matrix.
NzTYou cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time)r   r.   r   r  )r   r2  rv   r   )r   r2  rv   r   )rL   r|   )r   r   r   )r4  r   )r0   r"  r   is_encoder_decoderr   r   r   get_seq_lengthrQ   rR   rO   r!   onesr   r   r   r   r#  inputr5  r'  r   r   rx   r~   r6  r   r7  r  r(  r   )rH   r)   rv   r   r   r   r2  r   ry   
batch_size
seq_lengthrK   rL   mask_seq_lengthself_attn_cachecausal_maskr   r9  decoder_layerr<  s                       r:   rV   MBartDecoder.forwardx  sN   n -t";<stt  --i8M 0 )48V8V $L$DlZ^ZeZeFfg!5  "/!3!3!5cr!:
ETE`!?!?!Afg||J7K7KLOee!*B*D*D4AO"ZZ
ML`L`aN /+>?? 00  	 );;')+	
 ";;;'1"7	"
 ++E3I+e%8L8L(MM00?--mt||VZVcVc-d"+DKK"8C}}&+jjn#&7)% (> /# M #9" 68++
 	
r<   )r   rx   r#  r"  r)  r(  r  r'  r   rF  rg   )NNNNNNN)rZ   r[   r\   r]   r^   r   r%   r   r>  r(   rG   r$   r&   rQ   r?  r`   r@  r   r   r   r   r   r   rV   ra   rb   rc   s   @r:   rB  rB  O  s    +$^1U*>~^{ 4   .2.2:>:>(,26!%}
##d*}
 t+}
  %0047	}

 !& 0 04 7}
 }
 ((4/}
 $;}
 +,}
 
:	:}
   }
r<   rB  c                     ^  \ rS rSrSSS.rS\4U 4S jjrS rS r\	\
          SS	\R                  S-  S
\R                  S-  S\R                  S-  S\R                  S-  S\\\R                        S-  S\S-  S\R                  S-  S\R                  S-  S\S-  S\S-  S\\   S\\\R                     -  4S jj5       5       rSrU =r$ )
MBartModeli  zshared.weight)zdecoder.embed_tokens.weightzencoder.embed_tokens.weightr   c                 J  > [         TU ]  U5        UR                  UR                  p2UR                  (       a   [
        R                  " UR                  5      OSn[        X1R                  X$S9U l	        [        U5      U l        [        U5      U l        U R                  5         g )Nro   rn   )rF   rG   r*   r!  r  r  r   r   re   sharedr  encoderrB  decoderr*  )rH   r   rg   r!  rh   rI   s        r:   rG   MBartModel.__init__  su     "("5"5v7H7HZ393I3Idii/s.z>>;p#F+#F+ 	r<   c                     U R                   $ rj   )rY  r/  s    r:   get_input_embeddingsMBartModel.get_input_embeddings  s    {{r<   c                 |    Xl         U R                   U R                  l        U R                   U R                  l        g rj   )rY  rZ  r"  r[  rH   ru   s     r:   set_input_embeddingsMBartModel.set_input_embeddings  s'    $(KK!$(KK!r<   Nr)   rv   decoder_input_idsdecoder_attention_maskencoder_outputsr   r2  decoder_inputs_embedsr   return_dictry   r   c                 n   U
b  U
OU R                   R                  n
Uc"  Uc  [        XR                   R                  5      nUc  U R                  " S	UUUU
S.UD6nORU
(       aK  [        U[        5      (       d6  [        US   [        U5      S:  a  US   OS[        U5      S:  a  US   OSS9nU R                  " S	UUUS   UUUU	U
S.UD6nU
(       d  X-   $ [        UR                  UR                  UR                  UR                  UR                  UR                  UR                  UR                  S9$ )
a  
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Indices of decoder input sequence tokens in the vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are decoder input IDs?](../glossary#decoder-input-ids)

    MBart uses a specific language id token as the starting token for `decoder_input_ids` generation that
    varies according to source and target language, *e.g.* 25004 for *en_XX*, and 25003 for *de_DE*. If
    `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
    `past_key_values`).

    For translation and summarization training, `decoder_input_ids` should be provided. If no
    `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
    for denoising pre-training following the paper.
decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
    be used by default.
N)r)   rv   r2  rh  r   r'   rD   )r4  r   r  )r)   rv   r   r   r   r2  r   rh  )r4  r   decoder_hidden_statesdecoder_attentionsrC  encoder_last_hidden_stater   encoder_attentionsr   )r   use_return_dictr;   r*   rZ  r   r   lenr[  r   r4  r   r   r  rC  )rH   r)   rv   rd  re  rf  r   r2  rg  r   rh  ry   decoder_outputss                r:   rV   MBartModel.forward  s\   J &1%<k$++B]B] $)>)F 29kk>V>V W""ll #-+'	
 O O_!M!M-"1!"4474H14Loa0RV14_1E1I?1-tO ,, 

'1"1!"4#1+/#

 

 "44!-??+;;"1"?"?.99,==&5&G&G"1"?"?.99	
 		
r<   )r[  rZ  rY  
NNNNNNNNNN)rZ   r[   r\   r]   _tied_weights_keysr(   rG   r^  rb  r   r   rQ   r?  r`   r   r@  r   r   r   r   r   rV   ra   rb   rc   s   @r:   rW  rW    s`    (7'6
{ 0
  .2.259:>BF(,26:>!%#'S
##d*S
 t+S
 !++d2	S

 !& 0 04 7S
 uU%6%6784?S
 S
 ((4/S
  %0047S
 $;S
 D[S
 +,S
 
eE$5$56	6S
  S
r<   rW  z
    The MBART Model with a language modeling head. Can be used for summarization, after fine-tuning the pretrained models.
    )custom_introc                   ,  ^  \ rS rSrSrS/rSS0rS\4U 4S jjr SS	\	S
\	S-  S\
S\R                  4U 4S jjjrS	\	SS4S jr\           SS\R"                  S-  S\R$                  S-  S\R"                  S-  S\R"                  S-  S\\\R(                        S-  S\S-  S\R(                  S-  S\R(                  S-  S\R"                  S-  S\
S-  S\
S-  S\\   S\\\R(                     -  4S jj5       rS\R$                  4S jrSrU =r$ ) r   in  r   r   lm_head.weightzmodel.shared.weightr   c                 v  > [         TU ]  U5        [        U5      U l        U R	                  S[
        R                  " SU R                  R                  R                  45      5        [        R                  " UR                  U R                  R                  R                  SS9U l        U R                  5         g )Nr   r'   Fr   )rF   rG   rW  r   register_bufferrQ   zerosrY  r@   r   r   r   lm_headr*  r   s     r:   rG   &MBartForConditionalGeneration.__init__x  s     '
0%++q$**BSBSBbBb>c2deyy1B1B1Q1QX]^ 	r<   Nnew_num_tokenspad_to_multiple_ofmean_resizingr   c                 x   > [         TU ]  XU5      nU R                  UR                  R                  S   5        U$ )Nr   )rF   resize_token_embeddings_resize_final_logits_biasrT   rP   )rH   r|  r}  r~  new_embeddingsrI   s        r:   r  5MBartForConditionalGeneration.resize_token_embeddings  s<     8]jk&&~'<'<'B'B1'EFr<   c                 ,   U R                   R                  S   nX::  a  U R                   S S 2S U24   nON[        R                  " SX-
  4U R                   R                  S9n[        R
                  " U R                   U/SS9nU R                  SU5        g )Nr.   r'   r  r,   r   )r   rP   rQ   ry  rO   catrx  )rH   r|  old_num_tokensnew_bias
extra_biass        r:   r  7MBartForConditionalGeneration._resize_final_logits_bias  s    //55b9+--a..@AHa)H%IRVRhRhRoRopJyy$"8"8*!E1MH0(;r<   r)   rv   rd  re  rf  r   r2  rg  labelsr   rh  ry   c                    Ub  UOU R                   R                  nU	bC  U
(       a  [        R                  S5        Sn
Uc"  Uc  [	        XR                   R
                  5      nU R                  " U4UUUUUUUU
US.	UD6nU R                  US   5      U R                  -   nSnU	bF  [        5       nU" UR                  SU R                   R                  5      U	R                  S5      5      nU(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  UR                  UR                   UR"                  UR$                  UR&                  S9	$ )	u  
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Indices of decoder input sequence tokens in the vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are decoder input IDs?](../glossary#decoder-input-ids)

    MBart uses a specific language id token as the starting token for `decoder_input_ids` generation that
    varies according to source and target language, *e.g.* 25004 for *en_XX*, and 25003 for *de_DE*. If
    `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
    `past_key_values`).

    For translation and summarization training, `decoder_input_ids` should be provided. If no
    `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
    for denoising pre-training following the paper.
decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
    be used by default.
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

Example Translation:

```python
>>> from transformers import AutoTokenizer, MBartForConditionalGeneration

>>> model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-en-ro")
>>> tokenizer = AutoTokenizer.from_pretrained("facebook/mbart-large-en-ro")

>>> example_english_phrase = "42 is the answer"
>>> inputs = tokenizer(example_english_phrase, return_tensors="pt")

>>> # Translate
>>> generated_ids = model.generate(**inputs, num_beams=4, max_length=5)
>>> tokenizer.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
'42 este răspuns'
```

Mask filling example:

```python
>>> from transformers import AutoTokenizer, MBartForConditionalGeneration

>>> model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-cc25")
>>> tokenizer = AutoTokenizer.from_pretrained("facebook/mbart-large-cc25")

>>> # de_DE is the language symbol id <LID> for German
>>> TXT = "</s> Meine Freunde sind <mask> nett aber sie essen zu viel Kuchen. </s> de_DE"

>>> input_ids = tokenizer([TXT], add_special_tokens=False, return_tensors="pt")["input_ids"]
>>> logits = model(input_ids).logits

>>> masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item()
>>> probs = logits[0, masked_index].softmax(dim=0)
>>> values, predictions = probs.topk(5)

>>> tokenizer.decode(predictions).split()
['nett', 'sehr', 'ganz', 'nicht', 'so']
```
NzJThe `use_cache` argument is changed to `False` since `labels` is provided.F)	rv   rd  rf  re  r   r2  rg  r   rh  r   r.   r'   	losslogitsr   rj  rk  rC  rl  r   rm  )r   rh  r   warningr;   r*   r   rz  r   r   r   r!  r   r   rj  rk  rC  rl  r   rm  )rH   r)   rv   rd  re  rf  r   r2  rg  r  r   rh  ry   outputs	lm_logitsmasked_lm_lossloss_fctoutputs                     r:   rV   %MBartForConditionalGeneration.forward  sr   ` &1%<k$++BYBYklI (-B-J$6v{{?W?W$X!**
)/+#9+'"7#
 
 LL,t/E/EE	')H%innR9O9O&PRXR]R]^`RabN\GABK/F3A3M^%.YSYY#33")"?"?&99$55&-&G&G")"?"?&99

 
	
r<   c                 @    [        XR                  R                  5      $ rj   )r;   r   r*   )rH   r  s     r:   %prepare_decoder_input_ids_from_labelsCMBartForConditionalGeneration.prepare_decoder_input_ids_from_labels  s    !&++*B*BCCr<   rz  r   )NT)NNNNNNNNNNN)rZ   r[   r\   r]   r  _keys_to_ignore_on_load_missingrs  r(   rG   r_   r   r   	Embeddingr  r  r   rQ   r?  r`   r   r@  r   r   r   r   rV   r  ra   rb   rc   s   @r:   r   r   n  s     ':&;#*,AB{  ae!7:TzY]	 < < <  .2.259:>BF(,26:>*.!%#'z
##d*z
 t+z
 !++d2	z

 !& 0 04 7z
 uU%6%6784?z
 z
 ((4/z
  %0047z
   4'z
 $;z
 D[z
 +,z
 
5!2!23	3z
 z
xDELL D Dr<   r   z
    MBart model with a sequence classification/head on top (a linear layer on top of the pooled output) e.g. for GLUE
    tasks.
    c                   x  ^  \ rS rSrS\4U 4S jjr\\         SS\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S	\\R                     S-  S
\R                  S-  S\R                  S-  S\R                  S-  S\S-  S\\   S\\-  4S jj5       5       rSrU =r$ )MBartForSequenceClassificationi  r   c                    > [         TU ]  " U40 UD6  [        U5      U l        [	        UR
                  UR
                  UR                  UR                  5      U l        U R                  5         g rj   )
rF   rG   rW  r   r   r   
num_labelsclassifier_dropoutclassification_headr*  )rH   r   ry   rI   s      r:   rG   'MBartForSequenceClassification.__init__  sZ    *6*'
#:NNNN%%	$
  	r<   Nr)   rv   rd  re  rf  r2  rg  r  r   ry   r   c
                    Ub  Sn	Uc%  Ub"  [        SU R                  R                   35      eU R                  " U4UUUUUUU	S.U
D6nUS   nUR	                  U R
                  R                  5      R                  UR                  5      n[        [        R                  " UR                  S5      5      R                  5       S:H  S5        XSS24   R                  UR                  S5      SUR                  S5      5      SS2SSS24   nU R!                  U5      nSnUGb  UR                  UR                  5      nU R
                  R"                  c  U R
                  R$                  S:X  a  S	U R
                  l        OyU R
                  R$                  S:  aN  UR&                  [        R(                  :X  d  UR&                  [        R*                  :X  a  S
U R
                  l        OSU R
                  l        U R
                  R"                  S	:X  aS  [-        5       nU R
                  R$                  S:X  a&  U" UR/                  5       UR/                  5       5      nOU" X5      nOU R
                  R"                  S
:X  aG  [1        5       nU" UR                  SU R
                  R$                  5      UR                  S5      5      nO,U R
                  R"                  S:X  a  [3        5       nU" X5      n[5        UUUR6                  UR8                  UR:                  UR<                  UR>                  UR@                  URB                  S9	$ )aU  
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Indices of decoder input sequence tokens in the vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are decoder input IDs?](../glossary#decoder-input-ids)

    Bart uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
    is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

    For translation and summarization training, `decoder_input_ids` should be provided. If no
    `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
    for denoising pre-training following the paper.
decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
    be used by default.

    If you want to change padding behavior, you should read [`modeling_bart._prepare_decoder_attention_mask`]
    and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more
    information on the default strategy.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
NFz8Passing input embeddings is currently not supported for rv   rd  re  rf  r2  rg  r   r   r'   z7All examples must have the same number of <eos> tokens.r.   
regressionsingle_label_classificationmulti_label_classificationr  )"NotImplementedErrorrI   rZ   r   eqr   eos_token_idr5  rO   r#   rQ   unique_consecutiver3   numelr   r   r  problem_typer  rN   rS   r_   r   r6   r   r   r   r   rj  rk  rC  rl  r   rm  )rH   r)   rv   rd  re  rf  r2  rg  r  r   ry   r  r   eos_masksentence_representationr  r  r  s                     r:   rV   &MBartForSequenceClassification.forward&  s   T I!:%J4>>KbKbJcd  '+jj
'
)/#9+'"7
'
 
'
  
<< 8 89<<]=Q=QR$$X\\!_5;;=BE	
 #0!"<"A"A-BTBTUVBWY[]j]o]opr]s"tr1H#
 ))*ABYYv}}-F{{''/;;))Q./;DKK,[[++a/V\\UZZ5OSYS_S_chclclSl/LDKK,/KDKK,{{''<7"9;;))Q.#FNN$4fnn6FGD#F3D))-JJ+-B0F0F GUWY))-II,./.#33")"?"?&99$55&-&G&G")"?"?&99

 
	
r<   )r  r   )	NNNNNNNNN)rZ   r[   r\   r]   r(   rG   r   r   rQ   r?  r`   listr@  r   r   r   r   r   rV   ra   rb   rc   s   @r:   r  r    s,   {   .2.259:>:>26:>*.!%i
##d*i
 t+i
 !++d2	i

 !& 0 04 7i
 e//047i
 ((4/i
  %0047i
   4'i
 $;i
 +,i
 
0	0i
  i
r<   r  c                     ^  \ rS rSrU 4S jr\\          SS\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\
\R                     S-  S	\R                  S-  S
\R                  S-  S\R                  S-  S\R                  S-  S\S-  S\\   S\\-  4S jj5       5       rSrU =r$ )MBartForQuestionAnsweringi  c                    > [         TU ]  U5        SUl        UR                  U l        [        U5      U l        [
        R                  " UR                  UR                  5      U l        U R                  5         g rC   )
rF   rG   r  rW  r   r   r   hidden_size
qa_outputsr*  r   s     r:   rG   "MBartForQuestionAnswering.__init__  s[      ++'
))F$6$68I8IJ 	r<   Nr)   rv   rd  re  rf  start_positionsend_positionsr2  rg  r   ry   r   c                 <   Ub  Ub  Sn
U R                   " U4UUUUUU	U
S.UD6nUS   nU R                  U5      nUR                  SSS9u  nnUR                  S5      R	                  5       nUR                  S5      R	                  5       nSnUb  Ub  [        UR                  5       5      S:  a  UR                  S5      n[        UR                  5       5      S:  a  UR                  S5      nUR                  S5      nUR                  SU5      nUR                  SU5      n[        US9nU" X5      nU" UU5      nUU-   S	-  n[        UUUUR                  UR                  UR                  UR                  UR                  UR                  UR                   S
9
$ )aC  
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Indices of decoder input sequence tokens in the vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are decoder input IDs?](../glossary#decoder-input-ids)

    Bart uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
    is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

    For translation and summarization training, `decoder_input_ids` should be provided. If no
    `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
    for denoising pre-training following the paper.
decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
    be used by default.

    If you want to change padding behavior, you should read [`modeling_bart._prepare_decoder_attention_mask`]
    and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more
    information on the default strategy.
NFr  r   r'   r.   r,   )ignore_indexrD   )
r  start_logits
end_logitsr   rj  rk  rC  rl  r   rm  )r   r  splitr6   r   ro  r   r   r   r   r   rj  rk  rC  rl  r   rm  )rH   r)   rv   rd  re  rf  r  r  r2  rg  r   ry   r  sequence_outputr  r  r  
total_lossignored_indexr  
start_lossend_losss                         r:   rV   !MBartForQuestionAnswering.forward  s   P &=+DI&*jj
'
)/#9+'"7
'
 
'
 "!*1#)<<r<#: j#++B/::<''+668

&=+D?'')*Q."1"9"9""==%%'(1, - 5 5b 9(--a0M-33A}EO)//=AM']CH!,@J
M:H$x/14J2%!#33")"?"?&99$55&-&G&G")"?"?&99
 	
r<   )r   r  r  rr  )rZ   r[   r\   r]   rG   r   r   rQ   r`   r?  r  r@  r   r   r   r   r   rV   ra   rb   rc   s   @r:   r  r    s<   
  *..259:>:>371526:>!%W
<<$&W
 t+W
 !++d2	W

 !& 0 04 7W
 e//047W
 ))D0W
 ''$.W
 ((4/W
  %0047W
 $;W
 +,W
 
4	4W
  W
r<   r  c                   2   ^  \ rS rSrSrU 4S jrS rSrU =r$ )MBartDecoderWrapperi  z
This wrapper class is a helper class to correctly load pretrained checkpoints when the causal language model is
used in combination with the [`EncoderDecoderModel`] framework.
c                 d   > [         TU ]  U5        [        U5      U l        U R	                  5         g rj   )rF   rG   rB  r[  r*  r   s     r:   rG   MBartDecoderWrapper.__init__  s&     #F+r<   c                 &    U R                   " U0 UD6$ rj   r[  )rH   argsry   s      r:   rV   MBartDecoderWrapper.forward  s    ||T,V,,r<   r  )	rZ   r[   r\   r]   r^   rG   rV   ra   rb   rc   s   @r:   r  r    s    

- -r<   r  c                   j  ^  \ rS rSrSS0rU 4S jrS rS r\\	         SS\
R                  S-  S	\
R                  S-  S
\
R                  S-  S\
R                  S-  S\S-  S\
R                  S-  S\
R                  S-  S\S-  S\\
R                  -  S\\   S\\-  4S jj5       5       rSrU =r$ )MBartForCausalLMi  rv  z!model.decoder.embed_tokens.weightc                    > SUl         SUl        [        TU ]  U5        [	        U5      U l        [        R                  " UR                  UR                  SS9U l
        U R                  5         g )NTFr   )r   rK  rF   rG   r  r   r   r   r  r!  rz  r*  r   s     r:   rG   MBartForCausalLM.__init__  sX     $)! (0
yy!3!3V5F5FUS 	r<   c                 B    U R                   R                  R                  $ rj   r   r[  r"  r/  s    r:   r^  %MBartForCausalLM.get_input_embeddings!  s    zz!!...r<   c                 8    XR                   R                  l        g rj   r  ra  s     r:   rb  %MBartForCausalLM.set_input_embeddings$  s    */

'r<   Nr)   rv   r   r   r   r2  r  r   logits_to_keepry   r   c
                 
   U R                   R                  " SUUUUUUUS.U
D6nUS   n[        U	[        5      (       a  [	        U	* S5      OU	nU R                  USS2USS24   5      nSnUba  UR                  UR                  5      n[        5       nU" UR                  SU R                  R                  5      UR                  S5      5      n[        UUUR                  UR                  UR                  UR                   S9$ )a  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

Example:

```python
>>> from transformers import AutoTokenizer, MBartForCausalLM

>>> tokenizer = AutoTokenizer.from_pretrained("facebook/mbart-large-cc25")
>>> model = MBartForCausalLM.from_pretrained("facebook/mbart-large-cc25")
>>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder."
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
>>> outputs = model(**inputs)

>>> logits = outputs.logits
>>> expected_shape = [1, inputs.input_ids.shape[-1], model.config.vocab_size]
>>> list(logits.shape) == expected_shape
True
```)r)   rv   r   r   r   r2  r   r   Nr.   )r  r  r   r   r  rC  r   )r   r[  r   r_   slicerz  r5  rO   r   r   r   r!  r   r   r   r  rC  )rH   r)   rv   r   r   r   r2  r  r   r  ry   r  r   slice_indicesr  r  r  s                    r:   rV   MBartForCausalLM.forward'  s   L >BZZ=O=O 	>
)"7#9+'	>
 	>
  
8B>SV8W8W~ot4]kmA}a,?@AYYv}}-F')HFKKDKK,B,BCV[[QS_UD0#33!//))$55
 	
r<   r  )	NNNNNNNNr   )rZ   r[   r\   r]   rs  rG   r^  rb  r   r   rQ   r?  r`   r@  r   r   r_   r   r   r   r   rV   ra   rb   rc   s   @r:   r  r    s/   =	/0  .2.2:>;?(,26*.!%-.A
##d*A
 t+A
  %0047	A

 !& 1 1D 8A
 A
 ((4/A
   4'A
 $;A
 ell*A
 +,A
 
2	2A
  A
r<   r  )r  r   r  r  rW  r   )Nr   )Rr^   r  collections.abcr   rQ   r   torch.nnr   r   r    r	   r   activationsr
   cache_utilsr   r   r   
generationr   masking_utilsr   r   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   r   r   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r    r!   r"   r#   utils.genericr$   utils.output_capturingr%   r&   configuration_mbartr(   
get_loggerrZ   r   r`   r_   r;   r  r>   re   Modulerq   r   r   r   r   r   r   r  rB  rW  r   r  r  r  r  __all__r   r<   r:   <module>r     sz     $   A A & ! C C ) J :   G &   8 E ,  !! 
		H	%%,, c *;bll ;8
=r|| 
=( !%II%<<% 
% <<	%
 LL4'% T\% % '(%:r)RYY r)j52 5pZ2 Z|bii 0 ?  4r@' r@jh
' h
V p
% p
 p
f 
\D$8/ \D
\D~ z
%9 z
z
z g
 4 g
 g
V-. - Y
+_ Y
xr<   