
    Z jX                       S r SSKrSSKrSSKJr  SSKJrJrJr  SSKJ	r
  SSKJr  SSKJrJrJr  SS	KJr  SS
KJrJr  SSKJr  SSKJrJrJrJrJrJrJr  SSK J!r!  SSK"J#r#J$r$J%r%  SSK&J'r'  \$RP                  " \)5      r*S\RV                  S\,S\,4S jr- " S S\R\                  5      r/ " S S\R`                  5      r1 " S S\5      r2 " S S\5      r3 " S S\R`                  5      r4 " S S \R`                  5      r5\# " S! S"\!5      5       r6 " S# S$\65      r7 " S% S&\65      r8\# " S' S(\65      5       r9\#" S)S*9 " S+ S,\6\5      5       r:\#" S-S*9 " S. S/\65      5       r;\# " S0 S1\65      5       r< " S2 S3\65      r= " S4 S5\6\5      r>/ S6Qr?g)7zPyTorch MVP model.    N)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )initialization)ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)create_bidirectional_maskcreate_causal_mask)GradientCheckpointingLayer)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentions!CausalLMOutputWithCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutput#Seq2SeqQuestionAnsweringModelOutputSeq2SeqSequenceClassifierOutput)PreTrainedModel)auto_docstringloggingtorch_compilable_check   )	MvpConfig	input_idspad_token_iddecoder_start_token_idc                     U R                  U R                  5      nU SS2SS24   R                  5       USS2SS24'   X#SS2S4'   Uc  [        S5      eUR	                  US:H  U5        U$ )z)
Shift input ids one token to the right.
Nr   r   z1self.model.config.pad_token_id has to be defined.i)	new_zerosshapeclone
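
# Illustrative example (added comment, not original code): with pad_token_id=1 and decoder_start_token_id=2,
#   shift_tokens_right(torch.tensor([[0, 8, 9, 2]]), 1, 2) -> tensor([[2, 0, 8, 9]])
# and any -100 entries (ignored label positions) are replaced by the pad token.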


class MvpLearnedPositionalEmbedding(nn.Embedding):
    """
    This module learns positional embeddings up to a fixed maximum size.
    """

    def __init__(self, num_embeddings: int, embedding_dim: int):
        # MVP offsets the embedding ids by 2 and adjusts num_embeddings accordingly
        self.offset = 2
        super().__init__(num_embeddings + self.offset, embedding_dim)

    def forward(
        self, input_ids: torch.Tensor, past_key_values_length: int = 0, position_ids: torch.Tensor | None = None
    ):
        """`input_ids' shape is expected to be [bsz x seqlen]."""
        if position_ids is None:
            bsz, seq_len = input_ids.shape[:2]
            position_ids = torch.arange(
                past_key_values_length, past_key_values_length + seq_len, dtype=torch.long, device=self.weight.device
            ).expand(bsz, -1)
        else:
            position_ids = position_ids.unsqueeze(0)

        return super().forward(position_ids + self.offset)


class MvpAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(
        self,
        embed_dim: int,
        num_heads: int,
        dropout: float = 0.0,
        is_decoder: bool = False,
        bias: bool = True,
        layer_idx: int | None = None,
    ):
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.dropout = dropout
        self.head_dim = embed_dim // num_heads
        if self.head_dim * num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {num_heads})."
            )
        self.scaling = self.head_dim**-0.5
        self.is_decoder = is_decoder
        self.layer_idx = layer_idx

        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)

    def forward(
        self,
        hidden_states: torch.Tensor,
        key_value_states: torch.Tensor | None = None,
        past_key_values: Cache | None = None,
        attention_mask: torch.Tensor | None = None,
        attn_prompt: torch.Tensor | None = None,
        output_attentions: bool = False,
        **kwargs,
    ) -> tuple[torch.Tensor, torch.Tensor | None]:
        """Input shape: Batch x Time x Channel"""

        # if key_value_states are provided this layer is used as a cross-attention layer for the decoder
        is_cross_attention = key_value_states is not None

        bsz, tgt_len, _ = hidden_states.size()

        # get query proj
        query_states = self.q_proj(hidden_states) * self.scaling

        is_updated = False
        if past_key_values is not None:
            if isinstance(past_key_values, EncoderDecoderCache):
                is_updated = past_key_values.is_updated.get(self.layer_idx)
                if is_cross_attention:
                    # after the first generated id, we can subsequently re-use all key/value_states from cache
                    curr_past_key_value = past_key_values.cross_attention_cache
                else:
                    curr_past_key_value = past_key_values.self_attention_cache
            else:
                curr_past_key_value = past_key_values

        # use key_value_states if cross attention
        current_states = key_value_states if is_cross_attention else hidden_states
        if is_cross_attention and past_key_values is not None and is_updated:
            # reuse k, v, cross attentions
            key_states = curr_past_key_value.layers[self.layer_idx].keys
            value_states = curr_past_key_value.layers[self.layer_idx].values
        else:
            key_states = self.k_proj(current_states)
            value_states = self.v_proj(current_states)
            key_states = key_states.view(bsz, -1, self.num_heads, self.head_dim).transpose(1, 2)
            value_states = value_states.view(bsz, -1, self.num_heads, self.head_dim).transpose(1, 2)

            if past_key_values is not None:
                # save all key/value_states to cache to be re-used for fast auto-regressive generation
                key_states, value_states = curr_past_key_value.update(key_states, value_states, self.layer_idx)
                # set flag so this layer's cross-attention states can be re-used in subsequent calls
                if is_cross_attention and isinstance(past_key_values, EncoderDecoderCache):
                    past_key_values.is_updated[self.layer_idx] = True

        if attn_prompt is not None:
            key_states = torch.cat([attn_prompt[0].expand(bsz, -1, -1, -1), key_states], dim=2)
            value_states = torch.cat([attn_prompt[1].expand(bsz, -1, -1, -1), value_states], dim=2)
            if attention_mask is not None:
                prompt_mask = torch.zeros(bsz, 1, tgt_len, attn_prompt[0].size(1)).to(attention_mask.device)
                attention_mask = torch.cat([prompt_mask, attention_mask], dim=-1)

        proj_shape = (bsz * self.num_heads, -1, self.head_dim)
        query_states = query_states.view(bsz, tgt_len, self.num_heads, self.head_dim).transpose(1, 2)
        query_states = query_states.reshape(*proj_shape)
        key_states = key_states.reshape(*proj_shape)
        value_states = value_states.reshape(*proj_shape)

        src_len = key_states.size(1)
        attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))

        if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
            raise ValueError(
                f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
                f" {attn_weights.size()}"
            )

        if attention_mask is not None:
            if attention_mask.size() != (bsz, 1, tgt_len, src_len):
                raise ValueError(
                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
                )
            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

        attn_weights = nn.functional.softmax(attn_weights, dim=-1)

        if output_attentions:
            # keep a reshaped copy so that the returned attention weights retain their gradient
            attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
            attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
        else:
            attn_weights_reshaped = None

        attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)

        attn_output = torch.bmm(attn_probs, value_states)

        if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is"
                f" {attn_output.size()}"
            )

        attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
        attn_output = attn_output.transpose(1, 2)
        attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)

        attn_output = self.out_proj(attn_output)

        return attn_output, attn_weights_reshaped


class MvpEncoderLayer(GradientCheckpointingLayer):
    def __init__(self, config: MvpConfig):
        super().__init__()
        self.embed_dim = config.d_model
        self.self_attn = MvpAttention(
            embed_dim=self.embed_dim,
            num_heads=config.encoder_attention_heads,
            dropout=config.attention_dropout,
        )
        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
        self.dropout = config.dropout
        self.activation_fn = ACT2FN[config.activation_function]
        self.activation_dropout = config.activation_dropout
        self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim)
        self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim)
        self.final_layer_norm = nn.LayerNorm(self.embed_dim)

    def forward(
        self,
        hidden_states: torch.FloatTensor,
        attention_mask: torch.FloatTensor,
        self_attn_prompt: torch.FloatTensor,
        output_attentions: bool | None = False,
    ) -> tuple[torch.FloatTensor, torch.FloatTensor | None]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            self_attn_prompt (`torch.FloatTensor`): prompt of self attention of shape
                `(2, encoder_attention_heads, pro_len, head_dim)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        """
        residual = hidden_states
        hidden_states, attn_weights = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            attn_prompt=self_attn_prompt,
            output_attentions=output_attentions,
        )
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = residual + hidden_states
        hidden_states = self.self_attn_layer_norm(hidden_states)

        residual = hidden_states
        hidden_states = self.activation_fn(self.fc1(hidden_states))
        hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
        hidden_states = self.fc2(hidden_states)
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = residual + hidden_states
        hidden_states = self.final_layer_norm(hidden_states)

        if hidden_states.dtype == torch.float16 and (
            torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any()
        ):
            clamp_value = torch.finfo(hidden_states.dtype).max - 1000
            hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)

        return hidden_states, attn_weights


class MvpDecoderLayer(GradientCheckpointingLayer):
    def __init__(self, config: MvpConfig, layer_idx: int | None = None):
        super().__init__()
        self.embed_dim = config.d_model

        self.self_attn = MvpAttention(
            embed_dim=self.embed_dim,
            num_heads=config.decoder_attention_heads,
            dropout=config.attention_dropout,
            is_decoder=True,
            layer_idx=layer_idx,
        )
        self.dropout = config.dropout
        self.activation_fn = ACT2FN[config.activation_function]
        self.activation_dropout = config.activation_dropout

        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
        self.encoder_attn = MvpAttention(
            self.embed_dim,
            config.decoder_attention_heads,
            dropout=config.attention_dropout,
            is_decoder=True,
            layer_idx=layer_idx,
        )
        self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim)
        self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim)
        self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim)
        self.final_layer_norm = nn.LayerNorm(self.embed_dim)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor | None = None,
        encoder_hidden_states: torch.Tensor | None = None,
        encoder_attention_mask: torch.Tensor | None = None,
        self_attn_prompt: torch.Tensor | None = None,
        cross_attn_prompt: torch.Tensor | None = None,
        past_key_values: Cache | None = None,
        output_attentions: bool | None = False,
        use_cache: bool | None = True,
    ) -> tuple[torch.FloatTensor, tuple[torch.FloatTensor, torch.FloatTensor] | None]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            encoder_hidden_states (`torch.FloatTensor`):
                cross attention input to the layer of shape `(batch, seq_len, embed_dim)`
            encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            self_attn_prompt (`torch.FloatTensor`): prompt of self attention of shape
                `(2, decoder_attention_heads, pro_len, head_dim)`.
            cross_attn_prompt (`torch.FloatTensor`): prompt of cross attention of shape
                `(2, decoder_attention_heads, pro_len, head_dim)`.
            past_key_values (`Cache`): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        """
        residual = hidden_states

        # Self Attention
        hidden_states, self_attn_weights = self.self_attn(
            hidden_states=hidden_states,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            attn_prompt=self_attn_prompt,
            output_attentions=output_attentions,
        )
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = residual + hidden_states
        hidden_states = self.self_attn_layer_norm(hidden_states)

        # Cross-Attention Block
        cross_attn_weights = None
        if encoder_hidden_states is not None:
            residual = hidden_states

            hidden_states, cross_attn_weights = self.encoder_attn(
                hidden_states=hidden_states,
                key_value_states=encoder_hidden_states,
                attention_mask=encoder_attention_mask,
                attn_prompt=cross_attn_prompt,
                past_key_values=past_key_values,
                output_attentions=output_attentions,
            )
            hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
            hidden_states = residual + hidden_states
            hidden_states = self.encoder_attn_layer_norm(hidden_states)

        # Fully Connected
        residual = hidden_states
        hidden_states = self.activation_fn(self.fc1(hidden_states))
        hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
        hidden_states = self.fc2(hidden_states)
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = residual + hidden_states
        hidden_states = self.final_layer_norm(hidden_states)

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (self_attn_weights, cross_attn_weights)

        return outputs


class MvpClassificationHead(nn.Module):
    """Head for sentence-level classification tasks."""

    def __init__(
        self,
        input_dim: int,
        inner_dim: int,
        num_classes: int,
        pooler_dropout: float,
    ):
        super().__init__()
        self.dense = nn.Linear(input_dim, inner_dim)
        self.dropout = nn.Dropout(p=pooler_dropout)
        self.out_proj = nn.Linear(inner_dim, num_classes)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.dense(hidden_states)
        hidden_states = torch.tanh(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.out_proj(hidden_states)
        return hidden_states


class MvpPrompt(nn.Module):
    """Layer-wise prompt for encoder or decoder."""

    def __init__(self, config, num_layers, num_heads):
        super().__init__()
        self.prompt_length = config.prompt_length
        self.num_layers = num_layers
        self.num_heads = num_heads
        self.head_dim = config.d_model // num_heads
        self.dropout = nn.Dropout(p=config.dropout)
        self.prompt_embedding = nn.Embedding(config.prompt_length, config.d_model)
        self.prompt_trans = nn.Sequential(
            nn.Linear(config.d_model, config.prompt_mid_dim),
            nn.GELU(),
            nn.Linear(config.prompt_mid_dim, num_layers * 2 * config.d_model),
        )

    def forward(self, prompt_ids: torch.Tensor) -> tuple[torch.Tensor]:
        prompt = self.prompt_trans(self.prompt_embedding(prompt_ids))
        prompt = prompt.view(self.prompt_length, self.num_layers * 2, self.num_heads, self.head_dim)
        prompt = self.dropout(prompt)
        prompt = prompt.permute([1, 2, 0, 3]).split(2)
        return prompt


@auto_docstring
class MvpPreTrainedModel(PreTrainedModel):
    config: MvpConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True

    def _init_weights(self, module):
        super()._init_weights(module)
        if isinstance(module, MvpForConditionalGeneration):
            init.zeros_(module.final_logits_bias)

    @property
    def dummy_inputs(self):
        pad_token = self.config.pad_token_id
        input_ids = torch.tensor([[0, 6, 10, 4, 2], [0, 8, 12, 2, pad_token]], device=self.device)
        dummy_inputs = {
            "attention_mask": input_ids.ne(pad_token),
            "input_ids": input_ids,
        }
        return dummy_inputs
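
# NOTE (added comment, illustrative summary): when `use_prompt=True`, `MvpPrompt(config, num_layers, num_heads)`
# yields one prompt tensor per layer of shape (2, num_heads, prompt_length, head_dim); the encoder and decoder
# below pass the i-th entry to layer i as `self_attn_prompt` / `cross_attn_prompt`, and `MvpAttention` prepends
# it to the projected key and value states.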
R                  S-  S	\
R                  S-  S
\
R                  S-  S\S-  S\S-  S\S-  S\\-  4S jjrSrU =r$ )
MvpEncoderi  z
Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
[`MvpEncoderLayer`].

Args:
    config: MvpConfig
    embed_tokens (nn.Embedding): output embedding
    use_prompt (bool): whether to use prompt
Nr   embed_tokens
use_promptc                 <  > [         TU ]  U5        UR                  U l        UR                  U l        UR
                  nUR                  U l        UR                  U l	        UR                  (       a  [        R                  " U5      OSU l        [        R                  " UR                   X@R                  5      U l        [%        UR                  U5      U l        [        R(                  " [+        UR,                  5       Vs/ s H  n[/        U5      PM     sn5      U l        [        R2                  " U5      U l        X0l        U(       a7  UR8                  U l        [;        UUR,                  UR<                  5      U l        SU l         U RC                  5         g s  snf )N      ?F)"r5   r6   rX   encoder_layerdrop	layerdropr   r   padding_idxmax_position_embeddingsmax_source_positionsscale_embeddingmathsqrtembed_scaler   r   
vocab_sizer  r-   embed_positions
ModuleListrangeencoder_layersr   ry   r   layernorm_embeddingr  r   r   r   r   gradient_checkpointing	post_init)r7   r   r  r  rV   r   r8   s         r)   r6   MvpEncoder.__init__  s6    ~~11NN	!..$*$B$B!393I3I499Y/sLL):):IGWGWX<** 
 mmeFLaLaFb$cFb_V%<Fb$cd#%<<	#: $!'!5!5D$-%%..%D! ',# %ds   Fr   ri   inputs_embedsrk   output_hidden_statesreturn_dictrl   c                 "   Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nUb  Ub  [	        S5      eUb$  UnUR
                  n	UR                  SU	S   5      nO.Ub   UR                  5       SS n	USS2SS2S4   nO[	        S5      eUc  U R                  U5      U R                  -  nU R                  U5      n
X:-   nU R                  U5      n[        R                  R                  XR                  U R                  S9nU R                   (       aJ  ["        R$                  " U R&                  5      R)                  U R*                  5      nU R-                  U5      nUb  [/        U R                   UUS9nU(       a  SOSnU(       a  SOSn[1        U R2                  5       H  u  nnU(       a  X4-   nSnU R                  (       a(  ["        R4                  " / 5      nUU R6                  :  a  S	nU(       a  S
nO$U" UUU R                   (       a  WU   OSUS9nUS   nU(       d  M  UUS   4-   nM     U(       a  X4-   nU(       d  [9        S XU4 5       5      $ [;        XUS9$ )ap  
Args:
    input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
        Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
        provide it.

        Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
        [`PreTrainedTokenizer.__call__`] for details.

        [What are input IDs?](../glossary#input-ids)
    attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
        Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

        - 1 for tokens that are **not masked**,
        - 0 for tokens that are **masked**.

        [What are attention masks?](../glossary#attention-mask)
    inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
        Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
        This is useful if you want more control over how to convert `input_ids` indices into associated vectors
        than the model's internal embedding lookup matrix.
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
    output_hidden_states (`bool`, *optional*):
        Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
        for more detail.
    return_dict (`bool`, *optional*):
        Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
NzDYou cannot specify both input_ids and inputs_embeds at the same timer"   z5You have to specify either input_ids or inputs_embedsrp   )r   r"  ri   r  FT)NN)r   rk   r   r   c              3   .   #    U  H  oc  M  Uv   M     g 7fr   r  .0vs     r)   	<genexpr>%MvpEncoder.forward.<locals>.<genexpr>{  s     e$Sq$Ss   	last_hidden_staterf   
attentions)r   rk   r#  r$  r&   r$   r|   rs   r  r  r  r  r   r   rX   rr   r  r?   r@   r   r   r>   r   r   	enumeratery   randr  r   r   )r7   r   ri   r"  rk   r#  r$  r   inputinput_shape	embed_posrf   r   r   encoder_statesall_attentionsidxencoder_layerto_dropdropout_probabilitylayer_outputss                        r)   rE   MvpEncoder.forward
  s   P 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++BYBY  ]%>cdd"E++K!r;r?;I&',,.s3K!!Q(+ETUU  --i84;K;KKM((/	%100?--m||VZVcVc-d ??d&8&89<<T[[IJ#44Z@ %6{{+-N  40d"+DKK"8C#!/2B!BG}}&+jjn#&7"G , -!"?C&6s&;TX&7	! !.a 0  !/=3C2E!E/ #92  +.>>Ne]N$Seee+Vd
 	
r+   )rX   r  r  r  r  r  r  ry   r  r  r   r   r  NF)NNNNNN)rI   rJ   rK   rL   rM   r   r   r   r   r6   r?   
LongTensorrO   r   r   r   rE   rP   rQ   rR   s   @r)   r  r    s    y t8K `dgk`k  F .2.226)-,0#'t
##d*t
 t+t
 ((4/	t

  $;t
 #Tkt
 D[t
 
	 t
 t
r+   r  c                   6  ^  \ rS rSrSrSS\S\S-  4U 4S jjjr          SS\R                  S-  S\R                  S-  S	\R                  S-  S
\R                  S-  S\S-  S\R                  S-  S\S-  S\S-  S\S-  S\S-  S\\-  4S jjrSrU =r$ )
MvpDecoderi  z
Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`MvpDecoderLayer`]

Args:
    config: MvpConfig
    embed_tokens (nn.Embedding): output embedding
    use_prompt (bool): whether to use prompt
r   r  Nc           
        > [         TU ]  U5        UR                  U l        UR                  U l        UR
                  U l        UR                  U l        UR                  (       a   [        R                  " UR                  5      OSU l        [        R                  " UR                   UR                  U R                  5      U l        [%        UR                  UR                  5      U l        [        R(                  " [+        UR,                  5       Vs/ s H  n[/        XS9PM     sn5      U l        [        R2                  " UR                  5      U l        X l        U(       a]  UR8                  U l        [;        UUR,                  UR<                  5      U l        [;        UUR,                  UR<                  5      U l         SU l!        U RE                  5         g s  snf )Nr  )r[   F)#r5   r6   rX   decoder_layerdropr  r   r  r  max_target_positionsr  r  r  r   r  r   r   r  r  r-   r  r  r  decoder_layersr   ry   r   r  r  r   r   r   r   r   r  r   )r7   r   r  ir8   s       r)   r6   MvpDecoder.__init__  sf    ~~11!..$*$B$B!8>8N8N499V^^4TWLL):):FNNDL\L\]<**NN 
 mmSXY_YnYnSo$pSoa_V%ISo$pq#%<<#? $!'!5!5D$-%%..%D!
 &/%%..&D" ',#' %qs   Gr   ri   r   r   rh   r"  r   rk   r#  r$  rl   c                    Ub  UOU R                   R                  nU	b  U	OU R                   R                  n	Ub  UOU R                   R                  nU
b  U
OU R                   R                  n
Ub  Ub  [        S5      eUb$  UnUR                  nUR                  SUS   5      nO.Ub   UR                  5       SS nUSS2SS2S4   nO[        S5      eUc  U R                  U5      U R                  -  nU R                  (       a/  U R                  (       a  U(       a  [        R                  S5        SnU(       ab  Uc_  Uc  U R                   R                  (       a.  [!        [#        U R                   S9[#        U R                   S95      O[#        U R                   S9nUb  UR%                  5       OSn['        U R                   UUUS	9nUb  Ub  [)        U R                   UUUS
9nU R+                  X5      nXo-   nU R-                  U5      n[.        R0                  R3                  UU R2                  U R                  S9nU R4                  (       a[  [6        R8                  " U R:                  5      R=                  U R>                  5      nU RA                  U5      nU RC                  U5      nU	(       a  SOSnU(       a  SOSnU(       a  Ub  SOSn[E        U RF                  5       H  u  nnU	(       a  UU4-  nU R                  (       a(  [6        RH                  " / 5      nUU RJ                  :  a  ML  U" UUUUU R4                  (       a  WU   OSU R4                  (       a  WU   OSUUUS9	nUS   nU(       d  M  UUS   4-  nUc  M  UUS   4-  nM     U	(       a  UU4-  nU
(       d  [M        S UUUUU4 5       5      $ [O        UUUUUS9$ )a  
Args:
    input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
        Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
        provide it.

        Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
        [`PreTrainedTokenizer.__call__`] for details.

        [What are input IDs?](../glossary#input-ids)
    attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
        Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

        - 1 for tokens that are **not masked**,
        - 0 for tokens that are **masked**.

        [What are attention masks?](../glossary#attention-mask)
    encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
        Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
        of the decoder.
    encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
        Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
        selected in `[0, 1]`:

        - 1 for tokens that are **not masked**,
        - 0 for tokens that are **masked**.

        [What are attention masks?](../glossary#attention-mask)
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
        cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.

        If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
        that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
        all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
    inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
        Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
        This is useful if you want more control over how to convert `input_ids` indices into associated vectors
        than the model's internal embedding lookup matrix.
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
    output_hidden_states (`bool`, *optional*):
        Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
        for more detail.
    return_dict (`bool`, *optional*):
        Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
NzTYou cannot specify both decoder_input_ids and decoder_inputs_embeds at the same timer"   zEYou have to specify either decoder_input_ids or decoder_inputs_embedszZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...F)r   r   )r   r"  ri   rh   )r   r"  ri   r   rp   r  )r   r   r   rh   rk   r   r   r3   c              3   0   #    U  H  nUc  M  Uv   M     g 7fr   r  r'  s     r)   r*  %MvpDecoder.forward.<locals>.<genexpr>W  s      rA rs   	)r-  rh   rf   r.  cross_attentions)(r   rk   r#  r   r$  r&   r$   r|   rs   r  r  r  rr   loggerwarning_onceis_encoder_decoderr   r   get_seq_lengthr   r   r  r  r   r   rX   r  r?   r@   r   r   r>   r   r   r/  ry   r0  r  r   r   )r7   r   ri   r   r   rh   r"  r   rk   r#  r$  r   r1  r2  r:   	positionsrf   r   r   r   all_hidden_statesall_self_attnsall_cross_attentionsr6  decoder_layerr9  r:  s                              r)   rE   MvpDecoder.forward  s   @ 2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	%0%<k$++BYBY  ]%>stt"E#//K!r;r?;I&',,.s3K!!Q(+Edee  --i84;K;KKM&&4==##p "	0 )48V8V $L$DlZ^ZeZeFfg!5  FUE`!?!?!Afg+;;')+	
 !,1G1S%>{{+5&;	&" ((G	%100?--mt||VZVcVc-d ??d&8&89<<T[[IJ#44Z@ $ 6 6z B #7BD0d&7<Q<]rdh"+DKK"8C#!m%55!}}&+jjn#&7)%'=;???"23"7PT=A__#4S#9RV /"3#
M *!,M  =#3"55(4(]1-=,??(3 #98  -!11 ':K^]qr  
 9+++%1
 	
r+   )r   rX   r  r  r  r  r  r  ry   rB  r  r   r   r  r   )
NNNNNNNNNN)rI   rJ   rK   rL   rM   r   r   r6   r?   r=  rO   r   r
   r   r   rE   rP   rQ   rR   s   @r)   r?  r?    s    y  dTk    H .2.2:>:>(,26!%)-,0#'u
##d*u
 t+u
  %0047	u

 !& 0 04 7u
 u
 ((4/u
 $;u
  $;u
 #Tku
 D[u
 
:	:u
 u
r+   r?  c                     ^  \ rS rSrS/rSSS.rS\4U 4S jjrS rS r	S	 r
\            SS\R                  S
-  S\R                  S
-  S\R                  S
-  S\R                  S
-  S\\R                      S
-  S\S
-  S\R                   S
-  S\R                   S
-  S\S
-  S\S
-  S\S
-  S\S
-  S\\-  4S jj5       rSrU =r$ )MvpModelie  r   zshared.weight)zencoder.embed_tokens.weightzdecoder.embed_tokens.weightr   c                 H  > [         TU ]  U5        UR                  UR                  p2UR                  U l        [
        R                  " X1R                  U5      U l        [        XR                  5      U l
        [        XR                  5      U l        U R                  5         g r   )r5   r6   r   r  r  r   r   r   sharedr  encoderr?  decoderr   )r7   r   r  r  r8   s       r)   r6   MvpModel.__init__m  sv     "("5"5v7H7HZ ++ll:~~{K!&*;*;<!&*;*;< 	r+   c                     U R                   $ r   )rW  r7   s    r)   get_input_embeddingsMvpModel.get_input_embeddingsz  s    {{r+   c                 |    Xl         U R                   U R                  l        U R                   U R                  l        g r   )rW  rX  r  rY  r7   values     r)   set_input_embeddingsMvpModel.set_input_embeddings}  s'    $(KK!$(KK!r+   c                 4   U R                   (       d   S5       eU R                  S5        U R                  R                  R                  S5        U R                  R                  R                  S5        U R                  R
                  R                  S5        g )NzHIf you want to use lightweight tuning, make sure that `use_prompt=True`.FT)r  requires_grad_rX  r   rY  r   r\  s    r)   set_lightweight_tuningMvpModel.set_lightweight_tuning  sj    j jjE"%%44T:%%44T:&&55d;r+   Nr   ri   decoder_input_idsdecoder_attention_maskencoder_outputsrh   r"  decoder_inputs_embedsr   rk   r#  r$  rl   c                 J   UcE  UcB  Uc  [        S5      e[        XR                  R                  U R                  R                  5      nU
b  U
OU R                  R
                  n
Ub  UOU R                  R                  nU	b  U	OU R                  R                  n	Ub  UOU R                  R                  nUc  U R                  UUUU
UUS9nORU(       aK  [        U[        5      (       d6  [        US   [        U5      S:  a  US   OS[        U5      S:  a  US   OSS9nU R                  UUUS   UUUU	U
UUS9
nU(       d  X-   $ [        UR                  UR                   UR"                  UR$                  UR&                  UR                  UR"                  UR$                  S	9$ )
aA  
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Indices of decoder input sequence tokens in the vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are decoder input IDs?](../glossary#decoder-input-ids)

    Mvp uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
    is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

    For translation and summarization training, `decoder_input_ids` should be provided. If no
    `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
    for denoising pre-training following the paper.
decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
    be used by default.

    If you want to change padding behavior, you should read [`modeling_mvp._prepare_decoder_attention_mask`]
    and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more
    information on the default strategy.
NzIf no `decoder_input_ids` or `decoder_inputs_embeds` are passed, `input_ids` cannot be `None`. Please pass either `input_ids` or `decoder_input_ids` or `decoder_inputs_embeds`.)r   ri   r"  rk   r#  r$  r   r   r3   r,  
r   ri   r   r   rh   r"  r   rk   r#  r$  )r-  rh   decoder_hidden_statesdecoder_attentionsrI  encoder_last_hidden_stater   encoder_attentions)r&   r*   r   r   r    rk   r#  r   r$  rX  rt   r   lenrY  r   r-  rh   rf   r.  rI  )r7   r   ri   rh  ri  rj  rh   r"  rk  r   rk   r#  r$  r   decoder_outputss                  r)   rE   MvpModel.forward  s   T $)>)F  U  !3;;33T[[5W5W! 2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	%0%<k$++BYBY""ll#-+"3%9' + O O_!M!M-"1!"4474H14Loa0RV14_1E1I?1-tO ,,'1"1!"4#1+//!5# ' 
 "44!-??+;;"1"?"?.99,==&5&G&G"1"?"?.99	
 		
r+   )rY  rX  rW  r  NNNNNNNNNNNN)rI   rJ   rK   rL   "_keys_to_ignore_on_load_unexpected_tied_weights_keysr   r6   r]  rb  rf  r   r?   r=  rO   listr   r
   r   r   r   rE   rP   rQ   rR   s   @r)   rU  rU  e  sh   *=)>&'6'6
y 0
<  .2.259:>:>(,26:>!%)-,0#'g
##d*g
 t+g
 !++d2	g

 !& 0 04 7g
 e//047g
 g
 ((4/g
  %0047g
 $;g
  $;g
 #Tkg
 D[g
 
#	#g
 g
r+   rU  ze
    The MVP Model with a language modeling head. Can be used for various text generation tasks.
    )custom_introc                     ^  \ rS rSrSS0rS\4U 4S jjr SS\S\S-  S	\S
\	R                  4U 4S jjjrS\S
S4S jrS r\             SS\R                   S-  S\R"                  S-  S\R                   S-  S\R                   S-  S\\R&                     S-  S\S-  S\R&                  S-  S\R&                  S-  S\R                   S-  S\S-  S\S-  S\S-  S\S-  S
\\-  4S jj5       rS\R"                  4S jrSrU =r$ ) r   i  lm_head.weightzmodel.shared.weightr   c                 v  > [         TU ]  U5        [        U5      U l        U R	                  S[
        R                  " SU R                  R                  R                  45      5        [        R                  " UR                  U R                  R                  R                  SS9U l        U R                  5         g )Nr   r   Fr]   )r5   r6   rU  r   register_bufferr?   r   rW  r/   r   r`   r   lm_headr   r   s     r)   r6   $MvpForConditionalGeneration.__init__  s     f%
0%++q$**BSBSBbBb>c2deyy1B1B1Q1QX]^ 	r+   Nnew_num_tokenspad_to_multiple_ofmean_resizingrl   c                 J   > [         TU ]  XU5      nU R                  U5        U$ r   )r5   resize_token_embeddings_resize_final_logits_bias)r7   r  r  r  new_embeddingsr8   s        r)   r  3MvpForConditionalGeneration.resize_token_embeddings  s+     8]jk&&~6r+   c                 ,   U R                   R                  S   nX::  a  U R                   S S 2S U24   nON[        R                  " SX-
  4U R                   R                  S9n[        R
                  " U R                   U/SS9nU R                  SU5        g )Nr"   r   r   rn   r   )r   r$   r?   r   r>   r   r}  )r7   r  old_num_tokensnew_bias
extra_biass        r)   r  5MvpForConditionalGeneration._resize_final_logits_bias  s    //55b9+--a..@AHa)H%IRVRhRhRoRopJyy$"8"8*!E1MH0(;r+   c                 n    U R                   R                  5         U R                  R                  S5        g r<  r   rf  r~  re  r\  s    r)   rf  2MvpForConditionalGeneration.set_lightweight_tuning  $    

))+##E*r+   r   ri   rh  ri  rj  rh   r"  rk  labelsr   rk   r#  r$  c                    Ub  UOU R                   R                  nU	bX  U
(       a  [        R                  S5        Sn
Uc7  Uc4  [	        XR                   R
                  U R                   R                  5      nU R                  UUUUUUUUU
UUUS9nU R                  US   5      U R                  -   nSnU	bF  [        5       nU" UR                  SU R                   R                  5      U	R                  S5      5      nU(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  UR                   UR"                  UR$                  UR&                  UR(                  S9	$ )	a
  
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Indices of decoder input sequence tokens in the vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are decoder input IDs?](../glossary#decoder-input-ids)

    Mvp uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
    is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

    For translation and summarization training, `decoder_input_ids` should be provided. If no
    `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
    for denoising pre-training following the paper.
decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
    be used by default.

    If you want to change padding behavior, you should read [`modeling_mvp._prepare_decoder_attention_mask`]
    and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more
    information on the default strategy.
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

Example of summarization:

Fine-tuning a model
```python
>>> import torch
>>> from transformers import AutoTokenizer, MvpForConditionalGeneration

>>> tokenizer = AutoTokenizer.from_pretrained("RUCAIBox/mvp")
>>> model = MvpForConditionalGeneration.from_pretrained("RUCAIBox/mvp")

>>> inputs = tokenizer(
...     "Summarize: You may want to stick it to your boss and leave your job, but don't do it if these are your reasons.",
...     return_tensors="pt",
... )
>>> labels = tokenizer("Bad Reasons To Quit Your Job", return_tensors="pt")["input_ids"]

>>> loss = model(**inputs, labels=labels).loss
>>> loss.backward()
```

Inference after the model fine-tuned
```python
>>> with torch.no_grad():
...     generated_ids = model.generate(**inputs)

>>> generated_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
```
NzJThe `use_cache` argument is changed to `False` since `labels` is provided.F)ri   rh  rj  ri  rh   r"  rk  r   rk   r#  r$  r   r"   r   	losslogitsrh   rn  ro  rI  rp  r   rq  )r   r$  rJ  warningr*   r   r    r   r~  r   r   r|   r  r   rh   rn  ro  rI  rp  r   rq  )r7   r   ri   rh  ri  rj  rh   r"  rk  r  r   rk   r#  r$  r   r   	lm_logitsmasked_lm_lossloss_fctoutputs                       r)   rE   #MvpForConditionalGeneration.forward  s   R &1%<k$++BYBYklI (-B-J$6KK44dkk6X6X%! **)/+#9+'"7/!5#  
 LL,t/E/EE	')H%innR9O9O&PRXR]R]^`RabN\GABK/F3A3M^%.YSYY#33")"?"?&99$55&-&G&G")"?"?&99

 
	
r+   c                 j    [        XR                  R                  U R                  R                  5      $ r   )r*   r   r   r    )r7   r  s     r)   %prepare_decoder_input_ids_from_labelsAMvpForConditionalGeneration.prepare_decoder_input_ids_from_labels  s#    !&++*B*BDKKDfDfggr+   r~  r   )NTNNNNNNNNNNNNN)rI   rJ   rK   rL   rw  r   r6   rN   r   r   r   r  r  rf  r   r?   r=  rO   rx  r   r
   r   r   rE   r  rP   rQ   rR   s   @r)   r   r     s    	/y  ae!7:TzY]	 < < <+  .2.259:>:>(,26:>*.!%)-,0#'v
##d*v
 t+v
 !++d2	v

 !& 0 04 7v
 e//047v
 v
 ((4/v
  %0047v
   4'v
 $;v
  $;v
 #Tkv
 D[v
  
	 !v
 v
phELL h hr+   r   z
    Mvp model with a sequence classification/head on top (a linear layer on top of the pooled output) e.g. for GLUE
    tasks.
    c                     ^  \ rS rSrS\4U 4S jjrS r\            SS\R                  S-  S\R                  S-  S\R                  S-  S	\R                  S-  S
\\R                     S-  S\R                  S-  S\R                  S-  S\R                  S-  S\S-  S\S-  S\S-  S\S-  S\\-  4S jj5       rSrU =r$ )MvpForSequenceClassificationi  r   c                    > [         TU ]  " U40 UD6  [        U5      U l        [	        UR
                  UR
                  UR                  UR                  5      U l        U R                  5         g r   )
r5   r6   rU  r   r   r   
num_labelsclassifier_dropoutclassification_headr   )r7   r   r   r8   s      r)   r6   %MvpForSequenceClassification.__init__  sZ    *6*f%
#8NNNN%%	$
  	r+   c                 n    U R                   R                  5         U R                  R                  S5        g r<  )r   rf  r  re  r\  s    r)   rf  3MvpForSequenceClassification.set_lightweight_tuning  s&    

))+  //6r+   Nr   ri   rh  ri  rj  r"  rk  r  r   rk   r#  r$  rl   c                    Ub  UOU R                   R                  nUb  Sn	Uc%  Ub"  [        SU R                  R                   35      eU R                  UUUUUUUU	U
UUS9nUS   nUR                  U R                   R                  5      R                  UR                  5      n[        [        R                  " UR                  S5      5      R                  5       S:H  S5        UUSS24   R                  UR!                  S5      SUR!                  S5      5      SS2SSS24   nU R#                  U5      nSnUGb  U R                   R$                  c  U R                   R&                  S:X  a  S	U R                   l        OyU R                   R&                  S:  aN  UR(                  [        R*                  :X  d  UR(                  [        R,                  :X  a  S
U R                   l        OSU R                   l        U R                   R$                  S	:X  aT  [/        5       nU R                   R&                  S:X  a&  U" UR1                  5       UR1                  5       5      nOU" UU5      nOU R                   R$                  S
:X  aG  [3        5       nU" UR                  SU R                   R&                  5      UR                  S5      5      nO-U R                   R$                  S:X  a  [5        5       nU" UU5      nU(       d  U4USS -   nUb  U4U-   $ U$ [7        UUUR8                  UR:                  UR<                  UR>                  UR@                  URB                  URD                  S9	$ )af	  
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Indices of decoder input sequence tokens in the vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are decoder input IDs?](../glossary#decoder-input-ids)

    Mvp uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
    is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

    For translation and summarization training, `decoder_input_ids` should be provided. If no
    `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
    for denoising pre-training following the paper.
decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
    be used by default.

    If you want to change padding behavior, you should read [`modeling_mvp._prepare_decoder_attention_mask`]
    and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more
    information on the default strategy.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

Example of single-label classification:

Fine-tuning a model on `num_labels` classes
```python
>>> import torch
>>> from transformers import AutoTokenizer, MvpForSequenceClassification

>>> num_labels = 2  # for example, this is a binary classification task
>>> tokenizer = AutoTokenizer.from_pretrained("RUCAIBox/mvp")
>>> model = MvpForSequenceClassification.from_pretrained("RUCAIBox/mvp", num_labels=num_labels)

>>> inputs = tokenizer("Classify: Hello, my dog is cute", return_tensors="pt")
>>> labels = torch.tensor(1)  # the real label for inputs

>>> loss = model(**inputs, labels=labels).loss
>>> loss.backward()
```

Inference after the model has been fine-tuned:
```python
>>> with torch.no_grad():
...     logits = model(**inputs).logits

>>> predicted_class_id = logits.argmax()
```
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        if labels is not None:
            use_cache = False

        if input_ids is None and inputs_embeds is not None:
            raise NotImplementedError(
                f"Passing input embeddings is currently not supported for {self.__class__.__name__}"
            )

        outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            encoder_outputs=encoder_outputs,
            inputs_embeds=inputs_embeds,
            decoder_inputs_embeds=decoder_inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_states = outputs[0]  # last decoder hidden state

        eos_mask = input_ids.eq(self.config.eos_token_id).to(hidden_states.device)

        if len(torch.unique_consecutive(eos_mask.sum(1))) > 1:
            raise ValueError("All examples must have the same number of <eos> tokens.")
        sentence_representation = hidden_states[eos_mask, :].view(
            hidden_states.size(0), -1, hidden_states.size(-1)
        )[:, -1, :]
        logits = self.classification_head(sentence_representation)

        loss = None
        if labels is not None:
            labels = labels.to(logits.device)
            if self.config.problem_type is None:
                if self.config.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.config.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.config.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            output = (logits,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return Seq2SeqSequenceClassifierOutput(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            decoder_hidden_states=outputs.decoder_hidden_states,
            decoder_attentions=outputs.decoder_attentions,
            cross_attentions=outputs.cross_attentions,
            encoder_last_hidden_state=outputs.encoder_last_hidden_state,
            encoder_hidden_states=outputs.encoder_hidden_states,
            encoder_attentions=outputs.encoder_attentions,
        )


class MvpForQuestionAnswering(MvpPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        config.num_labels = 2
        self.num_labels = config.num_labels

        self.model = MvpModel(config)
        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    def set_lightweight_tuning(self):
        self.model.set_lightweight_tuning()
        self.qa_outputs.requires_grad_(False)

    @auto_docstring
    def forward(
        self,
        input_ids: torch.Tensor | None = None,
        attention_mask: torch.Tensor | None = None,
        decoder_input_ids: torch.LongTensor | None = None,
        decoder_attention_mask: torch.LongTensor | None = None,
        encoder_outputs: list[torch.FloatTensor] | None = None,
        start_positions: torch.LongTensor | None = None,
        end_positions: torch.LongTensor | None = None,
        inputs_embeds: torch.FloatTensor | None = None,
        decoder_inputs_embeds: torch.FloatTensor | None = None,
        use_cache: bool | None = None,
        output_attentions: bool | None = None,
        output_hidden_states: bool | None = None,
        return_dict: bool | None = None,
    ) -> tuple | Seq2SeqQuestionAnsweringModelOutput:
        r"""
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Indices of decoder input sequence tokens in the vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are decoder input IDs?](../glossary#decoder-input-ids)

    Mvp uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
    is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

    For translation and summarization training, `decoder_input_ids` should be provided. If no
    `decoder_input_ids` are provided, the model creates this tensor by shifting the `input_ids` to the right,
    as done for denoising pre-training in the paper.
decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
    be used by default.

    If you want to change padding behavior, you should read [`modeling_mvp._prepare_decoder_attention_mask`]
    and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more
    information on the default strategy.
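
A rough sketch of the default mask described above, using 0/1 entries for readability (the actual implementation
builds an additive float mask, but the constraint it encodes is the same; the ids below are illustrative):

```python
import torch

pad_token_id = 1                                           # illustrative pad id
decoder_input_ids = torch.tensor([[2, 71, 503, 1]])        # toy target ids, last position is padding
padding_mask = (decoder_input_ids != pad_token_id).long()  # 0 wherever the target token is padding
seq_len = decoder_input_ids.shape[1]
causal = torch.tril(torch.ones(seq_len, seq_len, dtype=torch.long))  # position i may attend to j <= i
combined = causal[None, :, :] * padding_mask[:, None, :]   # causal constraint, with padded keys masked out
```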

Example:

Fine-tuning a model for extractive question answering; the same checkpoint also supports generative question
answering through `MvpForConditionalGeneration`:
```python
>>> import torch
>>> from transformers import AutoTokenizer, MvpForQuestionAnswering

>>> tokenizer = AutoTokenizer.from_pretrained("RUCAIBox/mvp")
>>> model = MvpForQuestionAnswering.from_pretrained("RUCAIBox/mvp")

>>> inputs = tokenizer(
...     "Answer the following question: Who was Jim Henson? [SEP] Jim Henson was a nice puppet",
...     return_tensors="pt",
... )
>>> target_start_index = torch.tensor([18])
>>> target_end_index = torch.tensor([19])

>>> loss = model(**inputs, start_positions=target_start_index, end_positions=target_end_index).loss
>>> loss.backward()
```

Inference after the model has been fine-tuned:
```python
>>> with torch.no_grad():
...     outputs = model(**inputs)

>>> answer_start_index = outputs.start_logits.argmax()
>>> answer_end_index = outputs.end_logits.argmax()

>>> predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]
>>> predict_answer = tokenizer.decode(predict_answer_tokens)
```
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        if start_positions is not None and end_positions is not None:
            use_cache = False

        outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            encoder_outputs=encoder_outputs,
            inputs_embeds=inputs_embeds,
            decoder_inputs_embeds=decoder_inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = outputs[0]

        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1).contiguous()
        end_logits = end_logits.squeeze(-1).contiguous()

        total_loss = None
        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, split adds an extra dimension
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
            # Positions outside the model inputs are clamped and ignored by the loss
            ignored_index = start_logits.size(1)
            start_positions = start_positions.clamp(0, ignored_index)
            end_positions = end_positions.clamp(0, ignored_index)

            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2

        if not return_dict:
            output = (start_logits, end_logits) + outputs[1:]
            return ((total_loss,) + output) if total_loss is not None else output

        return Seq2SeqQuestionAnsweringModelOutput(
            loss=total_loss,
            start_logits=start_logits,
            end_logits=end_logits,
            past_key_values=outputs.past_key_values,
            decoder_hidden_states=outputs.decoder_hidden_states,
            decoder_attentions=outputs.decoder_attentions,
            cross_attentions=outputs.cross_attentions,
            encoder_last_hidden_state=outputs.encoder_last_hidden_state,
            encoder_hidden_states=outputs.encoder_hidden_states,
            encoder_attentions=outputs.encoder_attentions,
        )


class MvpDecoderWrapper(MvpPreTrainedModel):
    """
This wrapper class is a helper class to correctly load pretrained checkpoints when the causal language model is
used in combination with the [`EncoderDecoderModel`] framework.
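
    For instance, assuming the MVP decoder is reachable through the causal-LM auto mapping in your installed
    version (worth verifying), a warm-started encoder-decoder pair could be built roughly as follows; the encoder
    checkpoint is illustrative:

    ```python
    from transformers import EncoderDecoderModel

    model = EncoderDecoderModel.from_encoder_decoder_pretrained("bert-base-uncased", "RUCAIBox/mvp")
    ```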
    """

    def __init__(self, config):
        super().__init__(config)
        self.decoder = MvpDecoder(config)

    def forward(self, *args, **kwargs):
        return self.decoder(*args, **kwargs)


class MvpForCausalLM(MvpPreTrainedModel, GenerationMixin):
    _tied_weights_keys = {"lm_head.weight": "model.decoder.embed_tokens.weight"}

    def __init__(self, config):
        config.is_decoder = True
        config.is_encoder_decoder = False
        super().__init__(config)
        self.model = MvpDecoderWrapper(config)

        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.model.decoder.embed_tokens

    def set_input_embeddings(self, value):
        self.model.decoder.embed_tokens = value

    def set_lightweight_tuning(self):
        self.model.set_lightweight_tuning()
        self.lm_head.requires_grad_(False)

    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        attention_mask: torch.Tensor | None = None,
        encoder_hidden_states: torch.FloatTensor | None = None,
        encoder_attention_mask: torch.FloatTensor | None = None,
        past_key_values: Cache | None = None,
        inputs_embeds: torch.FloatTensor | None = None,
        labels: torch.LongTensor | None = None,
        use_cache: bool | None = None,
        output_attentions: bool | None = None,
        output_hidden_states: bool | None = None,
        return_dict: bool | None = None,
        logits_to_keep: int | torch.Tensor = 0,
    ) -> tuple | CausalLMOutputWithCrossAttentions:
        r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

Example:

```python
>>> from transformers import AutoTokenizer, MvpForCausalLM

>>> tokenizer = AutoTokenizer.from_pretrained("RUCAIBox/mvp")
>>> model = MvpForCausalLM.from_pretrained("RUCAIBox/mvp")

>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
>>> outputs = model(**inputs)

>>> logits = outputs.logits
>>> list(logits.shape)
[1, 8, 50267]
```Nrm  r   r"   r   )r  r  rh   rf   r.  rI  )r   rk   r#  r$  r   rY  rt   rN   slicer~  r   r|   r  r   rh   rf   r.  rI  )r7   r   ri   r   r   rh   r"  r  r   rk   r#  r$  r  r   r   rf   slice_indicesr  r  r  r  s                        r)   rE   MvpForCausalLM.forward  so   N 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++BYBY **$$)"7#9+'/!5# % 
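
MVP's prompt-based lightweight tuning also applies to this head; a sketch (assuming the `use_prompt` config flag
and the `set_lightweight_tuning` helper of this model family, with prompt weights then trained from scratch):

```python
>>> model = MvpForCausalLM.from_pretrained("RUCAIBox/mvp", use_prompt=True)
>>> # freeze everything except the newly added prompt parameters
>>> model.set_lightweight_tuning()
```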
  
8B>SV8W8W~ot4]kmA}a,?@A')HFKKDKK,B,BCV[[QS_UDY,F'+'7D7V#CVC0#33!//))$55
 	
r+   r  )NNNNNNNNNNNr   )rI   rJ   rK   rL   rw  r6   r]  rb  rf  r   r?   r=  rO   r   r
   r   rN   r   r   rE   rP   rQ   rR   s   @r)   r  r    sO   *,OP	/0+  .2.2:>;?(,26*.!%)-,0#'-.O
##d*O
 t+O
  %0047	O

 !& 1 1D 8O
 O
 ((4/O
   4'O
 $;O
  $;O
 #TkO
 D[O
 ell*O
 
2	2O
 O
r+   r  )r  r   r  r  rU  r   )@rM   r  r?   r   torch.nnr   r   r    r   r   activationsr	   cache_utilsr
   r   r   
generationr   masking_utilsr   r   modeling_layersr   modeling_outputsr   r   r   r   r   r   r   modeling_utilsr   utilsr   r   r   configuration_mvpr   
get_loggerrI   rJ  rO   rN   r*   r   r-   ModulerT   r   r   r   r   r   r  r?  rU  r   r  r  r  r  __all__r  r+   r)   <module>r     s       A A & ! C C ) J 9   . D D ( 
		H	%%,, c [^ ";BLL ;6P2299 P2f:+0 :+zi0 iZBII 0		 2   *`
# `
Fa
# a
H L
! L
 L
^ 
\h"4o \h
\h~ ^
#5 ^
^
B X
0 X
 X
x-* -h
' h
Vr+   