"""PyTorch MPT model."""

import math

import torch
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, LayerNorm, MSELoss
from torch.nn import functional as F

from ...cache_utils import Cache, DynamicCache
from ...generation import GenerationMixin
from ...masking_utils import create_causal_mask
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import (
    BaseModelOutputWithPastAndCrossAttentions,
    CausalLMOutputWithCrossAttentions,
    QuestionAnsweringModelOutput,
    SequenceClassifierOutputWithPast,
    TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...utils import auto_docstring, logging
from .configuration_mpt import MptConfig


logger = logging.get_logger(__name__)


def build_mpt_alibi_tensor(num_heads, sequence_length, alibi_bias_max=8, device=None):
    r"""
    Link to paper: https://huggingface.co/papers/2108.12409 - Alibi tensor is not causal as the original paper mentions, it
    relies on a translation invariance of softmax for quick implementation. This implementation has been copied from
    the alibi implementation of MPT source code that led to slightly different results than the Bloom alibi:
    https://huggingface.co/mosaicml/mpt-7b/blob/main/attention.py#L292
    """
    alibi = torch.arange(1 - sequence_length, 1, dtype=torch.int32, device=device).view(1, 1, 1, sequence_length)
    num_heads_power_of_2 = 2 ** math.ceil(math.log2(num_heads))

    base = torch.arange(1, num_heads_power_of_2 + 1, dtype=torch.int64, device=device).float()
    base = base * (alibi_bias_max / num_heads_power_of_2)

    slopes = 1.0 / torch.pow(2, base)
    slopes = slopes.view(1, num_heads_power_of_2, 1, 1)

    if num_heads_power_of_2 != num_heads:
        # When the number of heads is not a power of two, interleave the slopes
        # and keep only the first `num_heads` of them.
        slopes = torch.concat([slopes[:, 1::2, ...], slopes[:, ::2, ...]], dim=1)[:, :num_heads, ...]

    alibi = alibi * slopes
    return alibi.squeeze(0)
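

# --- Illustrative sketch (hypothetical helper, not part of the upstream file):
# a quick shape and value sanity check for `build_mpt_alibi_tensor`. The
# function returns a `(num_heads, 1, sequence_length)` bias with one fixed
# slope per head; `num_heads=6` exercises the non-power-of-two branch.
def _sketch_check_alibi_tensor():
    alibi = build_mpt_alibi_tensor(num_heads=6, sequence_length=16)
    assert alibi.shape == (6, 1, 16)
    # The last (current) position has bias 0; earlier positions receive
    # increasingly negative bias, scaled per head.
    assert torch.all(alibi[:, :, -1] == 0)
    return alibi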
          ^  \ rS rSrSrSS\S\S-  4U 4S jjjr  SS\R                  S\R                  S	\
S-  S
\R                  S-  4S jjrSrU =r$ )MptAttentionA   zrMulti-head self attention.
Using torch or triton attention implementation enables user to also use additive bias.
Nconfig	layer_idxc                   > [         TU ]  5         UR                  U l        UR                  U l        UR                  U l        U R                  U R                  -  U l        UR                  R                  U l        U R                  c5  S[        R                  " U R                  U R                  -  5      -  U l        UR                  R                  U l        UR                  R                  U l        [        R                  " U R                  SU R                  -  SS9U l        [        R                  " U R                  U R                  SS9U l        X l        g )Nr   r	   Fbias)super__init__hidden_sizen_headsmax_seq_lenmax_seq_lengthhead_dimattn_configsoftmax_scaler#   sqrt
attn_pdropattn_dropout_pclip_qkvr   LinearWqkvout_projr9   )selfr8   r9   	__class__s      r2   r>   MptAttention.__init__F   s   !--~~$00((DLL8#//==%!"TYYt/?/?$,,/N%O!OD$00;;**33IId..D4D4D0D5Q			$"2"2D4D4D5Q"r4   hidden_statesposition_biaspast_key_valuesattention_maskc                 R   UR                   S S u  pgU R                  U5      nU R                  (       a%  UR                  U R                  * U R                  S9nUR	                  SSS9u  pnU	R                  XgU R                  U R                  5      R                  SS5      n	U
R                  XgU R                  U R                  5      R                  SS5      n
UR                  XgU R                  U R                  5      R                  SS5      nUb  UR                  XU R                  5      u  p[        R                  " XR                  SS5      5      U R                  -  nUc  UOXsR                  5       -   nUb  [        UR                   5      S:w  a!  [!        S[        UR                   5       35      eU
R                   S   n[#        S	UR%                  S5      U-
  5      n[#        S	UR%                  S5      U-
  5      nUS S 2US 2US 24   nX-   nUb:  UR'                  U[        R(                  " U	R*                  5      R,                  5      n[.        R0                  R3                  UR5                  5       SS9R7                  UR*                  5      n[.        R0                  R9                  UU R:                  U R<                  S
9n[        R                  " UU5      nUR?                  S	SSS5      RA                  5       RC                  XgS5      nU RE                  U5      nUU4$ )Nr   )minmaxr	   r   r   z6Expecting position_bias shape to be 3 dimensions, got r   ptraining)#shaperK   rI   clampchunkreshaper@   rC   	transposeupdater9   r   matmulrE   get_seq_lengthlen
ValueErrorrV   sizemasked_fillfinfor   rU   r   r   softmaxr'   todropoutrH   r[   permute
contiguousr"   rL   )rM   rP   rQ   rR   rS   kwargs
batch_size
seq_length	mixed_qkvquery_states
key_statesvalue_statesattention_scoresquery_length
key_lengthposition_bias_query_indexposition_bias_key_indexattn_weightscontext_statesattn_outputs                       r2   forwardMptAttention.forwardV   s    "/!4!4Ra!8
IIm,	==!T]]NNI1:1J.,#++JDLLRVR_R_`jjklnop''
dmm\ffghjkl
#++JDLLRVR_R_`jjklnop&'6'='=jX\XfXf'g$J <<6J6J2r6RSVZVhVhh%4%<z*OmOmOoBo$=&&'1, #YZ]^k^q^qZrYs!tuu#))"-J(+A}/A/A!/D|/S(T%&)!]-?-?-BZ-O&P#)!-F-GI`Ia*abM/?%/;;NEKKXdXjXjLkLoLop }},,-=-C-C-E2,NQQR^RdRde}},,\T=P=P[_[h[h,ilLA'//1a;FFHMMjfhimmN3L((r4   )
rK   rH   rI   rC   r?   r9   rB   r@   rL   rE   N)NN)__name__
__module____qualname____firstlineno____doc__r   intr>   r   Tensorr
   r}   __static_attributes____classcell__rN   s   @r2   r6   r6   A   sq    #y #S4Z # #( )-.20)||0) ||0) 	0)
 t+0) 0)r4   r6   c                      ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  S\R                  4S jrSr	U =r
$ )	MptMLP   r8   c                   > [         TU ]  5         UR                  n[        R                  " USU-  SS9U l        [        R                  " SS9U l        [        R                  " SU-  USS9U l        UR                  R                  U l        g )N   Fr;   none)approximate)r=   r>   r?   r   rJ   up_projGELUact	down_projrD   rG   hidden_dropout)rM   r8   r?   rN   s      r2   r>   MptMLP.__init__   sm    ((yya+oEJ77v.1{?KeL$00;;r4   rP   residualreturnc                     U R                  U R                  U5      5      nU R                  U5      n[        R                  " X0R
                  U R                  S9nXB-   nU$ )NrY   )r   r   r   Frk   r   r[   )rM   rP   r   intermediate_outputoutputs        r2   r}   MptMLP.forward   sS    m!<="nn];.2E2EPTP]P]^"r4   )r   r   r   r   )r   r   r   r   r   r>   r   r   r}   r   r   r   s   @r2   r   r      s:    <y <U\\ U\\ ell  r4   r   c                      ^  \ rS rSrSS\S\S-  4U 4S jjjr   SS\R                  S\R                  S\R                  S	\	S-  S
\
S\
4S jjrSrU =r$ )MptBlock   Nr8   r9   c                   > [         TU ]  5         UR                  n[        X1R                  S9U l        S U R
                  l        UR                  U l        [        X5      U l
        [        X1R                  S9U l        S U R                  l        [        U5      U l        UR                  R                  U l        ["        R$                  " U R                   5      U l        g )Neps)r=   r>   r?   r   layer_norm_epsilonnorm_1r<   r@   r+   r6   attnnorm_2r   ffnrD   rG   dropout_rater   Dropoutresid_attn_dropout)rM   r8   r9   r?   rN   s       r2   r>   MptBlock.__init__   s    ((1J1JK 3	1J1JK&>"..99"$**T->->"?r4   rP   rQ   rS   
layer_past	use_cacheoutput_attentionsc                     U R                  U5      nUn	U R                  UUUUS9u  pU R                  U
5      U	-   nU R                  U5      nUn	U R	                  X5      nX4$ )N)rQ   rS   rR   )r   r   r   r   r   )rM   rP   rQ   rS   r   r   r   rn   layernorm_outputr   attn_outputsrz   r   s                r2   r}   MptBlock.forward   s      ;;}5  &*YY')&	 &/ &
" //=H;;}5 ! *5##r4   )r   r   r   r   r   r+   r   r   )NFF)r   r   r   r   r   r   r>   r   r   r
   boolr}   r   r   r   s   @r2   r   r      s    @y @S4Z @ @2 $("'!$||!$ ||!$ 	!$
 DL!$ !$  !$ !$r4   r   c                   .    \ rS rSr% \\S'   SrSrS/rSr	g)MptPreTrainedModel   r8   transformerTr    N)
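

# --- Illustrative sketch (hypothetical helper, not upstream code): during
# cached decoding `MptAttention` receives a short query but keys for the whole
# prefix, so it crops the precomputed ALiBi `position_bias` to its last rows
# and columns. The slicing below mirrors that logic on a plain tensor,
# assuming a 6-head bias built for a 16-token maximum sequence length.
def _sketch_position_bias_cropping(key_length: int = 8):
    # MPT's ALiBi bias has a single query row (shape `(num_heads, 1, seq_len)`),
    # relying on the translation invariance of softmax noted in the docstring
    # of `build_mpt_alibi_tensor` above.
    position_bias = build_mpt_alibi_tensor(num_heads=6, sequence_length=16)
    query_index = max(0, position_bias.size(1) - 1)  # decoding one new token
    key_index = max(0, position_bias.size(2) - key_length)  # keep last keys
    cropped = position_bias[:, query_index:, key_index:]
    assert cropped.shape == (6, 1, key_length)
    return cropped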


@auto_docstring
class MptPreTrainedModel(PreTrainedModel):
    config: MptConfig
    base_model_prefix = "transformer"
    supports_gradient_checkpointing = True
    _no_split_modules = ["MptBlock"]


@auto_docstring
class MptModel(MptPreTrainedModel):
    def __init__(self, config: MptConfig):
        super().__init__(config)

        self.hidden_size = config.hidden_size
        self.num_heads = config.n_heads

        # Embedding + LN Embedding
        self.wte = nn.Embedding(config.vocab_size, self.hidden_size)

        # Transformer blocks
        self.blocks = nn.ModuleList([MptBlock(config, layer_idx=i) for i in range(config.n_layers)])

        # Final Layer Norm
        self.norm_f = LayerNorm(self.hidden_size, eps=config.layer_norm_epsilon)
        # backward compatibility with weights on the Hub
        self.norm_f.bias = None

        self.gradient_checkpointing = False

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.wte

    def build_mpt_alibi_tensor(self, num_heads, sequence_length, alibi_bias_max=8, device=None):
        return build_mpt_alibi_tensor(num_heads, sequence_length, alibi_bias_max, device)

    def set_input_embeddings(self, new_embeddings: torch.Tensor):
        self.wte = new_embeddings

    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        past_key_values: Cache | None = None,
        attention_mask: torch.Tensor | None = None,
        inputs_embeds: torch.Tensor | None = None,
        use_cache: bool | None = None,
        output_attentions: bool | None = None,
        output_hidden_states: bool | None = None,
        return_dict: bool | None = None,
    ) -> tuple[torch.Tensor, ...] | BaseModelOutputWithPastAndCrossAttentions:
        r"""
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values.get_seq_length()`
            (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary.

            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            batch_size, seq_length = input_ids.shape
        elif inputs_embeds is not None:
            batch_size, seq_length, _ = inputs_embeds.shape
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        if self.gradient_checkpointing and self.training and use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
            )
            use_cache = False

        if inputs_embeds is None:
            inputs_embeds = self.wte(input_ids)

        if use_cache and past_key_values is None:
            past_key_values = DynamicCache(config=self.config)

        hidden_states = inputs_embeds

        all_self_attentions = () if output_attentions else None
        all_hidden_states = () if output_hidden_states else None

        alibi = self.build_mpt_alibi_tensor(self.num_heads, self.config.max_seq_len, device=hidden_states.device)

        past_length = past_key_values.get_seq_length() if past_key_values is not None else 0
        cache_position = torch.arange(past_length, past_length + seq_length, device=inputs_embeds.device)

        # `MptAttention` applies the mask with `masked_fill`, so the additive
        # float mask is converted to a boolean "is masked" mask.
        causal_mask = create_causal_mask(
            config=self.config,
            input_embeds=inputs_embeds,
            attention_mask=attention_mask,
            cache_position=cache_position,
            past_key_values=past_key_values,
        ).to(torch.bool)

        for block in self.blocks:
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            outputs = block(
                hidden_states,
                layer_past=past_key_values,
                attention_mask=causal_mask,
                use_cache=use_cache,
                output_attentions=output_attentions,
                position_bias=alibi,
            )

            hidden_states = outputs[0]

            if output_attentions:
                all_self_attentions = all_self_attentions + (outputs[1],)

        # Add last hidden state
        hidden_states = self.norm_f(hidden_states)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(
                v for v in [hidden_states, past_key_values, all_hidden_states, all_self_attentions] if v is not None
            )

        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )


@auto_docstring(
    custom_intro="""
    The MPT Model transformer with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    """
)
class MptForCausalLM(MptPreTrainedModel, GenerationMixin):
    _tied_weights_keys = {"lm_head.weight": "transformer.wte.weight"}

    def __init__(self, config: MptConfig):
        super().__init__(config)
        self.transformer = MptModel(config)
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def set_output_embeddings(self, new_embeddings: torch.Tensor):
        self.lm_head = new_embeddings

    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        past_key_values: Cache | None = None,
        attention_mask: torch.Tensor | None = None,
        inputs_embeds: torch.Tensor | None = None,
        labels: torch.Tensor | None = None,
        use_cache: bool | None = None,
        output_attentions: bool | None = None,
        output_hidden_states: bool | None = None,
        return_dict: bool | None = None,
        logits_to_keep: int | torch.Tensor = 0,
        **kwargs,
    ) -> tuple[torch.Tensor] | CausalLMOutputWithCrossAttentions:
        r"""
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values.get_seq_length()`
            (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary.

            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.transformer(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_states = transformer_outputs[0]

        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        logits = self.lm_head(hidden_states[:, slice_indices, :])

        loss = None
        if labels is not None:
            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs)

        if not return_dict:
            output = (logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return CausalLMOutputWithCrossAttentions(
            loss=loss,
            logits=logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )


@auto_docstring(
    custom_intro="""
    The MPT Model transformer with a sequence classification head on top (linear layer).

    [`MptForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-1) do.

    Since it does classification on the last token, it requires to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
    """
)
class MptForSequenceClassification(MptPreTrainedModel):
    def __init__(self, config: MptConfig):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.transformer = MptModel(config)
        self.score = nn.Linear(config.hidden_size, config.num_labels, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def set_output_embeddings(self, new_embeddings: torch.Tensor):
        self.score = new_embeddings

    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        past_key_values: Cache | None = None,
        attention_mask: torch.Tensor | None = None,
        inputs_embeds: torch.Tensor | None = None,
        labels: torch.Tensor | None = None,
        use_cache: bool | None = None,
        output_attentions: bool | None = None,
        output_hidden_states: bool | None = None,
        return_dict: bool | None = None,
    ) -> tuple[torch.Tensor] | SequenceClassifierOutputWithPast:
        r"""
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values.get_seq_length()`
            (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary.

            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.transformer(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        hidden_states = transformer_outputs[0]
        logits = self.score(hidden_states)

        if input_ids is not None:
            batch_size = input_ids.shape[0]
        else:
            batch_size = inputs_embeds.shape[0]

        if self.config.pad_token_id is None and batch_size != 1:
            raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
        if self.config.pad_token_id is None:
            last_non_pad_token = -1
        elif input_ids is not None:
            # To handle both left- and right-padding, take the rightmost token
            # that is not equal to `pad_token_id`.
            non_pad_mask = (input_ids != self.config.pad_token_id).to(logits.device, torch.int32)
            token_indices = torch.arange(input_ids.shape[-1], device=logits.device, dtype=torch.int32)
            last_non_pad_token = (token_indices * non_pad_mask).argmax(-1)
        else:
            last_non_pad_token = -1
            logger.warning_once(
                f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
                "unexpected if using padding tokens in conjunction with `inputs_embeds.`"
            )

        pooled_logits = logits[torch.arange(batch_size, device=logits.device), last_non_pad_token]

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(pooled_logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(pooled_logits, labels)
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(pooled_logits, labels)

        if not return_dict:
            output = (pooled_logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutputWithPast(
            loss=loss,
            logits=pooled_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )


@auto_docstring
class MptForTokenClassification(MptPreTrainedModel):
    def __init__(self, config: MptConfig):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.transformer = MptModel(config)
        if hasattr(config, "classifier_dropout") and config.classifier_dropout is not None:
            classifier_dropout = config.classifier_dropout
        elif hasattr(config, "hidden_dropout") and config.hidden_dropout is not None:
            classifier_dropout = config.hidden_dropout
        else:
            classifier_dropout = 0.1
        self.dropout = nn.Dropout(classifier_dropout)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        past_key_values: Cache | None = None,
        attention_mask: torch.Tensor | None = None,
        inputs_embeds: torch.Tensor | None = None,
        labels: torch.Tensor | None = None,
        use_cache: bool | None = None,
        output_attentions: bool | None = None,
        output_hidden_states: bool | None = None,
        return_dict: bool | None = None,
        **deprecated_arguments,
    ) -> tuple[torch.Tensor] | TokenClassifierOutput:
        r"""
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values.get_seq_length()`
            (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary.

            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.transformer(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        hidden_states = transformer_outputs[0]
        hidden_states = self.dropout(hidden_states)
        logits = self.classifier(hidden_states)

        loss = None
        if labels is not None:
            # move labels to the correct device to enable model parallelism
            labels = labels.to(logits.device)
            batch_size, seq_length = labels.shape
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(
                logits.view(batch_size * seq_length, self.num_labels), labels.view(batch_size * seq_length)
            )

        if not return_dict:
            output = (logits,) + transformer_outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )


@auto_docstring
class MptForQuestionAnswering(MptPreTrainedModel):
    def __init__(self, config: MptConfig):
        super().__init__(config)
        self.transformer = MptModel(config)
        self.qa_outputs = nn.Linear(config.hidden_size, 2)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        attention_mask: torch.FloatTensor | None = None,
        inputs_embeds: torch.FloatTensor | None = None,
        start_positions: torch.LongTensor | None = None,
        end_positions: torch.LongTensor | None = None,
        output_attentions: bool | None = None,
        output_hidden_states: bool | None = None,
        return_dict: bool | None = None,
    ) -> tuple | QuestionAnsweringModelOutput:
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.transformer(
            input_ids,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1).contiguous()
        end_logits = end_logits.squeeze(-1).contiguous()

        total_loss = None
        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, split adds a dimension
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
            # sometimes the start/end positions are outside our model inputs, we ignore these terms
            ignored_index = start_logits.size(1)
            start_positions = start_positions.clamp(0, ignored_index)
            end_positions = end_positions.clamp(0, ignored_index)

            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2

        if not return_dict:
            output = (start_logits, end_logits) + outputs[2:]
            return ((total_loss,) + output) if total_loss is not None else output

        return QuestionAnsweringModelOutput(
            loss=total_loss,
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


__all__ = [
    "MptForCausalLM",
    "MptModel",
    "MptPreTrainedModel",
    "MptForSequenceClassification",
    "MptForTokenClassification",
    "MptForQuestionAnswering",
]