
    Z jļ                        S r SSKrSSKrSSKJr  SSKJrJrJr  SSKJ	r
  SSKJr  SSKJrJrJr  SS	KJr  SS
KJr  SSKJrJrJrJrJrJrJrJr  SSKJr  SSK J!r!  SSK"J#r#J$r$  SSK%J&r&  \$RN                  " \(5      r) " S S\RT                  5      r+ " S S\RT                  5      r, " S S\RT                  5      r- " S S\RT                  5      r. " S S\RT                  5      r/ " S S\RT                  5      r0 " S S\RT                  5      r1 " S S \5      r2 " S! S"\RT                  5      r3 " S# S$\RT                  5      r4 " S% S&\RT                  5      r5 " S' S(\RT                  5      r6\# " S) S*\5      5       r7\#" S+S,9 " S- S.\75      5       r8\# " S/ S0\75      5       r9\#" S1S,9 " S2 S3\7\5      5       r:\#" S4S,9 " S5 S6\75      5       r;\# " S7 S8\75      5       r<\# " S9 S:\75      5       r=\# " S; S<\75      5       r>/ S=Qr?g)>zPyTorch RemBERT model.    N)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )initialization)ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)GradientCheckpointingLayer))BaseModelOutputWithPastAndCrossAttentions,BaseModelOutputWithPoolingAndCrossAttentions!CausalLMOutputWithCrossAttentionsMaskedLMOutputMultipleChoiceModelOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel)apply_chunking_to_forward)auto_docstringlogging   )RemBertConfigc                      ^  \ rS rSrSrU 4S jr     SS\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S	\	S
\R                  4S jjrSrU =r$ )RemBertEmbeddings.   zGConstruct the embeddings from word, position and token_type embeddings.c                 v  > [         TU ]  5         [        R                  " UR                  UR
                  UR                  S9U l        [        R                  " UR                  UR
                  5      U l	        [        R                  " UR                  UR
                  5      U l        [        R                  " UR
                  UR                  S9U l        [        R                  " UR                  5      U l        U R#                  S[$        R&                  " UR                  5      R)                  S5      SS9  g )N)padding_idxepsposition_idsr   F)
persistent)super__init__r   	Embedding
vocab_sizeinput_embedding_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutregister_buffertorcharangeexpandselfconfig	__class__s     }/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/rembert/modeling_rembert.pyr)   RemBertEmbeddings.__init__1   s    !||v::H[H[ 
 $&<<0N0NPVPkPk#l %'\\&2H2H&JeJe%f"f&A&AvG\G\]zz&"<"<= 	ELL)G)GHOOPWXej 	 	
    N	input_idstoken_type_idsr$   inputs_embedspast_key_values_lengthreturnc                    Ub  UR                  5       nOUR                  5       S S nUS   nUc  U R                  S S 2XWU-   24   nUc8  [        R                  " U[        R                  U R                  R
                  S9nUc  U R                  U5      nU R                  U5      nXH-   n	U R                  U5      n
X-  n	U R                  U	5      n	U R                  U	5      n	U	$ )Nr&   r   dtypedevice)sizer$   r9   zeroslongrK   r.   r2   r0   r3   r7   )r=   rC   rD   r$   rE   rF   input_shape
seq_lengthr2   
embeddingsr0   s              r@   forwardRemBertEmbeddings.forwardA   s      #..*K',,.s3K ^
,,Q0FVlIl0l-lmL!"[[EJJtO`O`OgOghN  00;M $ : :> J":
"66|D)
^^J/
\\*-
rB   )r3   r7   r0   r2   r.   )NNNNr   )__name__
__module____qualname____firstlineno____doc__r)   r9   
LongTensorFloatTensorintTensorrR   __static_attributes____classcell__r?   s   @r@   r   r   .   s    Q
$ .2260426&'##d* ((4/ &&-	
 ((4/ !$ 
 rB   r   c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )RemBertPoolerc   c                    > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " 5       U l        g N)r(   r)   r   Linearhidden_sizedenseTanh
activationr<   s     r@   r)   RemBertPooler.__init__d   s9    YYv1163E3EF
'')rB   hidden_statesrG   c                 \    US S 2S4   nU R                  U5      nU R                  U5      nU$ )Nr   )rg   ri   )r=   rk   first_token_tensorpooled_outputs       r@   rR   RemBertPooler.forwardi   s6     +1a40

#566rB   )ri   rg   
rT   rU   rV   rW   r)   r9   r\   rR   r]   r^   r_   s   @r@   ra   ra   c   s(    $
U\\ ell  rB   ra   c                      ^  \ rS rSrSU 4S jjr    SS\R                  S\R                  S-  S\R                  S-  S\S-  S\	S	\
4S
 jjrSrU =r$ )RemBertSelfAttentionr   Nc                   > [         TU ]  5         UR                  UR                  -  S:w  a7  [	        US5      (       d&  [        SUR                   SUR                   S35      eUR                  U l        [        UR                  UR                  -  5      U l        U R                  U R                  -  U l        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  5      U l        UR"                  U l        X l        g )Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ())r(   r)   rf   num_attention_headshasattr
ValueErrorr[   attention_head_sizeall_head_sizer   re   querykeyvaluer5   attention_probs_dropout_probr7   
is_decoder	layer_idxr=   r>   r   r?   s      r@   r)   RemBertSelfAttention.__init__s   s1    : ::a?PVXhHiHi#F$6$6#7 8 445Q8 
 $*#=#= #&v'9'9F<V<V'V#W !558P8PPYYv1143E3EF
99V//1C1CDYYv1143E3EF
zz&"E"EF ++"rB   rk   attention_maskencoder_hidden_statespast_key_valuesoutput_attentionsrG   c                    UR                   S S n/ UQSPU R                  P7nU R                  U5      R                  U5      R	                  SS5      n	Sn
US LnUb]  [        U[        5      (       aF  UR                  R                  U R                  5      n
U(       a  UR                  nOUR                  nOUnU(       a  UOUnU(       aQ  UbN  U
(       aG  WR                  U R                     R                  nUR                  U R                     R                  nO/ UR                   S S QSPU R                  P7nU R                  U5      R                  U5      R	                  SS5      nU R!                  U5      R                  U5      R	                  SS5      nUbS  WR#                  XU R                  5      u  pU(       a.  [        U[        5      (       a  SUR                  U R                  '   [$        R&                  " XR	                  SS5      5      nU[(        R*                  " U R                  5      -  nUb  UU-   n[,        R.                  R1                  USS9nU R3                  U5      n[$        R&                  " UU5      nUR5                  SSSS	5      R7                  5       nUR9                  5       S S U R:                  4-   nUR                  " U6 nUU4$ )
Nr&   r      FTdimr   r   )shaperz   r|   view	transpose
isinstancer   
is_updatedgetr   cross_attention_cacheself_attention_cachelayerskeysvaluesr}   r~   updater9   matmulmathsqrtr   
functionalsoftmaxr7   permute
contiguousrL   r{   )r=   rk   r   r   r   r   kwargsrO   hidden_shapequery_layerr   is_cross_attentioncurr_past_key_valuescurrent_states	key_layervalue_layerkv_shapeattention_scoresattention_probscontext_layernew_context_layer_shapes                        r@   rR   RemBertSelfAttention.forward   s    $))#2.CCbC$*B*BCjj/44\BLLQPQR
2$>&/+>??,77;;DNNK
%+:+P+P(+:+O+O('6$2D.-/"=*,33DNNCHHI.55dnnELLKQ--cr2QBQ8P8PQH055h?II!QOI**^499(CMMaQRSK*)=)D)DY]a]k]k)l&	%*_FY*Z*ZAEO..t~~> !<<5H5HR5PQ+dii8P8P.QQ%/.@ --//0@b/I ,,7_kB%--aAq9DDF"/"4"4"6s";t?Q?Q>S"S%**,CDo--rB   )	r{   rz   r7   r   r}   r   rw   r|   r~   rd   NNNFrT   rU   rV   rW   r)   r9   r\   rZ   r
   booltuplerR   r]   r^   r_   s   @r@   rr   rr   r   s    #0 48:>(,"'@.||@. ))D0@.  %0047	@.
 @.  @. 
@. @.rB   rr   c                   z   ^  \ rS rSrU 4S jrS\R                  S\R                  S\R                  4S jrSrU =r	$ )RemBertSelfOutput   c                 (  > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " UR                  UR                  S9U l        [        R                  " UR                  5      U l
        g Nr"   )r(   r)   r   re   rf   rg   r3   r4   r5   r6   r7   r<   s     r@   r)   RemBertSelfOutput.__init__   s`    YYv1163E3EF
f&8&8f>S>STzz&"<"<=rB   rk   input_tensorrG   c                 p    U R                  U5      nU R                  U5      nU R                  X-   5      nU$ rd   rg   r7   r3   r=   rk   r   s      r@   rR   RemBertSelfOutput.forward   5    

=1]3}'CDrB   r3   rg   r7   rp   r_   s   @r@   r   r      6    >U\\  RWR^R^  rB   r   c                      ^  \ rS rSrSU 4S jjr    SS\R                  S\R                  S-  S\R                  S-  S\S-  S\	S-  S	\
\R                     4S
 jjrSrU =r$ )RemBertAttention   Nc                 ^   > [         TU ]  5         [        XS9U l        [	        U5      U l        g )Nr   )r(   r)   rr   r=   r   outputr   s      r@   r)   RemBertAttention.__init__   s&    (E	'/rB   rk   r   r   r   r   rG   c                 h    U R                  UUUUUS9nU R                  US   U5      nU4USS  -   n	U	$ )Nr   r   r   r   r   r   )r=   r   )
r=   rk   r   r   r   r   r   self_outputsattention_outputoutputss
             r@   rR   RemBertAttention.forward   sV     yy)"7+/ ! 
  ;;|AF#%QR(88rB   )r   r=   rd   r   r   r_   s   @r@   r   r      s    0 48:>(,).|| ))D0  %0047	
   $; 
u||	 rB   r   c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )RemBertIntermediate   c                   > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        UR                  [        5      (       a  [        UR                     U l        g UR                  U l        g rd   )r(   r)   r   re   rf   intermediate_sizerg   r   
hidden_actstrr	   intermediate_act_fnr<   s     r@   r)   RemBertIntermediate.__init__   s`    YYv1163K3KL
f''--'-f.?.?'@D$'-'8'8D$rB   rk   rG   c                 J    U R                  U5      nU R                  U5      nU$ rd   rg   r   r=   rk   s     r@   rR   RemBertIntermediate.forward   s&    

=100?rB   r   rp   r_   s   @r@   r   r      s(    9U\\ ell  rB   r   c                   z   ^  \ rS rSrU 4S jrS\R                  S\R                  S\R                  4S jrSrU =r	$ )RemBertOutputi  c                 (  > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        R                  " UR
                  UR                  S9U l        [        R                  " UR                  5      U l        g r   )r(   r)   r   re   r   rf   rg   r3   r4   r5   r6   r7   r<   s     r@   r)   RemBertOutput.__init__  s`    YYv779K9KL
f&8&8f>S>STzz&"<"<=rB   rk   r   rG   c                 p    U R                  U5      nU R                  U5      nU R                  X-   5      nU$ rd   r   r   s      r@   rR   RemBertOutput.forward  r   rB   r   rp   r_   s   @r@   r   r     r   rB   r   c                      ^  \ rS rSrSU 4S jjr     SS\R                  S\R                  S-  S\R                  S-  S\R                  S-  S\S-  S	\	S-  S
\
\R                     4S jjrS rSrU =r$ )RemBertLayeri  Nc                 r  > [         TU ]  5         UR                  U l        SU l        [	        X5      U l        UR                  U l        UR                  U l        U R                  (       a-  U R                  (       d  [        U  S35      e[	        XS9U l	        [        U5      U l        [        U5      U l        g )Nr   z> should be used as a decoder model if cross attention is addedr   )r(   r)   chunk_size_feed_forwardseq_len_dimr   	attentionr   add_cross_attentionry   crossattentionr   intermediater   r   r   s      r@   r)   RemBertLayer.__init__  s    '-'E'E$)&< ++#)#=#= ##?? D6)g!hii"26"OD/7#F+rB   rk   r   r   encoder_attention_maskr   r   rG   c                 H   U R                  UUUUS9nUS   n	USS  n
U R                  (       aB  Ub?  [        U S5      (       d  [        SU  S35      eU R	                  U	UUUUS9nUS   n	XSS  -   n
[        U R                  U R                  U R                  U	5      nU4U
-   n
U
$ )N)r   r   r   r   r   r   z'If `encoder_hidden_states` are passed, z` has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`r   )	r   r   rx   ry   r   r   feed_forward_chunkr   r   )r=   rk   r   r   r   r   r   r   self_attention_outputsr   r   cross_attention_outputslayer_outputs                r@   rR   RemBertLayer.forward$  s     "&)/+	 "0 "
 2!4(,??4@4!122 =dV DD D 
 '+&9&9 5&; /"3 ': '#  7q9 ;;G0##T%A%A4CSCSUe
  /G+rB   c                 J    U R                  U5      nU R                  X!5      nU$ rd   )r   r   )r=   r   intermediate_outputr   s       r@   r   RemBertLayer.feed_forward_chunkP  s)    "//0@A{{#6IrB   )r   r   r   r   r   r   r   r   rd   )NNNNF)rT   rU   rV   rW   r)   r9   r\   rZ   r
   r   r   rR   r   r]   r^   r_   s   @r@   r   r     s    ,$ 48:>;?(,).)||) ))D0)  %0047	)
 !& 1 1D 8) )  $;) 
u||	)X rB   r   c                      ^  \ rS rSrU 4S jr        SS\R                  S\R                  S-  S\R                  S-  S\R                  S-  S\S-  S	\	S-  S
\	S\	S\	S\
\-  4S jjrSrU =r$ )RemBertEncoderiV  c           
      2  > [         TU ]  5         Xl        [        R                  " UR
                  UR                  5      U l        [        R                  " [        UR                  5       Vs/ s H  n[        XS9PM     sn5      U l        SU l        g s  snf )Nr   F)r(   r)   r>   r   re   r,   rf   embedding_hidden_mapping_in
ModuleListrangenum_hidden_layersr   layergradient_checkpointing)r=   r>   ir?   s      r@   r)   RemBertEncoder.__init__W  ss    +-99V5P5PRXRdRd+e(]]uU[UmUmOn#oOn!L$EOn#op
&+# $ps   -BNrk   r   r   r   r   	use_cacher   output_hidden_statesreturn_dictrG   c
           	         U R                   (       a/  U R                  (       a  U(       a  [        R                  S5        SnU(       a1  Uc.  [	        [        U R                  S9[        U R                  S95      nU R                  U5      nU(       a  SOS nU(       a  SOS nU(       a  U R                  R                  (       a  SOS n[        U R                  5       H[  u  pU(       a  X4-   nU" UUUUUU5      nUS   nU(       d  M,  UUS   4-   nU R                  R                  (       d  MR  UUS   4-   nM]     U(       a  X4-   nU	(       d  [        S UUUUU4 5       5      $ [        UUUUUS	9$ )
NzZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...F)r>    r   r   r   c              3   0   #    U  H  nUc  M  Uv   M     g 7frd   r  ).0vs     r@   	<genexpr>)RemBertEncoder.forward.<locals>.<genexpr>  s"      
A  s   	)last_hidden_stater   rk   
attentionscross_attentions)r   trainingloggerwarning_oncer   r   r>   r   r   	enumerater   r   r   )r=   rk   r   r   r   r   r   r   r   r  r   all_hidden_statesall_self_attentionsall_cross_attentionsr   layer_modulelayer_outputss                    r@   rR   RemBertEncoder.forward_  s{    &&4==##p "	01,dkk2RT`hlhshsTtuO88G"6BD$5b4%64;;;Z;Zr`d(4OA#$58H$H!(%&!M *!,M  &9]1=M<O&O#;;222+?=QRCSBU+U(#  5&   14D D 
 "#%'(
 
 
 9+++*1
 	
rB   )r>   r   r   r   )NNNNNFFT)rT   rU   rV   rW   r)   r9   r\   rZ   r
   r   r   r   rR   r]   r^   r_   s   @r@   r   r   V  s    , 48:>;?(,!%"'%* D
||D
 ))D0D
  %0047	D

 !& 1 1D 8D
 D
 $;D
  D
 #D
 D
 
:	:D
 D
rB   r   c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )RemBertPredictionHeadTransformi  c                 p  > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        UR                  [        5      (       a  [        UR                     U l
        OUR                  U l
        [        R                  " UR                  UR                  S9U l        g r   )r(   r)   r   re   rf   rg   r   r   r   r	   transform_act_fnr3   r4   r<   s     r@   r)   'RemBertPredictionHeadTransform.__init__  s~    YYv1163E3EF
f''--$*6+<+<$=D!$*$5$5D!f&8&8f>S>STrB   rk   rG   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ rd   )rg   r  r3   r   s     r@   rR   &RemBertPredictionHeadTransform.forward  s4    

=1--m<}5rB   )r3   rg   r  rp   r_   s   @r@   r  r    s)    UU\\ ell  rB   r  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )RemBertLMPredictionHeadi  c                 n  > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        R                  " UR
                  UR                  5      U l        [        UR                     U l        [        R                  " UR
                  UR                  S9U l        g r   )r(   r)   r   re   rf   output_embedding_sizerg   r+   decoderr	   r   ri   r3   r4   r<   s     r@   r)    RemBertLMPredictionHead.__init__  sz    YYv1163O3OP
yy!=!=v?P?PQ !2!23f&B&BH]H]^rB   rk   rG   c                     U R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nU$ rd   )rg   ri   r3   r!  r   s     r@   rR   RemBertLMPredictionHead.forward  s@    

=16}5]3rB   )r3   ri   r!  rg   rp   r_   s   @r@   r  r    s)    _U\\ ell  rB   r  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )RemBertOnlyMLMHeadi  c                 B   > [         TU ]  5         [        U5      U l        g rd   )r(   r)   r  predictionsr<   s     r@   r)   RemBertOnlyMLMHead.__init__  s    26:rB   sequence_outputrG   c                 (    U R                  U5      nU$ rd   r(  )r=   r*  prediction_scoress      r@   rR   RemBertOnlyMLMHead.forward  s     ,,_=  rB   r,  rp   r_   s   @r@   r&  r&    s(    ;!u|| ! ! !rB   r&  c                   <   ^  \ rS rSr% \\S'   SrSrU 4S jrSr	U =r
$ )RemBertPreTrainedModeli  r>   rembertTc                   > [         TU ]  U5        [        U[        5      (       a\  [        R
                  " UR                  [        R                  " UR                  R                  S   5      R                  S5      5        g g )Nr&   r%   )r(   _init_weightsr   r   initcopy_r$   r9   r:   r   r;   )r=   moduler?   s     r@   r3  $RemBertPreTrainedModel._init_weights  s^    f%f/00JJv**ELL9L9L9R9RSU9V,W,^,^_f,gh 1rB   r  )rT   rU   rV   rW   r   __annotations__base_model_prefixsupports_gradient_checkpointingr3  r]   r^   r_   s   @r@   r0  r0    s!    !&*#i irB   r0  a
  
    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
    cross-attention is added between the self-attention layers, following the architecture described in [Attention is
    all you need](https://huggingface.co/papers/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
    Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.

    To behave as an decoder the model needs to be initialized with the `is_decoder` argument of the configuration set
    to `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and
    `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.
    )custom_introc                   v  ^  \ rS rSrSU 4S jjrS rS r\            SS\R                  S-  S\R                  S-  S\R                  S-  S	\R                  S-  S
\R                  S-  S\R                  S-  S\R                  S-  S\S-  S\S-  S\S-  S\S-  S\S-  S\\-  4S jj5       rSrU =r$ )RemBertModeli  c                    > [         TU ]  U5        Xl        [        U5      U l        [        U5      U l        U(       a  [        U5      OSU l        U R                  5         g)z^
add_pooling_layer (bool, *optional*, defaults to `True`):
    Whether to add a pooling layer
N)
r(   r)   r>   r   rQ   r   encoderra   pooler	post_init)r=   r>   add_pooling_layerr?   s      r@   r)   RemBertModel.__init__  sK    
 	 +F3%f-/@mF+d 	rB   c                 .    U R                   R                  $ rd   rQ   r.   r=   s    r@   get_input_embeddings!RemBertModel.get_input_embeddings  s    ...rB   c                 $    XR                   l        g rd   rE  )r=   r~   s     r@   set_input_embeddings!RemBertModel.set_input_embeddings  s    */'rB   NrC   r   rD   r$   rE   r   r   r   r   r   r   r  rG   c                    U
b  U
OU R                   R                  n
Ub  UOU R                   R                  nUb  UOU R                   R                  nU R                   R                  (       a  U	b  U	OU R                   R
                  n	OSn	Ub  Ub  [        S5      eUb"  U R                  X5        UR                  5       nO"Ub  UR                  5       S S nO[        S5      eUu  nnUb  UR                  OUR                  nUc  SOUR                  5       nUc  [        R                  " UUU-   4US9nUc$  [        R                  " U[        R                  US9nU R                  X.5      nU R                   R                  (       aE  UbB  UR                  5       u  nnnUU4nUc  [        R                  " UUS9nU R!                  U5      nOS nU R#                  UUUUUS9nU R%                  UUUUUU	U
UUS	9	nUS   nU R&                  b  U R'                  U5      OS nU(       d
  UU4US
S  -   $ [)        UUUR*                  UR,                  UR.                  UR0                  S9$ )NFzDYou cannot specify both input_ids and inputs_embeds at the same timer&   z5You have to specify either input_ids or inputs_embedsr   )rK   rI   )rC   r$   rD   rE   rF   )r   r   r   r   r   r   r   r  r   )r	  pooler_outputr   rk   r
  r  )r>   r   r   r  r   r   ry   %warn_if_padding_and_no_attention_maskrL   rK   get_seq_lengthr9   onesrM   rN   get_extended_attention_maskinvert_attention_maskrQ   r?  r@  r   r   rk   r
  r  )r=   rC   r   rD   r$   rE   r   r   r   r   r   r   r  r   rO   
batch_sizerP   rK   rF   extended_attention_maskencoder_batch_sizeencoder_sequence_length_encoder_hidden_shapeencoder_extended_attention_maskembedding_outputencoder_outputsr*  rn   s                                r@   rR   RemBertModel.forward  s   " 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++BYBY;;!!%.%:	@U@UII ]%>cdd"66yQ#..*K&',,.s3KTUU!,
J%.%:!!@T@T&5&=?CaCaCc!"ZZ*jCY6Y)ZdjkN!"[[EJJvVN 150P0PQ_0m ;;!!&;&G=R=W=W=Y: 7$68O#P %-).4HQW)X&.2.H.HI_.`+.2+??%)'#9 + 
 ,,2"7#B+/!5# ' 

 *!,8<8OO4UY#]3oab6III;-'+;;)77&11,==
 	
rB   )r>   rQ   r?  r@  )T)NNNNNNNNNNNN)rT   rU   rV   rW   r)   rG  rJ  r   r9   rY   rZ   r
   r   r   r   rR   r]   r^   r_   s   @r@   r=  r=    sB    /0  .226260426:>;?(,!%)-,0#']
##d*]
 ((4/]
 ((4/	]

 &&-]
 ((4/]
  %0047]
 !& 1 1D 8]
 ]
 $;]
  $;]
 #Tk]
 D[]
 
=	=]
 ]
rB   r=  c                   z  ^  \ rS rSrU 4S jrS rS r\           SS\R                  S-  S\R                  S-  S\R                  S-  S	\R                  S-  S
\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\S-  S\S-  S\S-  S\\-  4S jj5       rSrU =r$ )RemBertForMaskedLMic  c                    > [         TU ]  U5        UR                  (       a  [        R	                  S5        [        USS9U l        [        U5      U l        U R                  5         g )NznIf you want to use `RemBertForMaskedLM` make sure `config.is_decoder=False` for bi-directional self-attention.FrB  
r(   r)   r   r  warningr=  r1  r&  clsrA  r<   s     r@   r)   RemBertForMaskedLM.__init__e  sR     NN1
 $FeD%f- 	rB   c                 B    U R                   R                  R                  $ rd   rc  r(  r!  rF  s    r@   get_output_embeddings(RemBertForMaskedLM.get_output_embeddingst      xx##+++rB   c                 8    XR                   R                  l        g rd   rf  r=   new_embeddingss     r@   set_output_embeddings(RemBertForMaskedLM.set_output_embeddingsw      '5$rB   NrC   r   rD   r$   rE   r   r   labelsr   r   r  rG   c                    Ub  UOU R                   R                  nU R                  UUUUUUUU	U
US9
nUS   nU R                  U5      nSnUbF  [	        5       nU" UR                  SU R                   R                  5      UR                  S5      5      nU(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  S9$ )a{  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
    config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
    loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
N)	r   rD   r$   rE   r   r   r   r   r  r   r&   r   losslogitsrk   r
  )
r>   r  r1  rc  r   r   r+   r   rk   r
  )r=   rC   r   rD   r$   rE   r   r   rp  r   r   r  r   r   r*  r-  masked_lm_lossloss_fctr   s                      r@   rR   RemBertForMaskedLM.forwardz  s    , &1%<k$++BYBY,,))%'"7#9/!5#  
 "!* HH_5')H%&7&<&<RAWAW&XZ`ZeZefhZijN')GABK7F3A3M^%.YSYY$!//))	
 	
rB   rc  r1  )NNNNNNNNNNN)rT   rU   rV   rW   r)   rg  rm  r   r9   rY   rZ   r   r   r   rR   r]   r^   r_   s   @r@   r^  r^  c  s(   ,6  .226260426:>;?*.)-,0#'5
##d*5
 ((4/5
 ((4/	5

 &&-5
 ((4/5
  %00475
 !& 1 1D 85
   4'5
  $;5
 #Tk5
 D[5
 
	5
 5
rB   r^  zS
    RemBERT Model with a `language modeling` head on top for CLM fine-tuning.
    c            !         ^  \ rS rSrU 4S jrS rS r\              SS\R                  S-  S\R                  S-  S\R                  S-  S	\R                  S-  S
\R                  S-  S\R                  S-  S\R                  S-  S\S-  S\R                  S-  S\S-  S\S-  S\S-  S\S-  S\\R                  -  S\\-  4S jj5       rSrU =r$ )RemBertForCausalLMi  c                    > [         TU ]  U5        UR                  (       d  [        R	                  S5        [        USS9U l        [        U5      U l        U R                  5         g )NzOIf you want to use `RemBertForCausalLM` as a standalone, add `is_decoder=True.`Fr`  ra  r<   s     r@   r)   RemBertForCausalLM.__init__  sL       NNlm#FeD%f- 	rB   c                 B    U R                   R                  R                  $ rd   rf  rF  s    r@   rg  (RemBertForCausalLM.get_output_embeddings  ri  rB   c                 8    XR                   R                  l        g rd   rf  rk  s     r@   rm  (RemBertForCausalLM.set_output_embeddings  ro  rB   NrC   r   rD   r$   rE   r   r   r   rp  r   r   r   r  logits_to_keeprG   c                    Ub  UOU R                   R                  nU R                  UUUUUUUUU
UUUS9nUS   n[        U[        5      (       a  [        U* S5      OUnU R                  USS2USS24   5      nSnU	b)  U R                  " SUXR                   R                  S.UD6nU(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  UR                  UR                  S9$ )a  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
    `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
    ignored (masked), the loss is only computed for the tokens with labels n `[0, ..., config.vocab_size]`.

Example:

```python
>>> from transformers import AutoTokenizer, RemBertForCausalLM, RemBertConfig
>>> import torch

>>> tokenizer = AutoTokenizer.from_pretrained("google/rembert")
>>> config = RemBertConfig.from_pretrained("google/rembert")
>>> config.is_decoder = True
>>> model = RemBertForCausalLM.from_pretrained("google/rembert", config=config)

>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
>>> outputs = model(**inputs)

>>> prediction_logits = outputs.logits
```N)r   rD   r$   rE   r   r   r   r   r   r   r  r   )rt  rp  r+   r   )rs  rt  r   rk   r
  r  r  )r>   r  r1  r   r[   slicerc  loss_functionr+   r   r   rk   r
  r  )r=   rC   r   rD   r$   rE   r   r   r   rp  r   r   r   r  r  r   r   rk   slice_indicesrt  rs  r   s                         r@   rR   RemBertForCausalLM.forward  s(   R &1%<k$++BYBY,,))%'"7#9+/!5#  
  
8B>SV8W8W~ot4]k-=!(;<=%%pVF{{OeOepiopDY,F)-)9TGf$EvE0#33!//))$55
 	
rB   rx  )NNNNNNNNNNNNNr   )rT   rU   rV   rW   r)   rg  rm  r   r9   rY   rZ   r
   r   r[   r\   r   r   rR   r]   r^   r_   s   @r@   rz  rz    sr   
,6  .226260426:>;?(,*.!%)-,0#'-.M
##d*M
 ((4/M
 ((4/	M

 &&-M
 ((4/M
  %0047M
 !& 1 1D 8M
 M
   4'M
 $;M
  $;M
 #TkM
 D[M
 ell*M
" 
2	2#M
 M
rB   rz  z
    RemBERT Model transformer with a sequence classification/regression head on top (a linear layer on top of the
    pooled output) e.g. for GLUE tasks.
    c                   .  ^  \ rS rSrU 4S jr\         SS\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S	\R                  S-  S
\	S-  S\	S-  S\	S-  S\
\-  4S jj5       rSrU =r$ ) RemBertForSequenceClassificationi  c                 0  > [         TU ]  U5        UR                  U l        [        U5      U l        [
        R                  " UR                  5      U l        [
        R                  " UR                  UR                  5      U l        U R                  5         g rd   r(   r)   
num_labelsr=  r1  r   r5   classifier_dropout_probr7   re   rf   
classifierrA  r<   s     r@   r)   )RemBertForSequenceClassification.__init__#  si      ++#F+zz&"@"@A))F$6$68I8IJ 	rB   NrC   r   rD   r$   rE   rp  r   r   r  rG   c
                 P   U	b  U	OU R                   R                  n	U R                  UUUUUUUU	S9nUS   nU R                  U5      nU R	                  U5      nSnUGb  U R                   R
                  c  U R                  S:X  a  SU R                   l        OoU R                  S:  aN  UR                  [        R                  :X  d  UR                  [        R                  :X  a  SU R                   l        OSU R                   l        U R                   R
                  S:X  aI  [        5       nU R                  S:X  a&  U" UR                  5       UR                  5       5      nOU" X5      nOU R                   R
                  S:X  a=  [        5       nU" UR                  SU R                  5      UR                  S5      5      nO,U R                   R
                  S:X  a  [        5       nU" X5      nU	(       d  U4USS -   nUb  U4U-   $ U$ [!        UUUR"                  UR$                  S	9$ )
ae  
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
Nr   rD   r$   rE   r   r   r  r   
regressionsingle_label_classificationmulti_label_classificationr&   r   rr  )r>   r  r1  r7   r  problem_typer  rJ   r9   rN   r[   r   squeezer   r   r   r   rk   r
  )r=   rC   r   rD   r$   rE   rp  r   r   r  r   r   rn   rt  rs  rv  r   s                    r@   rR   (RemBertForSequenceClassification.forward-  s   ( &1%<k$++BYBY,,))%'/!5#  	
  
]3/{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#FNN$4fnn6FGD#F3D))-JJ+-B @&++b/R))-II,./Y,F)-)9TGf$EvE'!//))	
 	
rB   r  r7   r  r1  	NNNNNNNNN)rT   rU   rV   rW   r)   r   r9   rZ   rY   r   r   r   rR   r]   r^   r_   s   @r@   r  r    s      /337261526*.)-,0#'D
$$t+D
 ))D0D
 ((4/	D

 ''$.D
 ((4/D
   4'D
  $;D
 #TkD
 D[D
 
)	)D
 D
rB   r  c                   .  ^  \ rS rSrU 4S jr\         SS\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S	\R                  S-  S
\	S-  S\	S-  S\	S-  S\
\-  4S jj5       rSrU =r$ )RemBertForMultipleChoiceiu  c                    > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  5      U l        [        R                  " UR                  S5      U l
        U R                  5         g )Nr   )r(   r)   r=  r1  r   r5   r  r7   re   rf   r  rA  r<   s     r@   r)   !RemBertForMultipleChoice.__init__w  sV     #F+zz&"@"@A))F$6$6: 	rB   NrC   r   rD   r$   rE   rp  r   r   r  rG   c
                 X   U	b  U	OU R                   R                  n	Ub  UR                  S   OUR                  S   nUb!  UR                  SUR	                  S5      5      OSnUb!  UR                  SUR	                  S5      5      OSnUb!  UR                  SUR	                  S5      5      OSnUb!  UR                  SUR	                  S5      5      OSnUb1  UR                  SUR	                  S5      UR	                  S5      5      OSnU R                  UUUUUUUU	S9nUS   nU R                  U5      nU R                  U5      nUR                  SU5      nSnUb  [        5       nU" X5      nU	(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  S9$ )a  
input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
    Indices of input sequence tokens in the vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are input IDs?](../glossary#input-ids)
token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
    Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
    1]`:

    - 0 corresponds to a *sentence A* token,
    - 1 corresponds to a *sentence B* token.

    [What are token type IDs?](../glossary#token-type-ids)
position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
    Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
    config.max_position_embeddings - 1]`.

    [What are position IDs?](../glossary#position-ids)
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
    Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
    is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
    model's internal embedding lookup matrix.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
    num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
    `input_ids` above)
Nr   r&   r   r  r   rr  )r>   r  r   r   rL   r1  r7   r  r   r   rk   r
  )r=   rC   r   rD   r$   rE   rp  r   r   r  r   num_choicesr   rn   rt  reshaped_logitsrs  rv  r   s                      r@   rR    RemBertForMultipleChoice.forward  s   X &1%<k$++BYBY,5,Aiooa(}GZGZ[\G]>G>SINN2y~~b'9:Y]	M[Mg,,R1D1DR1HImqM[Mg,,R1D1DR1HImqGSG_|((\->->r-BCei ( r=#5#5b#9=;M;Mb;QR 	 ,,))%'/!5#  	
  
]3/ ++b+6')HO4D%''!"+5F)-)9TGf$EvE("!//))	
 	
rB   )r  r7   r1  r  )rT   rU   rV   rW   r)   r   r9   rZ   rY   r   r   r   rR   r]   r^   r_   s   @r@   r  r  u  s      /337261526*.)-,0#'W
$$t+W
 ))D0W
 ((4/	W

 ''$.W
 ((4/W
   4'W
  $;W
 #TkW
 D[W
 
*	*W
 W
rB   r  c                   .  ^  \ rS rSrU 4S jr\         SS\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S	\R                  S-  S
\	S-  S\	S-  S\	S-  S\
\-  4S jj5       rSrU =r$ )RemBertForTokenClassificationi  c                 .  > [         TU ]  U5        UR                  U l        [        USS9U l        [
        R                  " UR                  5      U l        [
        R                  " UR                  UR                  5      U l        U R                  5         g NFr`  r  r<   s     r@   r)   &RemBertForTokenClassification.__init__  sk      ++#FeDzz&"@"@A))F$6$68I8IJ 	rB   NrC   r   rD   r$   rE   rp  r   r   r  rG   c
                    U	b  U	OU R                   R                  n	U R                  UUUUUUUU	S9nUS   nU R                  U5      nU R	                  U5      nSnUb<  [        5       nU" UR                  SU R                  5      UR                  S5      5      nU	(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  S9$ )z
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
Nr  r   r&   r   rr  )r>   r  r1  r7   r  r   r   r  r   rk   r
  )r=   rC   r   rD   r$   rE   rp  r   r   r  r   r   r*  rt  rs  rv  r   s                    r@   rR   %RemBertForTokenClassification.forward  s    $ &1%<k$++BYBY,,))%'/!5#  	
 "!*,,71')HFKKDOO<fkk"oNDY,F)-)9TGf$EvE$!//))	
 	
rB   r  r  )rT   rU   rV   rW   r)   r   r9   rZ   rY   r   r   r   rR   r]   r^   r_   s   @r@   r  r    s    	  /337261526*.)-,0#'1
$$t+1
 ))D01
 ((4/	1

 ''$.1
 ((4/1
   4'1
  $;1
 #Tk1
 D[1
 
&	&1
 1
rB   r  c                   N  ^  \ rS rSrU 4S jr\          SS\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S	\R                  S-  S
\R                  S-  S\	S-  S\	S-  S\	S-  S\
\-  4S jj5       rSrU =r$ )RemBertForQuestionAnsweringi  c                    > [         TU ]  U5        UR                  U l        [        USS9U l        [
        R                  " UR                  UR                  5      U l        U R                  5         g r  )
r(   r)   r  r=  r1  r   re   rf   
qa_outputsrA  r<   s     r@   r)   $RemBertForQuestionAnswering.__init__   sU      ++#FeD))F$6$68I8IJ 	rB   NrC   r   rD   r$   rE   start_positionsend_positionsr   r   r  rG   c                    U
b  U
OU R                   R                  n
U R                  UUUUUUU	U
S9nUS   nU R                  U5      nUR	                  SSS9u  nnUR                  S5      nUR                  S5      nS nUb  Ub  [        UR                  5       5      S:  a  UR                  S5      n[        UR                  5       5      S:  a  UR                  S5      nUR                  S5      nUR                  SU5        UR                  SU5        [        US9nU" X5      nU" UU5      nUU-   S-  nU
(       d  UU4USS  -   nUb  U4U-   $ U$ [        UUUUR                  UR                  S9$ )	Nr  r   r   r&   r   )ignore_indexr   )rs  start_logits
end_logitsrk   r
  )r>   r  r1  r  splitr  lenrL   clamp_r   r   rk   r
  )r=   rC   r   rD   r$   rE   r  r  r   r   r  r   r   r*  rt  r  r  
total_lossignored_indexrv  
start_lossend_lossr   s                          r@   rR   #RemBertForQuestionAnswering.forward+  s    &1%<k$++BYBY,,))%'/!5#  	
 "!*1#)<<r<#: j#++B/''+

&=+D?'')*Q."1"9"9""==%%'(1, - 5 5b 9(--a0M""1m4  M2']CH!,@J
M:H$x/14J"J/'!"+=F/9/EZMF*Q6Q+%!!//))
 	
rB   )r  r  r1  )
NNNNNNNNNN)rT   rU   rV   rW   r)   r   r9   rZ   rY   r   r   r   rR   r]   r^   r_   s   @r@   r  r    s   	  /3372615263715)-,0#'=
$$t+=
 ))D0=
 ((4/	=

 ''$.=
 ((4/=
 ))D0=
 ''$.=
  $;=
 #Tk=
 D[=
 
-	-=
 =
rB   r  )	rz  r^  r  r  r  r  r   r=  r0  )@rX   r   r9   r   torch.nnr   r   r    r   r4  activationsr	   cache_utilsr
   r   r   
generationr   modeling_layersr   modeling_outputsr   r   r   r   r   r   r   r   modeling_utilsr   pytorch_utilsr   utilsr   r   configuration_rembertr   
get_loggerrT   r  Moduler   ra   rr   r   r   r   r   r   r   r  r  r&  r0  r=  r^  rz  r  r  r  r  __all__r  rB   r@   <module>r     sA       A A & ! C C ) 9	 	 	 . 6 , 0 
		H	%1		 1jBII V.299 V.t		 ryy 8"))  BII ?- ?DM
RYY M
bRYY "bii "! ! i_ i i 	u
) u
u
p L
/ L
 L
^ 
a
/ a

a
H P
'= P
P
f c
5 c
 c
L >
$: >
 >
B J
"8 J
 J
Z
rB   