
    Z j                        S r SSKJr  SSKJr  SSKrSSKJr  SSKJrJ	r	J
r
  SSKJr  SS	KJrJr  SS
KJrJrJr  SSKJr  SSKJrJr  SSKJr  SSKJrJrJrJ r J!r!J"r"J#r#J$r$  SSK%J&r&J'r'  SSK(J)r)  SSK*J+r+  SSK,J-r-J.r.J/r/J0r0  SSK1J2r2J3r3  SSK4J5r5  SSK6J7r7  \0Rp                  " \95      r: " S S\Rv                  5      r<  SVS\Rv                  S\Rz                  S\Rz                  S\Rz                  S\Rz                  S-  S\>S-  S\>S \)\.   4S! jjr? " S" S#\Rv                  5      r@ " S$ S%\Rv                  5      rA " S& S'\Rv                  5      rB " S( S)\Rv                  5      rC " S* S+\Rv                  5      rD " S, S-\Rv                  5      rE " S. S/\5      rF " S0 S1\Rv                  5      rG " S2 S3\Rv                  5      rH " S4 S5\Rv                  5      rI\/ " S6 S7\'5      5       rJ\/" S8S99\ " S: S;\-5      5       5       rK\/ " S< S=\J5      5       rL " S> S?\Rv                  5      rM " S@ SA\Rv                  5      rN\/" SBS99 " SC SD\J5      5       rO\/" SES99 " SF SG\J5      5       rP\/" SHS99 " SI SJ\J5      5       rQ\/" SKS99 " SL SM\J5      5       rR\/ " SN SO\J5      5       rS\/ " SP SQ\J5      5       rT\/" SRS99 " SS ST\J\5      5       rU/ SUQrVg)WzPyTorch ELECTRA model.    )Callable)	dataclassN)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )initialization)ACT2FNget_activation)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)create_bidirectional_maskcreate_causal_mask)GradientCheckpointingLayer)"BaseModelOutputWithCrossAttentions)BaseModelOutputWithPastAndCrossAttentions!CausalLMOutputWithCrossAttentionsMaskedLMOutputMultipleChoiceModelOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)apply_chunking_to_forward)ModelOutputTransformersKwargsauto_docstringlogging)can_return_tuplemerge_with_config_defaults)capture_outputs   )ElectraConfigc                      ^  \ rS rSrSrU 4S jr     SS\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S	\	S
\R                  4S jjrSrU =r$ )ElectraEmbeddings8   zGConstruct the embeddings from word, position and token_type embeddings.c                 
  > [         TU ]  5         [        R                  " UR                  UR
                  UR                  S9U l        [        R                  " UR                  UR
                  5      U l	        [        R                  " UR                  UR
                  5      U l        [        R                  " UR
                  UR                  S9U l        [        R                  " UR                  5      U l        U R#                  S[$        R&                  " UR                  5      R)                  S5      SS9  U R#                  S[$        R*                  " U R,                  R/                  5       [$        R0                  S9SS9  g )	N)padding_idxepsposition_idsr'   F)
persistenttoken_type_idsdtype)super__init__r   	Embedding
vocab_sizeembedding_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutregister_buffertorcharangeexpandzerosr0   sizelongselfconfig	__class__s     }/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/electra/modeling_electra.pyr8   ElectraEmbeddings.__init__;   s   !||F,=,=v?T?Tbhbubuv#%<<0N0NPVPePe#f %'\\&2H2H&J_J_%`"f&;&;AVAVWzz&"<"<= 	ELL)G)GHOOPWXej 	 	
 	ekk$*;*;*@*@*B%**Ubg 	 	
    N	input_idsr4   r0   inputs_embedspast_key_values_lengthreturnc                    Ub  UR                  5       nOUR                  5       S S nUu  pxUc  U R                  S S 2XXU-   24   nUc  [        U S5      (       aQ  U R                  R	                  UR
                  S   S5      n	[        R                  " U	SUS9n	U	R	                  Xx5      nO8[        R                  " U[        R                  U R                  R                  S9nUc  U R                  U5      nU R                  U5      n
XJ-   nU R                  U5      nX-   nU R                  U5      nU R                  U5      nU$ )Nr2   r4   r   r'   )dimindex)r6   device)rL   r0   hasattrr4   rJ   shaperH   gatherrK   rM   r\   r=   rA   r?   rB   rF   )rO   rU   r4   r0   rV   rW   input_shape
batch_size
seq_lengthbuffered_token_type_idsrA   
embeddingsr?   s                rR   forwardElectraEmbeddings.forwardM   sG     #..*K',,.s3K!,
,,Q0FVlIl0l-lmL
 !t-..*.*=*=*D*D\EWEWXYEZ\^*_'*/,,7NTU]i*j'!8!?!?
!W!&[

SWSdSdSkSk!l  00;M $ : :> J":
"66|D5
^^J/
\\*-
rT   )rB   rF   r?   rA   r=   )NNNNr   )__name__
__module____qualname____firstlineno____doc__r8   rH   
LongTensorFloatTensorintTensorre   __static_attributes____classcell__rQ   s   @rR   r*   r*   8   s    Q
( .2260426&'(##d*( ((4/( &&-	(
 ((4/( !$( 
( (rT   r*   modulequerykeyvalueattention_maskscalingrF   kwargsc                    Uc  UR                  S5      S-  n[        R                  " XR                  SS5      5      U-  nUb  X-   n[        R
                  R                  USS9n[        R
                  R                  XU R                  S9n[        R                  " X5      n	U	R                  SS5      R                  5       n	X4$ )Nr2            r	   rZ   )ptrainingr'   )
rL   rH   matmul	transposer   
functionalsoftmaxrF   r   
contiguous)
rs   rt   ru   rv   rw   rx   rF   ry   attn_weightsattn_outputs
             rR   eager_attention_forwardr   y   s     **R.D( <<}}Q':;gEL!#4==((2(>L==((6??([L,,|3K''1-88:K$$rT   c                      ^  \ rS rSrSU 4S jjr  SS\R                  S\R                  S-  S\S-  S\	\
   S\\R                     4
S	 jjrS
rU =r$ )ElectraSelfAttention   Nc                 N  > [         TU ]  5         UR                  UR                  -  S:w  a7  [	        US5      (       d&  [        SUR                   SUR                   S35      eXl        UR                  U l        [        UR                  UR                  -  5      U l        U R                  U R                  -  U l	        U R                  S-  U l
        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  U R                  5      U l        [        R                   " UR"                  5      U l        UR&                  U l        X l        X0l        g Nr   r;   zThe hidden size (z6) is not a multiple of the number of attention heads ()r{   )r7   r8   hidden_sizenum_attention_headsr]   
ValueErrorrP   rn   attention_head_sizeall_head_sizerx   r   Linearrt   ru   rv   rD   attention_probs_dropout_probrF   
is_decoder	is_causal	layer_idxrO   rP   r   r   rQ   s       rR   r8   ElectraSelfAttention.__init__   sM    : ::a?PVXhHiHi#F$6$6#7 8 445Q8  #)#=#= #&v'9'9F<V<V'V#W !558P8PP//5YYv1143E3EF
99V//1C1CDYYv1143E3EF
zz&"E"EF ++""rT   hidden_statesrw   past_key_valuesry   rX   c                    UR                   S S n/ UQSPU R                  P7nU R                  U5      R                  " U6 R	                  SS5      nU R                  U5      R                  " U6 R	                  SS5      nU R                  U5      R                  " U6 R	                  SS5      n	UbA  Un
[        U[        5      (       a  UR                  n
U
R                  XU R                  5      u  p[        R                  " U R                  R                  [         5      nU" U UUU	U4U R"                  (       d  SOU R$                  R&                  U R(                  S.UD6u  pUR*                  " / UQSP76 R-                  5       nX4$ )Nr2   r'   r|           rF   rx   )r^   r   rt   viewr   ru   rv   
isinstancer   self_attention_cacheupdater   r   get_interfacerP   _attn_implementationr   r   rF   r~   rx   reshaper   )rO   r   rw   r   ry   r`   hidden_shapequery_layer	key_layervalue_layercurrent_past_key_valuesattention_interfacer   r   s                 rR   re   ElectraSelfAttention.forward   s    $))#2.CCbC$*B*BC jj/44lCMMaQRSHH]+00,?II!QO	jj/44lCMMaQRS&&5#/+>??*9*N*N' &=%C%CI\`\j\j%k"I(?(M(MKK,,.E)
 %8	%
  $}}C$,,..LL	%
 	%
! "));;;;FFH((rT   )r   r   rP   rF   r   r   ru   r   r   rt   rx   rv   FN)NNrg   rh   ri   rj   r8   rH   ro   rm   r   r   r!   tuplere   rp   rq   rr   s   @rR   r   r      sl    #6 48(,	')||') ))D0') 	')
 +,') 
u||	') ')rT   r   c                      ^  \ rS rSrSU 4S jjr   SS\R                  S\R                  S-  S\R                  S-  S\S-  S\	\
   S	\\R                     4S
 jjrSrU =r$ )ElectraCrossAttention   Nc                 ,  > [         TU ]  5         UR                  UR                  -  S:w  a7  [	        US5      (       d&  [        SUR                   SUR                   S35      eXl        UR                  U l        [        UR                  UR                  -  5      U l        U R                  U R                  -  U l	        U R                  S-  U l
        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  U R                  5      U l        [        R                   " UR"                  5      U l        X l        X0l        g r   )r7   r8   r   r   r]   r   rP   rn   r   r   rx   r   r   rt   ru   rv   rD   r   rF   r   r   r   s       rR   r8   ElectraCrossAttention.__init__   s@    : ::a?PVXhHiHi#F$6$6#7 8 445Q8  #)#=#= #&v'9'9F<V<V'V#W !558P8PP//5YYv1143E3EF
99V//1C1CDYYv1143E3EF
zz&"E"EF""rT   r   encoder_hidden_statesrw   r   ry   rX   c                 z   UR                   S S n/ UQSPU R                  P7nU R                  U5      R                  U5      R	                  SS5      nUb%  UR
                  R                  U R                  5      OSn	Ubb  U	(       a[  UR                  R                  U R                     R                  n
UR                  R                  U R                     R                  nO/ UR                   S S QSPU R                  P7nU R                  U5      R                  U5      R	                  SS5      n
U R                  U5      R                  U5      R	                  SS5      nUbA  UR                  R                  XU R                  5      u  pSUR
                  U R                  '   [        R                   " U R"                  R$                  [&        5      nU" U UU
UU4U R(                  (       d  SOU R*                  R,                  U R.                  S.UD6u  pUR0                  " / UQSP76 R3                  5       nX4$ )Nr2   r'   r|   FTr   r   )r^   r   rt   r   r   
is_updatedgetr   cross_attention_cachelayerskeysvaluesru   rv   r   r   r   rP   r   r   r   rF   r~   rx   r   r   )rO   r   r   rw   r   ry   r`   r   r   r   r   r   kv_shaper   r   r   s                   rR   re   ElectraCrossAttention.forward   s    $))#2.CCbC$*B*BC jj/44\BLLQPQRGVGb_//33DNNChm
&:'==DDT^^TYYI)??FFt~~V]]KX.44Sb9X2Xt?W?WXH!67<<XFPPQRTUVI**%:;@@JTTUVXYZK*)8)N)N)U)UDNN*&	 >B**4>>:(?(M(MKK,,.E)
 %8	%
  $}}C$,,..LL	%
 	%
! "));;;;FFH((rT   )r   r   rP   rF   r   ru   r   r   rt   rx   rv   r   )NNN)rg   rh   ri   rj   r8   rH   ro   rm   r   r   r!   r   re   rp   rq   rr   s   @rR   r   r      s    #4 ;?376:1)||1)  %00471) ))D0	1)
 -t31) +,1) 
u||	1) 1)rT   r   c                   z   ^  \ rS rSrU 4S jrS\R                  S\R                  S\R                  4S jrSrU =r	$ )ElectraSelfOutputi'  c                 (  > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " UR                  UR                  S9U l        [        R                  " UR                  5      U l
        g Nr.   )r7   r8   r   r   r   denserB   rC   rD   rE   rF   rN   s     rR   r8   ElectraSelfOutput.__init__(  s`    YYv1163E3EF
f&8&8f>S>STzz&"<"<=rT   r   input_tensorrX   c                 p    U R                  U5      nU R                  U5      nU R                  X-   5      nU$ Nr   rF   rB   rO   r   r   s      rR   re   ElectraSelfOutput.forward.  5    

=1]3}'CDrT   rB   r   rF   
rg   rh   ri   rj   r8   rH   ro   re   rp   rq   rr   s   @rR   r   r   '  6    >U\\  RWR^R^  rT   r   c                      ^  \ rS rSrSU 4S jjr    SS\R                  S\R                  S-  S\R                  S-  S\R                  S-  S\S-  S	\	\
   S
\\R                     4S jjrSrU =r$ )ElectraAttentioni6  Nc                    > [         TU ]  5         X@l        U(       a  [        O[        nU" XUS9U l        [        U5      U l        g )Nr   r   )r7   r8   is_cross_attentionr   r   rO   r   output)rO   rP   r   r   r   attention_classrQ   s         rR   r8   ElectraAttention.__init__7  s9    "43E/K_#F9U	'/rT   r   rw   r   encoder_attention_maskr   ry   rX   c                     U R                   (       d  UOUnU R                  " U4UUUS.UD6u  pxU R                  Xq5      nXx4$ )N)r   rw   r   )r   rO   r   )	rO   r   rw   r   r   r   ry   attention_outputr   s	            rR   re   ElectraAttention.forward>  s\     04/F/FLb)-*
"7)+	*

 *
&  ;;'7G--rT   )r   r   rO   )FNFNNNNr   rr   s   @rR   r   r   6  s    0 48:>;?(,.||. ))D0.  %0047	.
 !& 1 1D 8. . +,. 
u||	. .rT   r   c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )ElectraIntermediateiT  c                   > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        UR                  [        5      (       a  [        UR                     U l        g UR                  U l        g r   )r7   r8   r   r   r   intermediate_sizer   r   
hidden_actstrr   intermediate_act_fnrN   s     rR   r8   ElectraIntermediate.__init__U  s`    YYv1163K3KL
f''--'-f.?.?'@D$'-'8'8D$rT   r   rX   c                 J    U R                  U5      nU R                  U5      nU$ r   r   r   )rO   r   s     rR   re   ElectraIntermediate.forward]  s&    

=100?rT   r   r   rr   s   @rR   r   r   T  s(    9U\\ ell  rT   r   c                   z   ^  \ rS rSrU 4S jrS\R                  S\R                  S\R                  4S jrSrU =r	$ )ElectraOutputid  c                 (  > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        R                  " UR
                  UR                  S9U l        [        R                  " UR                  5      U l        g r   )r7   r8   r   r   r   r   r   rB   rC   rD   rE   rF   rN   s     rR   r8   ElectraOutput.__init__e  s`    YYv779K9KL
f&8&8f>S>STzz&"<"<=rT   r   r   rX   c                 p    U R                  U5      nU R                  U5      nU R                  X-   5      nU$ r   r   r   s      rR   re   ElectraOutput.forwardk  r   rT   r   r   rr   s   @rR   r   r   d  r   rT   r   c                      ^  \ rS rSrSU 4S jjr    SS\R                  S\R                  S-  S\R                  S-  S\R                  S-  S\S-  S	\	\
   S
\R                  4S jjrS rSrU =r$ )ElectraLayeris  Nc                   > [         TU ]  5         UR                  U l        SU l        [	        XR
                  US9U l        UR
                  U l        UR                  U l        U R                  (       a0  U R
                  (       d  [        U  S35      e[	        USUSS9U l	        [        U5      U l        [        U5      U l        g )Nr'   r   z> should be used as a decoder model if cross attention is addedFT)r   r   r   )r7   r8   chunk_size_feed_forwardseq_len_dimr   r   	attentionadd_cross_attentionr   crossattentionr   intermediater   r   )rO   rP   r   rQ   s      rR   r8   ElectraLayer.__init__t  s    '-'E'E$)&<M<MYbc ++#)#=#= ##?? D6)g!hii"2##'	#D 07#F+rT   r   rw   r   r   r   ry   rX   c                 2   U R                   " UU4SU0UD6u  pxUn	U R                  (       a?  Ub<  [        U S5      (       d  [        SU  S35      eU R                  " US UU4SU0UD6u  pU
n	[        U R                  U R                  U R                  U	5      nU$ )Nr   r   z'If `encoder_hidden_states` are passed, z` has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`)	r   r   r]   r   r   r   feed_forward_chunkr   r   )rO   r   rw   r   r   r   ry   self_attention_output_r   cross_attention_outputlayer_outputs               rR   re   ElectraLayer.forward  s     $(>>$
 ,$
 	$
  1??4@4!122 =dV DD D 
 )-(;(;%%&	)
 !0) )%"  60##T%A%A4CSCSUe
 rT   c                 J    U R                  U5      nU R                  X!5      nU$ r   )r   r   )rO   r   intermediate_outputr   s       rR   r   ElectraLayer.feed_forward_chunk  s)    "//0@A{{#6IrT   )r   r   r   r   r   r   r   r   r   r   )rg   rh   ri   rj   r8   rH   ro   rm   r   r   r!   re   r   rp   rq   rr   s   @rR   r   r   s  s    ,, 48:>;?(,%||% ))D0%  %0047	%
 !& 1 1D 8% % +,% 
%N rT   r   c                      ^  \ rS rSrU 4S jr     SS\R                  S\R                  S-  S\R                  S-  S\R                  S-  S\S-  S	\	S-  S
\
\   S\\R                     \-  4S jjrSrU =r$ )ElectraEncoderi  c           
         > [         TU ]  5         Xl        [        R                  " [        UR                  5       Vs/ s H  n[        XS9PM     sn5      U l        g s  snf )N)r   )	r7   r8   rP   r   
ModuleListrangenum_hidden_layersr   layer)rO   rP   irQ   s      rR   r8   ElectraEncoder.__init__  sH    ]]uU[UmUmOn#oOn!L$EOn#op
#os   ANr   rw   r   r   r   	use_cachery   rX   c                     [        U R                  5       H  u  pU	" UUU4UUS.UD6nM     [        UU(       a  US9$ S S9$ )N)r   r   last_hidden_stater   )	enumerater  r   )
rO   r   rw   r   r   r   r
  ry   r  layer_modules
             rR   re   ElectraEncoder.forward  sg      )4OA(% (> / M  5 9+/8O
 	
>B
 	
rT   )rP   r  )NNNNN)rg   rh   ri   rj   r8   rH   ro   rm   r   boolr   r!   r   r   re   rp   rq   rr   s   @rR   r  r    s    q 48:>;?(,!%
||
 ))D0
  %0047	

 !& 1 1D 8
 
 $;
 +,
 
u||	H	H
 
rT   r  c                   2   ^  \ rS rSrSrU 4S jrS rSrU =r$ )ElectraDiscriminatorPredictionsi  zEPrediction module for the discriminator, made up of two dense layers.c                   > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        UR                  5      U l        [        R                  " UR                  S5      U l	        Xl
        g Nr'   )r7   r8   r   r   r   r   r   r   
activationdense_predictionrP   rN   s     rR   r8   (ElectraDiscriminatorPredictions.__init__  s\    YYv1163E3EF
():):; "		&*<*<a @rT   c                     U R                  U5      nU R                  U5      nU R                  U5      R                  S5      nU$ )Nr2   )r   r  r  squeeze)rO   discriminator_hidden_statesr   logitss       rR   re   'ElectraDiscriminatorPredictions.forward  s?    

#>?6&&}5==bArT   )r  rP   r   r  	rg   rh   ri   rj   rk   r8   re   rp   rq   rr   s   @rR   r  r    s    O rT   r  c                   2   ^  \ rS rSrSrU 4S jrS rSrU =r$ )ElectraGeneratorPredictionsi  zAPrediction module for the generator, made up of two dense layers.c                    > [         TU ]  5         [        S5      U l        [        R
                  " UR                  UR                  S9U l        [        R                  " UR                  UR                  5      U l
        g )Ngelur.   )r7   r8   r   r  r   rB   r;   rC   r   r   r   rN   s     rR   r8   $ElectraGeneratorPredictions.__init__  sV    (0f&;&;AVAVWYYv1163H3HI
rT   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ r   )r   r  rB   )rO   generator_hidden_statesr   s      rR   re   #ElectraGeneratorPredictions.forward  s3    

#:;6}5rT   )rB   r  r   r  rr   s   @rR   r   r     s    KJ rT   r   c                   P   ^  \ rS rSr\rSrSrSrSr	Sr
Sr\\\S.rU 4S jrSrU =r$ )ElectraPreTrainedModeli  electraT)r   
attentionscross_attentionsc                 F  > [         TU ]  U5        [        U[        5      (       a|  [        R
                  " UR                  [        R                  " UR                  R                  S   5      R                  S5      5        [        R                  " UR                  5        g g )Nr2   r1   )r7   _init_weightsr   r*   initcopy_r0   rH   rI   r^   rJ   zeros_r4   )rO   rs   rQ   s     rR   r-  $ElectraPreTrainedModel._init_weights	  sp    f%f/00JJv**ELL9L9L9R9RSU9V,W,^,^_f,ghKK--. 1rT    )rg   rh   ri   rj   r(   config_classbase_model_prefixsupports_gradient_checkpointing_supports_flash_attn_supports_sdpa_supports_flex_attn_supports_attention_backendr   r   r   _can_record_outputsr-  rp   rq   rr   s   @rR   r(  r(    sF     L!&*#N"&%*1/ /rT   r(  z3
    Output type of [`ElectraForPreTraining`].
    )custom_introc                       \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Sr
\\R                     S-  \S'   Sr\\R                     S-  \S'   Srg)	ElectraForPreTrainingOutputi  a  
loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
    Total loss of the ELECTRA objective.
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
    Prediction scores of the head (scores for each token before SoftMax).
Nlossr  r   r*  r2  )rg   rh   ri   rj   rk   r>  rH   rm   __annotations__r  r   r   r*  rp   r2  rT   rR   r=  r=    sg     &*D%

d
")'+FE$+59M5**+d2926Je''(4/6rT   r=  c                     ^  \ rS rSrU 4S jrS rS r\\\	         SS\
R                  S-  S\
R                  S-  S\
R                  S-  S	\
R                  S-  S
\
R                  S-  S\
R                  S-  S\
R                  S-  S\\
R                     S-  S\S-  S\\   S\\
R                     \-  4S jj5       5       5       rS rSrU =r$ )ElectraModeli$  c                 2  > [         TU ]  U5        [        U5      U l        UR                  UR
                  :w  a0  [        R                  " UR                  UR
                  5      U l        [        U5      U l
        Xl        SU l        U R                  5         g )NF)r7   r8   r*   rd   r;   r   r   r   embeddings_projectr  encoderrP   gradient_checkpointing	post_initrN   s     rR   r8   ElectraModel.__init__&  sr     +F3  F$6$66&(ii0E0EvGYGY&ZD#%f-&+#rT   c                 .    U R                   R                  $ r   rd   r=   rO   s    rR   get_input_embeddings!ElectraModel.get_input_embeddings3  s    ...rT   c                 $    XR                   l        g r   rI  )rO   rv   s     rR   set_input_embeddings!ElectraModel.set_input_embeddings6  s    */'rT   NrU   rw   r4   r0   rV   r   r   r   r
  ry   rX   c
           
         US L US L-  (       a  [        S5      eU R                  R                  (       a  U	b  U	OU R                  R                  n	OSn	U	(       ab  Uc_  Uc  U R                  R                  (       a.  [        [        U R                  S9[        U R                  S95      O[        U R                  S9nUb  UR                  5       OSnU R                  UUUUUS9n[        U S5      (       a  U R                  U5      nU R                  UUUUUS9u  p'U R                  " U4UUUUU	US.U
D6n[        UR                  UR                  S	9$ )
Nz:You must specify exactly one of input_ids or inputs_embedsF)rP   r   )rU   r0   r4   rV   rW   rC  )rw   r   embedding_outputr   r   )rw   r   r   r   r
  r0   r  )r   rP   r   r
  is_encoder_decoderr   r   get_seq_lengthrd   r]   rC  _create_attention_masksrD  r   r  r   )rO   rU   rw   r4   r0   rV   r   r   r   r
  ry   rW   rQ  encoder_outputss                 rR   re   ElectraModel.forward9  su     -t";<YZZ;;!!%.%:	@U@UII0 )48V8V $L$DlZ^ZeZeFfg!5  FUE`!?!?!Afg??%)'#9 + 
 4-..#667GH151M1M)#9-"7+ 2N 2
. ,,	
)"7#9+%	
 	
 9-??+;;
 	
rT   c                     U R                   R                  (       a  [        U R                   UUUS9nO[        U R                   UUS9nUb  [        U R                   UUUS9nX4$ )N)rP   rV   rw   r   )rP   rV   rw   )rP   rV   rw   r   )rP   r   r   r   )rO   rw   r   rQ  r   r   s         rR   rT  $ElectraModel._create_attention_masks}  sr     ;;!!/{{.- /	N 7{{.-N "-%>{{.5&;	&" 55rT   )rP   rd   rC  rD  rE  )	NNNNNNNNN)rg   rh   ri   rj   r8   rK  rN  r%   r&   r"   rH   ro   listrm   r  r   r!   r   r   re   rT  rp   rq   rr   s   @rR   rA  rA  $  s2   /0   *..2.2,0-1596::>!%>
<<$&>
 t+>
 t+	>

 llT)>
 ||d*>
  %||d2>
 !&t 3>
 e//047>
 $;>
 +,>
 
u||	A	A>
    >
B6 6rT   rA  c                   2   ^  \ rS rSrSrU 4S jrS rSrU =r$ )ElectraClassificationHeadi  z-Head for sentence-level classification tasks.c                   > [         TU ]  5         [        R                  " UR                  UR                  5      U l        UR                  b  UR                  OUR                  n[        S5      U l	        [        R                  " U5      U l        [        R                  " UR                  UR                  5      U l        g )Nr"  )r7   r8   r   r   r   r   classifier_dropoutrE   r   r  rD   rF   
num_labelsout_projrO   rP   r]  rQ   s      rR   r8   "ElectraClassificationHead.__init__  s    YYv1163E3EF
)/)B)B)NF%%TZTnTn 	 )0zz"45		&"4"4f6G6GHrT   c                     US S 2SS S 24   nU R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nU$ )Nr   )rF   r   r  r_  )rO   featuresry   xs       rR   re   !ElectraClassificationHead.forward  sZ    Q1WLLOJJqMOOALLOMM!rT   )r  r   rF   r_  r  rr   s   @rR   r[  r[    s    7I rT   r[  c                      ^  \ rS rSrSrS\4U 4S jjr SS\R                  S\R                  S-  S\R                  4S	 jjr
S
rU =r$ )ElectraSequenceSummaryi  a  
Compute a single vector summary of a sequence hidden states.

Args:
    config ([`ElectraConfig`]):
        The config used by the model. Relevant arguments in the config class of the model are (refer to the actual
        config class of your model for the default values it uses):

        - **summary_type** (`str`) -- The method to use to make this summary. Accepted values are:

            - `"last"` -- Take the last token hidden state (like XLNet)
            - `"first"` -- Take the first token hidden state (like Bert)
            - `"mean"` -- Take the mean of all tokens hidden states
            - `"cls_index"` -- Supply a Tensor of classification token position (GPT/GPT-2)
            - `"attn"` -- Not implemented now, use multi-head attention

        - **summary_use_proj** (`bool`) -- Add a projection after the vector extraction.
        - **summary_proj_to_labels** (`bool`) -- If `True`, the projection outputs to `config.num_labels` classes
          (otherwise to `config.hidden_size`).
        - **summary_activation** (`Optional[str]`) -- Set to `"tanh"` to add a tanh activation to the output,
          another string or `None` will add no activation.
        - **summary_first_dropout** (`float`) -- Optional dropout probability before the projection and activation.
        - **summary_last_dropout** (`float`)-- Optional dropout probability after the projection and activation.
rP   c                   > [         TU ]  5         [        USS5      U l        U R                  S:X  a  [        e[
        R                  " 5       U l        [        US5      (       a  UR                  (       aq  [        US5      (       a.  UR                  (       a  UR                  S:  a  UR                  nOUR                  n[
        R                  " UR                  U5      U l        [        USS 5      nU(       a  [        U5      O[
        R                  " 5       U l        [
        R                  " 5       U l        [        US5      (       a5  UR"                  S:  a%  [
        R$                  " UR"                  5      U l        [
        R                  " 5       U l        [        US	5      (       a7  UR(                  S:  a&  [
        R$                  " UR(                  5      U l        g g g )
Nsummary_typelastattnsummary_use_projsummary_proj_to_labelsr   summary_activationsummary_first_dropoutsummary_last_dropout)r7   r8   getattrri  NotImplementedErrorr   Identitysummaryr]   rl  rm  r^  r   r   r   r  first_dropoutro  rD   last_dropoutrp  )rO   rP   num_classesactivation_stringrQ   s       rR   r8   ElectraSequenceSummary.__init__  sa   #FNFC& &%{{}6-..63J3Jv788V=Z=Z_e_p_pst_t$//$0099V%7%7EDL#F,@$GIZN3D$E`b`k`k`m[[]62338T8TWX8X!#F,H,H!IDKKM6122v7R7RUV7V "

6+F+F GD 8W2rT   Nr   	cls_indexrX   c                    U R                   S:X  a  USS2S4   nGOU R                   S:X  a  USS2S4   nGOU R                   S:X  a  UR                  SS9nOU R                   S	:X  a  Uc?  [        R                  " US
SS2SS24   UR                  S   S-
  [        R
                  S9nOXUR                  S5      R                  S5      nUR                  SUR                  5       S-
  -  UR                  S5      4-   5      nUR                  SU5      R                  S5      nOU R                   S:X  a  [        eU R                  W5      nU R                  U5      nU R                  U5      nU R!                  U5      nU$ )a#  
Compute a single vector summary of a sequence hidden states.

Args:
    hidden_states (`torch.FloatTensor` of shape `[batch_size, seq_len, hidden_size]`):
        The hidden states of the last layer.
    cls_index (`torch.LongTensor` of shape `[batch_size]` or `[batch_size, ...]` where ... are optional leading dimensions of `hidden_states`, *optional*):
        Used if `summary_type == "cls_index"` and takes the last token of the sequence as classification token.

Returns:
    `torch.FloatTensor`: The summary of the sequence hidden states.
rj  Nr2   firstr   meanr'   r}   rz  .r5   )r2   rk  )ri  r}  rH   	full_liker^   rM   	unsqueezerJ   rZ   rL   r_   r  rr  ru  rt  r  rv  )rO   r   rz  r   s       rR   re   ElectraSequenceSummary.forward  sn    &"1b5)F')"1a4(F&("''A'.F+- !OO!#rr1*-!''+a/**	 &//3==bA	%,,Uimmo6I-JmN`N`acNdMf-fg	"))"i8@@DF&(%%##F+f%(""6*rT   )r  ru  rv  rt  ri  r   )rg   rh   ri   rj   rk   r(   r8   rH   rm   rl   re   rp   rq   rr   s   @rR   rg  rg    sV    2H} H< VZ)"..);@;K;Kd;R)			) )rT   rg  z
    ELECTRA Model transformer with a sequence classification/regression head on top (a linear layer on top of the
    pooled output) e.g. for GLUE tasks.
    c                   8  ^  \ rS rSrU 4S jr\\      SS\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S	\R                  S-  S
\	\
   S\\R                     \-  4S jj5       5       rSrU =r$ ) ElectraForSequenceClassificationi  c                    > [         TU ]  U5        UR                  U l        Xl        [	        U5      U l        [        U5      U l        U R                  5         g r   )	r7   r8   r^  rP   rA  r)  r[  
classifierrF  rN   s     rR   r8   )ElectraForSequenceClassification.__init__   sF      ++#F+3F; 	rT   NrU   rw   r4   r0   rV   labelsry   rX   c           	         U R                   " U4UUUUSS.UD6nUS   n	U R                  U	5      n
SnUGb  U R                  R                  c  U R                  S:X  a  SU R                  l        OoU R                  S:  aN  UR
                  [        R                  :X  d  UR
                  [        R                  :X  a  SU R                  l        OSU R                  l        U R                  R                  S:X  aI  [        5       nU R                  S:X  a&  U" U
R                  5       UR                  5       5      nOU" X5      nOU R                  R                  S:X  a=  [        5       nU" U
R                  S	U R                  5      UR                  S	5      5      nO,U R                  R                  S:X  a  [        5       nU" X5      n[        UU
UR                  UR                   S
9$ )ae  
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
Trw   r4   r0   rV   return_dictr   Nr'   
regressionsingle_label_classificationmulti_label_classificationr2   r>  r  r   r*  )r)  r  rP   problem_typer^  r6   rH   rM   rn   r   r  r   r   r   r   r   r*  )rO   rU   rw   r4   r0   rV   r  ry   r  sequence_outputr  r>  loss_fcts                rR   re   (ElectraForSequenceClassification.forward*  s   $ '+ll'
))%''
 '
# 6a81{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#FNN$4fnn6FGD#F3D))-JJ+-B @&++b/R))-II,./'5CC2==	
 	
rT   )r  rP   r)  r^  NNNNNN)rg   rh   ri   rj   r8   r$   r"   rH   ro   r   r!   r   r   re   rp   rq   rr   s   @rR   r  r    s      *..2.2,0-1&*9
<<$&9
 t+9
 t+	9

 llT)9
 ||d*9
 t#9
 +,9
 
u||	7	79
  9
rT   r  z
    Electra model with a binary classification head on top as used during pretraining for identifying generated tokens.

    It is recommended to load the discriminator checkpoint into that model.
    c                   8  ^  \ rS rSrU 4S jr\\      SS\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S	\R                  S-  S
\	\
   S\\R                     \-  4S jj5       5       rSrU =r$ )ElectraForPreTrainingih  c                    > [         TU ]  U5        [        U5      U l        [	        U5      U l        U R                  5         g r   )r7   r8   rA  r)  r  discriminator_predictionsrF  rN   s     rR   r8   ElectraForPreTraining.__init__p  s3     #F+)H)P&rT   NrU   rw   r4   r0   rV   r  ry   rX   c           	          U R                   " U4UUUUSS.UD6nUS   n	U R                  U	5      n
SnUb  [        R                  " 5       nUb_  UR	                  SU	R
                  S   5      S:H  nU
R	                  SU	R
                  S   5      U   nXm   nU" XR                  5       5      nO4U" U
R	                  SU	R
                  S   5      UR                  5       5      n[        UU
UR                  UR                  S9$ )a  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the ELECTRA loss. Input should be a sequence of tokens (see `input_ids` docstring)
    Indices should be in `[0, 1]`:

    - 0 indicates the token is an original token,
    - 1 indicates the token was replaced.

Examples:

```python
>>> from transformers import ElectraForPreTraining, AutoTokenizer
>>> import torch

>>> discriminator = ElectraForPreTraining.from_pretrained("google/electra-base-discriminator")
>>> tokenizer = AutoTokenizer.from_pretrained("google/electra-base-discriminator")

>>> sentence = "The quick brown fox jumps over the lazy dog"
>>> fake_sentence = "The quick brown fox fake over the lazy dog"

>>> fake_tokens = tokenizer.tokenize(fake_sentence, add_special_tokens=True)
>>> fake_inputs = tokenizer.encode(fake_sentence, return_tensors="pt")
>>> discriminator_outputs = discriminator(fake_inputs)
>>> predictions = torch.round((torch.sign(discriminator_outputs[0]) + 1) / 2)

>>> fake_tokens
['[CLS]', 'the', 'quick', 'brown', 'fox', 'fake', 'over', 'the', 'lazy', 'dog', '[SEP]']

>>> predictions.squeeze().tolist()
[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0]
```Tr  r   Nr2   r'   r  )
r)  r  r   r   r   r^   floatr=  r   r*  )rO   rU   rw   r4   r0   rV   r  ry   r  discriminator_sequence_outputr  r>  r  active_lossactive_logitsactive_labelss                   rR   re   ElectraForPreTraining.forwardx  s)   V '+ll'
))%''
 '
# )DA(F%//0MN++-H),11"6S6Y6YZ[6\]abb &B0M0S0STU0V WXc d & 3/B/B/DEB0M0S0STU0V WY_YeYeYgh*5CC2==	
 	
rT   )r  r)  r  )rg   rh   ri   rj   r8   r$   r"   rH   ro   r   r!   r   r=  re   rp   rq   rr   s   @rR   r  r  h  s      *..2.2,0-1&*F
<<$&F
 t+F
 t+	F

 llT)F
 ||d*F
 t#F
 +,F
 
u||	:	:F
  F
rT   r  z
    Electra model with a language modeling head on top.

    Even though both the discriminator and generator may be loaded into this model, the generator is the only model of
    the two to have been trained for the masked language modeling task.
    c                   L  ^  \ rS rSrSS0rU 4S jrS rS r\\	      SS\
R                  S-  S	\
R                  S-  S
\
R                  S-  S\
R                  S-  S\
R                  S-  S\
R                  S-  S\\   S\\
R                     \-  4S jj5       5       rSrU =r$ )ElectraForMaskedLMi  generator_lm_head.weight)electra.embeddings.word_embeddings.weightc                    > [         TU ]  U5        [        U5      U l        [	        U5      U l        [        R                  " UR                  UR                  5      U l
        U R                  5         g r   )r7   r8   rA  r)  r   generator_predictionsr   r   r;   r:   generator_lm_headrF  rN   s     rR   r8   ElectraForMaskedLM.__init__  sR     #F+%@%H"!#6+@+@&BSBS!TrT   c                     U R                   $ r   r  rJ  s    rR   get_output_embeddings(ElectraForMaskedLM.get_output_embeddings      %%%rT   c                     Xl         g r   r  )rO   r=   s     rR   set_output_embeddings(ElectraForMaskedLM.set_output_embeddings  s    !0rT   NrU   rw   r4   r0   rV   r  ry   rX   c           	      n   U R                   " U4UUUUSS.UD6nUS   n	U R                  U	5      n
U R                  U
5      n
SnUbQ  [        R                  " 5       nU" U
R                  SU R                  R                  5      UR                  S5      5      n[        UU
UR                  UR                  S9$ )az  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
    config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
    loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
Tr  r   Nr2   r  )r)  r  r  r   r   r   rP   r:   r   r   r*  )rO   rU   rw   r4   r0   rV   r  ry   r%  generator_sequence_outputprediction_scoresr>  r  s                rR   re   ElectraForMaskedLM.forward  s    $ #',,#
))%'#
 #
 %<A$>! 667PQ 223DE**,H-222t{{7M7MNPVP[P[\^P_`D$1??.99	
 	
rT   r)  r  r  r  )rg   rh   ri   rj   _tied_weights_keysr8   r  r  r$   r"   rH   ro   r   r!   r   r   re   rp   rq   rr   s   @rR   r  r    s     56ab&1  *..2.2,0-1&*)
<<$&)
 t+)
 t+	)

 llT))
 ||d*)
 t#)
 +,)
 
u||	~	-)
  )
rT   r  z
    Electra model with a token classification head on top.

    Both the discriminator and generator may be loaded into this model.
    c                   8  ^  \ rS rSrU 4S jr\\      SS\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S	\R                  S-  S
\	\
   S\\R                     \-  4S jj5       5       rSrU =r$ )ElectraForTokenClassificationi  c                 f  > [         TU ]  U5        UR                  U l        [        U5      U l        UR
                  b  UR
                  OUR                  n[        R                  " U5      U l	        [        R                  " UR                  UR                  5      U l        U R                  5         g r   )r7   r8   r^  rA  r)  r]  rE   r   rD   rF   r   r   r  rF  r`  s      rR   r8   &ElectraForTokenClassification.__init__  s      ++#F+)/)B)B)NF%%TZTnTn 	 zz"45))F$6$68I8IJrT   NrU   rw   r4   r0   rV   r  ry   rX   c           	      D   U R                   " U4UUUUSS.UD6nUS   n	U R                  U	5      n	U R                  U	5      n
SnUb<  [        5       nU" U
R	                  SU R
                  5      UR	                  S5      5      n[        UU
UR                  UR                  S9$ )z
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
Tr  r   Nr2   r  )	r)  rF   r  r   r   r^  r   r   r*  )rO   rU   rw   r4   r0   rV   r  ry   r  r  r  r>  r  s                rR   re   %ElectraForTokenClassification.forward!  s      '+ll'
))%''
 '
# )DA(F%(,5R(S%!>?')HFKKDOO<fkk"oND$5CC2==	
 	
rT   )r  rF   r)  r^  r  )rg   rh   ri   rj   r8   r$   r"   rH   ro   r   r!   r   r   re   rp   rq   rr   s   @rR   r  r    s      *..2.2,0-1&*&
<<$&&
 t+&
 t+	&

 llT)&
 ||d*&
 t#&
 +,&
 
u||	4	4&
  &
rT   r  c                   `  ^  \ rS rSr\rSrU 4S jr\\	       SS\
R                  S-  S\
R                  S-  S\
R                  S-  S\
R                  S-  S	\
R                  S-  S
\
R                  S-  S\
R                  S-  S\\   S\\
R                     \-  4S jj5       5       rSrU =r$ )ElectraForQuestionAnsweringiL  r)  c                    > [         TU ]  U5        UR                  U l        [        U5      U l        [
        R                  " UR                  UR                  5      U l        U R                  5         g r   )
r7   r8   r^  rA  r)  r   r   r   
qa_outputsrF  rN   s     rR   r8   $ElectraForQuestionAnswering.__init__Q  sS      ++#F+))F$6$68I8IJ 	rT   NrU   rw   r4   r0   rV   start_positionsend_positionsry   rX   c           	         U R                   " U4UUUUSS.UD6n	U	S   n
U R                  U
5      nUR                  SSS9u  pUR                  S5      R	                  5       nUR                  S5      R	                  5       nS nUb  Ub  [        UR                  5       5      S:  a  UR                  S5      n[        UR                  5       5      S:  a  UR                  S5      nUR                  S5      nUR                  SU5      nUR                  SU5      n[        US9nU" X5      nU" X5      nUU-   S-  n[        UUUU	R                  U	R                  S	9$ )
NTr  r   r'   r2   r}   )ignore_indexr|   )r>  start_logits
end_logitsr   r*  )r)  r  splitr  r   lenrL   clampr   r   r   r*  )rO   rU   rw   r4   r0   rV   r  r  ry   r  r  r  r  r  
total_lossignored_indexr  
start_lossend_losss                      rR   re   #ElectraForQuestionAnswering.forward[  sz    '+ll'
))%''
 '
# 6a81#)<<r<#: #++B/::<''+668

&=+D?'')*Q."1"9"9""==%%'(1, - 5 5b 9(--a0M-33A}EO)//=AM']CH!,@J
:H$x/14J+%!5CC2==
 	
rT   )r)  r^  r  )NNNNNNN)rg   rh   ri   rj   r(   r3  r4  r8   r$   r"   rH   ro   r   r!   r   r   re   rp   rq   rr   s   @rR   r  r  L  s     L!  *..2.2,0-1/3-13
<<$&3
 t+3
 t+	3

 llT)3
 ||d*3
 ,3
 ||d*3
 +,3
 
u||	;	;3
  3
rT   r  c                   8  ^  \ rS rSrU 4S jr\\      SS\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S	\R                  S-  S
\	\
   S\\R                     \-  4S jj5       5       rSrU =r$ )ElectraForMultipleChoicei  c                    > [         TU ]  U5        [        U5      U l        [	        U5      U l        [        R                  " UR                  S5      U l	        U R                  5         g r  )r7   r8   rA  r)  rg  sequence_summaryr   r   r   r  rF  rN   s     rR   r8   !ElectraForMultipleChoice.__init__  sM     #F+ 6v >))F$6$6: 	rT   NrU   rw   r4   r0   rV   r  ry   rX   c           	         Ub  UR                   S   OUR                   S   nUb!  UR                  SUR                  S5      5      OSnUb!  UR                  SUR                  S5      5      OSnUb!  UR                  SUR                  S5      5      OSnUb!  UR                  SUR                  S5      5      OSnUb1  UR                  SUR                  S5      UR                  S5      5      OSnU R                  " U4UUUUSS.UD6n	U	S   n
U R	                  U
5      nU R                  U5      nUR                  SU5      nSnUb  [        5       nU" X5      n[        UUU	R                  U	R                  S9$ )	a  
input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
    Indices of input sequence tokens in the vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are input IDs?](../glossary#input-ids)
token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
    Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
    1]`:

    - 0 corresponds to a *sentence A* token,
    - 1 corresponds to a *sentence B* token.

    [What are token type IDs?](../glossary#token-type-ids)
position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
    Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
    config.max_position_embeddings - 1]`.

    [What are position IDs?](../glossary#position-ids)
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
    Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
    is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
    model's internal embedding lookup matrix.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
    num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
    `input_ids` above)
Nr'   r2   r~  Tr  r   r  )
r^   r   rL   r)  r  r  r   r   r   r*  )rO   rU   rw   r4   r0   rV   r  ry   num_choicesr  r  pooled_outputr  reshaped_logitsr>  r  s                   rR   re    ElectraForMultipleChoice.forward  s   T -6,Aiooa(}GZGZ[\G]>G>SINN2y~~b'9:Y]	M[Mg,,R1D1DR1HImqM[Mg,,R1D1DR1HImqGSG_|((\->->r-BCei ( r=#5#5b#9=;M;Mb;QR 	 '+ll'
))%''
 '
# 6a8--o>/ ++b+6')HO4D("5CC2==	
 	
rT   )r  r)  r  r  )rg   rh   ri   rj   r8   r$   r"   rH   ro   r   r!   r   r   re   rp   rq   rr   s   @rR   r  r    s      *..2.2,0-1&*N
<<$&N
 t+N
 t+	N

 llT)N
 ||d*N
 t#N
 +,N
 
u||	8	8N
  N
rT   r  zS
    ELECTRA Model with a `language modeling` head on top for CLM fine-tuning.
    c                     ^  \ rS rSrSS0rU 4S jrS rS r\\	           SS\
R                  S-  S	\
R                  S-  S
\
R                  S-  S\
R                  S-  S\
R                  S-  S\
R                  S-  S\
R                  S-  S\
R                  S-  S\S-  S\S-  S\\
R                  -  S\\   S\\
R                     \-  4S jj5       5       rSrU =r$ )ElectraForCausalLMi  r  r  c                 0  > [         TU ]  U5        UR                  (       d  [        R	                  S5        [        U5      U l        [        U5      U l        [        R                  " UR                  UR                  5      U l        U R                  5         g )NzOIf you want to use `ElectraForCausalLM` as a standalone, add `is_decoder=True.`)r7   r8   r   loggerwarningrA  r)  r   r  r   r   r;   r:   r  rF  rN   s     rR   r8   ElectraForCausalLM.__init__  sh       NNlm#F+%@%H"!#6+@+@&BSBS!TrT   c                     U R                   $ r   r  rJ  s    rR   r  (ElectraForCausalLM.get_output_embeddings  r  rT   c                     Xl         g r   r  )rO   new_embeddingss     rR   r  (ElectraForCausalLM.set_output_embeddings	  s    !/rT   NrU   rw   r4   r0   rV   r   r   r  r   r
  logits_to_keepry   rX   c                    Ub  Sn
U R                   " U4UUUUUUU	U
SS.	UD6nUR                  n[        U[        5      (       a  [	        U* S5      OUnU R                  U R                  USS2USS24   5      5      nSnUb)  U R                  " SUXR                  R                  S.UD6n[        UUUR                  UR                  UR                  UR                  S9$ )a  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
    `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
    ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`

Example:

```python
>>> from transformers import AutoTokenizer, ElectraForCausalLM, ElectraConfig
>>> import torch

>>> tokenizer = AutoTokenizer.from_pretrained("google/electra-base-generator")
>>> config = ElectraConfig.from_pretrained("google/electra-base-generator")
>>> config.is_decoder = True
>>> model = ElectraForCausalLM.from_pretrained("google/electra-base-generator", config=config)

>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
>>> outputs = model(**inputs)

>>> prediction_logits = outputs.logits
```NFT)	rw   r4   r0   rV   r   r   r   r
  r  )r  r  r:   )r>  r  r   r   r*  r+  r2  )r)  r  r   rn   slicer  r  loss_functionrP   r:   r   r   r   r*  r+  )rO   rU   rw   r4   r0   rV   r   r   r  r   r
  r  ry   outputsr   slice_indicesr  r>  s                     rR   re   ElectraForCausalLM.forward  s   N I=A\\>
))%'"7#9+>
 >
  118B>SV8W8W~ot4]k''(B(B=QRTacdQdCe(fg%%pVF{{OeOepiopD0#33!//))$55
 	
rT   r  )NNNNNNNNNNr   )rg   rh   ri   rj   r  r8   r  r  r$   r"   rH   ro   r   r  rn   r   r!   r   r   re   rp   rq   rr   s   @rR   r  r    s\    56ab
&0  *..2.2,0-1596:&*(,!%-.F
<<$&F
 t+F
 t+	F

 llT)F
 ||d*F
  %||d2F
 !&t 3F
 t#F
 F
 $;F
 ell*F
 +,F
 
u||	@	@F
  F
rT   r  )	r  r  r  r  r  r  r  rA  r(  )Nr   )Wrk   collections.abcr   dataclassesr   rH   r   torch.nnr   r   r    r
   r.  activationsr   r   cache_utilsr   r   r   
generationr   masking_utilsr   r   modeling_layersr   modeling_outputsr   r   r   r   r   r   r   r   modeling_utilsr   r   processing_utilsr   pytorch_utilsr   utilsr    r!   r"   r#   utils.genericr$   r%   utils.output_capturingr&   configuration_electrar(   
get_loggerrg   r  Moduler*   ro   r  r   r   r   r   r   r   r   r   r  r  r   r(  r=  rA  r[  rg  r  r  r  r  r  r  r  __all__r2  rT   rR   <module>r     sH    $ !   A A & 1 C C ) J 9	 	 	 G & 6  J 5 0 
		H	%=		 =N !%II%<<% 
% <<	%
 LL4'% T\% % '(%:@)299 @)HI)BII I)Z		 .ryy .<"))  BII >- >D
RYY 
@bii &")) $ /_ / /* 
 7+ 7 7 v6) v6 v6r		 0`RYY `F F
'= F
F
R Q
2 Q
Q
h >
/ >
>
B 6
$: 6
6
r C
"8 C
 C
L [
5 [
 [
| 
]
/ ]

]
@
rT   