
    Z j                        S r SSKJr  SSKJr  SSKrSSKJr  SSKJrJ	r	J
r
  SSKJr  SS	KJr  SS
KJrJrJr  SSKJr  SSKJrJr  SSKJr  SSKJrJrJrJrJ r J!r!J"r"J#r#J$r$  SSK%J&r&J'r'  SSK(J)r)  SSK*J+r+  SSK,J-r-J.r.J/r/J0r0  SSK1J2r2J3r3  SSK4J5r5  SSK6J7r7  \0Rp                  " \95      r: " S S\Rv                  5      r<  S\S\Rv                  S\Rz                  S\Rz                  S\Rz                  S\Rz                  S-  S\>S-  S\>S \)\.   4S! jjr? " S" S#\Rv                  5      r@ " S$ S%\Rv                  5      rA " S& S'\Rv                  5      rB " S( S)\Rv                  5      rC " S* S+\Rv                  5      rD " S, S-\Rv                  5      rE " S. S/\5      rF " S0 S1\Rv                  5      rG " S2 S3\Rv                  5      rH " S4 S5\Rv                  5      rI " S6 S7\Rv                  5      rJ " S8 S9\Rv                  5      rK " S: S;\Rv                  5      rL " S< S=\Rv                  5      rM\/ " S> S?\'5      5       rN\/" S@SA9\ " SB SC\-5      5       5       rO\/" SDSA9 " SE SF\N5      5       rP\/" SGSA9 " SH SI\N5      5       rQ\/" SJSA9 " SK SL\N\5      5       rR\/ " SM SN\N5      5       rS\/" SOSA9 " SP SQ\N5      5       rT\/" SRSA9 " SS ST\N5      5       rU\/ " SU SV\N5      5       rV\/ " SW SX\N5      5       rW\/ " SY SZ\N5      5       rX/ S[QrYg)]zPyTorch BERT model.    )Callable)	dataclassN)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )initialization)ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)create_bidirectional_maskcreate_causal_mask)GradientCheckpointingLayer)	)BaseModelOutputWithPastAndCrossAttentions,BaseModelOutputWithPoolingAndCrossAttentions!CausalLMOutputWithCrossAttentionsMaskedLMOutputMultipleChoiceModelOutputNextSentencePredictorOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)apply_chunking_to_forward)ModelOutputTransformersKwargsauto_docstringlogging)can_return_tuplemerge_with_config_defaults)capture_outputs   )
BertConfigc                      ^  \ rS rSrSrU 4S jr     SS\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S	\	S
\R                  4S jjrSrU =r$ )BertEmbeddings5   zGConstruct the embeddings from word, position and token_type embeddings.c                 
  > [         TU ]  5         [        R                  " UR                  UR
                  UR                  S9U l        [        R                  " UR                  UR
                  5      U l	        [        R                  " UR                  UR
                  5      U l        [        R                  " UR
                  UR                  S9U l        [        R                  " UR                  5      U l        U R#                  S[$        R&                  " UR                  5      R)                  S5      SS9  U R#                  S[$        R*                  " U R,                  R/                  5       [$        R0                  S9SS9  g )	N)padding_idxepsposition_idsr'   F)
persistenttoken_type_ids)dtype)super__init__r   	Embedding
vocab_sizehidden_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutregister_buffertorcharangeexpandzerosr0   sizelongselfconfig	__class__s     w/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/bert/modeling_bert.pyr7   BertEmbeddings.__init__8   s   !||F,=,=v?Q?Q_e_r_rs#%<<0N0NPVPbPb#c %'\\&2H2H&J\J\%]"f&8&8f>S>STzz&"<"<=ELL)G)GHOOPWXej 	 	
 	ekk$*;*;*@*@*B%**Ubg 	 	
    N	input_idsr4   r0   inputs_embedspast_key_values_lengthreturnc                    Ub  UR                  5       nOUR                  5       S S nUu  pxUc  U R                  S S 2XXU-   24   nUc  [        U S5      (       aQ  U R                  R	                  UR
                  S   S5      n	[        R                  " U	SUS9n	U	R	                  Xx5      nO8[        R                  " U[        R                  U R                  R                  S9nUc  U R                  U5      nU R                  U5      n
XJ-   nU R                  U5      nX-   nU R                  U5      nU R                  U5      nU$ )Nr2   r4   r   r'   )dimindex)r5   device)rK   r0   hasattrr4   rI   shaperG   gatherrJ   rL   r[   r<   r@   r>   rA   rE   )rN   rT   r4   r0   rU   rV   input_shape
batch_size
seq_lengthbuffered_token_type_idsr@   
embeddingsr>   s                rQ   forwardBertEmbeddings.forwardH   sG     #..*K',,.s3K!,
,,Q0FVlIl0l-lmL
 !t-..*.*=*=*D*D\EWEWXYEZ\^*_'*/,,7NTU]i*j'!8!?!?
!W!&[

SWSdSdSkSk!l  00;M $ : :> J":
"66|D5
^^J/
\\*-
rS   )rA   rE   r>   r@   r<   )NNNNr   )__name__
__module____qualname____firstlineno____doc__r7   rG   
LongTensorFloatTensorintTensorrd   __static_attributes____classcell__rP   s   @rQ   r*   r*   5   s    Q
$ .2260426&'(##d*( ((4/( &&-	(
 ((4/( !$( 
( (rS   r*   modulequerykeyvalueattention_maskscalingrE   kwargsc                    Uc  UR                  S5      S-  n[        R                  " XR                  SS5      5      U-  nUb  X-   n[        R
                  R                  USS9n[        R
                  R                  XU R                  S9n[        R                  " X5      n	U	R                  SS5      R                  5       n	X4$ )Nr2            r	   rY   )ptrainingr'   )
rK   rG   matmul	transposer   
functionalsoftmaxrE   r~   
contiguous)
rr   rs   rt   ru   rv   rw   rE   rx   attn_weightsattn_outputs
             rQ   eager_attention_forwardr   s   s     **R.D( <<}}Q':;gEL!#4==((2(>L==((6??([L,,|3K''1-88:K$$rS   c                      ^  \ rS rSrSU 4S jjr  SS\R                  S\R                  S-  S\S-  S\	\
   S\\R                     4
S	 jjrS
rU =r$ )BertSelfAttention   Nc                 N  > [         TU ]  5         UR                  UR                  -  S:w  a7  [	        US5      (       d&  [        SUR                   SUR                   S35      eXl        UR                  U l        [        UR                  UR                  -  5      U l        U R                  U R                  -  U l	        U R                  S-  U l
        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  U R                  5      U l        [        R                   " UR"                  5      U l        UR&                  U l        X l        X0l        g Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ()rz   )r6   r7   r:   num_attention_headsr\   
ValueErrorrO   rm   attention_head_sizeall_head_sizerw   r   Linearrs   rt   ru   rC   attention_probs_dropout_probrE   
is_decoder	is_causal	layer_idxrN   rO   r   r   rP   s       rQ   r7   BertSelfAttention.__init__   sM    : ::a?PVXhHiHi#F$6$6#7 8 445Q8  #)#=#= #&v'9'9F<V<V'V#W !558P8PP//5YYv1143E3EF
99V//1C1CDYYv1143E3EF
zz&"E"EF ++""rS   hidden_statesrv   past_key_valuesrx   rW   c                    UR                   S S n/ UQSPU R                  P7nU R                  U5      R                  " U6 R	                  SS5      nU R                  U5      R                  " U6 R	                  SS5      nU R                  U5      R                  " U6 R	                  SS5      n	UbA  Un
[        U[        5      (       a  UR                  n
U
R                  XU R                  5      u  p[        R                  " U R                  R                  [         5      nU" U UUU	U4U R"                  (       d  SOU R$                  R&                  U R(                  S.UD6u  pUR*                  " / UQSP76 R-                  5       nX4$ )Nr2   r'   r{           rE   rw   )r]   r   rs   viewr   rt   ru   
isinstancer   self_attention_cacheupdater   r   get_interfacerO   _attn_implementationr   r~   rE   r}   rw   reshaper   )rN   r   rv   r   rx   r_   hidden_shapequery_layer	key_layervalue_layercurrent_past_key_valuesattention_interfacer   r   s                 rQ   rd   BertSelfAttention.forward   s    $))#2.CCbC$*B*BC jj/44lCMMaQRSHH]+00,?II!QO	jj/44lCMMaQRS&&5#/+>??*9*N*N' &=%C%CI\`\j\j%k"I(?(M(MKK,,.E)
 %8	%
  $}}C$,,..LL	%
 	%
! "));;;;FFH((rS   )r   r   rO   rE   r   r   rt   r   r   rs   rw   ru   FN)NNrf   rg   rh   ri   r7   rG   rn   rl   r   r   r!   tuplerd   ro   rp   rq   s   @rQ   r   r      sl    #6 48(,	')||') ))D0') 	')
 +,') 
u||	') ')rS   r   c                      ^  \ rS rSrSU 4S jjr   SS\R                  S\R                  S-  S\R                  S-  S\S-  S\	\
   S	\\R                     4S
 jjrSrU =r$ )BertCrossAttention   Nc                 ,  > [         TU ]  5         UR                  UR                  -  S:w  a7  [	        US5      (       d&  [        SUR                   SUR                   S35      eXl        UR                  U l        [        UR                  UR                  -  5      U l        U R                  U R                  -  U l	        U R                  S-  U l
        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  U R                  5      U l        [        R                   " UR"                  5      U l        X l        X0l        g r   )r6   r7   r:   r   r\   r   rO   rm   r   r   rw   r   r   rs   rt   ru   rC   r   rE   r   r   r   s       rQ   r7   BertCrossAttention.__init__   s@    : ::a?PVXhHiHi#F$6$6#7 8 445Q8  #)#=#= #&v'9'9F<V<V'V#W !558P8PP//5YYv1143E3EF
99V//1C1CDYYv1143E3EF
zz&"E"EF""rS   r   encoder_hidden_statesrv   r   rx   rW   c                 z   UR                   S S n/ UQSPU R                  P7nU R                  U5      R                  U5      R	                  SS5      nUb%  UR
                  R                  U R                  5      OSn	Ubb  U	(       a[  UR                  R                  U R                     R                  n
UR                  R                  U R                     R                  nO/ UR                   S S QSPU R                  P7nU R                  U5      R                  U5      R	                  SS5      n
U R                  U5      R                  U5      R	                  SS5      nUbA  UR                  R                  XU R                  5      u  pSUR
                  U R                  '   [        R                   " U R"                  R$                  [&        5      nU" U UU
UU4U R(                  (       d  SOU R*                  R,                  U R.                  S.UD6u  pUR0                  " / UQSP76 R3                  5       nX4$ )Nr2   r'   r{   FTr   r   )r]   r   rs   r   r   
is_updatedgetr   cross_attention_cachelayerskeysvaluesrt   ru   r   r   r   rO   r   r   r~   rE   r}   rw   r   r   )rN   r   r   rv   r   rx   r_   r   r   r   r   r   kv_shaper   r   r   s                   rQ   rd   BertCrossAttention.forward   s    $))#2.CCbC$*B*BC jj/44\BLLQPQRGVGb_//33DNNChm
&:'==DDT^^TYYI)??FFt~~V]]KX.44Sb9X2Xt?W?WXH!67<<XFPPQRTUVI**%:;@@JTTUVXYZK*)8)N)N)U)UDNN*&	 >B**4>>:(?(M(MKK,,.E)
 %8	%
  $}}C$,,..LL	%
 	%
! "));;;;FFH((rS   )r   r   rO   rE   r   rt   r   r   rs   rw   ru   r   )NNN)rf   rg   rh   ri   r7   rG   rn   rl   r   r   r!   r   rd   ro   rp   rq   s   @rQ   r   r      s    #4 ;?376:1)||1)  %00471) ))D0	1)
 -t31) +,1) 
u||	1) 1)rS   r   c                   z   ^  \ rS rSrU 4S jrS\R                  S\R                  S\R                  4S jrSrU =r	$ )BertSelfOutputi  c                 (  > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " UR                  UR                  S9U l        [        R                  " UR                  5      U l
        g Nr.   )r6   r7   r   r   r:   denserA   rB   rC   rD   rE   rM   s     rQ   r7   BertSelfOutput.__init__  s`    YYv1163E3EF
f&8&8f>S>STzz&"<"<=rS   r   input_tensorrW   c                 p    U R                  U5      nU R                  U5      nU R                  X-   5      nU$ Nr   rE   rA   rN   r   r   s      rQ   rd   BertSelfOutput.forward%  5    

=1]3}'CDrS   rA   r   rE   
rf   rg   rh   ri   r7   rG   rn   rd   ro   rp   rq   s   @rQ   r   r     6    >U\\  RWR^R^  rS   r   c                      ^  \ rS rSrSU 4S jjr    SS\R                  S\R                  S-  S\R                  S-  S\R                  S-  S\S-  S	\	\
   S
\\R                     4S jjrSrU =r$ )BertAttentioni,  Nc                    > [         TU ]  5         X@l        U(       a  [        O[        nU" XUS9U l        [        U5      U l        g )Nr   r   )r6   r7   is_cross_attentionr   r   rN   r   output)rN   rO   r   r   r   attention_classrP   s         rQ   r7   BertAttention.__init__-  s9    "40B,HY#F9U	$V,rS   r   rv   r   encoder_attention_maskr   rx   rW   c                     U R                   (       d  UOUnU R                  " U4UUUS.UD6u  pxU R                  Xq5      nXx4$ )N)r   rv   r   )r   rN   r   )	rN   r   rv   r   r   r   rx   attention_outputr   s	            rQ   rd   BertAttention.forward4  s\     04/F/FLb)-*
"7)+	*

 *
&  ;;'7G--rS   )r   r   rN   )FNFNNNNr   rq   s   @rQ   r   r   ,  s    - 48:>;?(,.||. ))D0.  %0047	.
 !& 1 1D 8. . +,. 
u||	. .rS   r   c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )BertIntermediateiI  c                   > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        UR                  [        5      (       a  [        UR                     U l        g UR                  U l        g r   )r6   r7   r   r   r:   intermediate_sizer   r   
hidden_actstrr   intermediate_act_fnrM   s     rQ   r7   BertIntermediate.__init__J  s`    YYv1163K3KL
f''--'-f.?.?'@D$'-'8'8D$rS   r   rW   c                 J    U R                  U5      nU R                  U5      nU$ r   r   r   rN   r   s     rQ   rd   BertIntermediate.forwardR  s&    

=100?rS   r   r   rq   s   @rQ   r   r   I  s(    9U\\ ell  rS   r   c                   z   ^  \ rS rSrU 4S jrS\R                  S\R                  S\R                  4S jrSrU =r	$ )
BertOutputiX  c                 (  > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        R                  " UR
                  UR                  S9U l        [        R                  " UR                  5      U l        g r   )r6   r7   r   r   r   r:   r   rA   rB   rC   rD   rE   rM   s     rQ   r7   BertOutput.__init__Y  s`    YYv779K9KL
f&8&8f>S>STzz&"<"<=rS   r   r   rW   c                 p    U R                  U5      nU R                  U5      nU R                  X-   5      nU$ r   r   r   s      rQ   rd   BertOutput.forward_  r   rS   r   r   rq   s   @rQ   r   r   X  r   rS   r   c                      ^  \ rS rSrSU 4S jjr    SS\R                  S\R                  S-  S\R                  S-  S\R                  S-  S\S-  S	\	\
   S
\R                  4S jjrS rSrU =r$ )	BertLayerif  Nc                   > [         TU ]  5         UR                  U l        SU l        [	        XR
                  US9U l        UR
                  U l        UR                  U l        U R                  (       a0  U R
                  (       d  [        U  S35      e[	        USUSS9U l	        [        U5      U l        [        U5      U l        g )Nr'   r   z> should be used as a decoder model if cross attention is addedFT)r   r   r   )r6   r7   chunk_size_feed_forwardseq_len_dimr   r   	attentionadd_cross_attentionr   crossattentionr   intermediater   r   )rN   rO   r   rP   s      rQ   r7   BertLayer.__init__g  s    '-'E'E$&v9J9JV_` ++#)#=#= ##?? D6)g!hii"/##'	#D -V4 (rS   r   rv   r   r   r   rx   rW   c                 2   U R                   " UU4SU0UD6u  pxUn	U R                  (       a?  Ub<  [        U S5      (       d  [        SU  S35      eU R                  " US UU4SU0UD6u  pU
n	[        U R                  U R                  U R                  U	5      nU$ )Nr   r   z'If `encoder_hidden_states` are passed, z` has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`)	r   r   r\   r   r   r   feed_forward_chunkr   r   )rN   r   rv   r   r   r   rx   self_attention_output_r   cross_attention_outputlayer_outputs               rQ   rd   BertLayer.forwardz  s     $(>>$
 ,$
 	$
  1??4@4!122 =dV DD D 
 )-(;(;%%&	)
 !0) )%"  60##T%A%A4CSCSUe
 rS   c                 J    U R                  U5      nU R                  X!5      nU$ r   )r   r   )rN   r   intermediate_outputr   s       rQ   r   BertLayer.feed_forward_chunk  s)    "//0@A{{#6IrS   )r   r   r   r   r   r   r   r   r   r   )rf   rg   rh   ri   r7   rG   rn   rl   r   r   r!   rd   r   ro   rp   rq   s   @rQ   r   r   f  s    ), 48:>;?(,%||% ))D0%  %0047	%
 !& 1 1D 8% % +,% 
%N rS   r   c                      ^  \ rS rSrU 4S jr     SS\R                  S\R                  S-  S\R                  S-  S\R                  S-  S\S-  S	\	S-  S
\
\   S\\R                     \-  4S jjrSrU =r$ )BertEncoderi  c           
         > [         TU ]  5         Xl        [        R                  " [        UR                  5       Vs/ s H  n[        XS9PM     sn5      U l        g s  snf )N)r   )	r6   r7   rO   r   
ModuleListrangenum_hidden_layersr   layer)rN   rO   irP   s      rQ   r7   BertEncoder.__init__  sH    ]]ERXRjRjLk#lLkqIf$BLk#lm
#ls   ANr   rv   r   r   r   	use_cacherx   rW   c                     [        U R                  5       H  u  pU	" UUU4UUS.UD6nM     [        UU(       a  US9$ S S9$ )N)r   r   )last_hidden_stater   )	enumerater  r   )
rN   r   rv   r   r   r   r
  rx   r  layer_modules
             rQ   rd   BertEncoder.forward  sg      )4OA(% (> / M  5 9+/8O
 	
>B
 	
rS   )rO   r  )NNNNN)rf   rg   rh   ri   r7   rG   rn   rl   r   boolr   r!   r   r   rd   ro   rp   rq   s   @rQ   r  r    s    n 48:>;?(,!%
||
 ))D0
  %0047	

 !& 1 1D 8
 
 $;
 +,
 
u||	H	H
 
rS   r  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )
BertPooleri  c                    > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " 5       U l        g r   )r6   r7   r   r   r:   r   Tanh
activationrM   s     rQ   r7   BertPooler.__init__  s9    YYv1163E3EF
'')rS   r   rW   c                 \    US S 2S4   nU R                  U5      nU R                  U5      nU$ )Nr   )r   r  )rN   r   first_token_tensorpooled_outputs       rQ   rd   BertPooler.forward  s6     +1a40

#566rS   )r  r   r   rq   s   @rQ   r  r    s(    $
U\\ ell  rS   r  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )BertPredictionHeadTransformi  c                 p  > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        UR                  [        5      (       a  [        UR                     U l
        OUR                  U l
        [        R                  " UR                  UR                  S9U l        g r   )r6   r7   r   r   r:   r   r   r   r   r   transform_act_fnrA   rB   rM   s     rQ   r7   $BertPredictionHeadTransform.__init__  s~    YYv1163E3EF
f''--$*6+<+<$=D!$*$5$5D!f&8&8f>S>STrS   r   rW   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ r   )r   r  rA   r   s     rQ   rd   #BertPredictionHeadTransform.forward  s4    

=1--m<}5rS   )rA   r   r  r   rq   s   @rQ   r  r    s)    UU\\ ell  rS   r  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )BertLMPredictionHeadi  c                   > [         TU ]  5         [        U5      U l        [        R
                  " UR                  UR                  SS9U l        [        R                  " [        R                  " UR                  5      5      U l        g )NT)bias)r6   r7   r  	transformr   r   r:   r9   decoder	ParameterrG   rJ   r%  rM   s     rQ   r7   BertLMPredictionHead.__init__  s[    4V< yy!3!3V5F5FTRLLV->->!?@	rS   c                 J    U R                  U5      nU R                  U5      nU$ r   )r&  r'  r   s     rQ   rd   BertLMPredictionHead.forward  s$    }5]3rS   )r%  r'  r&  rf   rg   rh   ri   r7   rd   ro   rp   rq   s   @rQ   r#  r#    s    A rS   r#  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )BertOnlyMLMHeadi  c                 B   > [         TU ]  5         [        U5      U l        g r   )r6   r7   r#  predictionsrM   s     rQ   r7   BertOnlyMLMHead.__init__  s    /7rS   sequence_outputrW   c                 (    U R                  U5      nU$ r   r0  )rN   r2  prediction_scoress      rQ   rd   BertOnlyMLMHead.forward  s     ,,_=  rS   r4  r   rq   s   @rQ   r.  r.    s(    8!u|| ! ! !rS   r.  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )BertOnlyNSPHeadi  c                 n   > [         TU ]  5         [        R                  " UR                  S5      U l        g Nr{   )r6   r7   r   r   r:   seq_relationshiprM   s     rQ   r7   BertOnlyNSPHead.__init__  s'     "		&*<*<a @rS   c                 (    U R                  U5      nU$ r   r;  )rN   r  seq_relationship_scores      rQ   rd   BertOnlyNSPHead.forward  s    !%!6!6}!E%%rS   r>  r,  rq   s   @rQ   r8  r8    s    A& &rS   r8  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )BertPreTrainingHeadsi  c                    > [         TU ]  5         [        U5      U l        [        R
                  " UR                  S5      U l        g r:  )r6   r7   r#  r0  r   r   r:   r;  rM   s     rQ   r7   BertPreTrainingHeads.__init__  s4    /7 "		&*<*<a @rS   c                 L    U R                  U5      nU R                  U5      nX44$ r   r0  r;  )rN   r2  r  r5  r?  s        rQ   rd   BertPreTrainingHeads.forward  s-     ,,_=!%!6!6}!E 88rS   rF  r,  rq   s   @rQ   rB  rB    s    A
9 9rS   rB  c                   x   ^  \ rS rSr\rSrSrSrSr	Sr
Sr\\\S.r\R"                  " 5       U 4S j5       rSrU =r$ )BertPreTrainedModeli  bertT)r   
attentionscross_attentionsc                   > [         TU ]  U5        [        U[        5      (       a!  [        R
                  " UR                  5        g[        U[        5      (       a|  [        R                  " UR                  [        R                  " UR                  R                  S   5      R                  S5      5        [        R
                  " UR                  5        gg)zInitialize the weightsr2   r1   N)r6   _init_weightsr   r#  initzeros_r%  r*   copy_r0   rG   rH   r]   rI   r4   )rN   rr   rP   s     rQ   rN  !BertPreTrainedModel._init_weights&  s     	f%f233KK$//JJv**ELL9L9L9R9RSU9V,W,^,^_f,ghKK--. 0rS    )rf   rg   rh   ri   r(   config_classbase_model_prefixsupports_gradient_checkpointing_supports_flash_attn_supports_sdpa_supports_flex_attn_supports_attention_backendr   r   r   _can_record_outputsrG   no_gradrN  ro   rp   rq   s   @rQ   rI  rI    sV    L&*#N"&"'. ]]_/ /rS   rI  z0
    Output type of [`BertForPreTraining`].
    )custom_introc                       \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Sr
\R                  S-  \S'   Sr\\R                     S-  \S'   Sr\\R                     S-  \S'   S	rg)
BertForPreTrainingOutputi1  ar  
loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
    Total loss as the sum of the masked language modeling loss and the next sequence prediction
    (classification) loss.
prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
    Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
seq_relationship_logits (`torch.FloatTensor` of shape `(batch_size, 2)`):
    Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
    before SoftMax).
Nlossprediction_logitsseq_relationship_logitsr   rK  rS  )rf   rg   rh   ri   rj   r`  rG   rl   __annotations__ra  rb  r   r   rK  ro   rS  rS   rQ   r_  r_  1  s~    	 &*D%

d
")26u((4/68<U..5<59M5**+d2926Je''(4/6rS   r_  a
  
    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
    cross-attention is added between the self-attention layers, following the architecture described in [Attention is
    all you need](https://huggingface.co/papers/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
    Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.

    To behave as an decoder the model needs to be initialized with the `is_decoder` argument of the configuration set
    to `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and
    `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.
    c                     ^  \ rS rSrSS/rSU 4S jjrS rS r\\	\
         SS\R                  S-  S	\R                  S-  S
\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\S-  S\S-  S\\   S\\R                     \-  4S jj5       5       5       rS rSrU =r$ )	BertModeliJ  r*   r   c                    > [         TU ]  U5        Xl        SU l        [	        U5      U l        [        U5      U l        U(       a  [        U5      OSU l	        U R                  5         g)z^
add_pooling_layer (bool, *optional*, defaults to `True`):
    Whether to add a pooling layer
FN)r6   r7   rO   gradient_checkpointingr*   rc   r  encoderr  pooler	post_init)rN   rO   add_pooling_layerrP   s      rQ   r7   BertModel.__init__Y  sS    
 	 &+#(0"6*,=j(4 	rS   c                 .    U R                   R                  $ r   rc   r<   rN   s    rQ   get_input_embeddingsBertModel.get_input_embeddingsj  s    ...rS   c                 $    XR                   l        g r   rn  )rN   ru   s     rQ   set_input_embeddingsBertModel.set_input_embeddingsm  s    */'rS   NrT   rv   r4   r0   rU   r   r   r   r
  rx   rW   c
           
         US L US L-  (       a  [        S5      eU R                  R                  (       a  U	b  U	OU R                  R                  n	OSn	U	(       ab  Uc_  Uc  U R                  R                  (       a.  [        [        U R                  S9[        U R                  S95      O[        U R                  S9nUb  UR                  5       OSnU R                  UUUUUS9nU R                  UUUUUS9u  p'U R                  " U4UUUUU	US.U
D6nUR                  nU R                  b  U R                  U5      OS n[        UUUR                  S9$ )	Nz:You must specify exactly one of input_ids or inputs_embedsF)rO   r   )rT   r0   r4   rU   rV   )rv   r   embedding_outputr   r   )rv   r   r   r   r
  r0   )r  pooler_outputr   )r   rO   r   r
  is_encoder_decoderr   r   get_seq_lengthrc   _create_attention_masksrh  r  ri  r   r   )rN   rT   rv   r4   r0   rU   r   r   r   r
  rx   rV   rv  encoder_outputsr2  r  s                   rQ   rd   BertModel.forwardp  sw     -t";<YZZ;;!!%.%:	@U@UII0 )48V8V $L$DlZ^ZeZeFfg!5  FUE`!?!?!Afg??%)'#9 + 
 261M1M)#9-"7+ 2N 2
. ,,	
)"7#9+%	
 	
 *;;8<8OO4UY;-'+;;
 	
rS   c                     U R                   R                  (       a  [        U R                   UUUS9nO[        U R                   UUS9nUb  [        U R                   UUUS9nX4$ )N)rO   rU   rv   r   )rO   rU   rv   )rO   rU   rv   r   )rO   r   r   r   )rN   rv   r   rv  r   r   s         rQ   rz  !BertModel._create_attention_masks  sr     ;;!!/{{.- /	N 7{{.-N "-%>{{.5&;	&" 55rS   )rO   rc   rh  rg  ri  )T)	NNNNNNNNN)rf   rg   rh   ri   _no_split_modulesr7   rp  rs  r%   r&   r"   rG   rn   r   r  r   r!   r   r   rd   rz  ro   rp   rq   s   @rQ   re  re  J  s3    *;7"/0   *..2.2,0-1596:(,!%?
<<$&?
 t+?
 t+	?

 llT)?
 ||d*?
  %||d2?
 !&t 3?
 ?
 $;?
 +,?
 
u||	K	K?
    ?
B6 6rS   re  z
    Bert Model with two heads on top as done during the pretraining: a `masked language modeling` head and a `next
    sentence prediction (classification)` head.
    c                   n  ^  \ rS rSrSSS.rU 4S jrS rS r\\	       SS	\
R                  S-  S
\
R                  S-  S\
R                  S-  S\
R                  S-  S\
R                  S-  S\
R                  S-  S\
R                  S-  S\\   S\\
R                     \-  4S jj5       5       rSrU =r$ )BertForPreTrainingi  &bert.embeddings.word_embeddings.weightcls.predictions.biaszcls.predictions.decoder.weightzcls.predictions.decoder.biasc                    > [         TU ]  U5        [        U5      U l        [	        U5      U l        U R                  5         g r   )r6   r7   re  rJ  rB  clsrj  rM   s     rQ   r7   BertForPreTraining.__init__  s4     f%	'/ 	rS   c                 B    U R                   R                  R                  $ r   r  r0  r'  ro  s    rQ   get_output_embeddings(BertForPreTraining.get_output_embeddings      xx##+++rS   c                     XR                   R                  l        UR                  U R                   R                  l        g r   r  r0  r'  r%  rN   new_embeddingss     rQ   set_output_embeddings(BertForPreTraining.set_output_embeddings  *    '5$$2$7$7!rS   NrT   rv   r4   r0   rU   labelsnext_sentence_labelrx   rW   c           	         U R                   " U4UUUUSS.UD6n	U	SS u  pU R                  X5      u  pSnUbv  Ubs  [        5       nU" UR                  SU R                  R
                  5      UR                  S5      5      nU" UR                  SS5      UR                  S5      5      nUU-   n[        UUUU	R                  U	R                  S9$ )a  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
    config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked),
    the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
next_sentence_label (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the next sequence prediction (classification) loss. Input should be a sequence
    pair (see `input_ids` docstring) Indices should be in `[0, 1]`:

    - 0 indicates sequence B is a continuation of sequence A,
    - 1 indicates sequence B is a random sequence.

Example:

```python
>>> from transformers import AutoTokenizer, BertForPreTraining
>>> import torch

>>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
>>> model = BertForPreTraining.from_pretrained("google-bert/bert-base-uncased")

>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
>>> outputs = model(**inputs)

>>> prediction_logits = outputs.prediction_logits
>>> seq_relationship_logits = outputs.seq_relationship_logits
```
Trv   r4   r0   rU   return_dictNr{   r2   )r`  ra  rb  r   rK  )	rJ  r  r   r   rO   r9   r_  r   rK  )rN   rT   rv   r4   r0   rU   r  r  rx   outputsr2  r  r5  r?  
total_lossloss_fctmasked_lm_lossnext_sentence_losss                     rQ   rd   BertForPreTraining.forward  s    R ))
))%'
 
 *1!&48HH_4\1
"5"A')H%&7&<&<RAWAW&XZ`ZeZefhZijN!)*@*E*Eb!*LNaNfNfgiNj!k'*<<J'/$:!//))
 	
rS   rJ  r  NNNNNNN)rf   rg   rh   ri   _tied_weights_keysr7   r  r  r$   r"   rG   rn   r   r!   r   r_  rd   ro   rp   rq   s   @rQ   r  r    s    +S(>
,8  *..2.2,0-1&*37A
<<$&A
 t+A
 t+	A

 llT)A
 ||d*A
 t#A
 #\\D0A
 +,A
 
u||	7	7A
  A
rS   r  zP
    Bert Model with a `language modeling` head on top for CLM fine-tuning.
    c                     ^  \ rS rSrSSS.rU 4S jrS rS r\\	           SS	\
R                  S-  S
\
R                  S-  S\
R                  S-  S\
R                  S-  S\
R                  S-  S\
R                  S-  S\
R                  S-  S\
R                  S-  S\S-  S\S-  S\\
R                  -  S\\   S\\
R                     \-  4S jj5       5       rSrU =r$ )BertLMHeadModeli7  r  r  r  c                    > [         TU ]  U5        UR                  (       d  [        R	                  S5        [        USS9U l        [        U5      U l        U R                  5         g )NzLIf you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True.`Frk  
r6   r7   r   loggerwarningre  rJ  r.  r  rj  rM   s     rQ   r7   BertLMHeadModel.__init__B  sL       NNijf>	"6* 	rS   c                 B    U R                   R                  R                  $ r   r  ro  s    rQ   r  %BertLMHeadModel.get_output_embeddingsN  r  rS   c                     XR                   R                  l        UR                  U R                   R                  l        g r   r  r  s     rQ   r  %BertLMHeadModel.set_output_embeddingsQ  r  rS   NrT   rv   r4   r0   rU   r   r   r  r   r
  logits_to_keeprx   rW   c                    Ub  Sn
U R                   " U4UUUUUUU	U
SS.	UD6nUR                  n[        U[        5      (       a  [	        U* S5      OUnU R                  USS2USS24   5      nSnUb)  U R                  " SUXR                  R                  S.UD6n[        UUUR                  UR                  UR                  UR                  S9$ )a  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
    `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
    ignored (masked), the loss is only computed for the tokens with labels n `[0, ..., config.vocab_size]`
NFT)	rv   r4   r0   rU   r   r   r   r
  r  )logitsr  r9   )r`  r  r   r   rK  rL  rS  )rJ  r  r   rm   slicer  loss_functionrO   r9   r   r   r   rK  rL  )rN   rT   rv   r4   r0   rU   r   r   r  r   r
  r  rx   r  r   slice_indicesr  r`  s                     rQ   rd   BertLMHeadModel.forwardU  s    . I@D		A
))%'"7#9+A
 A
  118B>SV8W8W~ot4]k-=!(;<=%%pVF{{OeOepiopD0#33!//))$55
 	
rS   r  )NNNNNNNNNNr   )rf   rg   rh   ri   r  r7   r  r  r$   r"   rG   rn   r   r  rm   r   r!   r   r   rd   ro   rp   rq   s   @rQ   r  r  7  sP    +S(>

,8  *..2.2,0-1596:&*(,!%-.6
<<$&6
 t+6
 t+	6

 llT)6
 ||d*6
  %||d26
 !&t 36
 t#6
 6
 $;6
 ell*6
 +,6
 
u||	@	@6
  6
rS   r  c                     ^  \ rS rSrSSS.rU 4S jrS rS r\\	        SS	\
R                  S-  S
\
R                  S-  S\
R                  S-  S\
R                  S-  S\
R                  S-  S\
R                  S-  S\
R                  S-  S\
R                  S-  S\\   S\\
R                     \-  4S jj5       5       rSrU =r$ )BertForMaskedLMi  r  r  r  c                    > [         TU ]  U5        UR                  (       a  [        R	                  S5        [        USS9U l        [        U5      U l        U R                  5         g )NzkIf you want to use `BertForMaskedLM` make sure `config.is_decoder=False` for bi-directional self-attention.Fr  r  rM   s     rQ   r7   BertForMaskedLM.__init__  sR     NN1
 f>	"6* 	rS   c                 B    U R                   R                  R                  $ r   r  ro  s    rQ   r  %BertForMaskedLM.get_output_embeddings  r  rS   c                     XR                   R                  l        UR                  U R                   R                  l        g r   r  r  s     rQ   r  %BertForMaskedLM.set_output_embeddings  r  rS   NrT   rv   r4   r0   rU   r   r   r  rx   rW   c	                 :   U R                   " U4UUUUUUSS.U	D6n
U
S   nU R                  U5      nSnUbF  [        5       nU" UR                  SU R                  R
                  5      UR                  S5      5      n[        UUU
R                  U
R                  S9$ )az  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
    config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
    loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
T)rv   r4   r0   rU   r   r   r  r   Nr2   r`  r  r   rK  )	rJ  r  r   r   rO   r9   r   r   rK  )rN   rT   rv   r4   r0   rU   r   r   r  rx   r  r2  r5  r  r  s                  rQ   rd   BertForMaskedLM.forward  s    ( ))

))%'"7#9

 

 "!* HH_5')H%&7&<&<RAWAW&XZ`ZeZefhZijN$!//))	
 	
rS   r  )NNNNNNNN)rf   rg   rh   ri   r  r7   r  r  r$   r"   rG   rn   r   r!   r   r   rd   ro   rp   rq   s   @rQ   r  r    s    +S(>
,8  *..2.2,0-1596:&*+
<<$&+
 t++
 t+	+

 llT)+
 ||d*+
  %||d2+
 !&t 3+
 t#+
 +,+
 
u||	~	-+
  +
rS   r  zT
    Bert Model with a `next sentence prediction (classification)` head on top.
    c                   8  ^  \ rS rSrU 4S jr\\      SS\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S	\R                  S-  S
\	\
   S\\R                     \-  4S jj5       5       rSrU =r$ )BertForNextSentencePredictioni  c                    > [         TU ]  U5        [        U5      U l        [	        U5      U l        U R                  5         g r   )r6   r7   re  rJ  r8  r  rj  rM   s     rQ   r7   &BertForNextSentencePrediction.__init__  s4     f%	"6* 	rS   NrT   rv   r4   r0   rU   r  rx   rW   c           	         U R                   " U4UUUUSS.UD6nUS   n	U R                  U	5      n
SnUb2  [        5       nU" U
R                  SS5      UR                  S5      5      n[	        UU
UR
                  UR                  S9$ )a"  
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair
    (see `input_ids` docstring). Indices should be in `[0, 1]`:

    - 0 indicates sequence B is a continuation of sequence A,
    - 1 indicates sequence B is a random sequence.

Example:

```python
>>> from transformers import AutoTokenizer, BertForNextSentencePrediction
>>> import torch

>>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
>>> model = BertForNextSentencePrediction.from_pretrained("google-bert/bert-base-uncased")

>>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
>>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
>>> encoding = tokenizer(prompt, next_sentence, return_tensors="pt")

>>> outputs = model(**encoding, labels=torch.LongTensor([1]))
>>> logits = outputs.logits
>>> assert logits[0, 0] < logits[0, 1]  # next sentence was random
```
Tr  r'   Nr2   r{   r  )rJ  r  r   r   r   r   rK  )rN   rT   rv   r4   r0   rU   r  rx   r  r  seq_relationship_scoresr  r  s                rQ   rd   %BertForNextSentencePrediction.forward  s    N ))
))%'
 
  
"&((="9!')H!)*A*F*Fr1*Mv{{[]!_*#*!//))	
 	
rS   r  NNNNNN)rf   rg   rh   ri   r7   r$   r"   rG   rn   r   r!   r   r   rd   ro   rp   rq   s   @rQ   r  r    s      *..2.2,0-1&*=
<<$&=
 t+=
 t+	=

 llT)=
 ||d*=
 t#=
 +,=
 
u||	:	:=
  =
rS   r  z
    Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
    output) e.g. for GLUE tasks.
    c                   8  ^  \ rS rSrU 4S jr\\      SS\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S	\R                  S-  S
\	\
   S\\R                     \-  4S jj5       5       rSrU =r$ )BertForSequenceClassificationi.  c                 r  > [         TU ]  U5        UR                  U l        Xl        [	        U5      U l        UR                  b  UR                  OUR                  n[        R                  " U5      U l
        [        R                  " UR                  UR                  5      U l        U R                  5         g r   )r6   r7   
num_labelsrO   re  rJ  classifier_dropoutrD   r   rC   rE   r   r:   
classifierrj  rN   rO   r  rP   s      rQ   r7   &BertForSequenceClassification.__init__5  s      ++f%	)/)B)B)NF%%TZTnTn 	 zz"45))F$6$68I8IJ 	rS   NrT   rv   r4   r0   rU   r  rx   rW   c           	         U R                   " U4UUUUSS.UD6nUS   n	U R                  U	5      n	U R                  U	5      n
SnUGb  U R                  R                  c  U R
                  S:X  a  SU R                  l        OoU R
                  S:  aN  UR                  [        R                  :X  d  UR                  [        R                  :X  a  SU R                  l        OSU R                  l        U R                  R                  S:X  aI  [        5       nU R
                  S:X  a&  U" U
R                  5       UR                  5       5      nOU" X5      nOU R                  R                  S:X  a=  [        5       nU" U
R                  SU R
                  5      UR                  S5      5      nO,U R                  R                  S:X  a  [        5       nU" X5      n[        UU
UR                   UR"                  S	9$ )
ae  
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
Tr  r'   N
regressionsingle_label_classificationmulti_label_classificationr2   r  )rJ  rE   r  rO   problem_typer  r5   rG   rL   rm   r   squeezer   r   r   r   r   rK  )rN   rT   rv   r4   r0   rU   r  rx   r  r  r  r`  r  s                rQ   rd   %BertForSequenceClassification.forwardD  s   $ ))
))%'
 
  
]3/{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#FNN$4fnn6FGD#F3D))-JJ+-B @&++b/R))-II,./'!//))	
 	
rS   )rJ  r  rO   rE   r  r  )rf   rg   rh   ri   r7   r$   r"   rG   rn   r   r!   r   r   rd   ro   rp   rq   s   @rQ   r  r  .  s      *..2.2,0-1&*;
<<$&;
 t+;
 t+	;

 llT);
 ||d*;
 t#;
 +,;
 
u||	7	7;
  ;
rS   r  c                   8  ^  \ rS rSrU 4S jr\\      SS\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S	\R                  S-  S
\	\
   S\\R                     \-  4S jj5       5       rSrU =r$ )BertForMultipleChoicei  c                 0  > [         TU ]  U5        [        U5      U l        UR                  b  UR                  OUR
                  n[        R                  " U5      U l        [        R                  " UR                  S5      U l        U R                  5         g )Nr'   )r6   r7   re  rJ  r  rD   r   rC   rE   r   r:   r  rj  r  s      rQ   r7   BertForMultipleChoice.__init__  su     f%	)/)B)B)NF%%TZTnTn 	 zz"45))F$6$6: 	rS   NrT   rv   r4   r0   rU   r  rx   rW   c           	         Ub  UR                   S   OUR                   S   nUb!  UR                  SUR                  S5      5      OSnUb!  UR                  SUR                  S5      5      OSnUb!  UR                  SUR                  S5      5      OSnUb!  UR                  SUR                  S5      5      OSnUb1  UR                  SUR                  S5      UR                  S5      5      OSnU R                  " U4UUUUSS.UD6n	U	S   n
U R	                  U
5      n
U R                  U
5      nUR                  SU5      nSnUb  [        5       nU" X5      n[        UUU	R                  U	R                  S9$ )a  
input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
    Indices of input sequence tokens in the vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are input IDs?](../glossary#input-ids)
token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
    Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
    1]`:

    - 0 corresponds to a *sentence A* token,
    - 1 corresponds to a *sentence B* token.

    [What are token type IDs?](../glossary#token-type-ids)
position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
    Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
    config.max_position_embeddings - 1]`.

    [What are position IDs?](../glossary#position-ids)
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
    Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
    is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
    model's internal embedding lookup matrix.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
    num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
    `input_ids` above)
Nr'   r2   Tr  r  )
r]   r   rK   rJ  rE   r  r   r   r   rK  )rN   rT   rv   r4   r0   rU   r  rx   num_choicesr  r  r  reshaped_logitsr`  r  s                  rQ   rd   BertForMultipleChoice.forward  s   T -6,Aiooa(}GZGZ[\G]>G>SINN2y~~b'9:Y]	M[Mg,,R1D1DR1HImqM[Mg,,R1D1DR1HImqGSG_|((\->->r-BCei ( r=#5#5b#9=;M;Mb;QR 	 ))
))%'
 
  
]3/ ++b+6')HO4D("!//))	
 	
rS   )rJ  r  rE   r  )rf   rg   rh   ri   r7   r$   r"   rG   rn   r   r!   r   r   rd   ro   rp   rq   s   @rQ   r  r    s      *..2.2,0-1&*N
<<$&N
 t+N
 t+	N

 llT)N
 ||d*N
 t#N
 +,N
 
u||	8	8N
  N
rS   r  c                   8  ^  \ rS rSrU 4S jr\\      SS\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S	\R                  S-  S
\	\
   S\\R                     \-  4S jj5       5       rSrU =r$ )BertForTokenClassificationi  c                 d  > [         TU ]  U5        UR                  U l        [        USS9U l        UR
                  b  UR
                  OUR                  n[        R                  " U5      U l	        [        R                  " UR                  UR                  5      U l        U R                  5         g NFr  )r6   r7   r  re  rJ  r  rD   r   rC   rE   r   r:   r  rj  r  s      rQ   r7   #BertForTokenClassification.__init__  s      ++f>	)/)B)B)NF%%TZTnTn 	 zz"45))F$6$68I8IJ 	rS   NrT   rv   r4   r0   rU   r  rx   rW   c           	      D   U R                   " U4UUUUSS.UD6nUS   n	U R                  U	5      n	U R                  U	5      n
SnUb<  [        5       nU" U
R	                  SU R
                  5      UR	                  S5      5      n[        UU
UR                  UR                  S9$ )z
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
Tr  r   Nr2   r  )	rJ  rE   r  r   r   r  r   r   rK  )rN   rT   rv   r4   r0   rU   r  rx   r  r2  r  r`  r  s                rQ   rd   "BertForTokenClassification.forward  s      ))
))%'
 
 "!*,,71')HFKKDOO<fkk"oND$!//))	
 	
rS   )rJ  r  rE   r  r  )rf   rg   rh   ri   r7   r$   r"   rG   rn   r   r!   r   r   rd   ro   rp   rq   s   @rQ   r  r    s      *..2.2,0-1&*'
<<$&'
 t+'
 t+	'

 llT)'
 ||d*'
 t#'
 +,'
 
u||	4	4'
  '
rS   r  c                   X  ^  \ rS rSrU 4S jr\\       SS\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S	\R                  S-  S
\R                  S-  S\	\
   S\\R                     \-  4S jj5       5       rSrU =r$ )BertForQuestionAnsweringi"  c                    > [         TU ]  U5        UR                  U l        [        USS9U l        [
        R                  " UR                  UR                  5      U l        U R                  5         g r  )
r6   r7   r  re  rJ  r   r   r:   
qa_outputsrj  rM   s     rQ   r7   !BertForQuestionAnswering.__init__$  sU      ++f>	))F$6$68I8IJ 	rS   NrT   rv   r4   r0   rU   start_positionsend_positionsrx   rW   c           	         U R                   " U4UUUUSS.UD6n	U	S   n
U R                  U
5      nUR                  SSS9u  pUR                  S5      R	                  5       nUR                  S5      R	                  5       nS nUb  Ub  [        UR                  5       5      S:  a  UR                  S5      n[        UR                  5       5      S:  a  UR                  S5      nUR                  S5      nUR                  SU5      nUR                  SU5      n[        US9nU" X5      nU" X5      nUU-   S-  n[        UUUU	R                  U	R                  S	9$ )
NTr  r   r'   r2   r|   )ignore_indexr{   )r`  start_logits
end_logitsr   rK  )rJ  r  splitr  r   lenrK   clampr   r   r   rK  )rN   rT   rv   r4   r0   rU   r  r  rx   r  r2  r  r  r  r  ignored_indexr  
start_lossend_losss                      rQ   rd    BertForQuestionAnswering.forward.  sx    ))
))%'
 
 "!*1#)<<r<#: #++B/::<''+668

&=+D?'')*Q."1"9"9""==%%'(1, - 5 5b 9(--a0M-33A}EO)//=AM']CH!,@J
:H$x/14J+%!!//))
 	
rS   )rJ  r  r  r  )rf   rg   rh   ri   r7   r$   r"   rG   rn   r   r!   r   r   rd   ro   rp   rq   s   @rQ   r  r  "  s      *..2.2,0-1/3-13
<<$&3
 t+3
 t+	3

 llT)3
 ||d*3
 ,3
 ||d*3
 +,3
 
u||	;	;3
  3
rS   r  )r  r  r  r  r  r  r  r   r  re  rI  )Nr   )Zrj   collections.abcr   dataclassesr   rG   r   torch.nnr   r   r    r
   rO  activationsr   cache_utilsr   r   r   
generationr   masking_utilsr   r   modeling_layersr   modeling_outputsr   r   r   r   r   r   r   r   r   modeling_utilsr   r   processing_utilsr   pytorch_utilsr   utilsr    r!   r"   r#   utils.genericr$   r%   utils.output_capturingr&   configuration_bertr(   
get_loggerrf   r  Moduler*   rn   floatr   r   r   r   r   r   r   r   r  r  r  r#  r.  r8  rB  rI  r_  re  r  r  r  r  r  r  r  r  __all__rS  rS   rQ   <module>r     s    $ !   A A & ! C C ) J 9
 
 
 G & 6 M M I 5 * 
		H	%;RYY ;H !%II%<<% 
% <<	%
 LL4'% T\% % '(%8@)		 @)FI) I)XRYY .BII .:ryy  >* >B
")) 
@ ")) "299  !bii !&bii &	9299 	9 // / /2 
 7{ 7 7& 	|6# |6|6~ Y
, Y
Y
x 
Q
)? Q

Q
h I
) I
 I
X 
I
$7 I

I
X M
$7 M
M
` ^
/ ^
 ^
B 8
!4 8
 8
v @
2 @
 @
FrS   