
    Z ja                        S r SSKJr  SSKrSSKrSSKJr  SSKJrJ	r	J
r
  SSKJr  SSKJr  SS	KJr  SS
KJr  SSKJr  SSKJr  SSKJrJrJrJrJrJr  SSKJ r J!r!  SSK"J#r#  SSK$J%r%  SSK&J'r'J(r(J)r)  SSK*J+r+  SSK,J-r-J.r.  SSK/J0r0  SSK1J2r2  \)Rf                  " \45      r5S\6S\6S\Rn                  4S jr8S\6S\6S\Rn                  4S jr9 " S S\Rt                  5      r;  SAS\Rt                  S\Rn                  S \Rn                  S!\Rn                  S"\Rn                  S-  S#\<S-  S$\<S%\#\'   4S& jjr= " S' S(\Rt                  5      r> " S) S*\Rt                  5      r? " S+ S,\5      r@ " S- S.\Rt                  5      rA\( " S/ S0\!5      5       rB\( " S1 S2\B5      5       rC\(" S3S49 " S5 S6\B5      5       rD\(" S7S49 " S8 S9\B5      5       rE\( " S: S;\B5      5       rF\( " S< S=\B5      5       rG\( " S> S?\B5      5       rH/ S@QrIg)Bz
PyTorch DistilBERT model adapted in part from Facebook, Inc XLM model (https://github.com/facebookresearch/XLM) and in
part from HuggingFace PyTorch version of Google AI Bert model (https://github.com/google-research/bert)
    )CallableN)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )initialization)get_activation)PreTrainedConfig)is_deepspeed_zero3_enabled)create_bidirectional_mask)GradientCheckpointingLayer)BaseModelOutputMaskedLMOutputMultipleChoiceModelOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)apply_chunking_to_forward)TransformersKwargsauto_docstringlogging)deprecate_kwarg)can_return_tuplemerge_with_config_defaults)capture_outputs   )DistilBertConfign_posdimoutc                    [        5       (       a^  SS KnUR                  R                  USS9   [        R
                  R                  5       S:X  a  [        XUS9sS S S 5        $  S S S 5        g [        XUS9$ ! , (       d  f       g = f)Nr   )modifier_rankr"   r#   r$   )r   	deepspeedzeroGatheredParameterstorchdistributedget_rank_create_sinusoidal_embeddings)r"   r#   r$   r(   s       ڃ/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/distilbert/modeling_distilbert.pycreate_sinusoidal_embeddingsr0   ?   st    !##^^..s!.D  ))+q045sS ED0 ED -5sKK	 EDs   ,A88
Bc                    [         R                  " [        U 5       VVs/ s H?  n[        U5       Vs/ s H%  oC[         R                  " SSUS-  -  U-  5      -  PM'     snPMA     snn5      nSUl        [
        R                  " [         R                  " US S 2SS S24   5      5      US S 2SS S24'   [
        R                  " [         R                  " US S 2SS S24   5      5      US S 2SS S24'   UR                  5         U$ s  snf s  snnf )Ni'     Fr   r    )
nparrayrangepowerrequires_gradr+   FloatTensorsincosdetach_)r"   r#   r$   posjposition_encs         r/   r.   r.   J   s    88hmnshtuhtadQVWZQ[\Q[ABHHUAaL34F$GGQ[\htuvLC$$RVVLADqD,A%BCC14a4L$$RVVLADqD,A%BCC14a4LKKMJ ]us   D
,DD
D
c            
          ^  \ rS rSrS\4U 4S jjr\" SSSS9  SS	\R                  S\R                  S-  S
\R                  S-  S\R                  4S jj5       r
SrU =r$ )
EmbeddingsS   configc                   > [         TU ]  5         [        R                  " UR                  UR
                  UR                  S9U l        [        R                  " UR                  UR
                  5      U l	        [        R                  " UR
                  SS9U l
        [        R                  " UR                  5      U l        U R                  S[        R                  " UR                  5      R!                  S5      SS9  g )N)padding_idx-q=epsposition_idsr    F)
persistent)super__init__r   	Embedding
vocab_sizer#   pad_token_idword_embeddingsmax_position_embeddingsposition_embeddings	LayerNormDropoutdropoutregister_bufferr+   arangeexpandselfrB   	__class__s     r/   rM   Embeddings.__init__T   s    !||F,=,=vzzW]WjWjk#%<<0N0NPVPZPZ#[ fjje<zz&..1ELL)G)GHOOPWXej 	 	
    input_embedsz5.6.0inputs_embeds)versionnew_nameN	input_idsrH   returnc                    Ub  U R                  U5      nUR                  S5      nUcu  [        U S5      (       a  U R                  S S 2S U24   nON[        R
                  " U[        R                  UR                  S9nUR                  S5      R                  U5      nU R                  U5      nX%-   nU R                  U5      nU R                  U5      nU$ )Nr    rH   )dtypedevicer   )rQ   sizehasattrrH   r+   rX   longrg   	unsqueeze	expand_asrS   rT   rV   )r[   rc   r`   rH   
seq_lengthrS   
embeddingss          r/   forwardEmbeddings.forward_   s       00;M"''*
 t^,,#00KZK@$||JejjQZQaQab+55a8BB9M"66|D"8
^^J/
\\*-
r^   )rT   rV   rS   rQ   )NN)__name__
__module____qualname____firstlineno__r   rM   r   r+   Tensor
LongTensorro   __static_attributes____classcell__r\   s   @r/   r@   r@   S   sx    	
/ 	
 ^WO .204	<< ||d* &&-	
 
 Pr^   r@   modulequerykeyvalueattention_maskscalingrV   kwargsc                    Uc  UR                  S5      S-  n[        R                  " XR                  SS5      5      U-  nUb  X-   n[        R
                  R                  USS9n[        R
                  R                  XU R                  S9n[        R                  " X5      n	U	R                  SS5      R                  5       n	X4$ )NrJ         r2   r   r#   )ptrainingr    )
rh   r+   matmul	transposer   
functionalsoftmaxrV   r   
contiguous)
rz   r{   r|   r}   r~   r   rV   r   attn_weightsattn_outputs
             r/   eager_attention_forwardr   ~   s     **R.D( <<}}Q':;gEL!#4==((2(>L==((6??([L,,|3K''1-88:K$$r^   c            
          ^  \ rS rSrS\4U 4S jjr SS\R                  S\R                  S-  S\	\
   S\\R                     4S	 jjrS
rU =r$ )DistilBertSelfAttention   rB   c                    > [         TU ]  5         Xl        UR                  U l        UR                  U l        U R                  U R                  -  U l        U R
                  S-  U l        U R                  U R                  -  S:w  a&  [        SU R                   SU R                   S35      e[        R                  " UR                  UR                  S9U l
        [        R                  " UR                  UR                  S9U l        [        R                  " UR                  UR                  S9U l        [        R                  " UR                  UR                  S9U l        [        R                  " UR                  S9U l        SU l        g )	Nr   r   zself.n_heads: z must divide self.dim:  evenlyin_featuresout_featuresr   F)rL   rM   rB   n_headsr#   attention_head_sizer   
ValueErrorr   Linearq_link_linv_linout_linrU   attention_dropoutrV   	is_causalrZ   s     r/   rM    DistilBertSelfAttention.__init__   s   ~~::#'88t||#; //5 88dll"a'~dll^;RSWS[S[R\\cdeeYY6::FJJO
YY6::FJJO
YY6::FJJO
yyVZZfjjQzzF$<$<=r^   Nhidden_statesr~   r   rd   c                    UR                   S S n/ UQSPU R                  P7nU R                  U5      R                  " U6 R	                  SS5      nU R                  U5      R                  " U6 R	                  SS5      nU R                  U5      R                  " U6 R	                  SS5      n[        R                  " U R                  R                  [        5      n	U	" U UUUU4U R                  (       d  SOU R                  R                  U R                  S.UD6u  pU
R                   " / UQSP76 R#                  5       n
U R%                  U
5      n
X4$ )NrJ   r    r2           )rV   r   )shaper   r   viewr   r   r   r   get_interfacerB   _attn_implementationr   r   rV   r   r   reshaper   r   )r[   r   r~   r   input_shapehidden_shapequery_layer	key_layervalue_layerattention_interfacer   r   s               r/   ro   DistilBertSelfAttention.forward   sJ    $))#2.CCbC$*B*BC jj/44lCMMaQRSJJ}-22LAKKAqQ	jj/44lCMMaQRS(?(M(MKK,,.E)
 %8	%
  $}}C$,,..LL	%
 	%
! "));;;;FFHll;/((r^   )r   rB   r#   rV   r   r   r   r   r   r   r   N)rq   rr   rs   rt   r   rM   r+   ru   r8   r   r   tuplero   rw   rx   ry   s   @r/   r   r      sc    / 2 48)||) ))D0) +,	)
 
u||	) )r^   r   c                      ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  4S jrS\R                  S\R                  4S jr	Sr
U =r$ )	FFN   rB   c                   > [         TU ]  5         [        R                  " UR                  S9U l        UR
                  U l        SU l        [        R                  " UR                  UR                  S9U l
        [        R                  " UR                  UR                  S9U l        [        UR                  5      U l        g )Nr   r    r   )rL   rM   r   rU   rV   chunk_size_feed_forwardseq_len_dimr   r#   
hidden_dimlin1lin2r
   
activationrZ   s     r/   rM   FFN.__init__   s    zzFNN3'-'E'E$II&**6CTCTU	II&*;*;&**U	():):;r^   inputrd   c                 Z    [        U R                  U R                  U R                  U5      $ r   )r   ff_chunkr   r   )r[   r   s     r/   ro   FFN.forward   s%    (8T8TVZVfVfhmnnr^   c                     U R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nU$ r   )r   r   r   rV   )r[   r   xs      r/   r   FFN.ff_chunk   s=    IIeOOAIIaLLLOr^   )r   r   rV   r   r   r   )rq   rr   rs   rt   r   rM   r+   ru   ro   r   rw   rx   ry   s   @r/   r   r      sN    </ <oU\\ oell oell u||  r^   r   c                      ^  \ rS rSrS\4U 4S jjr SS\R                  S\R                  S-  S\\	   S\
\R                  S	4   4S
 jjrSrU =r$ )TransformerBlock   rB   c                 x  > [         TU ]  5         UR                  UR                  -  S:w  a&  [	        SUR                   SUR                   S35      e[        U5      U l        [        R                  " UR                  SS9U l	        [        U5      U l        [        R                  " UR                  SS9U l        g )Nr   zconfig.n_heads z must divide config.dim r   rE   )normalized_shaperG   )rL   rM   r#   r   r   r   	attentionr   rT   sa_layer_normr   ffnoutput_layer_normrZ   s     r/   rM   TransformerBlock.__init__   s     ::&!+v~~.>>VW]WaWaVbbijkk08\\6::5Qv;!#vzzu!Ur^   Nr   r~   r   rd   .c                     U R                   " U4SU0UD6u  pEU R                  XA-   5      nU R                  U5      nU R                  Xd-   5      nU$ )Nr~   )r   r   r   r   )r[   r   r~   r   attention_output_
ffn_outputs          r/   ro   TransformerBlock.forward   sl     #nn
)
 

  --.>.NO XX./
++J,IJ
r^   )r   r   r   r   r   )rq   rr   rs   rt   r   rM   r+   ru   r   r   r   ro   rw   rx   ry   s   @r/   r   r      sh    V/ V  /3|| t+ +,	
 
u||S 	! r^   r   c            	          ^  \ rS rSrS\4U 4S jjr SS\R                  S\R                  S-  S\\	   S\
4S	 jjrS
rU =r$ )Transformeri
  rB   c                    > [         TU ]  5         UR                  U l        [        R                  " [        UR                  5       Vs/ s H  n[        U5      PM     sn5      U l        SU l        g s  snf )NF)	rL   rM   n_layersr   
ModuleListr5   r   layergradient_checkpointing)r[   rB   r   r\   s      r/   rM   Transformer.__init__  sW    ]]eFOOF\#]F\$4V$<F\#]^
&+# $^s   A1Nr   r~   r   rd   c                 N    U R                    H  nU" UU40 UD6nM     [        US9$ )N)last_hidden_state)r   r   )r[   r   r~   r   layer_modules        r/   ro   Transformer.forward  s9     !JJL( M ' ??r^   )r   r   r   r   )rq   rr   rs   rt   r   rM   r+   ru   r   r   r   ro   rw   rx   ry   s   @r/   r   r   
  s^    ,/ , /3@||@ t+@ +,	@
 
@ @r^   r   c                      ^  \ rS rSr% \\S'   SrSrSrSr	Sr
Sr\\S.r\R                   " 5       S\R$                  4U 4S jj5       rSrU =r$ )	DistilBertPreTrainedModeli"  rB   
distilbertT)r   
attentionsrz   c           
      H  > [         TU ]  U5        [        U[        5      (       a  U R                  R
                  (       a  [        R                  " UR                  R                  [        U R                  R                  U R                  R                  [        R                  " UR                  R                  5      5      5        [        R                  " UR                  [        R                   " UR                  R"                  S   5      R%                  S5      5        gg)zInitialize the weights.rJ   rI   N)rL   _init_weights
isinstancer@   rB   sinusoidal_pos_embdsinitcopy_rS   weightr0   rR   r#   r+   
empty_likerH   rX   r   rY   )r[   rz   r\   s     r/   r   'DistilBertPreTrainedModel._init_weights0  s     	f%fj)){{//

..550;;(()C)C)J)JK JJv**ELL9L9L9R9RSU9V,W,^,^_f,gh *r^    )rq   rr   rs   rt   r!   __annotations__base_model_prefixsupports_gradient_checkpointing_supports_flash_attn_supports_sdpa_supports_flex_attn_supports_attention_backendr   r   _can_record_outputsr+   no_gradr   Moduler   rw   rx   ry   s   @r/   r   r   "  sb    $&*#N"&)-
 ]]_iBII i ir^   r   c                     ^  \ rS rSrS\4U 4S jjrS\R                  4S jrS\	4S jr
S\R                  4S jrS	\R                  4S
 jr\\\    SS\R"                  S-  S\R"                  S-  S\R"                  S-  S\R"                  S-  S\\   S\\\R"                  S4   -  4S jj5       5       5       rSrU =r$ )DistilBertModeliA  rB   c                    > [         TU ]  U5        [        U5      U l        [	        U5      U l        U R                  5         g r   )rL   rM   r@   rn   r   transformer	post_initrZ   s     r/   rM   DistilBertModel.__init__C  s5     $V,&v. 	r^   rd   c                 .    U R                   R                  $ z!
Returns the position embeddings
)rn   rS   r[   s    r/   get_position_embeddings'DistilBertModel.get_position_embeddingsL  s     222r^   new_num_position_embeddingsc                    XR                   R                  -
  nUS:X  a  g[        R                  SU S35        XR                   l        U R                  R
                  R                  R                  5       n[        R                  " U R                   R                  U R                   R                  5      U R                  l        U R                   R                  (       aH  [        U R                   R                  U R                   R                  U R
                  R                  S9  O[        R                  " 5          US:  a9  [        R                  " U5      U R                  R
                  R                  SU* & O2[        R                  " USU 5      U R                  R
                  l        SSS5        U R                  R
                  R!                  U R"                  5        g! , (       d  f       N>= f)  
Resizes position embeddings of the model if `new_num_position_embeddings != config.max_position_embeddings`.

Arguments:
    new_num_position_embeddings (`int`):
        The number of new position embedding matrix. If position embeddings are learned, increasing the size
        will add newly initialized vectors at the end, whereas reducing the size will remove vectors from the
        end. If position embeddings are not learned (*e.g.* sinusoidal position embeddings), increasing the
        size will add correct vectors at the end following the position encoding algorithm, whereas reducing
        the size will remove vectors from the end.
r   Nz(Setting `config.max_position_embeddings=z`...r'   )rB   rR   loggerinforn   rS   r   cloner   rN   r#   r   r0   r+   r   	Parametertorg   )r[   r  num_position_embeds_diffold_position_embeddings_weights       r/   resize_position_embeddings*DistilBertModel.resize_position_embeddingsR  si    $?AdAd#d  $q(>?Z>[[_`a.I+)-)L)L)S)S)Y)Y)[&.0ll4;;;^;^`d`k`k`o`o.p+;;++(kk99t{{TXTlTlTsTs +a/]_]i]i6^DOO77>>?YAY@YZ BD67P8PQBDOO77> ! 	++..t{{; !s   =A2G''
G5c                 .    U R                   R                  $ r   rn   rQ   r   s    r/   get_input_embeddings$DistilBertModel.get_input_embeddings|  s    ...r^   new_embeddingsc                 $    XR                   l        g r   r  r[   r  s     r/   set_input_embeddings$DistilBertModel.set_input_embeddings  s    *8'r^   Nrc   r~   r`   rH   r   .c                     USL USL-  (       a  [        S5      eU R                  XU5      n[        U R                  UUS9nU R                  " SUUS.UD6$ )  
input_ids (`torch.LongTensor` of shape `(batch_size, num_choices)`):
    Indices of input sequence tokens in the vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are input IDs?](../glossary#input-ids)
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, hidden_size)`, *optional*):
    Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
    is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
    model's internal embedding lookup matrix.
Nz:You must specify exactly one of input_ids or inputs_embeds)rB   r`   r~   )r   r~   r   )r   rn   r   rB   r   )r[   rc   r~   r`   rH   r   rn   s          r/   ro   DistilBertModel.forward  ss    0 -t";<YZZ__Y|L
2;;$)
  
$)
 
 	
r^   )rn   r   )NNNN)rq   rr   rs   rt   r   rM   r   rN   r  intr  r  r  r   r   r   r+   ru   r   r   r   r   ro   rw   rx   ry   s   @r/   r   r   A  s    / 3 3(<c (<T/bll /92<< 9   *..2-1,0$
<<$&$
 t+$
 ||d*	$

 llT)$
 +,$
 
5s!23	3$
    $
r^   r   zI
    DistilBert Model with a `masked language modeling` head on top.
    )custom_introc                     ^  \ rS rSrSS0rS\4U 4S jjrS\R                  4S jr	S\
4S	 jrS\R                  4S
 jrS\R                  4S jr\\     SS\R$                  S-  S\R$                  S-  S\R$                  S-  S\R&                  S-  S\R$                  S-  S\\   S\\\R$                  S4   -  4S jj5       5       rSrU =r$ )DistilBertForMaskedLMi  zvocab_projector.weightz,distilbert.embeddings.word_embeddings.weightrB   c                   > [         TU ]  U5        [        UR                  5      U l        [	        U5      U l        [        R                  " UR                  UR                  5      U l	        [        R                  " UR                  SS9U l        [        R                  " UR                  UR                  5      U l        U R                  5         [        R                  " 5       U l        g )NrE   rF   )rL   rM   r
   r   r   r   r   r   r#   vocab_transformrT   vocab_layer_normrO   vocab_projectorr   r   mlm_loss_fctrZ   s     r/   rM   DistilBertForMaskedLM.__init__  s     ():):;)&1!yyVZZ@ "VZZU C!yyV5F5FG 	//1r^   rd   c                 6    U R                   R                  5       $ r   r   r  r   s    r/   r  -DistilBertForMaskedLM.get_position_embeddings       6688r^   r  c                 :    U R                   R                  U5        gr  Nr   r  r[   r  s     r/   r  0DistilBertForMaskedLM.resize_position_embeddings       	223NOr^   c                     U R                   $ r   r"  r   s    r/   get_output_embeddings+DistilBertForMaskedLM.get_output_embeddings  s    ###r^   r  c                     Xl         g r   r0  r  s     r/   set_output_embeddings+DistilBertForMaskedLM.set_output_embeddings  s    -r^   Nrc   r~   r`   labelsrH   r   .c           	         U R                   " SUUUUSS.UD6nUS   nU R                  U5      n	U R                  U	5      n	U R                  U	5      n	U R	                  U	5      n	Sn
Ub@  U R                  U	R                  SU	R                  S5      5      UR                  S5      5      n
[        U
U	UR                  UR                  S9$ )a*  
input_ids (`torch.LongTensor` of shape `(batch_size, num_choices)`):
    Indices of input sequence tokens in the vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are input IDs?](../glossary#input-ids)
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, hidden_size)`, *optional*):
    Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
    is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
    model's internal embedding lookup matrix.
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
    config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
    loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
Trc   r~   r`   rH   return_dictr   NrJ   losslogitsr   r   r   )r   r   r   r!  r"  r#  r   rh   r   r   r   )r[   rc   r~   r`   r6  rH   r   dlbrt_outputr   prediction_logitsmlm_losss              r/   ro   DistilBertForMaskedLM.forward  s    8  
)'%
 
 %Q 00? OO,=> 112CD 001BC(():)?)?DUDZDZ[]D^)_agalalmoapqH$&44#..	
 	
r^   )r   r   r#  r!  r"  r   NNNNN)rq   rr   rs   rt   _tied_weights_keysr   rM   r   rN   r  r  r  r   r1  r4  r   r   r+   ru   rv   r   r   r   r   ro   rw   rx   ry   s   @r/   r  r    s    34bc2/ 29 9Pc P$ryy $.BII .  *..2-1*.,01
<<$&1
 t+1
 ||d*	1

   4'1
 llT)1
 +,1
 
%c 12	21
  1
r^   r  z
    DistilBert Model transformer with a sequence classification/regression head on top (a linear layer on top of the
    pooled output) e.g. for GLUE tasks.
    c                   T  ^  \ rS rSrS\4U 4S jjrS\R                  4S jrS\	4S jr
\\     SS	\R                  S-  S
\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\\   S\\\R                  S4   -  4S jj5       5       rSrU =r$ )#DistilBertForSequenceClassificationi  rB   c                   > [         TU ]  U5        UR                  U l        Xl        [	        U5      U l        [        R                  " UR                  UR                  5      U l	        [        R                  " UR                  UR                  5      U l
        [        R                  " UR                  5      U l        U R                  5         g r   )rL   rM   
num_labelsrB   r   r   r   r   r#   pre_classifier
classifierrU   seq_classif_dropoutrV   r   rZ   s     r/   rM   ,DistilBertForSequenceClassification.__init__  s      ++)&1 ii

FJJ?))FJJ0A0ABzz&"<"<= 	r^   rd   c                 6    U R                   R                  5       $ r   r&  r   s    r/   r  ;DistilBertForSequenceClassification.get_position_embeddings'  r(  r^   r  c                 :    U R                   R                  U5        gr*  r+  r,  s     r/   r  >DistilBertForSequenceClassification.resize_position_embeddings-  r.  r^   Nrc   r~   r`   r6  rH   r   .c           	      R   U R                   " SUUUUSS.UD6nUS   nUSS2S4   n	U R                  U	5      n	[        R                  " 5       " U	5      n	U R	                  U	5      n	U R                  U	5      n
SnUGb  U R                  R                  c  U R                  S:X  a  SU R                  l        OoU R                  S:  aN  UR                  [        R                  :X  d  UR                  [        R                  :X  a  SU R                  l        OSU R                  l        U R                  R                  S:X  aI  [        5       nU R                  S:X  a&  U" U
R                  5       UR                  5       5      nOU" X5      nOU R                  R                  S:X  a=  [        5       nU" U
R!                  S	U R                  5      UR!                  S	5      5      nO,U R                  R                  S:X  a  [#        5       nU" X5      n[%        UU
UR&                  UR(                  S
9$ )ae  
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
Tr8  r   Nr    
regressionsingle_label_classificationmulti_label_classificationrJ   r:  r   )r   rG  r   ReLUrV   rH  rB   problem_typerF  rf   r+   rj   r  r   squeezer   r   r   r   r   r   )r[   rc   r~   r`   r6  rH   r   distilbert_outputhidden_statepooled_outputr<  r;  loss_fcts                r/   ro   +DistilBertForSequenceClassification.forward;  s   " !OO 
)'%
 
 )+$QT*++M:	-0]3/{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#FNN$4fnn6FGD#F3D))-JJ+-B @&++b/R))-II,./'+99(33	
 	
r^   )rH  rB   r   rV   rF  rG  rA  )rq   rr   rs   rt   r   rM   r   rN   r  r  r  r   r   r+   ru   rv   r   r   r   r   ro   rw   rx   ry   s   @r/   rD  rD    s    / 9 9Pc P  *..2-1*.,0:
<<$&:
 t+:
 ||d*	:

   4':
 llT):
 +,:
 
"E%,,*;$<	<:
  :
r^   rD  c                   t  ^  \ rS rSrS\4U 4S jjrS\R                  4S jrS\	4S jr
\\      SS	\R                  S-  S
\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\\   S\\\R                  S4   -  4S jj5       5       rSrU =r$ )DistilBertForQuestionAnsweringiz  rB   c                 ^  > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  UR                  5      U l        UR                  S:w  a  [        SUR                   35      e[        R                  " UR                  5      U l        U R                  5         g )Nr2   z)config.num_labels should be 2, but it is )rL   rM   r   r   r   r   r#   rF  
qa_outputsr   rU   
qa_dropoutrV   r   rZ   s     r/   rM   'DistilBertForQuestionAnswering.__init__|  s     )&1))FJJ0A0AB!HIZIZH[\]]zz&"3"34 	r^   rd   c                 6    U R                   R                  5       $ r   r&  r   s    r/   r  6DistilBertForQuestionAnswering.get_position_embeddings  r(  r^   r  c                 :    U R                   R                  U5        gr*  r+  r,  s     r/   r  9DistilBertForQuestionAnswering.resize_position_embeddings  r.  r^   Nrc   r~   r`   start_positionsend_positionsrH   r   .c           	         U R                   " SUUUUSS.UD6nUS   n	U R                  U	5      n	U R                  U	5      n
U
R                  SSS9u  pUR	                  S5      R                  5       nUR	                  S5      R                  5       nSnUb  Ub  [        UR                  5       5      S:  a  UR	                  S5      n[        UR                  5       5      S:  a  UR	                  S5      nUR                  S5      nUR                  SU5      nUR                  SU5      n[        R                  " US9nU" X5      nU" X5      nUU-   S	-  n[        UUUUR                  UR                  S
9$ )r  Tr8  r   r    rJ   r   N)ignore_indexr2   )r;  start_logits
end_logitsr   r   r   )r   rV   r^  splitrU  r   lenrh   clampr   r   r   r   r   )r[   rc   r~   r`   re  rf  rH   r   rV  r   r<  ri  rj  
total_lossignored_indexrY  
start_lossend_losss                     r/   ro   &DistilBertForQuestionAnswering.forward  s   2 !OO 
)'%
 
 *!,]3/#)<<r<#: #++B/::<''+668

&=+D?'')*Q."1"9"9""==%%'(1, - 5 5b 9(--a0M-33A}EO)//=AM**FH!,@J
:H$x/14J+%!+99(33
 	
r^   )r   rV   r^  )NNNNNN)rq   rr   rs   rt   r   rM   r   rN   r  r  r  r   r   r+   ru   r   r   r   r   ro   rw   rx   ry   s   @r/   r\  r\  z  s    / 9 9Pc P  *..2-1/3-1,0>
<<$&>
 t+>
 ||d*	>

 ,>
 ||d*>
 llT)>
 +,>
 
&ellC.?(@	@>
  >
r^   r\  c                   T  ^  \ rS rSrS\4U 4S jjrS\R                  4S jrS\	4S jr
\\     SS	\R                  S-  S
\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\\   S\\\R                  S4   -  4S jj5       5       rSrU =r$ ) DistilBertForTokenClassificationi  rB   c                 0  > [         TU ]  U5        UR                  U l        [        U5      U l        [
        R                  " UR                  5      U l        [
        R                  " UR                  UR                  5      U l
        U R                  5         g r   )rL   rM   rF  r   r   r   rU   rV   r   hidden_sizerH  r   rZ   s     r/   rM   )DistilBertForTokenClassification.__init__  sg      ++)&1zz&..1))F$6$68I8IJ 	r^   rd   c                 6    U R                   R                  5       $ r   r&  r   s    r/   r  8DistilBertForTokenClassification.get_position_embeddings  r(  r^   r  c                 :    U R                   R                  U5        gr*  r+  r,  s     r/   r  ;DistilBertForTokenClassification.resize_position_embeddings  r.  r^   Nrc   r~   r`   r6  rH   r   .c                 B   U R                   " U4UUUSS.UD6nUS   nU R                  U5      nU R                  U5      n	Sn
Ub<  [        5       nU" U	R	                  SU R
                  5      UR	                  S5      5      n
[        U
U	UR                  UR                  S9$ )z
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
Tr~   r`   rH   r9  r   NrJ   r:  )	r   rV   rH  r   r   rF  r   r   r   )r[   rc   r~   r`   r6  rH   r   outputssequence_outputr<  r;  rY  s               r/   ro   (DistilBertForTokenClassification.forward  s     //
)'%
 
 "!*,,71')HFKKDOO<fkk"oND$!//))	
 	
r^   )rH  r   rV   rF  rA  )rq   rr   rs   rt   r   rM   r   rN   r  r  r  r   r   r+   ru   rv   r   r   r   r   ro   rw   rx   ry   s   @r/   rt  rt    s    	/ 	9 9Pc P  *..2-1*.,0%
<<$&%
 t+%
 ||d*	%

   4'%
 llT)%
 +,%
 
u||S'8!9	9%
  %
r^   rt  c                   T  ^  \ rS rSrS\4U 4S jjrS\R                  4S jrS\	4S jr
\\     SS	\R                  S-  S
\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\\   S\\\R                  S4   -  4S jj5       5       rSrU =r$ )DistilBertForMultipleChoicei+  rB   c                 Z  > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  UR                  5      U l        [        R
                  " UR                  S5      U l        [        R                  " UR                  5      U l        U R                  5         g )Nr    )rL   rM   r   r   r   r   r#   rG  rH  rU   rI  rV   r   rZ   s     r/   rM   $DistilBertForMultipleChoice.__init__-  so     )&1 ii

FJJ?))FJJ2zz&"<"<= 	r^   rd   c                 6    U R                   R                  5       $ r   r&  r   s    r/   r  3DistilBertForMultipleChoice.get_position_embeddings8  r(  r^   r  c                 :    U R                   R                  U5        g)a  
Resizes position embeddings of the model if `new_num_position_embeddings != config.max_position_embeddings`.

Arguments:
    new_num_position_embeddings (`int`)
        The number of new position embeddings. If position embeddings are learned, increasing the size will add
        newly initialized vectors at the end, whereas reducing the size will remove vectors from the end. If
        position embeddings are not learned (*e.g.* sinusoidal position embeddings), increasing the size will
        add correct vectors at the end following the position encoding algorithm, whereas reducing the size
        will remove vectors from the end.
Nr+  r,  s     r/   r  6DistilBertForMultipleChoice.resize_position_embeddings>  r.  r^   Nrc   r~   r`   r6  rH   r   .c                    Ub  UR                   S   OUR                   S   nUb!  UR                  SUR                  S5      5      OSnUb!  UR                  SUR                  S5      5      OSnUb1  UR                  SUR                  S5      UR                  S5      5      OSnU R                  " U4UUUSS.UD6nUS   n	U	SS2S4   n
U R	                  U
5      n
[
        R                  " 5       " U
5      n
U R                  U
5      n
U R                  U
5      nUR                  SU5      nSnUb  [        5       nU" X5      n[        UUUR                  UR                  S9$ )	av  
input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
    Indices of input sequence tokens in the vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are input IDs?](../glossary#input-ids)
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
    Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
    is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
    model's internal embedding lookup matrix.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
    num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
    `input_ids` above)

Examples:

```python
>>> from transformers import AutoTokenizer, DistilBertForMultipleChoice
>>> import torch

>>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased")
>>> model = DistilBertForMultipleChoice.from_pretrained("distilbert-base-cased")

>>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
>>> choice0 = "It is eaten with a fork and a knife."
>>> choice1 = "It is eaten while held in the hand."
>>> labels = torch.tensor(0).unsqueeze(0)  # choice0 is correct (according to Wikipedia ;)), batch size 1

>>> encoding = tokenizer([[prompt, choice0], [prompt, choice1]], return_tensors="pt", padding=True)
>>> outputs = model(**{k: v.unsqueeze(0) for k, v in encoding.items()}, labels=labels)  # batch size is 1

>>> # the linear classifier still needs to be trained
>>> loss = outputs.loss
>>> logits = outputs.logits
```Nr    rJ   Tr}  r   r:  )r   r   rh   r   rG  r   rS  rV   rH  r   r   r   r   )r[   rc   r~   r`   r6  rH   r   num_choicesr~  rW  rX  r<  reshaped_logitsr;  rY  s                  r/   ro   #DistilBertForMultipleChoice.forwardL  s{   b -6,Aiooa(}GZGZ[\G]>G>SINN2y~~b'9:Y]	M[Mg,,R1D1DR1HImq ( r=#5#5b#9=;M;Mb;QR 	 //
)'%
 
 qz$QT*++M:	-0]3/ ++b+6')HO4D("!//))	
 	
r^   )rH  r   rV   rG  rA  )rq   rr   rs   rt   r   rM   r   rN   r  r  r  r   r   r+   ru   rv   r   r   r   r   ro   rw   rx   ry   s   @r/   r  r  +  s    	/ 	9 9Pc P  *..2-1*.,0U
<<$&U
 t+U
 ||d*	U

   4'U
 llT)U
 +,U
 
#U5<<+<%=	=U
  U
r^   r  )r  r  r\  rD  rt  r   r   )Nr   )J__doc__collections.abcr   numpyr3   r+   r   torch.nnr   r   r    r	   r   activationsr
   configuration_utilsr   integrations.deepspeedr   masking_utilsr   modeling_layersr   modeling_outputsr   r   r   r   r   r   modeling_utilsr   r   processing_utilsr   pytorch_utilsr   utilsr   r   r   utils.deprecationr   utils.genericr   r   utils.output_capturingr   configuration_distilbertr!   
get_loggerrq   r  r  ru   r0   r.   r   r@   floatr   r   r   r   r   r   r   r  rD  r\  rt  r  __all__r   r^   r/   <module>r     sy  
 %    A A & ) 3 @ 6 9  G & 
 1 I 5 6 
		H	%L L# LELL L 3 U\\ ' 'b !%II%<<% 
% <<	%
 LL4'% T\% % '(%85)bii 5)p")) * 1  F@")) @0 i i i< g
/ g
 g
T 
_
5 _

_
D ^
*C ^
^
B b
%> b
 b
J G
'@ G
 G
T w
"; w
 w
tr^   