
    Z j]{                        S r SSKrSSKJr  SSKJrJrJr  SSKJr	  SSK
Jr  SSKJr  SSKJrJrJrJrJrJrJr  SS	KJr  SS
KJr  SSKJrJrJr  SSKJr  SSK J!r!J"r"J#r#J$r$J%r%  SSK&J'r'  \RP                  " \)5      r* " S S\"5      r+ " S S\%5      r, " S S\!5      r- " S S\#5      r.\ " S S\5      5       r/ " S S\$5      r0\" SS9 " S S \/\5      5       r1\ " S! S"\/5      5       r2 " S# S$\Rf                  5      r4\" S%S9 " S& S'\/5      5       r5\ " S( S)\/5      5       r6\ " S* S+\/5      5       r7 " S, S-\Rf                  5      r8\ " S. S/\/5      5       r9/ S0Qr:g)1zPyTorch RoBERTa model.    N)BCEWithLogitsLossCrossEntropyLossMSELoss   )initialization)gelu)GenerationMixin),BaseModelOutputWithPoolingAndCrossAttentions!CausalLMOutputWithCrossAttentionsMaskedLMOutputMultipleChoiceModelOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel)Unpack)TransformersKwargsauto_docstringlogging)can_return_tuple   )BertCrossAttentionBertEmbeddings	BertLayer	BertModelBertSelfAttention   )RobertaConfigc                      ^  \ rS rSrU 4S jr     SS\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\4
S	 jjr	\
S
 5       r\
SS j5       rSrU =r$ )RobertaEmbeddings,   c                    > [         TU ]  U5        U ?U ?UR                  U l        [
        R                  " UR                  UR                  U R                  S9U l        g )N)padding_idx)	super__init__pad_token_idposition_embeddingsr#   nn	Embeddingmax_position_embeddingshidden_sizeselfconfig	__class__s     |/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/roberta/modular_roberta.pyr%   RobertaEmbeddings.__init__-   sT     $!..#%<<**F,>,>DL\L\$
     N	input_idstoken_type_idsposition_idsinputs_embedspast_key_values_lengthc                    Uc;  Ub  U R                  XR                  U5      nOU R                  X@R                  5      nUb  UR                  5       nOUR                  5       S S nUu  pxUc  [	        U S5      (       aQ  U R
                  R                  UR                  S   S5      n	[        R                  " U	SUS9n	U	R                  Xx5      nO8[        R                  " U[        R                  U R                  R                  S9nUc  U R                  U5      nU R                  U5      n
XJ-   nU R!                  U5      nX-   nU R#                  U5      nU R%                  U5      nU$ )Nr4   r   r   )dimindexdtypedevice)"create_position_ids_from_input_idsr#   &create_position_ids_from_inputs_embedssizehasattrr4   expandshapetorchgatherzeroslongr5   r>   word_embeddingstoken_type_embeddingsr'   	LayerNormdropout)r-   r3   r4   r5   r6   r7   input_shape
batch_size
seq_lengthbuffered_token_type_idsrJ   
embeddingsr'   s                r0   forwardRobertaEmbeddings.forward8   sb    $#FF//1G   $JJ=ZjZjk #..*K',,.s3K!,

 !t-..*.*=*=*D*D\EWEWXYEZ\^*_'*/,,7NTU]i*j'!8!?!?
!W!&[

SWSdSdSkSk!l  00;M $ : :> J":
"66|D5
^^J/
\\*-

    @staticmethod
    def create_position_ids_from_inputs_embeds(inputs_embeds, padding_idx):
        """
        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

        Args:
            inputs_embeds: torch.Tensor

        Returns: torch.Tensor
        """
        input_shape = inputs_embeds.size()[:-1]
        sequence_length = input_shape[1]

        position_ids = torch.arange(
            padding_idx + 1, sequence_length + padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
        )
        return position_ids.unsqueeze(0).expand(input_shape)
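
    # Sketch of the difference between the two helpers (added commentary): the variant above sees
    # only embeddings, so it cannot detect padding and emits the sequential ids
    # [padding_idx + 1, ..., padding_idx + seq_length]. The variant below recovers padding from the
    # token ids via a masked cumulative sum: for mask [1, 1, 1, 0] the cumsum is [1, 2, 3, 3],
    # re-masking gives [1, 2, 3, 0], and adding padding_idx yields the final position ids.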

    @staticmethod
    def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
        """
        Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding
        symbols are ignored. This is modified from fairseq's `utils.make_positions`.

        Args:
            input_ids: torch.Tensor
            padding_idx: int
            past_key_values_length: int

        Returns: torch.Tensor
        """
        mask = input_ids.ne(padding_idx).int()
        incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
        return incremental_indices.long() + padding_idx


class RobertaSelfAttention(BertSelfAttention):
    pass


class RobertaCrossAttention(BertCrossAttention):
    pass


class RobertaLayer(BertLayer):
    pass


@auto_docstring
class RobertaPreTrainedModel(PreTrainedModel):
    config_class = RobertaConfig
    base_model_prefix = "roberta"
    supports_gradient_checkpointing = True
    _supports_flash_attn = True
    _supports_sdpa = True
    _supports_flex_attn = True
    _supports_attention_backend = True
    _can_record_outputs = {
        "hidden_states": RobertaLayer,
        "attentions": RobertaSelfAttention,
        "cross_attentions": RobertaCrossAttention,
    }

    @torch.no_grad()
    def _init_weights(self, module):
        """Initialize the weights"""
        super()._init_weights(module)
        if isinstance(module, RobertaLMHead):
            init.zeros_(module.bias)
        elif isinstance(module, RobertaEmbeddings):
            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
            init.zeros_(module.token_type_ids)


class RobertaModel(BertModel):
    def __init__(self, config, add_pooling_layer=True):
        super().__init__(config, add_pooling_layer)

@auto_docstring(
    custom_intro="""
    RoBERTa Model with a `language modeling` head on top for CLM fine-tuning.
    """
)
class RobertaForCausalLM(RobertaPreTrainedModel, GenerationMixin):
    _tied_weights_keys = {
        "lm_head.decoder.weight": "roberta.embeddings.word_embeddings.weight",
        "lm_head.decoder.bias": "lm_head.bias",
    }

    def __init__(self, config):
        super().__init__(config)

        if not config.is_decoder:
            logger.warning("If you want to use `RobertaLMHeadModel` as a standalone, add `is_decoder=True.`")

        self.roberta = RobertaModel(config, add_pooling_layer=False)
        self.lm_head = RobertaLMHead(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self):
        return self.lm_head.decoder

    def set_output_embeddings(self, new_embeddings):
        self.lm_head.decoder = new_embeddings

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        attention_mask: torch.FloatTensor | None = None,
        token_type_ids: torch.LongTensor | None = None,
        position_ids: torch.LongTensor | None = None,
        inputs_embeds: torch.FloatTensor | None = None,
        encoder_hidden_states: torch.FloatTensor | None = None,
        encoder_attention_mask: torch.FloatTensor | None = None,
        labels: torch.LongTensor | None = None,
        past_key_values: tuple[tuple[torch.FloatTensor]] | None = None,
        use_cache: bool | None = None,
        logits_to_keep: int | torch.Tensor = 0,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple[torch.Tensor] | CausalLMOutputWithCrossAttentions:
        r"""
        token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.
            This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value
            >= 2. All the value in this tensor should be always < type_vocab_size.

            [What are token type IDs?](../glossary#token-type-ids)
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
            `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
            ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`

        Example:

        ```python
        >>> from transformers import AutoTokenizer, RobertaForCausalLM, AutoConfig
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base")
        >>> config = AutoConfig.from_pretrained("FacebookAI/roberta-base")
        >>> config.is_decoder = True
        >>> model = RobertaForCausalLM.from_pretrained("FacebookAI/roberta-base", config=config)

        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> prediction_logits = outputs.logits
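
        >>> # prediction_logits has shape (batch_size, sequence_length, config.vocab_size)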
        ```
        """
        if labels is not None:
            use_cache = False

        outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            return_dict=True,
            **kwargs,
        )

        sequence_output = outputs.last_hidden_state
        # Only compute logits for the requested trailing positions (an int selects the last
        # `logits_to_keep` positions; a tensor is used directly as an index).
        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        prediction_scores = self.lm_head(sequence_output[:, slice_indices, :])

        lm_loss = None
        if labels is not None:
            lm_loss = self.loss_function(
                logits=prediction_scores, labels=labels, vocab_size=self.config.vocab_size, **kwargs
            )

        return CausalLMOutputWithCrossAttentions(
            loss=lm_loss,
            logits=prediction_scores,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            cross_attentions=outputs.cross_attentions,
        )


@auto_docstring
class RobertaForMaskedLM(RobertaPreTrainedModel):
    _tied_weights_keys = {
        "lm_head.decoder.weight": "roberta.embeddings.word_embeddings.weight",
        "lm_head.decoder.bias": "lm_head.bias",
    }

    def __init__(self, config):
        super().__init__(config)

        if config.is_decoder:
            logger.warning(
                "If you want to use `RobertaForMaskedLM` make sure `config.is_decoder=False` for "
                "bi-directional self-attention."
            )

        self.roberta = RobertaModel(config, add_pooling_layer=False)
        self.lm_head = RobertaLMHead(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self):
        return self.lm_head.decoder

    def set_output_embeddings(self, new_embeddings):
        self.lm_head.decoder = new_embeddings

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        attention_mask: torch.FloatTensor | None = None,
        token_type_ids: torch.LongTensor | None = None,
        position_ids: torch.LongTensor | None = None,
        inputs_embeds: torch.FloatTensor | None = None,
        encoder_hidden_states: torch.FloatTensor | None = None,
        encoder_attention_mask: torch.FloatTensor | None = None,
        labels: torch.LongTensor | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple[torch.Tensor] | MaskedLMOutput:
        r"""
        token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.
            This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value
            >= 2. All the value in this tensor should be always < type_vocab_size.

            [What are token type IDs?](../glossary#token-type-ids)
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        """
        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            return_dict=True,
            **kwargs,
        )
        sequence_output = outputs[0]
        prediction_scores = self.lm_head(sequence_output)

        masked_lm_loss = None
        if labels is not None:
            # move labels to the logits device to support model parallelism
            labels = labels.to(prediction_scores.device)
            loss_fct = CrossEntropyLoss()
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))

        return MaskedLMOutput(
            loss=masked_lm_loss,
            logits=prediction_scores,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


class RobertaLMHead(nn.Module):
    """Roberta Head for masked language modeling."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

        self.decoder = nn.Linear(config.hidden_size, config.vocab_size)
        self.bias = nn.Parameter(torch.zeros(config.vocab_size))

    def forward(self, features, **kwargs):
        x = self.dense(features)
        x = gelu(x)
        x = self.layer_norm(x)

        # project back to the size of the vocabulary
        x = self.decoder(x)

        return x

@auto_docstring(
    custom_intro="""
    RoBERTa Model transformer with a sequence classification/regression head on top (a linear layer on top of the
    pooled output) e.g. for GLUE tasks.
    """
)
class RobertaForSequenceClassification(RobertaPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.config = config

        self.roberta = RobertaModel(config, add_pooling_layer=False)
        self.classifier = RobertaClassificationHead(config)

        # Initialize weights and apply final processing
        self.post_init()

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        attention_mask: torch.FloatTensor | None = None,
        token_type_ids: torch.LongTensor | None = None,
        position_ids: torch.LongTensor | None = None,
        inputs_embeds: torch.FloatTensor | None = None,
        labels: torch.LongTensor | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple[torch.Tensor] | SequenceClassifierOutput:
        r"""
        token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.
            This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value
            >= 2. All the value in this tensor should be always < type_vocab_size.

            [What are token type IDs?](../glossary#token-type-ids)
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            return_dict=True,
            **kwargs,
        )
        sequence_output = outputs[0]
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            # move labels to the logits device to support model parallelism
            labels = labels.to(logits.device)
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@auto_docstring
class RobertaForMultipleChoice(RobertaPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.roberta = RobertaModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, 1)

        # Initialize weights and apply final processing
        self.post_init()

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        token_type_ids: torch.LongTensor | None = None,
        attention_mask: torch.FloatTensor | None = None,
        labels: torch.LongTensor | None = None,
        position_ids: torch.LongTensor | None = None,
        inputs_embeds: torch.FloatTensor | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple[torch.Tensor] | MultipleChoiceModelOutput:
        r"""
        input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.
            This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value
            >= 2. All the value in this tensor should be always < type_vocab_size.

            [What are token type IDs?](../glossary#token-type-ids)
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        """
        num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]

        # Flatten the choice dimension so the encoder sees (batch_size * num_choices, seq_length).
        flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
        flat_position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
        flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
        flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
        flat_inputs_embeds = (
            inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
            if inputs_embeds is not None
            else None
        )

        outputs = self.roberta(
            flat_input_ids,
            position_ids=flat_position_ids,
            token_type_ids=flat_token_type_ids,
            attention_mask=flat_attention_mask,
            inputs_embeds=flat_inputs_embeds,
            return_dict=True,
            **kwargs,
        )
        pooled_output = outputs[1]

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        reshaped_logits = logits.view(-1, num_choices)

        loss = None
        if labels is not None:
            # move labels to the logits device to support model parallelism
            labels = labels.to(reshaped_logits.device)
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(reshaped_logits, labels)

        return MultipleChoiceModelOutput(
            loss=loss,
            logits=reshaped_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@auto_docstring
class RobertaForTokenClassification(RobertaPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.roberta = RobertaModel(config, add_pooling_layer=False)
        classifier_dropout = (
            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
        )
        self.dropout = nn.Dropout(classifier_dropout)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        attention_mask: torch.FloatTensor | None = None,
        token_type_ids: torch.LongTensor | None = None,
        position_ids: torch.LongTensor | None = None,
        inputs_embeds: torch.FloatTensor | None = None,
        labels: torch.LongTensor | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple[torch.Tensor] | TokenClassifierOutput:
        r"""
        token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.
            This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value
            >= 2. All the value in this tensor should be always < type_vocab_size.

            [What are token type IDs?](../glossary#token-type-ids)
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        """
        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            return_dict=True,
            **kwargs,
        )
        sequence_output = outputs[0]

        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            # move labels to the logits device to support model parallelism
            labels = labels.to(logits.device)
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


class RobertaClassificationHead(nn.Module):
    """Head for sentence-level classification tasks."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        classifier_dropout = (
            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
        )
        self.dropout = nn.Dropout(classifier_dropout)
        self.out_proj = nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, features, **kwargs):
        x = features[:, 0, :]  # take the <s> token (equivalent to [CLS])
        x = self.dropout(x)
        x = self.dense(x)
        x = torch.tanh(x)
        x = self.dropout(x)
        x = self.out_proj(x)
        return x


@auto_docstring
class RobertaForQuestionAnswering(RobertaPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.roberta = RobertaModel(config, add_pooling_layer=False)
        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        attention_mask: torch.FloatTensor | None = None,
        token_type_ids: torch.LongTensor | None = None,
        position_ids: torch.LongTensor | None = None,
        inputs_embeds: torch.FloatTensor | None = None,
        start_positions: torch.LongTensor | None = None,
        end_positions: torch.LongTensor | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple[torch.Tensor] | QuestionAnsweringModelOutput:
        r"""
        token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.
            This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value
            >= 2. All the value in this tensor should be always < type_vocab_size.

            [What are token type IDs?](../glossary#token-type-ids)
        """
        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            return_dict=True,
            **kwargs,
        )
        sequence_output = outputs[0]

        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1).contiguous()
        end_logits = end_logits.squeeze(-1).contiguous()

        total_loss = None
        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, split adds an extra dimension
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
            # sometimes the start/end positions are outside our model inputs, we ignore these terms
            ignored_index = start_logits.size(1)
            start_positions = start_positions.clamp(0, ignored_index)
            end_positions = end_positions.clamp(0, ignored_index)

            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2

        return QuestionAnsweringModelOutput(
            loss=total_loss,
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


__all__ = [
    "RobertaForCausalLM",
    "RobertaForMaskedLM",
    "RobertaForMultipleChoice",
    "RobertaForQuestionAnswering",
    "RobertaForSequenceClassification",
    "RobertaForTokenClassification",
    "RobertaModel",
    "RobertaPreTrainedModel",
]