
    Z j                        S SK Jr  S SKrS SKJr  S SKJrJrJr  SSKJ	r
  SSKJrJr  SSKJrJrJr  SSKJr  SS	KJrJr  SS
KJr  SSKJrJrJrJrJrJrJ r J!r!  SSK"J#r#J$r$  SSK%J&r&  SSK'J(r(  SSK)J*r*J+r+J,r,  SSK-J.r.J/r/  SSK0J1r1  SSK2J3r3  \,Rh                  " \55      r6 " S S\Rn                  5      r8  SJS\Rn                  S\Rr                  S\Rr                  S\Rr                  S\Rr                  S-  S\:S-  S\:S\&\*   4S jjr; " S S \Rn                  5      r< " S! S"\Rn                  5      r= " S# S$\Rn                  5      r> " S% S&\Rn                  5      r? " S' S(\Rn                  5      r@ " S) S*\Rn                  5      rA " S+ S,\5      rB\+ " S- S.\$5      5       rC " S/ S0\Rn                  5      rD " S1 S2\Rn                  5      rE\+" S3S49 " S5 S6\C5      5       rF\+" S7S49 " S8 S9\C\5      5       rG\+ " S: S;\C5      5       rH " S< S=\Rn                  5      rI\+" S>S49 " S? S@\C5      5       rJ\+ " SA SB\C5      5       rK\+ " SC SD\C5      5       rL " SE SF\Rn                  5      rM\+ " SG SH\C5      5       rN/ SIQrOg)K    )CallableN)BCEWithLogitsLossCrossEntropyLossMSELoss   )initialization)ACT2FNgelu)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)create_bidirectional_maskcreate_causal_mask)GradientCheckpointingLayer))BaseModelOutputWithPastAndCrossAttentions,BaseModelOutputWithPoolingAndCrossAttentions!CausalLMOutputWithCrossAttentionsMaskedLMOutputMultipleChoiceModelOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)apply_chunking_to_forward)TransformersKwargsauto_docstringlogging)can_return_tuplemerge_with_config_defaults)capture_outputs   )RobertaConfigc                      ^  \ rS rSrSrU 4S jr     SS\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S	\	S
\R                  4S jjr\S 5       r\SS j5       rSrU =r$ )RobertaEmbeddings8   zGConstruct the embeddings from word, position and token_type embeddings.c                 >  > [         TU ]  5         [        R                  " UR                  UR
                  UR                  S9U l        [        R                  " UR                  UR
                  5      U l	        [        R                  " UR
                  UR                  S9U l
        [        R                  " UR                  5      U l        U R                  S[         R"                  " UR$                  5      R'                  S5      SS9  U R                  S[         R(                  " U R*                  R-                  5       [         R.                  S9SS9  UR                  U l        [        R                  " UR$                  UR
                  U R0                  S9U l        g )	N)padding_idxepsposition_idsr$   F)
persistenttoken_type_ids)dtype)super__init__nn	Embedding
vocab_sizehidden_sizepad_token_idword_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutregister_buffertorcharangemax_position_embeddingsexpandzerosr-   sizelongr*   position_embeddingsselfconfig	__class__s     }/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/roberta/modeling_roberta.pyr4   RobertaEmbeddings.__init__;   s4   !||F,=,=v?Q?Q_e_r_rs%'\\&2H2H&J\J\%]"f&8&8f>S>STzz&"<"<=ELL)G)GHOOPWXej 	 	
 	ekk$*;*;*@*@*B%**Ubg 	 	
 "..#%<<**F,>,>DL\L\$
     N	input_idsr1   r-   inputs_embedspast_key_values_lengthreturnc                    Uc;  Ub  U R                  XR                  U5      nOU R                  X@R                  5      nUb  UR                  5       nOUR                  5       S S nUu  pxUc  [	        U S5      (       aQ  U R
                  R                  UR                  S   S5      n	[        R                  " U	SUS9n	U	R                  Xx5      nO8[        R                  " U[        R                  U R                  R                  S9nUc  U R                  U5      nU R                  U5      n
XJ-   nU R!                  U5      nX-   nU R#                  U5      nU R%                  U5      nU$ )Nr/   r1   r   r$   )dimindexr2   device)"create_position_ids_from_input_idsr*   &create_position_ids_from_inputs_embedsrH   hasattrr1   rF   shaperC   gatherrG   rI   r-   rZ   r:   r<   rJ   r=   rA   )rL   rR   r1   r-   rS   rT   input_shape
batch_size
seq_lengthbuffered_token_type_idsr<   
embeddingsrJ   s                rO   forwardRobertaEmbeddings.forwardO   sb    $#FF//1G   $JJ=ZjZjk #..*K',,.s3K!,

 !t-..*.*=*=*D*D\EWEWXYEZ\^*_'*/,,7NTU]i*j'!8!?!?
!W!&[

SWSdSdSkSk!l  00;M $ : :> J":
"66|D5
^^J/
\\*-
rQ   c                     U R                  5       SS nUS   n[        R                  " US-   X1-   S-   [        R                  U R                  S9nUR                  S5      R                  U5      $ )z
We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

Args:
    inputs_embeds: torch.Tensor

Returns: torch.Tensor
Nr/   r$   rY   r   )rH   rC   rD   rI   rZ   	unsqueezerF   )rS   r*   r`   sequence_lengthr-   s        rO   r\   8RobertaEmbeddings.create_position_ids_from_inputs_embeds   sn     $((*3B/%a.||!O_:Q>ejjYfYmYm
 %%a(//<<rQ   c                     U R                  U5      R                  5       n[        R                  " USS9R	                  U5      U-   U-  nUR                  5       U-   $ )z
Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
are ignored. This is modified from fairseq's `utils.make_positions`.

Args:
    x: torch.Tensor x:

Returns: torch.Tensor
r$   rW   )neintrC   cumsumtype_asrI   )rR   r*   rT   maskincremental_indicess        rO   r[   4RobertaEmbeddings.create_position_ids_from_input_ids   sW     ||K(,,.$||Da8@@FI__cgg"'')K77rQ   )r=   rA   r*   rJ   r<   r:   )NNNNr   )r   )__name__
__module____qualname____firstlineno____doc__r4   rC   
LongTensorFloatTensorrn   Tensorre   staticmethodr\   r[   __static_attributes____classcell__rN   s   @rO   r'   r'   8   s    Q
, .2260426&'.##d*. ((4/. &&-	.
 ((4/. !$. 
.` = =" 8 8rQ   r'   modulequerykeyvalueattention_maskscalingrA   kwargsc                    Uc  UR                  S5      S-  n[        R                  " XR                  SS5      5      U-  nUb  X-   n[        R
                  R                  USS9n[        R
                  R                  XU R                  S9n[        R                  " X5      n	U	R                  SS5      R                  5       n	X4$ )Nr/            r   rl   )ptrainingr$   )
rH   rC   matmul	transposer5   
functionalsoftmaxrA   r   
contiguous)
r   r   r   r   r   r   rA   r   attn_weightsattn_outputs
             rO   eager_attention_forwardr      s     **R.D( <<}}Q':;gEL!#4==((2(>L==((6??([L,,|3K''1-88:K$$rQ   c                      ^  \ rS rSrSU 4S jjr  SS\R                  S\R                  S-  S\S-  S\	\
   S\\R                     4
S	 jjrS
rU =r$ )RobertaSelfAttention   Nc                 N  > [         TU ]  5         UR                  UR                  -  S:w  a7  [	        US5      (       d&  [        SUR                   SUR                   S35      eXl        UR                  U l        [        UR                  UR                  -  5      U l        U R                  U R                  -  U l	        U R                  S-  U l
        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  U R                  5      U l        [        R                   " UR"                  5      U l        UR&                  U l        X l        X0l        g Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ()r   )r3   r4   r8   num_attention_headsr]   
ValueErrorrM   rn   attention_head_sizeall_head_sizer   r5   Linearr   r   r   r?   attention_probs_dropout_probrA   
is_decoder	is_causal	layer_idxrL   rM   r   r   rN   s       rO   r4   RobertaSelfAttention.__init__   sM    : ::a?PVXhHiHi#F$6$6#7 8 445Q8  #)#=#= #&v'9'9F<V<V'V#W !558P8PP//5YYv1143E3EF
99V//1C1CDYYv1143E3EF
zz&"E"EF ++""rQ   hidden_statesr   past_key_valuesr   rU   c                    UR                   S S n/ UQSPU R                  P7nU R                  U5      R                  " U6 R	                  SS5      nU R                  U5      R                  " U6 R	                  SS5      nU R                  U5      R                  " U6 R	                  SS5      n	UbA  Un
[        U[        5      (       a  UR                  n
U
R                  XU R                  5      u  p[        R                  " U R                  R                  [         5      nU" U UUU	U4U R"                  (       d  SOU R$                  R&                  U R(                  S.UD6u  pUR*                  " / UQSP76 R-                  5       nX4$ )Nr/   r$   r           rA   r   )r^   r   r   viewr   r   r   
isinstancer   self_attention_cacheupdater   r   get_interfacerM   _attn_implementationr   r   rA   r   r   reshaper   )rL   r   r   r   r   r`   hidden_shapequery_layer	key_layervalue_layercurrent_past_key_valuesattention_interfacer   r   s                 rO   re   RobertaSelfAttention.forward   s    $))#2.CCbC$*B*BC jj/44lCMMaQRSHH]+00,?II!QO	jj/44lCMMaQRS&&5#/+>??*9*N*N' &=%C%CI\`\j\j%k"I(?(M(MKK,,.E)
 %8	%
  $}}C$,,..LL	%
 	%
! "));;;;FFH((rQ   )r   r   rM   rA   r   r   r   r   r   r   r   r   FN)NNrt   ru   rv   rw   r4   rC   r{   rz   r   r   r   tuplere   r}   r~   r   s   @rO   r   r      sl    #6 48(,	')||') ))D0') 	')
 +,') 
u||	') ')rQ   r   c                      ^  \ rS rSrSU 4S jjr   SS\R                  S\R                  S-  S\R                  S-  S\S-  S\	\
   S	\\R                     4S
 jjrSrU =r$ )RobertaCrossAttentioni  Nc                 ,  > [         TU ]  5         UR                  UR                  -  S:w  a7  [	        US5      (       d&  [        SUR                   SUR                   S35      eXl        UR                  U l        [        UR                  UR                  -  5      U l        U R                  U R                  -  U l	        U R                  S-  U l
        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  U R                  5      U l        [        R                   " UR"                  5      U l        X l        X0l        g r   )r3   r4   r8   r   r]   r   rM   rn   r   r   r   r5   r   r   r   r   r?   r   rA   r   r   r   s       rO   r4   RobertaCrossAttention.__init__  s@    : ::a?PVXhHiHi#F$6$6#7 8 445Q8  #)#=#= #&v'9'9F<V<V'V#W !558P8PP//5YYv1143E3EF
99V//1C1CDYYv1143E3EF
zz&"E"EF""rQ   r   encoder_hidden_statesr   r   r   rU   c                 z   UR                   S S n/ UQSPU R                  P7nU R                  U5      R                  U5      R	                  SS5      nUb%  UR
                  R                  U R                  5      OSn	Ubb  U	(       a[  UR                  R                  U R                     R                  n
UR                  R                  U R                     R                  nO/ UR                   S S QSPU R                  P7nU R                  U5      R                  U5      R	                  SS5      n
U R                  U5      R                  U5      R	                  SS5      nUbA  UR                  R                  XU R                  5      u  pSUR
                  U R                  '   [        R                   " U R"                  R$                  [&        5      nU" U UU
UU4U R(                  (       d  SOU R*                  R,                  U R.                  S.UD6u  pUR0                  " / UQSP76 R3                  5       nX4$ )Nr/   r$   r   FTr   r   )r^   r   r   r   r   
is_updatedgetr   cross_attention_cachelayerskeysvaluesr   r   r   r   r   rM   r   r   r   rA   r   r   r   r   )rL   r   r   r   r   r   r`   r   r   r   r   r   kv_shaper   r   r   s                   rO   re   RobertaCrossAttention.forward  s    $))#2.CCbC$*B*BC jj/44\BLLQPQRGVGb_//33DNNChm
&:'==DDT^^TYYI)??FFt~~V]]KX.44Sb9X2Xt?W?WXH!67<<XFPPQRTUVI**%:;@@JTTUVXYZK*)8)N)N)U)UDNN*&	 >B**4>>:(?(M(MKK,,.E)
 %8	%
  $}}C$,,..LL	%
 	%
! "));;;;FFH((rQ   )r   r   rM   rA   r   r   r   r   r   r   r   r   )NNN)rt   ru   rv   rw   r4   rC   r{   rz   r   r   r   r   re   r}   r~   r   s   @rO   r   r     s    #4 ;?376:1)||1)  %00471) ))D0	1)
 -t31) +,1) 
u||	1) 1)rQ   r   c                   z   ^  \ rS rSrU 4S jrS\R                  S\R                  S\R                  4S jrSrU =r	$ )RobertaSelfOutputiM  c                 (  > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " UR                  UR                  S9U l        [        R                  " UR                  5      U l
        g Nr+   )r3   r4   r5   r   r8   denser=   r>   r?   r@   rA   rK   s     rO   r4   RobertaSelfOutput.__init__N  s`    YYv1163E3EF
f&8&8f>S>STzz&"<"<=rQ   r   input_tensorrU   c                 p    U R                  U5      nU R                  U5      nU R                  X-   5      nU$ Nr   rA   r=   rL   r   r   s      rO   re   RobertaSelfOutput.forwardT  5    

=1]3}'CDrQ   r=   r   rA   
rt   ru   rv   rw   r4   rC   r{   re   r}   r~   r   s   @rO   r   r   M  6    >U\\  RWR^R^  rQ   r   c                      ^  \ rS rSrSU 4S jjr    SS\R                  S\R                  S-  S\R                  S-  S\R                  S-  S\S-  S	\	\
   S
\\R                     4S jjrSrU =r$ )RobertaAttentioni[  Nc                    > [         TU ]  5         X@l        U(       a  [        O[        nU" XUS9U l        [        U5      U l        g )Nr   r   )r3   r4   is_cross_attentionr   r   rL   r   output)rL   rM   r   r   r   attention_classrN   s         rO   r4   RobertaAttention.__init__\  s9    "43E/K_#F9U	'/rQ   r   r   r   encoder_attention_maskr   r   rU   c                     U R                   (       d  UOUnU R                  " U4UUUS.UD6u  pxU R                  Xq5      nXx4$ )N)r   r   r   )r   rL   r   )	rL   r   r   r   r   r   r   attention_outputr   s	            rO   re   RobertaAttention.forwardc  s\     04/F/FLb)-*
"7)+	*

 *
&  ;;'7G--rQ   )r   r   rL   )FNFNNNNr   r   s   @rO   r   r   [  s    0 48:>;?(,.||. ))D0.  %0047	.
 !& 1 1D 8. . +,. 
u||	. .rQ   r   c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )RobertaIntermediateix  c                   > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        UR                  [        5      (       a  [        UR                     U l        g UR                  U l        g r   )r3   r4   r5   r   r8   intermediate_sizer   r   
hidden_actstrr	   intermediate_act_fnrK   s     rO   r4   RobertaIntermediate.__init__y  s`    YYv1163K3KL
f''--'-f.?.?'@D$'-'8'8D$rQ   r   rU   c                 J    U R                  U5      nU R                  U5      nU$ r   r   r   )rL   r   s     rO   re   RobertaIntermediate.forward  s&    

=100?rQ   r   r   r   s   @rO   r   r   x  s(    9U\\ ell  rQ   r   c                   z   ^  \ rS rSrU 4S jrS\R                  S\R                  S\R                  4S jrSrU =r	$ )RobertaOutputi  c                 (  > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        R                  " UR
                  UR                  S9U l        [        R                  " UR                  5      U l        g r   )r3   r4   r5   r   r   r8   r   r=   r>   r?   r@   rA   rK   s     rO   r4   RobertaOutput.__init__  s`    YYv779K9KL
f&8&8f>S>STzz&"<"<=rQ   r   r   rU   c                 p    U R                  U5      nU R                  U5      nU R                  X-   5      nU$ r   r   r   s      rO   re   RobertaOutput.forward  r   rQ   r   r   r   s   @rO   r   r     r   rQ   r   c                      ^  \ rS rSrSU 4S jjr    SS\R                  S\R                  S-  S\R                  S-  S\R                  S-  S\S-  S	\	\
   S
\R                  4S jjrS rSrU =r$ )RobertaLayeri  Nc                   > [         TU ]  5         UR                  U l        SU l        [	        XR
                  US9U l        UR
                  U l        UR                  U l        U R                  (       a0  U R
                  (       d  [        U  S35      e[	        USUSS9U l	        [        U5      U l        [        U5      U l        g )Nr$   r   z> should be used as a decoder model if cross attention is addedFT)r   r   r   )r3   r4   chunk_size_feed_forwardseq_len_dimr   r   	attentionadd_cross_attentionr   crossattentionr   intermediater   r   )rL   rM   r   rN   s      rO   r4   RobertaLayer.__init__  s    '-'E'E$)&<M<MYbc ++#)#=#= ##?? D6)g!hii"2##'	#D 07#F+rQ   r   r   r   r   r   r   rU   c                 2   U R                   " UU4SU0UD6u  pxUn	U R                  (       a?  Ub<  [        U S5      (       d  [        SU  S35      eU R                  " US UU4SU0UD6u  pU
n	[        U R                  U R                  U R                  U	5      nU$ )Nr   r   z'If `encoder_hidden_states` are passed, z` has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`)	r   r   r]   r   r   r   feed_forward_chunkr   r   )rL   r   r   r   r   r   r   self_attention_output_r   cross_attention_outputlayer_outputs               rO   re   RobertaLayer.forward  s     $(>>$
 ,$
 	$
  1??4@4!122 =dV DD D 
 )-(;(;%%&	)
 !0) )%"  60##T%A%A4CSCSUe
 rQ   c                 J    U R                  U5      nU R                  X!5      nU$ r   )r   r   )rL   r   intermediate_outputr  s       rO   r  RobertaLayer.feed_forward_chunk  s)    "//0@A{{#6IrQ   )r   r   r   r   r   r   r   r   r   r   )rt   ru   rv   rw   r4   rC   r{   rz   r   r   r   re   r  r}   r~   r   s   @rO   r   r     s    ,, 48:>;?(,%||% ))D0%  %0047	%
 !& 1 1D 8% % +,% 
%N rQ   r   c                   x   ^  \ rS rSr\rSrSrSrSr	Sr
Sr\\\S.r\R"                  " 5       U 4S j5       rSrU =r$ )RobertaPreTrainedModeli  robertaT)r   
attentionscross_attentionsc                   > [         TU ]  U5        [        U[        5      (       a!  [        R
                  " UR                  5        g[        U[        5      (       a|  [        R                  " UR                  [        R                  " UR                  R                  S   5      R                  S5      5        [        R
                  " UR                  5        gg)zInitialize the weightsr/   r.   N)r3   _init_weightsr   RobertaLMHeadinitzeros_biasr'   copy_r-   rC   rD   r^   rF   r1   )rL   r   rN   s     rO   r  $RobertaPreTrainedModel._init_weights  s     	f%fm,,KK$ 122JJv**ELL9L9L9R9RSU9V,W,^,^_f,ghKK--. 3rQ    )rt   ru   rv   rw   r%   config_classbase_model_prefixsupports_gradient_checkpointing_supports_flash_attn_supports_sdpa_supports_flex_attn_supports_attention_backendr   r   r   _can_record_outputsrC   no_gradr  r}   r~   r   s   @rO   r  r    sV     L!&*#N"&%*1 ]]_/ /rQ   r  c                      ^  \ rS rSrU 4S jr     SS\R                  S\R                  S-  S\R                  S-  S\R                  S-  S\S-  S	\	S-  S
\
\   S\\R                     \-  4S jjrSrU =r$ )RobertaEncoderi  c           
         > [         TU ]  5         Xl        [        R                  " [        UR                  5       Vs/ s H  n[        XS9PM     sn5      U l        g s  snf )N)r   )	r3   r4   rM   r5   
ModuleListrangenum_hidden_layersr   layer)rL   rM   irN   s      rO   r4   RobertaEncoder.__init__  sH    ]]uU[UmUmOn#oOn!L$EOn#op
#os   ANr   r   r   r   r   	use_cacher   rU   c                     [        U R                  5       H  u  pU	" UUU4UUS.UD6nM     [        UU(       a  US9$ S S9$ )N)r   r   )last_hidden_stater   )	enumerater)  r   )
rL   r   r   r   r   r   r,  r   r*  layer_modules
             rO   re   RobertaEncoder.forward  sg      )4OA(% (> / M  5 9+/8O
 	
>B
 	
rQ   )rM   r)  )NNNNN)rt   ru   rv   rw   r4   rC   r{   rz   r   boolr   r   r   r   re   r}   r~   r   s   @rO   r$  r$    s    q 48:>;?(,!%
||
 ))D0
  %0047	

 !& 1 1D 8
 
 $;
 +,
 
u||	H	H
 
rQ   r$  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )RobertaPooleri  c                    > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " 5       U l        g r   )r3   r4   r5   r   r8   r   Tanh
activationrK   s     rO   r4   RobertaPooler.__init__  s9    YYv1163E3EF
'')rQ   r   rU   c                 \    US S 2S4   nU R                  U5      nU R                  U5      nU$ Nr   )r   r7  )rL   r   first_token_tensorpooled_outputs       rO   re   RobertaPooler.forward  s6     +1a40

#566rQ   )r7  r   r   r   s   @rO   r4  r4    s(    $
U\\ ell  rQ   r4  a
  
    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
    cross-attention is added between the self-attention layers, following the architecture described in [Attention is
    all you need](https://huggingface.co/papers/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
    Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.

    To behave as an decoder the model needs to be initialized with the `is_decoder` argument of the configuration set
    to `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and
    `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.
    )custom_introc                     ^  \ rS rSrSS/rSU 4S jjrS rS r\\	\
         SS\R                  S-  S	\R                  S-  S
\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\S-  S\S-  S\\   S\\R                     \-  4S jj5       5       5       rS rSrU =r$ )RobertaModeli  r'   r   c                    > [         TU ]  U5        Xl        SU l        [	        U5      U l        [        U5      U l        U(       a  [        U5      OSU l	        U R                  5         g)z^
add_pooling_layer (bool, *optional*, defaults to `True`):
    Whether to add a pooling layer
FN)r3   r4   rM   gradient_checkpointingr'   rd   r$  encoderr4  pooler	post_init)rL   rM   add_pooling_layerrN   s      rO   r4   RobertaModel.__init__.  sS    
 	 &+#+F3%f-/@mF+d 	rQ   c                 .    U R                   R                  $ r   rd   r:   rL   s    rO   get_input_embeddings!RobertaModel.get_input_embeddings?  s    ...rQ   c                 $    XR                   l        g r   rI  )rL   r   s     rO   set_input_embeddings!RobertaModel.set_input_embeddingsB  s    */'rQ   NrR   r   r1   r-   rS   r   r   r   r,  r   rU   c
           
         US L US L-  (       a  [        S5      eU R                  R                  (       a  U	b  U	OU R                  R                  n	OSn	U	(       ab  Uc_  Uc  U R                  R                  (       a.  [        [        U R                  S9[        U R                  S95      O[        U R                  S9nUb  UR                  5       OSnU R                  UUUUUS9nU R                  UUUUUS9u  p'U R                  " U4UUUUU	US.U
D6nUR                  nU R                  b  U R                  U5      OS n[        UUUR                  S9$ )	Nz:You must specify exactly one of input_ids or inputs_embedsF)rM   r   )rR   r-   r1   rS   rT   )r   r   embedding_outputr   r   )r   r   r   r   r,  r-   )r.  pooler_outputr   )r   rM   r   r,  is_encoder_decoderr   r   get_seq_lengthrd   _create_attention_masksrC  r.  rD  r   r   )rL   rR   r   r1   r-   rS   r   r   r   r,  r   rT   rQ  encoder_outputssequence_outputr<  s                   rO   re   RobertaModel.forwardE  sw     -t";<YZZ;;!!%.%:	@U@UII0 )48V8V $L$DlZ^ZeZeFfg!5  FUE`!?!?!Afg??%)'#9 + 
 261M1M)#9-"7+ 2N 2
. ,,	
)"7#9+%	
 	
 *;;8<8OO4UY;-'+;;
 	
rQ   c                     U R                   R                  (       a  [        U R                   UUUS9nO[        U R                   UUS9nUb  [        U R                   UUUS9nX4$ )N)rM   rS   r   r   )rM   rS   r   )rM   rS   r   r   )rM   r   r   r   )rL   r   r   rQ  r   r   s         rO   rU  $RobertaModel._create_attention_masks  sr     ;;!!/{{.- /	N 7{{.-N "-%>{{.5&;	&" 55rQ   )rM   rd   rC  rB  rD  )T)	NNNNNNNNN)rt   ru   rv   rw   _no_split_modulesr4   rK  rN  r"   r#   r   rC   r{   r   r2  r   r   r   r   re   rU  r}   r~   r   s   @rO   r@  r@    s3    -n="/0   *..2.2,0-1596:(,!%?
<<$&?
 t+?
 t+	?

 llT)?
 ||d*?
  %||d2?
 !&t 3?
 ?
 $;?
 +,?
 
u||	K	K?
    ?
B6 6rQ   r@  zS
    RoBERTa Model with a `language modeling` head on top for CLM fine-tuning.
    c                     ^  \ rS rSrSSS.rU 4S jrS rS r\\	           SS	\
R                  S-  S
\
R                  S-  S\
R                  S-  S\
R                  S-  S\
R                  S-  S\
R                  S-  S\
R                  S-  S\
R                  S-  S\\\
R                        S-  S\S-  S\\
R                   -  S\\   S\\
R                      \-  4S jj5       5       rSrU =r$ )RobertaForCausalLMi  )roberta.embeddings.word_embeddings.weightlm_head.biaszlm_head.decoder.weightzlm_head.decoder.biasc                    > [         TU ]  U5        UR                  (       d  [        R	                  S5        [        USS9U l        [        U5      U l        U R                  5         g )NzOIf you want to use `RobertaLMHeadModel` as a standalone, add `is_decoder=True.`FrF  
r3   r4   r   loggerwarningr@  r  r  lm_headrE  rK   s     rO   r4   RobertaForCausalLM.__init__  sL       NNlm#FeD$V, 	rQ   c                 .    U R                   R                  $ r   rf  decoderrJ  s    rO   get_output_embeddings(RobertaForCausalLM.get_output_embeddings      ||###rQ   c                 $    XR                   l        g r   ri  rL   new_embeddingss     rO   set_output_embeddings(RobertaForCausalLM.set_output_embeddings      -rQ   NrR   r   r1   r-   rS   r   r   labelsr   r,  logits_to_keepr   rU   c                    Ub  Sn
U R                   " U4UUUUUUU	U
SS.	UD6nUR                  n[        U[        5      (       a  [	        U* S5      OUnU R                  USS2USS24   5      nSnUb)  U R                  " SUXR                  R                  S.UD6n[        UUUR                  UR                  UR                  UR                  S9$ )a  
token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:

    - 0 corresponds to a *sentence A* token,
    - 1 corresponds to a *sentence B* token.
    This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value
    >= 2. All the value in this tensor should be always < type_vocab_size.

    [What are token type IDs?](../glossary#token-type-ids)
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
    `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
    ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`

Example:

```python
>>> from transformers import AutoTokenizer, RobertaForCausalLM, AutoConfig
>>> import torch

>>> tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base")
>>> config = AutoConfig.from_pretrained("FacebookAI/roberta-base")
>>> config.is_decoder = True
>>> model = RobertaForCausalLM.from_pretrained("FacebookAI/roberta-base", config=config)

>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
>>> outputs = model(**inputs)

>>> prediction_logits = outputs.logits
```NFT)	r   r1   r-   rS   r   r   r   r,  return_dict)logitsrt  r7   )lossrx  r   r   r  r  r  )r  r.  r   rn   slicerf  loss_functionrM   r7   r   r   r   r  r  )rL   rR   r   r1   r-   rS   r   r   rt  r   r,  ru  r   outputsr   slice_indicesrx  ry  s                     rO   re   RobertaForCausalLM.forward  s    ` I@DA
))%'"7#9+A
 A
  118B>SV8W8W~ot4]kmA}a,?@A%%pVF{{OeOepiopD0#33!//))$55
 	
rQ   rf  r  )NNNNNNNNNNr   )rt   ru   rv   rw   _tied_weights_keysr4   rk  rq  r!   r   rC   ry   rz   r   r2  rn   r{   r   r   r   re   r}   r~   r   s   @rO   r]  r]    s    #N .

$.  .237260426:>;?*.BF!%-.O
##d*O
 ))D0O
 ((4/	O

 &&-O
 ((4/O
  %0047O
 !& 1 1D 8O
   4'O
 uU%6%6784?O
 $;O
 ell*O
 +,O
 
u||	@	@O
  O
rQ   r]  c                     ^  \ rS rSrSSS.rU 4S jrS rS r\\	        SS	\
R                  S-  S
\
R                  S-  S\
R                  S-  S\
R                  S-  S\
R                  S-  S\
R                  S-  S\
R                  S-  S\
R                  S-  S\\   S\\
R                      \-  4S jj5       5       rSrU =r$ )RobertaForMaskedLMi  r^  r_  r`  c                    > [         TU ]  U5        UR                  (       a  [        R	                  S5        [        USS9U l        [        U5      U l        U R                  5         g )NznIf you want to use `RobertaForMaskedLM` make sure `config.is_decoder=False` for bi-directional self-attention.Frb  rc  rK   s     rO   r4   RobertaForMaskedLM.__init__"  sR     NN1
 $FeD$V, 	rQ   c                 .    U R                   R                  $ r   ri  rJ  s    rO   rk  (RobertaForMaskedLM.get_output_embeddings1  rm  rQ   c                 $    XR                   l        g r   ri  ro  s     rO   rq  (RobertaForMaskedLM.set_output_embeddings4  rs  rQ   NrR   r   r1   r-   rS   r   r   rt  r   rU   c	                 p   U R                   " U4UUUUUUSS.U	D6n
U
S   nU R                  U5      nSnUba  UR                  UR                  5      n[	        5       nU" UR                  SU R                  R                  5      UR                  S5      5      n[        UUU
R                  U
R                  S9$ )a  
token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:

    - 0 corresponds to a *sentence A* token,
    - 1 corresponds to a *sentence B* token.
    This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value
    >= 2. All the value in this tensor should be always < type_vocab_size.

    [What are token type IDs?](../glossary#token-type-ids)
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
    config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
    loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
T)r   r1   r-   rS   r   r   rw  r   Nr/   ry  rx  r   r  )r  rf  torZ   r   r   rM   r7   r   r   r  )rL   rR   r   r1   r-   rS   r   r   rt  r   r|  rW  prediction_scoresmasked_lm_lossloss_fcts                  rO   re   RobertaForMaskedLM.forward7  s    : ,,

))%'"7#9

 

 "!* LL9YY0778F')H%&7&<&<RAWAW&XZ`ZeZefhZijN$!//))	
 	
rQ   r  )NNNNNNNN)rt   ru   rv   rw   r  r4   rk  rq  r!   r   rC   ry   rz   r   r   r   r{   r   re   r}   r~   r   s   @rO   r  r    s'    #N .
$.  .237260426:>;?*.5
##d*5
 ))D05
 ((4/	5

 &&-5
 ((4/5
  %00475
 !& 1 1D 85
   4'5
 +,5
 
u||	~	-5
  5
rQ   r  c                   2   ^  \ rS rSrSrU 4S jrS rSrU =r$ )r  iq  z*Roberta Head for masked language modeling.c                   > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " UR                  UR                  S9U l        [        R                  " UR                  UR                  5      U l
        [        R                  " [        R                  " UR                  5      5      U l        g r   )r3   r4   r5   r   r8   r   r=   r>   
layer_normr7   rj  	ParameterrC   rG   r  rK   s     rO   r4   RobertaLMHead.__init__t  s    YYv1163E3EF
,,v'9'9v?T?TUyy!3!3V5F5FGLLV->->!?@	rQ   c                     U R                  U5      n[        U5      nU R                  U5      nU R                  U5      nU$ r   )r   r
   r  rj  rL   featuresr   xs       rO   re   RobertaLMHead.forward|  s;    JJx GOOA LLOrQ   )r  rj  r   r  	rt   ru   rv   rw   rx   r4   re   r}   r~   r   s   @rO   r  r  q  s    4A rQ   r  z
    RoBERTa Model transformer with a sequence classification/regression head on top (a linear layer on top of the
    pooled output) e.g. for GLUE tasks.
    c                   8  ^  \ rS rSrU 4S jr\\      SS\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S	\R                  S-  S
\
\   S\\R                     \-  4S jj5       5       rSrU =r$ ) RobertaForSequenceClassificationi  c                    > [         TU ]  U5        UR                  U l        Xl        [	        USS9U l        [        U5      U l        U R                  5         g NFrb  )	r3   r4   
num_labelsrM   r@  r  RobertaClassificationHead
classifierrE  rK   s     rO   r4   )RobertaForSequenceClassification.__init__  sH      ++#FeD3F; 	rQ   NrR   r   r1   r-   rS   rt  r   rU   c           	         U R                   " U4UUUUSS.UD6nUS   n	U R                  U	5      n
SnUGb  UR                  U
R                  5      nU R                  R
                  c  U R                  S:X  a  SU R                  l        OoU R                  S:  aN  UR                  [        R                  :X  d  UR                  [        R                  :X  a  SU R                  l        OSU R                  l        U R                  R
                  S:X  aI  [        5       nU R                  S:X  a&  U" U
R                  5       UR                  5       5      nOU" X5      nOU R                  R
                  S:X  a=  [        5       nU" U
R                  S	U R                  5      UR                  S	5      5      nO,U R                  R
                  S:X  a  [        5       nU" X5      n[!        UU
UR"                  UR$                  S
9$ )a  
token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:

    - 0 corresponds to a *sentence A* token,
    - 1 corresponds to a *sentence B* token.
    This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value
    >= 2. All the value in this tensor should be always < type_vocab_size.

    [What are token type IDs?](../glossary#token-type-ids)
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
Tr   r1   r-   rS   rw  r   Nr$   
regressionsingle_label_classificationmulti_label_classificationr/   r  )r  r  r  rZ   rM   problem_typer  r2   rC   rI   rn   r   squeezer   r   r   r   r   r  rL   rR   r   r1   r-   rS   rt  r   r|  rW  rx  ry  r  s                rO   re   (RobertaForSequenceClassification.forward  s   6 ,,
))%'
 
 "!*1YYv}}-F{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#FNN$4fnn6FGD#F3D))-JJ+-B @&++b/R))-II,./'!//))	
 	
rQ   )r  rM   r  r  NNNNNN)rt   ru   rv   rw   r4   r!   r   rC   ry   rz   r   r   r   r{   r   re   r}   r~   r   s   @rO   r  r    s    	  .237260426*.C
##d*C
 ))D0C
 ((4/	C

 &&-C
 ((4/C
   4'C
 +,C
 
u||	7	7C
  C
rQ   r  c                   8  ^  \ rS rSrU 4S jr\\      SS\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S	\R                  S-  S
\
\   S\\R                     \-  4S jj5       5       rSrU =r$ )RobertaForMultipleChoicei  c                    > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  5      U l        [        R                  " UR                  S5      U l
        U R                  5         g )Nr$   )r3   r4   r@  r  r5   r?   r@   rA   r   r8   r  rE  rK   s     rO   r4   !RobertaForMultipleChoice.__init__  sV     #F+zz&"<"<=))F$6$6: 	rQ   NrR   r1   r   rt  r-   rS   r   rU   c           	      *   Ub  UR                   S   OUR                   S   nUb!  UR                  SUR                  S5      5      OSn	Ub!  UR                  SUR                  S5      5      OSn
Ub!  UR                  SUR                  S5      5      OSnUb!  UR                  SUR                  S5      5      OSnUb1  UR                  SUR                  S5      UR                  S5      5      OSnU R                  " U	4U
UUUSS.UD6nUS   nU R	                  U5      nU R                  U5      nUR                  SU5      nSnUb.  UR                  UR                  5      n[        5       nU" UU5      n[        UUUR                  UR                  S9$ )aO  
input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
    Indices of input sequence tokens in the vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are input IDs?](../glossary#input-ids)
token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
    Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:

    - 0 corresponds to a *sentence A* token,
    - 1 corresponds to a *sentence B* token.
    This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value
    >= 2. All the value in this tensor should be always < type_vocab_size.

    [What are token type IDs?](../glossary#token-type-ids)
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
    num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
    `input_ids` above)
position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
    Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
    config.max_position_embeddings - 1]`.

    [What are position IDs?](../glossary#position-ids)
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
    Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
    is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
    model's internal embedding lookup matrix.
Nr$   r/   T)r-   r1   r   rS   rw  r  )r^   r   rH   r  rA   r  r  rZ   r   r   r   r  )rL   rR   r1   r   rt  r-   rS   r   num_choicesflat_input_idsflat_position_idsflat_token_type_idsflat_attention_maskflat_inputs_embedsr|  r<  rx  reshaped_logitsry  r  s                       rO   re    RobertaForMultipleChoice.forward  s   V -6,Aiooa(}GZGZ[\G]CLCXINN2,>?^bLXLdL--b,2C2CB2GHjnR`Rln11"n6I6I"6MNrvR`Rln11"n6I6I"6MNrv ( r=#5#5b#9=;M;Mb;QR 	 ,,
*..,
 
  
]3/ ++b+6YY556F')HOV4D("!//))	
 	
rQ   )r  rA   r  r  )rt   ru   rv   rw   r4   r!   r   rC   ry   rz   r   r   r   r{   r   re   r}   r~   r   s   @rO   r  r    s      .22637*.0426P
##d*P
 ((4/P
 ))D0	P

   4'P
 &&-P
 ((4/P
 +,P
 
u||	8	8P
  P
rQ   r  c                   8  ^  \ rS rSrU 4S jr\\      SS\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S	\R                  S-  S
\
\   S\\R                     \-  4S jj5       5       rSrU =r$ )RobertaForTokenClassificationiB  c                 d  > [         TU ]  U5        UR                  U l        [        USS9U l        UR
                  b  UR
                  OUR                  n[        R                  " U5      U l	        [        R                  " UR                  UR                  5      U l        U R                  5         g r  )r3   r4   r  r@  r  classifier_dropoutr@   r5   r?   rA   r   r8   r  rE  rL   rM   r  rN   s      rO   r4   &RobertaForTokenClassification.__init__D  s      ++#FeD)/)B)B)NF%%TZTnTn 	 zz"45))F$6$68I8IJ 	rQ   NrR   r   r1   r-   rS   rt  r   rU   c           	      z   U R                   " U4UUUUSS.UD6nUS   n	U R                  U	5      n	U R                  U	5      n
SnUbW  UR                  U
R                  5      n[        5       nU" U
R                  SU R                  5      UR                  S5      5      n[        UU
UR                  UR                  S9$ )a  
token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:

    - 0 corresponds to a *sentence A* token,
    - 1 corresponds to a *sentence B* token.
    This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value
    >= 2. All the value in this tensor should be always < type_vocab_size.

    [What are token type IDs?](../glossary#token-type-ids)
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
Tr  r   Nr/   r  )r  rA   r  r  rZ   r   r   r  r   r   r  r  s                rO   re   %RobertaForTokenClassification.forwardR  s    2 ,,
))%'
 
 "!*,,71YYv}}-F')HFKKDOO<fkk"oND$!//))	
 	
rQ   )r  rA   r  r  r  )rt   ru   rv   rw   r4   r!   r   rC   ry   rz   r   r   r   r{   r   re   r}   r~   r   s   @rO   r  r  B  s      .237260426*.2
##d*2
 ))D02
 ((4/	2

 &&-2
 ((4/2
   4'2
 +,2
 
u||	4	42
  2
rQ   r  c                   2   ^  \ rS rSrSrU 4S jrS rSrU =r$ )r  i  z-Head for sentence-level classification tasks.c                 b  > [         TU ]  5         [        R                  " UR                  UR                  5      U l        UR                  b  UR                  OUR                  n[        R                  " U5      U l	        [        R                  " UR                  UR                  5      U l        g r   )r3   r4   r5   r   r8   r   r  r@   r?   rA   r  out_projr  s      rO   r4   "RobertaClassificationHead.__init__  s    YYv1163E3EF
)/)B)B)NF%%TZTnTn 	 zz"45		&"4"4f6G6GHrQ   c                     US S 2SS S 24   nU R                  U5      nU R                  U5      n[        R                  " U5      nU R                  U5      nU R	                  U5      nU$ r:  )rA   r   rC   tanhr  r  s       rO   re   !RobertaClassificationHead.forward  sY    Q1WLLOJJqMJJqMLLOMM!rQ   )r   rA   r  r  r   s   @rO   r  r    s    7I rQ   r  c                   X  ^  \ rS rSrU 4S jr\\       SS\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S	\R                  S-  S
\R                  S-  S\
\   S\\R                     \-  4S jj5       5       rSrU =r$ )RobertaForQuestionAnsweringi  c                    > [         TU ]  U5        UR                  U l        [        USS9U l        [
        R                  " UR                  UR                  5      U l        U R                  5         g r  )
r3   r4   r  r@  r  r5   r   r8   
qa_outputsrE  rK   s     rO   r4   $RobertaForQuestionAnswering.__init__  sU      ++#FeD))F$6$68I8IJ 	rQ   NrR   r   r1   r-   rS   start_positionsend_positionsr   rU   c           	         U R                   " U4UUUUSS.UD6n	U	S   n
U R                  U
5      nUR                  SSS9u  pUR                  S5      R	                  5       nUR                  S5      R	                  5       nSnUb  Ub  [        UR                  5       5      S:  a  UR                  S5      n[        UR                  5       5      S:  a  UR                  S5      nUR                  S5      nUR                  SU5      nUR                  SU5      n[        US9nU" X5      nU" X5      nUU-   S	-  n[        UUUU	R                  U	R                  S
9$ )a  
token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:

    - 0 corresponds to a *sentence A* token,
    - 1 corresponds to a *sentence B* token.
    This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value
    >= 2. All the value in this tensor should be always < type_vocab_size.

    [What are token type IDs?](../glossary#token-type-ids)
Tr  r   r$   r/   rl   N)ignore_indexr   )ry  start_logits
end_logitsr   r  )r  r  splitr  r   lenrH   clampr   r   r   r  )rL   rR   r   r1   r-   rS   r  r  r   r|  rW  rx  r  r  
total_lossignored_indexr  
start_lossend_losss                      rO   re   #RobertaForQuestionAnswering.forward  sx   0 ,,
))%'
 
 "!*1#)<<r<#: #++B/::<''+668

&=+D?'')*Q."1"9"9""==%%'(1, - 5 5b 9(--a0M-33A}EO)//=AM']CH!,@J
:H$x/14J+%!!//))
 	
rQ   )r  r  r  )NNNNNNN)rt   ru   rv   rw   r4   r!   r   rC   ry   rz   r   r   r   r{   r   re   r}   r~   r   s   @rO   r  r    s      .2372604263715>
##d*>
 ))D0>
 ((4/	>

 &&->
 ((4/>
 ))D0>
 ''$.>
 +,>
 
u||	;	;>
  >
rQ   r  )r]  r  r  r  r  r  r@  r  )Nr   )Pcollections.abcr   rC   torch.nnr5   r   r   r    r   r  activationsr	   r
   cache_utilsr   r   r   
generationr   masking_utilsr   r   modeling_layersr   modeling_outputsr   r   r   r   r   r   r   r   modeling_utilsr   r   processing_utilsr   pytorch_utilsr   utilsr   r   r    utils.genericr!   r"   utils.output_capturingr#   configuration_robertar%   
get_loggerrt   rd  Moduler'   r{   floatr   r   r   r   r   r   r   r   r  r$  r4  r@  r]  r  r  r  r  r  r  r  __all__r  rQ   rO   <module>r     s  , %   A A & ' C C ) J 9	 	 	 G & 6 @ @ I 5 0 
		H	%g8		 g8` !%II%<<% 
% <<	%
 LL4'% T\% % '(%8@)299 @)FI)BII I)X		 .ryy .:")) BII >- >B /_ / /2
RYY 
@BII  	|6) |6|6~ 
i
/ i

i
X R
/ R
 R
jBII , Q
'= Q
Q
h ]
5 ]
 ]
@ C
$: C
 C
L		 , K
"8 K
 K
\	rQ   