
    Z j                         S r SSKJr  SSKrSSKJr  SSKJrJrJr  SSK	J
r  SSKJrJr  SS	KJrJrJr  SS
KJr  SSKJrJr  SSKJr  SSKJrJrJrJrJrJ r J!r!J"r"  SSK#J$r$J%r%  SSK&J'r'  SSK(J)r)  SSK*J+r+J,r,J-r-  SSK.J/r/J0r0  SSK1J2r2  SSK3J4r4  \-Rj                  " \65      r7 " S S\Rp                  5      r9  SNS\Rp                  S\Rt                  S\Rt                  S\Rt                  S\Rt                  S-  S\;S-  S\;S\'\+   4S  jjr< " S! S"\Rp                  5      r= " S# S$\Rp                  5      r> " S% S&\Rp                  5      r? " S' S(\Rp                  5      r@ " S) S*\Rp                  5      rA " S+ S,\Rp                  5      rB " S- S.\Rp                  5      rC " S/ S0\5      rD " S1 S2\Rp                  5      rE " S3 S4\Rp                  5      rF\, " S5 S6\%5      5       rG\," S7S89 " S9 S:\G5      5       rH\," S;S89 " S< S=\G\5      5       rI\, " S> S?\G5      5       rJ " S@ SA\Rp                  5      rK\," SBS89 " SC SD\G5      5       rL\, " SE SF\G5      5       rM\, " SG SH\G5      5       rN " SI SJ\Rp                  5      rO\, " SK SL\G5      5       rP/ SMQrQg)OzPyTorch X-MOD model.    )CallableN)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )initialization)ACT2FNgelu)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)create_bidirectional_maskcreate_causal_mask)GradientCheckpointingLayer))BaseModelOutputWithPastAndCrossAttentions,BaseModelOutputWithPoolingAndCrossAttentions!CausalLMOutputWithCrossAttentionsMaskedLMOutputMultipleChoiceModelOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)apply_chunking_to_forward)TransformersKwargsauto_docstringlogging)can_return_tuplemerge_with_config_defaults)capture_outputs   )
XmodConfigc                      ^  \ rS rSrSrU 4S jr     SS\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S	\	S
\R                  4S jjr\S 5       r\SS j5       rSrU =r$ )XmodEmbeddings3   zGConstruct the embeddings from word, position and token_type embeddings.c                 >  > [         TU ]  5         [        R                  " UR                  UR
                  UR                  S9U l        [        R                  " UR                  UR
                  5      U l	        [        R                  " UR
                  UR                  S9U l
        [        R                  " UR                  5      U l        U R                  S[         R"                  " UR$                  5      R'                  S5      SS9  U R                  S[         R(                  " U R*                  R-                  5       [         R.                  S9SS9  UR                  U l        [        R                  " UR$                  UR
                  U R0                  S9U l        g )	N)padding_idxepsposition_idsr%   F)
persistenttoken_type_ids)dtype)super__init__r   	Embedding
vocab_sizehidden_sizepad_token_idword_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutregister_buffertorcharangemax_position_embeddingsexpandzerosr.   sizelongr+   position_embeddingsselfconfig	__class__s     w/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/xmod/modeling_xmod.pyr5   XmodEmbeddings.__init__6   s4   !||F,=,=v?Q?Q_e_r_rs%'\\&2H2H&J\J\%]"f&8&8f>S>STzz&"<"<=ELL)G)GHOOPWXej 	 	
 	ekk$*;*;*@*@*B%**Ubg 	 	
 "..#%<<**F,>,>DL\L\$
     N	input_idsr2   r.   inputs_embedspast_key_values_lengthreturnc                    Uc;  Ub  U R                  XR                  U5      nOU R                  X@R                  5      nUb  UR                  5       nOUR                  5       S S nUu  pxUc  [	        U S5      (       aQ  U R
                  R                  UR                  S   S5      n	[        R                  " U	SUS9n	U	R                  Xx5      nO8[        R                  " U[        R                  U R                  R                  S9nUc  U R                  U5      nU R                  U5      n
XJ-   nU R!                  U5      nX-   nU R#                  U5      nU R%                  U5      nU$ )Nr0   r2   r   r%   )dimindexr3   device)"create_position_ids_from_input_idsr+   &create_position_ids_from_inputs_embedsrH   hasattrr2   rF   shaperC   gatherrG   rI   r.   rZ   r:   r<   rJ   r=   rA   )rL   rR   r2   r.   rS   rT   input_shape
batch_size
seq_lengthbuffered_token_type_idsr<   
embeddingsrJ   s                rO   forwardXmodEmbeddings.forwardJ   sb    $#FF//1G   $JJ=ZjZjk #..*K',,.s3K!,

 !t-..*.*=*=*D*D\EWEWXYEZ\^*_'*/,,7NTU]i*j'!8!?!?
!W!&[

SWSdSdSkSk!l  00;M $ : :> J":
"66|D5
^^J/
\\*-
rQ   c                     U R                  5       SS nUS   n[        R                  " US-   X1-   S-   [        R                  U R                  S9nUR                  S5      R                  U5      $ )z
We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

Args:
    inputs_embeds: torch.Tensor

Returns: torch.Tensor
Nr0   r%   rY   r   )rH   rC   rD   rI   rZ   	unsqueezerF   )rS   r+   r`   sequence_lengthr.   s        rO   r\   5XmodEmbeddings.create_position_ids_from_inputs_embedsz   sn     $((*3B/%a.||!O_:Q>ejjYfYmYm
 %%a(//<<rQ   c                     U R                  U5      R                  5       n[        R                  " USS9R	                  U5      U-   U-  nUR                  5       U-   $ )z
Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
are ignored. This is modified from fairseq's `utils.make_positions`.

Args:
    x: torch.Tensor x:

Returns: torch.Tensor
r%   rW   )neintrC   cumsumtype_asrI   )rR   r+   rT   maskincremental_indicess        rO   r[   1XmodEmbeddings.create_position_ids_from_input_ids   sW     ||K(,,.$||Da8@@FI__cgg"'')K77rQ   )r=   rA   r+   rJ   r<   r:   )NNNNr   )r   )__name__
__module____qualname____firstlineno____doc__r5   rC   
LongTensorFloatTensorrn   Tensorre   staticmethodr\   r[   __static_attributes____classcell__rN   s   @rO   r(   r(   3   s    Q
, .2260426&'.##d*. ((4/. &&-	.
 ((4/. !$. 
.` = =" 8 8rQ   r(   modulequerykeyvalueattention_maskscalingrA   kwargsc                    Uc  UR                  S5      S-  n[        R                  " XR                  SS5      5      U-  nUb  X-   n[        R
                  R                  USS9n[        R
                  R                  XU R                  S9n[        R                  " X5      n	U	R                  SS5      R                  5       n	X4$ )Nr0            r   rl   )ptrainingr%   )
rH   rC   matmul	transposer   
functionalsoftmaxrA   r   
contiguous)
r   r   r   r   r   r   rA   r   attn_weightsattn_outputs
             rO   eager_attention_forwardr      s     **R.D( <<}}Q':;gEL!#4==((2(>L==((6??([L,,|3K''1-88:K$$rQ   c                      ^  \ rS rSrSU 4S jjr  SS\R                  S\R                  S-  S\S-  S\	\
   S\\R                     4
S	 jjrS
rU =r$ )XmodSelfAttention   Nc                 N  > [         TU ]  5         UR                  UR                  -  S:w  a7  [	        US5      (       d&  [        SUR                   SUR                   S35      eXl        UR                  U l        [        UR                  UR                  -  5      U l        U R                  U R                  -  U l	        U R                  S-  U l
        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  U R                  5      U l        [        R                   " UR"                  5      U l        UR&                  U l        X l        X0l        g Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ()r   )r4   r5   r8   num_attention_headsr]   
ValueErrorrM   rn   attention_head_sizeall_head_sizer   r   Linearr   r   r   r?   attention_probs_dropout_probrA   
is_decoder	is_causal	layer_idxrL   rM   r   r   rN   s       rO   r5   XmodSelfAttention.__init__   sM    : ::a?PVXhHiHi#F$6$6#7 8 445Q8  #)#=#= #&v'9'9F<V<V'V#W !558P8PP//5YYv1143E3EF
99V//1C1CDYYv1143E3EF
zz&"E"EF ++""rQ   hidden_statesr   past_key_valuesr   rU   c                    UR                   S S n/ UQSPU R                  P7nU R                  U5      R                  " U6 R	                  SS5      nU R                  U5      R                  " U6 R	                  SS5      nU R                  U5      R                  " U6 R	                  SS5      n	UbA  Un
[        U[        5      (       a  UR                  n
U
R                  XU R                  5      u  p[        R                  " U R                  R                  [         5      nU" U UUU	U4U R"                  (       d  SOU R$                  R&                  U R(                  S.UD6u  pUR*                  " / UQSP76 R-                  5       nX4$ )Nr0   r%   r           rA   r   )r^   r   r   viewr   r   r   
isinstancer   self_attention_cacheupdater   r   get_interfacerM   _attn_implementationr   r   rA   r   r   reshaper   )rL   r   r   r   r   r`   hidden_shapequery_layer	key_layervalue_layercurrent_past_key_valuesattention_interfacer   r   s                 rO   re   XmodSelfAttention.forward   s    $))#2.CCbC$*B*BC jj/44lCMMaQRSHH]+00,?II!QO	jj/44lCMMaQRS&&5#/+>??*9*N*N' &=%C%CI\`\j\j%k"I(?(M(MKK,,.E)
 %8	%
  $}}C$,,..LL	%
 	%
! "));;;;FFH((rQ   )r   r   rM   rA   r   r   r   r   r   r   r   r   FN)NN)rt   ru   rv   rw   r5   rC   r{   rz   r   r   r   tuplere   r}   r~   r   s   @rO   r   r      sl    #6 48(,	')||') ))D0') 	')
 +,') 
u||	') ')rQ   r   c                      ^  \ rS rSrSU 4S jjr   SS\R                  S\R                  S-  S\R                  S-  S\S-  S\	\
   S	\\R                     4S
 jjrSrU =r$ )XmodCrossAttention   Nc                 ,  > [         TU ]  5         UR                  UR                  -  S:w  a7  [	        US5      (       d&  [        SUR                   SUR                   S35      eXl        UR                  U l        [        UR                  UR                  -  5      U l        U R                  U R                  -  U l	        U R                  S-  U l
        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  U R                  5      U l        [        R                   " UR"                  5      U l        X l        X0l        g r   )r4   r5   r8   r   r]   r   rM   rn   r   r   r   r   r   r   r   r   r?   r   rA   r   r   r   s       rO   r5   XmodCrossAttention.__init__   s@    : ::a?PVXhHiHi#F$6$6#7 8 445Q8  #)#=#= #&v'9'9F<V<V'V#W !558P8PP//5YYv1143E3EF
99V//1C1CDYYv1143E3EF
zz&"E"EF""rQ   r   encoder_hidden_statesr   r   r   rU   c                 z   UR                   S S n/ UQSPU R                  P7nU R                  U5      R                  U5      R	                  SS5      nUb%  UR
                  R                  U R                  5      OSn	Ubb  U	(       a[  UR                  R                  U R                     R                  n
UR                  R                  U R                     R                  nO/ UR                   S S QSPU R                  P7nU R                  U5      R                  U5      R	                  SS5      n
U R                  U5      R                  U5      R	                  SS5      nUbA  UR                  R                  XU R                  5      u  pSUR
                  U R                  '   [        R                   " U R"                  R$                  [&        5      nU" U UU
UU4U R(                  (       d  SOU R*                  R,                  U R.                  S.UD6u  pUR0                  " / UQSP76 R3                  5       nX4$ )Nr0   r%   r   FTr   r   )r^   r   r   r   r   
is_updatedgetr   cross_attention_cachelayerskeysvaluesr   r   r   r   r   rM   r   r   r   rA   r   r   r   r   )rL   r   r   r   r   r   r`   r   r   r   r   r   kv_shaper   r   r   s                   rO   re   XmodCrossAttention.forward  s    $))#2.CCbC$*B*BC jj/44\BLLQPQRGVGb_//33DNNChm
&:'==DDT^^TYYI)??FFt~~V]]KX.44Sb9X2Xt?W?WXH!67<<XFPPQRTUVI**%:;@@JTTUVXYZK*)8)N)N)U)UDNN*&	 >B**4>>:(?(M(MKK,,.E)
 %8	%
  $}}C$,,..LL	%
 	%
! "));;;;FFH((rQ   )r   r   rM   rA   r   r   r   r   r   r   r   r   )NNN)rt   ru   rv   rw   r5   rC   r{   rz   r   r   r   r   re   r}   r~   r   s   @rO   r   r      s    #4 ;?376:1)||1)  %00471) ))D0	1)
 -t31) +,1) 
u||	1) 1)rQ   r   c                   z   ^  \ rS rSrU 4S jrS\R                  S\R                  S\R                  4S jrSrU =r	$ )XmodSelfOutputiK  c                 (  > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " UR                  UR                  S9U l        [        R                  " UR                  5      U l
        g Nr,   )r4   r5   r   r   r8   denser=   r>   r?   r@   rA   rK   s     rO   r5   XmodSelfOutput.__init__M  s`    YYv1163E3EF
f&8&8f>S>STzz&"<"<=rQ   r   input_tensorrU   c                 R    U R                  U5      nU R                  U5      nX-   nU$ N)r   rA   )rL   r   r   s      rO   re   XmodSelfOutput.forwardS  s,    

=1]3%4rQ   )r=   r   rA   
rt   ru   rv   rw   r5   rC   r{   re   r}   r~   r   s   @rO   r   r   K  s6    >U\\  RWR^R^  rQ   r   c                     ^  \ rS rSrSU 4S jjr    SS\R                  S\R                  S-  S\R                  S-  S\R                  S-  S\\\R                        S-  S	\	\
   S
\\R                     4S jjrSrU =r$ )XmodAttentioniZ  Nc                    > [         TU ]  5         X@l        U(       a  [        O[        nU" XUS9U l        [        U5      U l        UR                  U l        g )Nr   r   )	r4   r5   is_cross_attentionr   r   rL   r   outputpre_norm)rL   rM   r   r   r   attention_classrN   s         rO   r5   XmodAttention.__init__[  sD    "40B,HY#F9U	$V,rQ   r   r   r   encoder_attention_maskr   r   rU   c                 :   UnU R                   (       a  U R                  R                  U5      nU R                  (       d  UOUnU R                  " U4UUUS.UD6u  pU R                  X5      nU R                   (       d  U R                  R                  U5      nX4$ )N)r   r   r   )r   r   r=   r   rL   )
rL   r   r   r   r   r   r   residualattention_outputr   s
             rO   re   XmodAttention.forwardd  s     !== KK11-@M/3/F/FLb)-*
"7)+	*

 *
&  ;;'7B}}#{{445EF--rQ   )r   r   r   rL   )FNFNNNN)rt   ru   rv   rw   r5   rC   r{   rz   r   r   r   re   r}   r~   r   s   @rO   r   r   Z  s    ( 48:>;?BF.||. ))D0.  %0047	.
 !& 1 1D 8. uU%6%6784?. +,. 
u||	. .rQ   r   c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )XmodIntermediatei  c                   > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        UR                  [        5      (       a  [        UR                     U l        g UR                  U l        g r   )r4   r5   r   r   r8   intermediate_sizer   r   
hidden_actstrr
   intermediate_act_fnrK   s     rO   r5   XmodIntermediate.__init__  s`    YYv1163K3KL
f''--'-f.?.?'@D$'-'8'8D$rQ   r   rU   c                 J    U R                  U5      nU R                  U5      nU$ r   r   r   rL   r   s     rO   re   XmodIntermediate.forward  s&    

=100?rQ   r   r   r   s   @rO   r   r     s(    9U\\ ell  rQ   r   c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )XmodAdapteri  c                   > [         TU ]  5         UR                  UR                  -  U l        [
        R                  " UR                  U R                  5      U l        [
        R                  " U R                  UR                  5      U l        [        UR                  [        5      (       a  [        UR                     U l        g UR                  U l        g r   )r4   r5   r8   adapter_reduction_factorbottleneck_sizer   r   dense1dense2r   r   r   r
   adapter_act_fnrK   s     rO   r5   XmodAdapter.__init__  s    %11V5T5TTii 2 2D4H4HIii 4 4f6H6HIf''--"():):";D"("3"3DrQ   r   rU   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ r   )r   r   r   r   s     rO   re   XmodAdapter.forward  s4    M2++M:M2rQ   )r   r   r   r   r   r   s   @rO   r   r     s(    4U\\ ell  rQ   r   c                      ^  \ rS rSrU 4S jrS\R                  S\R                  S\R                  S\R                  4S jrS\R                  S\R                  4S jrS	r	U =r
$ )

XmodOutputi  c                   > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        R                  " UR
                  UR                  S9U l        UR                  U l	        [        R                  " UR                  5      U l        UR                  (       a/  [        R                  " UR
                  UR                  S9U l        OS U l        UR                  U l        [        R                  " 0 5      U l        UR"                   H$  n[%        U5      U R                   ['        U5      '   M&     g r   )r4   r5   r   r   r   r8   r   r=   r>   ln_before_adapterr?   r@   rA   adapter_layer_normadapter_reuse_layer_norm
ModuleDictadapter_modules	languagesr   r   )rL   rM   languagerN   s      rO   r5   XmodOutput.__init__  s    YYv779K9KL
f&8&8f>S>ST!'!9!9zz&"<"<=$$&(ll63E3E6K`K`&aD#&*D#(.(G(G%!}}R0((H2=f2ED  X/ )rQ   r   r   lang_idsrU   c                 t    U R                  U5      nU R                  U5      nX-   nU R                  X15      nU$ r   )r   rA   lang_adapter)rL   r   r   r  s       rO   re   XmodOutput.forward  s<    

=1]3%4))(BrQ   c                    U R                   (       d  UnU R                  b  U R                  U5      nO"U R                  (       a  U R                  U5      nU R                   (       a  Un[        R
                  " U5      n[        U R                  R                  5       5       H&  u  pVX:H  nX'   nU R                  U   " U5      n	XU'   M(     U R                  U5      nUW-  nU$ r   )
r   r   r   r=   rC   
zeros_like	enumerater  r   rA   )
rL   r  r   r   new_hidden_statesadapter_idxlang_key	lang_masklang_hidden_statesadapted_lang_hidden_statess
             rO   r  XmodOutput.lang_adapter  s    %%$H"". 33MBM** NN=9M!!$H!,,];%.t/C/C/H/H/J%K!K /I!.!9)-)=)=h)GHZ)[&+Ei(	 &L %67!rQ   )r=   r   r  r   r   rA   r   )rt   ru   rv   rw   r5   rC   r{   re   r  r}   r~   r   s   @rO   r   r     s`    FU\\  Y^YeYe jojvjv U\\ %,,  rQ   r   c                     ^  \ rS rSrSU 4S jjr    SS\R                  S\R                  S\R                  S-  S\R                  S-  S\R                  S-  S	\\\R                        S-  S
\	\
   S\R                  4S jjrS rSrU =r$ )	XmodLayeri  Nc                   > [         TU ]  5         UR                  U l        SU l        [	        XR
                  US9U l        UR
                  U l        UR                  U l        U R                  (       a0  U R
                  (       d  [        U  S35      e[	        USUSS9U l	        [        U5      U l        [        U5      U l        UR                  U l        g )Nr%   r   z> should be used as a decoder model if cross attention is addedFT)r   r   r   )r4   r5   chunk_size_feed_forwardseq_len_dimr   r   	attentionadd_cross_attentionr   crossattentionr   intermediater   r   r   )rL   rM   r   rN   s      rO   r5   XmodLayer.__init__  s    '-'E'E$&v9J9JV_` ++#)#=#= ##?? D6)g!hii"/##'	#D -V4 (rQ   r   r  r   r   r   r   r   rU   c                 
   U R                   " UU4SU0UD6u  pUn
U R                  (       a?  Ub<  [        U S5      (       d  [        SU  S35      eU R                  " U
S UU4SU0UD6u  pUn
U
nU R
                  (       a  U R                  R                  U
5      n
[        U R                  U R                  U R                  U
5      nU R                  XU5      nU R
                  (       d  U R                  R                  U5      nU$ )Nr   r  z'If `encoder_hidden_states` are passed, z` has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`)r  r   r]   r   r  r   r   r=   r   feed_forward_chunkr  r  )rL   r   r  r   r   r   r   r   self_attention_output_r   cross_attention_outputr   intermediate_outputlayer_outputs                  rO   re   XmodLayer.forward  s1    $(>>$
 ,$
 	$
  1??4@4!122 =dV DD D 
 )-(;(; %&	)
 !0) )%"  6#==#{{445EF7##((	
 {{#6(K}};;00>LrQ   c                 $    U R                  U5      $ r   )r  )rL   r   s     rO   r  XmodLayer.feed_forward_chunk  s      !122rQ   )	r  r  r  r  r  r   r   r   r  r   r   )rt   ru   rv   rw   r5   rC   r{   rz   r   r   r   re   r  r}   r~   r   s   @rO   r  r    s    (0 48:>;?BF0||0 ,,0 ))D0	0
  %00470 !& 1 1D 80 uU%6%6784?0 +,0 
0d3 3rQ   r  c                   ,  ^  \ rS rSrU 4S jr     SS\R                  S\R                  S\R                  S-  S\R                  S-  S\R                  S-  S	\\\R                        S-  S
\	S-  S\
\   S\\R                     \-  4S jjrSrU =r$ )XmodEncoderi  c           
      f  > [         TU ]  5         Xl        [        R                  " [        UR                  5       Vs/ s H  n[        XS9PM     sn5      U l        UR                  U l
        U R                  (       a/  [        R                  " UR                  UR                  S9U l        g g s  snf )N)r   r,   )r4   r5   rM   r   
ModuleListrangenum_hidden_layersr  layerr   is_pre_normr=   r8   r>   )rL   rM   irN   s      rO   r5   XmodEncoder.__init__  s    ]]ERXRjRjLk#lLkqIf$BLk#lm
!??\\&*<*<&BWBWXDN  $ms   B.Nr   r  r   r   r   r   	use_cacher   rU   c           	          [        U R                  5       H  u  pU
" UUUUUU40 UD6nM     U R                  (       a  U R                  U5      n[	        UU(       a  US9$ S S9$ )N)last_hidden_stater   )r  r.  r/  r=   r   )rL   r   r  r   r   r   r   r2  r   r0  layer_modules              rO   re   XmodEncoder.forward&  sz      )4OA(%& M  5  NN=9M8+/8O
 	
>B
 	
rQ   )r=   rM   r/  r.  )NNNNN)rt   ru   rv   rw   r5   rC   r{   rz   r   boolr   r   r   re   r}   r~   r   s   @rO   r)  r)    s    Y 48:>;?BF!%
||
 ,,
 ))D0	

  %0047
 !& 1 1D 8
 uU%6%6784?
 $;
 +,
 
u||	H	H
 
rQ   r)  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )
XmodPooleriF  c                    > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " 5       U l        g r   )r4   r5   r   r   r8   r   Tanh
activationrK   s     rO   r5   XmodPooler.__init__G  s9    YYv1163E3EF
'')rQ   r   rU   c                 \    US S 2S4   nU R                  U5      nU R                  U5      nU$ Nr   )r   r<  )rL   r   first_token_tensorpooled_outputs       rO   re   XmodPooler.forwardL  s6     +1a40

#566rQ   )r<  r   r   r   s   @rO   r9  r9  F  s(    $
U\\ ell  rQ   r9  c                      ^  \ rS rSr\rSrSr/ SQrSr	Sr
SrSr\\\S.r\R$                  " 5       U 4S j5       rS\4S jrS	 rS
rU =r$ )XmodPreTrainedModeliU  robertaT)r(   r   r   )r   
attentionscross_attentionsc                   > [         TU ]  U5        [        U[        5      (       a!  [        R
                  " UR                  5        g[        U[        5      (       a|  [        R                  " UR                  [        R                  " UR                  R                  S   5      R                  S5      5        [        R
                  " UR                  5        gg)zInitialize the weightsr0   r/   N)r4   _init_weightsr   
XmodLMHeadinitzeros_biasr(   copy_r.   rC   rD   r^   rF   r2   )rL   r   rN   s     rO   rI  !XmodPreTrainedModel._init_weightse  s     	f%fj))KK$//JJv**ELL9L9L9R9RSU9V,W,^,^_f,ghKK--. 0rQ   r  c           	          XR                   R                  ;  a0  [        U  SU S[        U R                   R                  5       35      eXR                   l        g)z
Set the default language code for the model. This is used when the language is not specified in the input.

Args:
    language (`str`): The language code, such as `"en_XX"` or `"de_DE"`.
z does not have an adapter for z. Supported languages: N)rM   r  r   listdefault_language)rL   r  s     rO   set_default_language(XmodPreTrainedModel.set_default_languageo  sW     ;;000&6xj@WX\]a]h]h]r]rXsWtu  (0$rQ   c                     [         R                  S5        U R                  R                  R	                  5        H
  nSUl        M     [         R                  S5        U R                  R                  R                   H~  nUR                  R                  b2  UR                  R                  R	                  5        H
  nSUl        M     UR                  R                  R	                  5        H
  nSUl        M     M     g)z
Freeze the embeddings and language adapters of the model. Usually, this is applied before the model is
fine-tuned on a downstream task.
zFreezing embeddingsFzFreezing adaptersN)loggerinforE  rd   
parametersrequires_gradencoderr.  r   r   r  )rL   	parameterr.  s      rO   'freeze_embeddings_and_language_adapters;XmodPreTrainedModel.freeze_embeddings_and_language_adapters|  s    
 	)*00;;=I&+I# >'(\\))//E||..:!&!@!@!K!K!MI.3I+ "N"\\99DDF	*/	' G	 0rQ    )rt   ru   rv   rw   r&   config_classbase_model_prefixsupports_gradient_checkpointingno_split_modules_supports_flash_attn_supports_sdpa_supports_flex_attn_supports_attention_backendr  r   r   _can_record_outputsrC   no_gradrI  r   rS  r\  r}   r~   r   s   @rO   rD  rD  U  sr    L!&*#TN"&"'. ]]_/ /0S 00 0rQ   rD  a0  
    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
    cross-attention is added between the self-attention layers, following the architecture described in *Attention is
    all you need*_ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz
    Kaiser and Illia Polosukhin.

    To behave as an decoder the model needs to be initialized with the `is_decoder` argument of the configuration set
    to `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and
    `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.

    .. _*Attention is all you need*: https://huggingface.co/papers/1706.03762
    )custom_introc                     ^  \ rS rSrSU 4S jjrS rS r\\\	          SS\
R                  S-  S\
R                  S-  S\
R                  S-  S	\
R                  S-  S
\
R                  S-  S\
R                  S-  S\
R                  S-  S\
R                  S-  S\\
R                     S-  S\S-  S\\   S\\
R                     \-  4S jj5       5       5       rS rSrU =r$ )	XmodModeli  c                    > [         TU ]  U5        Xl        SU l        [	        U5      U l        [        U5      U l        U(       a  [        U5      OSU l	        U R                  5         g)z^
add_pooling_layer (bool, *optional*, defaults to `True`):
    Whether to add a pooling layer
FN)r4   r5   rM   gradient_checkpointingr(   rd   r)  rZ  r9  pooler	post_init)rL   rM   add_pooling_layerrN   s      rO   r5   XmodModel.__init__  sS    
 	 &+#(0"6*,=j(4 	rQ   c                 .    U R                   R                  $ r   rd   r:   rL   s    rO   get_input_embeddingsXmodModel.get_input_embeddings  s    ...rQ   c                 $    XR                   l        g r   rs  )rL   r   s     rO   set_input_embeddingsXmodModel.set_input_embeddings  s    */'rQ   NrR   r  r   r2   r.   rS   r   r   r   r2  r   rU   c                 ^   USL USL-  (       a  [        S5      eU R                  R                  (       a  U
b  U
OU R                  R                  n
OSn
U
(       ab  U	c_  Uc  U R                  R                  (       a.  [        [        U R                  S9[        U R                  S95      O[        U R                  S9n	Ub  UR                  S   OUR                  S   nUb  UR                  OUR                  nU	b  U	R                  5       OSnUc  U R                  R                  c  [        S5      e[        U R                  R                  S   R                  R                  R!                  5       5      nUR#                  U R                  R                  5      nU[$        R&                  " XS9-  nU R)                  UUUUUS9nU R+                  UUUUU	S	9u  p8U R                  " U4UUUUU	U
US
.UD6nUS   nU R,                  b  U R-                  U5      OSn[/        UUUR0                  S9$ )
lang_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Indices of the language adapters that should be activated for each sample, respectively. Default: the index
    that corresponds to `self.config.default_language`.
Nz:You must specify exactly one of input_ids or inputs_embedsF)rM   r   zPInput language unknown. Please call `XmodPreTrainedModel.set_default_language()`)rZ   )rR   r.   r2   rS   rT   )r   r   embedding_outputr   r   )r  r   r   r   r   r2  r.   )r4  pooler_outputr   )r   rM   r   r2  is_encoder_decoderr   r   r^   rZ   get_seq_lengthrR  rQ  rZ  r.  r   r  r   rX   rC   onesrd   _create_attention_masksrn  r   r   )rL   rR   r  r   r2   r.   rS   r   r   r   r2  r   ra   rZ   rT   adapter_languagesdefault_lang_idr|  encoder_outputssequence_outputrA  s                        rO   re   XmodModel.forward  s.   , -t";<YZZ;;!!%.%:	@U@UII0 )48V8V $L$DlZ^ZeZeFfg!5  ,5+@Y__Q'mFYFYZ[F\
%.%:!!@T@TETE`!?!?!Afg{{++3 !stt $T\\%7%7%:%A%A%Q%Q%V%V%X Y/55dkk6R6RSO&J)NNH??%)'#9 + 
 261M1M)#9-"7+ 2N 2
. ,,

)"7#9+%

 

 *!,8<8OO4UY;-'+;;
 	
rQ   c                     U R                   R                  (       a  [        U R                   UUUS9nO[        U R                   UUS9nUb  [        U R                   UUUS9nX4$ )N)rM   rS   r   r   )rM   rS   r   )rM   rS   r   r   )rM   r   r   r   )rL   r   r   r|  r   r   s         rO   r  !XmodModel._create_attention_masks
  sr     ;;!!/{{.- /	N 7{{.-N "-%>{{.5&;	&" 55rQ   )rM   rd   rZ  rm  rn  )T)
NNNNNNNNNN)rt   ru   rv   rw   r5   ru  rx  r#   r$   r    rC   r{   ry   rQ  rz   r7  r   r   r   r   re   r  r}   r~   r   s   @rO   rk  rk    sX   $/0   *.,0.2.2,0-1596::>!%O
<<$&O
 ""T)O
 t+	O

 t+O
 llT)O
 ||d*O
  %||d2O
 !&t 3O
 e//047O
 $;O
 +,O
 
u||	K	KO
    O
d6 6rQ   rk  zQ
    X-MOD Model with a `language modeling` head on top for CLM fine-tuning.
    c                      ^  \ rS rSrSSS.rU 4S jrS rS r\\	            SS	\
R                  S-  S
\
R                  S-  S\
R                  S-  S\
R                  S-  S\
R                  S-  S\
R                  S-  S\
R                  S-  S\
R                  S-  S\
R                  S-  S\\\
R                        S-  S\S-  S\\
R                   -  S\\   S\\
R                      \-  4S jj5       5       rSrU =r$ )XmodForCausalLMi+  )roberta.embeddings.word_embeddings.weightlm_head.biaszlm_head.decoder.weightzlm_head.decoder.biasc                    > [         TU ]  U5        UR                  (       d  [        R	                  S5        [        USS9U l        [        U5      U l        U R                  5         g )NzLIf you want to use `XmodLMHeadModel` as a standalone, add `is_decoder=True.`Frp  
r4   r5   r   rV  warningrk  rE  rJ  lm_headro  rK   s     rO   r5   XmodForCausalLM.__init__7  sL       NNij 5A!&) 	rQ   c                 .    U R                   R                  $ r   r  decoderrt  s    rO   get_output_embeddings%XmodForCausalLM.get_output_embeddingsD      ||###rQ   c                 $    XR                   l        g r   r  rL   new_embeddingss     rO   set_output_embeddings%XmodForCausalLM.set_output_embeddingsH      -rQ   NrR   r  r   r2   r.   rS   r   r   labelsr   r2  logits_to_keepr   rU   c                    U	b  SnU R                   " U4UUUUUUUU
USS.
UD6nUR                  n[        U[        5      (       a  [	        U* S5      OUnU R                  USS2USS24   5      nSnU	b)  U R                  " SUXR                  R                  S.UD6n[        UUUR                  UR                  UR                  UR                  S9$ )a  
lang_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Indices of the language adapters that should be activated for each sample, respectively. Default: the index
    that corresponds to `self.config.default_language`.
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
    `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
    ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`

Example:

```python
>>> from transformers import AutoTokenizer, XmodForCausalLM, AutoConfig
>>> import torch

>>> tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base")
>>> config = AutoConfig.from_pretrained("facebook/xmod-base")
>>> config.is_decoder = True
>>> model = XmodForCausalLM.from_pretrained("facebook/xmod-base", config=config)
>>> model.set_default_language("en_XX")

>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
>>> outputs = model(**inputs)

>>> prediction_logits = outputs.logits
```NFT)
r  r   r2   r.   rS   r   r   r   r2  return_dict)logitsr  r7   )lossr  r   r   rF  rG  r^  )rE  r4  r   rn   slicer  loss_functionrM   r7   r   r   r   rF  rG  )rL   rR   r  r   r2   r.   rS   r   r   r  r   r2  r  r   outputsr   slice_indicesr  r  s                      rO   re   XmodForCausalLM.forwardK  s    X I@DA
))%'"7#9+A
 A
  118B>SV8W8W~ot4]kmA}a,?@A%%pVF{{OeOepiopD0#33!//))$55
 	
rQ   r  rE  )NNNNNNNNNNNr   )rt   ru   rv   rw   _tied_weights_keysr5   r  r  r"   r    rC   ry   rz   r   r7  rn   r{   r   r   r   re   r}   r~   r   s   @rO   r  r  +  s    #N .
$.  .2,037260426:>;?*.BF!%-.L
##d*L
 ""T)L
 ))D0	L

 ((4/L
 &&-L
 ((4/L
  %0047L
 !& 1 1D 8L
   4'L
 uU%6%6784?L
 $;L
 ell*L
 +,L
 
u||	@	@L
  L
rQ   r  c                     ^  \ rS rSrSSS.rU 4S jrS rS r\\	         SS	\
R                  S-  S
\
R                  S-  S\
R                  S-  S\
R                  S-  S\
R                  S-  S\
R                  S-  S\
R                  S-  S\
R                  S-  S\
R                  S-  S\\   S\\
R                      \-  4S jj5       5       rSrU =r$ )XmodForMaskedLMi  r  r  r  c                    > [         TU ]  U5        UR                  (       a  [        R	                  S5        [        USS9U l        [        U5      U l        U R                  5         g )NzkIf you want to use `XmodForMaskedLM` make sure `config.is_decoder=False` for bi-directional self-attention.Fr  r  rK   s     rO   r5   XmodForMaskedLM.__init__  sR     NN1
 !5A!&) 	rQ   c                 .    U R                   R                  $ r   r  rt  s    rO   r  %XmodForMaskedLM.get_output_embeddings  r  rQ   c                 $    XR                   l        g r   r  r  s     rO   r  %XmodForMaskedLM.set_output_embeddings  r  rQ   NrR   r  r   r2   r.   rS   r   r   r  r   rU   c
                 <   U R                   " U4UUUUUUUSS.U
D6nUS   nU R                  U5      nSnU	bF  [        5       nU" UR                  SU R                  R
                  5      U	R                  S5      5      n[        UUUR                  UR                  S9$ )av  
lang_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Indices of the language adapters that should be activated for each sample, respectively. Default: the index
    that corresponds to `self.config.default_language`.
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
    config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
    loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
T)r  r   r2   r.   rS   r   r   r  r   Nr0   r  r  r   rF  )	rE  r  r   r   rM   r7   r   r   rF  )rL   rR   r  r   r2   r.   rS   r   r   r  r   r  r  prediction_scoresmasked_lm_lossloss_fcts                   rO   re   XmodForMaskedLM.forward  s    0 ,,
))%'"7#9
 
 "!* LL9')H%&7&<&<RAWAW&XZ`ZeZefhZijN$!//))	
 	
rQ   r  )	NNNNNNNNN)rt   ru   rv   rw   r  r5   r  r  r"   r    rC   ry   rz   r   r   r   r{   r   re   r}   r~   r   s   @rO   r  r    s?    #N . $.  .2,037260426:>;?*./
##d*/
 ""T)/
 ))D0	/

 ((4//
 &&-/
 ((4//
  %0047/
 !& 1 1D 8/
   4'/
 +,/
 
u||	~	-/
  /
rQ   r  c                   2   ^  \ rS rSrSrU 4S jrS rSrU =r$ )rJ  i  z*Roberta Head for masked language modeling.c                   > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " UR                  UR                  S9U l        [        R                  " UR                  UR                  5      U l
        [        R                  " [        R                  " UR                  5      5      U l        g r   )r4   r5   r   r   r8   r   r=   r>   
layer_normr7   r  	ParameterrC   rG   rM  rK   s     rO   r5   XmodLMHead.__init__  s    YYv1163E3EF
,,v'9'9v?T?TUyy!3!3V5F5FGLLV->->!?@	rQ   c                     U R                  U5      n[        U5      nU R                  U5      nU R                  U5      nU$ r   )r   r   r  r  rL   featuresr   xs       rO   re   XmodLMHead.forward  s;    JJx GOOA LLOrQ   )rM  r  r   r  	rt   ru   rv   rw   rx   r5   re   r}   r~   r   s   @rO   rJ  rJ    s    4A rQ   rJ  z
    X-MOD Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
    output) e.g. for GLUE tasks.
    c                   X  ^  \ rS rSrU 4S jr\\       SS\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S	\R                  S-  S
\R                  S-  S\
\   S\\R                     \-  4S jj5       5       rSrU =r$ )XmodForSequenceClassificationi  c                    > [         TU ]  U5        UR                  U l        Xl        [	        USS9U l        [        U5      U l        U R                  5         g NFr  )	r4   r5   
num_labelsrM   rk  rE  XmodClassificationHead
classifierro  rK   s     rO   r5   &XmodForSequenceClassification.__init__  sH      ++ 5A08 	rQ   NrR   r  r   r2   r.   rS   r  r   rU   c           
         U R                   " U4UUUUUSS.UD6n	U	S   n
U R                  U
5      nSnUGb  U R                  R                  c  U R                  S:X  a  SU R                  l        OoU R                  S:  aN  UR
                  [        R                  :X  d  UR
                  [        R                  :X  a  SU R                  l        OSU R                  l        U R                  R                  S:X  aI  [        5       nU R                  S:X  a&  U" UR                  5       UR                  5       5      nOU" X5      nOU R                  R                  S:X  a=  [        5       nU" UR                  S	U R                  5      UR                  S	5      5      nO,U R                  R                  S:X  a  [        5       nU" X5      n[        UUU	R                  U	R                   S
9$ )aa  
lang_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Indices of the language adapters that should be activated for each sample, respectively. Default: the index
    that corresponds to `self.config.default_language`.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
Tr  r   r2   r.   rS   r  r   Nr%   
regressionsingle_label_classificationmulti_label_classificationr0   r  )rE  r  rM   problem_typer  r3   rC   rI   rn   r   squeezer   r   r   r   r   rF  rL   rR   r  r   r2   r.   rS   r  r   r  r  r  r  r  s                 rO   re   %XmodForSequenceClassification.forward  s   , ,,	
))%'	
 	
 "!*1{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#FNN$4fnn6FGD#F3D))-JJ+-B @&++b/R))-II,./'!//))	
 	
rQ   )r  rM   r  rE  NNNNNNN)rt   ru   rv   rw   r5   r"   r    rC   ry   rz   r   r   r   r{   r   re   r}   r~   r   s   @rO   r  r    s    	  .2,037260426*.=
##d*=
 ""T)=
 ))D0	=

 ((4/=
 &&-=
 ((4/=
   4'=
 +,=
 
u||	7	7=
  =
rQ   r  c                   X  ^  \ rS rSrU 4S jr\\       SS\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S	\R                  S-  S
\R                  S-  S\
\   S\\R                     \-  4S jj5       5       rSrU =r$ )XmodForMultipleChoicei[  c                    > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  5      U l        [        R                  " UR                  S5      U l
        U R                  5         g )Nr%   )r4   r5   rk  rE  r   r?   r@   rA   r   r8   r  ro  rK   s     rO   r5   XmodForMultipleChoice.__init__^  sV      (zz&"<"<=))F$6$6: 	rQ   NrR   r  r2   r   r  r.   rS   r   rU   c           
      d   Ub  UR                   S   OUR                   S   n	Ub!  UR                  SUR                  S5      5      OSn
Ub2  UR                  UR                  S5      UR                  S5      -  5      OSnUb!  UR                  SUR                  S5      5      OSnUb!  UR                  SUR                  S5      5      OSnUb!  UR                  SUR                  S5      5      OSnUb1  UR                  SUR                  S5      UR                  S5      5      OSnU R                  " U
4UUUUUSS.UD6nUS   nU R                  U5      nU R                  U5      nUR                  SU	5      nSnUb  [        5       nU" UU5      n[        UUUR                  UR                  S9$ )	a  
input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
    Indices of input sequence tokens in the vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are input IDs?](../glossary#input-ids)
lang_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
    Indices of the language adapters that should be activated for each sample, respectively. Default: the index
    that corresponds to `self.config.default_language`.
token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
    Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
    1]`:

    - 0 corresponds to a *sentence A* token,
    - 1 corresponds to a *sentence B* token.

    [What are token type IDs?](../glossary#token-type-ids)
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
    num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
    `input_ids` above)
position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
    Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
    config.max_position_embeddings - 1]`.

    [What are position IDs?](../glossary#position-ids)
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
    Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
    is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
    model's internal embedding lookup matrix.
Nr%   r0   r   T)r  r.   r2   r   rS   r  r  )r^   r   rH   repeatrE  rA   r  r   r   r   rF  )rL   rR   r  r2   r   r  r.   rS   r   num_choicesflat_input_idsflat_lang_idsflat_position_idsflat_token_type_idsflat_attention_maskflat_inputs_embedsr  rA  r  reshaped_logitsr  r  s                         rO   re   XmodForMultipleChoice.forwardh  s   \ -6,Aiooa(}GZGZ[\G]CLCXINN2,>?^bRZRf	q(9INN1<M(MNlpLXLdL--b,2C2CB2GHjnR`Rln11"n6I6I"6MNrvR`Rln11"n6I6I"6MNrv ( r=#5#5b#9=;M;Mb;QR 	 ,,	
"*..,	
 	
  
]3/ ++b+6')HOV4D("!//))	
 	
rQ   )r  rA   rE  r  )rt   ru   rv   rw   r5   r"   r    rC   ry   rz   r   r   r   r{   r   re   r}   r~   r   s   @rO   r  r  [  s      .2,02637*.0426S
##d*S
 ""T)S
 ((4/	S

 ))D0S
   4'S
 &&-S
 ((4/S
 +,S
 
u||	8	8S
  S
rQ   r  c                   X  ^  \ rS rSrU 4S jr\\       SS\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S	\R                  S-  S
\R                  S-  S\
\   S\\R                     \-  4S jj5       5       rSrU =r$ )XmodForTokenClassificationi  c                 d  > [         TU ]  U5        UR                  U l        [        USS9U l        UR
                  b  UR
                  OUR                  n[        R                  " U5      U l	        [        R                  " UR                  UR                  5      U l        U R                  5         g r  )r4   r5   r  rk  rE  classifier_dropoutr@   r   r?   rA   r   r8   r  ro  rL   rM   r  rN   s      rO   r5   #XmodForTokenClassification.__init__  s      ++ 5A)/)B)B)NF%%TZTnTn 	 zz"45))F$6$68I8IJ 	rQ   NrR   r  r   r2   r.   rS   r  r   rU   c           
      F   U R                   " U4UUUUUSS.UD6n	U	S   n
U R                  U
5      n
U R                  U
5      nSnUb<  [        5       nU" UR	                  SU R
                  5      UR	                  S5      5      n[        UUU	R                  U	R                  S9$ )a  
lang_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Indices of the language adapters that should be activated for each sample, respectively. Default: the index
    that corresponds to `self.config.default_language`.
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
Tr  r   Nr0   r  )	rE  rA   r  r   r   r  r   r   rF  r  s                 rO   re   "XmodForTokenClassification.forward  s    ( ,,	
))%'	
 	
 "!*,,71')HFKKDOO<fkk"oND$!//))	
 	
rQ   )r  rA   r  rE  r  )rt   ru   rv   rw   r5   r"   r    rC   ry   rz   r   r   r   r{   r   re   r}   r~   r   s   @rO   r  r    s      .2,037260426*.,
##d*,
 ""T),
 ))D0	,

 ((4/,
 &&-,
 ((4/,
   4',
 +,,
 
u||	4	4,
  ,
rQ   r  c                   2   ^  \ rS rSrSrU 4S jrS rSrU =r$ )r  i  z-Head for sentence-level classification tasks.c                 b  > [         TU ]  5         [        R                  " UR                  UR                  5      U l        UR                  b  UR                  OUR                  n[        R                  " U5      U l	        [        R                  " UR                  UR                  5      U l        g r   )r4   r5   r   r   r8   r   r  r@   r?   rA   r  out_projr  s      rO   r5   XmodClassificationHead.__init__  s    YYv1163E3EF
)/)B)B)NF%%TZTnTn 	 zz"45		&"4"4f6G6GHrQ   c                     US S 2SS S 24   nU R                  U5      nU R                  U5      n[        R                  " U5      nU R                  U5      nU R	                  U5      nU$ r?  )rA   r   rC   tanhr  r  s       rO   re   XmodClassificationHead.forward  sY    Q1WLLOJJqMJJqMLLOMM!rQ   )r   rA   r  r  r   s   @rO   r  r    s    7I rQ   r  c                   x  ^  \ rS rSrU 4S jr\\        SS\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S	\R                  S-  S
\R                  S-  S\R                  S-  S\
\   S\\R                     \-  4S jj5       5       rSrU =r$ )XmodForQuestionAnsweringi  c                    > [         TU ]  U5        UR                  U l        [        USS9U l        [
        R                  " UR                  UR                  5      U l        U R                  5         g r  )
r4   r5   r  rk  rE  r   r   r8   
qa_outputsro  rK   s     rO   r5   !XmodForQuestionAnswering.__init__  sU      ++ 5A))F$6$68I8IJ 	rQ   NrR   r  r   r2   r.   rS   start_positionsend_positionsr   rU   c	           
         U R                   " U4UUUUUSS.U	D6n
U
S   nU R                  U5      nUR                  SSS9u  pUR                  S5      R	                  5       nUR                  S5      R	                  5       nSnUb  Ub  [        UR                  5       5      S:  a  UR                  S5      n[        UR                  5       5      S:  a  UR                  S5      nUR                  S5      nUR                  SU5      nUR                  SU5      n[        US9nU" X5      nU" X5      nUU-   S	-  n[        UUUU
R                  U
R                  S
9$ )r{  Tr  r   r%   r0   rl   N)ignore_indexr   )r  start_logits
end_logitsr   rF  )rE  r  splitr  r   lenrH   clampr   r   r   rF  )rL   rR   r  r   r2   r.   rS   r  r  r   r  r  r  r   r  
total_lossignored_indexr  
start_lossend_losss                       rO   re    XmodForQuestionAnswering.forward&  s{   & ,,	
))%'	
 	
 "!*1#)<<r<#: #++B/::<''+668

&=+D?'')*Q."1"9"9""==%%'(1, - 5 5b 9(--a0M-33A}EO)//=AM']CH!,@J
:H$x/14J+%!!//))
 	
rQ   )r  r  rE  )NNNNNNNN)rt   ru   rv   rw   r5   r"   r    rC   ry   rz   r   r   r   r{   r   re   r}   r~   r   s   @rO   r  r    s     .2,0372604263715:
##d*:
 ""T):
 ))D0	:

 ((4/:
 &&-:
 ((4/:
 ))D0:
 ''$.:
 +,:
 
u||	;	;:
  :
rQ   r  )r  r  r  r  r  r  rk  rD  )Nr   )Rrx   collections.abcr   rC   r   torch.nnr   r   r    r	   rK  activationsr
   r   cache_utilsr   r   r   
generationr   masking_utilsr   r   modeling_layersr   modeling_outputsr   r   r   r   r   r   r   r   modeling_utilsr   r   processing_utilsr   pytorch_utilsr   utilsr   r    r!   utils.genericr"   r#   utils.output_capturingr$   configuration_xmodr&   
get_loggerrt   rV  Moduler(   r{   floatr   r   r   r   r   r   r   r   r  r)  r9  rD  rk  r  r  rJ  r  r  r  r  r  __all__r^  rQ   rO   <module>r     s    $   A A & ' C C ) J 9	 	 	 G & 6 @ @ I 5 * 
		H	%g8RYY g8b !%II%<<% 
% <<	%
 LL4'% T\% % '(%:@)		 @)HI) I)XRYY $.BII $.Pryy ")) $, ,^H3* H3V%
")) %
R  40/ 40 40n M6# M6M6` 
i
)? i

i
X O
) O
 O
f , L
$7 L
L
^ a
/ a
 a
H >
!4 >
 >
DRYY , H
2 H
 H
V	rQ   