
    Z j                        S r SSKrSSKrSSKJr  SSKJrJrJr  SSKJ	r
  SSKJr  SSKJr  SS	KJrJrJrJrJrJr  SS
KJr  SSKJr  SSKJrJrJrJrJrJ r   SSK!J"r"  \ RF                  " \$5      r%Sq&S r'S r(SGS jr)SGS jr*SGS jr+S r, " S S\RZ                  R\                  5      r/ " S S\RZ                  R\                  5      r0 " S S5      r1SHS jr2S r3   SIS jr4 " S S\Rj                  5      r6 " S  S!\Rj                  5      r7 " S" S#\Rj                  5      r8 " S$ S%\Rj                  5      r9 " S& S'\Rj                  5      r: " S( S)\Rj                  5      r; " S* S+\5      r< " S, S-\Rj                  5      r= " S. S/\Rj                  5      r> " S0 S1\Rj                  5      r? " S2 S3\Rj                  5      r@\ " S4 S5\5      5       rA\ " S6 S7\A5      5       rB\ " S8 S9\A5      5       rC " S: S;\Rj                  5      rD\" S<S=9 " S> S?\A5      5       rE\ " S@ SA\A5      5       rF\ " SB SC\A5      5       rG\ " SD SE\A5      5       rH/ SFQrIg)JzPyTorch MRA model.    N)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )initialization)ACT2FN)GradientCheckpointingLayer)"BaseModelOutputWithCrossAttentionsMaskedLMOutputMultipleChoiceModelOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel)apply_chunking_to_forward)auto_docstringis_cuda_platformis_kernels_availableis_ninja_availableis_torch_cuda_availablelogging   )	MraConfigc                  T    [        5       (       d  [        S5      eSSKJn   U " S5      qg )NzFkernels is not installed, please install it with `pip install kernels`r   
get_kernelzkernels-community/mra)r   ImportErrorintegrations.hub_kernelsr   mra_cuda_kernelr   s    u/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/mra/modeling_mra.pyload_cuda_kernelsr"   3   s%    !!bcc6 !89O    c                 H   [        U R                  5       5      S:w  a  [        S5      e[        UR                  5       5      S:w  a  [        S5      eU R                  S5      S:w  a  [        S5      eU R                  S5      S:w  a  [        S5      eU R                  S	S
9R                  R                  SS	5      nUR                  5       nUR                  5       nUR                  5       n[        R                  XAX#5      u  pVUR                  SS	5      SS2SS2SSS24   nXV4$ )z0
Computes maximum values for softmax stability.
   z.sparse_qk_prod must be a 4-dimensional tensor.   'indices must be a 2-dimensional tensor.    z>The size of the second dimension of sparse_qk_prod must be 32.r   z=The size of the third dimension of sparse_qk_prod must be 32.dimN)
lensize
ValueErrormaxvalues	transpose
contiguousintr    	index_max)sparse_qk_prodindicesquery_num_blockkey_num_block
index_valsmax_valsmax_vals_scatters          r!   
sparse_maxr=   <   s    > !Q&IJJ
7<<>aBCC1#YZZ1#XYY###+22<<RDJ&&(JkkmG  "G!0!:!::P_!oH'11"b9!Qa-H%%r#   c                    [        U R                  5       5      S:w  a  [        S5      e[        UR                  5       5      S:w  a  [        S5      eU R                  S   UR                  S   :w  a  [        S5      eU R                  u  p4XB-  n[        R
                  " UR                  S5      [        R                  UR                  S9nU R                  X5U5      n XSS2S4   X-  R                  5       SS24   n U $ )zF
Converts attention mask to a sparse mask for high resolution logits.
r&   z$mask must be a 2-dimensional tensor.r'   r   zBmask and indices must have the same size in the zero-th dimension.dtypedeviceN)	r-   r.   r/   shapetorcharangelongrA   reshape)maskr7   
block_size
batch_sizeseq_len	num_block	batch_idxs          r!   sparse_maskrM   X   s     499;1?@@
7<<>aBCCzz!}a((]^^**J%IW\\!_EJJw~~VI<<
z:D!T'"W%8$>$>$@!CDDKr#   c                 f   U R                  5       u  pEnUR                  5       u  pxnXS-  S:w  a  [        S5      eX-  S:w  a  [        S5      eU R                  XEU-  X65      R                  SS5      n UR                  XHU-  X65      R                  SS5      n[	        U R                  5       5      S:w  a  [        S5      e[	        UR                  5       5      S:w  a  [        S5      e[	        UR                  5       5      S	:w  a  [        S
5      eU R                  S5      S:w  a  [        S5      eUR                  S5      S:w  a  [        S5      eU R                  5       n UR                  5       nUR                  5       nUR                  5       n[        R                  XUR                  5       5      $ )z/
Performs Sampled Dense Matrix Multiplication.
r   zTquery_size (size of first dimension of dense_query) must be divisible by block_size.Pkey_size (size of first dimension of dense_key) must be divisible by block_size.r,   r)   r%   z+dense_query must be a 4-dimensional tensor.)dense_key must be a 4-dimensional tensor.r&   r'   r   r(   z.The third dimension of dense_query must be 32.z,The third dimension of dense_key must be 32.)	r.   r/   rF   r2   r-   r3   r4   r    mm_to_sparse)	dense_query	dense_keyr7   rH   rI   
query_sizer+   _key_sizes	            r!   rQ   rQ   o   s    #."2"2"4JC ~~'A!#opp!kll%%j
2JJ\ffgikmnK!!**.DjV``aceghI
;!#FGG
9>>!DEE
7<<>aBCCb IJJ~~aBGHH((*K$$&IkkmG  "G''NNr#   c                 B   UR                  5       u  pVnXd-  S:w  a  [        S5      eU R                  S5      U:w  a  [        S5      eU R                  S5      U:w  a  [        S5      eUR                  XVU-  XG5      R                  SS5      n[	        U R                  5       5      S	:w  a  [        S
5      e[	        UR                  5       5      S	:w  a  [        S5      e[	        UR                  5       5      S:w  a  [        S5      eUR                  S5      S:w  a  [        S5      eU R                  5       n UR                  5       nUR                  5       nUR                  5       n[        R                  XX#5      nUR                  SS5      R                  XSU-  U5      nU$ )zH
Performs matrix multiplication of a sparse matrix with a dense matrix.
r   rO   r&   zQThe size of the second dimension of sparse_query must be equal to the block_size.r   zPThe size of the third dimension of sparse_query must be equal to the block_size.r,   r)   r%   ,sparse_query must be a 4-dimensional tensor.rP   r'   r(   z8The size of the third dimension of dense_key must be 32.)	r.   r/   rF   r2   r-   r3   r4   r    sparse_dense_mm)	sparse_queryr7   rS   r8   rH   rI   rV   r+   dense_qk_prods	            r!   rY   rY      s    !* 0J#!kllz)lmmz)kll!!**.DjV``aceghI
<1$GHH
9>>!DEE
7<<>aBCC~~aBSTT**,LkkmG  "G$$&I#33L9fM!++B3;;JZdHdfijMr#   c                 X    X-  U-  [         R                  " XSS9-   R                  5       $ )Nfloorrounding_mode)rC   divrE   )r7   dim_1_blockdim_2_blocks      r!   transpose_indicesrc      s*    "k1EIIgbi4jjpprrr#   c                   H    \ rS rSr\S 5       r\S 5       r\SS j5       rSrg)MraSampledDenseMatMul   c                 N    [        XX45      nU R                  XU5        X@l        U$ N)rQ   save_for_backwardrH   )ctxrR   rS   r7   rH   r6   s         r!   forwardMraSampledDenseMatMul.forward   s)    %kgRkg>#r#   c                     U R                   u  p#nU R                  nUR                  S5      U-  nUR                  S5      U-  n[        XFU5      n[	        UR                  SS5      XU5      n	[	        XX65      n
XS S 4$ Nr   r,   r)   )saved_tensorsrH   r.   rc   rY   r2   )rj   gradrR   rS   r7   rH   r8   r9   	indices_Tgrad_key
grad_querys              r!   backwardMraSampledDenseMatMul.backward   s    *-*;*;'^^
%**1-;!q)Z7%gN	"4>>"b#99S`a$TIO
T4//r#   c                 .    [         R                  XX#5      $ rh   )re   apply)rR   rS   r7   rH   s       r!   operator_call#MraSampledDenseMatMul.operator_call   s    $**;7WWr#    Nr(   	__name__
__module____qualname____firstlineno__staticmethodrk   rt   rx   __static_attributes__rz   r#   r!   re   re      s>      0 0 X Xr#   re   c                   D    \ rS rSr\S 5       r\S 5       r\S 5       rSrg)MraSparseDenseMatMul   c                 N    [        XX45      nU R                  XU5        X@l        U$ rh   )rY   ri   r8   )rj   rZ   r7   rS   r8   r6   s         r!   rk   MraSparseDenseMatMul.forward   s*    (	[lY?-r#   c                     U R                   u  p#nU R                  nUR                  S5      UR                  S5      -  n[        X5U5      n[	        UR                  SS5      XqU5      n[        XU5      n	U	S US 4$ rn   )ro   r8   r.   rc   rY   r2   rQ   )
rj   rp   rZ   r7   rS   r8   r9   rq   rr   rs   s
             r!   rt   MraSparseDenseMatMul.backward   s~    +.+<+<(y--!q)\->->r-BB%gN	"<#9#9"b#A9Tab!$7;
44//r#   c                 .    [         R                  XX#5      $ rh   )r   rw   )rZ   r7   rS   r8   s       r!   rx   "MraSparseDenseMatMul.operator_call   s    #)),\\r#   rz   Nr|   rz   r#   r!   r   r      s>      0 0 ] ]r#   r   c                   $    \ rS rSr\S 5       rSrg)MraReduceSum   c                    U R                  5       u  pEpg[        U R                  5       5      S:w  a  [        S5      e[        UR                  5       5      S:w  a  [        S5      eU R                  5       u    pvnUR                  5       u  pEU R                  SS9R	                  XE-  U5      n [
        R                  " UR                  S5      [
        R                  UR                  S9n[
        R                  " XSS	9R                  5       US S 2S 4   U-  -   R	                  XE-  5      n	[
        R                  " XB-  U4U R                  U R                  S9n
U
R                  SX5      R	                  XBU5      nUR	                  XBU-  5      nU$ )
Nr%   rX   r&   r'   r*   r   r?   r]   r^   )r.   r-   r/   sumrF   rC   rD   rE   rA   r`   zerosr@   	index_add)rZ   r7   r8   r9   rI   rK   rH   rU   rL   global_idxestempoutputs               r!   rx   MraReduceSum.operator_call   sb   /;/@/@/B,
z|  "#q(KLLw||~!#FGG*//11! '
#''A'.66z7MzZLLa

7>>Z	IIgGDIIKiXY[_X_N`crNrr
'*(
) 	 {{):6l>P>PYeYlYl
 <>FFzdno
j,HIr#   rz   N)r}   r~   r   r   r   rx   r   rz   r#   r!   r   r      s     r#   r   c                    U R                  5       u  pVnXb-  nSn	Ub  UR                  XXU5      R                  SS9n
U R                  XXX'5      R                  SS9U
SS2SS2S4   S-   -  nUR                  XXX'5      R                  SS9U
SS2SS2S4   S-   -  nUb/  UR                  XXX'5      R                  SS9U
SS2SS2S4   S-   -  n	OU[        R                  " XX[        R
                  U R                  S9-  n
U R                  XXX'5      R                  SS9nUR                  XXX'5      R                  SS9nUb  UR                  XXX'5      R                  SS9n	[        R                  " XR                  SS5      5      [        R                  " U5      -  nUR                  SSS9R                  nUb0  US	U
SS2SSS24   U
SS2SS2S4   -  S
:  R                  5       -  -
  nXX4$ )z'
Compute low resolution approximation.
Nr,   r*   r)   ư>r?   T)r+   keepdims     @g      ?)r.   rF   r   rC   onesfloatrA   meanmatmulr2   mathsqrtr0   r1   )querykeyrH   rG   valuerI   rJ   head_dimnum_block_per_row	value_hattoken_count	query_hatkey_hatlow_resolution_logitlow_resolution_logit_row_maxs                  r!   get_low_resolution_logitr     s    %*JJL!J-Ill:*MQQVXQYMM*VZZ_aZb1d
#d*
	 ++jZRVV[]V^1d
#d*
 jZZ^^ce^fAq$J'$.I !5::jSXS^S^glgsgs#ttMM*V[[`b[c	++jZRWW\^W_jZZ__df_gI <<	3D3DR3LMPTPYPYZbPcc#7#;#;T#;#R#Y#Y  3;q$z+B[QRTUW[Q[E\+\`c*c)j)j)l#ll 	  .JUUr#   c                    U R                   u  pVnUS:  a]  US-  n[        R                  " XfU R                  S9n	[        R                  " [        R
                  " X* S9US9n
X
SSS2SS24   S-  -   n US:  a:  U SS2SU2SS24   S-   U SS2SU2SS24'   U SS2SS2SU24   S-   U SS2SS2SU24'   [        R                  " U R                  US5      USSS	S
9nUR                  nUS:X  a@  UR                  R                  SS9R                  nXSS2SS4   :  R                  5       nX4$ US:X  a  SnX4$ [        U S35      e)zR
Compute the indices of the subset of components to be used in the approximation.
r   r&   rA   )diagonalNg     @r,   TF)r+   largestsortedfullr*   sparsez# is not a valid approx_model value.)rB   rC   r   rA   triltriutopkrF   r7   r1   minr   r/   )r   
num_blocksapprox_modeinitial_prior_first_n_blocksinitial_prior_diagonal_n_blocksrI   total_blocks_per_rowrU   offset	temp_maskdiagonal_mask
top_k_valsr7   	thresholdhigh_resolution_masks                  r!   get_block_idxesr   7  s    +?*D*D'Ja&*0A5JJ3RfRmRmn	

5::i'#JU[\3D!QJ6ORU6UU#a' $A%A$A1!DEK 	Q =!= =q@A !A'D(D'D!DEK 	Q#@$@#@@A $$Z4jbRV_dJ   Gf%%))b)188	 4!T4-8P PWWY (( 
	 # (( K=(KLMMr#   c	                    [         c$  [        R                  " U 5      R                  5       $ U R	                  5       u  ppX-  nX-  S:w  a  [        S5      eX-  nU R                  XU5      n UR                  XU5      nUR                  XU5      nUb*  XSS2SS2S4   -  n XSS2SS2S4   -  nX#SS2SS2S4   -  nUS:X  a  [        XXcU5      u  nnnnOAUS:X  a0  [        R                  " 5          [        XXc5      u  nnnnSSS5        O[        S5      e[        R                  " 5          WW-
  n[        UUUUU5      u  nnSSS5        [        R                  XWUS9[        R                  " U5      -  n[        UUX5      u  nnUU-
  nUb"  USS	[!        UU5      SS2SS2SS2S4   -
  -  -
  n[        R"                  " U5      n[$        R                  UUX.5      n[&        R                  UUX5      nUS:X  Gax  [        R"                  " WW-
  SW-  -
  5      WSS2SSS24   -  n[        R(                  " UW5      SS2SS2SSS24   R+                  S	S	US	5      R                  XU5      nUR-                  S
S9SS2SS2S4   R+                  S	S	U5      R                  X5      nUR+                  S	S	U5      R                  X5      U-
  n Ub  U U-  n [        R"                  " U U S:*  R/                  5       -  5      n!UU!SS2SS2S4   -  nUU!-  n[        R"                  " U * U S:  R/                  5       -  5      n"UU"SS2SS2S4   -  nUU"-  nUU-   USS2SS2S4   USS2SS2S4   -   S-   -  n#O$US:X  a  UUSS2SS2S4   S-   -  n#O[        S5      eUb  U#USS2SS2S4   -  n#U#R                  XX5      n#U#$ ! , (       d  f       GN= f! , (       d  f       GN= f)z(
Use Mra to approximate self-attention.
Nr   z4sequence length must be divisible by the block_size.r   r   z&approx_mode must be "full" or "sparse")rH   r   r   r,   r*   r   z-config.approx_mode must be "full" or "sparse")r    rC   
zeros_likerequires_grad_r.   r/   rF   r   no_grad	Exceptionr   re   rx   r   r   r=   rM   expr   r   r   repeatr   r   )$r   r   r   rG   r   r   rH   r   r   rI   num_headrJ   r   
meta_batchr   r   r   r   r   rU   low_resolution_logit_normalizedr7   r   high_resolution_logitr;   r<   high_resolution_attnhigh_resolution_attn_outhigh_resolution_normalizerlow_resolution_attnlow_resolution_attn_outlow_resolution_normalizerlog_correctionlow_resolution_corrhigh_resolution_corrcontext_layers$                                       r!   mra2_attentionr   ]  sq    &5577.3jjl+J'&Jq OPP-MM*x8E
++j8
4CMM*x8EQ4Z((At$$Q4Z((fUm
%V
Rk+G 
	 ]]_QiJRN +/KQ _
 @AA	*>A]*]'(7+(+)
%% 
 2??G
 @ 		( ",,A7L]!qH14DD 5q;tU\C]^_abdegk^kCl?l8m m 99%:;3AAgu  ".!;!;g'8" fII*-IICRfLffg!T1*%& 	 LL,i8AtQGVAq*a(WZ(3 	   ###+Aq$J7>>q!ZPXXYcm 	" 6<<Q:NVVWaknvv+d2N#ii.A:M9T9T9V(VW"9<OPQSTVZPZ<["[$=@S$S!$yy.NQ<N;U;U;W)WX#;>RSTVWY]S]>^#^ %?BV%V"14KK&q!Tz25NqRSUYz5ZZ]aa
 
	 04NqRSUYz4Z]a4abGHH%Q4Z(88!))*RMS _ 
s   1N?,O?
O
O c                   6   ^  \ rS rSrSrU 4S jrSS jrSrU =r$ )MraEmbeddingsi  zGConstruct the embeddings from word, position and token_type embeddings.c           	      B  > [         TU ]  5         [        R                  " UR                  UR
                  UR                  S9U l        [        R                  " UR                  S-   UR
                  5      U l	        [        R                  " UR                  UR
                  5      U l        [        R                  " UR
                  UR                  S9U l        [        R                  " UR                  5      U l        U R#                  S[$        R&                  " UR                  5      R)                  S5      S-   5        U R#                  S[$        R*                  " U R,                  R/                  5       [$        R0                  U R,                  R2                  S9SS	9  g )
N)padding_idxr&   epsposition_idsr   r,   token_type_idsr?   F)
persistent)super__init__r   	Embedding
vocab_sizehidden_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutregister_bufferrC   rD   expandr   r   r.   rE   rA   selfconfig	__class__s     r!   r   MraEmbeddings.__init__  s*   !||F,=,=v?Q?Q_e_r_rs#%<<0N0NQR0RTZTfTf#g %'\\&2H2H&J\J\%]"f&8&8f>S>STzz&"<"<= 	^U\\&:X:X-Y-`-`ah-ilm-mnKK))..0

4K\K\KcKcd 	 	
r#   c                 @   Ub  UR                  5       nOUR                  5       S S nUS   nUc  U R                  S S 2S U24   nUcv  [        U S5      (       a-  U R                  S S 2S U24   nUR	                  US   U5      nUnO8[
        R                  " U[
        R                  U R                  R                  S9nUc  U R                  U5      nU R                  U5      n	XI-   n
U R                  U5      nX-  n
U R                  U
5      n
U R                  U
5      n
U
$ )Nr,   r   r   r   r?   )r.   r   hasattrr   r   rC   r   rE   rA   r   r   r   r   r   )r   	input_idsr   r   inputs_embedsinput_shape
seq_lengthbuffered_token_type_ids buffered_token_type_ids_expandedr   
embeddingsr   s               r!   rk   MraEmbeddings.forward  s,    #..*K',,.s3K ^
,,Q^<L
 !t-..*.*=*=a*n*M'3J3Q3QR]^_R`bl3m0!A!&[

SWSdSdSkSk!l  00;M $ : :> J":
"66|D)
^^J/
\\*-
r#   )r   r   r   r   r   )NNNN	r}   r~   r   r   __doc__r   rk   r   __classcell__r   s   @r!   r   r     s    Q
"   r#   r   c                   2   ^  \ rS rSrU 4S jrSS jrSrU =r$ )MraSelfAttentioni  c                   > [         TU ]  5         UR                  UR                  -  S:w  a7  [	        US5      (       d&  [        SUR                   SUR                   S35      e[        S Ln[        5       (       a0  [        5       (       a!  [        5       (       a  U(       d   [        5         UR                  U l        [        UR                  UR                  -  5      U l        U R                  U R                  -  U l        ["        R$                  " UR                  U R                   5      U l        ["        R$                  " UR                  U R                   5      U l        ["        R$                  " UR                  U R                   5      U l        ["        R,                  " UR.                  5      U l        UR2                  S-  UR4                  -  U l        [9        U R6                  [        UR2                  S-  S-  5      5      U l        UR:                  U l        UR<                  U l        UR>                  U l        g ! [         a#  n[        R                  SU 35         S nAGNS nAff = f)	Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ()zGCould not load the custom kernel for multi-scale deformable attention: r(   r&   ) r   r   r   num_attention_headsr   r/   r    r   r   r   r"   r   loggerwarningr4   attention_head_sizeall_head_sizer   Linearr   r   r   r   attention_probs_dropout_probr   r   block_per_rowrK   r   r   r   r   )r   r   kernel_loadeder   s       r!   r   MraSelfAttention.__init__  s    : ::a?PVXhHiHi#F$6$6#7 8 445Q8 
 (t3"$$)9););@R@T@T]jn!# $*#=#= #&v'9'9F<V<V'V#W !558P8PPYYv1143E3EF
99V//1C1CDYYv1143E3EF
zz&"E"EF 88B>&BVBVVT^^S&2P2PTV2V[\1\-]^!--,2,O,O)/5/U/U,%  n!hijhklmmns   !
H? ?
I,	I''I,c                 @   UR                   u  p4nU R                  U5      R                  USU R                  U R                  5      R                  SS5      nU R                  U5      R                  USU R                  U R                  5      R                  SS5      nU R                  U5      R                  USU R                  U R                  5      R                  SS5      nSUS-  -   nUR                  5       R                  SU R                  S5      R                  X0R                  -  U5      R                  5       nSn	U R                  U	:  a  X0R                  XIU R                  -
  4n
[        R                  " U[        R                  " XR                  S9/SS9n[        R                  " U[        R                  " XR                  S9/SS9n[        R                  " U[        R                  " XR                  S9/SS9n[!        UR#                  5       UR#                  5       UR#                  5       UR#                  5       U R$                  U R&                  U R(                  U R*                  S	9nU R                  U	:  a  US S 2S S 2S S 2S U R                  24   nUR                  X0R                  X@R                  5      nUR-                  S
SSS5      R/                  5       nUR1                  5       S S U R2                  4-   nUR                  " U6 nU4nU$ )Nr,   r   r&   g      ?r   r(   r   r*   )r   r   r   r   r   r)   )rB   r   viewr  r  r2   r   r   squeezer   rF   r4   rC   catr   rA   r   r   rK   r   r   r   permuter3   r.   r  )r   hidden_statesattention_maskrI   rJ   rU   query_layer	key_layervalue_layergpu_warp_sizepad_sizer   new_context_layer_shapeoutputss                 r!   rk   MraSelfAttention.forward(  s   !.!4!4
QJJ}%T*b$":":D<T<TUYq!_ 	 HH]#T*b$":":D<T<TUYq!_ 	 JJ}%T*b$":":D<T<TUYq!_ 	 ~77""$VAt//3WZ":"::GDSU	 	 ##m3!#;#;WVZVnVnFnnH))[%++hOaOa2b$ciklK		9ekk(K[K[.\"]cefI))[%++hOaOa2b$ciklK&OO  "NN(()-)J)J,0,P,P	
 ##m3)!Q3MT5M5M3M*MNM%--j:R:RT[]u]uv%--aAq9DDF"/"4"4"6s";t?Q?Q>S"S%**,CD "r#   )r  r   r  r   r   r   r   r  rK   r   r   rh   r}   r~   r   r   r   rk   r   r  r  s   @r!   r
  r
    s    V@< <r#   r
  c                   z   ^  \ rS rSrU 4S jrS\R                  S\R                  S\R                  4S jrSrU =r	$ )MraSelfOutputih  c                 (  > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " UR                  UR                  S9U l        [        R                  " UR                  5      U l
        g Nr   )r   r   r   r  r   denser   r   r   r   r   r   s     r!   r   MraSelfOutput.__init__i  s`    YYv1163E3EF
f&8&8f>S>STzz&"<"<=r#   r  input_tensorreturnc                 p    U R                  U5      nU R                  U5      nU R                  X-   5      nU$ rh   r-  r   r   r   r  r/  s      r!   rk   MraSelfOutput.forwardo  5    

=1]3}'CDr#   r   r-  r   
r}   r~   r   r   r   rC   Tensorrk   r   r  r  s   @r!   r*  r*  h  6    >U\\  RWR^R^  r#   r*  c                   2   ^  \ rS rSrU 4S jrSS jrSrU =r$ )MraAttentioniv  c                 b   > [         TU ]  5         [        U5      U l        [	        U5      U l        g rh   )r   r   r
  r   r*  r   r   s     r!   r   MraAttention.__init__w  s&    $V,	#F+r#   c                 d    U R                  X5      nU R                  US   U5      nU4USS  -   nU$ Nr   r   )r   r   )r   r  r  self_outputsattention_outputr&  s         r!   rk   MraAttention.forward|  s>    yy?;;|AF#%QR(88r#   )r   r   rh   r(  r  s   @r!   r;  r;  v  s    ,
 r#   r;  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )MraIntermediatei  c                   > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        UR                  [        5      (       a  [        UR                     U l        g UR                  U l        g rh   )r   r   r   r  r   intermediate_sizer-  
isinstance
hidden_actstrr	   intermediate_act_fnr   s     r!   r   MraIntermediate.__init__  s`    YYv1163K3KL
f''--'-f.?.?'@D$'-'8'8D$r#   r  r0  c                 J    U R                  U5      nU R                  U5      nU$ rh   r-  rJ  r   r  s     r!   rk   MraIntermediate.forward  s&    

=100?r#   rM  r7  r  s   @r!   rD  rD    s(    9U\\ ell  r#   rD  c                   z   ^  \ rS rSrU 4S jrS\R                  S\R                  S\R                  4S jrSrU =r	$ )	MraOutputi  c                 (  > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        R                  " UR
                  UR                  S9U l        [        R                  " UR                  5      U l        g r,  )r   r   r   r  rF  r   r-  r   r   r   r   r   r   s     r!   r   MraOutput.__init__  s`    YYv779K9KL
f&8&8f>S>STzz&"<"<=r#   r  r/  r0  c                 p    U R                  U5      nU R                  U5      nU R                  X-   5      nU$ rh   r2  r3  s      r!   rk   MraOutput.forward  r5  r#   r6  r7  r  s   @r!   rQ  rQ    r9  r#   rQ  c                   8   ^  \ rS rSrU 4S jrSS jrS rSrU =r$ )MraLayeri  c                    > [         TU ]  5         UR                  U l        SU l        [	        U5      U l        UR                  U l        [        U5      U l        [        U5      U l
        g Nr   )r   r   chunk_size_feed_forwardseq_len_dimr;  	attentionadd_cross_attentionrD  intermediaterQ  r   r   s     r!   r   MraLayer.__init__  sW    '-'E'E$%f-#)#=#= +F3'r#   c                     U R                  X5      nUS   nUSS  n[        U R                  U R                  U R                  U5      nU4U-   nU$ r?  )r\  r   feed_forward_chunkrZ  r[  )r   r  r  self_attention_outputsrA  r&  layer_outputs          r!   rk   MraLayer.forward  sa    !%!N1!4(,0##T%A%A4CSCSUe
  /G+r#   c                 J    U R                  U5      nU R                  X!5      nU$ rh   )r^  r   )r   rA  intermediate_outputrc  s       r!   ra  MraLayer.feed_forward_chunk  s)    "//0@A{{#6Ir#   )r]  r\  rZ  r^  r   r[  rh   )	r}   r~   r   r   r   rk   ra  r   r  r  s   @r!   rW  rW    s    ( r#   rW  c                   8   ^  \ rS rSrU 4S jr   SS jrSrU =r$ )
MraEncoderi  c                    > [         TU ]  5         Xl        [        R                  " [        UR                  5       Vs/ s H  n[        U5      PM     sn5      U l        SU l	        g s  snf )NF)
r   r   r   r   
ModuleListrangenum_hidden_layersrW  layergradient_checkpointing)r   r   rU   r   s      r!   r   MraEncoder.__init__  sR    ]]eFD\D\>]#^>]HV$4>]#^_
&+# $_s   A&c                     U(       a  SOS n[        U R                  5       H  u  pgU(       a  XQ4-   nU" X5      nUS   nM      U(       a  XQ4-   nU(       d  [        S X4 5       5      $ [        UUS9$ )Nrz   r   c              3   .   #    U  H  oc  M  Uv   M     g 7frh   rz   ).0vs     r!   	<genexpr>%MraEncoder.forward.<locals>.<genexpr>  s     X$Fq$Fs   	)last_hidden_stater  )	enumeratern  tupler   )	r   r  r  output_hidden_statesreturn_dictall_hidden_statesilayer_modulelayer_outputss	            r!   rk   MraEncoder.forward  s     #7BD(4OA#$58H$H!(GM)!,M  5   14D DX]$FXXX1++
 	
r#   )r   ro  rn  )NFTr(  r  s   @r!   ri  ri    s    , "
 
r#   ri  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )MraPredictionHeadTransformi  c                 p  > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        UR                  [        5      (       a  [        UR                     U l
        OUR                  U l
        [        R                  " UR                  UR                  S9U l        g r,  )r   r   r   r  r   r-  rG  rH  rI  r	   transform_act_fnr   r   r   s     r!   r   #MraPredictionHeadTransform.__init__  s~    YYv1163E3EF
f''--$*6+<+<$=D!$*$5$5D!f&8&8f>S>STr#   r  r0  c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ rh   )r-  r  r   rN  s     r!   rk   "MraPredictionHeadTransform.forward  s4    

=1--m<}5r#   )r   r-  r  r7  r  s   @r!   r  r    s)    UU\\ ell  r#   r  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )MraLMPredictionHeadi  c                   > [         TU ]  5         [        U5      U l        [        R
                  " UR                  UR                  SS9U l        [        R                  " [        R                  " UR                  5      5      U l        g )NT)bias)r   r   r  	transformr   r  r   r   decoder	ParameterrC   r   r  r   s     r!   r   MraLMPredictionHead.__init__  s[    3F; yy!3!3V5F5FTRLLV->->!?@	r#   c                 J    U R                  U5      nU R                  U5      nU$ rh   )r  r  rN  s     r!   rk   MraLMPredictionHead.forward  s$    }5]3r#   )r  r  r  r(  r  s   @r!   r  r    s    A r#   r  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )MraOnlyMLMHeadi  c                 B   > [         TU ]  5         [        U5      U l        g rh   )r   r   r  predictionsr   s     r!   r   MraOnlyMLMHead.__init__  s    .v6r#   sequence_outputr0  c                 (    U R                  U5      nU$ rh   r  )r   r  prediction_scoress      r!   rk   MraOnlyMLMHead.forward  s     ,,_=  r#   r  r7  r  s   @r!   r  r    s(    7!u|| ! ! !r#   r  c                      ^  \ rS rSr% \\S'   SrSr\R                  " 5       S\
R                  4U 4S jj5       rSrU =r$ )MraPreTrainedModeli  r   mraTmodulec                   > [         TU ]  U5        [        U[        5      (       a!  [        R
                  " UR                  5        g[        U[        5      (       a  [        R                  " UR                  [        R                  " UR                  R                  S   5      R                  S5      S-   5        [        R
                  " UR                  5        gg)zInitialize the weightsr,   r   r&   N)r   _init_weightsrG  r  initzeros_r  r   copy_r   rC   rD   rB   r   r   )r   r  r   s     r!   r   MraPreTrainedModel._init_weights  s     	f%f122KK$..JJv**ELL9L9L9R9RSU9V,W,^,^_f,gjk,klKK--. /r#   rz   )r}   r~   r   r   r   __annotations__base_model_prefixsupports_gradient_checkpointingrC   r   r   Moduler  r   r  r  s   @r!   r  r    s:     &*#
]]_/BII / /r#   r  c                     ^  \ rS rSrU 4S jrS rS r\       SS\R                  S-  S\R                  S-  S\R                  S-  S	\R                  S-  S
\R                  S-  S\
S-  S\
S-  S\\-  4S jj5       rSrU =r$ )MraModeli"  c                    > [         TU ]  U5        Xl        [        U5      U l        [        U5      U l        U R                  5         g rh   )r   r   r   r   r  ri  encoder	post_initr   s     r!   r   MraModel.__init__$  s9     '/!&) 	r#   c                 .    U R                   R                  $ rh   r  r   r   s    r!   get_input_embeddingsMraModel.get_input_embeddings.  s    ...r#   c                 $    XR                   l        g rh   r  )r   r   s     r!   set_input_embeddingsMraModel.set_input_embeddings1  s    */'r#   Nr   r  r   r   r   rz  r{  r0  c                 `   Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  Ub  [        S5      eUb"  U R	                  X5        UR                  5       n	O"Ub  UR                  5       S S n	O[        S5      eU	u  pUb  UR                  OUR                  nUc  [        R                  " X4US9nUcr  [        U R                  S5      (       a3  U R                  R                  S S 2S U24   nUR                  X5      nUnO$[        R                  " U	[        R                  US9nU R                  X)5      nU R                  UUUUS9nU R!                  UUUUS9nUS	   nU(       d	  U4US
S  -   $ [#        UUR$                  UR&                  UR(                  S9$ )NzDYou cannot specify both input_ids and inputs_embeds at the same timer,   z5You have to specify either input_ids or inputs_embedsr   r   r?   )r   r   r   r   )r  rz  r{  r   r   )rw  r  
attentionscross_attentions)r   rz  r{  r/   %warn_if_padding_and_no_attention_maskr.   rA   rC   r   r   r  r   r   r   rE   get_extended_attention_maskr  r   r  r  r  )r   r   r  r   r   r   rz  r{  kwargsr   rI   r   rA   r  r  extended_attention_maskembedding_outputencoder_outputsr  s                      r!   rk   MraModel.forward4  s    %9$D $++JjJj 	 &1%<k$++BYBY ]%>cdd"66yQ#..*K&',,.s3KTUU!,
%.%:!!@T@T!"ZZ*)A6RN!t(899*.//*H*HKZK*X'3J3Q3QR\3i0!A!&[

SY!Z 150P0PQ_0m??%)'	 + 
 ,,2!5#	 ' 
 *!,#%(;;;1-)77&11,==	
 	
r#   )r   r  r  )NNNNNNN)r}   r~   r   r   r   r  r  r   rC   r8  boolry  r   rk   r   r  r  s   @r!   r  r  "  s    /0  *..2.2,0-1,0#'B
<<$&B
 t+B
 t+	B

 llT)B
 ||d*B
 #TkB
 D[B
 
3	3B
 B
r#   r  c                   8  ^  \ rS rSrSSS.rU 4S jrS rS r\        SS	\	R                  S-  S
\	R                  S-  S\	R                  S-  S\	R                  S-  S\	R                  S-  S\	R                  S-  S\S-  S\S-  S\\-  4S jj5       rSrU =r$ )MraForMaskedLMiz  zcls.predictions.biasz%mra.embeddings.word_embeddings.weight)zcls.predictions.decoder.biaszcls.predictions.decoder.weightc                    > [         TU ]  U5        [        U5      U l        [	        U5      U l        U R                  5         g rh   )r   r   r  r  r  clsr  r   s     r!   r   MraForMaskedLM.__init__  s4     F#!&) 	r#   c                 B    U R                   R                  R                  $ rh   )r  r  r  r  s    r!   get_output_embeddings$MraForMaskedLM.get_output_embeddings  s    xx##+++r#   c                     XR                   R                  l        UR                  U R                   R                  l        g rh   )r  r  r  r  )r   new_embeddingss     r!   set_output_embeddings$MraForMaskedLM.set_output_embeddings  s*    '5$$2$7$7!r#   Nr   r  r   r   r   labelsrz  r{  r0  c	           
         Ub  UOU R                   R                  nU R                  UUUUUUUS9n
U
S   nU R                  U5      nSnUbF  [	        5       nU" UR                  SU R                   R                  5      UR                  S5      5      nU(       d  U4U
SS -   nUb  U4U-   $ U$ [        UUU
R                  U
R                  S9$ )a{  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
    config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
    loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
Nr  r   r   r   rz  r{  r   r,   r   losslogitsr  r  )
r   r{  r  r  r   r  r   r   r  r  )r   r   r  r   r   r   r  rz  r{  r  r&  r  r  masked_lm_lossloss_fctr   s                   r!   rk   MraForMaskedLM.forward  s    & &1%<k$++BYBY(())%'!5#  
 "!* HH_5')H%&7&<&<RAWAW&XZ`ZeZefhZijN')GABK7F3A3M^%.YSYY$!//))	
 	
r#   )r  r  NNNNNNNN)r}   r~   r   r   _tied_weights_keysr   r  r  r   rC   r8  r  ry  r   rk   r   r  r  s   @r!   r  r  z  s     )?*Q
,8  *..2.2,0-1&*,0#'/
<<$&/
 t+/
 t+	/

 llT)/
 ||d*/
 t#/
 #Tk/
 D[/
 
	/
 /
r#   r  c                   2   ^  \ rS rSrSrU 4S jrS rSrU =r$ )MraClassificationHeadi  z-Head for sentence-level classification tasks.c                 8  > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " UR                  5      U l        [        R                  " UR                  UR                  5      U l
        Xl        g rh   )r   r   r   r  r   r-  r   r   r   
num_labelsout_projr   r   s     r!   r   MraClassificationHead.__init__  se    YYv1163E3EF
zz&"<"<=		&"4"4f6G6GHr#   c                     US S 2SS S 24   nU R                  U5      nU R                  U5      n[        U R                  R                     " U5      nU R                  U5      nU R                  U5      nU$ )Nr   )r   r-  r	   r   rH  r  )r   featuresr  xs       r!   rk   MraClassificationHead.forward  se    Q1WLLOJJqM4;;))*1-LLOMM!r#   )r   r-  r   r  r  r  s   @r!   r  r    s    7 r#   r  z
    MRA Model transformer with a sequence classification/regression head on top (a linear layer on top of
    the pooled output) e.g. for GLUE tasks.
    )custom_introc                   "  ^  \ rS rSrU 4S jr\        SS\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S	\R                  S-  S
\S-  S\S-  S\	\
-  4S jj5       rSrU =r$ )MraForSequenceClassificationi  c                    > [         TU ]  U5        UR                  U l        [        U5      U l        [        U5      U l        U R                  5         g rh   )r   r   r  r  r  r  
classifierr  r   s     r!   r   %MraForSequenceClassification.__init__  sA      ++F#/7 	r#   Nr   r  r   r   r   r  rz  r{  r0  c	           
      ,   Ub  UOU R                   R                  nU R                  UUUUUUUS9n
U
S   nU R                  U5      nSnUGb  U R                   R                  c  U R
                  S:X  a  SU R                   l        OoU R
                  S:  aN  UR                  [        R                  :X  d  UR                  [        R                  :X  a  SU R                   l        OSU R                   l        U R                   R                  S:X  aI  [        5       nU R
                  S:X  a&  U" UR                  5       UR                  5       5      nOU" X5      nOU R                   R                  S:X  a=  [        5       nU" UR                  SU R
                  5      UR                  S5      5      nO,U R                   R                  S:X  a  [        5       nU" X5      nU(       d  U4U
SS -   nUb  U4U-   $ U$ [        UUU
R                   U
R"                  S	9$ )
ae  
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
Nr  r   r   
regressionsingle_label_classificationmulti_label_classificationr,   r  )r   r{  r  r  problem_typer  r@   rC   rE   r4   r   r  r   r  r   r   r  r  )r   r   r  r   r   r   r  rz  r{  r  r&  r  r  r  r  r   s                   r!   rk   $MraForSequenceClassification.forward  s   & &1%<k$++BYBY(())%'!5#  
 "!*1{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#FNN$4fnn6FGD#F3D))-JJ+-B @&++b/R))-II,./Y,F)-)9TGf$EvE'!//))	
 	
r#   )r  r  r  r  )r}   r~   r   r   r   r   rC   r8  r  ry  r   rk   r   r  r  s   @r!   r  r    s      *..2.2,0-1&*,0#'@
<<$&@
 t+@
 t+	@

 llT)@
 ||d*@
 t#@
 #Tk@
 D[@
 
)	)@
 @
r#   r  c                   "  ^  \ rS rSrU 4S jr\        SS\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S	\R                  S-  S
\S-  S\S-  S\	\
-  4S jj5       rSrU =r$ )MraForMultipleChoicei.  c                   > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  UR                  5      U l        [        R
                  " UR                  S5      U l        U R                  5         g rY  )
r   r   r  r  r   r  r   pre_classifierr  r  r   s     r!   r   MraForMultipleChoice.__init__0  s_     F# ii(:(:F<N<NO))F$6$6: 	r#   Nr   r  r   r   r   r  rz  r{  r0  c	           
         Ub  UOU R                   R                  nUb  UR                  S   OUR                  S   n
Ub!  UR                  SUR	                  S5      5      OSnUb!  UR                  SUR	                  S5      5      OSnUb!  UR                  SUR	                  S5      5      OSnUb!  UR                  SUR	                  S5      5      OSnUb1  UR                  SUR	                  S5      UR	                  S5      5      OSnU R                  UUUUUUUS9nUS   nUSS2S4   nU R                  U5      n[        R                  " 5       " U5      nU R                  U5      nUR                  SU
5      nSnUb  [        5       nU" X5      nU(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  S9$ )a  
input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
    Indices of input sequence tokens in the vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are input IDs?](../glossary#input-ids)
token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
    Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
    1]`:

    - 0 corresponds to a *sentence A* token,
    - 1 corresponds to a *sentence B* token.

    [What are token type IDs?](../glossary#token-type-ids)
position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
    Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
    config.max_position_embeddings - 1]`.

    [What are position IDs?](../glossary#position-ids)
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
    Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
    is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
    model's internal embedding lookup matrix.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
    num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
    `input_ids` above)
Nr   r,   r)   r  r   r  )r   r{  rB   r  r.   r  r  r   ReLUr  r   r   r  r  )r   r   r  r   r   r   r  rz  r{  r  num_choicesr&  hidden_statepooled_outputr  reshaped_logitsr  r  r   s                      r!   rk   MraForMultipleChoice.forward:  s   V &1%<k$++BYBY,5,Aiooa(}GZGZ[\G]>G>SINN2y~~b'9:Y]	M[Mg,,R1D1DR1HImqM[Mg,,R1D1DR1HImqGSG_|((\->->r-BCei ( r=#5#5b#9=;M;Mb;QR 	 (())%'!5#  
 qz$QT*++M:	-0/ ++b+6')HO4D%''!"+5F)-)9TGf$EvE("!//))	
 	
r#   )r  r  r  r  )r}   r~   r   r   r   r   rC   r8  r  ry  r   rk   r   r  r  s   @r!   r  r  .  s      *..2.2,0-1&*,0#'W
<<$&W
 t+W
 t+	W

 llT)W
 ||d*W
 t#W
 #TkW
 D[W
 
*	*W
 W
r#   r  c                   "  ^  \ rS rSrU 4S jr\        SS\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S	\R                  S-  S
\S-  S\S-  S\	\
-  4S jj5       rSrU =r$ )MraForTokenClassificationi  c                 0  > [         TU ]  U5        UR                  U l        [        U5      U l        [
        R                  " UR                  5      U l        [
        R                  " UR                  UR                  5      U l        U R                  5         g rh   )r   r   r  r  r  r   r   r   r   r  r   r  r  r   s     r!   r   "MraForTokenClassification.__init__  si      ++F#zz&"<"<=))F$6$68I8IJ 	r#   Nr   r  r   r   r   r  rz  r{  r0  c	           
         Ub  UOU R                   R                  nU R                  UUUUUUUS9n
U
S   nU R                  U5      nU R	                  U5      nSnUb  [        5       nUb  UR                  S5      S:H  nUR                  SU R                  5      n[        R                  " XR                  S5      [        R                  " UR                  5      R                  U5      5      nU" UU5      nO2U" UR                  SU R                  5      UR                  S5      5      nU(       d  U4U
SS -   nUb  U4U-   $ U$ [        UUU
R                  U
R                  S9$ )z
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
Nr  r   r,   r   r  )r   r{  r  r   r  r   r  r  rC   wheretensorignore_indextype_asr   r  r  )r   r   r  r   r   r   r  rz  r{  r  r&  r  r  r  r  active_lossactive_logitsactive_labelsr   s                      r!   rk   !MraForTokenClassification.forward  sc   " &1%<k$++BYBY(())%'!5#  
 "!*,,71')H),11"5: &B @ %R%,,x?T?T2U2]2]^d2e!  }=B @&++b/RY,F)-)9TGf$EvE$!//))	
 	
r#   )r  r   r  r  r  )r}   r~   r   r   r   r   rC   r8  r  ry  r   rk   r   r  r  s   @r!   r  r    s    	  *..2.2,0-1&*,0#'8
<<$&8
 t+8
 t+	8

 llT)8
 ||d*8
 t#8
 #Tk8
 D[8
 
&	&8
 8
r#   r  c                   B  ^  \ rS rSrU 4S jr\         SS\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S	\R                  S-  S
\R                  S-  S\S-  S\S-  S\	\
-  4S jj5       rSrU =r$ )MraForQuestionAnsweringi  c                    > [         TU ]  U5        SUl        UR                  U l        [        U5      U l        [
        R                  " UR                  UR                  5      U l        U R                  5         g )Nr&   )
r   r   r  r  r  r   r  r   
qa_outputsr  r   s     r!   r    MraForQuestionAnswering.__init__  s[      ++F#))F$6$68I8IJ 	r#   Nr   r  r   r   r   start_positionsend_positionsrz  r{  r0  c
           
         U	b  U	OU R                   R                  n	U R                  UUUUUUU	S9nUS   nU R                  U5      nUR	                  SSS9u  pUR                  S5      nUR                  S5      nS nUb  Ub  [        UR                  5       5      S:  a  UR                  S5      n[        UR                  5       5      S:  a  UR                  S5      nUR                  S5      nUR                  SU5      nUR                  SU5      n[        US9nU" X5      nU" X5      nUU-   S-  nU	(       d  X4USS  -   nUb  U4U-   $ U$ [        UUUUR                  UR                  S9$ )	Nr  r   r   r,   r*   )r  r&   )r  start_logits
end_logitsr  r  )r   r{  r  r  splitr  r-   r.   clampr   r   r  r  )r   r   r  r   r   r   r  r  rz  r{  r  r&  r  r  r  r  
total_lossignored_indexr  
start_lossend_lossr   s                         r!   rk   MraForQuestionAnswering.forward  s    &1%<k$++BYBY(())%'!5#  
 "!*1#)<<r<#: #++B/''+

&=+D?'')*Q."1"9"9""==%%'(1, - 5 5b 9(--a0M-33A}EO)//=AM']CH!,@J
:H$x/14J"/'!"+=F/9/EZMF*Q6Q+%!!//))
 	
r#   )r  r  r  )	NNNNNNNNN)r}   r~   r   r   r   r   rC   r8  r  ry  r   rk   r   r  r  s   @r!   r
  r
    s    
  *..2.2,0-1/3-1,0#';
<<$&;
 t+;
 t+	;

 llT);
 ||d*;
 ,;
 ||d*;
 #Tk;
 D[;
 
-	-;
 ;
r#   r
  )r  r  r
  r  r  rW  r  r  r{   )NN)r(   r   r   )Jr  r   rC   r   torch.nnr   r   r    r   r  activationsr	   modeling_layersr
   modeling_outputsr   r   r   r   r   r   modeling_utilsr   pytorch_utilsr   utilsr   r   r   r   r   r   configuration_mrar   
get_loggerr}   r  r    r"   r=   rM   rQ   rY   rc   autogradFunctionre   r   r   r   r   r   r  r   r
  r*  r;  rD  rQ  rW  ri  r  r  r  r  r  r  r  r  r  r  r
  __all__rz   r#   r!   <module>r'     s       A A & ! 9  . 6  ) 
		H	%:&8.%OP%PsXENN33 X0]5>>22 ]. :%VP#)Z !"$%pf4BII 4n]ryy ]BBII 
299 
bii  		 ) : 
  
H $")) "!RYY ! / / /  T
! T
 T
n F
' F
 F
TBII * K
#5 K
K
\ c
- c
 c
L E
 2 E
 E
P I
0 I
 I
X	r#   