
    Z j̳                     L   S r SSKrSSKrSSKJr  SSKJrJrJr  SSKJ	r
  SSKJr  SSKJr  SS	KJrJrJrJrJrJr  SS
KJr  SSKJr  SSKJrJrJrJrJr  SSK J!r!  \RD                  " \#5      r$Sq%S r&S r'S r(S r) " S S\RT                  RV                  5      r, " S S\RT                  RV                  5      r- " S S\R\                  5      r/ " S S\R\                  5      r0 " S S\R\                  5      r1 " S S\R\                  5      r2 " S S \R\                  5      r3 " S! S"\R\                  5      r4 " S# S$\5      r5 " S% S&\R\                  5      r6 " S' S(\R\                  5      r7 " S) S*\R\                  5      r8 " S+ S,\R\                  5      r9\ " S- S.\5      5       r:\ " S/ S0\:5      5       r;\ " S1 S2\:5      5       r< " S3 S4\R\                  5      r=\" S5S69 " S7 S8\:5      5       r>\ " S9 S:\:5      5       r?\ " S; S<\:5      5       r@\ " S= S>\:5      5       rA/ S?QrBg)@zPyTorch YOSO model.    N)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )initialization)ACT2FN)GradientCheckpointingLayer)"BaseModelOutputWithCrossAttentionsMaskedLMOutputMultipleChoiceModelOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel)apply_chunking_to_forward)auto_docstringis_kernels_availableis_ninja_availableis_torch_cuda_availablelogging   )
YosoConfigc                  l    [        5       (       d  [        S5      eSSKJn   U " S5      nUR                  qg )NzFkernels is not installed, please install it with `pip install kernels`r   )
get_kernelzkernels-community/yoso)r   ImportErrorintegrations.hub_kernelsr   lsh_cumulation)r   yosos     w/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/yoso/modeling_yoso.pyload_cuda_kernelsr!   3   s0    !!bcc6./D((N    c                     [        U [        5      (       aC  / nU  H9  nUR                  5       (       d  UR                  5       nUR	                  U5        M;     U$ U R                  5       (       d  U R                  5       n U $ N)
isinstancelistis_contiguous
contiguousappendinput_tensorsouttensors      r    to_contiguousr.   =   sq    -&&#F''))**,JJv $ 
**,,)446Mr"   c           	          [        U [        5      (       a;  / nU  H1  nUR                  [        R                  R                  USSS95        M3     U$ [        R                  R                  U SSS9$ )N   )pdim)r%   r&   r)   r   
functional	normalizer*   s      r    r5   r5   K   sa    -&&#FJJr}}..v.CD $
}}&&}r&BBr"   c                 b   [        U R                  5       5      S:w  a  [        S5      e[        UR                  5       5      S:w  a  [        S5      e[        R                  " U R                  S5      U R                  S5      X#-  U R
                  S9nS[        R                  " X0R
                  S9-  n[        R                  " X5      R                  U R                  S5      U R                  S5      X#5      n[        R                  " X5      R                  UR                  S5      UR                  S5      X#5      nUS:  R                  5       nUS:  R                  5       n	[        R                  " X-  SS	9n
[        R                  " X-  SS	9n
U
R                  5       U
R                  5       4$ )
Nr   zQuery has incorrect size.zKey has incorrect size.r   r0   devicer   r1   r3   )lensize
ValueErrortorchrandnr8   arangematmulreshapeintsum)querykeynum_hashhash_lenrmat	raise_powquery_projectionkey_projectionquery_binary
key_binary
query_hashs              r    hashingrO   U   sF   
5::<A455
388:!233;;uzz!}ejjmX5HQVQ]Q]^DU\\(<<@@I||E088A

STW_j\\#,44SXXa[#((1+xbN$q(--/L 1$))+J<3<J:1r:J>>Z^^---r"   c                   4    \ rS rSr\S 5       r\S 5       rSrg)YosoCumulationh   c           
      F   US   nS[         R                  " [         R                  " X4R                  SS5      5      5      [        R
                  -  -
  U-  nXS S 2S S 2S 4   -  US S 2S S S 24   -  n[         R                  " X5      n	U R                  XXXE5        X`l        U	$ )Nhash_code_lenr   r1   )r=   acosr@   	transposemathpisave_for_backwardconfig)
ctx
query_maskkey_maskrD   rE   valuer[   rT   expectationcumulation_values
             r    forwardYosoCumulation.forwardi   s    /5::ell5--B:O&PQTXT[T[[[`mm!q!Tz$::Xaqj=QQ <<;jKS
r"   c                    [        U5      nU R                  u  p#pEpgU R                  nUS   n	[        R                  " XR                  SS5      5      U-  n
[        R                  " XS-  U-  5      n[        R                  " U
R                  SS5      U	S-  U-  5      n[        R                  " UR                  SS5      U5      nS S XUS 4$ )NrT   r1   rU   r0   )r.   saved_tensorsr[   r=   r@   rW   )r\   gradr]   r^   r`   rD   rE   r_   r[   rT   weighted_exp
grad_querygrad_key
grad_values                 r    backwardYosoCumulation.backwardv   s    T"?B?P?P<
k#/||D//"b*AB[P\\,1Bc0IJ
<< 6 6r2 >QRARV[@[\\\+"7"7B"?F
T:TAAr"    N__name__
__module____qualname____firstlineno__staticmethodrb   rk   __static_attributes__rm   r"   r    rQ   rQ   h   s*    
  
  B Br"   rQ   c                   4    \ rS rSr\S 5       r\S 5       rSrg)YosoLSHCumulation   c           
      t   UR                  S5      UR                  S5      :w  a  [        S5      eUR                  S5      UR                  S5      :w  a  [        S5      eUR                  S5      UR                  S5      :w  a  [        S5      eUR                  S5      UR                  S5      :w  a  [        S5      eUR                  S5      UR                  S5      :w  a  [        S5      eUR                  S5      UR                  S5      :w  a  [        S	5      e[        XX4U/5      u  pp4nUR                  nUS
   nUS   n	[	        SU	-  5      n
US   (       a  [
        R                  XX$XUS5      u  pO[        X4X5      u  p[
        R                  XX,XZUS5      nU R                  XXX4U5        X`l	        U$ )Nr   z6Query mask and Key mask differ in sizes in dimension 0z3Query mask and Query differ in sizes in dimension 0z1Query mask and Key differ in sizes in dimension 0z8Query mask and Value mask differ in sizes in dimension 0r   z,Key and Value differ in sizes in dimension 1r0   z,Query and Key differ in sizes in dimension 2rF   rT   use_fast_hash)
r;   r<   r.   is_cudarB   r   	fast_hashrO   rZ   r[   )r\   r]   r^   rD   rE   r_   r[   use_cudarF   rT   hashtable_capacityquery_hash_codekey_hash_codera   s                 r    rb   YosoLSHCumulation.forward   s   ??1q!11UVV??1A.RSS??1!,PQQ??1A.WXX88A;%**Q-'KLL::a=CHHQK'KLL2?W\ch@i2j/
e%%%*%/ M!12/"-;-E-E8(8UV.*O] .5U-Y*O)88%]egh
 	jOTY`ef
r"   c                    [        U5      nU R                  u  p#pEpgnU R                  n	UR                  n
U	S   n[	        SU-  5      nU	S   (       ac  [
        R                  X5X$XU
S5      n[
        R                  UUUUUUUS-  U-  UU
S5
      n[
        R                  UUUUUUUS-  U-  UU
S5
      nGOS[        R                  " [        R                  " XgR                  SS5      5      5      [        R                  -  -
  U-  nUUS S 2S S 2S 4   -  US S 2S S S 24   -  n[        R                  " XR                  SS5      5      U-  n[        R                  " UUS-  U-  5      n[        R                  " UR                  SS5      US-  U-  5      n[        R                  " UR                  SS5      U5      nS S XUS 4$ )NrT   r0   lsh_backwardr      r1   rU   )r.   re   r[   rz   rB   r   lsh_weighted_cumulationr=   rV   r@   rW   rX   rY   )r\   rf   r]   r^   r~   r   rD   rE   r_   r[   r|   rT   r}   rj   rh   ri   r`   rg   s                     r    rk   YosoLSHCumulation.backward   s   T"RURcRcO
oe%<</ M!12.!'66d`hjkJ (??"c)"J &=="e+"H uzz%,,ummBPR>S*TUX\X_X___dqqK%
1a:(>>!TST*AUUK <<oob".EFTLl]Q5F#4MNJ||L$:$:2r$B]UVEVZ_D_`Hk&;&;B&CTJJT:TAAr"   rm   Nrn   rm   r"   r    rv   rv      s+    #  # J .B .Br"   rv   c                   6   ^  \ rS rSrSrU 4S jrSS jrSrU =r$ )YosoEmbeddings   zGConstruct the embeddings from word, position and token_type embeddings.c           	      @  > [         TU ]  5         [        R                  " UR                  UR
                  UR                  S9U l        [        R                  " UR                  S-   UR
                  5      U l	        [        R                  " UR                  UR
                  5      U l        [        R                  " UR
                  UR                  S9U l        [        R                  " UR                  5      U l        U R#                  S[$        R&                  " UR                  5      R)                  S5      S-   SS9  U R#                  S[$        R*                  " U R,                  R/                  5       [$        R0                  U R,                  R2                  S	9SS9  g )
N)padding_idxr0   epsposition_idsr   r1   F)
persistenttoken_type_idsdtyper8   )super__init__r   	Embedding
vocab_sizehidden_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutregister_bufferr=   r?   expandzerosr   r;   longr8   selfr[   	__class__s     r    r   YosoEmbeddings.__init__   s7   !||F,=,=v?Q?Q_e_r_rs#%<<0N0NQR0RTZTfTf#g %'\\&2H2H&J\J\%]"f&8&8f>S>STzz&"<"<= 	ELL)G)GHOOPWX[\\in 	 	
 	KK))..0

4K\K\KcKcd 	 	
r"   c                 @   Ub  UR                  5       nOUR                  5       S S nUS   nUc  U R                  S S 2S U24   nUcv  [        U S5      (       a-  U R                  S S 2S U24   nUR	                  US   U5      nUnO8[
        R                  " U[
        R                  U R                  R                  S9nUc  U R                  U5      nU R                  U5      n	XI-   n
U R                  U5      nX-  n
U R                  U
5      n
U R                  U
5      n
U
$ )Nr1   r   r   r   r   )r;   r   hasattrr   r   r=   r   r   r8   r   r   r   r   r   )r   	input_idsr   r   inputs_embedsinput_shape
seq_lengthbuffered_token_type_ids buffered_token_type_ids_expandedr   
embeddingsr   s               r    rb   YosoEmbeddings.forward   s,    #..*K',,.s3K ^
,,Q^<L
 !t-..*.*=*=a*n*M'3J3Q3QR]^_R`bl3m0!A!&[

SWSdSdSkSk!l  00;M $ : :> J":
"66|D)
^^J/
\\*-
r"   )r   r   r   r   r   )NNNN	ro   rp   rq   rr   __doc__r   rb   rt   __classcell__r   s   @r    r   r      s    Q
&   r"   r   c                   2   ^  \ rS rSrU 4S jrSS jrSrU =r$ )YosoSelfAttentioni  c           	        > [         TU ]  5         UR                  UR                  -  S:w  a7  [	        US5      (       d&  [        SUR                   SUR                   S35      e[        S Ln[        5       (       a!  [        5       (       a  U(       d   [        5         UR                  U l        [        UR                  UR                  -  5      U l        U R                  U R                  -  U l        [         R"                  " UR                  U R                  5      U l        [         R"                  " UR                  U R                  5      U l        [         R"                  " UR                  U R                  5      U l        [         R*                  " UR,                  5      U l        UR0                  U l        UR2                  U l        UR4                  S LU l        UR8                  U l        UR:                  U l        UR<                  U l        U R2                  U R8                  U R:                  U R<                  S.U l        UR4                  bX  [         R@                  " UR                  UR                  UR4                  S4UR4                  S	-  S4S
UR                  S9U l!        g g ! [         a#  n[        R                  SU 35         S nAGN5S nAff = f)Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ()zGCould not load the custom kernel for multi-scale deformable attention: )rT   ry   rF   r   r   r0   F)in_channelsout_channelskernel_sizepaddingbiasgroups)"r   r   r   num_attention_headsr   r<   r   r   r   r!   	ExceptionloggerwarningrB   attention_head_sizeall_head_sizer   LinearrD   rE   r_   r   attention_probs_dropout_probr   use_expectationrT   conv_windowuse_convry   rF   r   
lsh_configConv2dconv)r   r[   kernel_loadeder   s       r    r   YosoSelfAttention.__init__  s\    : ::a?PVXhHiHi#F$6$6#7 8 445Q8  'd2"$$);)=)=mn!# $*#=#= #&v'9'9F<V<V'V#W !558P8PPYYv1143E3EF
99V//1C1CDYYv1143E3EF
zz&"E"EF%55#11**$6#11"// "//!// --	
 )		"66#77#//3++q0!411DI *7  n!hijhklmmns   
J) )
K3KKc                 F   UR                   u  pEnU R                  U5      R                  USU R                  U R                  5      R                  SS5      nU R                  U5      R                  USU R                  U R                  5      R                  SS5      nU R                  U5      R                  USU R                  U R                  5      R                  SS5      n	U R                  (       a  U R                  XS S 2S S S 2S 4   -  5      n
UR                  5       u  pKpUR                  XK-  X5      nUR                  XK-  X5      nU	R                  XK-  X5      n	SUS-  -   nUR                  S5      R                  USS9R                  XK-  U5      R                  5       nSnU R                  (       d  X:  a  XK-  XU-
  4n[         R"                  " U[         R$                  " XR&                  S9/SS9n[         R"                  " U[         R$                  " XR&                  S9/SS9n[         R"                  " U	[         R$                  " XR&                  S9/SS9n	U R                  (       d  U R(                  (       a  [+        Xx/5      u  pxU R                  (       a"  [,        R/                  X"XxXR0                  5      nO![2        R/                  X"XxXR0                  5      nU R                  (       d  X:  a  US S 2S S 2S U24   n[+        U5      nUR                  XKX5      nU R                  (       a  UW
-  nUR5                  S	SSS
5      R7                  5       nUR                  5       S S U R8                  4-   nUR                  " U6 nU(       a  UU4nU$ U4nU$ )Nr1   r   r0   g      ?g     @r9       r7   r   r   rU   )shaperD   viewr   r   rW   rE   r_   r   r   r;   rA   	unsqueezerepeat_interleaverB   r   r=   catr   r8   trainingr5   rQ   applyr   rv   permuter(   r   )r   hidden_statesattention_maskoutput_attentions
batch_sizer   _query_layer	key_layervalue_layerconv_value_layer	num_headsseq_lenhead_dimgpu_warp_sizepad_sizecontext_layernew_context_layer_shapeoutputss                      r    rb   YosoSelfAttention.forwardK  sb   $1$7$7!
JJ}%T*b$":":D<T<TUYq!_ 	 HH]#T*b$":":D<T<TUYq!_ 	 JJ}%T*b$":":D<T<TUYq!_ 	 ==#yyaqRVFV7W)WX3>3C3C3E0
w!))**@'T%%j&<gP	!))**@'T~77$$Q'ya0WZ+W5SU	 	 $$(*B!-w8PPH))KK1C1CD K 		KK1A1AB I  ))KK1C1CD K 4==%./G%H"K*00UdUdM .33UdUdM $$(*B)!Q		/:M!-0%--jWW==--M%--aAq9DDF"/"4"4"6s";t?Q?Q>S"S%**,CD4E=-0 MZK[r"   )r   r   r   r   rT   rE   r   r   r   rF   rD   r   r   ry   r_   NFro   rp   rq   rr   r   rb   rt   r   r   s   @r    r   r     s    .`\ \r"   r   c                   z   ^  \ rS rSrU 4S jrS\R                  S\R                  S\R                  4S jrSrU =r	$ )YosoSelfOutputi  c                 (  > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " UR                  UR                  S9U l        [        R                  " UR                  5      U l
        g Nr   )r   r   r   r   r   denser   r   r   r   r   r   s     r    r   YosoSelfOutput.__init__  s`    YYv1163E3EF
f&8&8f>S>STzz&"<"<=r"   r   input_tensorreturnc                 p    U R                  U5      nU R                  U5      nU R                  X-   5      nU$ r$   r   r   r   r   r   r   s      r    rb   YosoSelfOutput.forward  5    

=1]3}'CDr"   r   r   r   
ro   rp   rq   rr   r   r=   Tensorrb   rt   r   r   s   @r    r   r     6    >U\\  RWR^R^  r"   r   c                   2   ^  \ rS rSrU 4S jrSS jrSrU =r$ )YosoAttentioni  c                 b   > [         TU ]  5         [        U5      U l        [	        U5      U l        g r$   )r   r   r   r   r   outputr   s     r    r   YosoAttention.__init__  s&    %f-	$V,r"   c                 f    U R                  XU5      nU R                  US   U5      nU4USS  -   nU$ )Nr   r   )r   r  )r   r   r   r   self_outputsattention_outputr   s          r    rb   YosoAttention.forward  sA    yy@QR;;|AF#%QR(88r"   )r  r   r   r   r   s   @r    r  r    s    -
 r"   r  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )YosoIntermediatei  c                   > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        UR                  [        5      (       a  [        UR                     U l        g UR                  U l        g r$   )r   r   r   r   r   intermediate_sizer   r%   
hidden_actstrr	   intermediate_act_fnr   s     r    r   YosoIntermediate.__init__  s`    YYv1163K3KL
f''--'-f.?.?'@D$'-'8'8D$r"   r   r   c                 J    U R                  U5      nU R                  U5      nU$ r$   r   r  r   r   s     r    rb   YosoIntermediate.forward  s&    

=100?r"   r  r  r   s   @r    r  r    s(    9U\\ ell  r"   r  c                   z   ^  \ rS rSrU 4S jrS\R                  S\R                  S\R                  4S jrSrU =r	$ )
YosoOutputi  c                 (  > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        R                  " UR
                  UR                  S9U l        [        R                  " UR                  5      U l        g r   )r   r   r   r   r  r   r   r   r   r   r   r   r   s     r    r   YosoOutput.__init__  s`    YYv779K9KL
f&8&8f>S>STzz&"<"<=r"   r   r   r   c                 p    U R                  U5      nU R                  U5      nU R                  X-   5      nU$ r$   r   r   s      r    rb   YosoOutput.forward  r   r"   r   r  r   s   @r    r  r    r  r"   r  c                   8   ^  \ rS rSrU 4S jrSS jrS rSrU =r$ )	YosoLayeri  c                    > [         TU ]  5         UR                  U l        SU l        [	        U5      U l        UR                  U l        [        U5      U l        [        U5      U l
        g Nr   )r   r   chunk_size_feed_forwardseq_len_dimr  	attentionadd_cross_attentionr  intermediater  r  r   s     r    r   YosoLayer.__init__  sW    '-'E'E$&v.#)#=#= ,V4 (r"   c                     U R                  XUS9nUS   nUSS  n[        U R                  U R                  U R                  U5      nU4U-   nU$ )N)r   r   r   )r%  r   feed_forward_chunkr#  r$  )r   r   r   r   self_attention_outputsr  r   layer_outputs           r    rb   YosoLayer.forward  sf    !%ar!s1!4(,0##T%A%A4CSCSUe
  /G+r"   c                 J    U R                  U5      nU R                  X!5      nU$ r$   )r'  r  )r   r  intermediate_outputr,  s       r    r*  YosoLayer.feed_forward_chunk  s)    "//0@A{{#6Ir"   )r&  r%  r#  r'  r  r$  r   )	ro   rp   rq   rr   r   rb   r*  rt   r   r   s   @r    r   r     s    ) r"   r   c                   :   ^  \ rS rSrU 4S jr    SS jrSrU =r$ )YosoEncoderi  c                    > [         TU ]  5         Xl        [        R                  " [        UR                  5       Vs/ s H  n[        U5      PM     sn5      U l        SU l	        g s  snf r   )
r   r   r[   r   
ModuleListrangenum_hidden_layersr   layergradient_checkpointing)r   r[   r   r   s      r    r   YosoEncoder.__init__  sR    ]]uVE]E]?^#_?^!If$5?^#_`
&+# $`s   A&c                 $   U(       a  SOS nU(       a  SOS n[        U R                  5       H0  u  pU(       a  Xa4-   nU	" XU5      n
U
S   nU(       d  M(  XzS   4-   nM2     U(       a  Xa4-   nU(       d  [        S XU4 5       5      $ [        UUUS9$ )Nrm   r   r   c              3   .   #    U  H  oc  M  Uv   M     g 7fr$   rm   ).0vs     r    	<genexpr>&YosoEncoder.forward.<locals>.<genexpr>"  s     m$[q$[s   	)last_hidden_stater   
attentions)	enumerater7  tupler   )r   r   r   r   output_hidden_statesreturn_dictall_hidden_statesall_self_attentionsilayer_modulelayer_outputss              r    rb   YosoEncoder.forward	  s     #7BD$5b4(4OA#$58H$H!(HYZM)!,M  &91=M<O&O#  5   14D Dm]GZ$[mmm1++*
 	
r"   )r[   r8  r7  )NFFTr   r   s   @r    r2  r2    s     , "
 
r"   r2  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )YosoPredictionHeadTransformi+  c                 p  > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        UR                  [        5      (       a  [        UR                     U l
        OUR                  U l
        [        R                  " UR                  UR                  S9U l        g r   )r   r   r   r   r   r   r%   r  r  r	   transform_act_fnr   r   r   s     r    r   $YosoPredictionHeadTransform.__init__,  s~    YYv1163E3EF
f''--$*6+<+<$=D!$*$5$5D!f&8&8f>S>STr"   r   r   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ r$   )r   rO  r   r  s     r    rb   #YosoPredictionHeadTransform.forward5  s4    

=1--m<}5r"   )r   r   rO  r  r   s   @r    rM  rM  +  s)    UU\\ ell  r"   rM  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )YosoLMPredictionHeadi=  c                   > [         TU ]  5         [        U5      U l        [        R
                  " UR                  UR                  SS9U l        [        R                  " [        R                  " UR                  5      5      U l        g )NT)r   )r   r   rM  	transformr   r   r   r   decoder	Parameterr=   r   r   r   s     r    r   YosoLMPredictionHead.__init__>  s[    4V< yy!3!3V5F5FTRLLV->->!?@	r"   c                 J    U R                  U5      nU R                  U5      nU$ r$   )rV  rW  r  s     r    rb   YosoLMPredictionHead.forwardG  s$    }5]3r"   )r   rW  rV  r   r   s   @r    rT  rT  =  s    A r"   rT  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )YosoOnlyMLMHeadiN  c                 B   > [         TU ]  5         [        U5      U l        g r$   )r   r   rT  predictionsr   s     r    r   YosoOnlyMLMHead.__init__O  s    /7r"   sequence_outputr   c                 (    U R                  U5      nU$ r$   r_  )r   ra  prediction_scoress      r    rb   YosoOnlyMLMHead.forwardS  s     ,,_=  r"   rc  r  r   s   @r    r]  r]  N  s(    8!u|| ! ! !r"   r]  c                      ^  \ rS rSr% \\S'   SrSr\R                  " 5       S\
R                  4U 4S jj5       rSrU =r$ )YosoPreTrainedModeliX  r[   r   Tmodulec                   > [         TU ]  U5        [        U[        5      (       a!  [        R
                  " UR                  5        g[        U[        5      (       a  [        R                  " UR                  [        R                  " UR                  R                  S   5      R                  S5      S-   5        [        R
                  " UR                  5        gg)zInitialize the weightsr1   r   r0   N)r   _init_weightsr%   rT  initzeros_r   r   copy_r   r=   r?   r   r   r   )r   rh  r   s     r    rj  !YosoPreTrainedModel._init_weights^  s     	f%f233KK$//JJv**ELL9L9L9R9RSU9V,W,^,^_f,gjk,klKK--. 0r"   rm   )ro   rp   rq   rr   r   __annotations__base_model_prefixsupports_gradient_checkpointingr=   no_gradr   Modulerj  rt   r   r   s   @r    rg  rg  X  s8    &*#
]]_/BII / /r"   rg  c                     ^  \ rS rSrU 4S jrS rS r\        SS\R                  S-  S\R                  S-  S\R                  S-  S	\R                  S-  S
\R                  S-  S\
S-  S\
S-  S\
S-  S\\-  4S jj5       rSrU =r$ )	YosoModelii  c                    > [         TU ]  U5        Xl        [        U5      U l        [        U5      U l        U R                  5         g r$   )r   r   r[   r   r   r2  encoder	post_initr   s     r    r   YosoModel.__init__k  s9     (0"6* 	r"   c                 .    U R                   R                  $ r$   r   r   r   s    r    get_input_embeddingsYosoModel.get_input_embeddingsu  s    ...r"   c                 $    XR                   l        g r$   r{  )r   r_   s     r    set_input_embeddingsYosoModel.set_input_embeddingsx  s    */'r"   Nr   r   r   r   r   r   rD  rE  r   c	                 v   Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nUb  Ub  [	        S5      eUb"  U R                  X5        UR                  5       n
O"Ub  UR                  5       S S n
O[	        S5      eU
u  pUb  UR                  OUR                  nUc  [        R                  " X4US9nUcr  [        U R                  S5      (       a3  U R                  R                  S S 2S U24   nUR                  X5      nUnO$[        R                  " U
[        R                  US9nU R                  UUUUS9nU R!                  UUUUUS9nUS	   nU(       d	  U4US
S  -   $ [#        UUR$                  UR&                  UR(                  S9$ )NzDYou cannot specify both input_ids and inputs_embeds at the same timer1   z5You have to specify either input_ids or inputs_embedsr7   r   r   )r   r   r   r   )r   r   rD  rE  r   r   )r@  r   rA  cross_attentions)r[   r   rD  rE  r<   %warn_if_padding_and_no_attention_maskr;   r8   r=   onesr   r   r   r   r   r   rw  r   r   rA  r  )r   r   r   r   r   r   r   rD  rE  kwargsr   r   r   r8   r   r   embedding_outputencoder_outputsra  s                      r    rb   YosoModel.forward{  s    2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++BYBY ]%>cdd"66yQ#..*K&',,.s3KTUU!,
%.%:!!@T@T!"ZZ*)A6RN!t(899*.//*H*HKZK*X'3J3Q3QR\3i0!A!&[

SY!Z??%)'	 + 
 ,,)/!5# ' 
 *!,#%(;;;1-)77&11,==	
 	
r"   )r[   r   rw  )NNNNNNNN)ro   rp   rq   rr   r   r}  r  r   r=   r  boolrC  r   rb   rt   r   r   s   @r    ru  ru  i  s    /0  *..2.2,0-1)-,0#'A
<<$&A
 t+A
 t+	A

 llT)A
 ||d*A
  $;A
 #TkA
 D[A
 
3	3A
 A
r"   ru  c                   D  ^  \ rS rSrSSS.rU 4S jrS rS r\         SS	\	R                  S-  S
\	R                  S-  S\	R                  S-  S\	R                  S-  S\	R                  S-  S\	R                  S-  S\S-  S\S-  S\S-  S\\-  4S jj5       rSrU =r$ )YosoForMaskedLMi  zcls.predictions.biasz&yoso.embeddings.word_embeddings.weight)zcls.predictions.decoder.biaszcls.predictions.decoder.weightc                    > [         TU ]  U5        [        U5      U l        [	        U5      U l        U R                  5         g r$   )r   r   ru  r   r]  clsrx  r   s     r    r   YosoForMaskedLM.__init__  s4     f%	"6* 	r"   c                 B    U R                   R                  R                  $ r$   )r  r_  rW  r|  s    r    get_output_embeddings%YosoForMaskedLM.get_output_embeddings  s    xx##+++r"   c                     XR                   R                  l        UR                  U R                   R                  l        g r$   )r  r_  rW  r   )r   new_embeddingss     r    set_output_embeddings%YosoForMaskedLM.set_output_embeddings  s*    '5$$2$7$7!r"   Nr   r   r   r   r   labelsr   rD  rE  r   c
                    U	b  U	OU R                   R                  n	U R                  UUUUUUUU	S9nUS   nU R                  U5      nSnUbF  [	        5       nU" UR                  SU R                   R                  5      UR                  S5      5      nU	(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  S9$ )a{  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
    config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
    loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
Nr   r   r   r   r   rD  rE  r   r1   r   losslogitsr   rA  )
r[   rE  r   r  r   r   r   r   r   rA  )r   r   r   r   r   r   r  r   rD  rE  r  r   ra  rd  masked_lm_lossloss_fctr  s                    r    rb   YosoForMaskedLM.forward  s    ( &1%<k$++BYBY))))%'/!5#  	
 "!* HH_5')H%&7&<&<RAWAW&XZ`ZeZefhZijN')GABK7F3A3M^%.YSYY$!//))	
 	
r"   )r  r   	NNNNNNNNN)ro   rp   rq   rr   _tied_weights_keysr   r  r  r   r=   r  r  rC  r   rb   rt   r   r   s   @r    r  r    s     )?*R
,8  *..2.2,0-1&*)-,0#'1
<<$&1
 t+1
 t+	1

 llT)1
 ||d*1
 t#1
  $;1
 #Tk1
 D[1
 
	1
 1
r"   r  c                   2   ^  \ rS rSrSrU 4S jrS rSrU =r$ )YosoClassificationHeadi  z-Head for sentence-level classification tasks.c                 8  > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " UR                  5      U l        [        R                  " UR                  UR                  5      U l
        Xl        g r$   )r   r   r   r   r   r   r   r   r   
num_labelsout_projr[   r   s     r    r   YosoClassificationHead.__init__  se    YYv1163E3EF
zz&"<"<=		&"4"4f6G6GHr"   c                     US S 2SS S 24   nU R                  U5      nU R                  U5      n[        U R                  R                     " U5      nU R                  U5      nU R                  U5      nU$ )Nr   )r   r   r	   r[   r  r  )r   featuresr  xs       r    rb   YosoClassificationHead.forward  se    Q1WLLOJJqM4;;))*1-LLOMM!r"   )r[   r   r   r  r   r   s   @r    r  r    s    7 r"   r  z
    YOSO Model transformer with a sequence classification/regression head on top (a linear layer on top of
    the pooled output) e.g. for GLUE tasks.
    )custom_introc                   .  ^  \ rS rSrU 4S jr\         SS\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S	\R                  S-  S
\S-  S\S-  S\S-  S\	\
-  4S jj5       rSrU =r$ )YosoForSequenceClassificationi!  c                    > [         TU ]  U5        UR                  U l        [        U5      U l        [        U5      U l        U R                  5         g r$   )r   r   r  ru  r   r  
classifierrx  r   s     r    r   &YosoForSequenceClassification.__init__(  sA      ++f%	08 	r"   Nr   r   r   r   r   r  r   rD  rE  r   c
                 .   U	b  U	OU R                   R                  n	U R                  UUUUUUUU	S9nUS   nU R                  U5      nSnUGb  U R                   R                  c  U R
                  S:X  a  SU R                   l        OoU R
                  S:  aN  UR                  [        R                  :X  d  UR                  [        R                  :X  a  SU R                   l        OSU R                   l        U R                   R                  S:X  aI  [        5       nU R
                  S:X  a&  U" UR                  5       UR                  5       5      nOU" X5      nOU R                   R                  S:X  a=  [        5       nU" UR                  SU R
                  5      UR                  S5      5      nO,U R                   R                  S:X  a  [        5       nU" X5      nU	(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                   UR"                  S	9$ )
ae  
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
Nr  r   r   
regressionsingle_label_classificationmulti_label_classificationr1   r  )r[   rE  r   r  problem_typer  r   r=   r   rB   r   squeezer   r   r   r   r   rA  )r   r   r   r   r   r   r  r   rD  rE  r  r   ra  r  r  r  r  s                    r    rb   %YosoForSequenceClassification.forward1  s   ( &1%<k$++BYBY))))%'/!5#  	
 "!*1{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#FNN$4fnn6FGD#F3D))-JJ+-B @&++b/R))-II,./Y,F)-)9TGf$EvE'!//))	
 	
r"   )r  r  r   r  )ro   rp   rq   rr   r   r   r=   r  r  rC  r   rb   rt   r   r   s   @r    r  r  !  s      *..2.2,0-1&*)-,0#'B
<<$&B
 t+B
 t+	B

 llT)B
 ||d*B
 t#B
  $;B
 #TkB
 D[B
 
)	)B
 B
r"   r  c                   .  ^  \ rS rSrU 4S jr\         SS\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S	\R                  S-  S
\S-  S\S-  S\S-  S\	\
-  4S jj5       rSrU =r$ )YosoForMultipleChoiceiw  c                   > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  UR                  5      U l        [        R
                  " UR                  S5      U l        U R                  5         g r"  )
r   r   ru  r   r   r   r   pre_classifierr  rx  r   s     r    r   YosoForMultipleChoice.__init__y  s_     f%	 ii(:(:F<N<NO))F$6$6: 	r"   Nr   r   r   r   r   r  r   rD  rE  r   c
                    U	b  U	OU R                   R                  n	Ub  UR                  S   OUR                  S   nUb!  UR                  SUR	                  S5      5      OSnUb!  UR                  SUR	                  S5      5      OSnUb!  UR                  SUR	                  S5      5      OSnUb!  UR                  SUR	                  S5      5      OSnUb1  UR                  SUR	                  S5      UR	                  S5      5      OSnU R                  UUUUUUUU	S9nUS   nUSS2S4   nU R                  U5      n[        R                  " 5       " U5      nU R                  U5      nUR                  SU5      nSnUb  [        5       nU" UU5      nU	(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  S9$ )a  
input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
    Indices of input sequence tokens in the vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are input IDs?](../glossary#input-ids)
token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
    Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
    1]`:

    - 0 corresponds to a *sentence A* token,
    - 1 corresponds to a *sentence B* token.

    [What are token type IDs?](../glossary#token-type-ids)
position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
    Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
    config.max_position_embeddings - 1]`.

    [What are position IDs?](../glossary#position-ids)
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
    Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
    is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
    model's internal embedding lookup matrix.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
    num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
    `input_ids` above)
Nr   r1   rU   r  r   r  )r[   rE  r   r   r;   r   r  r   ReLUr  r   r   r   rA  )r   r   r   r   r   r   r  r   rD  rE  r  num_choicesr   hidden_statepooled_outputr  reshaped_logitsr  r  r  s                       r    rb   YosoForMultipleChoice.forward  s   X &1%<k$++BYBY,5,Aiooa(}GZGZ[\G]>G>SINN2y~~b'9:Y]	M[Mg,,R1D1DR1HImqM[Mg,,R1D1DR1HImqGSG_|((\->->r-BCei ( r=#5#5b#9=;M;Mb;QR 	 ))))%'/!5#  	
 qz$QT*++M:	-0/ ++b+6')HOV4D%''!"+5F)-)9TGf$EvE("!//))	
 	
r"   )r  r  r   r  )ro   rp   rq   rr   r   r   r=   r  r  rC  r   rb   rt   r   r   s   @r    r  r  w  s      *..2.2,0-1&*)-,0#'Y
<<$&Y
 t+Y
 t+	Y

 llT)Y
 ||d*Y
 t#Y
  $;Y
 #TkY
 D[Y
 
*	*Y
 Y
r"   r  c                   .  ^  \ rS rSrU 4S jr\         SS\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S	\R                  S-  S
\S-  S\S-  S\S-  S\	\
-  4S jj5       rSrU =r$ )YosoForTokenClassificationi  c                 0  > [         TU ]  U5        UR                  U l        [        U5      U l        [
        R                  " UR                  5      U l        [
        R                  " UR                  UR                  5      U l        U R                  5         g r$   )r   r   r  ru  r   r   r   r   r   r   r   r  rx  r   s     r    r   #YosoForTokenClassification.__init__  si      ++f%	zz&"<"<=))F$6$68I8IJ 	r"   Nr   r   r   r   r   r  r   rD  rE  r   c
                    U	b  U	OU R                   R                  n	U R                  UUUUUUUU	S9nUS   nU R                  U5      nU R	                  U5      nSnUb  [        5       nUb  UR                  S5      S:H  nUR                  SU R                  5      n[        R                  " UUR                  S5      [        R                  " UR                  5      R                  U5      5      nU" UU5      nO2U" UR                  SU R                  5      UR                  S5      5      nU	(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  S9$ )z
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
Nr  r   r1   r   r  )r[   rE  r   r   r  r   r   r  r=   wherer-   ignore_indextype_asr   r   rA  )r   r   r   r   r   r   r  r   rD  rE  r  r   ra  r  r  r  active_lossactive_logitsactive_labelsr  s                       r    rb   "YosoForTokenClassification.forward  sh   $ &1%<k$++BYBY))))%'/!5#  	
 "!*,,71')H),11"5: &B @ %R%,,x?T?T2U2]2]^d2e!  }=B @&++b/RY,F)-)9TGf$EvE$!//))	
 	
r"   )r  r   r  r   r  )ro   rp   rq   rr   r   r   r=   r  r  rC  r   rb   rt   r   r   s   @r    r  r    s    	  *..2.2,0-1&*)-,0#':
<<$&:
 t+:
 t+	:

 llT):
 ||d*:
 t#:
  $;:
 #Tk:
 D[:
 
&	&:
 :
r"   r  c                   N  ^  \ rS rSrU 4S jr\          SS\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S	\R                  S-  S
\R                  S-  S\S-  S\S-  S\S-  S\	\
-  4S jj5       rSrU =r$ )YosoForQuestionAnsweringi+  c                    > [         TU ]  U5        SUl        UR                  U l        [        U5      U l        [
        R                  " UR                  UR                  5      U l        U R                  5         g )Nr0   )
r   r   r  ru  r   r   r   r   
qa_outputsrx  r   s     r    r   !YosoForQuestionAnswering.__init__-  s[      ++f%	))F$6$68I8IJ 	r"   Nr   r   r   r   r   start_positionsend_positionsr   rD  rE  r   c                    U
b  U
OU R                   R                  n
U R                  UUUUUUU	U
S9nUS   nU R                  U5      nUR	                  SSS9u  nnUR                  S5      nUR                  S5      nS nUb  Ub  [        UR                  5       5      S:  a  UR                  S5      n[        UR                  5       5      S:  a  UR                  S5      nUR                  S5      nUR                  SU5      nUR                  SU5      n[        US9nU" X5      nU" UU5      nUU-   S-  nU
(       d  UU4USS  -   nUb  U4U-   $ U$ [        UUUUR                  UR                  S9$ )	Nr  r   r   r1   r9   )r  r0   )r  start_logits
end_logitsr   rA  )r[   rE  r   r  splitr  r:   r;   clampr   r   r   rA  )r   r   r   r   r   r   r  r  r   rD  rE  r  r   ra  r  r  r  
total_lossignored_indexr  
start_lossend_lossr  s                          r    rb    YosoForQuestionAnswering.forward9  s    &1%<k$++BYBY))))%'/!5#  	
 "!*1#)<<r<#: j#++B/''+

&=+D?'')*Q."1"9"9""==%%'(1, - 5 5b 9(--a0M-33A}EO)//=AM']CH!,@J
M:H$x/14J"J/'!"+=F/9/EZMF*Q6Q+%!!//))
 	
r"   )r  r  r   )
NNNNNNNNNN)ro   rp   rq   rr   r   r   r=   r  r  rC  r   rb   rt   r   r   s   @r    r  r  +  s    
  *..2.2,0-1/3-1)-,0#'=
<<$&=
 t+=
 t+	=

 llT)=
 ||d*=
 ,=
 ||d*=
  $;=
 #Tk=
 D[=
 
-	-=
 =
r"   r  )r  r  r  r  r  r   ru  rg  )Cr   rX   r=   r   torch.nnr   r   r    r   rk  activationsr	   modeling_layersr
   modeling_outputsr   r   r   r   r   r   modeling_utilsr   pytorch_utilsr   utilsr   r   r   r   r   configuration_yosor   
get_loggerro   r   r   r!   r.   r5   rO   autogradFunctionrQ   rv   rs  r   r   r   r  r  r  r   r2  rM  rT  r]  rg  ru  r  r  r  r  r  r  __all__rm   r"   r    <module>r     sR       A A & ! 9  . 6  + 
		H	% )C.&BU^^,, B>VB// VBt6RYY 6rM		 MbRYY 
BII 
ryy   * :%
")) %
R")) $299 "!bii ! // / /  S
# S
 S
l H
) H
 H
VRYY * M
$7 M
M
` e
/ e
 e
P G
!4 G
 G
T K
2 K
 K
\	r"   