
    Z jy}                     P   S r SSKrSSKJr  SSKrSSKJr  SSKJr  SSK	J
r
  SSKJr  SS	KJr  SS
KJrJrJrJrJrJrJr  SSKJr  \R2                  " \5      rSqS r " S S\R<                  R>                  5      r S'S jr!S'S jr" " S S\RF                  5      r$ " S S\RF                  5      r% " S S\5      r&\ " S S\5      5       r'\" SS9\ " S S\5      5       5       r(\" SS9\ " S S \5      5       5       r)\ " S! S"\'5      5       r*\" S#S9 " S$ S%\'\
5      5       r+/ S&Qr,g)(zPyTorch RWKV model.    N)	dataclass)nn   )initialization)GenerationMixin)GradientCheckpointingLayer)PreTrainedModel)ModelOutputauto_docstringis_bitsandbytes_availableis_kernels_availableis_ninja_availableis_torch_cuda_availablelogging   )
RwkvConfigc                 j    [        5       (       d  [        S5      eSSKJn  U" S5      qU [        l        g )NzFkernels is not installed, please install it with `pip install kernels`r   )
get_kernelzkernels-community/rwkv)r   ImportErrorintegrations.hub_kernelsr   rwkv_cuda_kernelmax_seq_length)context_lengthr   s     w/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/rwkv/modeling_rwkv.pyload_wkv_cuda_kernelr   -   s/    !!bcc6!":;&4#    c                   <    \ rS rSr\SS j5       r\SS j5       rSrg)RwkvLinearAttention8   Nc                    UR                  5       u  pxn	U[        R                  :  a   [        SU S[        R                   S35      eXy-  [	        U	S5      -  S:w  a  [        SU SU	 S[	        U	S5       S	35      eUR
                  U l        UR                  R                  S
:w  dN  UR                  R                  S
:w  d4  UR                  R                  S
:w  d  UR                  R                  S
:w  a  [        S5      e[        R                  " UR                  5       R                  5       5      * nUR
                  [        R                  :X  a0  UR                  5       nUR                  5       nUR                  5       nUR                  5       nUR                  5       nUR                  5       n[        R                  " U[        R                  S9n
U(       d  Ub  UcT  [        R                   " UU	S[        R"                  UR                  [        R                  S9nUS S 2S S 2S4==   S-  ss'   OB[        R$                  " U Vs/ s H  oR'                  S5      PM     snSS9R                  5       nUR
                  [        R(                  :X  a  [        R*                  nO[        R,                  nU" XX4X5        OHUR
                  [        R(                  :X  a  [        R.                  O[        R0                  nU" XX4U
5        U R3                  XX4U
5        Ub4  [        R4                  " USSS9 Vs/ s H  oR7                  S5      PM     nnU
R9                  U R                  5      U4$ s  snf s  snf )NzCannot process a batch with z+ tokens at the same time, use a maximum of z with this model.    r   zThe product of batch size (z) and hidden size (z") needs to be a round multiple of .cudazUCalling the CUDA kernel for wkv attention requires all tensors to be on CUDA devices.memory_formatr   )dtypedevicer%      籡*G)dim)sizer   r   
ValueErrorminr&   input_dtyper'   typetorchexpfloat
contiguousfloat16
empty_likecontiguous_formatzerosfloat32cat	unsqueezebfloat16forward_with_state_bf16forward_with_stateforward_bf16forwardsave_for_backwardchunksqueezeto)ctx
time_decay
time_firstkeyvaluestatereturn_state
batch_sizeseq_lenhidden_sizeoutputsforward_funcs                r   r?   RwkvLinearAttention.forward9   s   +.88:(
[%444.wi7b#2233DF  #c+r&::a?-j\9L[M Z";34A7 
 )) ""f,  %%/zz&(||  F*tuuii
 0 0 2 = = ?@@
99%#))+J))+CKKME**,
nn  "!!#U5L5LM5,}--::"'"9"9 aAg$&		5"A5a;;q>5"AqITTVyyENN*/GG/BBVK<?II<W+88]m]u]uLVDjc&I+0;;uaQ+GH+GaYYq\+GEHyy)500 #B Is   	M07M5c                 .   U R                   nU R                  u  pEpgn[        R                  " U[        R                  U[        R
                  :X  a  [        R
                  O[        R                  S9n	[        R                  " U[        R                  S9n
[        R                  " U[        R                  S9n[        R                  " U[        R                  S9nU[        R                  :X  a  UR                  5       nU[        R
                  :X  a  [        R                  O[        R                  nU" UUUUUUR                  5       U	U
UU5
        U	R                  U5      U
R                  U5      UR                  U5      UR                  U5      S S 4$ )N)r%   r&   r$   )r.   saved_tensorsr0   r5   r6   r;   r8   r4   r2   r   backward_bf16backwardr3   rC   )rD   g_outputg_stater.   rE   rF   rG   rH   rN   g_time_decayg_time_firstg_keyg_valuebackward_funcs                 r   rU   RwkvLinearAttention.backwardx   sF    oo585F5F2
F''11$/5>>$A%..u}}

 ''
%BYBYZ  E4K4KL""58O8OP%--'~~'H:E:W(66]m]v]v!	
 OOK(OOK(HH[!JJ{#
 	
r    NFN)__name__
__module____qualname____firstlineno__staticmethodr?   rU   __static_attributes__r^   r   r   r   r   8   s)    <1 <1| %
 %
r   r   c                    UR                  5       u  pgn[        R                  " U5      nUc  [        R                  " US S 2S4   [        R                  S9n	[        R                  " US S 2S4   [        R                  S9n
[        R                  " US S 2S4   [        R                  S9S-
  nOUu  pn[        R                  " U 5      * n [        U5       GH
  nUS S 2U4   R                  5       nUS S 2U4   n[        R                  " XU-   5      n[        R                  " X-
  5      n[        R                  " X-   U-
  5      nUU	-  UU-  -   nUU
-  U-   nUU-  R                  UR                  5      US S 2U4'   [        R                  " X-   U5      n[        R                  " X-   U-
  5      n[        R                  " UU-
  5      nUU	-  UU-  -   n	UU
-  U-   n
UnGM     U(       d  Ub  XU/nX4$ )Nr   )r&   r)   )
r+   r0   
zeros_liker8   r1   ranger2   maximumrC   r&   )rE   rF   rG   rH   rI   rJ   _
seq_lengthrN   	num_state	den_state	max_statecurrent_indexcurrent_keycurrent_valuemax_for_outpute1e2	numeratordenominatormax_for_states                        r   rwkv_linear_attention_cpury      s    xxzA1c"F}$$SAYemmD	$$SAYemmD	$$SAYemmDtK	*/'	i
 ))J''Jz*!]*+113a./ y
2JKYYy12YY{/.@ANR-%77	9nr)$-$;#?#?#Mq-  i&<kJYYy-=>YY{]23NR-%77	NR'	!	% +( u(y1=r   c           	          [        S XX#4 5       5      nUR                  S5      S:H  n[        b  U(       d  U(       a  [        XX#XES9$ [        R                  XX#XE5      $ )Nc              3   R   #    U  H  oR                   R                  S :g  v   M     g7f)r#   N)r'   r/   ).0ts     r   	<genexpr>(rwkv_linear_attention.<locals>.<genexpr>   s     X3Wa((--6)3Ws   %'r   rI   rJ   )anyr+   r   ry   r   apply)rE   rF   rG   rH   rI   rJ   no_cuda	one_tokens           r   rwkv_linear_attentionr      sZ    XJC3WXXG q I7i(SXtt"((Uaar   c                   @   ^  \ rS rSrSU 4S jjrSS jrSS jrSrU =r$ )	RwkvSelfAttention   c                   > [         TU ]  5         Xl        [        S L=(       a    [        R                  UR
                  :H  n[        5       (       a,  [        5       (       a  U(       d   [        UR
                  5        X l        UR                  nUR                  b  UR                  OUnXPl        [        R                   " ["        R$                  " U5      5      U l        [        R                   " ["        R$                  " U5      5      U l        [        R                   " ["        R$                  " SSU5      5      U l        [        R                   " ["        R$                  " SSU5      5      U l        [        R                   " ["        R$                  " SSU5      5      U l        [        R0                  " S5      U l        [        R4                  " XESS9U l        [        R4                  " XESS9U l        [        R4                  " XESS9U l        [        R4                  " XTSS9U l        g ! [         a    [        R                  S5         GNf = f)Nz9Could not load the custom CUDA kernel for RWKV attention.r   r   r   r   Fbias)super__init__configr   r   r   r   r   r   	Exceptionloggerinfolayer_idrM   attention_hidden_sizer   	Parameterr0   emptyrE   rF   time_mix_keytime_mix_valuetime_mix_receptance	ZeroPad2d
time_shiftLinearrG   rH   
receptancerN   )selfr   r   kernel_loadedrM   r   	__class__s         r   r   RwkvSelfAttention.__init__   s   (4q9I9X9X\b\q\q9q$;$=$=mY$V%:%:; !((,2,H,H,TF((Ze 	 &;",,u{{3H'IJ,,u{{3H'IJLLQ;)GH ll5;;q![+IJ#%<<Aq+0N#O ,,}599[eLYY{N
))KUSii 5O)  YWXYs   (H% %IIc                 p   UR                  S5      S:X  a  Ub  US   S S 2S S 2U R                  4   nO4U R                  U5      nUb   US   S S 2S S 2U R                  4   US S 2S4'   XR                  -  USU R                  -
  -  -   nXR                  -  USU R                  -
  -  -   nXR
                  -  USU R
                  -
  -  -   nU R                  U5      nU R                  U5      n[        R                  " U R                  U5      5      nUb   US S 2S4   US   S S 2S S 2U R                  4'   XdXR4$ Nr   r   r   )r+   r   r   r   r   r   rG   rH   r0   sigmoidr   )r   hiddenrI   shiftedrG   rH   r   s          r   extract_key_value#RwkvSelfAttention.extract_key_value   s2   ;;q>Q5#4Ahq!T]]23Goof-G  %aAt}})< =1(((7a$:K:K6K+LL,,,w!d>Q>Q:Q/RR666AH`H`D`9aa
hhsm

5!]]4??:#>?
,21b5ME!HQ4==(),,r   c           	        ^  T R                  XS9u  pEpbUb  [        U 4S jUSS   5       5      OS n[        T R                  T R                  UUUUS9u  pUbT  US   US   S S 2S S 2T R
                  4'   US   US   S S 2S S 2T R
                  4'   US   US   S S 2S S 2T R
                  4'   T R                  XH-  5      U4$ )	NrI   c              3   N   >#    U  H  oS S 2S S 2TR                   4   v   M     g 7fr`   r   )r|   rO   r   s     r   r~   ,RwkvSelfAttention.forward.<locals>.<genexpr>  s     FIqaDMM12Is   "%r(   r   r   r   r      )r   tupler   rE   rF   r   rN   )	r   r   rI   	use_cacher   rG   rH   layer_staterwkvs	   `        r   r?   RwkvSelfAttention.forward  s    (,(>(>v(>(S%
JOJ[eFE!"IFFae1OOOO"
 ",7NE!HQ4==(),7NE!HQ4==(),7NE!HQ4==(){{:,-u44r   )r   r   rG   r   rN   r   rE   rF   r   r   r   r   rH   r   r`   r_   )	ra   rb   rc   rd   r   r   r?   rf   __classcell__r   s   @r   r   r      s    P<-&5 5r   r   c                   6   ^  \ rS rSrSU 4S jjrSS jrSrU =r$ )RwkvFeedForwardi  c                 8  > [         TU ]  5         Xl        X l        UR                  nUR
                  b  UR
                  OSUR                  -  n[        R                  " S5      U l        [        R                  " [        R                  " SSU5      5      U l        [        R                  " [        R                  " SSU5      5      U l        [        R                  " X4SS9U l        [        R                  " X3SS9U l        [        R                  " XCSS9U l        g )Nr   r   r   Fr   )r   r   r   r   rM   intermediate_sizer   r   r   r   r0   r   r   r   r   rG   r   rH   )r   r   r   rM   r   r   s        r   r   RwkvFeedForward.__init__   s     (((.(@(@(LF$$RSV\VhVhRh 	 ,,}5LLQ;)GH#%<<Aq+0N#O 99[%H))K5IYY0EJ
r   c                    UR                  S5      S:X  a  Ub  US   S S 2S S 2U R                  4   nO4U R                  U5      nUb   US   S S 2S S 2U R                  4   US S 2S4'   XR                  -  USU R                  -
  -  -   nXR                  -  USU R                  -
  -  -   n[
        R                  " [
        R                  " U R                  U5      5      5      nU R                  U5      n[
        R                  " U R                  U5      5      nUb   US S 2S4   US   S S 2S S 2U R                  4'   XV-  U4$ r   )r+   r   r   r   r   r0   squarerelurG   rH   r   r   )r   r   rI   r   rG   r   rH   s          r   r?   RwkvFeedForward.forward1  s#   ;;q>Q5#4Ahq!T]]23Goof-G  %aAt}})< =1(((7a$:K:K6K+LL666AH`H`D`9aa
ll5::dhhsm45

3]]4??:#>?
,21b5ME!HQ4==()!5((r   )r   rG   r   r   r   r   r   rH   r   r`   ra   rb   rc   rd   r   r?   rf   r   r   s   @r   r   r     s    K") )r   r   c                   2   ^  \ rS rSrU 4S jrSS jrSrU =r$ )	RwkvBlockiE  c                   > [         TU ]  5         Xl        X l        US:X  a.  [        R
                  " UR                  UR                  S9U l        [        R
                  " UR                  UR                  S9U l	        [        R
                  " UR                  UR                  S9U l
        [        X5      U l        [        X5      U l        g )Nr   )eps)r   r   r   r   r   	LayerNormrM   layer_norm_epsilonpre_lnln1ln2r   	attentionr   feed_forward)r   r   r   r   s      r   r   RwkvBlock.__init__F  s     q=,,v'9'9v?X?XYDK<< 2 28Q8QR<< 2 28Q8QR*6<+F=r   c                    U R                   S:X  a  U R                  U5      nU R                  U R                  U5      X#S9u  pRX-   nU R	                  U R                  U5      US9u  pbX-   nX4nU(       a  Xu4-  nU$ US-  nU$ )Nr   )rI   r   r   r`   )r   r   r   r   r   r   )r   r   rI   r   output_attentionsr   r   outputss           r   r?   RwkvBlock.forwardT  s    ==A[[(F>>$((6*:%>]	#"//0@/N&/|#G  wGr   )r   r   r   r   r   r   r   )NFFr   r   s   @r   r   r   E  s    > r   r   c                       \ rS rSr% \\S'   SrS/rSS/rSr	Sr
\R                  " 5       S\R                  4S	 j5       rS
rg)RwkvPreTrainedModelig  r   r   r   rE   rF   Tmodulec           	         [        U[        5      (       Ga  UR                  nUR                  R                  nUR                  R
                  nUR                  nX#S-
  -  nSX#-  -
  n[        R                  " [        U5       Vs/ s H  oU-  PM	     snUR                  R                  UR                  R                  S9n	U	SSSS24   n	[        U5       V
s/ s H  n
SSXS-
  -  SSU-  -   -  -  -   PM     nn
[        R                  " XR                  R                  UR                  R                  S9n[        R                  " [        U5       Vs/ s H  oS-   S	-  S-
  PM     snUR                  R                  UR                  R                  S9S
-  n[        R                   " UR                  U5        [        R                   " UR                  [        R"                  " UR                  [$        R&                  " S5      -  U-   5      5        [        R                   " UR                  [        R(                  " X5      5        [        R                   " UR*                  [        R(                  " X5      SU-  -   5        [        R                   " UR,                  [        R(                  " U	S
U-  5      5        g[        U[.        5      (       Ga  UR                  nUR                  R                  nUR                  R
                  nSX#-  -
  n[        R                  " [        U5       Vs/ s H  oU-  PM	     snUR                  R                  UR                  R                  S9n	U	SSSS24   n	[        R                   " UR                  [        R(                  " X5      5        [        R                   " UR,                  [        R(                  " X5      5        g[        U[0        R2                  5      (       a  UR4                  R6                  nSnSnUR8                  b   [        R:                  " UR8                  5        US   US   :  a  [$        R<                  " US   US   -  5      nUS   U R                  R>                  :X  a  US   U R                  R
                  :X  a  S
nX-  n[        R@                  " UR4                  US9  g[        U[0        RB                  5      (       a_  UR4                  R6                  nS[$        R<                  " [E        US   US   5      5      -  n[        R@                  " UR4                  US9  g[        U[0        RF                  5      (       aA  [        RH                  " UR4                  5        [        R:                  " UR8                  5        ggs  snf s  sn
f s  snf s  snf )zInitialize the weights.r   g      ?r&   r'   N   gffffff?g?r   g      ?g333333?r   )gaing-C6?)%
isinstancer   r   r   num_hidden_layersrM   r   r0   tensorri   r   r&   r'   rE   rF   initcopy_	ones_likemathlogpowr   r   r   r   r   weightshaper   zeros_sqrt
vocab_sizeorthogonal_	Embeddingmaxr   ones_)r   r   r   r   rM   r   ratio_0_to_1ratio_1_to_almost0itime_weighthdecay_speedzigzagr   r   scales                   r   _init_weights!RwkvPreTrainedModel._init_weightsp  su    f/00H & ? ? --33K$*$@$@!#1'<=L!$(D!E,,*/*<=*<Q[*<=))//**11K
 &dD!m4K 455A Q!q89sS<EW?WXXX5    ,,{:K:K:Q:QZ`ZkZkZrZrsK.34I.JK.J!eq[1_.JK ++11!,,33
   JJv((+6JJv((%//&:K:KdhhWZm:[^d:d*efJJv**EIIk,VWJJv,,eii.X[^am[m.mnJJv11599[#PbJb3cd00H & ? ? --33K!$(D!E,,*/*<=*<Q[*<=))//**11K
 &dD!m4KJJv**EIIk,VWJJv11599[3]^		**MM''EDE{{&FKK(Qx%("yyqE!H!45Qx4;;111eAh$++BYBY6YMDV]]6--MM''E$))Ca%($;<<DV]]6--JJv}}%KK$ .w > L* >s   V4 V9V>2Wr^   N)ra   rb   rc   rd   r   __annotations__base_model_prefix_no_split_modules_keep_in_fp32_modulessupports_gradient_checkpointing_is_statefulr0   no_gradr   Moduler   rf   r^   r   r   r   r   g  sR    $)<8&*#L
]]_I%BII I% I%r   r   z+
    Class for the RWKV model outputs.
    )custom_introc                       \ rS rSr% SrSr\R                  S-  \S'   Sr	\
\R                     S-  \S'   Sr\\R                  S4   S-  \S'   Sr\\R                  S4   S-  \S'   S	rg)

RwkvOutputi  z
state (list of five `torch.FloatTensor` of shape `(batch_size, hidden_size, num_hidden_layers)`):
    The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
    avoid providing the old `input_ids`.
Nlast_hidden_staterI   .hidden_states
attentionsr^   )ra   rb   rc   rd   __doc__r  r0   FloatTensorr   rI   listr  r   r  rf   r^   r   r   r   r     sw     37u((4/6,0E4!!"T)0:>M5**C/047>7;Je'',-4;r   r   zK
    Base class for causal language model (or autoregressive) outputs.
    c                       \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Sr
\\R                     S-  \S'   Sr\\R                  S4   S-  \S'   Sr\\R                  S4   S-  \S	'   S
rg)RwkvCausalLMOutputi  aP  
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
    Language modeling loss (for next-token prediction).
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
    Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
state (list of five `torch.FloatTensor` of shape `(batch_size, hidden_size, num_hidden_layers)`):
    The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
    avoid providing the old `input_ids`.
NlosslogitsrI   .r  r  r^   )ra   rb   rc   rd   r  r	  r0   r  r   r
  rI   r  r  r   r  rf   r^   r   r   r  r    s     &*D%

d
")'+FE$+,0E4!!"T)0:>M5**C/047>7;Je'',-4;r   r  c                     ^  \ rS rSrU 4S jrS rS r\        SS\R                  S-  S\R                  S-  S\R                  S-  S	\\R                     S-  S
\S-  S\S-  S\S-  S\S-  S\\-  4S jj5       rS rS rSrU =r$ )	RwkvModeli  c           
        > [         TU ]  U5        [        R                  " UR                  UR
                  5      U l        [        R                  " [        UR                  5       Vs/ s H  n[        XS9PM     sn5      U l        [        R                  " UR
                  5      U l        SU l        SU l        U R!                  5         g s  snf )Nr   F)r   r   r   r   r   rM   
embeddings
ModuleListri   r   r   blocksr   ln_outlayers_are_rescaledgradient_checkpointing	post_init)r   r   idxr   s      r   r   RwkvModel.__init__  s     ,,v'8'8&:L:LMmmPUV\VnVnPo$pPoYv%DPo$pqll6#5#56#( &+# 	 %qs   (Cc                     U R                   $ r`   r  r   s    r   get_input_embeddingsRwkvModel.get_input_embeddings  s    r   c                     Xl         g r`   r  r   new_embeddingss     r   set_input_embeddingsRwkvModel.set_input_embeddings  s    (r   N	input_idsattention_maskinputs_embedsrI   r   r   output_hidden_statesreturn_dictreturnc	           	         Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UO(U R                  (       d  U R                   R                  OSnUb  UOU R                   R
                  nUb  [        R                  S5        U R                  U R                  :X  a  U R                  5         Ub  Ub  [        S5      eUc  Uc  [        S5      eUc  U R                  U5      nU(       a  Uc  UR                  S5      U R                   R                  U R                   R                  4n
[        S5       Vs/ s HC  n[         R"                  " XS::  a  UR$                  O[         R&                  UR(                  S	.6PME     nnUS
==   S-  ss'   U R*                  (       a/  U R                  (       a  U(       a  [        R                  S5        SnUnU(       a  SOSnU(       a  SOSn[-        U R.                  5       H{  u  nnU" XXVS9u  pnU R                  (       a?  U R                   R0                  S:  a%  US-   U R                   R0                  -  S:X  a  US-  nU(       a  X4-   nU(       d  Mu  UU4-   nM}     U R3                  U5      nU(       a  X4-   nU(       d  [5        S XX4 5       5      $ [7        UUUUS9$ s  snf )a(  
input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
    `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
    `past_key_values.get_seq_length()` (`sequence_length` of input past key value states). Indices of input
    sequence tokens in the vocabulary.

    If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
    `input_ids`.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are input IDs?](../glossary#input-ids)
state (tuple of five `torch.FloatTensor` of shape `(batch_size, hidden_size, num_hidden_layers)`, *optional*):
    If passed along, the model uses the previous state in all the blocks (which will give the output for the
    `input_ids` provided as if the model add `state_input_ids + input_ids` as context).
use_cache (`bool`, *optional*):
    If set to `True`, the last state is returned and can be used to quickly generate the next logits.
NFz<`attention_mask` was passed, but it is unused in this model.zDYou cannot specify both input_ids and inputs_embeds at the same timez5You have to specify either input_ids or inputs_embedsr      r   r   r   gꌠ9Y>)FzZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...r^   )rI   r   r   r(   c              3   .   #    U  H  oc  M  Uv   M     g 7fr`   r^   )r|   xs     r   r~   $RwkvModel.forward.<locals>.<genexpr>`  s     t$bq$bs   	)r  rI   r  r  )r   r   r$  trainingr   r%  r   warning_oncer  _rescale_layersr,   r  r+   rM   r   ri   r0   r7   r&   r8   r'   r  	enumerater  rescale_everyr  r   r   )r   r!  r"  r#  rI   r   r   r$  r%  kwargsr   r   r  all_self_attentionsall_hidden_statesr  blockr  s                     r   r?   RwkvModel.forward  s   @ 2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IZ^ZgZgT[[=R=Rmr	%0%<k$++BYBY% ^_==D444  " ]%>cdd=#8TUU  OOI6M"''*DKK,C,CT[[EbEbcE
 q	 "A a-"5"5U]][h[o[o "	   !HH&&4==##p "	%$5b4"6BD#DKK0JC/4i0,M*
 ((KK--11W 9 99Q> - 1#$58H$H!  &9ZM&I#! 1$ M2 14D Dt];L$bttt++*	
 	
[s   A
Kc           	         U R                   U R                  (       + :X  a  g U R                  R                  S:  Ga  [        R
                  " 5          [        U R                  5       GH  u  pU R                  (       a  UR                  R                  R                  R                  S[        XR                  R                  -  5      -  5        UR                  R                  R                  R                  S[        XR                  R                  -  5      -  5        M  [        UR                  R                  R                  S5      (       a  UR                  R                  R                  R                   R#                  S[        XR                  R                  -  5      -  5        UR                  R                  R                  R                   R#                  S[        XR                  R                  -  5      -  5        GM  [        UR                  R                  R                  S5      (       aO  U R%                  UR                  R                  U5        U R%                  UR                  R                  U5        GM!  UR                  R                  R                  R#                  S[        XR                  R                  -  5      -  5        UR                  R                  R                  R#                  S[        XR                  R                  -  5      -  5        GM     S S S 5        U R                  (       + U l         g ! , (       d  f       N%= f)Nr   r(   SCBquant_state)r  r,  r   r0  r0   r   r/  r  r   rN   r   mul_intr   rH   hasattrr7  div_ _bnb_4bit_dequantize_and_rescale)r   block_idr4  s      r   r.  RwkvModel._rescale_layersi  s?   ##DMM(9:;;$$q('0'=OH}}..55::1HP[P[PiPiDi@j;jk**0077<<Q#hR]R]RkRkFkBl=lm #5??#9#9#@#@%HH!OO2299==BB1HXcXcXqXqLqHrCrs!..44;;??DDQ#hZeZeZsZsNsJtEtu$U__%;%;%B%BMRR AA%//BXBXZbc AA%BTBTBZBZ\de!OO2299>>qCT_T_TmTmHmDn?no!..44;;@@c(VaVaVoVoJoFpApq (> !" (,}}#4 # !s   KM
Mc                    [        5       (       d  [        S5      eSSKnUR                  R	                  UR
                  R                  UR
                  R                  5      nUR                  S[        X R                  R                  -  5      -  5        UR                  R                  UR                  S5      SS9R                  UR                  5      n[!        USU5        g)	z
Perform the dequantization and rescaling of the weights of a given layer. After that operation the layer will
be quantized again.
z/Please install bitsandbytes to use this method.r   Nr(   cpuF)requires_gradr   )r   r   bitsandbytes
functionaldequantize_4bitr   datar8  r<  r:  r   r0  r   
Params4bitrC   r'   setattr)r   target_layerr>  bnbdequant_weightsquant_weights         r   r=  *RwkvModel._bnb_4bit_dequantize_and_rescale  s    
 )**OPP"..889L9L9Q9QS_SfSfSrSrsQ#h++2K2K&K"LLM vv((););E)BRW(X[[\k\r\rsh5r   )r  r  r  r  r  )NNNNNNNN)ra   rb   rc   rd   r   r  r  r   r0   
LongTensorr  r  boolr   r   r?   r.  r=  rf   r   r   s   @r   r  r    s    )  .2262604!%)-,0#'h
##d*h
 ((4/h
 ((4/	h

 E%%&-h
 $;h
  $;h
 #Tkh
 D[h
 
	h
 h
T506 6r   r  z
    The RWKV Model transformer with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    c                   T  ^  \ rS rSrSS0rU 4S jrS rS r\          SS\	R                  S-  S	\	R                  S-  S
\	R                  S-  S\\	R                     S-  S\	R                  S-  S\S-  S\S-  S\S-  S\S-  S\\	R                  -  S\\-  4S jj5       rSrU =r$ )RwkvForCausalLMi  zhead.weightzrwkv.embeddings.weightc                    > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  UR                  SS9U l        U R                  5         g )NFr   )
r   r   r  r   r   r   rM   r   headr  )r   r   r   s     r   r   RwkvForCausalLM.__init__  sH     f%	IIf00&2C2C%P	 	r   c                     U R                   $ r`   rS  r  s    r   get_output_embeddings%RwkvForCausalLM.get_output_embeddings  s    yyr   c                     Xl         g r`   rV  r  s     r   set_output_embeddings%RwkvForCausalLM.set_output_embeddings  s    "	r   Nr!  r"  r#  rI   labelsr   r   r$  r%  logits_to_keepr&  c           
         U	b  U	OU R                   R                  n	U R                  UUUUUUU	S9nUS   n[        U
[        5      (       a  [        U
* S5      OU
nU R                  USS2USS24   5      nSnUb)  U R                  " SXU R                   R                  S.UD6nU	(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  UR                  S9$ )a  
input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
    `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
    `past_key_values.get_seq_length()` (`sequence_length` of input past key value states). Indices of input
    sequence tokens in the vocabulary.

    If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
    `input_ids`.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are input IDs?](../glossary#input-ids)
state (tuple of five `torch.FloatTensor` of shape `(batch_size, hidden_size, num_hidden_layers)`, *optional*):
    If passed along, the model uses the previous state in all the blocks (which will give the output for the
    `input_ids` provided as if the model add `state_input_ids + input_ids` as context).
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
    `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
    are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
use_cache (`bool`, *optional*):
    If set to `True`, the last state is returned and can be used to quickly generate the next logits.
N)r#  rI   r   r   r$  r%  r   )r
  r\  r   r   )r	  r
  rI   r  r  r^   )r   r%  r   r   r:  slicerS  loss_functionr   r  rI   r  r  )r   r!  r"  r#  rI   r\  r   r   r$  r%  r]  r1  rwkv_outputsr  slice_indicesr
  r	  rN   s                     r   r?   RwkvForCausalLM.forward  s   L &1%<k$++BYBYyy'/!5# ! 
 %Q8B>SV8W8W~ot4]k=M1)<=>%%pVt{{OeOepiopDYab!11F)-)9TGf$EvE!$$&44#..
 	
r   )rS  r   )
NNNNNNNNNr   )ra   rb   rc   rd   _tied_weights_keysr   rW  rZ  r   r0   rN  r  r  rO  r:  Tensorr   r  r?   rf   r   r   s   @r   rQ  rQ    s(    ()AB#  .2262604*.!%)-,0#'-.D
##d*D
 ((4/D
 ((4/	D

 E%%&-D
   4'D
 $;D
  $;D
 #TkD
 D[D
 ell*D
 
#	#D
 D
r   rQ  )rQ  r  r   r_   )-r  r   dataclassesr   r0   r    r   r   
generationr   modeling_layersr   modeling_utilsr	   utilsr
   r   r   r   r   r   r   configuration_rwkvr   
get_loggerra   r   r   r   autogradFunctionr   ry   r   r   r   r   r   r   r   r  r  rQ  __all__r^   r   r   <module>rq     s}     !   & ) 9 -   + 
		H	%  5g
%..11 g
T)XbC5		 C5L#)bii #)L* D R%/ R% R%j 
 
< 
< 
< 
 < < <$ k6# k6 k6\ V
)? V
V
r Br   