
    Z j                        S r SSKrSSKJr  SSKrSSKJr  SSKJr  SSK	J
r
  SSKJrJr  SS	KJr  SS
KJr  SSKJr  SSKJr  SSKJrJrJrJr  SSKJr  SSKJr  \R>                  " \ 5      r!S\RD                  S\#4S jr$S r%S r&S r' " S S\R
                  RP                  5      r) " S S\RP                  5      r* " S S\RP                  5      r+ " S S\5      r,\ " S S \5      5       r-\\" S!S"9 " S# S$\5      5       5       r.\\" S%S"9 " S& S'\5      5       5       r/\ " S( S)\-5      5       r0\" S*S"9 " S+ S,\-\5      5       r1/ S-Qr2g).zPyTorch MAMBA2 model.    N)	dataclass)nn   )initialization)ACT2FN)CacheDynamicCache)GenerationMixin)lazy_load_kernel)GradientCheckpointingLayer)PreTrainedModel)ModelOutputauto_docstringis_torchdynamo_compilinglogging)resolve_internal_import   )Mamba2Configinput_tensorpad_sizec                     [        U R                  5      S:X  a
  SSSSSUSS4OSSSUSS4n[        R                  R                  R                  XSSS9$ )zv
Padding x tensor with `pad_size` on the seq_len dim (dim=1)

Assumes that we only have tensors of either size 4 or 3
   r   constant)modevalue)lenshapetorchr   
functionalpad)r   r   	pad_shapes      {/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/mamba2/modeling_mamba2.pypad_tensor_by_sizer#   (   sd     47|7I7I3Ja3OAq!Q!Q/VWYZ\]_gijlmUnI88""<ST"UU    c                    [        X5      n [        U R                  5      S:X  a-  U R                  U R                  S   SX R                  S   5      $ U R                  U R                  S   SX R                  S   U R                  S   5      $ )z
Padding input_tensor with `pad_size` on the seq_len dim (dim=1) and
simultaneously splitting it into chunk sequences.

Assumes that we only have tensors of either size 4 or 3
r   r      )r#   r   r   reshape)r   r   
chunk_sizes      r"   reshape_into_chunksr*   3   s     &l=L
<!###L$6$6q$92zK]K]^_K`aa ##q!2z3E3Ea3H,J\J\]^J_
 	
r$   c           	      
   U R                  S5      nU S   R                  " / U R                  5       QUP76 n [        R                  " [        R                  " XU R
                  [        R                  S9SS9nU R                  U) S5      n [        R                  " U SS9n[        R                  " [        R                  " XU R
                  [        R                  S9SS9nUR                  U) [        R                  * 5      nU$ )zg
More stable segment sum calculation. Uses cumulative sums and masking instead of direct subtractions.
r&   .Ndevicedtype)diagonalr   dim)
sizeexpandr   trilonesr.   boolmasked_fillcumsuminf)r   r)   masktensor_segsums       r"   segment_sumr>   G   s     ""2&J  	*11S<3D3D3FS
SL::ejj@S@S[`[e[efqstD++TE15LLL26M ::ejj@S@S[`[e[efqrsD!--teeiiZ@Mr$   c                     UbO  UR                   S   S:  a<  UR                   S   S:  a)  U R                  nXSS2SS2S4   -  R                  U5      n U $ )ze
Tunes out the hidden states for padding tokens, see https://github.com/state-spaces/mamba/issues/66
Nr   r   )r   r/   to)hidden_statesattention_maskr/   s      r"   apply_mask_to_padding_statesrC   [   s_    
 !n&:&:1&=&AnFZFZ[\F]`aFa##&1d
)CCGGNr$   c                   6   ^  \ rS rSrSU 4S jjrSS jrSrU =r$ )MambaRMSNormGatedg   c                    > [         TU ]  5         [        R                  " [        R
                  " U5      5      U l        X l        g Nsuper__init__r   	Parameterr   r7   weightvariance_epsilonselfhidden_sizeeps	__class__s      r"   rK   MambaRMSNormGated.__init__h   s-    ll5::k#:; #r$   c                    UR                   nUR                  [        R                  5      nUb?  U[        R
                  R                  UR                  [        R                  5      5      -  nUR                  S5      R                  SSS9nU[        R                  " X@R                  -   5      -  nU R                  UR                  U5      -  $ Nr'   r&   T)keepdim)r/   r@   r   float32r   r   silupowmeanrsqrtrN   rM   )rP   rA   gateinput_dtypevariances        r"   forwardMambaRMSNormGated.forwardm   s    #))%((7)BMM,>,>twwu}}?U,VVM $$Q',,R,>%H?T?T4T(UU{{]--k:::r$   rN   rM   gư>rH   __name__
__module____qualname____firstlineno__rK   r`   __static_attributes____classcell__rS   s   @r"   rE   rE   g   s    $
	; 	;r$   rE   c                   L  ^  \ rS rSrSrSS\S\S\4U 4S jjjr\	R                  " 5       S 5       r  SS	\	R                  S
\S-  S\	R                  S-  4S jjr  SS	\	R                  S
\S-  S\	R                  S-  4S jjr  SS
\S-  S\	R                  S-  4S jjrSrU =r$ )Mamba2Mixery   uo  
Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`.
A, D are input independent (see Mamba paper [1] Section 3.5.2 "Interpretation of A" for why A isn't selective)
∆, B, C are input-dependent (this is a key difference between Mamba and the linear time invariant S4,
and is why Mamba is called **selective** state spaces)
config	layer_idxinitialize_mixer_weightsc           	      j  > [         TU ]  5         UR                  U l        UR                  U l        UR                  U l        UR                  U l        [        UR                  U R                  -  5      U l
        [        UR                  5      U l        X l        UR                  U l        UR                  U l        [         UR                     U l        UR$                  U l        UR&                  U l        UR(                  U l        UR*                  U l        UR,                  U l        UR.                  U l        UR0                  U l        UR2                  U l        UR4                  U l        U R                  SU R(                  -  U R
                  -  -   U l        [8        R:                  " U R6                  U R6                  UR                  UR                  U R6                  UR                  S-
  S9U l        U R                  U R6                  -   U R                  -   n[8        R>                  " U R                  UUR@                  S9U l!        [8        RD                  " [F        RH                  " U R                  5      5      U l%        [8        RD                  " [F        RH                  " U R                  5      5      U l&        [O        U R                  U R$                  S9U l(        [8        RD                  " [F        RH                  " U R                  5      5      U l)        U(       a4  U RJ                  RT                  RV                  S:w  a  U RY                  5         [8        R>                  " U R                  U R                  UR@                  S9U l-        UR@                  U l         []        S5      n[_        USS 5      q0[_        US	S 5      q1[]        S
5      n[e        USS9q3[e        USS9q4[e        USS9q5[m        [f        [h        [j        [b        [`        45      q7[n        (       d  [p        Rs                  S5        g g )Nr'   r   )in_channelsout_channelsbiaskernel_sizegroupspaddingru   rR   metazcausal-conv1dcausal_conv1d_updatecausal_conv1d_fnz	mamba-ssmz8ops.triton.selective_state_update.selective_state_update)chained_pathz1ops.triton.ssd_combined.mamba_chunk_scan_combinedz8ops.triton.ssd_combined.mamba_split_conv1d_scan_combineda  The fast path is not available because one of `(selective_state_update, causal_conv1d_fn, causal_conv1d_update)` is None. Falling back to the naive implementation. To install follow https://github.com/state-spaces/mamba/#installation and https://github.com/Dao-AILab/causal-conv1d):rJ   rK   	num_headsrQ   
state_sizessm_state_sizeconv_kernelconv_kernel_sizeintr5   intermediate_sizetime_step_rankrp   use_conv_bias
hidden_act
activationr   actlayer_norm_epsilonrms_normn_groupshead_dimr)   time_step_limittime_step_mintime_step_maxtime_step_floorconv_dimr   Conv1dconv1dLinearuse_biasin_projrL   r   emptydt_biasA_logrE   normDr.   typeinit_mamba2_weightsout_projr   getattrr|   r}   r   selective_state_updatemamba_chunk_scan_combined mamba_split_conv1d_scan_combinedallis_fast_path_availableloggerwarning_once)rP   ro   rp   rq   projection_sizecausal_conv1d	mamba_ssmrS   s          r"   rK   Mamba2Mixer.__init__   sE   ))!--$// & 2 2!$V]]T5E5E%E!F!&"7"78"#11 ++&++,"(";"; ++%55#11#11%55..T]]1BTEXEX1XXii%%**==&&*
 004==@4>>Qyy
 ||EKK$?@ \\%++dnn"=>
%d&<&<$BYBYZ	ekk$..9:#(;(;(@(@F(J$$&		$"8"8$:J:JQWQ`Q`a )9&}6LdS"=2DdK %[1	!8$^"
 %<$W%
! ,C$^,
(
 "%&)0 $"
 &%> &r$   c                 z   [         R                  " SU R                  S-   U R                  R                  [         R
                  S9n[        R                  " U R                  [         R                  " U5      5        [        R                  " U R                  5        [         R                  " [         R                  " U R                  U R                  R                  [         R
                  S9[        R                  " U R                  5      [        R                  " U R                   5      -
  -  [        R                  " U R                   5      -   5      R#                  U R$                  S9nU[         R                  " [         R&                  " U* 5      * 5      -   n[        R                  " U R                  U5        g )Nr   r-   )min)r   aranger   r   r.   rX   initcopy_logones_r   exprandr   mathr   r   clampr   expm1)rP   Adtinv_dts       r"   r   Mamba2Mixer.init_mamba2_weights   s   LLDNNQ.tzz7H7HPUP]P]^

4::uyy|,

466YYJJt~~dll.A.AWxx**+dhht7I7I.JJLhht))*+
 %D((%
)	 	 eiibS!1 122

4<<(r$   NrA   cache_paramsrB   c                    [        X5      nU R                  U5      nUR                  u  pVnU R                  U R                  -  nUR                  S   SU R
                  -  -
  SU R                  -  U R                  -  -
  U R                  -
  S-  n	UGb  UR                  U R                  5      (       Ga  UR                  S5      R                  XU R
                  U R                  U R                  /SS9u    pzp[        UUR                  U R                     R                  U R                  R                   R                  S5      U R                  R"                  U R$                  5      n[&        R                  " UU R
                  X/SS9u  pn[&        R(                  " U R*                  R-                  5       5      * nUS S 2S S4   S S 2S S 2S 4   R/                  SU R0                  U R                  5      R3                  [&        R4                  S9nUS S 2S S 2S 4   R/                  SSU R0                  5      nU R6                  S S 2S S4   R/                  SU R0                  5      nU R8                  S S 2S S4   R/                  SU R0                  5      nUR;                  XPR                  UR                  S   U R                  -  5      nUR;                  XPR                  UR                  S   U R                  -  5      nUR;                  XPR                  U R0                  5      n[=        UR                  U R                     R>                  UUUUUUS USS9
nUR;                  XPR                  U R0                  -  5      nU RA                  X5      nU RC                  U5      S S 2S S4   nU$ [&        R(                  " U R*                  R-                  5       5      * nU RD                  S	[-        S
5      4:X  a  0 OSU RD                  0nU RF                  (       a  Uc  [I        UU R                  R                   R                  S5      U R                  R"                  U R6                  U4U R8                  U RJ                  S U R$                  U R@                  R                   U R@                  RL                  U RB                  R                   U RB                  R"                  U R0                  U R                  SSS.UD6nU$ UR                  XU R
                  U R                  U R                  /SS9u    pzpUbh  URO                  SS5      n[P        RR                  RU                  UU RV                  UR                  S   -
  S45      nURY                  UU R                  S9nU R$                  S;  aH  U R[                  U R                  URO                  SS5      5      SS U24   RO                  SS5      5      nOm[]        URO                  SS5      U R                  R                   R                  S5      U R                  R"                  U R$                  S9RO                  SS5      n[        X5      n[&        R                  " UU R
                  X/SS9u  pn[_        UR;                  XVSU R0                  5      UUUR;                  XVU R                  S5      UR;                  XVU R                  S5      4U RJ                  U R8                  S S SU R6                  SS.UD6u  nnUb  Ub  URa                  UU R                  S9  UR;                  XVS5      nU RA                  UU
5      nU RC                  U5      nU$ )Nr&   r'   r   r2   .r/   T)zr   dt_softplusg        r;   dt_limitF)r   r)   seq_idxr   rmsnorm_weightrmsnorm_epsoutproj_weightoutproj_biasheaddimngroupsnorm_before_gatereturn_final_statesr   rp   )rY   swish)xrM   ru   r   )r)   r   r   r   r   r   r   )1rC   r   r   r   r   r   r   has_previous_staterp   squeezesplitr   r|   layersconv_statesr   rM   ru   r   r   r   r   floatr5   r   r@   rX   r   r   viewr   recurrent_statesr   r   r   trainingr   r)   rN   	transposer   r   r    r   update_conv_stater   r}   r   update_recurrent_state)rP   rA   r   rB   projected_states
batch_sizeseq_len_groups_time_state_sized_mlpr]   hidden_states_B_Cr   BCr   r   r   hidden_states_reshapedoutdt_limit_kwargshidden_states_B_C_transposedr   scan_output	ssm_states                            r"   cuda_kernels_forward Mamba2Mixer.cuda_kernels_forward   s    5]S<<6 "/!4!4
Q!%1D1D!D""2&$((()$--$"5"556 nn  #(G(G(W(W0@0H0H0K0Q0Qt55t}}dnnU[] 1R 1-Aq)
 !5!##DNN3??""**1-  ! #(++!'')?X#Ma 4::++-..A!T3,1d
+222t}}dFYFYZ]]didqdq]rAAq$J&&r2t}}=Bll1dC<077DMMJGq$|$++B>Az==!''!*2MNAz==!''!*2MNA%2%7%7
NNTXTaTa%b"2##DNN3DD& M *..z>>DMM;YZM IIm:M --.q$|<Cv 
o 4::++-..A$($8$8S%,<O$ObV`bfbvbvUwO }}!56$KK&&..q1KK$$LL ff# ##'99#3#3 $		 : :#'==#7#7!%!3!3 MM MM%*(-#$ &%d 
y 5E4J4J4#9#94==$..Y_a 5K 511-  +3D3N3NqRS3T0"$--"3"34..1M1S1STV1WWYZ[#K #/"@"@X\XfXf"@"gK??*;;(,$5$?$?1$EFsHWH}U__`acde)% )9+55a;#{{1199!<![[--#'??	)
  i1o & %AAR$c!&+kk%++-C\'#! *C!&&zBNFF:rBFF:rB*  $ff (, LL $* &*&Y" (\-E 77	T^^7\)..zBG"iiT: mmK0
r$   c                    UR                   u  pEnUR                  n[        X5      nU R                  U5      nUR                   S   SU R                  -  -
  SU R
                  -  U R                  -  -
  U R                  -
  S-  n	UR                  XU R                  U R                  U R                  /SS9u    pjpUR                  SS5      nUS L=(       a    UR                  U R                  5      nU(       a  UR                  XR                  S9n[        R                  " XR                   R"                  R%                  S5      -  SS9nU R&                  (       a  XR                   R(                  -   nU R+                  U5      nOUbT  [,        R.                  R1                  XR2                  UR                   S   -
  S45      nUR                  XR                  S9  U R+                  U R!                  U5      SS U24   R                  SS5      5      n[        X5      n[        R                  " UU R                  U R
                  U R                  -  U R
                  U R                  -  /SS9u  pn[        R4                  " U R6                  R9                  5       5      * nU(       Ga  UR:                  U R                     R<                  nUS S 2SS S 24   S S 2S S4   nUR                  SS5      R?                  XLR                   S   U R@                  5      nU RB                  S   R?                  U RB                  R                   S   U R@                  5      n[        R,                  R.                  RE                  UURG                  UR                  5      -   5      n[        RH                  " XRJ                  S   U RJ                  S   5      nUS	   R?                  U R                  U R@                  U R                  5      RG                  [        RL                  S
9n[        R4                  " US   U-  5      RG                  US9nURO                  X@R
                  S5      SS S S 24   nUR?                  X@R
                  U R                  U R
                  -  UR                   S   5      RQ                  5       nURO                  USUR                   S   5      nUS   USS S S 24   -  nURO                  USU R@                  5      nUUS   -  RG                  US9nUR:                  U R                     RR                  U-  U-   nURU                  UU R                  S9nURO                  X@R
                  S5      SS S S 24   nUR?                  X@R
                  U R                  U R
                  -  UR                   S   5      RQ                  5       nURO                  USUR                   S   5      nURG                  UR<                  UR                  S9nURW                  X@R                  -  U R@                  U R                  5      nURW                  X@R                  -  U R                  S5      n[        RX                  " UU5      nURW                  X@R                  U R@                  5      nU RZ                  S   R?                  U RZ                  R                   S   U R@                  5      nUUU-  -   RG                  UR                  5      nURO                  US5      S S 2S S4   nGO[,        R.                  RE                  XRB                  -   5      n[        RH                  " XRJ                  S   U RJ                  S   5      nURO                  XESU R@                  5      R9                  5       nURO                  XESU R                  5      R9                  5       nURO                  XESU R                  5      R9                  5       nUR]                  U R                  U R
                  -  SU R                  S9nUR]                  U R                  U R
                  -  SU R                  S9nU R^                  XPR^                  -  -
  U R^                  -  nU RZ                  S   [a        UU5      -  nXS   -  nURG                  UR                  5      U-  nUUUU4 Vs/ s H  n[c        UUU R^                  5      PM     snu  nnnnURe                  SSSS5      n[        Rf                  " USS9n[        R4                  " [i        U5      5      n US S 2S S 2S S 2S S S 2S S 24   US S 2S S 2S S S 2S S 2S S 24   -  n!U!R                  SS9n"U"S   U Re                  SSSSS5      S   -  n#U#R                  SS9n$U$S   US S 2S S 2S 4   -  R                  SS9n%[        R4                  " US S 2S S 2S S 2SS 24   U-
  5      n&UU&Re                  SSSS5      S   -  n'U'SS S S 24   US   -  R                  SS9n([        Rj                  " U(S S 2S S24   5      n)[        Rl                  " U)U(/SS9n([        R4                  " [i        [,        R.                  R1                  US S 2S S 2S S 2S4   S5      5      5      n*U*R                  SS5      n*U*S	   U(S S 2S S 2S S4   -  R                  SS9n+U+S S 2S S24   U+S S 2S4   n,n([        R4                  " U5      n-USS S S 24   U(S S 2S S 2S S4   -  n.U-Re                  SSSS5      n/U.R                  S5      U/S   -  n0U%U0-   nURO                  USU R                  U R@                  5      nUU-   nUS:  a  US S 2S U2S S 2S S 24   nURO                  XES5      nU,b  Ub  URU                  U,U R                  S9  U Ro                  UU
5      n1U Rq                  U1RG                  U5      5      n2U2$ s  snf )Nr&   r'   r2   r   r   r   .r,   ).NNr   )r.   r-   )r3   output_sizer   r   r1   )r   r   )9r   r/   rC   r   r   r   r   r   r   r   r   r   rp   r   r   sumr   rM   r   r   ru   r   r   r   r    r   r   r   r   r   r.   r5   r   r   softplusr@   r   r   rX   r(   
contiguousr   r   r   bmmr   repeat_interleaver)   r#   r*   permuter:   r>   
zeros_likecatr   r   )3rP   rA   r   rB   r   r   r   r/   r   r   r]   r   r   is_decodingr   r   r   r   cache_devicer   dAdBdBx
ssm_statesssm_states_reshaped
C_reshapedyr   r   
D_residualtA_cumsumLG_intermediateGM_intermediateMY_diagdecay_statesB_decaystatesprevious_statesdecay_chunk
new_statesr   state_decay_outC_times_statesstate_decay_out_permutedY_offr   contextualized_statess3                                                      r"   torch_forwardMamba2Mixer.torch_forward  s
    "/!4!4
Q## 5]S<<6!''+a$2H2H.HH1t}}K\_c_r_rKrrsw  tB  tB  B  GH  H,<,B,Bt55t~~V\^ -C -
)1% .77!<"$.b<3R3RSWSaSa3b &889JVdVd8eK %		kk0088;;! !!$58H8H$H! $): ; ' mm//%(=(=@Q@W@WXZ@[([]^'_ ..{nn.U $5F)GXgX)V)`)`abde)f g89J[#kk##T]]T5H5H%H$--Z^ZmZmJmn
! YYtzz'')**'..t~~>EEL Aq!GQc\*Ba#**:xx|T]]SBll9-44T\\5G5G5JDMMZG$$--b7::bhh3G.GHBR!5!5a!8$:N:Nq:QRB/"))$..$--I\I\]``glgtgt`uA))ByMA-.22,2GB
 		*mmR8dAFA]]DNNdmm4SUVU\U\]_U`allnA		*b!''"+6AI3a<0B *11*b$--PMi0044L4IC &,,T^^<MMPRRUXXJ%<<ZSWSaSa<bJ 		*mmR8dAFA]]DNNdmm4SUVU\U\]_U`allnA		*b!''"+6A $ahhaggFJ",//*~~2Mt}}^b^q^q"r
^^ ;T=P=PRSTJ		-z:Az>>4==AA y!((a$--HA]Q&&**1773A 		*b)!T3,7A ''\\(9:BR!5!5a!8$:N:Nq:QRB)11*r4==Y__aM		*r43F3FGMMOA		*r43F3FGMMOA##DNNdmm$CX\XfXf#gA##DNNdmm$CX\XfXf#gA'OO*CCtVH	*-?x-XXJ *yM9M](()B.A cpqrtuwxay%zay\]&9!Xt&Way%z"M1a 		!Q1%A||A2.H 		+a.)A q!Qa23a1dAq!8K6LLN""r"*A y\AIIaAq!,DY,OON""r"*A 	l]1a:%>>CCCJF !99XaArsl%;h%FGL,..q"b!<YGGGc4l+mI.FFKKPQKRF $..va!e}=OYY8a@F))K0A0A(1aQRTV;BWY_0`$abK%//15K%o61dC9PPUUZ[U\J *1crc6 2Jq"u4EIF $ii1OT1oq!T30GGN'6'>'>q!Q'J$#''+.Fy.QQE A		*b$..$--HAJA!|a'1a'(		*r2A $)A33I3Xii4(
 !%knnU.C D$$A &{s    !pc                     [         (       aO  SU R                  R                  R                  R                  ;   a!  [        5       (       d  U R                  XU5      $ U R                  XU5      $ )Ncuda)r   r   rM   r.   r   r   r   r  )rP   rA   r   rB   kwargss        r"   r`   Mamba2Mixer.forwardL  sV     "!f0C0C0J0J0O0O&OXpXrXr,,].YY!!-~NNr$   )r   r   r   r   r)   r   r   r   r   r   rQ   r   r   rp   r   r   r   r   r   r   r   r   r   r   r   r   r   r   )TNN)re   rf   rg   rh   __doc__r   r   r8   rK   r   no_gradr   Tensorr   r   r  r`   ri   rj   rk   s   @r"   rm   rm   y   s    [| [ [W[ [ [z ]]_) )$ &*.2	]||] dl] t+	]F &*.2	{%||{% dl{% t+	{%B &*.2		O dl	O t+		O 	Or$   rm   c                   2   ^  \ rS rSrSU 4S jjrS rSrU =r$ )Mamba2RMSNormiX  c                    > [         TU ]  5         [        R                  " [        R
                  " U5      5      U l        X l        g)z=
Mamba2RMSNorm is equivalent to T5LayerNorm and LlamaRMSNorm
NrI   rO   s      r"   rK   Mamba2RMSNorm.__init__Y  s/     	ll5::k#:; #r$   c                    UR                   nUR                  [        R                  5      nUR	                  S5      R                  SSS9nU[        R                  " X0R                  -   5      -  nU R                  UR                  U5      -  $ rV   )	r/   r@   r   rX   rZ   r[   r\   rN   rM   )rP   rA   r^   r_   s       r"   r`   Mamba2RMSNorm.forwarda  sw    #))%((7 $$Q',,R,>%H?T?T4T(UU{{]--k:::r$   rb   rc   rd   rk   s   @r"   r  r  X  s    $; ;r$   r  c                   b   ^  \ rS rSrU 4S jr  SS\S-  S\R                  S-  4S jjrSr	U =r
$ )	Mamba2Blockii  c                    > [         TU ]  5         Xl        X l        UR                  U l        [        UR                  UR                  S9U l        [        XSS9U l
        g )Nrz   F)rp   rq   )rJ   rK   ro   rp   residual_in_fp32r  rQ   r   r   rm   mixer)rP   ro   rp   rS   s      r"   rK   Mamba2Block.__init__j  sO    " & 7 7!&"4"4&:S:ST	 W\]
r$   Nr   rB   c                 
   UnU R                  UR                  U R                   R                  R                  S95      nU R                  (       a  UR                  [
        R                  5      nU R                  XUS9nXQ-   nU$ )Nr   r   rB   )r   r@   rM   r/   r&  r   rX   r'  )rP   rA   r   rB   r  residuals         r"   r`   Mamba2Block.forwardr  sq     !		-"2"29I9I9O9O"2"PQ  {{5==1H

=\j
k 0r$   )ro   rp   r'  r   r&  r  )re   rf   rg   rh   rK   r   r   r  r`   ri   rj   rk   s   @r"   r$  r$  i  s<    ^ &*.2	 dl t+	 r$   r$  c                   `    \ rS rSr% \\S'   SrS/rSrSr	\
R                  " 5       S 5       rSrg)	Mamba2PreTrainedModeli  ro   backboner$  Tc                 ~   U R                   R                  n[        U[        5      (       Ga)  UR	                  5         [
        R                  " UR                  R                  [        R                  " S5      S9  UR                  R                  b*  [
        R                  " UR                  R                  5        [
        R                  " UR                  R                  [        R                  " S5      S9  U R                   R                  (       aC  UR                  R                  nU[        R                  " U R                   R                  5      -  n[        U[         R"                  5      (       aN  [
        R$                  " UR                  US9  UR                  b!  [
        R                  " UR                  5        gg[        U[&        [(        45      (       a!  [
        R*                  " UR                  5        g[        U[         R,                  5      (       a   [
        R$                  " UR                  US9  gg)zInitialize the weights.   )aN)std)ro   initializer_range
isinstancerm   r   r   kaiming_uniform_r   rM   r   sqrtru   zeros_r   rescale_prenorm_residualnum_hidden_layersr   r   normal_r  rE   r   	Embedding)rP   moduler3  ps       r"   _init_weights#Mamba2PreTrainedModel._init_weights  s`    kk++fk** &&(!!&--"6"6$))A,G}}!!-FMM../!!&//"8"8DIIaLI{{33 OO**TYYt{{<<==fbii((LLC0{{&FKK( '0A BCCJJv}}%--LLC0 .r$    N)re   rf   rg   rh   r   __annotations__base_model_prefix_no_split_modulessupports_gradient_checkpointing_is_statefulr   r  r?  ri   rA  r$   r"   r.  r.    s9    "&&*#L
]]_"1 "1r$   r.  z-
    Class for the MAMBA2 model outputs.
    )custom_introc                       \ rS rSr% SrSr\R                  S-  \S'   Sr	\
S-  \S'   Sr\\R                     S-  \S'   Srg)Mamba2Outputi  a   
cache_params (`Cache`):
    The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
    avoid providing the old `input_ids`.

    Includes both the State space model state matrices after the selective scan, and the Convolutional states
Nlast_hidden_stater   rA   rA  )re   rf   rg   rh   r  rJ  r   FloatTensorrB  r   r   rA   tupleri   rA  r$   r"   rI  rI    sG     37u((4/6!%L%$,%59M5**+d29r$   rI  zK
    Base class for causal language model (or autoregressive) outputs.
    c                       \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Sr
\S-  \S'   Sr\\R                     S-  \S'   Srg)	Mamba2CausalLMOutputi  au  
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
    Language modeling loss (for next-token prediction).
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
    Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
cache_params (`Cache`):
    The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
    avoid providing the old `input_ids`.

    Includes both the State space model state matrices after the selective scan, and the Convolutional states
Nlosslogitsr   rA   rA  )re   rf   rg   rh   r  rO  r   rK  rB  rP  r   r   rA   rL  ri   rA  r$   r"   rN  rN    s[    
 &*D%

d
")'+FE$+!%L%$,%59M5**+d29r$   rN  c                      ^  \ rS rSrU 4S jrS rS rS r\       SS\	R                  S-  S\	R                  S-  S	\S-  S
\S-  S\S-  S\S-  S\	R                  S-  S\\-  4S jj5       rSrU =r$ )Mamba2Modeli  c           
        > [         TU ]  U5        [        R                  " UR                  UR
                  5      U l        [        R                  " [        UR                  5       Vs/ s H  n[        XS9PM     sn5      U l        SU l        [        UR
                  UR                  S9U l        U R!                  U R"                  5        U R%                  5         g s  snf )Nr   Frz   )rJ   rK   r   r<  
vocab_sizerQ   
embeddings
ModuleListranger:  r$  r   gradient_checkpointingr  r   norm_f"_register_load_state_dict_pre_hook	load_hook	post_init)rP   ro   idxrS   s      r"   rK   Mamba2Model.__init__  s     ,,v'8'8&:L:LMmmSXY_YqYqSr$sSrC[%GSr$st&+##F$6$6F<U<UV//? %ts   (Cc                 l    U H.  nSU;   d  M  UR                  U5      XR                  SS5      '     g    g )Nz
embedding.zembeddings.)popreplace)rP   
state_dictprefixargsks        r"   r[  Mamba2Model.load_hook  s4    Aq EO^^TUEV
99\=AB r$   c                     U R                   $ rH   rU  rP   s    r"   get_input_embeddings Mamba2Model.get_input_embeddings  s    r$   c                     Xl         g rH   rh  rP   new_embeddingss     r"   set_input_embeddings Mamba2Model.set_input_embeddings  s    (r$   N	input_idsinputs_embedsr   	use_cacheoutput_hidden_statesreturn_dictrB   returnc                    Ub  UOU R                   R                  nUb  UO(U R                  (       d  U R                   R                  OSnUb  UOU R                   R                  nUSL USL-  (       a  [        S5      eUc  U R                  U5      nU R                  (       a  U R                  (       a	  U(       a  SnU(       a  Uc  [        U R                   S9nUn	U(       a  SOSn
U R                   H  nU" U	UUS9n	U(       d  M  X4-   n
M     U R                  U	5      n	U(       a  X4-   n
U(       d  [        S XU
4 5       5      $ [        U	U(       a  UU
S9$ SU
S9$ )	at  
cache_params (`Cache`, *optional*):
    If passed along, the model uses the previous state in all the blocks (which will give the output for the
    `input_ids` provided as if the model add `state_input_ids + input_ids` as context).
use_cache (`bool`, *optional*):
    If set to `True`, the `cache_params` is returned and can be used to quickly generate the next logits.
NFz:You must specify exactly one of input_ids or inputs_embeds)ro   rA  r*  c              3   .   #    U  H  oc  M  Uv   M     g 7frH   rA  ).0vs     r"   	<genexpr>&Mamba2Model.forward.<locals>.<genexpr>1  s     f$Tq$Ts   	)rJ  r   rA   )ro   rt  r   rs  ru  
ValueErrorrU  rX  r	   r   rY  rL  rI  )rP   rq  rr  r   rs  rt  ru  rB   r  rA   all_hidden_statesmixer_blocks               r"   r`   Mamba2Model.forward  s[   ( %9$D $++JjJj 	 "+!6IZ^ZgZgT[[=R=Rmr	%0%<k$++BYBY-t";<YZZ  OOI6M&&4==YI-'t{{;L%"6BD;;K')-M $#$58H$H! ' M2 14D Df]BS$Tfff+)2+
 	
8<+
 	
r$   )rU  rX  r   rY  )NNNNNNN)re   rf   rg   rh   rK   r[  rj  ro  r   r   
LongTensorr   r8   r  rL  rI  r`   ri   rj   rk   s   @r"   rR  rR    s    
)  .215%)!%,0#'.2<
##d*<
 ''$.<
 dl	<

 $;<
 #Tk<
 D[<
 t+<
 
	<
 <
r$   rR  z
    The MAMBA2 Model transformer with a language modeling head on top (linear layer with weights not tied to the input
    embeddings).
    c                   ~  ^  \ rS rSrSS0rU 4S jrS rS r     SS\S-  S	\	R                  S-  S
\S-  4U 4S jjjr\         SS\	R                  S-  S\	R                  S-  S\S-  S\	R                  S-  S\S-  S\S-  S\S-  S	\	R                  S-  S\\	R                  -  S\\-  4S jj5       rSrU =r$ )Mamba2ForCausalLMi:  zlm_head.weightzbackbone.embeddings.weightc                    > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  UR                  SS9U l        U R                  5         g )NFry   )
rJ   rK   rR  r/  r   r   rQ   rT  lm_headr\  )rP   ro   rS   s     r"   rK   Mamba2ForCausalLM.__init__C  sF     #F+yy!3!3V5F5FUSr$   c                 6    U R                   R                  5       $ rH   )r/  rj  ri  s    r"   rj  &Mamba2ForCausalLM.get_input_embeddingsJ  s    }}1133r$   c                 8    U R                   R                  U5      $ rH   )r/  ro  rm  s     r"   ro  &Mamba2ForCausalLM.set_input_embeddingsM  s    }}11.AAr$   Nr   rB   is_first_iterationc           	      \   > [         T	U ]  " U4UUUUUS.UD6nU(       a  U(       d  S US'   U$ )N)rr  rs  r   rB   r  rB   )rJ   prepare_inputs_for_generation)
rP   rq  rr  rs  r   rB   r  r  model_inputsrS   s
            r"   r  /Mamba2ForCausalLM.prepare_inputs_for_generationP  sN     w<
'%)1
 
 /-1L)*r$   rq  rr  labelsrt  ru  rs  logits_to_keeprv  c
           
      2   Ub  UOU R                   R                  nU R                  UUUUUUUS9nUS   n[        U	[        5      (       a  [        U	* S5      OU	nU R                  USS2USS24   R                  U R                  R                  R                  5      5      R                  5       nSnUb)  U R                  " SXU R                   R                  S.U
D6nU(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  S9$ )a  
cache_params (`Cache`, *optional*):
    If passed along, the model uses the previous state in all the blocks (which will give the output for the
    `input_ids` provided as if the model add `state_input_ids + input_ids` as context).
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
    `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
    are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
use_cache (`bool`, *optional*):
    If set to `True`, the `cache_params` is returned and can be used to quickly generate the next logits.
N)r   rr  rt  ru  rs  rB   r   )rP  r  rT  r   )rO  rP  r   rA   rA  )ro   ru  r/  r5  r   slicer  r@   rM   r/   r   loss_functionrT  rN  r   rA   )rP   rq  rr  r   r  rt  ru  rs  rB   r  r  mamba2_outputsrA   slice_indicesrP  rO  outputs                    r"   r`   Mamba2ForCausalLM.forwardi  s,   2 &1%<k$++BYBY%'!5#) ' 
 'q)8B>SV8W8W~ot4]kmA}a,?@CCDLLDWDWD]D]^_eeg%%pVt{{OeOepiopDY!33F)-)9TGf$EvE#'44(66	
 	
r$   )r/  r  )NNNNF)	NNNNNNNNr   )re   rf   rg   rh   _tied_weights_keysrK   rj  ro  r   r   r  r8   r  r   r  rK  r   rL  rN  r`   ri   rj   rk   s   @r"   r  r  :  sG    +,HI4B %).2*/
 dl t+ !4K 2  .226%)*.,0#'!%.2-.6
##d*6
 ((4/6
 dl	6

   4'6
 #Tk6
 D[6
 $;6
 t+6
 ell*6
 
%	%6
 6
r$   r  )r  rR  r.  )3r  r   dataclassesr   r   r    r   r   activationsr   cache_utilsr   r	   
generationr
   integrationsr   modeling_layersr   modeling_utilsr   utilsr   r   r   r   utils.import_utilsr   configuration_mamba2r   
get_loggerre   r   r  r   r#   r*   r>   rC   ModulerE   rm   r  r$  r.  rI  rN  rR  r  __all__rA  r$   r"   <module>r     s     !   & ! . ) , 9 - S S 9 . 
		H	%VU\\ VS V
((	; ;$\O")) \O~;BII ;", 4 *1O *1 *1Z :; : : :; : :& V
' V
 V
r `
- `
`
F Hr$   