
    Z jN                        S r SSKrSSKJr  SSKrSSKJr  SSKJr  SSKJ	r
  SSKJr  SS	KJrJr  SS
KJr  SSKJr  SSKJr  SSKJr  SSKJrJrJr  SSKJrJrJrJ r   SSK!J"r"  \RF                  " \$5      r%\" S5      (       a  SSK&J'r'  OSr'\" 5       (       a  SSK(J)r)  OSr) " S S\RT                  5      r+ " S S\RT                  5      r, " S S\5      r-\ " S S\5      5       r.\" SS9\ " S S \5      5       5       r/\" S!S9\ " S" S#\5      5       5       r0\ " S$ S%\.5      5       r1\" S&S9 " S' S(\.\5      5       r2/ S)Qr3g)*zPyTorch MAMBA model.    N)	dataclass)nn)CrossEntropyLoss   )initialization)ACT2FN)CacheDynamicCache)GenerationMixin)lazy_load_kernel)GradientCheckpointingLayer)PreTrainedModel)ModelOutputauto_docstringlogging)is_mambapy_availableis_torch_greater_or_equal
is_tracingresolve_internal_import   )MambaConfigz2.9.0)associative_scan)pscanc                   6  ^  \ rS rSrSrSS\S\S\4U 4S jjjr\	R                  " 5       S 5       rS r  SS
\	R                  S\S	-  S\	R                  S	-  4S jjrSS\S	-  S\	R                  S	-  4S jjr  SS\S	-  S\	R                  S	-  4S jjrSrU =r$ )
MambaMixer:   uo  
Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`.
A, D are input independent (see Mamba paper [1] Section 3.5.2 "Interpretation of A" for why A isn't selective)
∆, B, C are input-dependent (this is a key difference between Mamba and the linear time invariant S4,
and is why Mamba is called **selective** state spaces)
config	layer_idxinitialize_mixer_weightsc           	        > [         TU ]  5         Xl        UR                  U l        UR                  U l        UR                  U l        UR                  U l        [        UR                  5      U l
        X l        UR                  U l        [        R                  " U R                  U R                  UR                  UR                  U R                  UR                  S-
  S9U l        UR                   U l        [$        UR                      U l        UR(                  U l        UR*                  U l        [        R,                  " U R                  U R                  S-  UR.                  S9U l        [        R,                  " U R                  U R                  U R
                  S-  -   SS9U l        [        R,                  " U R                  U R                  SS9U l        [        R6                  " [8        R:                  " U R                  U R
                  5      5      U l        [        R6                  " [8        R:                  " U R                  5      5      U l        U(       a>  U R4                  R@                  RB                  RD                  S:w  a  U RG                  5         [        R,                  " U R                  U R                  UR.                  S9U l$        UR.                  U l        [K        S5      q&[O        [L        S	S 5      q([O        [L        S
S 5      q)[K        S5      q*[W        [T        SS9q,[O        [T        SS 5      q-[O        [T        SS 5      q.U R_                  5         g )Nr   )in_channelsout_channelsbiaskernel_sizegroupspadding   r#   FTmetazcausal-conv1dcausal_conv1d_updatecausal_conv1d_fnz	mamba-ssmz8ops.triton.selective_state_update.selective_state_update)chained_pathselective_scan_fnmamba_inner_fn)0super__init__r   hidden_size
state_sizessm_state_sizeconv_kernelconv_kernel_sizeintermediate_sizeinttime_step_rankr   use_conv_biasr   Conv1dconv1d
hidden_act
activationr   actuse_mambapyuse_associative_scanLinearuse_biasin_projx_projdt_proj	ParametertorchemptyA_logDweightdevicetypeinit_mamba_weightsout_projr   causal_conv1dgetattrr*   r+   	mamba_ssmr   selective_state_updater-   r.   warn_slow_implementation)selfr   r   r   	__class__s       y/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/mamba/modeling_mamba.pyr0   MambaMixer.__init__B   s   !--$// & 2 2!'!9!9!&"7"78"#11ii..//%%**))&&*
 !++&++,!--$*$?$?! yy!1!143I3IA3MTZTcTcdii 6 68K8KdNaNadeNe8elqryy!4!4d6L6LSWX \\%++d.D.DdFYFY"Z[
ekk$*@*@AB#(;(;(B(B(G(G6(Q##%		$"8"8$:J:JQWQ`Q`a )9&}6LdS"=2DdK %[1	!8$^"
 $I/BDI ,<dC%%'    c                    [         R                  " SU R                  S-   [         R                  U R                  R
                  S9S S S 24   nUR                  U R                  S5      R                  5       n[        R                  " U R                  [         R                  " U5      5        [        R                  " U R                  5        U R                  R                  S-  U R                  R                   -  nU R                  R"                  S:X  a,  [        R$                  " U R&                  R(                  U5        OGU R                  R"                  S:X  a-  [        R*                  " U R&                  R(                  U* U5        [         R,                  " [         R.                  " U R                  U R&                  R0                  R
                  [         R                  S9[2        R                  " U R                  R4                  5      [2        R                  " U R                  R6                  5      -
  -  [2        R                  " U R                  R6                  5      -   5      R9                  U R                  R:                  S9nU[         R                  " [         R<                  " U* 5      * 5      -   n[        R                  " U R&                  R0                  U5        g )	Nr   )dtyperL   g      constantrandomrL   r[   )min)rG   aranger3   float32rI   rL   expandr6   
contiguousinitcopy_logones_rJ   r   r8   time_step_scaletime_step_init_scheme	constant_rE   rK   uniform_exprandr#   mathtime_step_maxtime_step_minclamptime_step_floorexpm1)rU   Adt_init_stddtinv_dts        rW   rN   MambaMixer.init_mamba_weightsz   s   LLD//!35==QUQ[Q[QbQbcdhjkdklHHT++R0;;=

4::uyy|,

466kk00$69T9TT;;,,
:NN4<<..<[[..(:MM$,,--|[IYYJJt--dll6G6G6N6NV[VcVcdxx112TXXdkk>W>W5XXZhht{{0012
 %DKK//%
0	 	 eiibS!1 122

4<<$$f-rY   c                    [        [        [        [        [        [
        45      nU(       dW  U R                  (       a0  [        5       (       a  [        R                  S5        g [        S5      e[        R                  S5        g g )Na  The fast path is not available because one of `(selective_state_update, selective_scan_fn, causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn)` is None. Falling back to the mamba.py backend. To install follow https://github.com/state-spaces/mamba/#installation for mamba-ssm and install the kernels library using `pip install kernels` or https://github.com/Dao-AILab/causal-conv1d for causal-conv1dzuse_mambapy is set to True but the mambapy package is not installed. To install it follow https://github.com/alxndrTL/mamba.py.a  The fast path is not available because one of `(selective_state_update, selective_scan_fn, causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn)` is None. Falling back to the sequential implementation of Mamba, as use_mambapy is set to False. To install follow https://github.com/state-spaces/mamba/#installation for mamba-ssm and install the kernels library using `pip install kernels` or https://github.com/Dao-AILab/causal-conv1d for causal-conv1d. For the mamba.py backend, follow https://github.com/alxndrTL/mamba.py.)allrS   r-   r+   r*   r.   r?   r   loggerwarning_onceImportError)rU   is_fast_path_availables     rW   rT   #MambaMixer.warn_slow_implementation   sz    !$#%68HJ^`no"
 &'))''S & Z  ##W &rY   Nhidden_statescache_paramsattention_maskc                 4
   U R                  U5      R                  SS5      nU R                  (       Ga.  UGc*  [        UU R                  R
                  U R                  (       a  U R                  R                  OS U R                  R
                  U R                  R
                  U R                  R
                  U R                  (       a$  U R                  R                  R                  5       OS [        R                  " U R                  R                  5       5      * S S U R                   R                  5       U R                  R                  R                  5       SS9nU$ UR#                  SSS9u  pUb  XR%                  S5      -  nUS L=(       a    UR'                  U R(                  5      nU R                  R
                  R+                  U R                  R
                  R-                  S5      U R                  R
                  R-                  S5      5      nU(       ao  [/        UR1                  S5      UR2                  U R(                     R4                  UU R                  R                  U R6                  5      nUR%                  S5      nOUbV  [8        R:                  R=                  XR>                  UR@                  S   -
  S45      n	URC                  XR(                  5        [E        XU R                  R                  U R6                  S9nUb  XR%                  S5      -  nU R                  UR                  SS5      5      n
[        RF                  " XRH                  U RJ                  U RJ                  /SS9u  pnU R                  R
                  UR                  SS5      -  n[        R                  " U R                  R                  5       5      * n[M        U R                  S	5      (       a$  U R                  R                  R                  5       OS nU(       ad  [O        UR2                  U R(                     RP                  US
   US
   UUS S 2S4   US S 2S4   U R                   US
   USS9
R%                  S5      nOo[S        UUUUR                  SS5      UR                  SS5      U R                   R                  5       UUSSS9
u  nnUb  Ub  URU                  UU R(                  5        U R                  UR                  SS5      5      nU$ )Nr   r'   T)
delta_biasdelta_softplusdimr   r\   )r=   r#   ).r   )dt_softplus)r   return_last_state)+rC   	transposetrainingr.   r;   rK   r9   r#   rD   rE   rO   rB   floatrG   rm   rI   rJ   chunk	unsqueezehas_previous_stater   viewsizer*   squeezelayersconv_statesr=   r   
functionalpadr5   shapeupdate_conv_stater+   splitr8   r3   hasattrrS   recurrent_statesr-   update_recurrent_state)rU   r   r   r   projected_statescontextualized_statesgateis_decodingconv_weightsr   ssm_parameters	time_stepBCdiscrete_time_stepru   time_proj_biasscan_outputs	ssm_states                      rW   cuda_kernels_forwardMambaMixer.cuda_kernels_forward   s8     <<6@@AF===\1$2 ""$($6$6  D""##$$.2mm""((*4::++-..<<,,224#%!t %$S #3"8"8"8"BM) -0H0H0K K&d2f|7V7VW[WeWe7fK  ;;--224;;3E3E3J3J13Mt{{OaOaOfOfghOijL 4!))"- ''7CC KK$$OO! !. 7 7 ;+"$--"3"3%(=(=@S@STV@W(WYZ'[#K !22;O 0!1A1Adoo! ) -0H0H0K K "[[)@)@A)FGN#kk!4!4d6I6I4K^K^ _egOI! "&!4!4y7J7J1a7P!P4::++-..A:A$,,PV:W:WT\\..446]aN5 ''7HH!&)&v.adGadGFFL" $  )B-  +<!&KK1%KK1%FFLLN"#'&*+'i (\-E 77	4>>R %)MM,2H2HA2N$O!$$rY   c           	         UR                   u  pEnUR                  nU R                  U5      R                  SS5      nUR	                  SSS9u  pUb  XR                  S5      -  n	UbR  UR                  U R                  5      (       a2  UR                  U R                     R                  R                  5       nO6[        R                  " X@R                  U R                  4U	R                  US9nUGbw  UR                  U R                  5      (       d  [         R"                  R%                  U	U R&                  U	R                   S   -
  S45      nUR)                  XR                  5        U R+                  U R-                  U	5      SS U24   5      n	OUR)                  XR                  5      nUR/                  U R,                  R0                  R                  5      n[        R2                  " XR,                  R0                  S S 2SS S 24   -  SS9n	U R4                  (       a  XR,                  R6                  -  n	U R+                  U	5      R/                  U5      R                  S5      n	O'U R+                  U R-                  U	5      SS U24   5      n	Ub  XR                  S5      -  n	U R9                  U	R                  SS5      5      n[        R:                  " XR<                  U R                  U R                  /SS9u  pnU R?                  U5      n[         R"                  RA                  U5      R                  SS5      n[        RB                  " U RD                  RG                  5       5      * n[        RB                  " US S S 2S S S 24   US S 2S S 2S S 2S 4   -  5      nUS S 2S S 2S S 2S 4   US S 2S S S 2S S 24   RG                  5       -  nUU	S S 2S S 2S S 2S 4   RG                  5       -  nU RH                  (       a  U RJ                  (       a  Uc  [M        UR                  SS5      UR                  SS5      5      nUUR                  S5      -  RO                  S5      R                  SS5      nUXRP                  S S S 2S 4   -  -   nUU R+                  U
5      -  nGOU RR                  (       a  [T        b  [W        U	5      (       a  Uc  S	 nUR                  RX                  S
;   a  SOSn[U        UUU4SUS9u  nn[        RZ                  " UR]                  SSSS5      R/                  U5      UR                  S5      5      RO                  S5      R]                  SSS5      nUS S 2S S 2SS S 24   nO/ n[_        U5       H  nUS S 2S S 2US S 24   U-  US S 2S S 2US S 24   -   n[        RZ                  " UR/                  U5      US S 2US S 24   R                  S5      5      nURa                  US S 2S S 2S4   5        M     [        Rb                  " USS9nUXRP                  S S S 2S 4   -  -   nUU R+                  U
5      -  nUb  URe                  XR                  5        U Rg                  UR                  SS5      5      nU$ )Nr   r'   r   r_   r\   r   .r   c                 (    U u  p#Uu  pEX$-  XC-  U-   4$ N )leftrighta_leftb_lefta_rightb_rights         rW   
combine_fn+MambaMixer.slow_forward.<locals>.combine_fnP  s'    %)NF',$G",g.>.HIIrY   )cudaxpu	pointwisegeneric)r   combine_mode)4r   r[   rC   r   r   r   r   r   r   r   clonerG   zerosr6   r3   rL   r   r   r   r5   r   r>   r;   torK   sumr9   r#   rD   r   r8   rE   softplusrm   rI   r   r?   r   r   r   rJ   r@   r   r   rM   matmulpermuterangeappendstackr   rO   )rU   input_statesr   r   
batch_sizeseq_len_r[   r   r   r   r   
conv_stater   r   r   r   r   ru   
discrete_A
discrete_BdeltaB_uhsscan_outputr   r   all_hr   ir   s                                 rW   slow_forwardMambaMixer.slow_forward  s   !-!3!3
Q""<<5??1E.44QA4>%),D,DQ,GGM#(G(G(W(W$++DNN;LLRRTI33T5H5HI$++5I #224>>BB]]..!**]-@-@-DDaH

 ..z>>J $])CC'M)R S);;M>>Z
']]4;;+=+=+D+DE
 %		*{{7I7I!QPQ'7R*RXZ [%%![[%5%55M $ 7 : :5 A K KB O HHT[[%?XgX%NOM%),D,DQ,GGM ]%<%<Q%BC++00$2E2EtGZGZ[ac
	a "\\)4]]334FGQQRSUVW YYtzz'')**YYqq$!125G1aQU5VVW
'1a61dAq=9I9O9O9QQ
aAtm < B B DD ,2Fz++Aq183E3Ea3KLBB/88;EEaKK%tQ}8M(MMK%6K ((-=-IjYfNgNglx  mAJ
 /9.?.?.D.D.W{]f+JX8NTUdpq5#ll5==Aq!+D+G+G+NPQP[P[\^P_`hhiklttuvxy{|}!!QA+.	  "wA *1aA: 6 BXaQRTUWXjEY YI"',,y||E/BAaAgJDXDXY[D\"]K ''Aq!G(<= ( $kk,B?%a9N)NOK&$7K'33I~~N !%k.C.CAq.I J$$rY   c                    [        [        [        [        [        [
        45      nU(       aP  SU R                  R                  R                  R                  ;   a"  [        U5      (       d  U R                  XU5      $ U R                  XU5      $ )Nr   )r{   rS   r-   r+   r*   r.   rD   rK   rL   rM   r   r   r   )rU   r   r   r   kwargsr   s         rW   forwardMambaMixer.forwardn  su     "%#%68HJ^`no"
 "f0B0B0I0I0N0N&NWaboWpWp,,].YY  nMMrY   )rI   rJ   r>   r=   r   r;   r5   rE   r1   rC   r6   r   rO   r3   r8   r@   rB   r9   r?   rD   )TNN)__name__
__module____qualname____firstlineno____doc__r   r7   boolr0   rG   no_gradrN   rT   Tensorr	   
LongTensorr   r   r   __static_attributes____classcell__rV   s   @rW   r   r   :   s    6({ 6(s 6(VZ 6( 6(p ]]_. .*4 &*26	d%||d% dld% ((4/	d%N]%ut| ]%Z_ZjZjmqZq ]%F &*26	N dlN ((4/	N NrY   r   c                   8   ^  \ rS rSrSU 4S jjrS rS rSrU =r$ )MambaRMSNormi}  c                    > [         TU ]  5         [        R                  " [        R
                  " U5      5      U l        X l        g)z<
MambaRMSNorm is equivalent to T5LayerNorm and LlamaRMSNorm
N)r/   r0   r   rF   rG   onesrK   variance_epsilon)rU   r1   epsrV   s      rW   r0   MambaRMSNorm.__init__~  s/     	ll5::k#:; #rY   c                    UR                   nUR                  [        R                  5      nUR	                  S5      R                  SSS9nU[        R                  " X0R                  -   5      -  nU R                  UR                  U5      -  $ )Nr'   r\   T)keepdim)	r[   r   rG   rb   powmeanrsqrtr   rK   )rU   r   input_dtypevariances       rW   r   MambaRMSNorm.forward  sw    #))%((7 $$Q',,R,>%H?T?T4T(UU{{]--k:::rY   c                 R    U R                   R                  S    SU R                   3$ )Nr   z, eps=)rK   r   r   rU   s    rW   
extra_reprMambaRMSNorm.extra_repr  s*    ++##A&'vd.C.C-DEErY   )r   rK   )gư>)	r   r   r   r   r0   r   r   r   r   r   s   @rW   r   r   }  s    $;F FrY   r   c                   b   ^  \ rS rSrU 4S jr  SS\S-  S\R                  S-  4S jjrSr	U =r
$ )	
MambaBlocki  c                    > [         TU ]  5         Xl        X l        UR                  U l        [        UR                  UR                  S9U l        [        XSS9U l
        g )Nr   F)r   r   )r/   r0   r   r   residual_in_fp32r   r1   layer_norm_epsilonnormr   mixer)rU   r   r   rV   s      rW   r0   MambaBlock.__init__  sO    " & 7 7 !3!39R9RS	V[\
rY   Nr   r   c                 
   UnU R                  UR                  U R                   R                  R                  S95      nU R                  (       a  UR                  [
        R                  5      nU R                  XUS9nXQ-   nU$ )N)r[   r   r   )r   r   rK   r[   r   rG   rb   r  )rU   r   r   r   r   residuals         rW   r   MambaBlock.forward  sq     !		-"2"29I9I9O9O"2"PQ  {{5==1H

=\j
k 0rY   )r   r   r  r   r   r   )r   r   r   r   r0   r	   rG   r   r   r   r   r   s   @rW   r   r     s>    ] &*26	 dl ((4/	 rY   r   c                   b    \ rS rSr% \\S'   SrSS/rSrSr	\
R                  " 5       S 5       rSrg	)
MambaPreTrainedModeli  r   backboner   r   Tc                 r   U R                   R                  n[        U[        5      (       Ga)  UR	                  5         [
        R                  " UR                  R                  [        R                  " S5      S9  UR                  R                  b*  [
        R                  " UR                  R                  5        [
        R                  " UR                  R                  [        R                  " S5      S9  U R                   R                  (       aC  UR                  R                  nU[        R                  " U R                   R                  5      -  n[        U[         R"                  5      (       aN  [
        R$                  " UR                  US9  UR                  b!  [
        R                  " UR                  5        gg[        U[&        5      (       a!  [
        R(                  " UR                  5        g[        U[         R*                  5      (       a   [
        R$                  " UR                  US9  gg)zInitialize the weights.   )aN)std)r   initializer_range
isinstancer   rN   re   kaiming_uniform_r;   rK   ro   sqrtr#   zeros_rO   rescale_prenorm_residualnum_hidden_layersr   rA   normal_r   rh   	Embedding)rU   moduler  ps       rW   _init_weights"MambaPreTrainedModel._init_weights  sZ    kk++fj)) %%'!!&--"6"6$))A,G}}!!-FMM../!!&//"8"8DIIaLI{{33 OO**TYYt{{<<==fbii((LLC0{{&FKK( '--JJv}}%--LLC0 .rY   r   N)r   r   r   r   r   __annotations__base_model_prefix_no_split_modulessupports_gradient_checkpointing_is_statefulrG   r   r  r   r   rY   rW   r  r    s<    "%|4&*#L
]]_"1 "1rY   r  z,
    Class for the MAMBA model outputs.
    )custom_introc                       \ rS rSr% SrSr\R                  S-  \S'   Sr	\
S-  \S'   Sr\\R                     S-  \S'   Srg)MambaOutputi  a   
cache_params (`Cache`):
    The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
    avoid providing the old `input_ids`.

    Includes both the State space model state matrices after the selective scan, and the Convolutional states
Nlast_hidden_stater   r   r   )r   r   r   r   r   r#  rG   FloatTensorr  r   r	   r   tupler   r   rY   rW   r"  r"    sG     37u((4/6!%L%$,%59M5**+d29rY   r"  zK
    Base class for causal language model (or autoregressive) outputs.
    c                       \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Sr
\S-  \S'   Sr\\R                     S-  \S'   Srg)	MambaCausalLMOutputi  au  
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
    Language modeling loss (for next-token prediction).
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
    Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
cache_params (`Cache`):
    The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
    avoid providing the old `input_ids`.

    Includes both the State space model state matrices after the selective scan, and the Convolutional states
Nlosslogitsr   r   r   )r   r   r   r   r   r(  rG   r$  r  r)  r   r	   r   r%  r   r   rY   rW   r'  r'    s[    
 &*D%

d
")'+FE$+!%L%$,%59M5**+d29rY   r'  c                      ^  \ rS rSrU 4S jrS rS rS r\       SS\	R                  S-  S\	R                  S-  S	\S-  S
\S-  S\S-  S\S-  S\	R                  S-  S\\-  4S jj5       rSrU =r$ )
MambaModeli  c           
        > [         TU ]  U5        [        R                  " UR                  UR
                  5      U l        [        R                  " [        UR                  5       Vs/ s H  n[        XS9PM     sn5      U l        SU l        [        UR
                  UR                  S9U l        U R!                  U R"                  5        U R%                  5         g s  snf )N)r   Fr   )r/   r0   r   r  
vocab_sizer1   
embeddings
ModuleListr   r  r   r   gradient_checkpointingr   r   norm_f"_register_load_state_dict_pre_hook	load_hook	post_init)rU   r   idxrV   s      rW   r0   MambaModel.__init__  s     ,,v'8'8&:L:LMmmRWX^XpXpRq$rRq3Z%FRq$rs&+#"6#5#56;T;TU//? %ss   (Cc                 l    U H.  nSU;   d  M  UR                  U5      XR                  SS5      '     g    g )Nz
embedding.zembeddings.)popreplace)rU   
state_dictprefixargsks        rW   r3  MambaModel.load_hook  s4    Aq EO^^TUEV
99\=AB rY   c                     U R                   $ r   r.  r   s    rW   get_input_embeddingsMambaModel.get_input_embeddings  s    rY   c                     Xl         g r   r@  rU   new_embeddingss     rW   set_input_embeddingsMambaModel.set_input_embeddings  s    (rY   N	input_idsinputs_embedsr   	use_cacheoutput_hidden_statesreturn_dictr   returnc                    Ub  UOU R                   R                  nUb  UO(U R                  (       d  U R                   R                  OSnUb  UOU R                   R                  nUSL USL-  (       a  [        S5      eUc  U R                  U5      nU R                  (       a  U R                  (       a	  U(       a  SnU(       a  Uc  [        U R                   S9nUn	U(       a  SOSn
U R                   H  nU" U	UUS9n	U(       d  M  X4-   n
M     U R                  U	5      n	U(       a  X4-   n
U(       d  [        S XU
4 5       5      $ [        U	U(       a  UU
S9$ SU
S9$ )	at  
cache_params (`Cache`, *optional*):
    If passed along, the model uses the previous state in all the blocks (which will give the output for the
    `input_ids` provided as if the model add `state_input_ids + input_ids` as context).
use_cache (`bool`, *optional*):
    If set to `True`, the `cache_params` is returned and can be used to quickly generate the next logits.
NFz:You must specify exactly one of input_ids or inputs_embeds)r   r   r  c              3   .   #    U  H  oc  M  Uv   M     g 7fr   r   ).0vs     rW   	<genexpr>%MambaModel.forward.<locals>.<genexpr>W  s     f$Tq$Ts   	)r#  r   r   )r   rK  r   rJ  rL  
ValueErrorr.  r0  r
   r   r1  r%  r"  )rU   rH  rI  r   rJ  rK  rL  r   r   r   all_hidden_statesmixer_blocks               rW   r   MambaModel.forward   s[   ( %9$D $++JjJj 	 "+!6IZ^ZgZgT[[=R=Rmr	%0%<k$++BYBY-t";<YZZ  OOI6M&&4==YI-'t{{;L%"6BD;;K')-M $#$58H$H! ' M2 14D Df]BS$Tfff+)2+
 	
8<+
 	
rY   )r.  r0  r   r1  )NNNNNNN)r   r   r   r   r0   r3  rA  rF  r   rG   r   r	   r   r%  r"  r   r   r   r   s   @rW   r+  r+    s    
)  .215%)!%,0#'26<
##d*<
 ''$.<
 dl	<

 $;<
 #Tk<
 D[<
 ((4/<
 
	<
 <
rY   r+  z
    The MAMBA Model transformer with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    c                   ~  ^  \ rS rSrSS0rU 4S jrS rS r     SS\S-  S	\	R                  S-  S
\S-  4U 4S jjjr\         SS\	R                  S-  S	\	R                  S-  S\	R                  S-  S\S-  S\	R                  S-  S\S-  S\S-  S\S-  S\\	R                   -  S\\-  4S jj5       rSrU =r$ )MambaForCausalLMi`  zlm_head.weightzbackbone.embeddings.weightc                    > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  UR                  SS9U l        U R                  5         g )NFr(   )
r/   r0   r+  r	  r   rA   r1   r-  lm_headr4  )rU   r   rV   s     rW   r0   MambaForCausalLM.__init__i  sF     "6*yy!3!3V5F5FUSrY   c                 6    U R                   R                  5       $ r   )r	  rA  r   s    rW   rA  %MambaForCausalLM.get_input_embeddingsp  s    }}1133rY   c                 8    U R                   R                  U5      $ r   )r	  rF  rD  s     rW   rF  %MambaForCausalLM.set_input_embeddingss  s    }}11.AArY   Nr   r   is_first_iterationc           	      \   > [         T	U ]  " U4UUUUUS.UD6nU(       a  U(       d  S US'   U$ )N)rI  rJ  r   r   ra  r   )r/   prepare_inputs_for_generation)
rU   rH  rI  rJ  r   r   ra  r   model_inputsrV   s
            rW   rc  .MambaForCausalLM.prepare_inputs_for_generationv  sN     w<
'%)1
 
 /-1L)*rY   rH  rI  labelsrK  rL  rJ  logits_to_keeprM  c
           
         Ub  UOU R                   R                  nU R                  UUUUUUUS9nUS   n[        U	[        5      (       a  [        U	* S5      OU	nU R                  USS2USS24   R                  U R                  R                  R                  5      5      R                  5       nSnUb  UR                  UR                  5      nUSSS2SS24   R                  5       nUSSS24   R                  5       n[        5       nU" UR                  SUR                  S5      5      UR                  S5      5      nU(       d  U4USS -   nUb  U4U-   $ U$ [!        UUUR"                  UR$                  S9$ )a  
cache_params (`Cache`, *optional*):
    If passed along, the model uses the previous state in all the blocks (which will give the output for the
    `input_ids` provided as if the model add `state_input_ids + input_ids` as context).
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
    `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
    are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
use_cache (`bool`, *optional*):
    If set to `True`, the `cache_params` is returned and can be used to quickly generate the next logits.
N)r   rI  rK  rL  rJ  r   r   .r\   r   )r(  r)  r   r   )r   rL  r	  r  r7   slicer[  r   rK   r[   r   rL   rd   r   r   r   r'  r   r   )rU   rH  r   rI  r   rf  rK  rL  rJ  rg  r   mamba_outputsr   slice_indicesr)  r(  shift_logitsshift_labelsloss_fctoutputs                       rW   r   MambaForCausalLM.forward  s   2 &1%<k$++BYBY%'!5#) & 
 &a(8B>SV8W8W~ot4]kmA}a,?@CCDLLDWDWD]D]^_eegYYv}}-F!#ssA+.99;L!#qr'?557L')HL--b,2C2CB2GH,J[J[\^J_`DYqr!22F)-)9TGf$EvE"&33'55	
 	
rY   )r	  r[  )NNNNF)	NNNNNNNNr   )r   r   r   r   _tied_weights_keysr0   rA  rF  r	   rG   r   r   rc  r   r$  r7   r   r%  r'  r   r   r   r   s   @rW   rY  rY  `  sK    +,HI4B %)26*/
 dl ((4/ !4K 2  .22626%)*.,0#'!%-.=
##d*=
 ((4/=
 ((4/	=

 dl=
   4'=
 #Tk=
 D[=
 $;=
 ell*=
 
$	$=
 =
rY   rY  )rY  r+  r  )4r   ro   dataclassesr   rG   r   torch.nnr    r   re   activationsr   cache_utilsr	   r
   
generationr   integrationsr   modeling_layersr   modeling_utilsr   utilsr   r   r   utils.import_utilsr   r   r   r   configuration_mambar   
get_loggerr   r|   (torch._higher_order_ops.associative_scanr   mambapy.pscanr   Moduler   r   r   r  r"  r'  r+  rY  __all__r   rY   rW   <module>r     s     !   % & ! . ) , 9 - 
  - 
		H	%W%%I #E@N @NF
F299 F(+ 4 *1? *1 *1Z 
 :+ : : 
 :+ : :& V
% V
 V
r g
+_ g
g
T ErY   