
    Z j                     n   S r SSKJr  SSKrSSKJr  SSKJr  SSKJ	r	  SSK
JrJr  SS	KJr  SS
KJr  SSKJrJr  SSKJr  SSKJr  SSKJrJrJrJr  SSKJr  SSKJ r   SSK!J"r"  SSK#J$r$  SSK%J&r&J'r'J(r(J)r)J*r*J+r+  SSK,J-r-J.r.J/r/J0r0J1r1  SSK2J3r3  \Rh                  " \55      r6 " S S\SS9r7 " S S\*5      r8S2S jr9 " S S \&5      r: " S! S"\-5      r; " S# S$\Rx                  5      r= " S% S&\(5      r> " S' S(\)5      r? " S) S*\$5      r@\ " S+ S,\5      5       rA\ " S- S.\A5      5       rB " S/ S0\'5      rC/ S1QrDg)3zPyTorch Bamba model.    )	TypedDictN)nn   )initialization)ACT2FN)CacheDynamicCache)lazy_load_kernel)create_causal_mask)BaseModelOutputWithPastCausalLMOutputWithPast)PreTrainedModel)Unpack)auto_docstringcan_return_tupleis_torchdynamo_compilinglogging)merge_with_config_defaults)resolve_internal_import)capture_outputs   )JambaAttentionDecoderLayer)LlamaAttentionLlamaForCausalLMLlamaMLPLlamaRMSNormLlamaRotaryEmbeddingrotate_half)MambaRMSNormGatedapply_mask_to_padding_statespad_tensor_by_sizereshape_into_chunkssegment_sum   )BambaConfigc                       \ rS rSr% Sr\R                  \S'   \R                  \S'   \\S'   \\S'   \R                  \S'   Sr
g	)
BambaFlashAttentionKwargs<   a!  
Keyword arguments for advanced Flash Attention, causal-conv1d, and mamba_ssm kernel usage.
Use cases include padding-free training and fewer `torch.compile` graph breaks.

cu_seq_lens_q (`torch.LongTensor`):
    Gets cumulative sequence length for query state.
cu_seq_lens_k (`torch.LongTensor`):
    Gets cumulative sequence length for key state.
max_length_q (`int`):
    Maximum sequence length for query state.
max_length_k (`int`):
    Maximum sequence length for key state.
seq_idx (`torch.IntTensor`):
    Index of each packed sequence.
cu_seq_lens_qcu_seq_lens_kmax_length_qmax_length_kseq_idx N)__name__
__module____qualname____firstlineno____doc__torch
LongTensor__annotations__int	IntTensor__static_attributes__r.       x/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/bamba/modular_bamba.pyr'   r'   <   s7      ######__r:   r'   F)totalc                       \ rS rSrSrg)BambaRotaryEmbeddingT   r.   Nr/   r0   r1   r2   r9   r.   r:   r;   r>   r>   T       r:   r>   c                 R   UR                  U5      nUR                  U5      nUR                  S   nU SSU24   U SUS24   pvUSSU24   USUS24   pXb-  [        U5      U-  -   n
X-  [        U5      U-  -   n[        R                  " X/SS9n
[        R                  " X/SS9nX4$ )a{  Applies Rotary Position Embedding to the query and key tensors.

Removes the interleaving of cos and sin from GLM

Args:
    q (`torch.Tensor`): The query tensor.
    k (`torch.Tensor`): The key tensor.
    cos (`torch.Tensor`): The cosine part of the rotary embedding.
    sin (`torch.Tensor`): The sine part of the rotary embedding.
    unsqueeze_dim (`int`, *optional*, defaults to 1):
        The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
        sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
        that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
        k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
        cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
        the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
Returns:
    `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
.Ndim)	unsqueezeshaper   r4   cat)qkcossinunsqueeze_dim
rotary_dimq_rotq_passk_rotk_passq_embedk_embeds               r;   apply_rotary_pos_embrU   Y   s    ( --
&C
--
&C 2Jc;J;&'3
+;)<6c;J;&'3
+;)<6 {{51C78G{{51C78G ii)r2Gii)r2Gr:   c                       \ rS rSrSrg)BambaAttention   r.   Nr@   r.   r:   r;   rW   rW      rA   r:   rW   c                       \ rS rSrSrg)BambaRMSNormGated   r.   Nr@   r.   r:   r;   rZ   rZ      rA   r:   rZ   c            
       >  ^  \ rS rSrSrS\S\4U 4S jjr   SS\R                  S\
S-  S	\R                  S-  S
\R                  S-  4S jjr  SS\
S-  S	\R                  S-  4S jjr   SS\
S-  S	\R                  S-  S
\R                  S-  4S jjrSrU =r$ )
BambaMixer   u(  
Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`.
A, D are input independent (see Mamba paper [1] Section 3.5.2 "Interpretation of A" for why A isn't selective)
∆, B, C are input-dependent (this is a key difference between Mamba and the linear time invariant S4,
and is why Mamba is called **selective** state spaces)

The are a few differences between this and Mamba2Mixer:
- The variable use_precomputed_states is slightly different due to the hybrid cache structure
- There's a few non-obvious bugs fixed with batching in the slow path that exist in main
- Some extra variables that our layer doesn't need have been removed
- We ported most of the refactors in https://github.com/huggingface/transformers/pull/35154, which is (as of Dec 18, 2024) unmerged
config	layer_idxc           	        > [         TU ]  5         UR                  U l        UR                  U l        UR
                  U l        UR                  U l        [        UR                  U R                  -  5      U l        X l        UR                  U l        UR                  U l        ["        UR                     U l        UR&                  U l        UR*                  U l        UR.                  U l        UR2                  U l        UR6                  U l        UR:                  U l        UR<                  U l        UR>                  U l        U R                  SU R0                  -  U R                  -  -   U l         [B        RD                  " U R@                  U R@                  UR                  U R                  U R@                  U R                  S-
  S9U l#        U R                  U R@                  -   U R                  -   n[B        RH                  " U R                  UU R(                  S9U l%        [B        RL                  " [N        RP                  " U R                  5      5      U l)        [N        RT                  " SU R                  S-   5      n[B        RL                  " [N        RV                  " U5      5      U l,        [[        U R                  U R,                  S9U l.        [B        RL                  " [N        RP                  " U R                  5      5      U l/        [B        RH                  " U R                  U R                  U R(                  S9U l0        [c        S5      n[e        USS 5      q3[e        USS 5      q4[c        S	5      n[k        US
S9q6[k        USS9q7[k        USS9q8[s        [l        [n        [p        [h        [f        45      q:[t        (       d  [v        Ry                  S5        g [v        Ry                  S5        g )Nr   r$   )in_channelsout_channelsbiaskernel_sizegroupspadding)rd   epszcausal-conv1dcausal_conv1d_updatecausal_conv1d_fnz	mamba-ssmz8ops.triton.selective_state_update.selective_state_update)chained_pathz1ops.triton.ssd_combined.mamba_chunk_scan_combinedz8ops.triton.ssd_combined.mamba_split_conv1d_scan_combineda  The fast path is not available because one of `(selective_state_update, causal_conv1d_fn, causal_conv1d_update)` is None. Falling back to the naive implementation. To install follow https://github.com/state-spaces/mamba/#installation and https://github.com/Dao-AILab/causal-conv1dzDThe fast path for Bamba will be used when running the model on a GPU)=super__init__mamba_n_heads	num_headshidden_sizemamba_d_statessm_state_sizemamba_d_convconv_kernel_sizer7   mamba_expandintermediate_sizer`   mamba_conv_biasuse_conv_bias
hidden_act
activationr   actmamba_proj_biasuse_biasrms_norm_epslayer_norm_epsilonmamba_n_groupsn_groupsmamba_d_headhead_dimmamba_chunk_size
chunk_sizetime_step_limittime_step_mintime_step_maxconv_dimr   Conv1dconv1dLinearin_proj	Parameterr4   onesdt_biasarangelogA_logrZ   normDout_projr
   getattrrj   rk   r   selective_state_updatemamba_chunk_scan_combined mamba_split_conv1d_scan_combinedallis_fast_path_availableloggerwarning_once)selfr_   r`   projection_sizeAcausal_conv1d	mamba_ssm	__class__s          r;   rn   BambaMixer.__init__   s   --!--$22 & 3 3!$V%8%84;K;K%K!L"#33 ++&++,.."("5"5--++ 11%55#11#11..T]]1BTEXEX1XXii''--==))A-
 004==@4>>Qyy
 ||EJJt~~$>? LLDNNQ./\\%))A,/
%d&<&<$BYBYZ	ejj89		$"8"8$:J:JQUQ^Q^_ )9&}6LdS"=2DdK %[1	!8$^"
 %<$W%
! ,C$^,
(
 "%&)0 $"
 &%>  fgr:   Nhidden_statescache_paramsattention_maskr-   c                    [        X5      nU R                  U5      nUR                  u  pgnU R                  U R                  -  n	US L=(       a'    UR                  U R                  5      =(       a    US:H  n
U
(       Ga  UR                  S5      R                  U R                  U R                  U R                  /SS9u  pn[        UUR                  U R                     R                  U R                  R                   R                  S5      U R                  R"                  U R$                  5      n[&        R                  " UU R                  X/SS9u  pn[&        R(                  " U R*                  R-                  5       5      * nUS S 2S S4   S S 2S S 2S 4   R/                  SU R0                  U R                  5      R3                  [&        R4                  S9nUS S 2S S 2S 4   R/                  SSU R0                  5      nU R6                  S S 2S S4   R/                  SU R0                  5      nU R8                  S S 2S S4   R/                  SU R0                  5      nUR;                  X`R                  UR                  S   U R                  -  5      nUR;                  X`R                  UR                  S   U R                  -  5      nUR;                  X`R                  U R0                  5      n[=        UR                  U R                     R>                  UUUUUUS USS9
nUR;                  X`R                  U R0                  -  5      nU RA                  X5      nU RC                  U5      S S 2S S4   nU$ [&        R(                  " U R*                  R-                  5       5      * nU RD                  S[-        S	5      4:X  a  0 OS
U RD                  0nU RF                  (       a  Uc  [I        UU R                  R                   R                  S5      U R                  R"                  U R6                  U4U R8                  U RJ                  UU R$                  U R@                  R                   U R@                  RL                  U RB                  R                   U RB                  R"                  U R0                  U R                  SSS.UD6nU$ UR                  U R                  U R                  U R                  /SS9u  pnUbj  URO                  SS5      n[P        RR                  RU                  UU RV                  UR                  S   -
  S45      nURY                  UU R                  5      nU R$                  S;  aH  U R[                  U R                  URO                  SS5      5      SS U24   RO                  SS5      5      nOn[]        URO                  SS5      U R                  R                   R                  S5      U R                  R"                  U R$                  US9RO                  SS5      n[        X5      n[&        R                  " UU R                  X/SS9u  pn[_        UR;                  XgSU R0                  5      UUUR;                  XgU R                  S5      UR;                  XgU R                  S5      4U RJ                  U R8                  S USU R6                  SS.UD6u  nnUb  Ub  URa                  UU R                  5      nUR;                  XgS5      nU RA                  UU5      nU RC                  U5      nU$ )Nr$   rC   rD   .dtypeT)zr   dt_softplusg        infdt_limitF)r   r   r-   r{   rmsnorm_weightrmsnorm_epsoutproj_weightoutproj_biasheaddimngroupsnorm_before_gatereturn_final_statesr   r   )siluswish)xweightrd   r{   r-   )r   r   r   r-   r   r   r   )1r    r   rG   r   rs   has_previous_stater`   squeezesplitrw   r   rp   rj   layersconv_statesr   r   rd   r{   r4   expr   floatexpandr   tofloat32r   r   viewr   recurrent_statesr   r   r   trainingr   r   variance_epsilon	transposer   
functionalpadru   update_conv_stater|   rk   r   update_recurrent_state)r   r   r   r   r-   projected_states
batch_sizeseq_len_groups_time_state_sizeuse_precomputed_statesgatehidden_states_B_CdtBCr   r   r   hidden_states_reshapedoutdt_limit_kwargshidden_states_B_C_transposedr   scan_output	ssm_states                             r;   cuda_kernels_forwardBambaMixer.cuda_kernels_forward   s{    5]S<<6 "/!4!4
Q!%1D1D!D $i)H)H)Xi]dhi]i 	
 "*:*B*B1*E*K*K''GR +L +'DR
 !5!##DNN3??""**1-  ! #(++!'')?X#Ma 4::++-..A!T3,1d
+222t}}dFYFYZ]]didqdq]rAAq$J&&r2t}}=Bll1dC<077DMMJGq$|$++B>Az==!''!*2MNAz==!''!*2MNA%2%7%7
NNTXTaTa%b"2##DNN3DD& M *..z>>DMM;YZM IIm:M --.q$|<C| 
w 4::++-..A$($8$8S%,<O$ObV`bfbvbvUwO }}!56$KK&&..q1KK$$LL ff####'99#3#3 $		 : :#'==#7#7!%!3!3 MM MM%*(-#$ &%l 
A /?.D.D++T]]DNNKQS /E /+  + 4E3N3NqRS3T0"$--"3"34..1M1S1STV1WWYZ[#K #/"@"@dnn"]K??*;;(,$5$?$?1$EFsHWH}U__`acde)% )9+55a;#{{1199!<![[--#'?? ')  i1o & %AAR$c!&+kk%++-C\'#! *C!&&zBNFF:rBFF:rB*  $ff#(, LL $* &*&Y" (\-E , C CIt~~ ^I)..zBG"iiT: mmK0
r:   c                    UR                   u  pEnUR                  n[        X5      nU R                  U5      nUR	                  U R
                  U R                  U R                  /SS9u  pnU
R                  SS5      n
US L=(       a'    UR                  U R                  5      =(       a    US:H  nU(       a  UR                  XR                  5      n[        R                  " XR                  R                  R!                  S5      -  SS9n
U R"                  (       a  XR                  R$                  -   n
U R'                  U
5      n
OUbV  [(        R*                  R-                  XR.                  U
R                   S   -
  S45      nUR                  XR                  5      nU R'                  U R                  U
5      SS U24   R                  SS5      5      n
[        X5      n
[        R                  " U
U R
                  U R0                  U R2                  -  U R0                  U R2                  -  /SS9u  pn[        R4                  " U R6                  R9                  5       5      * nU(       Ga  UR:                  U R                     R<                  R>                  nUS S 2SS S 24   S S 2S S4   nUR                  SS5      RA                  XKR                   S   U RB                  5      nU RD                  S   RA                  U RD                  R                   S   U RB                  5      n[        R(                  R*                  RG                  UURI                  UR                  5      -   5      n[        RJ                  " XRL                  S   U RL                  S   5      nUS   RA                  U R                  U RB                  U R2                  5      RI                  [        RN                  S	9n[        R4                  " US   U-  5      RI                  US
9nURQ                  X@R0                  S5      SS S S 24   nURA                  X@R0                  U R                  U R0                  -  UR                   S   5      RS                  5       nURQ                  USUR                   S   5      nUS   USS S S 24   -  nURQ                  USU RB                  5      nUUS   -  RI                  US
9nUR:                  U R                     R<                  U-  U-   nURU                  UU R                  5      nURQ                  X@R0                  S5      SS S S 24   nURA                  X@R0                  U R                  U R0                  -  UR                   S   5      RS                  5       nURQ                  USUR                   S   5      nURI                  UR>                  UR                  S9nURW                  X@R                  -  U RB                  U R2                  5      nURW                  X@R                  -  U R2                  S5      n[        RX                  " UU5      nURW                  X@R                  U RB                  5      nU RZ                  S   RA                  U RZ                  R                   S   U RB                  5      nUUU-  -   RI                  UR                  5      nURQ                  US5      S S 2S S4   nGO[(        R*                  RG                  XRD                  -   5      n[        RJ                  " XRL                  S   U RL                  S   5      nURQ                  XESU RB                  5      R9                  5       nURQ                  XESU R2                  5      R9                  5       nURQ                  XESU R2                  5      R9                  5       nUR]                  U R                  U R0                  -  SU R                  S9nUR]                  U R                  U R0                  -  SU R                  S9nU R^                  XPR^                  -  -
  U R^                  -  nU RZ                  S   [a        UU5      -  nXS   -  nURI                  UR                  5      U-  nUUUU4 Vs/ s H  n[c        UUU R^                  5      PM     snu  nnnnURe                  SSSS5      n[        Rf                  " USS9n[        R4                  " [i        U5      5      n US S 2S S 2S S 2S S S 2S S 24   US S 2S S 2S S S 2S S 2S S 24   -  n!U!R                  SS9n"U"S   U Re                  SSSSS5      S   -  n#U#R                  SS9n$U$S   US S 2S S 2S 4   -  R                  SS9n%[        R4                  " US S 2S S 2S S 2SS 24   U-
  5      n&UU&Re                  SSSS5      S   -  n'U'SS S S 24   US   -  R                  SS9n([        Rj                  " U(S S 2S S24   5      n)[        Rl                  " U)U(/SS9n([        R4                  " [i        [(        R*                  R-                  US S 2S S 2S S 2S4   S5      5      5      n*U*R                  SS5      n*U*S   U(S S 2S S 2S S4   -  R                  SS9n+U+S S 2S S24   U+S S 2S4   n,n([        R4                  " U5      n-USS S S 24   U(S S 2S S 2S S4   -  n.U-Re                  SSSS5      n/U.R                  S5      U/S   -  n0U%U0-   nURQ                  USU R                  U RB                  5      nUU-   nUS:  a  US S 2S U2S S 2S S 24   nURQ                  XES5      nU,b  Ub  URU                  U,U R                  5      n,U Ro                  UU	5      n1U Rq                  U1RI                  U5      5      n2U2$ s  snf )NrC   rD   r$   r   r   .).N).NNr   device)r   r   )rE   output_sizer      )r$   r   )9rG   r   r    r   r   rw   r   rp   r   r   r`   r   r4   sumr   r   r   ry   rd   r|   r   r   r   ru   r   rs   r   r   r   r   r   r   r   r   r   softplusr   clampr   r   reshape
contiguousr   r   bmmr   repeat_interleaver   r!   r"   permutecumsumr#   
zeros_likerH   r   r   )3r   input_statesr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   cache_devicer   dAdBdBx
ssm_statesssm_states_reshaped
C_reshapedyr   pad_size
D_residualtA_cumsumLG_intermediateGM_intermediateMY_diagdecay_statesB_decaystatesprevious_statesdecay_chunk
new_statesr   state_decay_outC_times_statesstate_decay_out_permutedY_offr   contextualized_statess3                                                      r;   torch_forwardBambaMixer.torch_forward  s
    ".!3!3
Q"" 4LQ<<5&6&<&<''GR '= '
# .77!<!-T!9!~l>]>]^b^l^l>m!~ry}~r~ "&889JNN[K %		kk0088;;! !!$58H8H$H! $): ; ' mm//%(=(=@Q@W@WXZ@[([]^'_ +<<[..Y $5F)GXgX)V)`)`abde)f g89J[#kk##T]]T5H5H%H$--Z^ZmZmJmn
! YYtzz'')**!'..t~~>OOVVL Aq!GQc\*Ba#**:xx|T]]SBll9-44T\\5G5G5JDMMZG$$--b7::bhh3G.GHBR!5!5a!8$:N:Nq:QRB/"))$..$--I\I\]``glgtgt`uA))ByMA-.22,2GB
 		*mmR8dAFA]]DNNdmm4SUVU\U\]_U`allnA		*b!''"+6AI3a<0B *11*b$--PMi0044L4IC &,,T^^<MMPRRUXXJ%<<ZXJ 		*mmR8dAFA]]DNNdmm4SUVU\U\]_U`allnA		*b!''"+6A $ahhaggFJ",//*~~2Mt}}^b^q^q"r
^^ ;T=P=PRSTJ		-z:Az>>4==AA y!((a$--HA]Q&&**1773A 		*b)!T3,7A ''\\(9:BR!5!5a!8$:N:Nq:QRB)11*r4==Y__aM		*r43F3FGMMOA		*r43F3FGMMOA##DNNdmm$CX\XfXf#gA##DNNdmm$CX\XfXf#gA'OO*CCtVH	*-?x-XXJ *yM9M](()B.A cpqrtuwxay%zay\]&9!Xt&Way%z"M1a 		!Q1%A||A2.H 		+a.)A q!Qa23a1dAq!8K6LLN""r"*A y\AIIaAq!,DY,OON""r"*A 	l]1a:%>>CCCJF !99XaArsl%;h%FGL,..q"b!<YGGGc4l+mI.FFKKPQKRF $..va!e}=OYY8a@F))K0A0A(1aQRTV;BWY_0`$abK%//15K%o61dC9PPUUZ[U\J *1crc6 2Jq"u4EIF $ii1OT1oq!T30GGN'6'>'>q!Q'J$#''+.Fy.QQE A		*b$..$--HAJA!|a'1a'(		*r2A $)A(??	4>>Z	ii4(
 !%knnU.C D$$A &{s   .!oc                    [         (       aO  SU R                  R                  R                  R                  ;   a!  [        5       (       d  U R                  XX45      $ Ub  [        S5      eUR                  nUbC  UR                  S   S:  a0  UR                  S   S:  a  XS S 2S S 2S 4   -  R                  U5      nU R                  XU5      $ )Ncudaz\`seq_idx` support requires fast path support. Please install `mamba_ssm` and `causal_conv1d`r$   r   )r   r   r   r   typer   r   NotImplementedErrorr   rG   r   r  )r   r   r   r   r-   kwargsr   s          r;   forwardBambaMixer.forwardQ  s     "!f0C0C0J0J0O0O&OXpXrXr,,].bb%n  ##%.*>*>q*AA*E.J^J^_`JadeJe*Aq$J-GGKKERM!!-~NNr:   )r   r   r|   r{   r   r   r   ru   r   r   rq   r   rw   r`   r   r   r   rp   r   rs   r   r   r   r~   ry   )NNN)NN)r/   r0   r1   r2   r3   r%   r7   rn   r4   Tensorr   r8   r   r  r  r9   __classcell__r   s   @r;   r]   r]      s    Zh{ Zhs Zh~ &*.2*._||_ dl_ t+	_
 4'_J &*.2	z% dlz% t+	z%@ &*.2*.O dlO t+	O
 4'O Or:   r]   c                       \ rS rSrSrg)BambaMLPig  r.   Nr@   r.   r:   r;   r  r  g  rA   r:   r  c                       \ rS rSrSrg)BambaRMSNormik  r.   Nr@   r.   r:   r;   r   r   k  rA   r:   r   c                   \  ^  \ rS rSrSS\S\S\4U 4S jjjr     SS\R                  S\R                  S-  S	\R                  S-  S
\S-  S\S-  S\\R                  \R                  4   S-  S\\   S\\R                   \\R                   \R                   4   S-  4   4S jjrSrU =r$ )BambaDecoderLayerio  r_   r`   
layer_typec                    > [         TU ]  X5        U ?SnUS:X  a  [        OS nU" U5      U l        X0l        US:X  a  [        XS9U l        g US:X  a  [        X5      U l        g [        S5      e)Nr$   mamba)r_   r`   	attentionzInvalid layer_type)
rm   rn   	self_attnr  feed_forwardr#  r]   r%  rW   
ValueError)r   r_   r`   r#  num_expertsffn_layer_classr   s         r;   rn   BambaDecoderLayer.__init__p  sn    +N&1Q&6(D+F3$ #6GDJ;&+F>DN122r:   Nr   r   position_idspast_key_values	use_cacheposition_embeddingsr  returnc           
      0   UnU R                  U5      nU R                  S:X  a  U R                  " SUUUS.UD6nS n	O+U R                  S:X  a  U R                  " SUUUUUUS.UD6u  pX-   nUnU R	                  U5      nU R                  U5      nX-   nUW	4$ )Nr%  )r   r   r   r&  )r   r   r-  r.  r/  r0  r.   )input_layernormr#  r%  r'  pre_ff_layernormr(  )
r   r   r   r-  r.  r/  r0  r  residualself_attn_weightss
             r;   r  BambaDecoderLayer.forward  s     !,,];??g% JJ +,- 	M !%__+/3~~ 0+-) /#$70 0,M !0 --m<))-8 0///r:   )r(  r#  r%  r'  )r%  )NNNFN)r/   r0   r1   r2   r%   r7   strrn   r4   r  r5   r   booltupler   r'   FloatTensorr  r9   r  r  s   @r;   r"  r"  o  s    3{ 3s 3 3 3( /304(,!&HL(0||(0 t+(0 &&-	(0
 (0 $;(0 #5<<#=>E(0 23(0 
u  %(9(95;L;L(L"MPT"TT	U(0 (0r:   r"  c                      ^  \ rS rSr% \\S'   SrSrS/rSr	Sr
SrSr\\S.r\R"                  " 5       U 4S j5       rS	rU =r$ )
BambaPreTrainedModeli  r_   modelTr"  r.  )r   
attentionsc           
      ~  > [         TU ]  U5        [        U[        5      (       a  [        R
                  " UR                  5        [        R                  " UR                  [        R                  " [        R                  " SUR                  S-   5      5      5        [        R
                  " UR                  5        g g )Nr$   )rm   _init_weights
isinstancer]   initones_r   copy_r   r4   r   r   rp   r   )r   moduler   s     r;   rA  "BambaPreTrainedModel._init_weights  st    f%fj))JJv~~&JJv||UYYu||Av?O?ORS?S/T%UVJJvxx  *r:   r.   )r/   r0   r1   r2   r%   r6   base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_is_statefulr"  rW   _can_record_outputsr4   no_gradrA  r9   r  r  s   @r;   r=  r=    s\    &*#,-"3NL*$
 ]]_! !r:   r=  c                     ^  \ rS rSrS\4U 4S jjr\\\      SS\	R                  S-  S\	R                  S-  S\	R                  S-  S\S-  S	\	R                  S-  S
\S-  S\\   S\4S jj5       5       5       rS rSrU =r$ )
BambaModeli  r_   c           	      N  > [         TU ]  U5        UR                  U l        UR                  U l        [
        R                  " UR                  UR                  U R                  5      U l        / n[        UR                  5       H)  nUR                  [        XUR                  U   S95        M+     [
        R                  " U5      U l        UR                   U l        [#        UR                  UR$                  S9U l        [)        US9U l        SU l        U R/                  5         g )N)r`   r#  rh   r_   F)rm   rn   pad_token_idpadding_idx
vocab_sizer   	Embeddingrq   embed_tokensrangenum_hidden_layersappendr"  layers_block_type
ModuleListr   _attn_implementationr   r   final_layernormr>   
rotary_embgradient_checkpointing	post_init)r   r_   decoder_layersir   s       r;   rn   BambaModel.__init__  s     !.. ++LL):):F<N<NPTP`P`av//0A!!"3FTZTlTlmnTo"pq 1mmN3$*$?$?!+F,>,>FDWDWX.f=&+#r:   N	input_idsr   r-  r.  inputs_embedsr/  r  r1  c           
      Z   US L US L-  (       a  [        S5      eUc  U R                  U5      nUnU(       a  Uc  [        U R                  S9nUc;  [        R
                  " UR                  S   UR                  S9R                  S5      n[        U R                  UUUUS9n	U R                  X$5      n
U R                  XS9n[        U R                  5       H7  u  pU R                  R                  U   S:X  a  U
OU	nU" U4UUUUUS	.UD6u  pM9     U R                  U5      n[!        UUS
9$ )Nz:You must specify exactly one of input_ids or inputs_embedsrT  r$   r   r   )r_   rh  r   r.  r-  )r-  r%  )r   r-  r.  r/  r0  )last_hidden_stater.  )r)  rY  r	   r_   r4   r   rG   r   rF   r   _update_mamba_maskra  	enumerater   r]  r`  r   )r   rg  r   r-  r.  rh  r/  r  r   causal_mask
mamba_maskr0  re  decoder_layer
layer_maskattn_weightss                   r;   r  BambaModel.forward  sO    -t";<YZZ  --i8M%0*$++>O <<(;(;A(>}G[G[\ffghiL(;;')+%
 ,,^M
"oomoW )$++ 6A'+{{'D'DQ'G7'RXcJ*7+)) /#$7+ +'M< !7 ,,];&++
 	
r:   c                     UnUb  UR                  5       (       d!  Ub   [        R                  " US:H  5      (       a  SnU$ )zV
No need for zeroing states when
    1. Cached forward
    2. Attending to all inputs
Nr$   )r   r4   r   )r   r   r.  rn  s       r;   rk  BambaModel._update_mamba_mask  sA     $
'O,N,N,P,P&599^q5H+I+IJr:   )r_  rY  r`  rb  r   rV  ra  rW  )NNNNNN)r/   r0   r1   r2   r%   rn   r   r   r   r4   r5   r  r   r;  r9  r   r'   r   r  rk  r9   r  r  s   @r;   rR  rR    s    { &   .2.204(,26!%3
##d*3
 t+3
 &&-	3

 3
 ((4/3
 $;3
 233
 
!3
    3
j r:   rR  c                   B  ^  \ rS rSrU 4S jr\\        SS\R                  S-  S\R                  S-  S\R                  S-  S\
S-  S\R                  S-  S	\R                  S-  S
\S-  S\\R                  -  S\4S jj5       5       r      SU 4S jjrSrU =r$ )BambaForCausalLMi  c                 f   > [         TU ]  U5        UR                  U l        U R                  5         g )N)rm   rn   z_loss_coefficientrc  )r   r_   r   s     r;   rn   BambaForCausalLM.__init__   s*     "(";"; 	r:   Nrg  r   r-  r.  rh  labelsr/  logits_to_keepr1  c	           
      D   U R                   " S
UUUUUUS.U	D6n
U
R                  n[        U[        5      (       a  [	        U* S5      OUnU R                  USS2USS24   5      nSnUb  U R                  " S
XU R                  R                  S.U	D6nU R                  S:  aT  UR                  SS9R                  UR                  S9R                  S5      R                  5       nXR                  U-  -   n[        UUU
R                   U
R"                  U
R$                  S	9$ )a  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

Example:

```python
>>> from transformers import AutoTokenizer, BambaForCausalLM

>>> model = BambaForCausalLM.from_pretrained("...")
>>> tokenizer = AutoTokenizer.from_pretrained("...")

>>> prompt = "Hey, are you conscious? Can you talk to me?"
>>> inputs = tokenizer(prompt, return_tensors="pt")

>>> # Generate
>>> generate_ids = model.generate(inputs.input_ids, max_length=30)
>>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
"Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
```)rg  r   r-  r.  rh  r/  N)logitsrz  rW  r   rC   rD   r   r   )lossr}  r.  r   r?  r.   )r>  rj  rB  r7   slicelm_headloss_functionr_   rW  rx  	logsumexpr   r   powmeanr   r.  r   r?  )r   rg  r   r-  r.  rh  rz  r/  r{  r  outputsr   slice_indicesr}  r~  z_losss                   r;   r  BambaForCausalLM.forward'  s1   H ,0:: ,
)%+',
 ,
  118B>SV8W8W~ot4]kmA}a,?@A%%pVt{{OeOepiopD&&*))b)1444::4FJJ1MRRT55>>%#33!//))
 	
r:   c           
      j   > U R                   R                  US'   [        T
U ]  " U4UUUUUUS.UD6n	U	$ )Nr{  )r.  r   rh  r-  r/  is_first_iteration)r_   num_logits_to_keeprm   prepare_inputs_for_generation)r   rg  r.  r   rh  r-  r/  r  r  model_inputsr   s             r;   r  .BambaForCausalLM.prepare_inputs_for_generationh  sU     $(;;#A#A w<	
+)'%1	
 	
 r:   )rx  )NNNNNNNr   )NNNNTF)r/   r0   r1   r2   rn   r   r   r4   r5   r  r   r;  r9  r7   r   r  r  r9   r  r  s   @r;   rv  rv    s      .2.204(,26*.!%-.=
##d*=
 t+=
 &&-	=

 =
 ((4/=
   4'=
 $;=
 ell*=
 
 =
  =
D   r:   rv  )rR  rv  r=  )r$   )Er3   typingr   r4   r    r   rC  activationsr   cache_utilsr   r	   integrations.hub_kernelsr
   masking_utilsr   modeling_outputsr   r   modeling_utilsr   processing_utilsr   utilsr   r   r   r   utils.genericr   utils.import_utilsr   utils.output_capturingr   jamba.modeling_jambar   llama.modeling_llamar   r   r   r   r   r   mamba2.modeling_mamba2r   r    r!   r"   r#   configuration_bambar%   
get_loggerr/   r   r'   r>   rU   rW   rZ   Moduler]   r  r   r"  r=  rR  rv  __all__r.   r:   r;   <module>r     s0  &     & ! . 8 / O - & X X 7 9 5 =   - 
		H	%	 0	/ 	
#L	^ 		) 	
\O \O~	x 		< 	:02 :0z !? ! !. W% W Wt`' `F Er:   