
    Z j{                     B   S SK Jr  S SKJrJr  S SKrS SKJr  SSKJr	  SSK
Jr  SSKJrJr  SS	KJr  SS
KJrJr  SSKJr  SSKJr  SSKJr  SSKJrJr  SSKJrJr  SSK J!r!J"r"  SSK#J$r$  SSK%J&r&J'r'J(r(J)r)J*r*  SSK+J,r,J-r-  SSK.J/r/  SSK0J1r1  SSK2J3r3  \*Rh                  " \55      r6 " S S\SS9r7 " S S\Rp                  5      r9S r:S\Rv                  S \<S!\Rv                  4S" jr= SGS#\Rp                  S$\Rv                  S%\Rv                  S&\Rv                  S'\Rv                  S-  S(\>S)\>S*\$\&   4S+ jjr?SHS, jr@\" \@5       " S- S.\Rp                  5      5       rA " S/ S0\R                  Rp                  5      rBS1\Rv                  S2\<4S3 jrCS4 rDS5 rES6 rF " S7 S8\Rp                  5      rG " S9 S:\Rp                  5      rH\" S;5       " S< S=\Rp                  5      5       rI " S> S?\5      rJ\' " S@ SA\"5      5       rK\' " SB SC\K5      5       rL\' " SD SE\K\5      5       rM/ SFQrNg)I    )Callable)Optional	TypedDictN)nn   )initialization)ACT2FN)CacheDynamicCache)GenerationMixin)use_kernel_forward_from_hubuse_kernelized_func)lazy_load_kernel)create_causal_mask)GradientCheckpointingLayer)BaseModelOutputWithPastCausalLMOutputWithPast)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tupleis_torchdynamo_compilinglogging)maybe_autocastmerge_with_config_defaults)resolve_internal_import)capture_outputs   )BambaConfigc                       \ rS rSr% Sr\R                  \S'   \R                  \S'   \\S'   \\S'   \R                  \S'   Sr
g	)
BambaFlashAttentionKwargs6   a!  
Keyword arguments for advanced Flash Attention, causal-conv1d, and mamba_ssm kernel usage.
Use cases include padding-free training and fewer `torch.compile` graph breaks.

cu_seq_lens_q (`torch.LongTensor`):
    Gets cumulative sequence length for query state.
cu_seq_lens_k (`torch.LongTensor`):
    Gets cumulative sequence length for key state.
max_length_q (`int`):
    Maximum sequence length for query state.
max_length_k (`int`):
    Maximum sequence length for key state.
seq_idx (`torch.IntTensor`):
    Index of each packed sequence.
cu_seq_lens_qcu_seq_lens_kmax_length_qmax_length_kseq_idx N)__name__
__module____qualname____firstlineno____doc__torch
LongTensor__annotations__int	IntTensor__static_attributes__r,       y/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/bamba/modeling_bamba.pyr%   r%   6   s7      ######__r8   r%   F)totalc                      ^  \ rS rSr% \R
                  \S'   SS\4U 4S jjjr\	   SS\S-  S\
S   S\S-  S	\S
\4   4S jj5       r\R                  " 5       \S 5       5       rSrU =r$ )BambaRotaryEmbeddingN   inv_freqNconfigc                   > [         TU ]  5         UR                  U l        UR                  U l        Xl        U R
                  R                  S   U l        U R                  nU R                  S:w  a  [        U R                     nU" U R
                  U5      u  o@l
        U R                  SUSS9  U R                  SUR                  5       SS9  g )N	rope_typedefaultr>   F)
persistentoriginal_inv_freq)super__init__max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenr?   rope_parametersrA   compute_default_rope_parametersr   attention_scalingregister_bufferclone)selfr?   devicerope_init_fnr>   	__class__s        r9   rF   BambaRotaryEmbedding.__init__Q   s    "("@"@$*$B$B!44[A!%!E!E>>Y&.t~~>L+7V+L((ZeD0(..2BuUr8   rP   ztorch.deviceseq_lenreturnztorch.Tensorc           	         U R                   S   n[        U SS5      =(       d    U R                  U R                  -  nSnSU[        R
                  " SUS[        R                  S9R                  U[        R                  S9U-  -  -  nXe4$ )	aH  
Computes the inverse frequencies according to the original RoPE implementation
Args:
    config ([`~transformers.PreTrainedConfig`]):
        The model configuration.
    device (`torch.device`):
        The device to use for initialization of the inverse frequencies.
    seq_len (`int`, *optional*):
        The current sequence length. Unused for this type of RoPE.
Returns:
    Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
    post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).

rope_thetahead_dimNg      ?r      dtyperP   r[   )	rJ   getattrhidden_sizenum_attention_headsr2   arangeint64tofloat)r?   rP   rT   basedimattention_factorr>   s          r9   rK   4BambaRotaryEmbedding.compute_default_rope_parametersa   s    & %%l3fj$/c63E3EIcIc3c U\\!S!5;;?BB&X]XcXcBdgjjk
 ))r8   c                 L   U R                   S S S 2S 4   R                  5       R                  UR                  S   SS5      R	                  UR
                  5      nUS S 2S S S 24   R                  5       n[        UR
                  R                  [        5      (       a0  UR
                  R                  S:w  a  UR
                  R                  OSn[        USS9   UR                  5       UR                  5       -  R                  SS5      n[        R                  " Xf4SS	9nUR                  5       U R                  -  nUR                  5       U R                  -  n	S S S 5        WR	                  UR                   S
9W	R	                  UR                   S
94$ ! , (       d  f       N@= f)Nr   r"   mpscpuF)device_typeenabledrY   re   rZ   )r>   rc   expandshaperb   rP   
isinstancetypestrr   	transposer2   catcosrL   sinr[   )
rO   xposition_idsinv_freq_expandedposition_ids_expandedrl   freqsembrv   rw   s
             r9   forwardBambaRotaryEmbedding.forward   sN    !MM$4-8>>@GGHZHZ[\H]_acdehhijiqiqr ,QaZ 8 > > @'1!((--'E'E!((--[`J`ahhmmfkUC&,,.1F1L1L1NNYYZ[]^_E))UN3C'')d444C'')d444C	 D vvAGGv$cff177f&;;; DCs   BF
F#)rL   r?   rH   rI   rA   NNNN)r-   r.   r/   r0   r2   Tensorr4   r#   rF   staticmethodr   r5   tuplerc   rK   no_gradr   r~   r7   __classcell__rR   s   @r9   r<   r<   N   s    llV{ V V  %)+/"*d"*(* t* 
~u$	%	* *: ]]_<  <r8   r<   c                     U SSU R                   S   S-  24   nU SU R                   S   S-  S24   n[        R                  " U* U4SS9$ )z*Rotates half the hidden dims of the input..Nri   rY   rn   )rp   r2   ru   )rx   x1x2s      r9   rotate_halfr      sZ    	
3"!''"+"""	#B	
3q ""	#B99rc2YB''r8   hidden_statesn_reprU   c                     U R                   u  p#pEUS:X  a  U $ U SS2SS2SSS2SS24   R                  X#XU5      n U R                  X#U-  XE5      $ )z
This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
r"   N)rp   ro   reshape)r   r   batchnum_key_value_headsslenrX   s         r9   	repeat_kvr      s_    
 2?1D1D.Ez!!Qa"23::5W\dlmM  e(CTTTr8   modulequerykeyvalueattention_maskscalingdropoutkwargsc                    [        X R                  5      n[        X0R                  5      n	[        R                  " XR	                  SS5      5      U-  n
Ub  X-   n
[
        R                  R                  U
S[        R                  S9R                  UR                  5      n
[
        R                  R                  XU R                  S9n
[        R                  " X5      nUR	                  SS5      R                  5       nX4$ )NrY   r   ri   )re   r[   )ptrainingr"   )r   num_key_value_groupsr2   matmulrt   r   
functionalsoftmaxfloat32rb   r[   r   r   
contiguous)r   r   r   r   r   r   r   r   
key_statesvalue_statesattn_weightsattn_outputs               r9   eager_attention_forwardr      s     3 ; ;<JU$?$?@L<<';';Aq'ABWLL!#4==((2U]](SVVW\WbWbcL==((6??([L,,|:K''1-88:K$$r8   c                 R   UR                  U5      nUR                  U5      nUR                  S   nU SSU24   U SUS24   pvUSSU24   USUS24   pXb-  [        U5      U-  -   n
X-  [        U5      U-  -   n[        R                  " X/SS9n
[        R                  " X/SS9nX4$ )a{  Applies Rotary Position Embedding to the query and key tensors.

Removes the interleaving of cos and sin from GLM

Args:
    q (`torch.Tensor`): The query tensor.
    k (`torch.Tensor`): The key tensor.
    cos (`torch.Tensor`): The cosine part of the rotary embedding.
    sin (`torch.Tensor`): The sine part of the rotary embedding.
    unsqueeze_dim (`int`, *optional*, defaults to 1):
        The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
        sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
        that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
        k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
        cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
        the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
Returns:
    `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
ri   .Nrn   )	unsqueezerp   r   r2   ru   )qkrv   rw   unsqueeze_dim
rotary_dimq_rotq_passk_rotk_passq_embedk_embeds               r9   apply_rotary_pos_embr      s    ( --
&C
--
&C 2Jc;J;&'3
+;)<6c;J;&'3
+;)<6 {{51C78G{{51C78G ii)r2Gii)r2Gr8   c                     ^  \ rS rSrSrS\S\4U 4S jjr   SS\R                  S\
\R                  \R                  4   S-  S	\R                  S-  S
\S-  S\\   S\
\R                  \R                  4   4S jjrSrU =r$ )BambaAttention   z=Multi-headed attention from 'Attention Is All You Need' paperr?   	layer_idxc                 P  > [         TU ]  5         Xl        X l        [	        USUR
                  UR                  -  5      U l        UR                  UR                  -  U l	        U R                  S-  U l
        UR                  U l        SU l        [        R                  " UR
                  UR                  U R                  -  UR                  S9U l        [        R                  " UR
                  UR                  U R                  -  UR                  S9U l        [        R                  " UR
                  UR                  U R                  -  UR                  S9U l        [        R                  " UR                  U R                  -  UR
                  UR                  S9U l        g )NrX   g      Tbias)rE   rF   r?   r   r]   r^   r_   rX   r   r   r   attention_dropout	is_causalr   Linearattention_biasq_projk_projv_projo_proj)rO   r?   r   rR   s      r9   rF   BambaAttention.__init__   sI   "
F4F4F&JdJd4de$*$>$>&B\B\$\!}}d*!'!9!9ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii&&68J8JQWQfQf
r8   Nr   position_embeddingsr   past_key_valuesr   rU   c                    UR                   S S n/ UQSPU R                  P7nU R                  U5      R                  U5      R	                  SS5      nU R                  U5      R                  U5      R	                  SS5      n	U R                  U5      R                  U5      R	                  SS5      n
Uu  p[        XX5      u  pUb  UR                  XU R                  5      u  p[        R                  " U R                  R                  [        5      nU" U UU	U
U4U R                  (       d  SOU R                   U R"                  S.UD6u  pUR$                  " / UQSP76 R'                  5       nU R)                  U5      nX4$ )Nri   r"   rY           )r   r   )rp   rX   r   viewrt   r   r   r   updater   r   get_interfacer?   _attn_implementationr   r   r   r   r   r   r   )rO   r   r   r   r   r   input_shapehidden_shapequery_statesr   r   rv   rw   attention_interfacer   r   s                   r9   r~   BambaAttention.forward   s~    $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST&#7RU#[ &'6'='=jX\XfXf'g$J(?(M(MKK,,.E)
 %8	%
  $}}C$2H2HLL	%
 	%
! "));;;;FFHkk+.((r8   )r   r?   rX   r   r   r   r   r   r   r   r   r   )r-   r.   r/   r0   r1   r#   r5   rF   r2   r   r   r
   r   r   r~   r7   r   r   s   @r9   r   r      s    G
{ 
s 
4 IM.2(,&)||&) #5<<#=>E&) t+	&)
 &) +,&) 
u||U\\)	*&) &)r8   r   c                   6   ^  \ rS rSrSU 4S jjrSS jrSrU =r$ )BambaRMSNormGatedi&  c                    > [         TU ]  5         [        R                  " [        R
                  " U5      5      U l        X l        g r   rE   rF   r   	Parameterr2   onesweightvariance_epsilonrO   r^   epsrR   s      r9   rF   BambaRMSNormGated.__init__'  s-    ll5::k#:; #r8   c                    UR                   nUR                  [        R                  5      nUb?  U[        R
                  R                  UR                  [        R                  5      5      -  nUR                  S5      R                  SSS9nU[        R                  " X@R                  -   5      -  nU R                  UR                  U5      -  $ NrY   ri   T)keepdim)r[   rb   r2   r   r   r   silupowmeanrsqrtr   r   )rO   r   gateinput_dtypevariances        r9   r~   BambaRMSNormGated.forward,  s    #))%((7)BMM,>,>twwu}}?U,VVM $$Q',,R,>%H?T?T4T(UU{{]--k:::r8   r   r   gư>r   r-   r.   r/   r0   rF   r~   r7   r   r   s   @r9   r   r   &  s    $
	; 	;r8   r   input_tensorpad_sizec                     [        U R                  5      S:X  a
  SSSSSUSS4OSSSUSS4n[        R                  R                  R                  XSSS9$ )zv
Padding x tensor with `pad_size` on the seq_len dim (dim=1)

Assumes that we only have tensors of either size 4 or 3
   r   constant)moder   )lenrp   r2   r   r   pad)r   r   	pad_shapes      r9   pad_tensor_by_sizer   ;  sd     47|7I7I3Ja3OAq!Q!Q/VWYZ\]_gijlmUnI88""<ST"UUr8   c                    [        X5      n [        U R                  5      S:X  a-  U R                  U R                  S   SX R                  S   5      $ U R                  U R                  S   SX R                  S   U R                  S   5      $ )z
Padding input_tensor with `pad_size` on the seq_len dim (dim=1) and
simultaneously splitting it into chunk sequences.

Assumes that we only have tensors of either size 4 or 3
r   r   ri   rY   )r   r   rp   r   )r   r   
chunk_sizes      r9   reshape_into_chunksr   F  s     &l=L
<!###L$6$6q$92zK]K]^_K`aa ##q!2z3E3Ea3H,J\J\]^J_
 	
r8   c           	      
   U R                  S5      nU S   R                  " / U R                  5       QUP76 n [        R                  " [        R                  " XU R
                  [        R                  S9SS9nU R                  U) S5      n [        R                  " U SS9n[        R                  " [        R                  " XU R
                  [        R                  S9SS9nUR                  U) [        R                  * 5      nU$ )zg
More stable segment sum calculation. Uses cumulative sums and masking instead of direct subtractions.
ri   .Nr\   )diagonalr   rn   )
sizero   r2   trilr   rP   boolmasked_fillcumsuminf)r   r   masktensor_segsums       r9   segment_sumr  Z  s     ""2&J  	*11S<3D3D3FS
SL::ejj@S@S[`[e[efqstD++TE15LLL26M ::ejj@S@S[`[e[efqrsD!--teeiiZ@Mr8   c                     UbO  UR                   S   S:  a<  UR                   S   S:  a)  U R                  nXSS2SS2S4   -  R                  U5      n U $ )ze
Tunes out the hidden states for padding tokens, see https://github.com/state-spaces/mamba/issues/66
Nr"   r   )rp   r[   rb   )r   r   r[   s      r9   apply_mask_to_padding_statesr  n  s_    
 !n&:&:1&=&AnFZFZ[\F]`aFa##&1d
)CCGGNr8   c            
       >  ^  \ rS rSrSrS\S\4U 4S jjr   SS\R                  S\
S-  S	\R                  S-  S
\R                  S-  4S jjr  SS\
S-  S	\R                  S-  4S jjr   SS\
S-  S	\R                  S-  S
\R                  S-  4S jjrSrU =r$ )
BambaMixeri{  u(  
Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`.
A, D are input independent (see Mamba paper [1] Section 3.5.2 "Interpretation of A" for why A isn't selective)
∆, B, C are input-dependent (this is a key difference between Mamba and the linear time invariant S4,
and is why Mamba is called **selective** state spaces)

The are a few differences between this and Mamba2Mixer:
- The variable use_precomputed_states is slightly different due to the hybrid cache structure
- There's a few non-obvious bugs fixed with batching in the slow path that exist in main
- Some extra variables that our layer doesn't need have been removed
- We ported most of the refactors in https://github.com/huggingface/transformers/pull/35154, which is (as of Dec 18, 2024) unmerged
r?   r   c           	        > [         TU ]  5         UR                  U l        UR                  U l        UR
                  U l        UR                  U l        [        UR                  U R                  -  5      U l        X l        UR                  U l        UR                  U l        ["        UR                     U l        UR&                  U l        UR*                  U l        UR.                  U l        UR2                  U l        UR6                  U l        UR:                  U l        UR<                  U l        UR>                  U l        U R                  SU R0                  -  U R                  -  -   U l         [B        RD                  " U R@                  U R@                  UR                  U R                  U R@                  U R                  S-
  S9U l#        U R                  U R@                  -   U R                  -   n[B        RH                  " U R                  UU R(                  S9U l%        [B        RL                  " [N        RP                  " U R                  5      5      U l)        [N        RT                  " SU R                  S-   5      n[B        RL                  " [N        RV                  " U5      5      U l,        [[        U R                  U R,                  S9U l.        [B        RL                  " [N        RP                  " U R                  5      5      U l/        [B        RH                  " U R                  U R                  U R(                  S9U l0        [c        S5      n[e        USS 5      q3[e        USS 5      q4[c        S	5      n[k        US
S9q6[k        USS9q7[k        USS9q8[s        [l        [n        [p        [h        [f        45      q:[t        (       d  [v        Ry                  S5        g [v        Ry                  S5        g )NrY   r"   )in_channelsout_channelsr   kernel_sizegroupspaddingr   r   zcausal-conv1dcausal_conv1d_updatecausal_conv1d_fnz	mamba-ssmz8ops.triton.selective_state_update.selective_state_update)chained_pathz1ops.triton.ssd_combined.mamba_chunk_scan_combinedz8ops.triton.ssd_combined.mamba_split_conv1d_scan_combineda  The fast path is not available because one of `(selective_state_update, causal_conv1d_fn, causal_conv1d_update)` is None. Falling back to the naive implementation. To install follow https://github.com/state-spaces/mamba/#installation and https://github.com/Dao-AILab/causal-conv1dzDThe fast path for Bamba will be used when running the model on a GPU)=rE   rF   mamba_n_heads	num_headsr^   mamba_d_statessm_state_sizemamba_d_convconv_kernel_sizer5   mamba_expandintermediate_sizer   mamba_conv_biasuse_conv_bias
hidden_act
activationr	   actmamba_proj_biasuse_biasrms_norm_epslayer_norm_epsilonmamba_n_groupsn_groupsmamba_d_headrX   mamba_chunk_sizer   time_step_limittime_step_mintime_step_maxconv_dimr   Conv1dconv1dr   in_projr   r2   r   dt_biasr`   logA_logr   normDout_projr   r]   r  r  r    selective_state_updatemamba_chunk_scan_combined mamba_split_conv1d_scan_combinedallis_fast_path_availableloggerwarning_once)rO   r?   r   projection_sizeAcausal_conv1d	mamba_ssmrR   s          r9   rF   BambaMixer.__init__  s   --!--$22 & 3 3!$V%8%84;K;K%K!L"#33 ++&++,.."("5"5--++ 11%55#11#11..T]]1BTEXEX1XXii''--==))A-
 004==@4>>Qyy
 ||EJJt~~$>? LLDNNQ./\\%))A,/
%d&<&<$BYBYZ	ejj89		$"8"8$:J:JQUQ^Q^_ )9&}6LdS"=2DdK %[1	!8$^"
 %<$W%
! ,C$^,
(
 "%&)0 $"
 &%>  fgr8   Nr   cache_paramsr   r+   c                    [        X5      nU R                  U5      nUR                  u  pgnU R                  U R                  -  n	US L=(       a'    UR                  U R                  5      =(       a    US:H  n
U
(       Ga  UR                  S5      R                  U R                  U R                  U R                  /SS9u  pn[        UUR                  U R                     R                  U R                  R                   R                  S5      U R                  R"                  U R$                  5      n[&        R                  " UU R                  X/SS9u  pn[&        R(                  " U R*                  R-                  5       5      * nUS S 2S S4   S S 2S S 2S 4   R/                  SU R0                  U R                  5      R3                  [&        R4                  S9nUS S 2S S 2S 4   R/                  SSU R0                  5      nU R6                  S S 2S S4   R/                  SU R0                  5      nU R8                  S S 2S S4   R/                  SU R0                  5      nUR;                  X`R                  UR                  S   U R                  -  5      nUR;                  X`R                  UR                  S   U R                  -  5      nUR;                  X`R                  U R0                  5      n[=        UR                  U R                     R>                  UUUUUUS USS9
nUR;                  X`R                  U R0                  -  5      nU RA                  X5      nU RC                  U5      S S 2S S4   nU$ [&        R(                  " U R*                  R-                  5       5      * nU RD                  S[-        S	5      4:X  a  0 OS
U RD                  0nU RF                  (       a  Uc  [I        UU R                  R                   R                  S5      U R                  R"                  U R6                  U4U R8                  U RJ                  UU R$                  U R@                  R                   U R@                  RL                  U RB                  R                   U RB                  R"                  U R0                  U R                  SSS.UD6nU$ UR                  U R                  U R                  U R                  /SS9u  pnUbj  URO                  SS5      n[P        RR                  RU                  UU RV                  UR                  S   -
  S45      nURY                  UU R                  5      nU R$                  S;  aH  U R[                  U R                  URO                  SS5      5      SS U24   RO                  SS5      5      nOn[]        URO                  SS5      U R                  R                   R                  S5      U R                  R"                  U R$                  US9RO                  SS5      n[        X5      n[&        R                  " UU R                  X/SS9u  pn[_        UR;                  XgSU R0                  5      UUUR;                  XgU R                  S5      UR;                  XgU R                  S5      4U RJ                  U R8                  S USU R6                  SS.UD6u  nnUb  Ub  URa                  UU R                  5      nUR;                  XgS5      nU RA                  UU5      nU RC                  U5      nU$ )Nr"   ri   rn   .rZ   T)zr1  dt_softplusr   r  dt_limitF)r5  r   r+   r   rmsnorm_weightrmsnorm_epsoutproj_weightoutproj_biasheaddimngroupsnorm_before_gatereturn_final_statesrY   r   )r   swish)rx   r   r   r   r+   )r   r5  rE  r+   rO  r1  rF  )1r  r0  rp   r'  r  has_previous_stater   squeezesplitr  r-  r  r  layersconv_statesr/  r   r   r   r2   expr3  rc   ro   rX   rb   r   r1  r5  r   r7  recurrent_statesr4  r6  r*  r   r9  r   r   rt   r   r   r   r  update_conv_stater!  r  r8  update_recurrent_state)rO   r   rC  r   r+   projected_states
batch_sizerT   _groups_time_state_sizeuse_precomputed_statesr   hidden_states_B_CdtBCr?  r1  r5  hidden_states_reshapedoutdt_limit_kwargshidden_states_B_C_transposedrU  scan_output	ssm_states                             r9   cuda_kernels_forwardBambaMixer.cuda_kernels_forward  s{    5]S<<6 "/!4!4
Q!%1D1D!D $i)H)H)Xi]dhi]i 	
 "*:*B*B1*E*K*K''GR +L +'DR
 !5!##DNN3??""**1-  ! #(++!'')?X#Ma 4::++-..A!T3,1d
+222t}}dFYFYZ]]didqdq]rAAq$J&&r2t}}=Bll1dC<077DMMJGq$|$++B>Az==!''!*2MNAz==!''!*2MNA%2%7%7
NNTXTaTa%b"2##DNN3DD& M *..z>>DMM;YZM IIm:M --.q$|<C| 
w 4::++-..A$($8$8S%,<O$ObV`bfbvbvUwO }}!56$KK&&..q1KK$$LL ff####'99#3#3 $		 : :#'==#7#7!%!3!3 MM MM%*(-#$ &%l 
A /?.D.D++T]]DNNKQS /E /+  + 4E3N3NqRS3T0"$--"3"34..1M1S1STV1WWYZ[#K #/"@"@dnn"]K??*;;(,$5$?$?1$EFsHWH}U__`acde)% )9+55a;#{{1199!<![[--#'?? ')  i1o & %AAR$c!&+kk%++-C\'#! *C!&&zBNFF:rBFF:rB*  $ff#(, LL $* &*&Y" (\-E , C CIt~~ ^I)..zBG"iiT: mmK0
r8   c                    UR                   u  pEnUR                  n[        X5      nU R                  U5      nUR	                  U R
                  U R                  U R                  /SS9u  pnU
R                  SS5      n
US L=(       a'    UR                  U R                  5      =(       a    US:H  nU(       a  UR                  XR                  5      n[        R                  " XR                  R                  R!                  S5      -  SS9n
U R"                  (       a  XR                  R$                  -   n
U R'                  U
5      n
OUbV  [(        R*                  R-                  XR.                  U
R                   S   -
  S45      nUR                  XR                  5      nU R'                  U R                  U
5      SS U24   R                  SS5      5      n
[        X5      n
[        R                  " U
U R
                  U R0                  U R2                  -  U R0                  U R2                  -  /SS9u  pn[        R4                  " U R6                  R9                  5       5      * nU(       Ga  UR:                  U R                     R<                  R>                  nUS S 2SS S 24   S S 2S S4   nUR                  SS5      RA                  XKR                   S   U RB                  5      nU RD                  S   RA                  U RD                  R                   S   U RB                  5      n[        R(                  R*                  RG                  UURI                  UR                  5      -   5      n[        RJ                  " XRL                  S   U RL                  S   5      nUS   RA                  U R                  U RB                  U R2                  5      RI                  [        RN                  S	9n[        R4                  " US   U-  5      RI                  US
9nURQ                  X@R0                  S5      SS S S 24   nURA                  X@R0                  U R                  U R0                  -  UR                   S   5      RS                  5       nURQ                  USUR                   S   5      nUS   USS S S 24   -  nURQ                  USU RB                  5      nUUS   -  RI                  US
9nUR:                  U R                     R<                  U-  U-   nURU                  UU R                  5      nURQ                  X@R0                  S5      SS S S 24   nURA                  X@R0                  U R                  U R0                  -  UR                   S   5      RS                  5       nURQ                  USUR                   S   5      nURI                  UR>                  UR                  S9nURW                  X@R                  -  U RB                  U R2                  5      nURW                  X@R                  -  U R2                  S5      n[        RX                  " UU5      nURW                  X@R                  U RB                  5      nU RZ                  S   RA                  U RZ                  R                   S   U RB                  5      nUUU-  -   RI                  UR                  5      nURQ                  US5      S S 2S S4   nGO[(        R*                  RG                  XRD                  -   5      n[        RJ                  " XRL                  S   U RL                  S   5      nURQ                  XESU RB                  5      R9                  5       nURQ                  XESU R2                  5      R9                  5       nURQ                  XESU R2                  5      R9                  5       nUR]                  U R                  U R0                  -  SU R                  S9nUR]                  U R                  U R0                  -  SU R                  S9nU R^                  XPR^                  -  -
  U R^                  -  nU RZ                  S   [a        UU5      -  nXS   -  nURI                  UR                  5      U-  nUUUU4 Vs/ s H  n[c        UUU R^                  5      PM     snu  nnnnURe                  SSSS5      n[        Rf                  " USS9n[        R4                  " [i        U5      5      n US S 2S S 2S S 2S S S 2S S 24   US S 2S S 2S S S 2S S 2S S 24   -  n!U!R                  SS9n"U"S   U Re                  SSSSS5      S   -  n#U#R                  SS9n$U$S   US S 2S S 2S 4   -  R                  SS9n%[        R4                  " US S 2S S 2S S 2SS 24   U-
  5      n&UU&Re                  SSSS5      S   -  n'U'SS S S 24   US   -  R                  SS9n([        Rj                  " U(S S 2S S24   5      n)[        Rl                  " U)U(/SS9n([        R4                  " [i        [(        R*                  R-                  US S 2S S 2S S 2S4   S5      5      5      n*U*R                  SS5      n*U*S   U(S S 2S S 2S S4   -  R                  SS9n+U+S S 2S S24   U+S S 2S4   n,n([        R4                  " U5      n-USS S S 24   U(S S 2S S 2S S4   -  n.U-Re                  SSSS5      n/U.R                  S5      U/S   -  n0U%U0-   nURQ                  USU R                  U RB                  5      nUU-   nUS:  a  US S 2S U2S S 2S S 24   nURQ                  XES5      nU,b  Ub  URU                  U,U R                  5      n,U Ro                  UU	5      n1U Rq                  U1RI                  U5      5      n2U2$ s  snf )Nri   rn   r"   rY   r   .r   ).NNrZ   rP   r\   )re   output_sizer   r   r   )r"   r   )9rp   r[   r  r0  rS  r  r-  r  rt   rQ  r   rX  r2   sumr/  r   rR  r  r   r!  r   r   r   r  r'  r  rV  r3  rc   rT  rW  rP   ro   rX   r1  softplusrb   clampr*  r   r   r   rY  r   bmmr5  repeat_interleaver   r   r   permuter  r  
zeros_likeru   r4  r6  )3rO   input_statesrC  r   r[  rT   r\  r[   rZ  r   r_  r`  r^  rU  r   ra  rb  r?  cache_devicer1  dAdBdBx
ssm_statesssm_states_reshaped
C_reshapedyr5  r   
D_residualtA_cumsumLG_intermediateGM_intermediateMY_diagdecay_statesB_decaystatesprevious_statesdecay_chunk
new_statesrh  state_decay_outC_times_statesstate_decay_out_permutedY_offrg  contextualized_statess3                                                      r9   torch_forwardBambaMixer.torch_forward  s
    ".!3!3
Q"" 4LQ<<5&6&<&<''GR '= '
# .77!<!-T!9!~l>]>]^b^l^l>m!~ry}~r~ "&889JNN[K %		kk0088;;! !!$58H8H$H! $): ; ' mm//%(=(=@Q@W@WXZ@[([]^'_ +<<[..Y $5F)GXgX)V)`)`abde)f g89J[#kk##T]]T5H5H%H$--Z^ZmZmJmn
! YYtzz'')**!'..t~~>OOVVL Aq!GQc\*Ba#**:xx|T]]SBll9-44T\\5G5G5JDMMZG$$--b7::bhh3G.GHBR!5!5a!8$:N:Nq:QRB/"))$..$--I\I\]``glgtgt`uA))ByMA-.22,2GB
 		*mmR8dAFA]]DNNdmm4SUVU\U\]_U`allnA		*b!''"+6AI3a<0B *11*b$--PMi0044L4IC &,,T^^<MMPRRUXXJ%<<ZXJ 		*mmR8dAFA]]DNNdmm4SUVU\U\]_U`allnA		*b!''"+6A $ahhaggFJ",//*~~2Mt}}^b^q^q"r
^^ ;T=P=PRSTJ		-z:Az>>4==AA y!((a$--HA]Q&&**1773A 		*b)!T3,7A ''\\(9:BR!5!5a!8$:N:Nq:QRB)11*r4==Y__aM		*r43F3FGMMOA		*r43F3FGMMOA##DNNdmm$CX\XfXf#gA##DNNdmm$CX\XfXf#gA'OO*CCtVH	*-?x-XXJ *yM9M](()B.A cpqrtuwxay%zay\]&9!Xt&Way%z"M1a 		!Q1%A||A2.H 		+a.)A q!Qa23a1dAq!8K6LLN""r"*A y\AIIaAq!,DY,OON""r"*A 	l]1a:%>>CCCJF !99XaArsl%;h%FGL,..q"b!<YGGGc4l+mI.FFKKPQKRF $..va!e}=OYY8a@F))K0A0A(1aQRTV;BWY_0`$abK%//15K%o61dC9PPUUZ[U\J *1crc6 2Jq"u4EIF $ii1OT1oq!T30GGN'6'>'>q!Q'J$#''+.Fy.QQE A		*b$..$--HAJA!|a'1a'(		*r2A $)A(??	4>>Z	ii4(
 !%knnU.C D$$A &{s   .!oc                    [         (       aO  SU R                  R                  R                  R                  ;   a!  [        5       (       d  U R                  XX45      $ Ub  [        S5      eUR                  nUbC  UR                  S   S:  a0  UR                  S   S:  a  XS S 2S S 2S 4   -  R                  U5      nU R                  XU5      $ )Ncudaz\`seq_idx` support requires fast path support. Please install `mamba_ssm` and `causal_conv1d`r"   r   )r;  r0  r   rP   rr   r   ri  NotImplementedErrorr[   rp   rb   r  )rO   r   rC  r   r+   r   r[   s          r9   r~   BambaMixer.forwardD  s     "!f0C0C0J0J0O0O&OXpXrXr,,].bb%n  ##%.*>*>q*AA*E.J^J^_`JadeJe*Aq$J-GGKKERM!!-~NNr8   )r3  r5  r!  r   r   r/  r-  r  r1  rX   r^   r0  r  r   r%  r'  r4  r  r6  r  r*  r,  r+  r#  r  r   )NN)r-   r.   r/   r0   r1   r#   r5   rF   r2   r   r
   r6   ri  r  r~   r7   r   r   s   @r9   r
  r
  {  s    Zh{ Zhs Zh~ &*.2*._||_ dl_ t+	_
 4'_J &*.2	z% dlz% t+	z%@ &*.2*.O dlO t+	O
 4'O Or8   r
  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )BambaMLPiZ  c                   > [         TU ]  5         Xl        UR                  U l        UR                  U l        [
        R                  " U R                  U R                  UR                  S9U l        [
        R                  " U R                  U R                  UR                  S9U l	        [
        R                  " U R                  U R                  UR                  S9U l
        [        UR                     U l        g )Nr   )rE   rF   r?   r^   r  r   r   mlp_bias	gate_projup_proj	down_projr	   r  act_fnrO   r?   rR   s     r9   rF   BambaMLP.__init__[  s    !--!'!9!94#3#3T5K5KRXRaRabyy!1!143I3IPVP_P_`4#9#94;K;KRXRaRabV../r8   c                     U R                  U R                  U R                  U5      5      U R                  U5      -  5      nU$ r   )r  r  r  r  )rO   rx   r  s      r9   r~   BambaMLP.forwarde  s6    NN4;;t~~a/@#ADLLQRO#ST	r8   )r  r?   r  r  r^   r  r  r   r   s   @r9   r  r  Z  s    0 r8   r  RMSNormc                   x   ^  \ rS rSrS
S\SS4U 4S jjjrS\R                  S\R                  4S jrS r	S	r
U =r$ )BambaRMSNormij  r   rU   Nc                    > [         TU ]  5         [        R                  " [        R
                  " U5      5      U l        X l        g)z+
BambaRMSNorm is equivalent to T5LayerNorm
Nr   r   s      r9   rF   BambaRMSNorm.__init__l  s/     	ll5::k#:; #r8   r   c                    UR                   nUR                  [        R                  5      nUR	                  S5      R                  SSS9nU[        R                  " X0R                  -   5      -  nU R                  UR                  U5      -  $ r   )	r[   rb   r2   r   r   r   r   r   r   )rO   r   r   r   s       r9   r~   BambaRMSNorm.forwardt  sw    #))%((7 $$Q',,R,>%H?T?T4T(UU{{]--k:::r8   c                 ^    [        U R                  R                  5       SU R                   3$ )Nz, eps=)r   r   rp   r   )rO   s    r9   
extra_reprBambaRMSNorm.extra_repr{  s*    ))*+6$2G2G1HIIr8   r   r   )r-   r.   r/   r0   rc   rF   r2   r   r~   r  r7   r   r   s   @r9   r  r  j  sB    $ $$ $ $;U\\ ;ell ;J Jr8   r  c                   \  ^  \ rS rSrSS\S\S\4U 4S jjjr     SS\R                  S\R                  S-  S	\R                  S-  S
\S-  S\S-  S\\R                  \R                  4   S-  S\\   S\\R                   \\R                   \R                   4   S-  4   4S jjrSrU =r$ )BambaDecoderLayeri  r?   r   
layer_typec                 `  > [         TU ]  5         SnUS:X  a  [        OS nU" U5      U l        [	        UR
                  UR                  S9U l        [	        UR
                  UR                  S9U l        X0l	        US:X  a  [        XS9U l        g US:X  a  [        X5      U l        g [        S5      e)Nr"   r  mamba)r?   r   	attentionzInvalid layer_type)rE   rF   r  feed_forwardr  r^   r$  input_layernormpre_ff_layernormr  r
  r  r   	self_attn
ValueError)rO   r?   r   r  num_expertsffn_layer_classrR   s         r9   rF   BambaDecoderLayer.__init__  s    &1Q&6(D+F3+F,>,>FDWDWX ,V-?-?VEXEX Y$ #6GDJ;&+F>DN122r8   Nr   r   ry   r   	use_cacher   r   rU   c           
      0   UnU R                  U5      nU R                  S:X  a  U R                  " SUUUS.UD6nS n	O+U R                  S:X  a  U R                  " SUUUUUUS.UD6u  pX-   nUnU R	                  U5      nU R                  U5      nX-   nUW	4$ )Nr  )r   rC  r   r  )r   r   ry   r   r  r   r,   )r  r  r  r  r  r  )
rO   r   r   ry   r   r  r   r   residualself_attn_weightss
             r9   r~   BambaDecoderLayer.forward  s     !,,];??g% JJ +,- 	M !%__+/3~~ 0+-) /#$70 0,M !0 --m<))-8 0///r8   )r  r  r  r  r  r  )r  )NNNFN)r-   r.   r/   r0   r#   r5   rs   rF   r2   r   r3   r
   r   r   r   r%   FloatTensorr~   r7   r   r   s   @r9   r  r    s    3{ 3s 3 3 3( /304(,!&HL(0||(0 t+(0 &&-	(0
 (0 $;(0 #5<<#=>E(0 23(0 
u  %(9(95;L;L(L"MPT"TT	U(0 (0r8   r  c                      ^  \ rS rSr% \\S'   SrSrS/rSr	Sr
SrSr\\S.r\R"                  " 5       U 4S j5       rS	rU =r$ )
BambaPreTrainedModeli  r?   modelTr  r   )r   
attentionsc           
      ~  > [         TU ]  U5        [        U[        5      (       a  [        R
                  " UR                  5        [        R                  " UR                  [        R                  " [        R                  " SUR                  S-   5      5      5        [        R
                  " UR                  5        g g )Nr"   )rE   _init_weightsrq   r
  initones_r1  copy_r3  r2   r2  r`   r  r5  )rO   r   rR   s     r9   r  "BambaPreTrainedModel._init_weights  st    f%fj))JJv~~&JJv||UYYu||Av?O?ORS?S/T%UVJJvxx  *r8   r,   )r-   r.   r/   r0   r#   r4   base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_is_statefulr  r   _can_record_outputsr2   r   r  r7   r   r   s   @r9   r  r    s\    &*#,-"3NL*$
 ]]_! !r8   r  c                     ^  \ rS rSrS\4U 4S jjr\\\      SS\	R                  S-  S\	R                  S-  S\	R                  S-  S\S-  S	\	R                  S-  S
\S-  S\\   S\4S jj5       5       5       rS rSrU =r$ )
BambaModeli  r?   c           	      N  > [         TU ]  U5        UR                  U l        UR                  U l        [
        R                  " UR                  UR                  U R                  5      U l        / n[        UR                  5       H)  nUR                  [        XUR                  U   S95        M+     [
        R                  " U5      U l        UR                   U l        [#        UR                  UR$                  S9U l        [)        US9U l        SU l        U R/                  5         g )N)r   r  r  r?   F)rE   rF   pad_token_idpadding_idx
vocab_sizer   	Embeddingr^   embed_tokensrangenum_hidden_layersappendr  layers_block_type
ModuleListrT  r   r  r$  final_layernormr<   
rotary_embgradient_checkpointing	post_init)rO   r?   decoder_layersirR   s       r9   rF   BambaModel.__init__  s     !.. ++LL):):F<N<NPTP`P`av//0A!!"3FTZTlTlmnTo"pq 1mmN3$*$?$?!+F,>,>FDWDWX.f=&+#r8   N	input_idsr   ry   r   inputs_embedsr  r   rU   c           
      Z   US L US L-  (       a  [        S5      eUc  U R                  U5      nUnU(       a  Uc  [        U R                  S9nUc;  [        R
                  " UR                  S   UR                  S9R                  S5      n[        U R                  UUUUS9n	U R                  X$5      n
U R                  XS9n[        U R                  5       H7  u  pU R                  R                  U   S:X  a  U
OU	nU" U4UUUUUS	.UD6u  pM9     U R                  U5      n[!        UUS
9$ )Nz:You must specify exactly one of input_ids or inputs_embedsr  r"   rl  r   )r?   r  r   r   ry   )ry   r  )r   ry   r   r  r   )last_hidden_stater   )r  r  r   r?   r2   r`   rp   rP   r   r   _update_mamba_maskr  	enumeraterT  r  r  r   )rO   r  r   ry   r   r  r  r   r   causal_mask
mamba_maskr   r  decoder_layer
layer_maskr   s                   r9   r~   BambaModel.forward  sO    -t";<YZZ  --i8M%0*$++>O <<(;(;A(>}G[G[\ffghiL(;;')+%
 ,,^M
"oomoW )$++ 6A'+{{'D'DQ'G7'RXcJ*7+)) /#$7+ +'M< !7 ,,];&++
 	
r8   c                     UnUb  UR                  5       (       d!  Ub   [        R                  " US:H  5      (       a  SnU$ )zV
No need for zeroing states when
    1. Cached forward
    2. Attending to all inputs
Nr"   )rQ  r2   r:  )rO   r   r   r  s       r9   r  BambaModel._update_mamba_mask!  sA     $
'O,N,N,P,P&599^q5H+I+IJr8   )r   r  r  r  rT  r  r  r  )NNNNNN)r-   r.   r/   r0   r#   rF   r   r!   r   r2   r3   r   r
   r  r   r   r%   r   r~   r  r7   r   r   s   @r9   r  r    s    { &   .2.204(,26!%3
##d*3
 t+3
 &&-	3

 3
 ((4/3
 $;3
 233
 
!3
    3
j r8   r  c                   b  ^  \ rS rSrSS0rSS0rSS/S/40rU 4S jr\\	        SS
\
R                  S	-  S\
R                  S	-  S\
R                  S	-  S\S	-  S\
R                  S	-  S\
R                  S	-  S\S	-  S\\
R                  -  S\4S jj5       5       r      SU 4S jjrSrU =r$ )BambaForCausalLMi/  zlm_head.weightzmodel.embed_tokens.weightlm_headcolwise_gather_outputr   logitsc                   > [         TU ]  U5        [        U5      U l        UR                  U l        [
        R                  " UR                  UR                  SS9U l        UR                  U l	        U R                  5         g )NFr   )rE   rF   r  r  r  r   r   r^   r  z_loss_coefficientr  r  s     r9   rF   BambaForCausalLM.__init__5  sc     '
 ++yy!3!3V5F5FUS"(";"; 	r8   Nr  r   ry   r   r  labelsr  logits_to_keeprU   c	           
      D   U R                   " S
UUUUUUS.U	D6n
U
R                  n[        U[        5      (       a  [	        U* S5      OUnU R                  USS2USS24   5      nSnUb  U R                  " S
XU R                  R                  S.U	D6nU R                  S:  aT  UR                  SS9R                  UR                  S9R                  S5      R                  5       nXR                  U-  -   n[        UUU
R                   U
R"                  U
R$                  S	9$ )a  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

Example:

```python
>>> from transformers import AutoTokenizer, BambaForCausalLM

>>> model = BambaForCausalLM.from_pretrained("...")
>>> tokenizer = AutoTokenizer.from_pretrained("...")

>>> prompt = "Hey, are you conscious? Can you talk to me?"
>>> inputs = tokenizer(prompt, return_tensors="pt")

>>> # Generate
>>> generate_ids = model.generate(inputs.input_ids, max_length=30)
>>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
"Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
```)r  r   ry   r   r  r  N)r  r  r  r   ri   rn   rZ   rY   )lossr  r   r   r  r,   )r  r  rq   r5   slicer  loss_functionr?   r  r  	logsumexprb   r[   r   r   r   r   r   r  )rO   r  r   ry   r   r  r  r  r  r   outputsr   slice_indicesr  r  z_losss                   r9   r~   BambaForCausalLM.forward?  s1   H ,0:: ,
)%+',
 ,
  118B>SV8W8W~ot4]kmA}a,?@A%%pVt{{OeOepiopD&&*))b)1444::4FJJ1MRRT55>>%#33!//))
 	
r8   c           
      j   > U R                   R                  US'   [        T
U ]  " U4UUUUUUS.UD6n	U	$ )Nr  )r   r   r  ry   r  is_first_iteration)r?   num_logits_to_keeprE   prepare_inputs_for_generation)rO   r  r   r   r  ry   r  r
  r   model_inputsrR   s             r9   r  .BambaForCausalLM.prepare_inputs_for_generation  sU     $(;;#A#A w<	
+)'%1	
 	
 r8   )r  r  r  r  )NNNNNNNr   )NNNNTF)r-   r.   r/   r0   _tied_weights_keys_tp_plan_pp_planrF   r   r   r2   r3   r   r
   r  r   r5   r   r~   r  r7   r   r   s   @r9   r  r  /  s&   *,GH23H_-z:;H  .2.204(,26*.!%-.=
##d*=
 t+=
 &&-	=

 =
 ((4/=
   4'=
 $;=
 ell*=
 
 =
  =
D   r8   r  )r  r  r  )r   )r"   )Ocollections.abcr   typingr   r   r2   r    r   r  activationsr	   cache_utilsr
   r   
generationr   integrationsr   r   integrations.hub_kernelsr   masking_utilsr   modeling_layersr   modeling_outputsr   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   utils.genericr   r   utils.import_utilsr    utils.output_capturingr!   configuration_bambar#   
get_loggerr-   r<  r%   Moduler<   r   r   r5   r   rc   r   r   r   r   r   r   r  r  r
  r  r  r  r  r  r  __all__r,   r8   r9   <module>r(     sE  4 % &   & ! . ) L 8 / 9 O K F & l l G 9 5 , 
		H	%	 0><299 ><B(	UU\\ 	U# 	U%,, 	U& %II%<<% 
% <<	%
 LL4'% % % '(%4#L )*@)RYY @) +@)F; ;*VU\\ VS V
((	\O \O~ryy   Y'J299 J (J(:02 :0z !? ! !. W% W Wt g+_ g gT Er8   