
    Z j3                        S r SSKrSSKJr  SSKrSSKJr  SSKJrJrJ	r	  SSK
Jr  SSKJr  SS	KJrJr  SS
KJr  SSKJr  SSKJr  SSKJr  SSKJrJrJr  SSKJrJ r   SSK!J"r"  SSK#J$r$J%r%J&r&J'r'  SSK(J)r)  SSK*J+r+  SSK,J-r-  SSK.J/r/  \'R`                  " \15      r2 " S S\Rf                  5      r4S\Rj                  S\6S\Rj                  4S jr7 S<S\Rf                  S\Rj                  S\Rj                  S \Rj                  S!\Rj                  S-  S"\8S#\84S$ jjr9 " S% S&\Rf                  5      r: " S' S(\Rf                  5      r; " S) S*\Rf                  5      r< " S+ S,\Rf                  5      r= " S- S.\5      r> " S/ S0\5      r?\% " S1 S2\ 5      5       r@\% " S3 S4\@5      5       rA " S5 S6\@\5      rB\%" S7S89 " S9 S:\@5      5       rC/ S;QrDg)=zPyTorch Zamba model.    N)Callable)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )initialization)ACT2FN)CacheDynamicCache)GenerationMixin)lazy_load_kernel)create_causal_mask)GradientCheckpointingLayer)BaseModelOutputWithPastCausalLMOutputWithPast SequenceClassifierOutputWithPast)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuplelogging)merge_with_config_defaults)resolve_internal_import)capture_outputs   )ZambaConfigc                   x   ^  \ rS rSrS
S\SS4U 4S jjjrS\R                  S\R                  4S jrS r	S	r
U =r$ )ZambaRMSNorm1   epsreturnNc                    > [         TU ]  5         [        R                  " [        R
                  " U5      5      U l        X l        g)z+
ZambaRMSNorm is equivalent to T5LayerNorm
N)super__init__r   	Parametertorchonesweightvariance_epsilon)selfhidden_sizer#   	__class__s      y/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/zamba/modeling_zamba.pyr'   ZambaRMSNorm.__init__2   s/     	ll5::k#:; #    hidden_statesc                    UR                   nUR                  [        R                  5      nUR	                  S5      R                  SSS9nU[        R                  " X0R                  -   5      -  nU R                  UR                  U5      -  $ )N   T)keepdim)	dtypetor)   float32powmeanrsqrtr,   r+   )r-   r3   input_dtypevariances       r0   forwardZambaRMSNorm.forward:   sw    #))%((7 $$Q',,R,>%H?T?T4T(UU{{]--k:::r2   c                 ^    [        U R                  R                  5       SU R                   3$ )Nz, eps=)tupler+   shaper,   )r-   s    r0   
extra_reprZambaRMSNorm.extra_reprA   s*    ))*+6$2G2G1HIIr2   )r,   r+   )gư>)__name__
__module____qualname____firstlineno__floatr'   r)   Tensorr@   rE   __static_attributes____classcell__r/   s   @r0   r!   r!   1   sB    $ $$ $ $;U\\ ;ell ;J Jr2   r!   r3   n_repr$   c                     U R                   u  p#pEUS:X  a  U $ U SS2SS2SSS2SS24   R                  X#XU5      n U R                  X#U-  XE5      $ )z
This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
r   N)rD   expandreshape)r3   rP   batchnum_key_value_headsslenhead_dims         r0   	repeat_kvrX   F   s_    
 2?1D1D.Ez!!Qa"23::5W\dlmM  e(CTTTr2   modulequerykeyvalueattention_maskscalingdropoutc                    [        X R                  5      n[        X0R                  5      n	[        R                  " XR	                  SS5      5      U-  n
Ub  X-   n
[
        R                  R                  U
S[        R                  S9R                  UR                  5      n
[
        R                  R                  XU R                  S9n
[        R                  " X5      nUR	                  SS5      R                  5       nX4$ )Nr5   r   r6   )dimr8   )ptrainingr   )rX   num_key_value_groupsr)   matmul	transposer   
functionalsoftmaxr:   r9   r8   r_   rc   
contiguous)rY   rZ   r[   r\   r]   r^   r_   kwargs
key_statesvalue_statesattn_weightsattn_outputs               r0   eager_attention_forwardro   R   s     3 ; ;<JU$?$?@L<<';';Aq'ABWLL!#4==((2U]](SVVW\WbWbcL==((6??([L,,|:K''1-88:K$$r2   c                      ^  \ rS rSrSrS\S\4U 4S jjr SS\R                  S\S\R                  S-  S	\
S-  S
\\   S\\R                  \R                  S-  \\R                     S-  4   4S jjrSrU =r$ )ZambaAttentionk   a  
Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer
and "Generating Long Sequences with Sparse Transformers".

Adapted from transformers.models.mistral.modeling_mistral.MistralAttention:
The input dimension here is attention_hidden_size = 2 * hidden_size, and head_dim = attention_hidden_size // num_heads.
The extra factor of 2 comes from the input being the concatenation of original_hidden_states with the output of the previous (mamba) layer
(see fig. 2 in https://huggingface.co/papers/2405.16712).
Additionally, replaced
attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) with
attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim/2)
config	layer_idxc                   > [         TU ]  5         Xl        X l        UR                  U l        UR
                  U l        UR                  UR                  -  U l	        UR                  U l
        U R                  S-  S-  U l        SU l        UR                  U l        [        R                  " UR                  UR                  U R                  -  SS9U l        [        R                  " UR                  UR                  U R                  -  SS9U l        [        R                  " UR                  UR                  U R                  -  SS9U l        [        R                  " UR                  U R                  -  UR&                  SS9U l        g )Nr5         TFbias)r&   r'   rs   rt   attention_hidden_sizeattention_head_dimrW   num_attention_headsrU   rd   max_position_embeddingsr^   	is_causalattention_dropoutr   Linearq_projk_projv_projr.   o_projr-   rs   rt   r/   s      r0   r'   ZambaAttention.__init__y   s5   "%+%A%A"11$*$>$>&B\B\$\!'-'E'E$)d2!'!9!9ii < <f>X>X[_[h[h>hotuii < <f>X>X[_[h[h>hotuii < <f>X>X[_[h[h>hotuii : :T]] JFL^L^ejkr2   Nr3   r]   past_key_valuesrj   r$   c                    UR                   S S n/ UQSPU R                  P7nU R                  U5      R                  U5      R	                  SS5      nU R                  U5      R                  U5      R	                  SS5      n	U R                  U5      R                  U5      R	                  SS5      n
Ub  UR                  XU5      u  p[        R                  " U R                  R                  [        5      nU" U UU	U
U4U R                  (       d  SOU R                  U R                  S.UD6u  pUR                   " / UQSP76 R#                  5       nU R%                  U5      nX4$ )Nr6   r   r5           )r_   r^   )rD   rW   r   viewrf   r   r   updater   get_interfacers   _attn_implementationro   rc   r~   r^   rS   ri   r   )r-   r3   rt   r]   r   rj   input_shapehidden_shapequery_statesrk   rl   attention_interfacern   rm   s                 r0   r@   ZambaAttention.forward   sa    $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST&'6'='=jXa'b$J(?(M(MKK,,.E)
 %8	%
  $}}C$2H2HLL	%
 	%
! "));;;;FFHkk+.((r2   )r~   ry   rs   rW   r}   r   rt   r|   rd   r   r   r^   r   N)rG   rH   rI   rJ   __doc__r   intr'   r)   rL   r   r   r   rC   r@   rM   rN   rO   s   @r0   rq   rq   k   s    l{ ls l. )-#)||#) #) t+	#)
 #) +,#) 
u||U\\D0%2E2LL	M#) #)r2   rq   c                      ^  \ rS rSrSrS\4U 4S jjr SS\R                  S\	S-  4S jjr
SS\	S-  4S	 jjrSS\	S-  4S
 jjrSrU =r$ )ZambaMambaMixer   u!  
Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`.
A, D are input independent (see Mamba paper [1] Section 3.5.2 "Interpretation of A" for why A isn't selective)
∆, B, C are input-dependent (this is a key difference between Mamba and the linear time invariant S4,
and is why Mamba is called **selective** state spaces)

This module differs from `transformers.models.mamba.modeling_mamba.MambaMixer` in two ways:
- Added multi-head: the output of `self.in_proj` is split into `self.n_mamba_heads` heads, and each head
undergoes an independent forward pass, identical to the original `MambaMixer`, up until the pre-activations of
`self.out_proj`. The pre-activations, coming from different mamba heads, are then concatenated and fed into `self.out_proj`.
rs   c           	      	  > [         TU ]  5         Xl        X l        UR                  U l        UR
                  U l        UR                  U l        UR                  UR                  -  U l
        UR                  U l        UR                  U l        U R                  U R                  -  U l        UR                  U l        UR"                  U l        [&        R(                  " U R                  U R                  U R                   U R                  U R                  U R                  S-
  S9U l        UR,                  U l        [0        UR,                     U l        UR4                  U l        [&        R8                  " U R                  U R                  S-  U R$                  S9U l        [&        R<                  " [>        R@                  " U R                  U R                  U R                  S-  -   U R                  5      5      U l!        [&        R<                  " [>        R@                  " U R                  U R                  U R                  5      S-
  S-  U R                  S-  -  5      U l"        [&        R<                  " [>        R@                  " U R                  U R                  5      5      U l#        [>        RH                  " SU R                  S-   [>        RJ                  S9S S S 24   nURM                  U R                  S5      RO                  5       n[&        R<                  " [>        RP                  " U5      RS                  U R                  U R                  S5      5      U l*        [&        R<                  " [>        RV                  " U R                  U R                  5      5      U l,        [&        R8                  " U R                  U R                  U R$                  S9U l-        []        S5      q/[a        [^        S	S 5      q1[a        [^        S
S 5      q2[]        S5      q3[i        [f        SS9q5[a        [f        SS 5      q6[a        [f        SS 5      q7[q        [j        [l        [d        [b        [n        45      q9[r        (       d  [t        Rw                  S5        g g )Nr   )in_channelsout_channelsrx   kernel_sizegroupspaddingr5   rw   g      ?r8   r6   zcausal-conv1dcausal_conv1d_updatecausal_conv1d_fnz	mamba-ssmz8ops.triton.selective_state_update.selective_state_update)chained_pathselective_scan_fnmamba_inner_fnaq  The fast path is not available because one of `(selective_state_update, selective_scan_fn, causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn)` is None. To install follow https://github.com/state-spaces/mamba/#installation and https://github.com/Dao-AILab/causal-conv1d. If you want to use the naive implementation, set `use_mamba_kernels=False` in the model config)<r&   r'   rs   rt   r.   mamba_d_statessm_state_sizemamba_d_convconv_kernel_sizemamba_expandintermediate_sizemamba_dt_ranktime_step_rankn_mamba_headsmamba_head_dimmamba_conv_biasuse_conv_biasmamba_proj_biasuse_biasr   Conv1dconv1dhidden_mamba_act
activationr
   actuse_mamba_kernelsuse_fast_kernelsr   in_projr(   r)   zerosx_proj_weightdt_proj_weightdt_proj_biasaranger:   rR   ri   logrS   A_logr*   Dout_projr   causal_conv1dgetattrr   r   	mamba_ssmr   selective_state_updater   r   allis_fast_path_availableloggerwarning_once)r-   rs   rt   Ar/   s       r0   r'   ZambaMambaMixer.__init__   s   "!--$22 & 3 3!'!4!4v7I7I!I$22#11"448J8JJ#33..ii..//##--))))A-
 !11&112 & 8 8 yy!1!143I3IA3MTXTaTab  \\KK""##d&9&9A&==##
 !ll[[++T-@-@$BUBUVY\\!!3&'

 LLT5G5GI\I\)]^ LLD//!35==I$PQ'RHHT++R0;;=\\%))A,"6"6t7I7I4K^K^`b"cd
ejj););T=P=PQR		$"8"8$:J:JQUQ^Q^_ )9&}6LdS"=2DdK %[1	!8$^"
 $I/BDI ,<dC "%#%68HJ^`no"
 &%^ &r2   Nr3   cache_paramsc                    UR                   u  pEnUS L=(       a    UR                  =(       a    US:H  nU R                  U5      R                  SS5      nUR	                  USSU5      R                  SSS9u  pUR                  S5      R                  5       nU	R                  S5      n	U	R                  X@R                  SU5      R                  SS5      n	U R                  R                  R	                  U R                  R                  R                  S5      U R                  R                  R                  S5      5      n
U(       ao  [        UR                  S5      UR                  U R                     R                   U
U R                  R"                  U R$                  5      nUR'                  S5      nOUb1  [(        R*                  " US:H  5      (       d  XR'                  S5      -  nUbV  [,        R.                  R1                  XR2                  UR                   S   -
  S45      nUR5                  XR                  5      n[7        XU R                  R"                  U R$                  S9nUb1  [(        R*                  " US:H  5      (       d  XR'                  S5      -  nUR                  SU R                  U R8                  U5      R                  SS5      nU R:                  S S 2S S S 2S S 24   U-  R                  SS5      n[(        R<                  " XR>                  U R@                  U R@                  /SS9u  pnU RB                  S S 2S 4   UR                  SS5      -  n[(        RD                  " U RF                  RI                  5       5      * nU RJ                  b  U RJ                  RI                  5       OS n[(        RL                  " USU4URN                  URP                  S9nU(       a  [S        U R                  5       H  n[U        UR                  U R                     RV                  S S 2U4   UUS	S4   UUS	S4   UU   UUS S 2S4   UUS S 2S4   U RX                  U   U	US	S4   UU   S
S9
R'                  S5      n[(        RZ                  " UU4SS9nM     GO0[(        RL                  " USU R8                  U R@                  4URN                  URP                  S9n[S        U R                  5       H  n[]        UU   UU   UU   UU   R                  SS5      UU   R                  SS5      U RX                  U   RI                  5       U	U   UU   S
S
S9
u  nn[(        RZ                  " UU4SS9R                  5       n[(        RZ                  " UUR'                  S5      4SS9nM     Ub  Ub  UR_                  UU R                  5        U Ra                  UR                  SS5      5      nU$ )Nr   r5   r6   ra   r   )r   devicer8   .T)dt_softplus)delta_softplusreturn_last_state)1rD   has_previous_stater   rf   r   chunksqueezeri   rS   r   r   r+   sizer   layersrt   conv_statesrx   r   	unsqueezer)   r   r   rg   padr   update_conv_stater   r   r   splitr   r   r   expr   rK   r   emptyr   r8   ranger   recurrent_statesr   catr   update_recurrent_stater   )r-   r3   r   r]   
batch_sizeseq_len_use_precomputed_statesprojected_statesgateconv_weightsr   ssm_parameters	time_stepBCdiscrete_time_stepr   time_proj_biasscan_outputsnscan_outputs_	ssm_state
ssm_state_contextualized_statess                            r0   cuda_kernels_forward$ZambaMambaMixer.cuda_kernels_forward  sh    "/!4!4
Q!-T!9!nl>]>]!nbimnbn  <<6@@AF.33JAwOUUVW]^U_%--a0;;=||A||J(:(:BHRRSTVWX {{))..t{{/A/A/F/Fq/I4;;K]K]KbKbcdKef!0%%b)##DNN3??  M *33B7M)%))Na<O2P2P -0H0H0K K' mm//@U@UXeXkXklnXo@oqr?st*<<[..Y,]$++JZJZgkgvgvwM)%))Na<O2P2P -0H0H0K K
 &--b$2D2DdFYFY[bcmmnoqrs,,Qa];mKVVWY[]^++00$2E2EtGZGZ[ac
	a "00D9I<O<OPRTV<WWYYtzz'')** 7;6G6G6S**002Y]{{J7#;MDXDX`m`s`st!4--. 6 ''7HHAN!!S!),&q#qy1aDaAgJaAgJFF1ICO"1% $! )B-   %yy,)FAN /  Q 3 3T5H5HI$++#))I
 4--.,=!!$&q)aDaDNN1a(aDNN1a(FF1IOO%G"1%#'&*-)z  %yy,)FANYY[!IIy*2F2Fq2I&JPQR	 / $)A33It~~N !%l.D.DQ.J K$$r2   c           
         UR                   u  pEnUR                  nU R                  U5      R                  SS5      nUR	                  USSU5      R                  SSS9u  pU	R                  S5      R                  5       n	U
R                  S5      n
U
R                  X@R                  SU5      R                  SS5      n
UbR  UR                  U R                  5      (       a2  UR                  U R                     R                  R                  5       nOA[        R                   " X@R                  U R"                  U R$                  4U	R&                  US9nUGb  UR                  U R                  5      (       a  US:X  a  UR)                  XR                  5      n[        R*                  " XR,                  R.                  S S 2SS S 24   -  SS9n	U R0                  (       a  XR,                  R2                  -  n	U R5                  U	5      R7                  U5      R9                  S5      n	GO+Ub*  XS S 2U	R                   S   * S 24   R9                  S5      -  n	[:        R<                  R?                  XR@                  U	R                   S   -
  S45      nUR)                  XR                  5      nU R5                  U R-                  U	5      SS U24   5      n	Ub*  XS S 2U	R                   S   * S 24   R9                  S5      -  n	OSUb  XR9                  S5      -  n	U R5                  U R-                  U	5      SS U24   5      n	Ub  XR9                  S5      -  n	U	R                  SU R                  U R"                  U5      R                  SS5      n	U RB                  S S 2S S S 2S S 24   U	-  R                  SS5      n[        RD                  " XRF                  U R$                  U R$                  /SS9u  pnU RH                  S S 2S 4   UR                  SS5      -  U RJ                  S S 2S S S 2S 4   -   n[:        R<                  RM                  U5      n[        RN                  " U RP                  RS                  5       5      * n[        RN                  " US S 2S S S 2S S S 24   US S 2S S 2S S 2S S 2S 4   -  5      nUS S 2S S 2S S 2S S 2S 4   US S 2S S 2S S S 2S S 24   RS                  5       -  nUU	S S 2S S 2S S 2S S 2S 4   RS                  5       -  n/ n[U        U5       H  nUS S 2S S 2S S 2US S 24   R                  SS5      U-  US S 2S S 2S S 2US S 24   R                  SS5      -   n[        RV                  " UR                  SS5      R7                  U5      US S 2S S 2US S 24   R9                  S5      5      nURY                  US S 2S S 2S S 2S4   5        M     [        RZ                  " USS9nUXR\                  S S 2S S S 2S 4   -  -   nUU R5                  U
5      -  nUb  UR_                  XR                  5        U Ra                  UR                  SS5      R                  USU5      R                  SS5      5      nU$ )	Nr   r5   r6   r   r   r   .r   )1rD   r8   r   rf   r   r   r   ri   rS   r   r   rt   r   r   cloner)   r   r   r   r   r   sumr   r+   r   rx   r   r9   r   r   rg   r   r   r   r   r   r   r   softplusr   r   rK   r   re   appendstackr   r   r   )r-   input_statesr   r]   r   r   r   r8   r   r3   r   r   
conv_stater   r   r   r   r   r   
discrete_A
discrete_BdeltaB_ur   iscan_outputr   s                             r0   slow_forwardZambaMambaMixer.slow_forwardn  s   !-!3!3
Q""<<5??1E.33JAwOUUVW]^U_%--a0;;=||A||J(:(:BHRRSTVWX#(G(G(W(W$++DNN;LLRRTI//1D1DdFYFYZ$++I #..t~~>>7a<);;M>>Z
 %		*{{7I7I!QPQ'7R*RXZ [%%![[%5%55M $ 7 : :5 A K KB O!-$11}GZGZ[]G^F^F`C`4a4k4klm4n$nM]]..}?T?TWdWjWjkmWn?npq>rs
);;JW
 $])CC'M)R S!-$11}GZGZ[]G^F^F`C`4a4k4klm4n$nM) -0H0H0K K HHT[[%?XgX%NOM) -0H0H0K K &--b$2D2DdFYFY[bcmmnoqrs,,Qa];mKVVWY[]^++00$2E2EtGZGZ[ac
	a #11!T':Y=P=PQSUW=XX\`\m\mtQ]
 
  ]]334FG YYtzz'')**YYqD!T1!458J1aQRTUW[K[8\\]
'1aD(89AaD!Q>N<O<U<U<WW
aAq$.> ? E E GGwA"1aAq=1;;AqAIMPXYZ\]_`bcefYfPgPqPqrsuvPwwI,,y':':1a'@'C'CE'JAaQRTUWXjMLcLcdfLghKAq!QJ 78   kk,B7!]VVAtQ<L5M%MN!DHHTN2#//	>>J !%!!!Q'//
BHRRSTVWX!
 %$r2   c                    [        [        [        [        [        [
        45      nU R                  (       aF  U(       a$  SU R                  R                  R                  ;  a  [        S5      eU R                  XUS9$ U R                  XUS9$ )NcudazFast Mamba kernels are not available. Make sure to they are installed and that the mamba module is on a CUDA device. lease run 'pip install causal-conv1d>=1.2.0' and 'pip install mamba-ssm', or set use_mamba_kernels=False in the model's config.)r]   )r   r   r   r   r   r   r   r   r   type
ValueErrorr   r  )r-   r3   r   r]   rj   r   s         r0   r@   ZambaMambaMixer.forward  s    !$#%68HJ^`no"
   )V4;M;M;T;T;Y;Y-Y i 
 ,,]Yg,hh  ^ \\r2   )r   r   r   r   rs   r   r   r   r   r.   r   r   rt   r   r   r   r   r   r   r   r   r   )NN)rG   rH   rI   rJ   r   r   r'   r)   rL   r   r   r  r@   rM   rN   rO   s   @r0   r   r      sj    
M{ M` ^b_%"\\_%9>_%BP%ut| P%d]54< ] ]r2   r   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )ZambaMLPi  c                   > [         TU ]  5         Xl        UR                  U l        UR                  U l        [
        R                  " U R                  U R                  SS9U l        [
        R                  " U R                  U R                  SS9U l        [
        R                  " U R                  U R                  SS9U l	        [        UR                     U l        g NFrw   )r&   r'   rs   r.   r   r   r   	gate_projup_proj	down_projr
   
hidden_actact_fnr-   rs   r/   s     r0   r'   ZambaMLP.__init__  s    !--!'!9!94#3#3T5K5KRWXyy!1!143I3IPUV4#9#94;K;KRWXV../r2   c                     U R                  U R                  U R                  U5      5      U R                  U5      -  5      nU$ r   )r  r  r  r  )r-   xr  s      r0   r@   ZambaMLP.forward  s6    NN4;;t~~a/@#ADLLQRO#ST	r2   )r  rs   r  r  r.   r   r  )rG   rH   rI   rJ   r'   r@   rM   rN   rO   s   @r0   r  r    s    0 r2   r  c                     ^  \ rS rSrSS\S\S-  4U 4S jjjr   SS\R                  S\R                  S\S\R                  S-  S	\	S-  S
\
S-  S\\   S\\R                  \\R                  \R                  4   S-  4   4S jjrSrU =r$ )ZambaAttentionDecoderLayeri  Nrs   rt   c                    > [         TU ]  5         [        X5      U l        [	        U5      U l        [        UR                  UR                  S9U l	        [        UR                  UR                  S9U l        g )Nr#   )r&   r'   rq   	self_attnr  feed_forwardr!   ry   rms_norm_epsinput_layernormr.   pre_ff_layernormr   s      r0   r'   #ZambaAttentionDecoderLayer.__init__  s]    ':$V,+F,H,HfNaNab ,V-?-?VEXEX Yr2   r3   original_hidden_statesr]   r   	use_cacherj   r$   c           	          [         R                  " X/SS9nU R                  U5      nU R                  " SUUUUUS.UD6u  pU R	                  U5      nU R                  U5      nU$ )a  
Args:
    hidden_states (`torch.FloatTensor`): output of previous Mamba layer of shape `(batch, seq_len, embed_dim)`
    original_hidden_states (`torch.FloatTensor`): word embedding output of shape `(batch, seq_len, embed_dim)`.
        This is concatenated with `hidden_states` (which is the output of the previous (mamba) layer). The
        concatenated tensor is then used as input of the pre-attention RMSNorm
        (see fig. 2 in https://huggingface.co/papers/2405.16712).
    layer_idx (`int`): layer_idx in the forward pass. Used to distinguish Zamba's tied transformer layers.
    attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
        `(batch, sequence_length)` where padding elements are indicated by 0.
    past_key_values (`Cache`, *optional*): cached past key and value projection states
    use_cache (`bool`, *optional*):
        If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
        (see `past_key_values`).
r6   r   )r3   rt   r]   r   r(   )r)   concatenater$  r!  r%  r"  )	r-   r3   r'  rt   r]   r   r(  rj   r   s	            r0   r@   "ZambaAttentionDecoderLayer.forward  s    2 ))=*QWYZ,,];>> 
')+
 
 --m<))-8r2   )r"  r$  r%  r!  r   )NNF)rG   rH   rI   rJ   r   r   r'   r)   rL   r   boolr   r   rC   FloatTensorr@   rM   rN   rO   s   @r0   r  r    s    Z{ ZsTz Z Z /3(,!&'||' !&' 	'
 t+' ' $;' +,' 
u  %(9(95;L;L(L"MPT"TT	U' 'r2   r  c                     ^  \ rS rSrS\S\4U 4S jjr        SS\R                  S\R                  S-  S\S-  S\R                  S-  S	\R                  S-  S
\	S-  S\
S-  S\R                  S-  S\R                  S-  S\\   S\\R                  \\R                  \R                  4   S-  4   4S jjrSrU =r$ )ZambaMambaDecoderLayeri  rs   rt   c                    > [         TU ]  5         [        XS9U l        [	        UR
                  UR                  S9U l        X l        g )N)rs   rt   r   )	r&   r'   r   mambar!   r.   r#  r$  rt   r   s      r0   r'   ZambaMambaDecoderLayer.__init__  s:    $FH
+F,>,>FDWDWX"r2   Nr3   r'  r]   causal_maskr   r(  position_idstransformer_hidden_statesrj   r$   c
                 r    UnU	b  X-   OUnU R                  U5      nU R                  " SUUUS.U
D6nX-   nU$ )a  
Args:
    hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
    attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
        `(batch, sequence_length)` where padding elements are indicated by 0.
    past_key_values (`Cache`, *optional*): cached past key and value projection states
    use_cache (`bool`, *optional*):
        If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
        (see `past_key_values`).
)r3   r   r]   r*  )r$  r2  )r-   r3   r'  rt   r]   r4  r   r(  r5  r6  rj   residuals               r0   r@   ZambaMambaDecoderLayer.forward  sg    0 !
 :S9^M5dq 	 ,,];

 
'()
 	
 !0r2   )r$  rt   r2  )NNNNNFNN)rG   rH   rI   rJ   r   r   r'   r)   rL   r   r-  
LongTensorr   r   rC   r.  r@   rM   rN   rO   s   @r0   r0  r0    s   #{ #s # 7; $.2+/(,!&049=*||* !&t 3* :	*
 t+* \\D(* * $;* &&-* $)<<$#6* +,* 
u  %(9(95;L;L(L"MPT"TT	U* *r2   r0  c                   Z  ^  \ rS rSrS\S\R                  S\4U 4S jjr      SS\	R                  S\	R                  S-  S	\S-  S
\	R                  S-  S\	R                  S-  S\S-  S\S-  S\\   S\\	R"                  \\	R"                  \	R"                  4   S-  4   4S jjrSrU =r$ )ZambaHybridLayeriH  shared_transflinearr2  c                 F   > [         TU ]  5         Xl        X l        X0l        g r   )r&   r'   r=  r>  mamba_decoder)r-   r=  r>  r2  r/   s       r0   r'   ZambaHybridLayer.__init__I  s    *"r2   Nr3   r'  rt   r]   r4  r   r(  rj   r$   c           	          U R                   " U4UUUUUS.UD6n	U R                  U	5      n	U R                  " U4U	UUUS.UD6nU$ )a  
Args:
    hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
    original_hidden_states (`torch.FloatTensor`): word embedding output that will be concatenated with
    hidden activations to form the input of the shared transformer layer.
    layer_idx (`int`): layer number.
    attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
        `(batch, sequence_length)` where padding elements are indicated by 0.
    past_key_values (`Cache`, *optional*): cached past key and value projection states
    use_cache (`bool`, *optional*):
        If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
        (see `past_key_values`).
)r'  rt   r]   r   r(  )r6  r]   r   r(  )r=  r>  r@  )
r-   r3   r'  rt   r]   r4  r   r(  rj   r6  s
             r0   r@   ZambaHybridLayer.forwardO  s~    2 %)$6$6%
#9&+%
 %
! %)KK0I$J!**
&?)+
 
 r2   )r>  r@  r=  )NNNNNF)rG   rH   rI   rJ   r  r   r   r0  r'   r)   rL   r   r   r-  r   r   rC   r.  r@   rM   rN   rO   s   @r0   r<  r<  H  s    #&@ #")) #\r # 7; $.2+/(,!&-||- !&t 3- :	-
 t+- \\D(- - $;- +,- 
u  %(9(95;L;L(L"MPT"TT	U- -r2   r<  c                      ^  \ rS rSr% \\S'   SrSrSS/rSr	Sr
SrSr\\S	.r\R"                  " 5       U 4S
 j5       rSrU =r$ )ZambaPreTrainedModeli  rs   modelTr<  r0  r   F)r3   
attentionsc                   > U R                   R                  n[        TU ]  U5        [	        U[
        5      (       Ga  [        R                  " UR                  SUS9  U R                   R                  S-  n[        R                  " UR                  U* U5        U R                   R                  U R                   R                  -  U R                   R                  -  n[        R                   " [        R"                  " U R                   R                  U5      [$        R&                  " U R                   R(                  5      [$        R&                  " U R                   R*                  5      -
  -  [$        R&                  " U R                   R*                  5      -   5      R-                  U R                   R.                  S9nU[        R&                  " [        R0                  " U* 5      * 5      -   n[        R2                  " UR4                  U5        [        R6                  " SUR8                  S-   [        R:                  S9S S S 24   nUR=                  UR>                  S5      RA                  5       n[        R2                  " URB                  [        R&                  " U5      RE                  UR                  URF                  S5      5        [        RH                  " URJ                  5        g g )Nr   )r<   stdrv   )minr   r   r6   )&rs   initializer_ranger&   _init_weights
isinstancer   initnormal_r   r   uniform_r   r   r.   r   r)   r   randmathr   time_step_maxtime_step_minclamptime_step_floorexpm1copy_r   r   r   r:   rR   r   ri   r   rS   r   ones_r   )	r-   rY   rI  dt_init_stdr   dtinv_dtr   r/   s	           r0   rL  "ZambaPreTrainedModel._init_weights  s   kk++f%fo..LL--CSA++33T9KMM&//+{K![[558O8OOSWS^S^SlSllN

4;;44nE88DKK556$++B[B[9\\^((4;;4456 e33e4	  %))U[["%5$566FJJv**F3Q 5 5 9OPTVWPWXA1126AACAJJv||UYYq\%9%9&:N:NPVPePegi%jkJJvxx % /r2   r*  )rG   rH   rI   rJ   r   __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_is_statefulr0  rq   _can_record_outputsr)   no_gradrL  rM   rN   rO   s   @r0   rE  rE    s_    &*#+-EF"3 NL/$
 ]]_! !r2   rE  c                     ^  \ rS rSrSrS\4U 4S jjr\\\	      SS\
R                  S-  S\
R                  S-  S\
R                  S-  S	\S-  S
\
R                  S-  S\S-  S\\   S\\-  4S jj5       5       5       rSrU =r$ )
ZambaModeli  z
Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`ZambaDecoderLayer`]

Args:
    config: ZambaConfig
rs   c                 @  > [         TU ]  U5        UR                  U l        UR                  U l        [
        R                  " UR                  UR                  U R                  5      U l        UR                  U l	        / nS U l
        [        U R                  5       H  u  p4[        XS9nUS:X  a  [
        R                  " U R                  R                  U R                  R                  SS9nUR                  [!        [#        U5      Xe5      5        U R                  c  SU S3SU S30U l
        M  M  UR                  U5        M     [
        R$                  " U5      U l        [)        UR                  UR*                  S	9U l        SU l        U R1                  5         g )
N)rt   hybridFrw   z
layers.(?!z\.)\d+.shared_transfzlayers.z.shared_transfr   )r&   r'   pad_token_idpadding_idx
vocab_sizer   	Embeddingr.   embed_tokenslayers_block_type_tied_weights_keys	enumerater0  r   rs   r   r<  r  
ModuleListr   r!   r#  final_layernormgradient_checkpointing	post_init)r-   rs   r   layer_id
layer_typer2  r>  r/   s          r0   r'   ZambaModel.__init__  s]    !.. ++LL):):F<N<NPTP`P`a!'!9!9"&$-d.D.D$E H*6FEX%4;;#:#:DKK<S<SZ_`./I&/QSYab**2%hZ/CDPXzYgFh/D+ 3
 e$ %F mmF++F,>,>FDWDWX&+#r2   N	input_idsr]   r5  r   inputs_embedsr(  rj   r$   c                 L   US L US L-  (       a  [        S5      eUc  U R                  U5      nUn[        R                  " U5      n	U(       a  Uc  [	        U R
                  S9nUcU  Ub  UR                  5       OSn
[        R                  " UR                  S   UR                  S9U
-   nUR                  S5      n[        U R
                  UUUUS9n[        U R                  5       H  u  pU" UU	UUU4UUS.UD6nM     U R                  U5      n[        UU(       a  US9$ S S9$ )	NzaYou cannot specify both input_ids and inputs_embeds at the same time, and must specify either one)rs   r   r   r   )rs   r|  r]   r   r5  )r   r(  )last_hidden_stater   )r  rp  r)   r   r   rs   get_seq_lengthr   rD   r   r   r   rs  r   ru  r   )r-   r{  r]   r5  r   r|  r(  rj   r3   r'  past_seen_tokensr4  rt   layers                 r0   r@   ZambaModel.forward  sW    -t";<s    --i8M%!&]!; 0*$++>OCRC^==?de <<(;(;A(>}G[G[\_ooL'11!4L(;;')+%
 !*$++ 6I!&	 !0#	 	M !7 ,,];&+/8O
 	
>B
 	
r2   )rr  rp  ru  rv  r   rq  rm  rn  )NNNNNN)rG   rH   rI   rJ   r   r   r'   r   r   r   r)   r:  rL   r   r.  r-  r   r   rC   r   r@   rM   rN   rO   s   @r0   ri  ri    s    { 8   .2.204(,26!%8
##d*8
 t+8
 &&-	8

 8
 ((4/8
 $;8
 +,8
 
(	(8
    8
r2   ri  c                   b  ^  \ rS rSrSS0rS\4U 4S jjr\\        SS\	R                  S-  S\	R                  S-  S	\	R                  S-  S
\S-  S\	R                  S-  S\	R                  S-  S\S-  S\\	R                  -  S\\   S\\-  4S jj5       5       r      SU 4S jjrSrU =r$ )ZambaForCausalLMi  zlm_head.weightzmodel.embed_tokens.weightrs   c                    > [         TU ]  U5        [        U5      U l        UR                  U l        [
        R                  " UR                  UR                  SS9U l        U R                  5         g r  )
r&   r'   ri  rF  rn  r   r   r.   lm_headrw  r  s     r0   r'   ZambaForCausalLM.__init__  sU     '
 ++yy!3!3V5F5FUS 	r2   Nr{  r]   r5  r   r|  labelsr(  logits_to_keeprj   r$   c	           
      h   U R                   " SUUUUUUS.U	D6n
U
R                  n[        U[        5      (       a  [	        U* S5      OUnU R                  USS2USS24   5      nSnUb  U R                  " UUU R                  40 U	D6n[        UUU
R                  U
R                  U
R                  S9$ )a  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

Example:

```python
>>> from transformers import AutoTokenizer, ZambaForCausalLM

>>> model = ZambaForCausalLM.from_pretrained("Zyphra/Zamba-7B-v1")
>>> tokenizer = AutoTokenizer.from_pretrained("Zyphra/Zamba-7B-v1")

>>> prompt = "Hey, are you conscious? Can you talk to me?"
>>> inputs = tokenizer(prompt, return_tensors="pt")

>>> # Generate
>>> generate_ids = model.generate(inputs.input_ids, max_length=30)
>>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
"Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
```)r{  r]   r5  r   r|  r(  Nlosslogitsr   r3   rG  r*  )rF  r  rM  r   slicer  loss_functionrn  r   r   r3   rG  )r-   r{  r]   r5  r   r|  r  r(  r  rj   outputsr3   slice_indicesr  r  s                  r0   r@   ZambaForCausalLM.forward  s    H ,0:: ,
)%+',
 ,
  118B>SV8W8W~ot4]kmA}a,?@A%% 	D &#33!//))
 	
r2   c           
      j   > U R                   R                  US'   [        T
U ]  " U4UUUUUUS.UD6n	U	$ )Nr  )r   r]   r|  r5  r(  is_first_iteration)rs   num_logits_to_keepr&   prepare_inputs_for_generation)r-   r{  r   r]   r|  r5  r(  r  rj   model_inputsr/   s             r0   r  .ZambaForCausalLM.prepare_inputs_for_generation[  sU     $(;;#A#A w<	
+)'%1	
 	
 r2   )r  rF  rn  )NNNNNNNr   )NNNNTF)rG   rH   rI   rJ   rr  r   r'   r   r   r)   r:  rL   r   r.  r-  r   r   r   rC   r   r@   r  rM   rN   rO   s   @r0   r  r    s0   *,GH{   .2.204(,26*.!%-.@
##d*@
 t+@
 &&-	@

 @
 ((4/@
   4'@
 $;@
 ell*@
 +,@
 
'	'@
  @
J   r2   r  a  
    The Zamba Model with a sequence classification head on top (linear layer).

    [`ZambaForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-2) do.

    Since it does classification on the last token, it requires to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
    )custom_introc                     ^  \ rS rSrU 4S jr\\       SS\R                  S-  S\R                  S-  S\R                  S-  S\
S-  S\R                  S-  S	\R                  S-  S
\S-  S\\   S\\-  4S jj5       5       rSrU =r$ )ZambaForSequenceClassificationiu  c                    > [         TU ]  U5        UR                  U l        [        U5      U l        [
        R                  " UR                  U R                  SS9U l        U R                  5         g r  )
r&   r'   
num_labelsri  rF  r   r   r.   scorerw  r  s     r0   r'   'ZambaForSequenceClassification.__init__  sS      ++'
YYv114??O
 	r2   Nr{  r]   r5  r   r|  r  r(  rj   r$   c           	         U R                   " U4UUUUUS.UD6n	U	R                  n
U R                  U
5      nUb  UR                  S   nOUR                  S   nU R                  R
                  c  US:w  a  [        S5      eU R                  R
                  c  SnOUb  XR                  R
                  :g  R                  UR                  [        R                  5      n[        R                  " UR                  S   UR                  [        R                  S9nX-  R                  S5      nO.Sn[        R                  U R                  R                    S35        U[        R                  " XR                  S	9U4   nSnUGb  UR                  UR                  5      nU R                  R"                  c  U R$                  S:X  a  S
U R                  l        OoU R$                  S:  aN  UR&                  [        R(                  :X  d  UR&                  [        R*                  :X  a  SU R                  l        OSU R                  l        U R                  R"                  S
:X  aJ  [-        5       nU R$                  S:X  a&  U" UR/                  5       UR/                  5       5      nOU" UU5      nOU R                  R"                  S:X  a=  [1        5       nU" UR3                  SU R$                  5      UR3                  S5      5      nO-U R                  R"                  S:X  a  [5        5       nU" UU5      n[7        UUU	R8                  U	R:                  U	R<                  S9$ )ae  
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
)r]   r5  r   r|  r(  Nr   r   z=Cannot handle batch sizes > 1 if no padding token is defined.r6   r   z will not detect padding tokens in `inputs_embeds`. Results may be unexpected if using padding tokens in conjunction with `inputs_embeds.`r~  
regressionsingle_label_classificationmulti_label_classificationr  )rF  r  r  rD   rs   rl  r  r9   r   r)   int32r   argmaxr   r   r/   rG   problem_typer  r8   longr   r   r   r   r   r   r   r   r3   rG  )r-   r{  r]   r5  r   r|  r  r(  rj   transformer_outputsr3   r  r   last_non_pad_tokennon_pad_masktoken_indicespooled_logitsr  loss_fcts                      r0   r@   &ZambaForSequenceClassification.forward  s   & 8<zz8
)%+'8
 8
 ,==M* "+J&,,Q/J;;##+
a\]];;##+!#"%)A)AAEEfmmUZU`U`aL!LL)<V]]Z_ZeZefM"/">!F!Fr!J!#>>**+ ,Z Z
 u||J}}MOaabYYv}}-F{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#M$9$9$;V^^=MND#M6:D))-JJ+- 2 22t GUWY))-II,.v6/ /??-;;*55
 	
r2   )rF  r  r  )NNNNNNN)rG   rH   rI   rJ   r'   r   r   r)   r:  rL   r   r.  r-  r   r   rC   r   r@   rM   rN   rO   s   @r0   r  r  u  s      .2.204(,26*.!%R
##d*R
 t+R
 &&-	R

 R
 ((4/R
   4'R
 $;R
 +,R
 
1	1R
  R
r2   r  )r  r  ri  rE  )r   )Er   rR  collections.abcr   r)   r   torch.nnr   r   r    r	   rN  activationsr
   cache_utilsr   r   
generationr   integrations.hub_kernelsr   masking_utilsr   modeling_layersr   modeling_outputsr   r   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   utils.genericr   utils.import_utilsr   utils.output_capturingr   configuration_zambar   
get_loggerrG   r   Moduler!   rL   r   rX   rK   ro   rq   r   r  r  r0  r<  rE  ri  r  r  __all__r*  r2   r0   <module>r     s  &   $   A A & ! . ) 8 / 9 q q F & R R 7 9 5 , 
		H	%J299 J*	UU\\ 	U# 	U%,, 	U& %II%<<% 
% <<	%
 LL4'% % %2C)RYY C)L\]bii \]@	ryy  0 0f17 1h41 4n $!? $! $!N _
% _
 _
Fg+_ gT ^
%9 ^
^
B gr2   