
    Z jH                        S SK r S SKJr  S SKJr  S SKrS SKJr  SSKJr	  SSK
Jr  SSKJrJr  SS	KJr  SS
KJr  SSKJrJr  SSKJrJr  SSKJr  SSKJrJrJrJrJ r   SSK!J"r"  SSK#J$r$  SSK%J&r&  SSK'J(r(J)r)  SSK*J+r+J,r,J-r-  SSK.J/r/J0r0J1r1J2r2J3r3J4r4J5r5J6r6J7r7  SSK8J9r9  Sr:\ Rv                  " \<5      r= " S S\R                  R|                  5      r? " S S\65      r@ " S S\(5      rA " S S \/5      rB " S! S"\R|                  5      rC " S# S$\R|                  5      rD " S% S&\05      rE " S' S(\45      rF " S) S*\35      rG\ " S+ S,\5      5       rH " S- S.\5\H5      rI " S/ S0\15      rJ " S1 S2\25      rK/ S3QrLg)4    N)Callable)cycle)nn   )initialization)ACT2FN)CacheDynamicCache)lazy_load_kernel)create_causal_mask)BaseModelOutputWithPast SequenceClassifierOutputWithPast)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tupleis_torchdynamo_compilinglogging)merge_with_config_defaults)resolve_internal_import)capture_outputs   )LlamaRotaryEmbeddingapply_rotary_pos_emb)pad_tensor_by_sizereshape_into_chunkssegment_sum)	ZambaAttentionZambaAttentionDecoderLayerZambaForCausalLMZambaForSequenceClassificationZambaHybridLayerZambaMambaDecoderLayer
ZambaModelZambaRMSNormeager_attention_forward   )Zamba2ConfigzZyphra/Zamba2-2.7Bc                   6   ^  \ rS rSrSU 4S jjrSS jrSrU =r$ )Zamba2RMSNormGated7   c                    > [         TU ]  5         [        R                  " [        R
                  " U5      5      U l        X0l        X l        g N)	super__init__r   	Parametertorchonesweightvariance_epsilon
group_size)selfhidden_sizer7   eps	__class__s       z/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/zamba2/modular_zamba2.pyr1   Zamba2RMSNormGated.__init__8   s2    ll5::k#:; #$    c                 X   UR                   nUR                  [        R                  5      nUb?  U[        R
                  R                  UR                  [        R                  5      5      -  nUR                  Gt pEXPR                  -  nUR                  " / UQUPU R                  P76 nUR                  S5      R                  SSS9nU[        R                  " XR                  -   5      -  nUR                  " / UQX`R                  -  P76 nU R                  UR                  U5      -  $ )Nr   T)keepdim)dtypetor3   float32r   
functionalsilushaper7   viewpowmeanrsqrtr6   r5   )	r8   hidden_statesgateinput_dtypeprefix_dimslast_dimgroup_counthidden_states_groupvariances	            r<   forwardZamba2RMSNormGated.forward>   s    #))%((7)BMM,>,>twwu}}?U,VVM!.!4!4//1+00\+\{\DOO\&**1-222t2D1EKKK`K`@`4aa+00]+]{__?\]{{]--k:::r>   )r7   r6   r5   )gư>r/   )__name__
__module____qualname____firstlineno__r1   rT   __static_attributes____classcell__r;   s   @r<   r,   r,   7   s    %; ;r>   r,   c                       \ rS rSrSrg)Zamba2RMSNormL    NrV   rW   rX   rY   rZ   r`   r>   r<   r^   r^   L       r>   r^   c                       \ rS rSrSrg)Zamba2RotaryEmbeddingP   r`   Nra   r`   r>   r<   rd   rd   P   rb   r>   rd   c                   X  ^  \ rS rSrSr   SS\S\S-  S\S-  S\S-  4U 4S jjjr   SS	\R                  S\S
\R                  S-  S\
S-  S\\R                  \R                  4   S-  S\\   S\\R                  \R                  S-  \\R                     S-  4   4S jjrSrU =r$ )Zamba2AttentionT   a*  
Multi-headed attention from 'Attention Is All You Need' paper.

Adapted from transformers.models.mistral.modeling_mistral.MistralAttention:
The input dimension here is attention_hidden_size = 2 * hidden_size, and head_dim = attention_hidden_size // num_heads.
The extra factor of 2 comes from the input being the concatenation of original_hidden_states with the output of the previous (mamba) layer
(see fig. 2 in https://huggingface.co/papers/2405.16712).
Additionally, replaced
attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) with
attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim/2)
Finally, this attention layer contributes to tied transformer blocks aimed to increasing compute without increasing model size. Because this
layer is tied, un-tied adapters (formally the same as LoRA but used in the base model) modules are added to the q, k, v projectors to increase
expressivity with a small memory overhead (see Fig. 2 of https://huggingface.co/papers/2411.15242).
Nconfig	layer_idxnum_fwd_mem_blocksblock_idc           
        > [         TU ]  X5        X0l        UR                  U l        X@l        UR                  (       Ga  [        R                  " / 5      U l	        [        R                  " / 5      U l
        [        R                  " / 5      U l        [        U R                  5       GH  nXQR                  -  U:X  Gar  [        R                  " [        R                  " U R                   U R"                  R$                  SS9[        R                  " U R"                  R$                  U R                   SS95      n[        R                  " [        R                  " U R                   U R"                  R$                  SS9[        R                  " U R"                  R$                  U R                   SS95      n[        R                  " [        R                  " U R                   U R"                  R$                  SS9[        R                  " U R"                  R$                  U R                   SS95      nO?[        R&                  " 5       n[        R&                  " 5       n[        R&                  " 5       nU R                  R)                  U5        U R                  R)                  U5        U R                  R)                  U5        GM     [+        U R                  5       V	V
s0 s H  u  pX_M	     sn
n	U l        g s  sn
n	f )NFbias)r0   r1   rk   hybrid_layer_idslayer_block_maprl   use_shared_attention_adapterr   
ModuleListlinear_q_adapter_listlinear_k_adapter_listlinear_v_adapter_listrangenum_mem_blocks
SequentialLinearattention_hidden_sizeri   adapter_rankIdentityappend	enumerate	layer_dic)r8   ri   rj   rk   rl   ilinear_q_adapterlinear_k_adapterlinear_v_adapterindexvaluer;   s              r<   r1   Zamba2Attention.__init__d   s    	+"4%66 ...)+r):D&)+r):D&)+r):D&4223,,,8')}}		$"<"<dkk>V>V]bc		$++":":D<V<V]bc($ (*}}		$"<"<dkk>V>V]bc		$++":":D<V<V]bc($ (*}}		$"<"<dkk>V>V]bc		$++":":D<V<V]bc($
 (*{{}$'){{}$'){{}$**112BC**112BC**112BC) 4, <ETEYEY;Z[;Z<5%,;Z[[s   K4rL   attention_maskpast_key_valuesposition_embeddingskwargsreturnc                    UR                   S S n/ UQSPU R                  P7nU R                  U5      n	U R                  U5      n
U R	                  U5      nU R
                  R                  (       aT  U R                  U   nXR                  U   " U5      -   n	XR                  U   " U5      -   n
XR                  U   " U5      -   nU	R                  U5      R                  SS5      n	U
R                  U5      R                  SS5      n
UR                  U5      R                  SS5      nU R
                  R                  (       a  Uu  p[        XX5      u  pUb  UR                  XU5      u  p[         R"                  " U R
                  R$                  [&        5      nU" U U	U
UU4U R(                  (       d  SOU R*                  U R,                  S.UD6u  nnUR.                  " / UQSP76 R1                  5       nU R3                  U5      nUU4$ )Nr@   r)   r   g        )dropoutscaling)rG   head_dimq_projk_projv_projri   rr   r   rt   ru   rv   rH   	transposeuse_mem_roper   updater   get_interface_attn_implementationr(   trainingattention_dropoutr   reshape
contiguouso_proj)r8   rL   rj   r   r   r   r   input_shapehidden_shapequery_states
key_statesvalue_statesadapter_layer_idxcossinattention_interfaceattn_outputattn_weightss                     r<   rT   Zamba2Attention.forward   s    $))#2.88b8$--8{{=1[[/
{{=1;;33 $y 9'*D*DEV*WXe*ffL#&@&@AR&STa&bbJ'*D*DEV*WXe*ffL#((6@@AF__\2<<QB
#((6@@AF;;##*HC';LVY'_$L&'6'='=jXa'b$J(?(M(MKK,,.E)
 %8	%
  $}}C$2H2HLL	%
 	%
!\ "));;;;FFHkk+.L((r>   )rl   rq   r   ru   rt   rv   rk   NNN)rV   rW   rX   rY   __doc__r*   intr1   r3   Tensorr	   tupler   r   rT   rZ   r[   r\   s   @r<   rg   rg   T   s   $ !%)-#'\'\ :'\  $J	'\
 *'\ '\Z /3(,HL1)||1) 1) t+	1)
 1) #5<<#=>E1) +,1) 
u||U\\D0%2E2LL	M1) 1)r>   rg   c                     ^  \ rS rSrSrSS\S\S-  4U 4S jjjr  SS\R                  S\
S-  S	\R                  S-  4S
 jjrSS\
S-  S	\R                  S-  4S jjr  SS\
S-  S	\R                  S-  4S jjrSrU =r$ )Zamba2MambaMixer   uo  
Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`.
A, D are input independent (see Mamba paper [1] Section 3.5.2 "Interpretation of A" for why A isn't selective)
∆, B, C are input-dependent (this is a key difference between Mamba and the linear time invariant S4,
and is why Mamba is called **selective** state spaces)
Nri   rj   c           	        > [         TU ]  5         Xl        UR                  U l        UR                  U l        UR                  U l        [        UR                  U R                  -  5      U l
        X l        UR                  U l        SU l        [        R                  " 5       U l        UR"                  U l        UR$                  U l        UR(                  U l        U R                  R,                  U l        UR0                  U l        UR2                  U l        UR4                  U l        UR6                  U l        U R                  SU R&                  -  U R
                  -  -   U l        [        R:                  " U R8                  U R8                  SUR                  U R8                  UR                  S-
  S9U l        U R                  U R8                  -   U R.                  -   n[        R>                  " U R                  UUR@                  S9U l!        [        RD                  " [F        RH                  " U R.                  5      5      U l%        [F        RL                  " SU R.                  S-   5      n[        RD                  " [F        RN                  " U5      5      U l(        [S        U R                  U R                  U R&                  -  SS9U l*        [        RD                  " [F        RH                  " U R.                  5      5      U l+        [        R>                  " U R                  U R                  UR@                  S9U l,        URZ                  (       as  []        S	5      n[_        US
S 5      q0[_        USS 5      q1[]        S5      n[e        USS9q3[e        USS9q4[e        USS9q5[m        [f        [h        [j        [b        [`        45      q7OS q0S q1S q3S q4S q5Sq7[_        USS5      (       a"  [n        (       d  [p        Rs                  S5        g g g )NrF   r   Tr)   )in_channelsout_channelsro   kernel_sizegroupspaddingrn   gh㈵>)r7   r:   zcausal-conv1dcausal_conv1d_updatecausal_conv1d_fnz	mamba-ssmz8ops.triton.selective_state_update.selective_state_update)chained_pathz1ops.triton.ssd_combined.mamba_chunk_scan_combinedz8ops.triton.ssd_combined.mamba_split_conv1d_scan_combinedFuse_mamba_kernelsa  The fast path is not available because one of `(selective_state_update, causal_conv1d_fn, causal_conv1d_update)` is None. Falling back to the naive implementation. To install follow https://github.com/state-spaces/mamba/#installation and https://github.com/Dao-AILab/causal-conv1d):r0   r1   ri   r9   mamba_d_statessm_state_sizemamba_d_convconv_kernel_sizer   mamba_expandintermediate_sizerj   use_conv_bias
activationr   SiLUactuse_mem_eff_pathmamba_ngroupsn_groupsmamba_headdimr   n_mamba_heads	num_heads
chunk_sizetime_step_limittime_step_mintime_step_maxconv_dimConv1dconv1drz   add_bias_linearin_projr2   r3   r4   dt_biasarangelogA_logr,   normDout_projr   r   getattrr   r   r   selective_state_updatemamba_chunk_scan_combined mamba_split_conv1d_scan_combinedallis_fast_path_availableloggerwarning_once)r8   ri   rj   projection_sizeAcausal_conv1d	mamba_ssmr;   s          r<   r1   Zamba2MambaMixer.__init__   sN   !--$22 & 3 3!$V%8%84;K;K%K!L"#11 779 & 7 7,,,,22 ++%55#11#11..T]]1BTEXEX1XXii++==''!+
 004==@4>>Qyy''
 ||EJJt~~$>? LLDNNQ./\\%))A,/
&""t/E/E/V\`
	 ejj89		$"8"8$:J:JQWQgQgh ##,_=M#*=:PRV#W &}6H$O(5I%<(b&" )@([)% 0G(b0, &)*-4$(&" $( #%)"(,%/3,%*"6.55>T>T> ?U5r>   rL   cache_paramsr   c                    UR                   u  pEnU R                  U R                  -  nSU R                  -  SU R                  -  U R                  -  -   U R                  -   nUGbH  UR                  U R                  5      (       Ga'  U R                  UR                  S5      5      n	U	R                   S   U-
  S-  n
XU R                  U R                  U R                  /n[        R                  " XSS9u    plp[        UUR                  U R                     R                  U R                  R                   R                  S5      U R                  R"                  U R$                  5      n[        R                  " UU R                  Xw/SS9u  pn[        R&                  " U R(                  R+                  5       5      * nUS S 2S S4   S S 2S S 2S 4   R-                  SU R.                  U R                  5      R1                  [        R2                  S9nUS S 2S S 2S 4   R-                  SSU R.                  5      nU R4                  S S 2S S4   R-                  SU R.                  5      nU R6                  S S 2S S4   R-                  SU R.                  5      nUR9                  X@R                  UR                   S   U R                  -  5      nUR9                  X@R                  UR                   S   U R                  -  5      nUR9                  X@R                  U R.                  5      n[;        UR                  U R                     R<                  UUUUUUS USS9
nUR9                  X@R                  U R.                  -  5      nU R?                  X5      nU RA                  U5      S S 2S S4   nU$ UbG  [        RB                  " US:H  5      (       d)  URD                  nXS S 2S S 2S 4   -  R1                  U5      nU R                  U5      n[        R&                  " U R(                  R+                  5       5      * nU RF                  c  0 OS	U RF                  0nUb  [        RB                  " US:H  5      nOSnU RH                  (       Ga   U RJ                  (       a  Uc  U(       a  [M        UU R                  R                   R                  S5      U R                  R"                  U R4                  U4U R6                  U RN                  S U R$                  U R>                  R                   U R>                  RP                  U R@                  R                   U R@                  R"                  U R.                  U R                  S
SS.UD6u  nnU$ [        R                  " UU R                  U R                  U R                  /SS9u  pnUbj  URS                  SS5      n[T        RV                  RY                  UU RZ                  UR                   S   -
  S45      nUR]                  UU R                  5      n[^        b  U R$                  S;  aJ  U Ra                  U R                  URS                  SS5      5      RS                  SS5      S S 2S U24   5      nOv[_        URS                  SS5      U R                  R                   R                  S5      U R                  R"                  U R$                  S9RS                  SS5      S S 2S U24   n[        R                  " UU R                  Xw/SS9u  pnUbG  [        RB                  " US:H  5      (       d)  URD                  nXS S 2S S 2S 4   -  R1                  U5      n[c        UR9                  XESU R.                  5      UUUR9                  XEU R                  S5      UR9                  XEU R                  S5      4U RN                  U R6                  S S SU R4                  SS.UD6u  nnUb  Ub  URe                  UU R                  5        UR9                  XES5      nU R?                  UU5      nU RA                  U5      nU$ )Nr   r)   r@   dim.rB   T)zr   dt_softplusdt_limitF)r   r   seq_idxr   rmsnorm_weightrmsnorm_epsoutproj_weightoutproj_biasheaddimngroupsnorm_before_gatereturn_final_statesr   )rF   swish)xr5   ro   r   )r   r   r   r   r   r   r   )3rG   r   r   r   r   has_previous_staterj   r   squeezer   r3   splitr   layersconv_statesr   r5   ro   r   expr   floatexpandr   rC   rD   r   r   rH   r   recurrent_statesr   r   r   rB   r   r   r   r   r   r6   r   r   rE   padr   update_conv_stater   r   r   update_recurrent_state)r8   rL   r   r   
batch_sizeseq_len_groups_time_state_sized_to_removein_projected_statesd_mlpsplit_projection_dimrM   hidden_states_B_CdtBCr   r   r   hidden_states_reshapedoutrB   projected_statesdt_limit_kwargsinput_not_masked	ssm_state	time_stephidden_states_B_C_t
conv_statescan_outputs                                  r<   cuda_kernels_forward%Zamba2MambaMixer.cuda_kernels_forward-  sY    "/!4!4
Q!%1D1D!D$0001t}}3DtGZGZ3ZZ]a]k]kk #(G(G(W(W"&,,}/D/DQ/G"H(..r2[@QFE$)$2H2H$--Y]YgYg#h 05<Okm0n-Aq) 4!##DNN3??""**1-  ! #(++!'')?X#Ma
 4::++-..A!T3,1d
+222t}}dFYFYZ]]didqdq]rAAq$J&&r2t}}=Bll1dC<077DMMJGq$|$++B>Az==!''!*2MNAz==!''!*2MNA%2%7%7
NNTXTaTa%b"2##DNN3DD& M *..z>>DMM;YZM IIm:M--.q$|<Cz 
u )%))Na<O2P2P%++!.1d
1K!K O OPU V#||M:4::++-..A$($8$8$@bzSWSgSgFhO)#(99^q-@#A #' $$$<;OTd!A$KK&&..q1KK$$LL" ff# ##'99#3#3 $		 : :#'==#7#7!%!3!3 MM MM%*(,#"$ &%"YX 
m 6;[[$++T]]DNNK62  +*;*E*Ea*K'!#!2!2+d.C.CFYF_F_`bFc.cef-g"J ".!?!?
DNN![J#+tFW/W(,$5$?$?1$EFPPQRTUVWXZb[bZbWbc)% )9+55a;#{{1199!<![[--#'??	)
  i1oa'k)3% ',kk%++-C\'#!
 "-eiiRS@S6T6T)//E%2Aq$J5O%O$S$STY$ZM)B!&&zBNFF:rBFF:rB*  $ff (, LL $* &*&Y (\-E 77	4>>R)..zBG"iiT:mmK0
r>   c                 F   UR                   u  pEnUR                  nUb2  UR                  U R                  5      (       a  U R	                  U5      nO1Ub  XS S 2S S 2S 4   -  R                  U5      nU R	                  U5      nUR                   S   SU R                  -  -
  SU R                  -  U R                  -  -
  U R                  -
  S-  n	UR                  XU R                  U R                  U R                  /SS9u    pjpUR                  SS5      nUS L=(       a    UR                  U R                  5      nU(       a  UR                  XR                  5      n[        R                  " XR                   R"                  S S 2SS S 24   -  SS9nU R$                  (       a  XR                   R&                  -  nU R)                  U5      R                  U5      S S 2S S4   nOUbW  [*        R,                  R/                  UU R0                  UR                   S   -
  S45      nUR                  XR                  5      nU R)                  U R!                  U5      SS U24   R                  SS5      5      nUb)  UR                  nXS S 2S S 2S 4   -  R                  U5      n[        R                  " XR                  U R                  U R                  -  U R                  U R                  -  /SS9u  pn[        R2                  " U R4                  R7                  5       5      * nU(       GaX  UR8                  S:X  a
  US S 2S S4   OUS S 2SS S 24   S S 2S S4   nUR                  SS5      R;                  XLR                   S   U R<                  5      nU R>                  S   R;                  U R>                  R                   S   U R<                  5      n[        R*                  R,                  RA                  UUR                  UR                  5      -   5      n[        RB                  " XRD                  5      nUS   R;                  U R                  U R<                  U R                  5      R                  [        RF                  S	9n[        R2                  " US   U-  5      nURI                  X@R                  S5      SS S S 24   nUR;                  X@R                  U R                  U R                  -  UR                   S   5      RK                  5       nURI                  USUR                   S   5      nUS   USS S S 24   -  nURI                  USU R<                  5      nUUS   -  nURL                  U R                     RN                  RQ                  5       nUU-  U-   nURS                  UU R                  5      nURI                  X@R                  S5      SS S S 24   nUR;                  X@R                  U R                  U R                  -  UR                   S   5      RK                  5       nURI                  USUR                   S   5      nUR                  UR                  5      nURU                  X@R                  -  U R<                  U R                  5      nURU                  X@R                  -  U R                  S5      n[        RV                  " UU5      nURU                  X@R                  U R<                  5      nU RX                  S   R;                  U RX                  R                   S   U R<                  5      nUUU-  -   R                  UR                  5      nURI                  US5      S S 2S S4   nGO[*        R,                  RA                  XR>                  -   5      n[        RB                  " XRD                  5      nURI                  XESU R<                  5      R7                  5       nURI                  XESU R                  5      R7                  5       nURI                  XESU R                  5      R7                  5       nUR[                  U R                  U R                  -  SU R                  S
9nUR[                  U R                  U R                  -  SU R                  S
9nU R\                  XPR\                  -  -
  U R\                  -  nU RX                  S   [_        UU5      -  nXS   -  nUR                  UR                  5      U-  nUUUU4 Vs/ s H  n[a        UUU R\                  5      PM     snu  nnnnURc                  SSSS5      n[        Rd                  " USS9n[        R2                  " [g        U5      5      nUS S 2S S 2S S 2S S S 2S S 24   US S 2S S 2S S S 2S S 2S S 24   -  n U R                  SS9n!U!S   URc                  SSSSS5      S   -  n"U"R                  SS9n#U#S   US S 2S S 2S 4   -  R                  S5      n$[        R2                  " US S 2S S 2S S 2SS 24   U-
  5      n%UU%Rc                  SSSS5      S   -  n&U&Rc                  SSSSS5      S   URc                  SSSSS5      SS S S 24   -  R                  SS9Rc                  SSSSS5      n'[        Rh                  " U'S S 2S S24   5      n([        Rj                  " U(U'/SS9n'[        R2                  " [g        [*        R,                  R/                  US S 2S S 2S S 2S4   S5      5      5      n)U'Rc                  SSSSS5      n*U)S   U*S S 2S S 2S S4   -  R                  SS9n+U+Rc                  SSSSS5      n,U,S S 2S S24   U,S S 2S4   n-n'[        R2                  " U5      n.USS S S 24   U'S S 2S S 2S S4   -  n/U.Rc                  SSSS5      n0U/R                  S5      U0S   -  n1U$U1-   nURI                  USU R                  U R<                  5      nUU-   nUS:  a  US S 2S U2S S 2S S 24   nURI                  XES5      nU-b  Ub  URS                  U-U R                  5        U Rm                  UU
5      n2U Ro                  U2R                  U5      5      n3U3$ s  snf )Nr@   r   r   r)   r   .).N).NNr   )r   output_sizer      )r)   r   )8rG   rB   r   rj   r   rC   r   r   r   r   r   r   r   r  r3   sumr   r5   r   ro   r   r   rE   r  r   r  r   r  ndimr  r   r   softplusclampr   rD   r   r   r   r  cloner  rH   bmmr   repeat_interleaver   r   r   permutecumsumr   
zeros_likecatr   r   )4r8   input_statesr   r   r  r	  r
  rB   r  r  rM   rL   r  use_precomputed_stater  r  r  r   r   dAdBdBx
ssm_statesssm_states_reshaped
C_reshapedyr   pad_size
D_residualtA_cumsumLG_intermediateGM_intermediateMY_diagdecay_statesB_decay_contractionstatesprevious_statesdecay_chunkstates_permutedresult
new_statesr  state_decay_outC_times_statesstate_decay_out_permutedY_offr  contextualized_statess4                                                       r<   torch_forwardZamba2MambaMixer.torch_forward  s   !-!3!3
Q""#(G(G(W(W#||L9) ,aDj/I IMMeT#||L9!''+a$2H2H.HH1t}}K\_c_r_rKrrtx  uC  uC  C  HI  I(8(>(>t55t~~V\^ )? )
%1M &//15 ,D 8 l\=\=\]a]k]k=l !%77~~VJ!IIj;;3E3EaAg3N&NTVWM!!!1!11 HH]366u=aslKM']]..!**]-@-@-DDaH
 *;;JW
 HHT[[%?XgX%N%X%XYZ\]%^_M)%++!.1d
1K!K O OPU V#kk-:P:PRVR_R_bfbubuRuw{  xE  xE  HL  H[  H[  x[  :\  bd  e!YYtzz'')**  &(WW\AtSL!r!Q'{1dC<7PBa#**:xx|T]]SBll9-44T\\5G5G5JDMMZG$$--b7::bhh3G.GHBR!3!34B/"))$..$--I\I\]``glgtgt`uA2i=1,-B
 		*mmR8dAFA]]DNNdmm4SUVU\U\]_U`allnA		*b!''"+6AI3a<0B *11*b$--PM}Y//C &,,T^^<MMSSUJ#b3.J%<<ZXJ 		*mmR8dAFA]]DNNdmm4SUVU\U\]_U`allnA		*b!''"+6A $qww/J",//*~~2Mt}}^b^q^q"r
^^ ;T=P=PRSTJ		-z:Az>>4==AA y!((a$--HA]Q&&**1773A 		*b)!T3,7A ''\\(9:BR!3!34B)11*r4==Y__aM		*D4G4GHNNPA		*r43F3FGMMOA##DNNdmm$CX\XfXf#gA##DNNdmm$CX\XfXf#gA'OO*CCtVH	*-?x-XXJ *yM9M](()B.A cpqrtuwxay%zay\]&9!Xt&Way%z"M1a 		!Q1%A||A2.H 		+a.)A q!Qa23a1dAq!8K6LLN""r"*A y\AIIaAq!,DY,OON""r"*A 	l]1a:%>>CCAFF !99XaArsl%;h%FGL"#l&:&:1aA&Fy&Q"Q)11!Q1a@K}OdOdefhiklnoqrOstwy}  @A  uA  PB  B  G  G  LM  G  N  V  V  WX  Z[  ]^  `a  cd  eF#..va!e}=OYY8a@F))K0A0A(1aQRTV;BWY_0`$abK$nnQ1a;O!/2_Q4QT_5UUZZ_`ZaF1aA6J *1crc6 2Jq"u4EIF $ii1OT1oq!T30GGN'6'>'>q!Q'J$#''+.Fy.QQE A		*b$..$--HAJA!|a'1a'(		*r2A$)A33It~~Nii4(
 !%knnU.C D$$C &{s   %!rc                     [         (       aO  SU R                  R                  R                  R                  ;   a!  [        5       (       d  U R                  XU5      $ U R                  XU5      $ )Ncuda)r   r   r5   devicetyper   r  rN  )r8   rL   r   r   r   s        r<   rT   Zamba2MambaMixer.forwardz  sV     "!f0C0C0J0J0O0O&OXpXrXr,,].YY!!-~NNr>   )r   r   r   r   r   ri   r   r   r   r   r   r9   r   r   rj   r   r   r   r   r   r   r   r   r   r   r/   NN)rV   rW   rX   rY   r   r*   r   r1   r3   r   r	   r  rN  rT   rZ   r[   r\   s   @r<   r   r      s    b| bd
 b bN &*.2	T||T dlT t+	Tns% s%[`[g[gjn[n s%r &*.2	
O dl
O t+	
O 
Or>   r   c                   H   ^  \ rS rSrSS\S\S-  4U 4S jjjrS	S jrSrU =r	$ )
	Zamba2MLPi  Nri   rl   c           
        > [         T	U ]  5         Xl        UR                  U l        UR                  U l        X l        X0l        [        R                  " U R                  SU R                  -  UR                  S9U l
        [        R                  " U R                  U R                  UR                  S9U l        [        UR                     U l        [        R                  " / 5      U l        [#        U R
                  5       H  nXAR$                  -  U:X  a  [        R&                  " [        R                  " U R                  R                  U R                  R(                  SS9[        R                  " U R                  R(                  SU R                  -  SS95      nO[        R*                  " 5       nU R                   R-                  U5        M     UR.                  n[1        U5       VVs0 s H  u  pxX_M	     snnU l        gs  snnf )a9  
This MLP layer contributes to tied transformer blocks aimed to increasing compute without increasing model size. Because this layer
is tied, un-tied adapter modules (formally same as LoRA, but used in the base model) are added to the up and gate projectors to increase expressivity with a small memory overhead.
r   rn   FN)r0   r1   ri   r9   r   rk   rl   r   rz   r   gate_up_proj	down_projr   
hidden_actact_fnrs   gate_up_proj_adapter_listrw   rx   ry   r|   r}   r~   rp   r   r   )
r8   ri   rk   rl   r   gate_up_proj_adapterrq   r   r   r;   s
            r<   r1   Zamba2MLP.__init__  s   
 	!--!'!9!9"4 IId&6&6D<R<R8RY_YoYop4#9#94;K;KRXRhRhiV../)+r):&t../A(((H4')}}IIdkk55t{{7O7OV[\IIdkk66D<R<R8RY^_($
 (*{{}$**112FG 0 !11;D_;UV;U<5%,;UVVs   -Hc                     U R                  U5      nU R                  U   nX0R                  U   " U5      -   n[        R                  " USSS9nU R                  US   5      US   -  nU R                  U5      nU$ )Nr   r@   r   r   r)   )rY  r   r]  r3   chunkr\  rZ  )r8   hidden_staterj   gate_up_stateoutputs        r<   rT   Zamba2MLP.forward  s{    )),7NN9-	%(F(Fy(QR^(__M1"={{=#34}Q7GG-r>   )
r\  rl   ri   rZ  rY  r]  r9   r   r   rk   rU  r/   )
rV   rW   rX   rY   r*   r   r1   rT   rZ   r[   r\   s   @r<   rW  rW    s0    W| WPSVZPZ W W< r>   rW  c                      ^  \ rS rSrSS\S\S-  S\S-  4U 4S jjjr   SS\R                  S\R                  S\S	\R                  S-  S
\	S-  S\R                  S-  S\\   S\\R                     4S jjrSrU =r$ )Zamba2AttentionDecoderLayeri  Nri   rl   rj   c                    > X l         [        UR                  5      n[        TU ]  X5        [        USXBS9U l        [        XUS9U l        g )Nr@   )rj   rk   rl   )rk   rl   )	rl   lenrp   r0   r1   rg   	self_attnrW  feed_forward)r8   ri   rl   rj   num_gsr;   s        r<   r1   $Zamba2AttentionDecoderLayer.__init__  sF     V,,-+(2RXl%fRZ[r>   rL   original_hidden_statesr   r   r   r   r   c           	          [         R                  " X/SS9nU R                  U5      nU R                  " SUUUUUS.UD6u  pU R	                  U5      nU R                  X5      nU$ )a  
Args:
    hidden_states (`torch.FloatTensor`): output of previous Mamba layer of shape `(batch, seq_len, embed_dim)`
    original_hidden_states (`torch.FloatTensor`): word embedding output of shape `(batch, seq_len, embed_dim)`.
        This is concatenated with `hidden_states` (which is the output of the previous (mamba) layer). The
        concatenated tensor is then used as input of the pre-attention RMSNorm
        (see fig. 2 in https://huggingface.co/papers/2405.16712).
    attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
        `(batch, sequence_length)` where padding elements are indicated by 0.
    past_key_values (`Cache`, *optional*): cached past key and value projection states
    use_cache (`bool`, *optional*):
        If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
        (see `past_key_values`).
    position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
        Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
        with `head_dim` being the embedding dimension of each attention head.
r@   r   )rL   rj   r   r   r   r`   )r3   concatenateinput_layernormrj  pre_ff_layernormrk  )	r8   rL   rn  rj   r   r   r   r   r
  s	            r<   rT   #Zamba2AttentionDecoderLayer.forward  s    6 ))=*QWYZ,,];>> 
')+ 3
 
 --m<))-Cr>   )rl   rk  rj  rU  r   )rV   rW   rX   rY   r*   r   r1   r3   r   r	   
LongTensorr   r   r   FloatTensorrT   rZ   r[   r\   s   @r<   rg  rg    s    \| \sTz \UX[_U_ \ \ /3(,7;)||) !&) 	)
 t+) ) #--4) +,) 
u  	!) )r>   rg  c                   4   ^  \ rS rSrS\S\4U 4S jjrSrU =r$ )Zamba2MambaDecoderLayeri  ri   rj   c                    > [         TU ]  X5        [        XS9U l        [	        UR
                  UR                  S9U l        g )N)ri   rj   r:   )r0   r1   r   mambar^   r9   rms_norm_epsrq  )r8   ri   rj   r;   s      r<   r1    Zamba2MambaDecoderLayer.__init__  s7    +%VI
,V-?-?VEXEXYr>   )rq  rz  )	rV   rW   rX   rY   r*   r   r1   rZ   r[   r\   s   @r<   rw  rw    s    Z| Z Z Zr>   rw  c                     ^  \ rS rSrS\S\R                  S\4U 4S jjr        SS\	R                  S\	R                  S-  S	\S-  S
\	R                  S-  S\	R                  S-  S\S-  S\S-  S\	R                  S-  S\	R                  S-  S\\   S\\	R$                  \\	R$                  \	R$                  4   S-  4   4S jjrSrU =r$ )Zamba2HybridLayeri  shared_transformerlinearrz  c                 6   > [         TU ]  XU5        U ?Xl        g r/   )r0   r1   shared_transfr  )r8   r  r  rz  r;   s       r<   r1   Zamba2HybridLayer.__init__  s!     	+U;"4r>   NrL   rn  rj   r   causal_maskr   	use_cacher   position_idsr   r   c
           
          U R                   " U4UUUUUU	S.U
D6nU R                  U5      nU R                  " U4UUUUUS.U
D6nU$ )a  
Args:
    hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
    original_hidden_states (`torch.FloatTensor`): word embedding output that will be concatenated with
    hidden activations to form the input of the shared transformer layer.
    layer_idx (`int`): layer number.
    attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
        `(batch, sequence_length)` where padding elements are indicated by 0.
    past_key_values (`Cache`, *optional*): cached past key and value projection states
    use_cache (`bool`, *optional*):
        If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
        (see `past_key_values`).
    position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
        Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
        with `head_dim` being the embedding dimension of each attention head.
)rn  rj   r   r   r   r  )transformer_hidden_statesr   r   r  r   )r  r  mamba_decoder)r8   rL   rn  rj   r   r  r   r  r   r  r   r  s               r<   rT   Zamba2HybridLayer.forward  s    < %)$;$;	%
#9&+ 3%	%
 	%
! %)KK0I$J!**
&?)+ 3
 
 r>   )r  )NNNNNFNN)rV   rW   rX   rY   rg  r   rz   rw  r1   r3   r   r   r	   boolrt  r   r   r   ru  rT   rZ   r[   r\   s   @r<   r~  r~    s'   5"=5GIyy5Yp5 7; $.2+/(,!&7;044||4 !&t 34 :	4
 t+4 \\D(4 4 $;4 #--44 &&-4 +,4 
u  %(9(95;L;L(L"MPT"TT	U4 4r>   r~  c                      ^  \ rS rSr% \\S'   SrSrSS/rSr	Sr
SrSrSr\\S.r\R$                  " 5       U 4S	 j5       rS
rU =r$ )Zamba2PreTrainedModeli+  ri   modelTr~  rw  r   )rL   
attentionsc                   > [         TU ]  U5        [        U[        5      (       Ga  [        R
                  " [        R                  " U R                  R                  5      [        R                  " U R                  R                  5      [        R                  " U R                  R                  5      -
  -  [        R                  " U R                  R                  5      -   5      R                  U R                  R                  S9nU[        R                  " [        R                  " U* 5      * 5      -   n[         R"                  " UR$                  U5        [        R&                  " SUR(                  S-   5      n[         R"                  " UR*                  [        R                  " U5      5        [         R,                  " UR.                  5        g g )N)minr)   )r0   _init_weights
isinstancer   r3   r  randri   r   mathr   r   r   r&  time_step_floorexpm1initcopy_r   r   r   r   ones_r   )r8   moduler  inv_dtr   r;   s        r<   r  #Zamba2PreTrainedModel._init_weights;  s+   f%f.//

4;;44588DKK556$++B[B[9\\^((4;;4456 e33e4	  %))U[["%5$566FJJv~~v.Q 0 01 45AJJv||UYYq\2JJvxx  0r>   r`   )rV   rW   rX   rY   r*   __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_flex_attn_supports_sdpa_is_statefulrw  rg   _can_record_outputsr3   no_gradr  rZ   r[   r\   s   @r<   r  r  +  se    &*#,.GH"3NL0%
 ]]_! !r>   r  c                      \ rS rSrSrS\4S jrS r\\	\
      SS\R                  S-  S\R                  S-  S	\R                  S-  S
\S-  S\R                  S-  S\S-  S\\   S\\-  4S jj5       5       5       rSrg)Zamba2ModeliM  zX
Model consisting of *config.num_hidden_layers* layers.

Args:
    config: Zamba2Config
ri   c                 d   [         R                  X5        Xl        UR                  U l        UR
                  U l        [        R                  " UR
                  UR                  U R                  5      U l	        UR                  U l
        U R                  5       U l        UR                  U l        [        UR                  UR                  S9U l        UR"                  (       a6  UR$                  (       a  [&        R)                  S5        [+        U5      U l        SU l        U R1                  5         g )Nry  ze`use_long_context` set to `True`: using rescaled `rope_theta` and extended `max_position_embeddings`.F)r  r1   ri   pad_token_idpadding_idx
vocab_sizer   	Embeddingr9   embed_tokenslayers_block_type
get_layersr   r   r^   r{  final_layernormr   use_long_contextr   r   rd   
rotary_embgradient_checkpointing	post_init)r8   ri   s     r<   r1   Zamba2Model.__init__U  s    &&t4!.. ++LL):):F<N<NPTP`P`a!'!9!9oo'$*$?$?!,V-?-?VEXEXY&&##{ 4F;DO&+# 	r>   c                 (   / n0 U l         SU l        / n[        U R                  5       GHQ  u  p4[	        U R
                  US9nUS:X  Ga  SU S3n[        U[        5      (       a#  [        U5      U R
                  R                  :  aH  [        U[        5      (       a  [        U5      n[        U5      nU R                   R                  Xg05        OUR                  U5        X0R
                  R                  -  n[        U R
                  US9n	[        R                   " U R
                  R"                  U R
                  R"                  SS9n
UR                  [%        XU5      5        GM@  UR                  U5        GMT     [        R&                  " U5      $ )	Nr   )rj   hybridzlayers.z.shared_transformer)rl   Frn   )_tied_weights_keysfirst_transformer_layer_idr   r  rw  ri   r  listri  rx   r   nextr   r~   rg  r   rz   r9   r~  rs   )r8   r   unique_hybrid_blockslayer_id
layer_typemamba_layerprefix_patterntarget_patternrl   
attn_blocklinear_layers              r<   r  Zamba2Model.get_layersl  sK   "$*+'!$-d.D.D$E H1$++RKX%#*8*4G!H ##7>>/0DKK4N4NN!"6==/45I/J,%)*>%?N++22N3ST )//?#kk&@&@@8xX
!yy)@)@$++BYBY`ef/
+VWk*3 %F4 }}V$$r>   N	input_idsr   r  r   inputs_embedsr  r   r   c           	         US L US L-  (       a  [        S5      eUc  U R                  U5      nUn[        R                  " U5      n	U(       a  Uc  [	        U R
                  S9nUcU  Ub  UR                  5       OSn
[        R                  " UR                  S   UR                  S9U
-   nUR                  S5      n[        U R
                  UUUUS9nU R
                  R                  (       a  U R                  XS9nOS n[        U R                  5       H  u  pU" UU	UUU4UUUUS.UD6nM     U R!                  U5      n[#        UU(       a  US	9$ S S	9$ )
NzaYou cannot specify both input_ids and inputs_embeds at the same time, and must specify either one)ri   r   r)   rR  )ri   r  r   r   r  )r  )r   r  r   r  )last_hidden_stater   )
ValueErrorr  r3   r'  r
   ri   get_seq_lengthr   rG   rR  	unsqueezer   r   r  r   r   r  r   )r8   r  r   r  r   r  r  r   rL   rn  past_seen_tokensr  r   rj   layers                  r<   rT   Zamba2Model.forward  s    -t";<s    --i8M%!&]!; 0*$++>OCRC^==?de <<(;(;A(>}G[G[\_ooL'11!4L(;;')+%
 ;;##"&//-/"["& )$++ 6I!& !0#$7) M !7 ,,];&+/8O
 	
>B
 	
r>   )r   r  ri   r  r  r  r  r   r  r  r  r  )NNNNNN)rV   rW   rX   rY   r   r*   r1   r  r   r   r   r3   rt  r   r	   ru  r  r   r   r   r   rT   rZ   r`   r>   r<   r  r  M  s    | . %D   .2.204(,26!%@
##d*@
 t+@
 &&-	@

 @
 ((4/@
 $;@
 +,@
 
(	(@
    @
r>   r  c                   0   ^  \ rS rSrS\4U 4S jjrSrU =r$ )Zamba2ForCausalLMi  ri   c                 d   > [         TU ]  U5        [        U5      U l        U R	                  5         g r/   r0   r1   r  r  r  r8   ri   r;   s     r<   r1   Zamba2ForCausalLM.__init__  &      (
r>   r  )rV   rW   rX   rY   r*   r1   rZ   r[   r\   s   @r<   r  r    s    |  r>   r  c                   >  ^  \ rS rSrS\4U 4S jjr\\        SS\R                  S-  S\R                  S-  S\R                  S-  S\S-  S	\R                  S-  S
\R                  S-  S\S-  S\\R                  -  S\\   S\\-  4S jj5       5       rSrU =r$ )Zamba2ForSequenceClassificationi  ri   c                 d   > [         TU ]  U5        [        U5      U l        U R	                  5         g r/   r  r  s     r<   r1   (Zamba2ForSequenceClassification.__init__  r  r>   Nr  r   r  r   r  labelsr  logits_to_keepr   r   c	           	         U R                   " U4UUUUUS.U	D6n
U
S   nU R                  U5      nUb  UR                  S   nOUR                  S   nU R                  R                  c  US:w  a  [        S5      eU R                  R                  c  SnOUb  XR                  R                  :g  R                  UR                  [        R                  5      n[        R                  " UR                  S   UR                  [        R                  S9nUU-  R                  S5      nO.Sn[        R                  U R                  R                   S35        U[        R                  " XR                  S	9U4   nSnUb!  U R                   " SUUUU R                  S
.U	D6n[#        UUU
R$                  U
R&                  U
R(                  S9$ )ae  
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
)r   r  r   r  r  r   Nr)   z=Cannot handle batch sizes > 1 if no padding token is defined.r@   )rR  rB   z will not detect padding tokens in `inputs_embeds`. Results may be unexpected if using padding tokens in conjunction with `inputs_embeds.`r  )logitsr  pooled_logitsri   )lossr  r   rL   r  r`   )r  scorerG   ri   r  r  rC   rR  r3   int32r   argmaxr   r   r;   rV   loss_functionr   r   rL   r  )r8   r  r   r  r   r  r  r  r  r   transformer_outputsrL   r  r  last_non_pad_tokennon_pad_masktoken_indicesr  r  s                      r<   rT   'Zamba2ForSequenceClassification.forward  s   ( 8<zz8
)%+'8
 8
 ,A.M* "+J&,,Q/J;;##+
a\]];;##+!#"%)A)AAEEfmmUZU`U`aL!LL)<V]]Z_ZeZefM"/,">!F!Fr!J!#>>**+ ,Z Z
 u||J}}MOaab%% $V=Y]YdYdhnD 0 /??-;;*55
 	
r>   r  )NNNNNNNr   )rV   rW   rX   rY   r*   r1   r   r   r3   rt  r   r	   ru  r  r   r   r   r   r   rT   rZ   r[   r\   s   @r<   r  r    s   | 
  .2.204(,26*.!%-.@
##d*@
 t+@
 &&-	@

 @
 ((4/@
   4'@
 $;@
 ell*@
 +,@
 
1	1@
  @
r>   r  )r  r  r  r  )Mr  collections.abcr   	itertoolsr   r3   r    r   r  activationsr   cache_utilsr	   r
   integrations.hub_kernelsr   masking_utilsr   modeling_outputsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   utils.genericr   utils.import_utilsr   utils.output_capturingr   llama.modeling_llamar   r   mamba2.modeling_mamba2r   r   r   zamba.modeling_zambar    r!   r"   r#   r$   r%   r&   r'   r(   configuration_zamba2r*   _CONFIG_FOR_DOC
get_loggerrV   r   Moduler,   r^   rd   rg   r   rW  rg  rw  r~  r  r  r  r  __all__r`   r>   r<   <module>r     sP    $    & ! . 8 / Y F & l l 7 9 5 M Y Y
 
 
 / '			H	%; ;*	L 		0 	j)n j)ZCOryy COL'		 'T1"< 1hZ4 Z<( <~ !O ! !BD
*3 D
N( H
&D H
Vr>   