
    Z j
                    v   S SK r S SKJr  S SKJr  S SKJr  S SKrS SKJr  SSK	J
r  SSKJr  SS	KJrJr  SS
KJr  SSKJr  SSKJr  SSKJr  SSKJr  SSKJrJrJr  SSKJ r J!r!  SSK"J#r#J$r$  SSK%J&r&  SSK'J(r(J)r)J*r*J+r+J,r,  SSK-J.r.J/r/  SSK0J1r1  SSK2J3r3  SSK4J5r5  \,Rl                  " \75      r8 " S S\R                  Rr                  5      r: " S S\Rr                  5      r; " S S\Rr                  5      r<S\Rz                  S \>S!\Rz                  4S" jr? SJS#\Rr                  S$\Rz                  S%\Rz                  S&\Rz                  S'\Rz                  S-  S(\@S)\@4S* jjrAS+ rB\" S,5      SKS- j5       rC " S. S/\Rr                  5      rDS0\Rz                  S1\>4S2 jrES3 rFS4 rG " S5 S6\Rr                  5      rH " S7 S8\Rr                  5      rI " S9 S:\Rr                  5      rJ " S; S<\5      rK " S= S>\5      rL\) " S? S@\$5      5       rM\) " SA SB\M5      5       rN " SC SD\M\5      rO\)" SESF9 " SG SH\M5      5       rP/ SIQrQg)L    N)Callable)cycle)Optional)nn   )initialization)ACT2FN)CacheDynamicCache)GenerationMixin)use_kernel_func_from_hub)lazy_load_kernel)create_causal_mask)GradientCheckpointingLayer)BaseModelOutputWithPastCausalLMOutputWithPast SequenceClassifierOutputWithPast)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tupleis_torchdynamo_compilinglogging)maybe_autocastmerge_with_config_defaults)resolve_internal_import)capture_outputs   )Zamba2Configc                   6   ^  \ rS rSrSU 4S jjrSS jrSrU =r$ )Zamba2RMSNormGated3   c                    > [         TU ]  5         [        R                  " [        R
                  " U5      5      U l        X0l        X l        g N)	super__init__r   	Parametertorchonesweightvariance_epsilon
group_size)selfhidden_sizer0   eps	__class__s       {/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/zamba2/modeling_zamba2.pyr*   Zamba2RMSNormGated.__init__4   s2    ll5::k#:; #$    c                 X   UR                   nUR                  [        R                  5      nUb?  U[        R
                  R                  UR                  [        R                  5      5      -  nUR                  Gt pEXPR                  -  nUR                  " / UQUPU R                  P76 nUR                  S5      R                  SSS9nU[        R                  " XR                  -   5      -  nUR                  " / UQX`R                  -  P76 nU R                  UR                  U5      -  $ N   T)keepdim)dtypetor,   float32r   
functionalsilushaper0   viewpowmeanrsqrtr/   r.   )	r1   hidden_statesgateinput_dtypeprefix_dimslast_dimgroup_counthidden_states_groupvariances	            r5   forwardZamba2RMSNormGated.forward:   s    #))%((7)BMM,>,>twwu}}?U,VVM!.!4!4//1+00\+\{\DOO\&**1-222t2D1EKKK`K`@`4aa+00]+]{__?\]{{]--k:::r7   )r0   r/   r.   gư>r(   )__name__
__module____qualname____firstlineno__r*   rO   __static_attributes____classcell__r4   s   @r5   r%   r%   3   s    %; ;r7   r%   c                   x   ^  \ rS rSrS
S\SS4U 4S jjjrS\R                  S\R                  4S jrS r	S	r
U =r$ )Zamba2RMSNormH   r3   returnNc                    > [         TU ]  5         [        R                  " [        R
                  " U5      5      U l        X l        g)z,
Zamba2RMSNorm is equivalent to T5LayerNorm
N)r)   r*   r   r+   r,   r-   r.   r/   )r1   r2   r3   r4   s      r5   r*   Zamba2RMSNorm.__init__I   s/     	ll5::k#:; #r7   rG   c                    UR                   nUR                  [        R                  5      nUR	                  S5      R                  SSS9nU[        R                  " X0R                  -   5      -  nU R                  UR                  U5      -  $ r9   )	r=   r>   r,   r?   rD   rE   rF   r/   r.   )r1   rG   rI   rN   s       r5   rO   Zamba2RMSNorm.forwardQ   sw    #))%((7 $$Q',,R,>%H?T?T4T(UU{{]--k:::r7   c                 ^    [        U R                  R                  5       SU R                   3$ )Nz, eps=)tupler.   rB   r/   )r1   s    r5   
extra_reprZamba2RMSNorm.extra_reprX   s*    ))*+6$2G2G1HIIr7   )r/   r.   rQ   )rR   rS   rT   rU   floatr*   r,   TensorrO   rc   rV   rW   rX   s   @r5   rZ   rZ   H   sB    $ $$ $ $;U\\ ;ell ;J Jr7   rZ   c                      ^  \ rS rSr% \R
                  \S'   SS\4U 4S jjjr\	   SS\S-  S\
S   S\S-  S	\S
\4   4S jj5       r\R                  " 5       \S 5       5       rSrU =r$ )Zamba2RotaryEmbedding\   inv_freqNconfigc                   > [         TU ]  5         UR                  U l        UR                  U l        Xl        U R
                  R                  S   U l        U R                  nU R                  S:w  a  [        U R                     nU" U R
                  U5      u  o@l
        U R                  SUSS9  U R                  SUR                  5       SS9  g )N	rope_typedefaultrj   F)
persistentoriginal_inv_freq)r)   r*   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenrk   rope_parametersrm   compute_default_rope_parametersr   attention_scalingregister_bufferclone)r1   rk   devicerope_init_fnrj   r4   s        r5   r*   Zamba2RotaryEmbedding.__init___   s    "("@"@$*$B$B!44[A!%!E!E>>Y&.t~~>L+7V+L((ZeD0(..2BuUr7   ry   ztorch.deviceseq_lenr\   ztorch.Tensorc           	         U R                   S   n[        U SS5      =(       d    U R                  U R                  -  nSnSU[        R
                  " SUS[        R                  S9R                  U[        R                  S9U-  -  -  nXe4$ )	aH  
Computes the inverse frequencies according to the original RoPE implementation
Args:
    config ([`~transformers.PreTrainedConfig`]):
        The model configuration.
    device (`torch.device`):
        The device to use for initialization of the inverse frequencies.
    seq_len (`int`, *optional*):
        The current sequence length. Unused for this type of RoPE.
Returns:
    Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
    post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).

rope_thetahead_dimNg      ?r   r:   r=   ry   r=   )	rt   getattrr2   num_attention_headsr,   arangeint64r>   re   )rk   ry   r|   basedimattention_factorrj   s          r5   ru   5Zamba2RotaryEmbedding.compute_default_rope_parameterso   s    & %%l3fj$/c63E3EIcIc3c U\\!S!5;;?BB&X]XcXcBdgjjk
 ))r7   c                 L   U R                   S S S 2S 4   R                  5       R                  UR                  S   SS5      R	                  UR
                  5      nUS S 2S S S 24   R                  5       n[        UR
                  R                  [        5      (       a0  UR
                  R                  S:w  a  UR
                  R                  OSn[        USS9   UR                  5       UR                  5       -  R                  SS5      n[        R                  " Xf4SS	9nUR                  5       U R                  -  nUR                  5       U R                  -  n	S S S 5        WR	                  UR                   S
9W	R	                  UR                   S
94$ ! , (       d  f       N@= f)Nr   r;   r"   mpscpuF)device_typeenabledr:   r   r   )rj   re   expandrB   r>   ry   
isinstancetypestrr   	transposer,   catcosrv   sinr=   )
r1   xposition_idsinv_freq_expandedposition_ids_expandedr   freqsembr   r   s
             r5   rO   Zamba2RotaryEmbedding.forward   sN    !MM$4-8>>@GGHZHZ[\H]_acdehhijiqiqr ,QaZ 8 > > @'1!((--'E'E!((--[`J`ahhmmfkUC&,,.1F1L1L1NNYYZ[]^_E))UN3C'')d444C'')d444C	 D vvAGGv$cff177f&;;; DCs   BF
F#)rv   rk   rr   rs   rm   r(   NNN)rR   rS   rT   rU   r,   rf   __annotations__r#   r*   staticmethodr   intrb   re   ru   no_gradr   rO   rV   rW   rX   s   @r5   rh   rh   \   s    llV| V V  &*+/"*t#*(* t* 
~u$	%	* *: ]]_<  <r7   rh   rG   n_repr\   c                     U R                   u  p#pEUS:X  a  U $ U SS2SS2SSS2SS24   R                  X#XU5      n U R                  X#U-  XE5      $ )z
This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
r"   N)rB   r   reshape)rG   r   batchnum_key_value_headsslenr   s         r5   	repeat_kvr      s_    
 2?1D1D.Ez!!Qa"23::5W\dlmM  e(CTTTr7   modulequerykeyvalueattention_maskscalingdropoutc                    [        X R                  5      n[        X0R                  5      n	[        R                  " XR	                  SS5      5      U-  n
Ub  X-   n
[
        R                  R                  U
S[        R                  S9R                  UR                  5      n
[
        R                  R                  XU R                  S9n
[        R                  " X5      nUR	                  SS5      R                  5       nX4$ )Nr:   r   r;   )r   r=   )ptrainingr"   )r   num_key_value_groupsr,   matmulr   r   r@   softmaxr?   r>   r=   r   r   
contiguous)r   r   r   r   r   r   r   kwargs
key_statesvalue_statesattn_weightsattn_outputs               r5   eager_attention_forwardr      s     3 ; ;<JU$?$?@L<<';';Aq'ABWLL!#4==((2U]](SVVW\WbWbcL==((6??([L,,|:K''1-88:K$$r7   c                     U SSU R                   S   S-  24   nU SU R                   S   S-  S24   n[        R                  " U* U4SS9$ )z*Rotates half the hidden dims of the input..Nr;   r:   r   )rB   r,   r   )r   x1x2s      r5   rotate_halfr      sZ    	
3"!''"+"""	#B	
3q ""	#B99rc2YB''r7   rotary_pos_embc                     UR                  U5      nUR                  U5      nX-  [        U 5      U-  -   nX-  [        U5      U-  -   nXV4$ )aI  Applies Rotary Position Embedding to the query and key tensors.

Args:
    q (`torch.Tensor`): The query tensor.
    k (`torch.Tensor`): The key tensor.
    cos (`torch.Tensor`): The cosine part of the rotary embedding.
    sin (`torch.Tensor`): The sine part of the rotary embedding.
    unsqueeze_dim (`int`, *optional*, defaults to 1):
        The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
        sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
        that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
        k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
        cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
        the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
Returns:
    `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
)	unsqueezer   )qkr   r   unsqueeze_dimq_embedk_embeds          r5   apply_rotary_pos_embr      sS    & --
&C
--
&Cw;q>C/0Gw;q>C/0Gr7   c                   X  ^  \ rS rSrSr   SS\S\S-  S\S-  S\S-  4U 4S jjjr   SS	\R                  S\S
\R                  S-  S\
S-  S\\R                  \R                  4   S-  S\\   S\\R                  \R                  S-  \\R                     S-  4   4S jjrSrU =r$ )Zamba2Attention   a*  
Multi-headed attention from 'Attention Is All You Need' paper.

Adapted from transformers.models.mistral.modeling_mistral.MistralAttention:
The input dimension here is attention_hidden_size = 2 * hidden_size, and head_dim = attention_hidden_size // num_heads.
The extra factor of 2 comes from the input being the concatenation of original_hidden_states with the output of the previous (mamba) layer
(see fig. 2 in https://huggingface.co/papers/2405.16712).
Additionally, replaced
attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) with
attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim/2)
Finally, this attention layer contributes to tied transformer blocks aimed to increasing compute without increasing model size. Because this
layer is tied, un-tied adapters (formally the same as LoRA but used in the base model) modules are added to the q, k, v projectors to increase
expressivity with a small memory overhead (see Fig. 2 of https://huggingface.co/papers/2411.15242).
Nrk   	layer_idxnum_fwd_mem_blocksblock_idc           
        > [         TU ]  5         Xl        X l        UR                  U l        UR
                  U l        UR                  UR                  -  U l	        UR                  U l
        U R                  S-  S-  U l        SU l        UR                  U l        [        R                  " UR                  UR                  U R                  -  SS9U l        [        R                  " UR                  UR                  U R                  -  SS9U l        [        R                  " UR                  UR                  U R                  -  SS9U l        [        R                  " UR                  U R                  -  UR&                  SS9U l        X0l        UR,                  U l        X@l        UR2                  (       Ga  [        R4                  " / 5      U l        [        R4                  " / 5      U l        [        R4                  " / 5      U l        [=        U R*                  5       GH  nXQR>                  -  U:X  Gar  [        R@                  " [        R                  " U R                  U R                  RB                  SS9[        R                  " U R                  RB                  U R                  SS95      n[        R@                  " [        R                  " U R                  U R                  RB                  SS9[        R                  " U R                  RB                  U R                  SS95      n[        R@                  " [        R                  " U R                  U R                  RB                  SS9[        R                  " U R                  RB                  U R                  SS95      nO?[        RD                  " 5       n[        RD                  " 5       n[        RD                  " 5       nU R6                  RG                  U5        U R8                  RG                  U5        U R:                  RG                  U5        GM     [I        U R.                  5       V	V
s0 s H  u  pX_M	     sn
n	U l%        g s  sn
n	f )Nr:   g      TFbias)&r)   r*   rk   r   attention_hidden_sizeattention_head_dimr   r   r   r   rq   r   	is_causalattention_dropoutr   Linearq_projk_projv_projr2   o_projr   hybrid_layer_idslayer_block_mapr   use_shared_attention_adapter
ModuleListlinear_q_adapter_listlinear_k_adapter_listlinear_v_adapter_listrangenum_mem_blocks
Sequentialadapter_rankIdentityappend	enumerate	layer_dic)r1   rk   r   r   r   ilinear_q_adapterlinear_k_adapterlinear_v_adapterindexr   r4   s              r5   r*   Zamba2Attention.__init__   s>    	"%+%A%A"11$*$>$>&B\B\$\!'-'E'E$)d2!'!9!9ii < <f>X>X[_[h[h>hotuii < <f>X>X[_[h[h>hotuii < <f>X>X[_[h[h>hotuii : :T]] JFL^L^ejk"4%66 ...)+r):D&)+r):D&)+r):D&4223,,,8')}}		$"<"<dkk>V>V]bc		$++":":D<V<V]bc($ (*}}		$"<"<dkk>V>V]bc		$++":":D<V<V]bc($ (*}}		$"<"<dkk>V>V]bc		$++":":D<V<V]bc($
 (*{{}$'){{}$'){{}$**112BC**112BC**112BC) 4, <ETEYEY;Z[;Z<5%,;Z[[s   Q/rG   r   past_key_valuesposition_embeddingsr   r\   c                    UR                   S S n/ UQSPU R                  P7nU R                  U5      n	U R                  U5      n
U R	                  U5      nU R
                  R                  (       aT  U R                  U   nXR                  U   " U5      -   n	XR                  U   " U5      -   n
XR                  U   " U5      -   nU	R                  U5      R                  SS5      n	U
R                  U5      R                  SS5      n
UR                  U5      R                  SS5      nU R
                  R                  (       a  Uu  p[        XX5      u  pUb  UR                  XU5      u  p[         R"                  " U R
                  R$                  [&        5      nU" U U	U
UU4U R(                  (       d  SOU R*                  U R,                  S.UD6u  nnUR.                  " / UQSP76 R1                  5       nU R3                  U5      nUU4$ )Nr;   r"   r:           )r   r   )rB   r   r   r   r   rk   r   r   r   r   r   rC   r   use_mem_roper   updater   get_interface_attn_implementationr   r   r   r   r   r   r   )r1   rG   r   r   r   r   r   input_shapehidden_shapequery_statesr   r   adapter_layer_idxr   r   attention_interfacer   r   s                     r5   rO   Zamba2Attention.forward+  s    $))#2.88b8$--8{{=1[[/
{{=1;;33 $y 9'*D*DEV*WXe*ffL#&@&@AR&STa&bbJ'*D*DEV*WXe*ffL#((6@@AF__\2<<QB
#((6@@AF;;##*HC';LVY'_$L&'6'='=jXa'b$J(?(M(MKK,,.E)
 %8	%
  $}}C$2H2HLL	%
 	%
!\ "));;;;FFHkk+.L((r7   )r   r   r   rk   r   r   r   r   r   r   r   r   r   rq   r   r   r   r   r   r   r   )rR   rS   rT   rU   __doc__r#   r   r*   r,   rf   r
   rb   r   r   rO   rV   rW   rX   s   @r5   r   r      s   $ !%)-#6\6\ :6\  $J	6\
 *6\ 6\x /3(,HL1)||1) 1) t+	1)
 1) #5<<#=>E1) +,1) 
u||U\\D0%2E2LL	M1) 1)r7   r   input_tensorpad_sizec                     [        U R                  5      S:X  a
  SSSSSUSS4OSSSUSS4n[        R                  R                  R                  XSSS9$ )zv
Padding x tensor with `pad_size` on the seq_len dim (dim=1)

Assumes that we only have tensors of either size 4 or 3
   r   constant)moder   )lenrB   r,   r   r@   pad)r  r  	pad_shapes      r5   pad_tensor_by_sizer  b  sd     47|7I7I3Ja3OAq!Q!Q/VWYZ\]_gijlmUnI88""<ST"UUr7   c                    [        X5      n [        U R                  5      S:X  a-  U R                  U R                  S   SX R                  S   5      $ U R                  U R                  S   SX R                  S   U R                  S   5      $ )z
Padding input_tensor with `pad_size` on the seq_len dim (dim=1) and
simultaneously splitting it into chunk sequences.

Assumes that we only have tensors of either size 4 or 3
r   r   r;   r:   )r  r  rB   r   )r  r  
chunk_sizes      r5   reshape_into_chunksr  m  s     &l=L
<!###L$6$6q$92zK]K]^_K`aa ##q!2z3E3Ea3H,J\J\]^J_
 	
r7   c           	      
   U R                  S5      nU S   R                  " / U R                  5       QUP76 n [        R                  " [        R                  " XU R
                  [        R                  S9SS9nU R                  U) S5      n [        R                  " U SS9n[        R                  " [        R                  " XU R
                  [        R                  S9SS9nUR                  U) [        R                  * 5      nU$ )zg
More stable segment sum calculation. Uses cumulative sums and masking instead of direct subtractions.
r;   .Nr   )diagonalr   r   )
sizer   r,   trilr-   ry   boolmasked_fillcumsuminf)r  r  masktensor_segsums       r5   segment_sumr    s     ""2&J  	*11S<3D3D3FS
SL::ejj@S@S[`[e[efqstD++TE15LLL26M ::ejj@S@S[`[e[efqrsD!--teeiiZ@Mr7   c                     ^  \ rS rSrSrSS\S\S-  4U 4S jjjr  SS\R                  S\
S-  S	\R                  S-  4S
 jjrSS\
S-  S	\R                  S-  4S jjr  SS\
S-  S	\R                  S-  4S jjrSrU =r$ )Zamba2MambaMixeri  uo  
Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`.
A, D are input independent (see Mamba paper [1] Section 3.5.2 "Interpretation of A" for why A isn't selective)
∆, B, C are input-dependent (this is a key difference between Mamba and the linear time invariant S4,
and is why Mamba is called **selective** state spaces)
Nrk   r   c           	        > [         TU ]  5         Xl        UR                  U l        UR                  U l        UR                  U l        [        UR                  U R                  -  5      U l
        X l        UR                  U l        SU l        [        R                  " 5       U l        UR"                  U l        UR$                  U l        UR(                  U l        U R                  R,                  U l        UR0                  U l        UR2                  U l        UR4                  U l        UR6                  U l        U R                  SU R&                  -  U R
                  -  -   U l        [        R:                  " U R8                  U R8                  SUR                  U R8                  UR                  S-
  S9U l        U R                  U R8                  -   U R.                  -   n[        R>                  " U R                  UUR@                  S9U l!        [        RD                  " [F        RH                  " U R.                  5      5      U l%        [F        RL                  " SU R.                  S-   5      n[        RD                  " [F        RN                  " U5      5      U l(        [S        U R                  U R                  U R&                  -  SS9U l*        [        RD                  " [F        RH                  " U R.                  5      5      U l+        [        R>                  " U R                  U R                  UR@                  S9U l,        URZ                  (       as  []        S	5      n[_        US
S 5      q0[_        USS 5      q1[]        S5      n[e        USS9q3[e        USS9q4[e        USS9q5[m        [f        [h        [j        [b        [`        45      q7OS q0S q1S q3S q4S q5Sq7[_        USS5      (       a"  [n        (       d  [p        Rs                  S5        g g g )NrA   r:   Tr"   )in_channelsout_channelsr   kernel_sizegroupspaddingr   gh㈵>)r0   r3   zcausal-conv1dcausal_conv1d_updatecausal_conv1d_fnz	mamba-ssmz8ops.triton.selective_state_update.selective_state_update)chained_pathz1ops.triton.ssd_combined.mamba_chunk_scan_combinedz8ops.triton.ssd_combined.mamba_split_conv1d_scan_combinedFuse_mamba_kernelsa  The fast path is not available because one of `(selective_state_update, causal_conv1d_fn, causal_conv1d_update)` is None. Falling back to the naive implementation. To install follow https://github.com/state-spaces/mamba/#installation and https://github.com/Dao-AILab/causal-conv1d):r)   r*   rk   r2   mamba_d_statessm_state_sizemamba_d_convconv_kernel_sizer   mamba_expandintermediate_sizer   use_conv_bias
activationr   SiLUactuse_mem_eff_pathmamba_ngroupsn_groupsmamba_headdimr   n_mamba_heads	num_headsr  time_step_limittime_step_mintime_step_maxconv_dimConv1dconv1dr   add_bias_linearin_projr+   r,   r-   dt_biasr   logA_logr%   normDout_projr'  r   r   r$  r%  r    selective_state_updatemamba_chunk_scan_combined mamba_split_conv1d_scan_combinedallis_fast_path_availableloggerwarning_once)r1   rk   r   projection_sizeAcausal_conv1d	mamba_ssmr4   s          r5   r*   Zamba2MambaMixer.__init__  sN   !--$22 & 3 3!$V%8%84;K;K%K!L"#11 779 & 7 7,,,,22 ++%55#11#11..T]]1BTEXEX1XXii++==''!+
 004==@4>>Qyy''
 ||EJJt~~$>? LLDNNQ./\\%))A,/
&""t/E/E/V\`
	 ejj89		$"8"8$:J:JQWQgQgh ##,_=M#*=:PRV#W &}6H$O(5I%<(b&" )@([)% 0G(b0, &)*-4$(&" $( #%)"(,%/3,%*"6.55>T>T> ?U5r7   rG   cache_paramsr   c                    UR                   u  pEnU R                  U R                  -  nSU R                  -  SU R                  -  U R                  -  -   U R                  -   nUGbH  UR                  U R                  5      (       Ga'  U R                  UR                  S5      5      n	U	R                   S   U-
  S-  n
XU R                  U R                  U R                  /n[        R                  " XSS9u    plp[        UUR                  U R                     R                  U R                  R                   R                  S5      U R                  R"                  U R$                  5      n[        R                  " UU R                  Xw/SS9u  pn[        R&                  " U R(                  R+                  5       5      * nUS S 2S S4   S S 2S S 2S 4   R-                  SU R.                  U R                  5      R1                  [        R2                  S9nUS S 2S S 2S 4   R-                  SSU R.                  5      nU R4                  S S 2S S4   R-                  SU R.                  5      nU R6                  S S 2S S4   R-                  SU R.                  5      nUR9                  X@R                  UR                   S   U R                  -  5      nUR9                  X@R                  UR                   S   U R                  -  5      nUR9                  X@R                  U R.                  5      n[;        UR                  U R                     R<                  UUUUUUS USS9
nUR9                  X@R                  U R.                  -  5      nU R?                  X5      nU RA                  U5      S S 2S S4   nU$ UbG  [        RB                  " US:H  5      (       d)  URD                  nXS S 2S S 2S 4   -  R1                  U5      nU R                  U5      n[        R&                  " U R(                  R+                  5       5      * nU RF                  c  0 OS	U RF                  0nUb  [        RB                  " US:H  5      nOSnU RH                  (       Ga   U RJ                  (       a  Uc  U(       a  [M        UU R                  R                   R                  S5      U R                  R"                  U R4                  U4U R6                  U RN                  S U R$                  U R>                  R                   U R>                  RP                  U R@                  R                   U R@                  R"                  U R.                  U R                  S
SS.UD6u  nnU$ [        R                  " UU R                  U R                  U R                  /SS9u  pnUbj  URS                  SS5      n[T        RV                  RY                  UU RZ                  UR                   S   -
  S45      nUR]                  UU R                  5      n[^        b  U R$                  S;  aJ  U Ra                  U R                  URS                  SS5      5      RS                  SS5      S S 2S U24   5      nOv[_        URS                  SS5      U R                  R                   R                  S5      U R                  R"                  U R$                  S9RS                  SS5      S S 2S U24   n[        R                  " UU R                  Xw/SS9u  pnUbG  [        RB                  " US:H  5      (       d)  URD                  nXS S 2S S 2S 4   -  R1                  U5      n[c        UR9                  XESU R.                  5      UUUR9                  XEU R                  S5      UR9                  XEU R                  S5      4U RN                  U R6                  S S SU R4                  SS.UD6u  nnUb  Ub  URe                  UU R                  5        UR9                  XES5      nU R?                  UU5      nU RA                  U5      nU$ )Nr:   r"   r;   r   .r   T)zr@  dt_softplusdt_limitF)rD  r  seq_idxr/  rmsnorm_weightrmsnorm_epsoutproj_weightoutproj_biasheaddimngroupsnorm_before_gatereturn_final_statesr   )rA   swish)r   r.   r   r/  )r  rD  rT  rW  r_  r@  rU  )3rB   r4  r)  r-  r7  has_previous_stater   r?  squeezer;  r,   splitr$  layersconv_statesr=  r.   r   r/  exprB  re   r   r   r>   r?   r@  rD  rC   rF  recurrent_statesrC  rE  rI  r=   r8  r2  r   rH  r  r/   r   r   r@   r	  r+  update_conv_stater%  r1  rG  update_recurrent_state)r1   rG   rR  r   
batch_sizer|   _groups_time_state_sized_to_removein_projected_statesd_mlpsplit_projection_dimrH   hidden_states_B_CdtBCrN  r@  rD  hidden_states_reshapedoutr=   projected_statesdt_limit_kwargsinput_not_masked	ssm_state	time_stephidden_states_B_C_t
conv_statescan_outputs                                  r5   cuda_kernels_forward%Zamba2MambaMixer.cuda_kernels_forward  sY    "/!4!4
Q!%1D1D!D$0001t}}3DtGZGZ3ZZ]a]k]kk #(G(G(W(W"&,,}/D/DQ/G"H(..r2[@QFE$)$2H2H$--Y]YgYg#h 05<Okm0n-Aq) 4!##DNN3??""**1-  ! #(++!'')?X#Ma
 4::++-..A!T3,1d
+222t}}dFYFYZ]]didqdq]rAAq$J&&r2t}}=Bll1dC<077DMMJGq$|$++B>Az==!''!*2MNAz==!''!*2MNA%2%7%7
NNTXTaTa%b"2##DNN3DD& M *..z>>DMM;YZM IIm:M--.q$|<Cz 
u )%))Na<O2P2P%++!.1d
1K!K O OPU V#||M:4::++-..A$($8$8$@bzSWSgSgFhO)#(99^q-@#A #' $$$<;OTd!A$KK&&..q1KK$$LL" ff# ##'99#3#3 $		 : :#'==#7#7!%!3!3 MM MM%*(,#"$ &%"YX 
m 6;[[$++T]]DNNK62  +*;*E*Ea*K'!#!2!2+d.C.CFYF_F_`bFc.cef-g"J ".!?!?
DNN![J#+tFW/W(,$5$?$?1$EFPPQRTUVWXZb[bZbWbc)% )9+55a;#{{1199!<![[--#'??	)
  i1oa'k)3% ',kk%++-C\'#!
 "-eiiRS@S6T6T)//E%2Aq$J5O%O$S$STY$ZM)B!&&zBNFF:rBFF:rB*  $ff (, LL $* &*&Y (\-E 77	4>>R)..zBG"iiT:mmK0
r7   c                 F   UR                   u  pEnUR                  nUb2  UR                  U R                  5      (       a  U R	                  U5      nO1Ub  XS S 2S S 2S 4   -  R                  U5      nU R	                  U5      nUR                   S   SU R                  -  -
  SU R                  -  U R                  -  -
  U R                  -
  S-  n	UR                  XU R                  U R                  U R                  /SS9u    pjpUR                  SS5      nUS L=(       a    UR                  U R                  5      nU(       a  UR                  XR                  5      n[        R                  " XR                   R"                  S S 2SS S 24   -  SS9nU R$                  (       a  XR                   R&                  -  nU R)                  U5      R                  U5      S S 2S S4   nOUbW  [*        R,                  R/                  UU R0                  UR                   S   -
  S45      nUR                  XR                  5      nU R)                  U R!                  U5      SS U24   R                  SS5      5      nUb)  UR                  nXS S 2S S 2S 4   -  R                  U5      n[        R                  " XR                  U R                  U R                  -  U R                  U R                  -  /SS9u  pn[        R2                  " U R4                  R7                  5       5      * nU(       GaX  UR8                  S:X  a
  US S 2S S4   OUS S 2SS S 24   S S 2S S4   nUR                  SS5      R;                  XLR                   S   U R<                  5      nU R>                  S   R;                  U R>                  R                   S   U R<                  5      n[        R*                  R,                  RA                  UUR                  UR                  5      -   5      n[        RB                  " XRD                  5      nUS   R;                  U R                  U R<                  U R                  5      R                  [        RF                  S	9n[        R2                  " US   U-  5      nURI                  X@R                  S5      SS S S 24   nUR;                  X@R                  U R                  U R                  -  UR                   S   5      RK                  5       nURI                  USUR                   S   5      nUS   USS S S 24   -  nURI                  USU R<                  5      nUUS   -  nURL                  U R                     RN                  RQ                  5       nUU-  U-   nURS                  UU R                  5      nURI                  X@R                  S5      SS S S 24   nUR;                  X@R                  U R                  U R                  -  UR                   S   5      RK                  5       nURI                  USUR                   S   5      nUR                  UR                  5      nURU                  X@R                  -  U R<                  U R                  5      nURU                  X@R                  -  U R                  S5      n[        RV                  " UU5      nURU                  X@R                  U R<                  5      nU RX                  S   R;                  U RX                  R                   S   U R<                  5      nUUU-  -   R                  UR                  5      nURI                  US5      S S 2S S4   nGO[*        R,                  RA                  XR>                  -   5      n[        RB                  " XRD                  5      nURI                  XESU R<                  5      R7                  5       nURI                  XESU R                  5      R7                  5       nURI                  XESU R                  5      R7                  5       nUR[                  U R                  U R                  -  SU R                  S
9nUR[                  U R                  U R                  -  SU R                  S
9nU R\                  XPR\                  -  -
  U R\                  -  nU RX                  S   [_        UU5      -  nXS   -  nUR                  UR                  5      U-  nUUUU4 Vs/ s H  n[a        UUU R\                  5      PM     snu  nnnnURc                  SSSS5      n[        Rd                  " USS9n[        R2                  " [g        U5      5      nUS S 2S S 2S S 2S S S 2S S 24   US S 2S S 2S S S 2S S 2S S 24   -  n U R                  SS9n!U!S   URc                  SSSSS5      S   -  n"U"R                  SS9n#U#S   US S 2S S 2S 4   -  R                  S5      n$[        R2                  " US S 2S S 2S S 2SS 24   U-
  5      n%UU%Rc                  SSSS5      S   -  n&U&Rc                  SSSSS5      S   URc                  SSSSS5      SS S S 24   -  R                  SS9Rc                  SSSSS5      n'[        Rh                  " U'S S 2S S24   5      n([        Rj                  " U(U'/SS9n'[        R2                  " [g        [*        R,                  R/                  US S 2S S 2S S 2S4   S5      5      5      n)U'Rc                  SSSSS5      n*U)S   U*S S 2S S 2S S4   -  R                  SS9n+U+Rc                  SSSSS5      n,U,S S 2S S24   U,S S 2S4   n-n'[        R2                  " U5      n.USS S S 24   U'S S 2S S 2S S4   -  n/U.Rc                  SSSS5      n0U/R                  S5      U0S   -  n1U$U1-   nURI                  USU R                  U R<                  5      nUU-   nUS:  a  US S 2S U2S S 2S S 24   nURI                  XES5      nU-b  Ub  URS                  U-U R                  5        U Rm                  UU
5      n2U Ro                  U2R                  U5      5      n3U3$ s  snf )Nr;   r:   r   r"   r   .r  ).NNr   )r   output_sizer   r  )r"   r   )8rB   r=   ra  r   r?  r>   r-  r4  r)  r7  rc  r;  r   rh  r,   sumr=  r.   r.  r   r1  r   r@   r	  r+  rf  rB  re   ndimr   r   r@  softplusclampr9  r?   r   r   rd  rg  rx   ri  rC   bmmrD  repeat_interleaver  r  r  permuter  r  
zeros_liker   rC  rE  )4r1   input_statesrR  r   rj  r|   rk  r=   rw  ro  rH   rG   rr  use_precomputed_stater}  rs  rt  rN  r@  dAdBdBx
ssm_statesssm_states_reshaped
C_reshapedyrD  r  
D_residualtA_cumsumLG_intermediateGM_intermediateMY_diagdecay_statesB_decay_contractionstatesprevious_statesdecay_chunkstates_permutedresult
new_statesrz  state_decay_outC_times_statesstate_decay_out_permutedY_offr~  contextualized_statess4                                                       r5   torch_forwardZamba2MambaMixer.torch_forward  s   !-!3!3
Q""#(G(G(W(W#||L9) ,aDj/I IMMeT#||L9!''+a$2H2H.HH1t}}K\_c_r_rKrrtx  uC  uC  C  HI  I(8(>(>t55t~~V\^ )? )
%1M &//15 ,D 8 l\=\=\]a]k]k=l !%77~~VJ!IIj;;3E3EaAg3N&NTVWM!!!1!11 HH]366u=aslKM']]..!**]-@-@-DDaH
 *;;JW
 HHT[[%?XgX%N%X%XYZ\]%^_M)%++!.1d
1K!K O OPU V#kk-:P:PRVR_R_bfbubuRuw{  xE  xE  HL  H[  H[  x[  :\  bd  e!YYtzz'')**  &(WW\AtSL!r!Q'{1dC<7PBa#**:xx|T]]SBll9-44T\\5G5G5JDMMZG$$--b7::bhh3G.GHBR!3!34B/"))$..$--I\I\]``glgtgt`uA2i=1,-B
 		*mmR8dAFA]]DNNdmm4SUVU\U\]_U`allnA		*b!''"+6AI3a<0B *11*b$--PM}Y//C &,,T^^<MMSSUJ#b3.J%<<ZXJ 		*mmR8dAFA]]DNNdmm4SUVU\U\]_U`allnA		*b!''"+6A $qww/J",//*~~2Mt}}^b^q^q"r
^^ ;T=P=PRSTJ		-z:Az>>4==AA y!((a$--HA]Q&&**1773A 		*b)!T3,7A ''\\(9:BR!3!34B)11*r4==Y__aM		*D4G4GHNNPA		*r43F3FGMMOA##DNNdmm$CX\XfXf#gA##DNNdmm$CX\XfXf#gA'OO*CCtVH	*-?x-XXJ *yM9M](()B.A cpqrtuwxay%zay\]&9!Xt&Way%z"M1a 		!Q1%A||A2.H 		+a.)A q!Qa23a1dAq!8K6LLN""r"*A y\AIIaAq!,DY,OON""r"*A 	l]1a:%>>CCAFF !99XaArsl%;h%FGL"#l&:&:1aA&Fy&Q"Q)11!Q1a@K}OdOdefhiklnoqrOstwy}  @A  uA  PB  B  G  G  LM  G  N  V  V  WX  Z[  ]^  `a  cd  eF#..va!e}=OYY8a@F))K0A0A(1aQRTV;BWY_0`$abK$nnQ1a;O!/2_Q4QT_5UUZZ_`ZaF1aA6J *1crc6 2Jq"u4EIF $ii1OT1oq!T30GGN'6'>'>q!Q'J$#''+.Fy.QQE A		*b$..$--HAJA!|a'1a'(		*r2A$)A33It~~Nii4(
 !%knnU.C D$$C &{s   %!rc                     [         (       aO  SU R                  R                  R                  R                  ;   a!  [        5       (       d  U R                  XU5      $ U R                  XU5      $ )Ncuda)rJ  r?  r.   ry   r   r   r  r  )r1   rG   rR  r   r   s        r5   rO   Zamba2MambaMixer.forwardN  sV     "!f0C0C0J0J0O0O&OXpXrXr,,].YY!!-~NNr7   )rB  rD  r1  r/  r  rk   r=  r;  r+  r@  r   r2   r?  r-  r   r4  rC  r7  rE  r)  r8  r:  r9  r.  r2  r(   NN)rR   rS   rT   rU   r  r#   r   r*   r,   rf   r
   r  r  rO   rV   rW   rX   s   @r5   r  r    s    b| bd
 b bN &*.2	T||T dlT t+	Tns% s%[`[g[gjn[n s%r &*.2	
O dl
O t+	
O 
Or7   r  c                   H   ^  \ rS rSrSS\S\S-  4U 4S jjjrS	S jrSrU =r	$ )
	Zamba2MLPi[  Nrk   r   c           
        > [         T	U ]  5         Xl        UR                  U l        UR                  U l        X l        X0l        [        R                  " U R                  SU R                  -  UR                  S9U l
        [        R                  " U R                  U R                  UR                  S9U l        [        UR                     U l        [        R                  " / 5      U l        [#        U R
                  5       H  nXAR$                  -  U:X  a  [        R&                  " [        R                  " U R                  R                  U R                  R(                  SS9[        R                  " U R                  R(                  SU R                  -  SS95      nO[        R*                  " 5       nU R                   R-                  U5        M     UR.                  n[1        U5       VVs0 s H  u  pxX_M	     snnU l        gs  snnf )a9  
This MLP layer contributes to tied transformer blocks aimed to increasing compute without increasing model size. Because this layer
is tied, un-tied adapter modules (formally same as LoRA, but used in the base model) are added to the up and gate projectors to increase expressivity with a small memory overhead.
r:   r   FN)r)   r*   rk   r2   r-  r   r   r   r   r>  gate_up_proj	down_projr	   
hidden_actact_fnr   gate_up_proj_adapter_listr   r   r   r   r   r   r   r   r   )
r1   rk   r   r   r   gate_up_proj_adapterr   r   r   r4   s
            r5   r*   Zamba2MLP.__init__\  s   
 	!--!'!9!9"4 IId&6&6D<R<R8RY_YoYop4#9#94;K;KRXRhRhiV../)+r):&t../A(((H4')}}IIdkk55t{{7O7OV[\IIdkk66D<R<R8RY^_($
 (*{{}$**112FG 0 !11;D_;UV;U<5%,;UVVs   -Hc                     U R                  U5      nU R                  U   nX0R                  U   " U5      -   n[        R                  " USSS9nU R                  US   5      US   -  nU R                  U5      nU$ )Nr:   r;   r   r   r"   )r  r   r  r,   chunkr  r  )r1   hidden_stater   gate_up_stateoutputs        r5   rO   Zamba2MLP.forwardz  s{    )),7NN9-	%(F(Fy(QR^(__M1"={{=#34}Q7GG-r7   )
r  r   rk   r  r  r  r2   r-  r   r   r  r(   )
rR   rS   rT   rU   r#   r   r*   rO   rV   rW   rX   s   @r5   r  r  [  s0    W| WPSVZPZ W W< r7   r  c                      ^  \ rS rSrSS\S\S-  S\S-  4U 4S jjjr   SS\R                  S\R                  S\S	\R                  S-  S
\	S-  S\R                  S-  S\\   S\\R                     4S jjrSrU =r$ )Zamba2AttentionDecoderLayeri  Nrk   r   r   c                 "  > [         TU ]  5         X l        [        UR                  5      n[        USXBS9U l        [        XUS9U l        [        UR                  UR                  S9U l        [        UR                  UR                  S9U l        g )Nr;   )r   r   r   )r   r   r3   )r)   r*   r   r  r   r   	self_attnr  feed_forwardrZ   r   rms_norm_epsinput_layernormr2   pre_ff_layernorm)r1   rk   r   r   num_gsr4   s        r5   r*   $Zamba2AttentionDecoderLayer.__init__  sz     V,,-(2RXl%fRZ[,V-I-IvObObc -f.@.@fFYFY Zr7   rG   original_hidden_statesr   r   r   r   r\   c           	          [         R                  " X/SS9nU R                  U5      nU R                  " SUUUUUS.UD6u  pU R	                  U5      nU R                  X5      nU$ )a  
Args:
    hidden_states (`torch.FloatTensor`): output of previous Mamba layer of shape `(batch, seq_len, embed_dim)`
    original_hidden_states (`torch.FloatTensor`): word embedding output of shape `(batch, seq_len, embed_dim)`.
        This is concatenated with `hidden_states` (which is the output of the previous (mamba) layer). The
        concatenated tensor is then used as input of the pre-attention RMSNorm
        (see fig. 2 in https://huggingface.co/papers/2405.16712).
    attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
        `(batch, sequence_length)` where padding elements are indicated by 0.
    past_key_values (`Cache`, *optional*): cached past key and value projection states
    use_cache (`bool`, *optional*):
        If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
        (see `past_key_values`).
    position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
        Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
        with `head_dim` being the embedding dimension of each attention head.
r;   r   )rG   r   r   r   r    )r,   concatenater  r  r  r  )	r1   rG   r  r   r   r   r   r   rk  s	            r5   rO   #Zamba2AttentionDecoderLayer.forward  s    6 ))=*QWYZ,,];>> 
')+ 3
 
 --m<))-Cr7   )r   r  r  r  r  r  r   )rR   rS   rT   rU   r#   r   r*   r,   rf   r
   
LongTensorr   r   rb   FloatTensorrO   rV   rW   rX   s   @r5   r  r    s    [| [sTz [UX[_U_ [ [ /3(,7;)||) !&) 	)
 t+) ) #--4) +,) 
u  	!) )r7   r  c                     ^  \ rS rSrS\S\4U 4S jjr        SS\R                  S\R                  S-  S\S-  S\R                  S-  S	\R                  S-  S
\	S-  S\
S-  S\R                  S-  S\R                  S-  S\\   S\\R                  \\R                  \R                  4   S-  4   4S jjrSrU =r$ )Zamba2MambaDecoderLayeri  rk   r   c                    > [         TU ]  5         [        XS9U l        [	        UR
                  UR                  S9U l        X l        g )N)rk   r   r  )	r)   r*   r  mambarZ   r2   r  r  r   )r1   rk   r   r4   s      r5   r*    Zamba2MambaDecoderLayer.__init__  s:    %VI
,V-?-?VEXEXY"r7   NrG   r  r   causal_maskr   	use_cacher   transformer_hidden_statesr   r\   c
                 r    UnU	b  X-   OUnU R                  U5      nU R                  " SUUUS.U
D6nX-   nU$ )a  
Args:
    hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
    attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
        `(batch, sequence_length)` where padding elements are indicated by 0.
    past_key_values (`Cache`, *optional*): cached past key and value projection states
    use_cache (`bool`, *optional*):
        If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
        (see `past_key_values`).
)rG   rR  r   r  )r  r  )r1   rG   r  r   r   r  r   r  r   r  r   residuals               r5   rO   Zamba2MambaDecoderLayer.forward  sg    0 !
 :S9^M5dq 	 ,,];

 
'()
 	
 !0r7   )r  r   r  NNNNNFNN)rR   rS   rT   rU   r#   r   r*   r,   rf   r
   r  r  r   r   rb   r  rO   rV   rW   rX   s   @r5   r  r    s   #| # # 7; $.2+/(,!&049=*||* !&t 3* :	*
 t+* \\D(* * $;* &&-* $)<<$#6* +,* 
u  %(9(95;L;L(L"MPT"TT	U* *r7   r  c                     ^  \ rS rSrS\S\R                  S\4U 4S jjr        SS\	R                  S\	R                  S-  S	\S-  S
\	R                  S-  S\	R                  S-  S\S-  S\S-  S\	R                  S-  S\	R                  S-  S\\   S\\	R$                  \\	R$                  \	R$                  4   S-  4   4S jjrSrU =r$ )Zamba2HybridLayeri  shared_transformerlinearr  c                 F   > [         TU ]  5         X l        X0l        Xl        g r(   )r)   r*   r  mamba_decoderr  )r1   r  r  r  r4   s       r5   r*   Zamba2HybridLayer.__init__  s!     	""4r7   NrG   r  r   r   r  r   r  r   r   r   r\   c
           
          U R                   " U4UUUUUU	S.U
D6nU R                  U5      nU R                  " U4UUUUUS.U
D6nU$ )a  
Args:
    hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
    original_hidden_states (`torch.FloatTensor`): word embedding output that will be concatenated with
    hidden activations to form the input of the shared transformer layer.
    layer_idx (`int`): layer number.
    attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
        `(batch, sequence_length)` where padding elements are indicated by 0.
    past_key_values (`Cache`, *optional*): cached past key and value projection states
    use_cache (`bool`, *optional*):
        If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
        (see `past_key_values`).
    position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
        Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
        with `head_dim` being the embedding dimension of each attention head.
)r  r   r   r   r   r   )r  r   r   r  r   )r  r  r  )r1   rG   r  r   r   r  r   r  r   r   r   r  s               r5   rO   Zamba2HybridLayer.forward  s    < %)$;$;	%
#9&+ 3%	%
 	%
! %)KK0I$J!**
&?)+ 3
 
 r7   )r  r  r  r  )rR   rS   rT   rU   r  r   r   r  r*   r,   rf   r   r
   r  r  r   r   rb   r  rO   rV   rW   rX   s   @r5   r  r    s'   5"=5GIyy5Yp5 7; $.2+/(,!&7;044||4 !&t 34 :	4
 t+4 \\D(4 4 $;4 #--44 &&-4 +,4 
u  %(9(95;L;L(L"MPT"TT	U4 4r7   r  c                      ^  \ rS rSr% \\S'   SrSrSS/rSr	Sr
SrSrSr\\S.r\R$                  " 5       U 4S	 j5       rS
rU =r$ )Zamba2PreTrainedModeli/  rk   modelTr  r  r   )rG   
attentionsc                   > [         TU ]  U5        [        U[        5      (       Ga  [        R
                  " [        R                  " U R                  R                  5      [        R                  " U R                  R                  5      [        R                  " U R                  R                  5      -
  -  [        R                  " U R                  R                  5      -   5      R                  U R                  R                  S9nU[        R                  " [        R                  " U* 5      * 5      -   n[         R"                  " UR$                  U5        [        R&                  " SUR(                  S-   5      n[         R"                  " UR*                  [        R                  " U5      5        [         R,                  " UR.                  5        g g )N)minr"   )r)   _init_weightsr   r  r,   rf  randrk   r6  mathrA  r:  r9  r  time_step_floorexpm1initcopy_r@  r   r7  rB  ones_rD  )r1   r   rr  inv_dtrN  r4   s        r5   r  #Zamba2PreTrainedModel._init_weights?  s+   f%f.//

4;;44588DKK556$++B[B[9\\^((4;;4456 e33e4	  %))U[["%5$566FJJv~~v.Q 0 01 45AJJv||UYYq\2JJvxx  0r7   r  )rR   rS   rT   rU   r#   r   base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_flex_attn_supports_sdpa_is_statefulr  r   _can_record_outputsr,   r   r  rV   rW   rX   s   @r5   r  r  /  se    &*#,.GH"3NL0%
 ]]_! !r7   r  c                     ^  \ rS rSrSrS\4U 4S jjr\\\	      SS\
R                  S-  S\
R                  S-  S\
R                  S-  S	\S-  S
\
R                  S-  S\S-  S\\   S\\-  4S jj5       5       5       rS rSrU =r$ )Zamba2ModeliQ  zX
Model consisting of *config.num_hidden_layers* layers.

Args:
    config: Zamba2Config
rk   c                 Z  > [         TU ]  U5        Xl        UR                  U l        UR
                  U l        [        R                  " UR
                  UR                  U R                  5      U l	        UR                  U l
        U R                  5       U l        UR                  U l        [        UR                  UR                  S9U l        UR"                  (       a6  UR$                  (       a  [&        R)                  S5        [+        U5      U l        SU l        U R1                  5         g )Nr  ze`use_long_context` set to `True`: using rescaled `rope_theta` and extended `max_position_embeddings`.F)r)   r*   rk   pad_token_idpadding_idx
vocab_sizer   	Embeddingr2   embed_tokenslayers_block_type
get_layersrd  r   rZ   r  final_layernormr   use_long_contextrK  rL  rh   
rotary_embgradient_checkpointing	post_initr1   rk   r4   s     r5   r*   Zamba2Model.__init__Z  s     !.. ++LL):):F<N<NPTP`P`a!'!9!9oo'$*$?$?!,V-?-?VEXEXY&&##{ 4F;DO&+# 	r7   N	input_idsr   r   r   inputs_embedsr  r   r\   c           	         US L US L-  (       a  [        S5      eUc  U R                  U5      nUn[        R                  " U5      n	U(       a  Uc  [	        U R
                  S9nUcU  Ub  UR                  5       OSn
[        R                  " UR                  S   UR                  S9U
-   nUR                  S5      n[        U R
                  UUUUS9nU R
                  R                  (       a  U R                  XS9nOS n[        U R                  5       H  u  pU" UU	UUU4UUUUS.UD6nM     U R!                  U5      n[#        UU(       a  US	9$ S S	9$ )
NzaYou cannot specify both input_ids and inputs_embeds at the same time, and must specify either one)rk   r   r"   ry   )rk   r  r   r   r   )r   )r   r  r   r   )last_hidden_stater   )
ValueErrorr  r,   rx   r   rk   get_seq_lengthr   rB   ry   r   r   r   r  r   rd  r
  r   )r1   r  r   r   r   r  r  r   rG   r  past_seen_tokensr  r   r   layers                  r5   rO   Zamba2Model.forwardq  s    -t";<s    --i8M%!&]!; 0*$++>OCRC^==?de <<(;(;A(>}G[G[\_ooL'11!4L(;;')+%
 ;;##"&//-/"["& )$++ 6I!& !0#$7) M !7 ,,];&+/8O
 	
>B
 	
r7   c                 (   / n0 U l         SU l        / n[        U R                  5       GHQ  u  p4[	        U R
                  US9nUS:X  Ga  SU S3n[        U[        5      (       a#  [        U5      U R
                  R                  :  aH  [        U[        5      (       a  [        U5      n[        U5      nU R                   R                  Xg05        OUR                  U5        X0R
                  R                  -  n[        U R
                  US9n	[        R                   " U R
                  R"                  U R
                  R"                  SS9n
UR                  [%        XU5      5        GM@  UR                  U5        GMT     [        R&                  " U5      $ )	Nr   )r   hybridzlayers.z.shared_transformer)r   Fr   )_tied_weights_keysfirst_transformer_layer_idr   r  r  rk   r   listr  r   r   nextr   r   r  r   r   r2   r  r   )r1   rd  unique_hybrid_blockslayer_id
layer_typemamba_layerprefix_patterntarget_patternr   
attn_blocklinear_layers              r5   r	  Zamba2Model.get_layers  sK   "$*+'!$-d.D.D$E H1$++RKX%#*8*4G!H ##7>>/0DKK4N4NN!"6==/45I/J,%)*>%?N++22N3ST )//?#kk&@&@@8xX
!yy)@)@$++BYBY`ef/
+VWk*3 %F4 }}V$$r7   )r   r  rk   r  r
  r  r  rd  r  r  r  r  )NNNNNN)rR   rS   rT   rU   r  r#   r*   r   r!   r   r,   r  rf   r
   r  r  r   r   rb   r   rO   r	  rV   rW   rX   s   @r5   r  r  Q  s    | .   .2.204(,26!%@
##d*@
 t+@
 &&-	@

 @
 ((4/@
 $;@
 +,@
 
(	(@
    @
D %  %r7   r  c                   b  ^  \ rS rSrSS0rS\4U 4S jjr\\        SS\	R                  S-  S\	R                  S-  S	\	R                  S-  S
\S-  S\	R                  S-  S\	R                  S-  S\S-  S\\	R                  -  S\\   S\\-  4S jj5       5       r      SU 4S jjrSrU =r$ )Zamba2ForCausalLMi  zlm_head.weightzmodel.embed_tokens.weightrk   c                    > [         TU ]  U5        [        U5      U l        UR                  U l        [
        R                  " UR                  UR                  SS9U l        U R                  5         g NFr   )
r)   r*   r  r  r  r   r   r2   lm_headr  r  s     r5   r*   Zamba2ForCausalLM.__init__  sU      (
 ++yy!3!3V5F5FUS 	r7   Nr  r   r   r   r  labelsr  logits_to_keepr   r\   c	           
      h   U R                   " SUUUUUUS.U	D6n
U
R                  n[        U[        5      (       a  [	        U* S5      OUnU R                  USS2USS24   5      nSnUb  U R                  " UUU R                  40 U	D6n[        UUU
R                  U
R                  U
R                  S9$ )a  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

Example:

```python
>>> from transformers import AutoTokenizer, Zamba2ForCausalLM

>>> model = Zamba2ForCausalLM.from_pretrained("Zyphra/Zamba2-7B-v1")
>>> tokenizer = AutoTokenizer.from_pretrained("Zyphra/Zamba2-7B-v1")

>>> prompt = "Hey, are you conscious? Can you talk to me?"
>>> inputs = tokenizer(prompt, return_tensors="pt")

>>> # Generate
>>> generate_ids = model.generate(inputs.input_ids, max_length=30)
>>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
"Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
```)r  r   r   r   r  r  Nlosslogitsr   rG   r  r  )r  r  r   r   slicer.  loss_functionr  r   r   rG   r  )r1   r  r   r   r   r  r0  r  r1  r   outputsrG   slice_indicesr5  r4  s                  r5   rO   Zamba2ForCausalLM.forward  s    H ,0:: ,
)%+',
 ,
  118B>SV8W8W~ot4]kmA}a,?@A%% 	D &#33!//))
 	
r7   c           
      j   > U R                   R                  US'   [        T
U ]  " U4UUUUUUS.UD6n	U	$ )Nr1  )r   r   r  r   r  is_first_iteration)rk   num_logits_to_keepr)   prepare_inputs_for_generation)r1   r  r   r   r  r   r  r<  r   model_inputsr4   s             r5   r>  /Zamba2ForCausalLM.prepare_inputs_for_generation*  sU     $(;;#A#A w<	
+)'%1	
 	
 r7   )r.  r  r  NNNNNNNr   )NNNNTF)rR   rS   rT   rU   r  r#   r*   r   r   r,   r  rf   r
   r  r  r   r   r   rb   r   rO   r>  rV   rW   rX   s   @r5   r+  r+    s0   *,GH|   .2.204(,26*.!%-.@
##d*@
 t+@
 &&-	@

 @
 ((4/@
   4'@
 $;@
 ell*@
 +,@
 
'	'@
  @
J   r7   r+  a  
    The Zamba2 Model with a sequence classification head on top (linear layer).

    [`Zamba2ForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-2) do.

    Since it does classification on the last token, it requires to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
    )custom_introc                   >  ^  \ rS rSrS\4U 4S jjr\\        SS\R                  S-  S\R                  S-  S\R                  S-  S\S-  S	\R                  S-  S
\R                  S-  S\S-  S\\R                  -  S\\   S\\-  4S jj5       5       rSrU =r$ )Zamba2ForSequenceClassificationiD  rk   c                    > [         TU ]  U5        UR                  U l        [        U5      U l        [
        R                  " UR                  U R                  SS9U l        U R                  5         g r-  )
r)   r*   
num_labelsr  r  r   r   r2   scorer  r  s     r5   r*   (Zamba2ForSequenceClassification.__init__S  sS      ++ (
YYv114??O
 	r7   Nr  r   r   r   r  r0  r  r1  r   r\   c	           	         U R                   " U4UUUUUS.U	D6n
U
S   nU R                  U5      nUb  UR                  S   nOUR                  S   nU R                  R                  c  US:w  a  [        S5      eU R                  R                  c  SnOUb  XR                  R                  :g  R                  UR                  [        R                  5      n[        R                  " UR                  S   UR                  [        R                  S9nUU-  R                  S5      nO.Sn[        R                  U R                  R                   S35        U[        R                  " XR                  S	9U4   nSnUb!  U R                   " SUUUU R                  S
.U	D6n[#        UUU
R$                  U
R&                  U
R(                  S9$ )ae  
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
)r   r   r   r  r  r   Nr"   z=Cannot handle batch sizes > 1 if no padding token is defined.r;   r   z will not detect padding tokens in `inputs_embeds`. Results may be unexpected if using padding tokens in conjunction with `inputs_embeds.`r  )r5  r0  pooled_logitsrk   r3  r  )r  rG  rB   rk   r  r  r>   ry   r,   int32r   argmaxrK  rL  r4   rR   r7  r   r   rG   r  )r1   r  r   r   r   r  r0  r  r1  r   transformer_outputsrG   r5  rj  last_non_pad_tokennon_pad_masktoken_indicesrJ  r4  s                      r5   rO   'Zamba2ForSequenceClassification.forward\  s   ( 8<zz8
)%+'8
 8
 ,A.M* "+J&,,Q/J;;##+
a\]];;##+!#"%)A)AAEEfmmUZU`U`aL!LL)<V]]Z_ZeZefM"/,">!F!Fr!J!#>>**+ ,Z Z
 u||J}}MOaab%% $V=Y]YdYdhnD 0 /??-;;*55
 	
r7   )r  rF  rG  rA  )rR   rS   rT   rU   r#   r*   r   r   r,   r  rf   r
   r  r  r   r   r   rb   r   rO   rV   rW   rX   s   @r5   rD  rD  D  s   |   .2.204(,26*.!%-.@
##d*@
 t+@
 &&-	@

 @
 ((4/@
   4'@
 $;@
 ell*@
 +,@
 
1	1@
  @
r7   rD  )r+  rD  r  r  )r   )r"   )Rr  collections.abcr   	itertoolsr   typingr   r,   r    r   r  activationsr	   cache_utilsr
   r   
generationr   integrationsr   integrations.hub_kernelsr   masking_utilsr   modeling_layersr   modeling_outputsr   r   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   utils.genericr   r   utils.import_utilsr    utils.output_capturingr!   configuration_zamba2r#   
get_loggerrR   rK  Moduler%   rZ   rh   rf   r   r   re   r   r   r   r   r  r  r  r  r  r  r  r  r  r  r+  rD  __all__r  r7   r5   <module>ri     sO  *  $     & ! . ) 4 8 / 9 q q K F & l l G 9 5 . 
		H	%; ;*JBII J(><BII ><B	UU\\ 	U# 	U%,, 	U& %II%<<% 
% <<	%
 LL4'% % %2( *+ ,2y)bii y)~VU\\ VS V
((COryy COL'		 'T3")) 3l18 1h=2 =@ !O ! !B D%' D% D%Pg- gT L
&; L
L
^ kr7   