
    Z j                        S SK Jr  S SKrS SKJr  SSKJr  SSKJr  SSK	J
r
Jr  SSKJr  SS	KJrJrJrJrJr  SS
KJr  SSKJrJr  SSKJrJr  SSKJrJr  SSKJ r   SSK!J"r"J#r#J$r$J%r%  SSK&J'r'  SSK(J)r)  SSK*J+r+J,r,  SSK-J.r.  \%R^                  " \05      r1\" S5       " S S\Rd                  5      5       r3S r4\" S5      SCS j5       r5S\Rl                  S\7S\Rl                  4S jr8 SDS\Rd                  S \Rl                  S!\Rl                  S"\Rl                  S#\Rl                  S-  S$\9S%\9S&\ \"   4S' jjr:\" \55       " S( S)\Rd                  5      5       r; " S* S+\Rd                  5      r< " S, S-\Rd                  5      r=\ " S. S/\Rd                  5      5       r> " S0 S1\Rd                  5      r? " S2 S3\5      r@ " S4 S5\5      rA " S6 S7\5      rB\@\AS8.rC\# " S9 S:\B5      5       rD   SES;\Rl                  \E\Rl                     -  S-  S<\7S-  S#\Rl                  S-  S\Rl                  \7-  4S= jjrF\# " S> S?\B\5      5       rG " S@ SA\\B5      rH/ SBQrIg)F    )CallableN)nn   )initialization)ACT2FN)CacheDynamicCache)GenerationMixin)lazy_load_kerneluse_experts_implementationuse_kernel_forward_from_hubuse_kernel_func_from_hubuse_kernelized_func)create_causal_mask) GenericForSequenceClassificationGradientCheckpointingLayer)MoeCausalLMOutputWithPastMoeModelOutputWithPast)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuplelogging)merge_with_config_defaults)resolve_internal_import)OutputRecordercapture_outputs   )JambaConfigRMSNormc                   x   ^  \ rS rSrS
S\SS4U 4S jjjrS\R                  S\R                  4S jrS r	S	r
U =r$ )JambaRMSNorm8   epsreturnNc                    > [         TU ]  5         [        R                  " [        R
                  " U5      5      U l        X l        g)z+
JambaRMSNorm is equivalent to T5LayerNorm
N)super__init__r   	Parametertorchonesweightvariance_epsilon)selfhidden_sizer&   	__class__s      y/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/jamba/modeling_jamba.pyr*   JambaRMSNorm.__init__:   s/     	ll5::k#:; #    hidden_statesc                    UR                   nUR                  [        R                  5      nUR	                  S5      R                  SSS9nU[        R                  " X0R                  -   5      -  nU R                  UR                  U5      -  $ )N   T)keepdim)	dtypetor,   float32powmeanrsqrtr/   r.   )r0   r6   input_dtypevariances       r3   forwardJambaRMSNorm.forwardB   sw    #))%((7 $$Q',,R,>%H?T?T4T(UU{{]--k:::r5   c                 ^    [        U R                  R                  5       SU R                   3$ )Nz, eps=)tupler.   shaper/   )r0   s    r3   
extra_reprJambaRMSNorm.extra_reprI   s*    ))*+6$2G2G1HIIr5   )r/   r.   )gư>)__name__
__module____qualname____firstlineno__floatr*   r,   TensorrC   rH   __static_attributes____classcell__r2   s   @r3   r$   r$   8   sB    $ $$ $ $;U\\ ;ell ;J Jr5   r$   c                     U SSU R                   S   S-  24   nU SU R                   S   S-  S24   n[        R                  " U* U4SS9$ )z*Rotates half the hidden dims of the input..Nr9   r8   dim)rG   r,   cat)xx1x2s      r3   rotate_halfrZ   M   sZ    	
3"!''"+"""	#B	
3q ""	#B99rc2YB''r5   rotary_pos_embc                     UR                  U5      nUR                  U5      nX-  [        U 5      U-  -   nX-  [        U5      U-  -   nXV4$ )aI  Applies Rotary Position Embedding to the query and key tensors.

Args:
    q (`torch.Tensor`): The query tensor.
    k (`torch.Tensor`): The key tensor.
    cos (`torch.Tensor`): The cosine part of the rotary embedding.
    sin (`torch.Tensor`): The sine part of the rotary embedding.
    unsqueeze_dim (`int`, *optional*, defaults to 1):
        The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
        sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
        that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
        k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
        cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
        the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
Returns:
    `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
)	unsqueezerZ   )qkcossinunsqueeze_dimq_embedk_embeds          r3   apply_rotary_pos_embre   T   sS    & --
&C
--
&Cw;q>C/0Gw;q>C/0Gr5   r6   n_repr'   c                     U R                   u  p#pEUS:X  a  U $ U SS2SS2SSS2SS24   R                  X#XU5      n U R                  X#U-  XE5      $ )z
This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
r    N)rG   expandreshape)r6   rf   batchnum_key_value_headsslenhead_dims         r3   	repeat_kvrn   n   s_    
 2?1D1D.Ez!!Qa"23::5W\dlmM  e(CTTTr5   modulequerykeyvalueattention_maskscalingdropoutkwargsc                    [        X R                  5      n[        X0R                  5      n	[        R                  " XR	                  SS5      5      U-  n
Ub  X-   n
[
        R                  R                  U
S[        R                  S9R                  UR                  5      n
[
        R                  R                  XU R                  S9n
[        R                  " X5      nUR	                  SS5      R                  5       nX4$ )Nr8   r   r9   rU   r;   )ptrainingr    )rn   num_key_value_groupsr,   matmul	transposer   
functionalsoftmaxr=   r<   r;   ru   rz   
contiguous)ro   rp   rq   rr   rs   rt   ru   rv   
key_statesvalue_statesattn_weightsattn_outputs               r3   eager_attention_forwardr   z   s     3 ; ;<JU$?$?@L<<';';Aq'ABWLL!#4==((2U]](SVVW\WbWbcL==((6??([L,,|:K''1-88:K$$r5   c                      ^  \ rS rSrSrS\S\4U 4S jjr  SS\R                  S\R                  S-  S	\
S-  S
\\   S\\R                  \R                  S-  4   4
S jjrSrU =r$ )JambaAttention   z=Multi-headed attention from 'Attention Is All You Need' paperconfig	layer_idxc                    > [         TU ]  5         Xl        X l        [	        USUR
                  UR                  -  5      U l        UR                  UR                  -  U l	        U R                  S-  U l
        UR                  U l        SU l        [        R                  " UR
                  UR                  U R                  -  SS9U l        [        R                  " UR
                  UR                  U R                  -  SS9U l        [        R                  " UR
                  UR                  U R                  -  SS9U l        [        R                  " UR                  U R                  -  UR
                  SS9U l        g )Nrm   g      TFbias)r)   r*   r   r   getattrr1   num_attention_headsrm   rk   r{   rt   attention_dropout	is_causalr   Linearq_projk_projv_projo_proj)r0   r   r   r2   s      r3   r*   JambaAttention.__init__   s(   "
F4F4F&JdJd4de$*$>$>&B\B\$\!}}d*!'!9!9ii 2 2F4N4NQUQ^Q^4^ejkii 2 2F4N4NQUQ^Q^4^ejkii 2 2F4N4NQUQ^Q^4^ejkii : :T]] JFL^L^ejkr5   Nr6   rs   past_key_valuesrv   r'   c                    UR                   S S n/ UQSPU R                  P7nU R                  U5      R                  U5      R	                  SS5      nU R                  U5      R                  U5      R	                  SS5      nU R                  U5      R                  U5      R	                  SS5      n	Ub  UR                  XU R                  5      u  p[        R                  " U R                  R                  [        5      n
U
" U UUU	U4U R                  (       d  SOU R                  U R                   S.UD6u  pUR"                  " / UQSP76 R%                  5       nU R'                  U5      nX4$ )Nr9   r    r8           )ru   rt   )rG   rm   r   viewr}   r   r   updater   r   get_interfacer   _attn_implementationr   rz   r   rt   ri   r   r   )r0   r6   rs   r   rv   input_shapehidden_shapequery_statesr   r   attention_interfacer   r   s                r3   rC   JambaAttention.forward   sg    $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST&'6'='=jX\XfXf'g$J(?(M(MKK,,.E)
 %8	%
  $}}C$2H2HLL	%
 	%
! "));;;;FFHkk+.((r5   )r   r   rm   r   r   r   r{   r   r   rt   r   NN)rJ   rK   rL   rM   __doc__r!   intr*   r,   rO   r   r   r   rF   rC   rP   rQ   rR   s   @r3   r   r      s    Gl{ ls l" /3(,	")||") t+") 	")
 +,") 
u||U\\D00	1") ")r5   r   c                      ^  \ rS rSrSrS\4U 4S jjr  SS\R                  S\	S-  S\R                  S-  4S	 jjrSS\	S-  S\R                  S-  4S
 jjr  SS\	S-  S\R                  S-  4S jjrSrU =r$ )JambaMambaMixer   uo  
Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`.
A, D are input independent (see Mamba paper [1] Section 3.5.2 "Interpretation of A" for why A isn't selective)
∆, B, C are input-dependent (this is a key difference between Mamba and the linear time invariant S4,
and is why Mamba is called **selective** state spaces)
r   c           	        > [         TU ]  5         Xl        X l        UR                  U l        UR
                  U l        UR                  U l        UR                  UR                  -  U l
        UR                  U l        UR                  U l        UR                  U l        ["        R$                  " U R                  U R                  U R                  U R                  U R                  U R                  S-
  S9U l        UR(                  U l        [,        UR(                     U l        ["        R0                  " U R                  U R                  S-  U R                   S9U l        ["        R0                  " U R                  U R                  U R                  S-  -   SS9U l        ["        R0                  " U R                  U R                  SS9U l        [8        R:                  " SU R                  S-   5      S S S 24   nUR=                  U R                  S5      R?                  5       n["        R@                  " [8        RB                  " U5      5      U l"        ["        R@                  " [8        RF                  " U R                  5      5      U l$        ["        R0                  " U R                  U R                  U R                   S9U l%        [M        U R                  URN                  S9U l(        [M        U R                  URN                  S9U l)        [M        U R                  URN                  S9U l*        [W        S	5      n[Y        US
S 5      q-[Y        USS 5      q.[W        S5      n[_        USS9q0[Y        USS 5      q1[Y        USS 5      q2[g        [`        [b        [\        [Z        [d        45      q4[h        (       d  [j        Rm                  S5        g g )Nr    )in_channelsout_channelsr   kernel_sizegroupspaddingr8   r   FTr9   r&   zcausal-conv1dcausal_conv1d_updatecausal_conv1d_fnz	mamba-ssmz8ops.triton.selective_state_update.selective_state_update)chained_pathselective_scan_fnmamba_inner_fna  The fast path is not available because on of `(selective_state_update, selective_scan_fn, causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn)` is None. To install follow https://github.com/state-spaces/mamba/#installation and https://github.com/Dao-AILab/causal-conv1d.)7r)   r*   r   r   r1   mamba_d_statessm_state_sizemamba_d_convconv_kernel_sizemamba_expandintermediate_sizemamba_dt_ranktime_step_rankmamba_conv_biasuse_conv_biasmamba_proj_biasuse_biasr   Conv1dconv1d
hidden_act
activationr   actr   in_projx_projdt_projr,   arangerh   r   r+   logA_logr-   Dout_projr$   rms_norm_epsdt_layernormb_layernormc_layernormr   r   r   r   r   selective_state_updater   r   allis_fast_path_availableloggerwarning_once)r0   r   r   Acausal_conv1d	mamba_ssmr2   s         r3   r*   JambaMambaMixer.__init__   s   "!--$22 & 3 3!'!4!4v7I7I!I$22#33..ii..//##--))))A-
 !++&++, yy!1!143I3IA3MTXTaTabii 6 68K8KdNaNadeNe8elqryy!4!4d6L6LSWX LLD//!34T1W=HHT++R0;;=\\%))A,/
ejj)?)?@A		$"8"8$:J:JQUQ^Q^_()<)<&BUBUV'(;(;ATATU'(;(;ATATU )9&}6LdS"=2DdK %[1	!8$^"
 $I/BDI ,<dC "%#%68HJ^`no"
 &%R &r5   Nr6   cache_paramsrs   c                 	   UR                   u  pEnUS L=(       a'    UR                  U R                  5      =(       a    US:H  nU R                  U5      R	                  SS5      nUR                  SSS9u  pUb  XR                  S5      -  nU R                  R                  R                  U R                  R                  R                  S5      U R                  R                  R                  S5      5      n
U(       ao  [        UR                  S5      UR                  U R                     R                  U
U R                  R                  U R                   5      nUR                  S5      nOUbV  ["        R$                  R'                  XR(                  UR                   S   -
  S45      nUR+                  XR                  5        [-        XU R                  R                  U R                   S9nUb  XR                  S5      -  nU R/                  UR	                  SS5      5      n[0        R2                  " XR4                  U R6                  U R6                  /SS9u  pnU R9                  U5      nU R;                  U5      nU R=                  U5      nU R>                  R                  R@                  n[0        RB                  " 5          [0        RD                  " U R>                  R                  R@                  5      U R>                  R                  l         S S S 5        U R?                  U5      R	                  SS5      n[0        RB                  " 5          UU R>                  R                  l         S S S 5        [0        RF                  " U RH                  RK                  5       5      * nUb  URK                  5       OS nU(       ad  [M        UR                  U R                     RN                  US   US   UUS S 2S4   US S 2S4   U RP                  U	S   USS	9
R                  S5      nOo[S        UUUUR	                  SS5      UR	                  SS5      U RP                  RK                  5       U	USSS
9
u  nnUb  Ub  URU                  UU R                  5        U RW                  UR	                  SS5      5      nU$ ! , (       d  f       GN= f! , (       d  f       GNb= f)Nr    r8   rT   r   r9   )r   ).r   T)dt_softplus)delta_softplusreturn_last_state),rG   has_previous_stater   r   r}   chunkr]   r   r.   r   sizer   squeezelayersconv_statesr   r   r   r~   padr   update_conv_stater   r   r,   splitr   r   r   r   r   r   datano_grad
zeros_likeexpr   rN   r   recurrent_statesr   r   update_recurrent_stater   )r0   r6   r   rs   
batch_sizeseq_len_use_precomputed_statesprojected_statesgateconv_weightsr   ssm_parameters	time_stepBCtime_proj_biasdiscrete_time_stepr   scan_outputs	ssm_statecontextualized_statess                         r3   cuda_kernels_forward$JambaMambaMixer.cuda_kernels_forward  s    "/!4!4
Q$i)H)H)Xi]dhi]i 	  <<6@@AF /44QA4>%),D,DQ,GGM {{))..t{{/A/A/F/Fq/I4;;K]K]KbKbcdKef!0%%b)##DNN3??  M *33B7M' mm//@U@UXeXkXklnXo@oqr?st..{NNK,]$++JZJZgkgvgvwM%),D,DQ,GGM ]%<%<Q%BC++00$2E2EtGZGZ[ac
	a %%i0	QQ **//]]_%*%5%5dll6G6G6L6L%MDLL" !\\)4>>q!D]]_%3DLL"  YYtzz'')**3A3M--/SW!1##DNN3DDf%"6*!Q$!Q$V  im  '8"Aq!Aq!#"&'#L) $)A33It~~N !%l.D.DQ.J K$$S _ _s   AR8S
8
S

Sc           	      
   UR                   u  pEnUR                  nU R                  U5      R                  SS5      nUR	                  SSS9u  pUb  XR                  S5      -  n	UbR  UR                  U R                  5      (       a2  UR                  U R                     R                  R                  5       nO6[        R                  " X@R                  U R                  4U	R                  US9nUGbN  UR                  U R                  5      (       a  US:X  a  UR!                  XR                  5      n[        R"                  " XR$                  R&                  S S 2SS S 24   -  SS9n	U R(                  (       a  XR$                  R*                  -  n	U R-                  U	5      R/                  U5      R                  S5      n	O[0        R2                  R5                  U	U R6                  U	R                   S   -
  S45      nUR!                  XR                  5      nU R-                  U R%                  U	5      SS U24   5      n	O'U R-                  U R%                  U	5      SS U24   5      n	Ub  XR                  S5      -  n	U R9                  U	R                  SS5      5      n[        R:                  " XR<                  U R                  U R                  /SS9u  pnU R?                  U5      nU RA                  U5      nU RC                  U5      nU RE                  U5      n[0        R2                  RG                  U5      R                  SS5      n[        RH                  " U RJ                  RM                  5       5      * n[        RH                  " US S S 2S S S 24   US S 2S S 2S S 2S 4   -  5      nUS S 2S S 2S S 2S 4   US S 2S S S 2S S 24   RM                  5       -  nUU	S S 2S S 2S S 2S 4   RM                  5       -  n/ n[O        U5       H  nUS S 2S S 2US S 24   U-  US S 2S S 2US S 24   -   n[        RP                  " UR/                  U5      US S 2US S 24   R                  S5      5      nURS                  US S 2S S 2S4   5        M     [        RT                  " USS9nUXRV                  S S S 2S 4   -  -   nUU R-                  U
5      -  nUb  URY                  XR                  5        U R[                  UR                  SS5      5      nU$ )Nr    r8   rT   )devicer;   r   r9   .).rG   r;   r   r}   r   r]   r   r   r   r   cloner,   zerosr   r   r  r   sumr   r.   r   r   r   r<   r   r~   r   r   r   r   r   r   r   r   r   softplusr   r   rN   ranger|   appendstackr   r   r   )r0   input_statesr   rs   r   r   r   r;   r   r6   r   r   
conv_stater   r   r   r   r   r   
discrete_A
discrete_BdeltaB_ur   iscan_outputr   s                             r3   slow_forwardJambaMambaMixer.slow_forward{  sq   !-!3!3
Q""<<5??1E.44QA4>%),D,DQ,GGM#(G(G(W(W$++DNN;LLRRTI33T5H5HI$++5I #..t~~>>7a<);;M>>Z
 %		*{{7I7I!QPQ'7R*RXZ [%%![[%5%55M $ 7 : :5 A K KB O]]..!**]-@-@-DDaH
 *;;JW
 $])CC'M)R S HHT[[%?XgX%NOM%),D,DQ,GGM ]%<%<Q%BC++00$2E2EtGZGZ[ac
	a %%i0	QQ!\\)4]]334FGQQRSUVW YYtzz'')**YYqq$!125G1aQU5VVW
'1a61dAq=9I9O9O9QQ
aAtm < B B DDwA"1aA:.:XaAqj=QQI,,y||E':AaAgJ<P<PQS<TUKAq!G 45   kk,B7!]VVD!TM5J%JK"TXXd^3#//	>>J !%k.C.CAq.I J$$r5   c                 t   U R                   R                  (       a_  [        (       a.  SU R                  R                  R
                  R                  ;  a&  [        R                  S5        SU R                   l        U R                   R                  (       a  U R                  XU5      $ U R                  XU5      $ )NcudazFast Mamba kernels are not available. Make sure that they are installed and that the mamba module is on a CUDA device. Turning off the fast path `config.use_mamba_kernels=False` and falling back to the slow path.F)r   use_mamba_kernelsr   r   r.   r  typer   r   r   r  )r0   r6   r   rs   s       r3   rC   JambaMambaMixer.forward  s     ;;((&&&8J8J8Q8Q8V8V*VV
 -2DKK);;((,,].YY  nMMr5   )r   r   r   r   r   r   r   r   r   r   r   r1   r   r   r   r   r   r   r   r   r   r   )rJ   rK   rL   rM   r   r!   r*   r,   rO   r   
LongTensorr   r  rC   rP   rQ   rR   s   @r3   r   r      s    A{ AL &*26	c%||c% dlc% ((4/	c%LJ%ut| J%\a\l\los\s J%` &*26	N dlN ((4/	N Nr5   r   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )JambaMLPi  c                   > [         TU ]  5         Xl        UR                  U l        UR                  U l        [
        R                  " U R                  U R                  SS9U l        [
        R                  " U R                  U R                  SS9U l        [
        R                  " U R                  U R                  SS9U l	        [        UR                     U l        g NFr   )r)   r*   r   r1   r   r   r   	gate_projup_proj	down_projr   r   act_fnr0   r   r2   s     r3   r*   JambaMLP.__init__  s    !--!'!9!94#3#3T5K5KRWXyy!1!143I3IPUV4#9#94;K;KRWXV../r5   c                     U R                  U R                  U R                  U5      5      U R                  U5      -  5      nU$ )N)r   r!  r  r  )r0   rW   r   s      r3   rC   JambaMLP.forward  s6    NN4;;t~~a/@#ADLLQRO#ST	r5   )r!  r   r   r  r1   r   r  )rJ   rK   rL   rM   r*   rC   rP   rQ   rR   s   @r3   r  r    s    0 r5   r  c                      ^  \ rS rSrSrS\4U 4S jjrS\R                  S\R                  S\R                  S\R                  4S	 jr	S
r
U =r$ )JambaExpertsi  z2Collection of expert weights stored as 3D tensors.r   c                   > [         TU ]  5         UR                  U l        UR                  U l        UR                  U l        [        R                  " [        R                  " U R                  SU R                  -  U R
                  5      5      U l        [        R                  " [        R                  " U R                  U R
                  U R                  5      5      U l        [        UR                     U l        g )Nr8   )r)   r*   num_local_expertsnum_expertsr1   
hidden_dimr   intermediate_dimr   r+   r,   emptygate_up_projr   r   r   r!  r"  s     r3   r*   JambaExperts.__init__  s    !33 ,, & 8 8LLT5E5Eq4K`K`G`bfbqbq)rsekk$2B2BDOOUYUjUj&klV../r5   r6   top_k_indextop_k_weightsr'   c                 X   [         R                  " U5      n[         R                  " 5          [         R                  R                  R                  X R                  S9nUR                  SSS5      n[         R                  " UR                  SS9S5      R                  5       nS S S 5        W H  nUS   nXpR                  :X  a  M  [         R                  " WU   5      u  pX   n
[        R                  R                  XR                  U   5      R                  SSS9u  pU R                  U5      U-  n[        R                  R                  XR                   U   5      nXXS 4   -  nUR#                  SXR%                  UR&                  5      5        M     U$ ! , (       d  f       N= f)N)num_classesr8   r    r   )r9   rT   r9   )r,   r   r   r   r~   one_hotr*  permutegreaterr  nonzerowherelinearr.  r   r!  r   
index_add_r<   r;   )r0   r6   r0  r1  final_hidden_statesexpert_mask
expert_hit
expert_idx	top_k_pos	token_idxcurrent_stater   upcurrent_hidden_statess                 r3   rC   JambaExperts.forward  so    $..}=]]_((--55kO_O_5`K%--aA6K{8'DaHPPRJ 
 %J#AJ---#(;;{:/F#G I)4M}}++M;L;LZ;XY__`agi_jHD$(KK$5$:!$&MM$8$89NP^P^_iPj$k!$9)`dJd<e$e!**1i9Q9QReRkRk9lm % #"# _s   A7F
F))r!  r   r.  r+  r,  r*  )rJ   rK   rL   rM   r   r!   r*   r,   rO   rC   rP   rQ   rR   s   @r3   r'  r'    sR    <0{ 0#||# \\# ||	#
 
# #r5   r'  c                   t   ^  \ rS rSrSrS\4U 4S jjrS rS\R                  S\R                  4S jr
S	rU =r$ )
JambaSparseMoeBlocki  a  
This implementation is
strictly equivalent to standard MoE with full capacity (no
dropped tokens). It's faster since it formulates MoE operations
in terms of block-sparse operations to accommodate imbalanced
assignments of tokens to experts, whereas standard MoE either
(1) drop tokens at the cost of reduced performance or (2) set
capacity factor to number of experts and thus waste computation
and memory on padding.
r   c                 (  > [         TU ]  5         UR                  U l        UR                  U l        UR                  U l        UR                  U l        [        R                  " U R                  U R                  SS9U l        [        U5      U l        g r  )r)   r*   r1   r+  r   ffn_dimr*  num_experts_per_toktop_kr   r   routerr'  expertsr"  s     r3   r*   JambaSparseMoeBlock.__init__!  sm     ,,//!--//
ii1A1AN#F+r5   c                     [         R                  R                  R                  US[         R                  S9n[         R
                  " X0R                  SS9u  pEXTR                  UR                  5      4$ )Nr9   rx   rT   )	r,   r   r~   r   rN   topkrK  r<   r;   )r0   r6   router_logitsrouting_weightsr1  r0  s         r3   route_tokens_to_experts+JambaSparseMoeBlock.route_tokens_to_experts+  s\    ((--55mSXS^S^5_%*ZZQS%T",,]-@-@AAAr5   r6   r'   c                     UR                   u  p#nUR                  SU5      nU R                  U5      nU R                  X5      u  pgU R	                  XU5      nUR                  X#U5      nU$ )Nr9   )rG   r   rL  rS  rM  ri   )r0   r6   r   sequence_lengthr+  rQ  r0  r1  s           r3   rC   JambaSparseMoeBlock.forward0  sm    2?2E2E/
Z%**2z:M2%)%A%A-%_"]O%--j:Vr5   )rM  rI  r+  r*  rL  rK  )rJ   rK   rL   rM   r   r!   r*   rS  r,   rO   rC   rP   rQ   rR   s   @r3   rG  rG    s:    	,{ ,B
U\\ ell  r5   rG  c                      ^  \ rS rSrS\S\4U 4S jjr    SS\R                  S\R                  S-  S\R                  S-  S	\
S-  S
\S-  S\\   S\R                  4S jjrSrU =r$ )JambaAttentionDecoderLayeri:  r   r   c                 P  > [         TU ]  5         UR                  (       a  UR                  U   OSn[        X5      U l        US:  a  [
        O[        nU" U5      U l        [        UR                  UR                  S9U l        [        UR                  UR                  S9U l        g )Nr    r   )r)   r*   layers_num_expertsr   	self_attnrG  r  feed_forwardr$   r1   r   input_layernormpre_ff_layernormr0   r   r   r*  ffn_layer_classr2   s        r3   r*   #JambaAttentionDecoderLayer.__init__;  s    >D>W>Wf//	:]^':1<q-h+F3+F,>,>FDWDWX ,V-?-?VEXEX Yr5   Nr6   rs   position_idsr   	use_cacherv   r'   c           	          UnU R                  U5      nU R                  " SUUUUUS.UD6u  pXq-   nUnU R                  U5      nU R                  U5      nXq-   nU$ )N)r6   rs   rc  r   rd   )r^  r\  r_  r]  )	r0   r6   rs   rc  r   rd  rv   residualr   s	            r3   rC   "JambaAttentionDecoderLayer.forwardE  s     !,,];>> 
')%+
 
 !0 --m<))-8 0r5   )r]  r^  r_  r\  )NNNF)rJ   rK   rL   rM   r!   r   r*   r,   rO   r  r   boolr   r   FloatTensorrC   rP   rQ   rR   s   @r3   rY  rY  :  s    Z{ Zs Z /304(,!&|| t+ &&-	
  $; +, 
		 r5   rY  c                      ^  \ rS rSrS\S\4U 4S jjr   SS\R                  S\R                  S-  S\R                  S-  S	\
S-  S
\\   S\R                  4S jjrSrU =r$ )JambaMambaDecoderLayeri`  r   r   c                 L  > [         TU ]  5         UR                  (       a  UR                  U   OSn[        XS9U l        US:  a  [
        O[        nU" U5      U l        [        UR                  UR                  S9U l        [        UR                  UR                  S9U l        g )Nr    )r   r   r   )r)   r*   r[  r   mambarG  r  r]  r$   r1   r   r^  r_  r`  s        r3   r*   JambaMambaDecoderLayer.__init__a  s    >D>W>Wf//	:]^$FH
1<q-h+F3+F,>,>FDWDWX ,V-?-?VEXEX Yr5   Nr6   rs   rc  r   rv   r'   c                     UnU R                  U5      nU R                  UUUS9nXa-   nUnU R                  U5      nU R                  U5      nXa-   nU$ )N)r6   r   rs   )r^  rn  r_  r]  )r0   r6   rs   rc  r   rv   rg  s          r3   rC   JambaMambaDecoderLayer.forwardj  sr     !,,];

'() # 

 !0 --m<))-8 0r5   )r]  r^  rn  r_  )NNN)rJ   rK   rL   rM   r!   r   r*   r,   rO   r  r   r   r   rj  rC   rP   rQ   rR   s   @r3   rl  rl  `  s    Z{ Zs Z /304(,|| t+ &&-	
  +, 
		 r5   rl  c                      ^  \ rS rSr% \\S'   SrSrSS/rSr	Sr
SrSr\\/\\" \R$                  SS	9S
.r\R*                  " 5       U 4S j5       rSrU =r$ )JambaPreTrainedModeli  r   modelTrY  rl  r   rL  )
layer_name)r6   
attentionsrQ  c                   > [         TU ]  U5        [        U[        5      (       a  [        R
                  " SUR                  S-   5      S S S 24   nUR                  UR                  S5      R                  5       n[        R                  " UR                  [        R                  " U5      5        [        R                  " UR                  5        g [        U[         5      (       ai  [        R"                  " UR$                  SU R&                  R(                  S9  [        R"                  " UR*                  SU R&                  R(                  S9  g g )Nr    r9   r   )r?   std)r)   _init_weights
isinstancer   r,   r   r   rh   r   r   initcopy_r   r   ones_r   r'  normal_r.  r   initializer_ranger   )r0   ro   r   r2   s      r3   ry  "JambaPreTrainedModel._init_weights  s    f%fo..Q 5 5 9:47CA1126AACAJJv||UYYq\2JJvxx --LL,,3DKK<Y<YZLL))9V9VW .r5   rf  )rJ   rK   rL   rM   r!   __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_is_statefulrY  rl  r   r   r   r   _can_record_outputsr,   r   ry  rP   rQ   rR   s   @r3   rs  rs    su    &*#57OP"3NL46LM$'		hG ]]_	X 	Xr5   rs  )	attentionrn  c                     ^  \ rS rSrS\4U 4S jjr\\\      SS\	R                  S-  S\	R                  S-  S\	R                  S-  S\S-  S	\	R                  S-  S
\S-  S\\   S\4S jj5       5       5       rS rSrU =r$ )
JambaModeli  r   c                   > [         TU ]  U5        UR                  U l        UR                  U l        [
        R                  " UR                  UR                  U R                  5      U l        / n[        UR                  5       H.  n[        UR                  U      nUR                  U" XS95        M0     [
        R                  " U5      U l        [!        UR                  UR"                  S9U l        SU l        U R)                  5         g )N)r   r   F)r)   r*   pad_token_idpadding_idx
vocab_sizer   	Embeddingr1   embed_tokensr  num_hidden_layersALL_DECODER_LAYER_TYPESlayers_block_typer	  
ModuleListr   r$   r   final_layernormgradient_checkpointing	post_init)r0   r   decoder_layersr  layer_classr2   s        r3   r*   JambaModel.__init__  s     !.. ++LL):):F<N<NPTP`P`av//0A1&2J2J12MNK!!+f"BC 1 mmN3+F,>,>FDWDWX&+#r5   N	input_idsrs   rc  r   inputs_embedsrd  rv   r'   c           	      D   US L US L-  (       a  [        S5      eUc  U R                  U5      nU(       a  Uc  [        U R                  S9nUcU  Ub  UR	                  5       OSn[
        R                  " UR                  S   UR                  S9U-   nUR                  S5      n[        U R                  UUUUS9n	U R                  X$5      n
UnU R                   H*  n[        U[        5      (       a  U
OU	nU" U4UUUUS.UD6nM,     U R                  U5      n[!        UUS9$ )	Nz:You must specify exactly one of input_ids or inputs_embeds)r   r   r    )r  )r   r  rs   r   rc  )rs   rc  r   rd  )last_hidden_stater   )
ValueErrorr  r	   r   get_seq_lengthr,   r   rG   r  r]   r   _update_mamba_maskr   rz  rl  r  r   )r0   r  rs   rc  r   r  rd  rv   past_seen_tokenscausal_mask
mamba_maskr6   decoder_layer
layer_masks                 r3   rC   JambaModel.forward  sF    -t";<YZZ  --i8M0*$++>OCRC^==?de <<(;(;A(>}G[G[\_ooL'11!4L(;;')+%
 ,,^M
%![[M'1-AW'X'X^iJ))) /# M ) ,,];%++
 	
r5   c                     UnUb  UR                  5       (       d!  Ub   [        R                  " US:H  5      (       a  SnU$ )zV
No need for zeroing states when
    1. Cached forward
    2. Attending to all inputs
Nr    )r   r,   r   )r0   rs   r   r  s       r3   r  JambaModel._update_mamba_mask  sA     $
'O,N,N,P,P&599^q5H+I+IJr5   )r  r  r  r   r  r  )NNNNNN)rJ   rK   rL   rM   r!   r*   r   r   r   r,   r  rO   r   rj  ri  r   r   r   rC   r  rP   rQ   rR   s   @r3   r  r    s    { $   .2.204(,26!%2
##d*2
 t+2
 &&-	2

 2
 ((4/2
 $;2
 +,2
 
 2
    2
h r5   r  gate_logitsr*  c                    U b  [        U [        5      (       d  g[        U [        5      (       aC  U S   R                  n[        R                  " U  Vs/ s H  oUR                  U5      PM     snSS9n[        R                  R                  R                  WSS9n[        R                  " XrSS9u  p[        R                  R                  R                  X5      n
Uc:  [        R                  " U
R                  5       SS9n[        R                  " USS9nGOUR                  u  pUR                  S   X-  -  nUSSS2SS2SS4   R                  XXU45      R                  SX!5      R                  W5      n[        R                   " U
R                  5       U-  SS9[        R                   " USS9-  nUSSS2SS2S4   R                  XX45      R                  SU5      R                  U5      n[        R                   " UU-  SS9[        R                   " USS9-  n[        R                   " XR#                  S5      -  5      nUU-  $ s  snf )ax  
Computes auxiliary load balancing loss as in Switch Transformer - implemented in Pytorch.

See Switch Transformer (https://huggingface.co/papers/2101.03961) for more details. This function implements the loss
function presented in equations (4) - (6) of the paper. It aims at penalizing cases where the routing between
experts is too unbalanced.

Args:
    gate_logits:
        Logits from the `gate`, should be a tuple of model.config.num_hidden_layers tensors of
        shape [batch_size X sequence_length, num_experts].
    num_experts:
        Number of experts
    top_k:
        The number of experts to route per-token, can be also interpreted as the `top-k` routing
        parameter.
    attention_mask (`torch.Tensor`, *optional*):
        The attention_mask used in forward function
        shape [batch_size X sequence_length] if not None.

Returns:
    The auxiliary loss.
Nr   rT   r9   )rz  rF   r  r,   rV   r<   r   r~   r   rP  r5  r?   rN   rG   rh   ri   r  r]   )r  r*  rK  rs   compute_device
layer_gateconcatenated_gate_logitsrR  r   selected_expertsr=  tokens_per_expertrouter_prob_per_expertr   rV  r  expert_attention_mask router_per_expert_attention_maskoverall_losss                      r3   load_balancing_loss_funcr    s+   : *[%"@"@+u%%$Q..#(99^i-j^iPZmmN.K^i-jpq#r hh))112JPR1SO**_DA((%%--.>LK!JJ{'8'8':B "'O!C&4&:&:#
4::1=*B^_ 4AtT12V&OKXYWR,R	 	 "IIk&7&7&9<Q&QWXY\a\e\e!q]
 
 4At+,V&OQRWR%R	 	) "'?=]+]cd!ehmhqhq,!i
 "
 99.1Q1QRS1TTUL+%%[ .ks   Ic                   d  ^  \ rS rSrSS0rSS0rSS/S/40rS\4U 4S	 jjr\	\
         SS\R                  S
-  S\R                  S
-  S\R                  S
-  S\S
-  S\R                  S
-  S\R                  S
-  S\S
-  S\S
-  S\\R                  -  S\\   S\4S jj5       5       rSrU =r$ )JambaForCausalLMiK  zlm_head.weightzmodel.embed_tokens.weightlm_headcolwise_gather_outputr6   logitsr   c                 J  > [         TU ]  U5        [        U5      U l        UR                  U l        [
        R                  " UR                  UR                  SS9U l        UR                  U l	        UR                  U l
        UR                  U l        U R                  5         g r  )r)   r*   r  rt  r  r   r   r1   r  router_aux_loss_coefr*  rJ  r  r"  s     r3   r*   JambaForCausalLM.__init__Q  s     '
 ++yy!3!3V5F5FUS$*$?$?!!--#)#=#=  	r5   Nr  rs   rc  r   r  labelsrd  output_router_logitslogits_to_keeprv   r'   c
                 z   Ub  UOU R                   R                  nU R                  " SUUUUUUUS.U
D6nUR                  n[	        U	[
        5      (       a  [        U	* S5      OU	nU R                  USS2USS24   5      nSnUb  U R                  " XU R                  40 U
D6nSnU(       aY  [        UR                  U R                  U R                  U5      nUb*  XR                  UR                  UR                   5      -  -  n[#        UUUUR$                  UR&                  UR(                  UR                  S9$ )a  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

Example:

```python
>>> from transformers import AutoTokenizer, JambaForCausalLM

>>> model = JambaForCausalLM.from_pretrained("ai21labs/Jamba-v0.1")
>>> tokenizer = AutoTokenizer.from_pretrained("ai21labs/Jamba-v0.1")

>>> prompt = "Hey, are you conscious? Can you talk to me?"
>>> inputs = tokenizer(prompt, return_tensors="pt")

>>> # Generate
>>> generate_ids = model.generate(inputs.input_ids, max_length=30)
>>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
"Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
```N)r  rs   rc  r   r  rd  r  )lossaux_lossr  r   r6   rv  rQ  rf  )r   r  rt  r  rz  r   slicer  loss_functionr  r  rQ  r*  rJ  r  r<   r  r   r   r6   rv  )r0   r  rs   rc  r   r  r  rd  r  r  rv   outputsr6   slice_indicesr  r  r  s                    r3   rC   JambaForCausalLM.forward]  sP   N %9$D $++JjJj 	
 +/** 	+
)%+'!5	+
 	+
  118B>SV8W8W~ot4]kmA}a,?@A%%fdooPPD/%%  ((	H !11HKK4LLL(#33!//))!//
 	
r5   )r  rt  r*  rJ  r  r  )	NNNNNNNNr   )rJ   rK   rL   rM   _tied_weights_keys_tp_plan_pp_planr!   r*   r   r   r,   r  rO   r   rj  ri  r   r   r   r   rC   rP   rQ   rR   s   @r3   r  r  K  s<   *,GH23H_-z:;H
{ 
  .2.204(,26*.!%,0-.P
##d*P
 t+P
 &&-	P

 P
 ((4/P
   4'P
 $;P
 #TkP
 ell*P
 +,P
 
#P
  P
r5   r  c                       \ rS rSrSrg)JambaForSequenceClassificationi  rf  N)rJ   rK   rL   rM   rP   rf  r5   r3   r  r    s    r5   r  )r  r  r  rs  )r    )r   )Nr8   N)Jcollections.abcr   r,   r    r   r{  activationsr   cache_utilsr   r	   
generationr
   integrationsr   r   r   r   r   masking_utilsr   modeling_layersr   r   modeling_outputsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   utils.genericr   utils.import_utilsr   utils.output_capturingr   r   configuration_jambar!   
get_loggerrJ   r   Moduler$   rZ   re   rO   r   rn   rN   r   r   r   r  r'  rG  rY  rl  rs  r  r  rF   r  r  r  __all__rf  r5   r3   <module>r     s  2 %   & ! . )  0 [ Q F & R R 7 9 E , 
		H	% Y'J299 J (J(( *+ ,2	UU\\ 	U# 	U%,, 	U& %II%<<% 
% <<	%
 LL4'% % % '(%2 )*3)RYY 3) +3)lPNbii PNfryy   $#299 $# $#N"")) "J#!; #L7 BX? X8 )CMcd  U% U Ut #
*.	O&ell 33d:O&tO& LL4'	O&
 \\CO&d c
+_ c
 c
L	%EG[ 	 gr5   