
    Z jBr                        S SK Jr  S SKrS SKJr  SSKJr  SSKJr  SSK	J
r
Jr  SSKJr  SS	KJr  SS
KJrJr  SSKJrJr  SSKJrJr  SSKJr  SSKJrJrJr  SSKJ r   SSK!J"r"  SSK#J$r$J%r%  SSK&J'r'J(r(J)r)  SSK*J+r+  SSK,J-r-J.r.  SSK/J0r0  \Rb                  " \25      r3 " S S\(5      r4 " S S\'5      r5 " S S\Rl                  5      r7 " S S\+5      r8 " S  S!\-5      r9 " S" S#\Rl                  5      r: " S$ S%\5      r; " S& S'\5      r<\;\<S(.r= " S) S*\5      r>\ " S+ S,\>5      5       r? " S- S.\.5      r@ " S/ S0\\>5      rA/ S1QrBg)2    )CallableN)nn   )initialization)ACT2FN)CacheDynamicCache)lazy_load_kernel)create_causal_mask) GenericForSequenceClassificationGradientCheckpointingLayer)MoeCausalLMOutputWithPastMoeModelOutputWithPast)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringlogging)merge_with_config_defaults)resolve_internal_import)OutputRecordercapture_outputs   )LlamaAttentionLlamaRMSNormeager_attention_forward)
MistralMLP)MixtralExpertsMixtralForCausalLM   )JambaConfigc                       \ rS rSrSrg)JambaRMSNorm.    N__name__
__module____qualname____firstlineno____static_attributes__r&       x/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/jamba/modular_jamba.pyr$   r$   .       r-   r$   c                      ^  \ rS rSrS\S\4U 4S jjr  SS\R                  S\R                  S-  S\	S-  S	\
\   S
\\R                  \R                  S-  4   4
S jjrSrU =r$ )JambaAttention2   config	layer_idxc                   > [         TU ]  X5        [        R                  " UR                  UR
                  U R                  -  SS9U l        [        R                  " UR                  UR                  U R                  -  SS9U l	        [        R                  " UR                  UR                  U R                  -  SS9U l
        [        R                  " UR
                  U R                  -  UR                  SS9U l        g NFbias)super__init__r   Linearhidden_sizenum_attention_headshead_dimq_projnum_key_value_headsk_projv_projo_proj)selfr3   r4   	__class__s      r.   r:   JambaAttention.__init__3   s    +ii 2 2F4N4NQUQ^Q^4^ejkii 2 2F4N4NQUQ^Q^4^ejkii 2 2F4N4NQUQ^Q^4^ejkii : :T]] JFL^L^ejkr-   Nhidden_statesattention_maskpast_key_valueskwargsreturnc                    UR                   S S n/ UQSPU R                  P7nU R                  U5      R                  U5      R	                  SS5      nU R                  U5      R                  U5      R	                  SS5      nU R                  U5      R                  U5      R	                  SS5      n	Ub  UR                  XU R                  5      u  p[        R                  " U R                  R                  [        5      n
U
" U UUU	U4U R                  (       d  SOU R                  U R                   S.UD6u  pUR"                  " / UQSP76 R%                  5       nU R'                  U5      nX4$ )Nr!   r           )dropoutscaling)shaper>   r?   view	transposerA   rB   updater4   r   get_interfacer3   _attn_implementationr   trainingattention_dropoutrP   reshape
contiguousrC   )rD   rG   rH   rI   rJ   input_shapehidden_shapequery_states
key_statesvalue_statesattention_interfaceattn_outputattn_weightss                r.   forwardJambaAttention.forward:   sg    $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST&'6'='=jX\XfXf'g$J(?(M(MKK,,.E)
 %8	%
  $}}C$2H2HLL	%
 	%
! "));;;;FFHkk+.((r-   )rA   rC   r?   rB   NN)r(   r)   r*   r+   r"   intr:   torchTensorr   r   r   tuplerc   r,   __classcell__rE   s   @r.   r1   r1   2   s    l{ ls l /3(,	")||") t+") 	")
 +,") 
u||U\\D00	1") ")r-   r1   c                      ^  \ rS rSrSrS\4U 4S jjr  SS\R                  S\	S-  S\R                  S-  4S	 jjrSS\	S-  S\R                  S-  4S
 jjr  SS\	S-  S\R                  S-  4S jjrSrU =r$ )JambaMambaMixer_   uo  
Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`.
A, D are input independent (see Mamba paper [1] Section 3.5.2 "Interpretation of A" for why A isn't selective)
∆, B, C are input-dependent (this is a key difference between Mamba and the linear time invariant S4,
and is why Mamba is called **selective** state spaces)
r3   c           	        > [         TU ]  5         Xl        X l        UR                  U l        UR
                  U l        UR                  U l        UR                  UR                  -  U l
        UR                  U l        UR                  U l        UR                  U l        ["        R$                  " U R                  U R                  U R                  U R                  U R                  U R                  S-
  S9U l        UR(                  U l        [,        UR(                     U l        ["        R0                  " U R                  U R                  S-  U R                   S9U l        ["        R0                  " U R                  U R                  U R                  S-  -   SS9U l        ["        R0                  " U R                  U R                  SS9U l        [8        R:                  " SU R                  S-   5      S S S 24   nUR=                  U R                  S5      R?                  5       n["        R@                  " [8        RB                  " U5      5      U l"        ["        R@                  " [8        RF                  " U R                  5      5      U l$        ["        R0                  " U R                  U R                  U R                   S9U l%        [M        U R                  URN                  S9U l(        [M        U R                  URN                  S9U l)        [M        U R                  URN                  S9U l*        [W        S	5      n[Y        US
S 5      q-[Y        USS 5      q.[W        S5      n[_        USS9q0[Y        USS 5      q1[Y        USS 5      q2[g        [`        [b        [\        [Z        [d        45      q4[h        (       d  [j        Rm                  S5        g g )Nr!   )in_channelsout_channelsr8   kernel_sizegroupspaddingr   r7   FTrM   epszcausal-conv1dcausal_conv1d_updatecausal_conv1d_fnz	mamba-ssmz8ops.triton.selective_state_update.selective_state_update)chained_pathselective_scan_fnmamba_inner_fna  The fast path is not available because on of `(selective_state_update, selective_scan_fn, causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn)` is None. To install follow https://github.com/state-spaces/mamba/#installation and https://github.com/Dao-AILab/causal-conv1d.)7r9   r:   r3   r4   r<   mamba_d_statessm_state_sizemamba_d_convconv_kernel_sizemamba_expandintermediate_sizemamba_dt_ranktime_step_rankmamba_conv_biasuse_conv_biasmamba_proj_biasuse_biasr   Conv1dconv1d
hidden_act
activationr   actr;   in_projx_projdt_projrg   arangeexpandrZ   	ParameterlogA_logonesDout_projr$   rms_norm_epsdt_layernormb_layernormc_layernormr
   getattrrw   rx   r   selective_state_updaterz   r{   allis_fast_path_availableloggerwarning_once)rD   r3   r4   Acausal_conv1d	mamba_ssmrE   s         r.   r:   JambaMambaMixer.__init__g   s   "!--$22 & 3 3!'!4!4v7I7I!I$22#33..ii..//##--))))A-
 !++&++, yy!1!143I3IA3MTXTaTabii 6 68K8KdNaNadeNe8elqryy!4!4d6L6LSWX LLD//!34T1W=HHT++R0;;=\\%))A,/
ejj)?)?@A		$"8"8$:J:JQUQ^Q^_()<)<&BUBUV'(;(;ATATU'(;(;ATATU )9&}6LdS"=2DdK %[1	!8$^"
 $I/BDI ,<dC "%#%68HJ^`no"
 &%R &r-   NrG   cache_paramsrH   c                 	   UR                   u  pEnUS L=(       a'    UR                  U R                  5      =(       a    US:H  nU R                  U5      R	                  SS5      nUR                  SSS9u  pUb  XR                  S5      -  nU R                  R                  R                  U R                  R                  R                  S5      U R                  R                  R                  S5      5      n
U(       ao  [        UR                  S5      UR                  U R                     R                  U
U R                  R                  U R                   5      nUR                  S5      nOUbV  ["        R$                  R'                  XR(                  UR                   S   -
  S45      nUR+                  XR                  5        [-        XU R                  R                  U R                   S9nUb  XR                  S5      -  nU R/                  UR	                  SS5      5      n[0        R2                  " XR4                  U R6                  U R6                  /SS9u  pnU R9                  U5      nU R;                  U5      nU R=                  U5      nU R>                  R                  R@                  n[0        RB                  " 5          [0        RD                  " U R>                  R                  R@                  5      U R>                  R                  l         S S S 5        U R?                  U5      R	                  SS5      n[0        RB                  " 5          UU R>                  R                  l         S S S 5        [0        RF                  " U RH                  RK                  5       5      * nUb  URK                  5       OS nU(       ad  [M        UR                  U R                     RN                  US   US   UUS S 2S4   US S 2S4   U RP                  U	S   USS	9
R                  S5      nOo[S        UUUUR	                  SS5      UR	                  SS5      U RP                  RK                  5       U	USSS
9
u  nnUb  Ub  URU                  UU R                  5        U RW                  UR	                  SS5      5      nU$ ! , (       d  f       GN= f! , (       d  f       GNb= f)Nr!   r   dimr   rM   )r   ).r   T)dt_softplus)delta_softplusreturn_last_state),rQ   has_previous_stater4   r   rS   chunk	unsqueezer   weightrR   sizerw   squeezelayersconv_statesr8   r   r   
functionalpadr   update_conv_staterx   r   rg   splitr   r}   r   r   r   r   datano_grad
zeros_likeexpr   floatr   recurrent_statesr   rz   update_recurrent_stater   )rD   rG   r   rH   
batch_sizeseq_len_use_precomputed_statesprojected_statesgateconv_weightsr   ssm_parameters	time_stepBCtime_proj_biasdiscrete_time_stepr   scan_outputs	ssm_statecontextualized_statess                         r.   cuda_kernels_forward$JambaMambaMixer.cuda_kernels_forward   s    "/!4!4
Q$i)H)H)Xi]dhi]i 	  <<6@@AF /44QA4>%),D,DQ,GGM {{))..t{{/A/A/F/Fq/I4;;K]K]KbKbcdKef!0%%b)##DNN3??  M *33B7M' mm//@U@UXeXkXklnXo@oqr?st..{NNK,]$++JZJZgkgvgvwM%),D,DQ,GGM ]%<%<Q%BC++00$2E2EtGZGZ[ac
	a %%i0	QQ **//]]_%*%5%5dll6G6G6L6L%MDLL" !\\)4>>q!D]]_%3DLL"  YYtzz'')**3A3M--/SW!1##DNN3DDf%"6*!Q$!Q$V  im  '8"Aq!Aq!#"&'#L) $)A33It~~N !%l.D.DQ.J K$$S _ _s   AR8S
8
S

Sc           	      
   UR                   u  pEnUR                  nU R                  U5      R                  SS5      nUR	                  SSS9u  pUb  XR                  S5      -  n	UbR  UR                  U R                  5      (       a2  UR                  U R                     R                  R                  5       nO6[        R                  " X@R                  U R                  4U	R                  US9nUGbN  UR                  U R                  5      (       a  US:X  a  UR!                  XR                  5      n[        R"                  " XR$                  R&                  S S 2SS S 24   -  SS9n	U R(                  (       a  XR$                  R*                  -  n	U R-                  U	5      R/                  U5      R                  S5      n	O[0        R2                  R5                  U	U R6                  U	R                   S   -
  S45      nUR!                  XR                  5      nU R-                  U R%                  U	5      SS U24   5      n	O'U R-                  U R%                  U	5      SS U24   5      n	Ub  XR                  S5      -  n	U R9                  U	R                  SS5      5      n[        R:                  " XR<                  U R                  U R                  /SS9u  pnU R?                  U5      nU RA                  U5      nU RC                  U5      nU RE                  U5      n[0        R2                  RG                  U5      R                  SS5      n[        RH                  " U RJ                  RM                  5       5      * n[        RH                  " US S S 2S S S 24   US S 2S S 2S S 2S 4   -  5      nUS S 2S S 2S S 2S 4   US S 2S S S 2S S 24   RM                  5       -  nUU	S S 2S S 2S S 2S 4   RM                  5       -  n/ n[O        U5       H  nUS S 2S S 2US S 24   U-  US S 2S S 2US S 24   -   n[        RP                  " UR/                  U5      US S 2US S 24   R                  S5      5      nURS                  US S 2S S 2S4   5        M     [        RT                  " USS9nUXRV                  S S S 2S 4   -  -   nUU R-                  U
5      -  nUb  URY                  XR                  5        U R[                  UR                  SS5      5      nU$ )Nr!   r   r   )devicedtyper   rM   .).rQ   r   r   rS   r   r   r   r4   r   r   clonerg   zerosr   r}   r   r   sumr   r   r   r8   r   tor   r   r   r   r   r   r   r   r   r   r   softplusr   r   r   rangematmulappendstackr   r   r   )rD   input_statesr   rH   r   r   r   r   r   rG   r   r   
conv_stater   r   r   r   r   r   
discrete_A
discrete_BdeltaB_ur   iscan_outputr   s                             r.   slow_forwardJambaMambaMixer.slow_forward  sq   !-!3!3
Q""<<5??1E.44QA4>%),D,DQ,GGM#(G(G(W(W$++DNN;LLRRTI33T5H5HI$++5I #..t~~>>7a<);;M>>Z
 %		*{{7I7I!QPQ'7R*RXZ [%%![[%5%55M $ 7 : :5 A K KB O]]..!**]-@-@-DDaH
 *;;JW
 $])CC'M)R S HHT[[%?XgX%NOM%),D,DQ,GGM ]%<%<Q%BC++00$2E2EtGZGZ[ac
	a %%i0	QQ!\\)4]]334FGQQRSUVW YYtzz'')**YYqq$!125G1aQU5VVW
'1a61dAq=9I9O9O9QQ
aAtm < B B DDwA"1aA:.:XaAqj=QQI,,y||E':AaAgJ<P<PQS<TUKAq!G 45   kk,B7!]VVD!TM5J%JK"TXXd^3#//	>>J !%k.C.CAq.I J$$r-   c                 t   U R                   R                  (       a_  [        (       a.  SU R                  R                  R
                  R                  ;  a&  [        R                  S5        SU R                   l        U R                   R                  (       a  U R                  XU5      $ U R                  XU5      $ )NcudazFast Mamba kernels are not available. Make sure that they are installed and that the mamba module is on a CUDA device. Turning off the fast path `config.use_mamba_kernels=False` and falling back to the slow path.F)r3   use_mamba_kernelsr   r   r   r   typer   r   r   r   )rD   rG   r   rH   s       r.   rc   JambaMambaMixer.forward]  s     ;;((&&&8J8J8Q8Q8V8V*VV
 -2DKK);;((,,].YY  nMMr-   )r   r   r   r   r   r   r3   r   r   r   r   r<   r   r   r4   r   r}   r   r   r   r   re   )r(   r)   r*   r+   __doc__r"   r:   rg   rh   r   
LongTensorr   r   rc   r,   rj   rk   s   @r.   rm   rm   _   s    A{ AL &*26	c%||c% dlc% ((4/	c%LJ%ut| J%\a\l\los\s J%` &*26	N dlN ((4/	N Nr-   rm   c                       \ rS rSrSrg)JambaMLPir  r&   Nr'   r&   r-   r.   r   r   r  r/   r-   r   c                       \ rS rSrSrg)JambaExpertsiv  r&   Nr'   r&   r-   r.   r   r   v  r/   r-   r   c                   t   ^  \ rS rSrSrS\4U 4S jjrS rS\R                  S\R                  4S jr
S	rU =r$ )
JambaSparseMoeBlockiz  a  
This implementation is
strictly equivalent to standard MoE with full capacity (no
dropped tokens). It's faster since it formulates MoE operations
in terms of block-sparse operations to accommodate imbalanced
assignments of tokens to experts, whereas standard MoE either
(1) drop tokens at the cost of reduced performance or (2) set
capacity factor to number of experts and thus waste computation
and memory on padding.
r3   c                 (  > [         TU ]  5         UR                  U l        UR                  U l        UR                  U l        UR                  U l        [        R                  " U R                  U R                  SS9U l        [        U5      U l        g r6   )r9   r:   r<   
hidden_dimr   ffn_dimnum_expertsnum_experts_per_toktop_kr   r;   routerr   expertsrD   r3   rE   s     r.   r:   JambaSparseMoeBlock.__init__  sm     ,,//!--//
ii1A1AN#F+r-   c                     [         R                  R                  R                  US[         R                  S9n[         R
                  " X0R                  SS9u  pEXTR                  UR                  5      4$ )NrM   )r   r   r   )	rg   r   r   softmaxr   topkr   r   r   )rD   rG   router_logitsrouting_weightstop_k_weightstop_k_indexs         r.   route_tokens_to_experts+JambaSparseMoeBlock.route_tokens_to_experts  s\    ((--55mSXS^S^5_%*ZZQS%T",,]-@-@AAAr-   rG   rK   c                     UR                   u  p#nUR                  SU5      nU R                  U5      nU R                  X5      u  pgU R	                  XU5      nUR                  X#U5      nU$ )NrM   )rQ   rR   r   r  r   rY   )rD   rG   r   sequence_lengthr   r  r  r  s           r.   rc   JambaSparseMoeBlock.forward  sm    2?2E2E/
Z%**2z:M2%)%A%A-%_"]O%--j:Vr-   )r   r   r   r   r   r   )r(   r)   r*   r+   r   r"   r:   r  rg   rh   rc   r,   rj   rk   s   @r.   r   r   z  s:    	,{ ,B
U\\ ell  r-   r   c                      ^  \ rS rSrS\S\4U 4S jjr    SS\R                  S\R                  S-  S\R                  S-  S	\
S-  S
\S-  S\\   S\R                  4S jjrSrU =r$ )JambaAttentionDecoderLayeri  r3   r4   c                 P  > [         TU ]  5         UR                  (       a  UR                  U   OSn[        X5      U l        US:  a  [
        O[        nU" U5      U l        [        UR                  UR                  S9U l        [        UR                  UR                  S9U l        g )Nr!   ru   )r9   r:   layers_num_expertsr1   	self_attnr   r   feed_forwardr$   r<   r   input_layernormpre_ff_layernormrD   r3   r4   r   ffn_layer_classrE   s        r.   r:   #JambaAttentionDecoderLayer.__init__  s    >D>W>Wf//	:]^':1<q-h+F3+F,>,>FDWDWX ,V-?-?VEXEX Yr-   NrG   rH   position_idsrI   	use_cacherJ   rK   c           	          UnU R                  U5      nU R                  " SUUUUUS.UD6u  pXq-   nUnU R                  U5      nU R                  U5      nXq-   nU$ )N)rG   rH   r  rI   r  r&   )r  r  r  r  )	rD   rG   rH   r  rI   r  rJ   residualr   s	            r.   rc   "JambaAttentionDecoderLayer.forward  s     !,,];>> 
')%+
 
 !0 --m<))-8 0r-   )r  r  r  r  )NNNF)r(   r)   r*   r+   r"   rf   r:   rg   rh   r   r   boolr   r   FloatTensorrc   r,   rj   rk   s   @r.   r  r    s    Z{ Zs Z /304(,!&|| t+ &&-	
  $; +, 
		 r-   r  c                      ^  \ rS rSrS\S\4U 4S jjr   SS\R                  S\R                  S-  S\R                  S-  S	\
S-  S
\\   S\R                  4S jjrSrU =r$ )JambaMambaDecoderLayeri  r3   r4   c                 L  > [         TU ]  5         UR                  (       a  UR                  U   OSn[        XS9U l        US:  a  [
        O[        nU" U5      U l        [        UR                  UR                  S9U l        [        UR                  UR                  S9U l        g )Nr!   )r3   r4   ru   )r9   r:   r  rm   mambar   r   r  r$   r<   r   r  r  r  s        r.   r:   JambaMambaDecoderLayer.__init__  s    >D>W>Wf//	:]^$FH
1<q-h+F3+F,>,>FDWDWX ,V-?-?VEXEX Yr-   NrG   rH   r  rI   rJ   rK   c                     UnU R                  U5      nU R                  UUUS9nXa-   nUnU R                  U5      nU R                  U5      nXa-   nU$ )N)rG   r   rH   )r  r  r  r  )rD   rG   rH   r  rI   rJ   r  s          r.   rc   JambaMambaDecoderLayer.forward  sr     !,,];

'() # 

 !0 --m<))-8 0r-   )r  r  r  r  )NNN)r(   r)   r*   r+   r"   rf   r:   rg   rh   r   r   r   r   r  rc   r,   rj   rk   s   @r.   r  r    s    Z{ Zs Z /304(,|| t+ &&-	
  +, 
		 r-   r  )	attentionr  c                      ^  \ rS rSr% \\S'   SrSrSS/rSr	Sr
SrSr\\/\\" \R$                  SS	9S
.r\R*                  " 5       U 4S j5       rSrU =r$ )JambaPreTrainedModeli  r3   modelTr  r  rI   r   )
layer_name)rG   
attentionsr  c                   > [         TU ]  U5        [        U[        5      (       a  [        R
                  " SUR                  S-   5      S S S 24   nUR                  UR                  S5      R                  5       n[        R                  " UR                  [        R                  " U5      5        [        R                  " UR                  5        g [        U[         5      (       ai  [        R"                  " UR$                  SU R&                  R(                  S9  [        R"                  " UR*                  SU R&                  R(                  S9  g g )Nr!   rM   rN   )meanstd)r9   _init_weights
isinstancerm   rg   r   r}   r   r   rZ   initcopy_r   r   ones_r   r   normal_gate_up_projr3   initializer_range	down_proj)rD   moduler   rE   s      r.   r,  "JambaPreTrainedModel._init_weights  s    f%fo..Q 5 5 9:47CA1126AACAJJv||UYYq\2JJvxx --LL,,3DKK<Y<YZLL))9V9VW .r-   r&   )r(   r)   r*   r+   r"   __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_is_statefulr  r  r1   r   r   r;   _can_record_outputsrg   r   r,  r,   rj   rk   s   @r.   r%  r%    su    &*#57OP"3NL46LM$'		hG ]]_	X 	Xr-   r%  c                     ^  \ rS rSrS\4U 4S jjr\\\      SS\	R                  S-  S\	R                  S-  S\	R                  S-  S\S-  S	\	R                  S-  S
\S-  S\\   S\4S jj5       5       5       rS rSrU =r$ )
JambaModeli  r3   c                   > [         TU ]  U5        UR                  U l        UR                  U l        [
        R                  " UR                  UR                  U R                  5      U l        / n[        UR                  5       H.  n[        UR                  U      nUR                  U" XS95        M0     [
        R                  " U5      U l        [!        UR                  UR"                  S9U l        SU l        U R)                  5         g )N)r4   ru   F)r9   r:   pad_token_idpadding_idx
vocab_sizer   	Embeddingr<   embed_tokensr   num_hidden_layersALL_DECODER_LAYER_TYPESlayers_block_typer   
ModuleListr   r$   r   final_layernormgradient_checkpointing	post_init)rD   r3   decoder_layersr   layer_classrE   s        r.   r:   JambaModel.__init__  s     !.. ++LL):):F<N<NPTP`P`av//0A1&2J2J12MNK!!+f"BC 1 mmN3+F,>,>FDWDWX&+#r-   N	input_idsrH   r  rI   inputs_embedsr  rJ   rK   c           	      D   US L US L-  (       a  [        S5      eUc  U R                  U5      nU(       a  Uc  [        U R                  S9nUcU  Ub  UR	                  5       OSn[
        R                  " UR                  S   UR                  S9U-   nUR                  S5      n[        U R                  UUUUS9n	U R                  X$5      n
UnU R                   H*  n[        U[        5      (       a  U
OU	nU" U4UUUUS.UD6nM,     U R                  U5      n[!        UUS9$ )	Nz:You must specify exactly one of input_ids or inputs_embeds)r3   r   r!   )r   )r3   rS  rH   rI   r  )rH   r  rI   r  )last_hidden_staterI   )
ValueErrorrG  r	   r3   get_seq_lengthrg   r   rQ   r   r   r   _update_mamba_maskr   r-  r  rL  r   )rD   rR  rH   r  rI   rS  r  rJ   past_seen_tokenscausal_mask
mamba_maskrG   decoder_layer
layer_masks                 r.   rc   JambaModel.forward  sF    -t";<YZZ  --i8M0*$++>OCRC^==?de <<(;(;A(>}G[G[\_ooL'11!4L(;;')+%
 ,,^M
%![[M'1-AW'X'X^iJ))) /# M ) ,,];%++
 	
r-   c                     UnUb  UR                  5       (       d!  Ub   [        R                  " US:H  5      (       a  SnU$ )zV
No need for zeroing states when
    1. Cached forward
    2. Attending to all inputs
Nr!   )r   rg   r   )rD   rH   rI   r[  s       r.   rX  JambaModel._update_mamba_maskP  sA     $
'O,N,N,P,P&599^q5H+I+IJr-   )rG  rL  rM  r   rD  rE  )NNNNNN)r(   r)   r*   r+   r"   r:   r   r   r   rg   r   rh   r   r  r  r   r   r   rc   rX  r,   rj   rk   s   @r.   rA  rA    s    { $   .2.204(,26!%2
##d*2
 t+2
 &&-	2

 2
 ((4/2
 $;2
 +,2
 
 2
    2
h r-   rA  c                   6  ^  \ rS rSrS\4U 4S jjr         SS\R                  S-  S\R                  S-  S\R                  S-  S\	S-  S	\R                  S-  S
\R                  S-  S\S-  S\S-  S\\R                  -  S\\   S\4U 4S jjjrSrU =r$ )JambaForCausalLMi^  r3   c                 F   > [         TU ]  U5        UR                  U l        g )N)r9   r:   r   r   s     r.   r:   JambaForCausalLM.__init___  s     !--r-   NrR  rH   r  rI   rS  labelsr  output_router_logitslogits_to_keeprJ   rK   c
           
      4   > [         TU ]  " UUUUUUUU	40 U
D6$ )a  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

Example:

```python
>>> from transformers import AutoTokenizer, JambaForCausalLM

>>> model = JambaForCausalLM.from_pretrained("ai21labs/Jamba-v0.1")
>>> tokenizer = AutoTokenizer.from_pretrained("ai21labs/Jamba-v0.1")

>>> prompt = "Hey, are you conscious? Can you talk to me?"
>>> inputs = tokenizer(prompt, return_tensors="pt")

>>> # Generate
>>> generate_ids = model.generate(inputs.input_ids, max_length=30)
>>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
"Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
```)r9   rc   )rD   rR  rH   r  rI   rS  re  r  rf  rg  rJ   rE   s              r.   rc   JambaForCausalLM.forwardc  s9    F w

 

 
	
r-   )r   )	NNNNNNNNr   )r(   r)   r*   r+   r"   r:   rg   r   rh   r   r  r  rf   r   r   r   rc   r,   rj   rk   s   @r.   rb  rb  ^  s    .{ . .2.204(,26*.!%,0-.-
##d*-
 t+-
 &&-	-

 -
 ((4/-
   4'-
 $;-
 #Tk-
 ell*-
 +,-
 
#-
 -
r-   rb  c                       \ rS rSrSrg)JambaForSequenceClassificationi  r&   Nr'   r&   r-   r.   rk  rk    r/   r-   rk  )rb  rk  rA  r%  )Ccollections.abcr   rg   r    r   r.  activationsr   cache_utilsr   r	   integrationsr
   masking_utilsr   modeling_layersr   r   modeling_outputsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   utils.genericr   utils.import_utilsr   utils.output_capturingr   r   llama.modeling_llamar   r   r   mistral.modeling_mistralr   mixtral.modeling_mixtralr   r    configuration_jambar"   
get_loggerr(   r   r$   r1   Modulerm   r   r   r   r  r  rI  r%  rA  rb  rk  __all__r&   r-   r.   <module>r     s1  & %   & ! . , / [ Q F & @ @ 7 9 E X X 1 I , 
		H	%	< 	*)^ *)ZPNbii PNf	z 		> 	"")) "J#!; #L7 B )CMcd X? X8 U% U Up2
) 2
j	%EG[ 	 gr-   