
    Z jc                     l    S r SSKrSSKJr  SSKJr  SSKJr  \" SS9\ " S	 S
\5      5       5       rS
/r	g)zZamba model configuration    N)strict   )PreTrainedConfig)auto_docstringzZyphra/Zamba-7B-v1)
checkpointc                   R  ^  \ rS rSr% SrSrS/rSSS.rSr\	\
S	'   S
r\\
S'   Sr\	\
S'   Sr\	S-  \
S'   Sr\	\
S'   Sr\	\
S'   Sr\	\
S'   Sr\	S-  \
S'   Sr\	\
S'   Sr\	\
S'   Sr\\
S'   Sr\\
S'   Sr\\
S'   Sr\\
S '   S
r\\
S!'   S"r\	\
S#'   S$r\	S-  \
S%'   S"r\	S-  \
S&'   Sr\	\ \	   -  S-  \
S''   S(r!\	\
S)'   S*r"\\	-  \
S+'   S,r#\	\
S-'   S.r$\	\
S/'   S
r%\\
S0'   Sr&\	\
S1'   S.r'\	\
S2'   Sr(\	\
S3'   S4r)\\	-  \
S5'   S6r*\\
S7'   S8r+\\
S9'   S:r,\\
S;'   S
r-\\
S<'   S=r.\\
S>'   U 4S? jr/S@ r0SA r1SBr2U =r3$ )CZambaConfig   a  
attention_hidden_size (`int`, *optional*):
    Dimension of the hidden representations of the inputs to the Attention layer.
attention_head_dim (`int`, *optional*):
    Dimension of the attention head in the Transformer decoder.
n_mamba_heads (`int`, *optional*, defaults to 2):
    Number of mamba heads for each mamba layer.
hidden_mamba_act (`str` or `function`, *optional*, defaults to `"silu"`):
    The non-linear activation function (function or string) in the mamba layer.
num_logits_to_keep (`int` or `None`, *optional*, defaults to 1):
    Number of prompt logits to calculate during generation. If `None`, all logits will be calculated. If an
    integer value, only last `num_logits_to_keep` logits will be calculated. Default is 1 because only the
    logits of the last prompt token are needed for generation. For long sequences, the logits for the entire
    sequence may use a lot of memory, so setting `num_logits_to_keep=1` will reduce the memory footprint
    significantly.
attn_layer_period (`int`, *optional*, defaults to 6):
    Once in this many layers, we will have a shared attention layer.
attn_layer_offset (`int`, *optional*, defaults to 4):
    Offset of the shared attention layer within each period.
use_mamba_kernels (`bool`, *optional*, defaults to `True`):
    Flag indicating whether or not to use the fast mamba kernels. These are available only if `mamba-ssm` and
    `causal-conv1d` are installed, and the mamba modules are running on a CUDA device. Raises ValueError if
    `True` and kernels are not available.
mamba_dt_rank (`Union[int,str]`, *optional*, defaults to `"auto"`):
    Rank of the mamba discretization projection matrix. `"auto"` means that it will default to `math.ceil(self.hidden_size / 16)`
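
Example (a minimal usage sketch; it assumes the standard Zamba model classes, e.g. `ZambaModel`, are
exported by `transformers` alongside this configuration):

```python
>>> from transformers import ZambaConfig, ZambaModel

>>> # Build a configuration with the default Zamba-7B-v1 hyperparameters.
>>> configuration = ZambaConfig()

>>> # `attn_layer_period=6` and `attn_layer_offset=4` control which entries of
>>> # `layers_block_type` become shared-attention ("hybrid") layers.
>>> configuration.layers_block_type[:5]
['mamba', 'mamba', 'hybrid', 'mamba', 'mamba']

>>> # Initializing a model (with random weights) from the configuration.
>>> model = ZambaModel(configuration)

>>> # Accessing the model configuration.
>>> configuration = model.config
```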
zambapast_key_valueslayers_block_typeattention_head_dim)layer_typeshead_dimi }  
vocab_sizeTtie_word_embeddingsi  hidden_sizeNattention_hidden_sizei :  intermediate_sizeL   num_hidden_layers   num_attention_headsnum_key_value_heads   n_mamba_headsgelu
hidden_actsiluhidden_mamba_actg{Gz?initializer_rangegh㈵>rms_norm_eps	use_cache   num_logits_to_keepr   pad_token_idbos_token_ideos_token_idi   max_position_embeddingsg        attention_dropout   attn_layer_period   attn_layer_offsetuse_mamba_kernelsmamba_d_statemamba_d_convmamba_expandautomamba_dt_rankgMbP?time_step_ming?time_step_maxg-C6?time_step_floormamba_conv_biasFmamba_proj_biasc                   > U R                   =(       d    SU R                  -  U l         U R                  =(       d    SU R                  -  U R                  -  U l        U R                  S:X  a#  [
        R                  " U R                  S-  5      OU R                  U l        U R                  U R                  U R                  U R                  5      U l        [        TU ]4  " S0 UD6  g )Nr   r3   r    )r   r   r   r   r4   mathceil_layers_block_typer   r,   r.   r   super__post_init__)selfkwargs	__class__s     ~/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/zamba/configuration_zamba.pyr@   ZambaConfig.__post_init__\   s    %)%?%?%W1tGWGWCW""&"9"9"mQAQAQ=QUYUmUm=mAEASASW]A]TYYt'7'7"'<=cgcucu!%!8!8""D$:$:D<R<R"
 	''    c                 n    U R                   U R                  -  U R                  -  S:w  a  [        S5      eg)zOPart of `@strict`-powered validation. Validates the architecture of the config.r   z;`intermediate_size` should be divisible by `n_mamba_heads`.N)r2   r   r   
ValueError)rA   s    rD   validate_architecture!ZambaConfig.validate_architecturee   s8     0 00D4F4FF!KZ[[ LrF   c                 l    / SQ[        US-
  5       Vs/ s H  oDU-  U:X  a  SOSPM     sn-   nU$ s  snf )N)mambarL   hybridr   rM   rL   )range)rA   r   r,   r.   ilayerss         rD   r>   ZambaConfig._layers_block_typej   sR    
 [``qtu`uZvwZvUV..2CCXPZvw	x
  xs   1)r   r   r   r4   )4__name__
__module____qualname____firstlineno____doc__
model_typekeys_to_ignore_at_inferenceattribute_mapr   int__annotations__r   boolr   r   r   r   r   r   r   r   r   strr    r!   floatr"   r#   r%   r&   r'   r(   listr)   r*   r,   r.   r/   r0   r1   r2   r4   r5   r6   r7   r8   r9   r@   rI   r>   __static_attributes____classcell__)rC   s   @rD   r	   r	      s   6 J#4"5$7EYZMJ $$K(,3:,"s"s!!%)d
)!!M3J"c"#u#L%It L#*  L#* +,L#S	/D(,#'S'%(us{(ss"t"M3L#L#%M39% M5 M5!OU! OT !OT!(\
 rF   r	   )
rV   r<   huggingface_hub.dataclassesr   configuration_utilsr   utilsr   r	   __all__r;   rF   rD   <module>rf      sM       . 3 # /0V" V  1Vr /rF   