    Z j+                         S r SSKJr  SSKJr  SSKJr  SSKJr  \" SS9\ " S	 S
\5      5       5       r	\" SS9\ " S S\5      5       5       r
S/rg)zMpt configuration    )Literal)strict   )PreTrainedConfig)auto_docstringzmosaicml/mpt-7b)
checkpointc                       \ rS rSr% SrSrSr\S   \S'   Sr	\
\S'   S	r\\S
'   Sr\S-  \S'   Sr\S-  \S'   Sr\\S'   Sr\\S'   Sr\\S'   Sr\\S'   Sr\
\S'   Srg)MptAttentionConfig   a  
    attn_type (`str`, *optional*, defaults to `"multihead_attention"`):
        type of attention to use. Options: `"multihead_attention"`, `"multiquery_attention"`.
    attn_pdrop (`float`, *optional*, defaults to `0.0`):
        The dropout probability for the attention layers.
    attn_impl (`str`, *optional*, defaults to `"torch"`):
        The attention implementation to use. One of `"torch"`, `"flash"`, or `"triton"`.
    clip_qkv (`float`, *optional*):
        If not `None`, clip the queries, keys, and values in the attention layer to this value.
    softmax_scale (`float`, *optional*):
        If not `None`, scale the softmax in the attention layer by this value. If `None`, will default to
        `1/sqrt(hidden_size)`.
    prefix_lm (`bool`, *optional*, defaults to `False`):
        Whether the model should operate as a Prefix LM. This requires passing an extra `prefix_mask` argument
        which indicates which tokens belong to the prefix. Tokens in the prefix can attend to one another
        bi-directionally. Tokens outside the prefix use causal attention.
    qk_ln (`bool`, *optional*, defaults to `False`):
        Whether to apply layer normalization to the queries and keys in the attention layer.
    attn_uses_sequence_id (`bool`, *optional*, defaults to `False`):
        Whether to restrict attention to tokens that have the same token_type_ids. When the model is in `train`
        mode, this requires passing an extra *token_type_ids* argument which indicates which sub-sequence each
        token belongs to. Defaults to `False` meaning any provided *token_type_ids* will be ignored.
    alibi (`bool`, *optional*, defaults to `True`):
        Whether or not to use the alibi bias instead of positional embedding.
    alibi_bias_max (`int`, *optional*, defaults to 8):
        The maximum value of the alibi bias.
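
    Example (an illustrative sketch; the values below are arbitrary and not those of any released
    checkpoint, and in practice this sub-config is usually set through the `attn_config` argument of
    [`MptConfig`] rather than instantiated directly):

    ```python
    >>> from transformers.models.mpt.configuration_mpt import MptAttentionConfig

    >>> # Illustrative values: multi-query attention with a small attention dropout
    >>> attn_config = MptAttentionConfig(attn_type="multiquery_attention", attn_pdrop=0.1)
    >>> attn_config.alibi
    True
    ```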
    """

    base_config_key = "attn_config"

    attn_type: Literal["multihead_attention", "multiquery_attention"] = "multihead_attention"
    attn_pdrop: float = 0.0
    attn_impl: str = "torch"
    clip_qkv: float | None = None
    softmax_scale: float | None = None
    prefix_lm: bool = False
    qk_ln: bool = False
    attn_uses_sequence_id: bool = False
    alibi: bool = True
    alibi_bias_max: int = 8


@auto_docstring(checkpoint="mosaicml/mpt-7b")
@strict(accept_kwargs=True)
class MptConfig(PreTrainedConfig):
    r"""
    expansion_ratio (`int`, *optional*, defaults to 4):
        The ratio of the up/down scale in the MLP.
    max_seq_len (`int`, *optional*, defaults to 2048):
        The maximum sequence length of the model.
    layer_norm_epsilon (`float`, *optional*, defaults to 1e-05):
        The epsilon to use in the layer normalization layers.
    learned_pos_emb (`bool`, *optional*, defaults to `True`):
        Whether to use learned positional embeddings.
    attn_config (`dict`, *optional*):
        A dictionary used to configure the model's attention module (see the sketch after this argument list).
    init_device (`str`, *optional*, defaults to `"cpu"`):
        The device to use for parameter initialization. Defined for backward compatibility.
    logit_scale (`float`, *optional*):
        If not `None`, scale the logits by this value.
    no_bias (`bool`, *optional*, defaults to `True`):
        Whether to disable bias in all linear layers.
    embedding_fraction (`float`, *optional*, defaults to 1.0):
        The fraction to scale the gradients of the embedding layer by.
    norm_type (`str`, *optional*, defaults to `"low_precision_layernorm"`):
        Type of layer norm to use. All MPT models use the same layer norm implementation. Defined for backward
        compatibility.
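
    As a sketch of the `attn_config` argument above, the nested attention settings can also be given as a
    plain dict, which is converted to an [`MptAttentionConfig`] in `__post_init__` (the value chosen here is
    purely illustrative):

    ```python
    >>> from transformers import MptConfig

    >>> # The nested dict form is converted to an MptAttentionConfig instance
    >>> configuration = MptConfig(attn_config={"attn_type": "multiquery_attention"})
    >>> configuration.attn_config.attn_type
    'multiquery_attention'
    ```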

    Example:

    ```python
    >>> from transformers import MptConfig, MptModel

    >>> # Initializing a Mpt configuration
    >>> configuration = MptConfig()

    >>> # Initializing a model (with random weights) from the configuration
    >>> model = MptModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
    """

    model_type = "mpt"
    sub_configs = {"attn_config": MptAttentionConfig}
    attribute_map = {
        "num_attention_heads": "n_heads",
        "hidden_size": "d_model",
        "num_hidden_layers": "n_layers",
    }

    d_model: int = 2048
    n_heads: int = 16
    n_layers: int = 24
    expansion_ratio: int = 4
    max_seq_len: int = 2048
    vocab_size: int = 50368
    resid_pdrop: float = 0.0
    layer_norm_epsilon: float = 1e-5
    emb_pdrop: float = 0.0
    learned_pos_emb: bool = True
    attn_config: MptAttentionConfig | dict | None = None
    init_device: str = "cpu"
    logit_scale: float | str | None = None
    no_bias: bool = True
    embedding_fraction: float = 1.0
    norm_type: str = "low_precision_layernorm"
    use_cache: bool = False
    initializer_range: float = 0.02
    tie_word_embeddings: bool = True
    pad_token_id: int | None = None
    bos_token_id: int | None = None
    eos_token_id: int | list[int] | None = None

    def __post_init__(self, **kwargs):
        # Accept the nested attention settings either as a sub-config instance or as a plain dict
        if self.attn_config is None:
            self.attn_config = MptAttentionConfig()
        elif isinstance(self.attn_config, dict):
            self.attn_config = MptAttentionConfig(**self.attn_config)

        super().__post_init__(**kwargs)


__all__ = ["MptConfig"]