
    Z j                         S r SSKJr  SSKJr  SSKJr  SSKJr  \\" SSS	9 " S
 S\5      5       5       r	\\" SSS	9 " S S\5      5       5       r
\" SS9\ " S S\5      5       5       rS/rg)zDBRX model configuration    )strict   )PreTrainedConfig)RopeParameters)auto_docstringz4This config is used to instantiate attention layers.z$transformers-community/dbrx-instruct)custom_intro
checkpointc                   Z    \ rS rSr% SrSrSr\\-  \	S'   Sr
\\-  S-  \	S'   Sr\\	S	'   S
rg)DbrxAttentionConfig   a^  
attn_pdrop (`float`, *optional*, defaults to 0.0):
    The dropout probability for the attention layers.
clip_qkv (`float`, *optional*):
    If set, clip the queries, keys, and values in the attention layer to this value.
kv_n_heads (`int`, *optional*, defaults to 1):
    For grouped_query_attention only, allow user to specify number of kv heads.
attn_config        
attn_pdropNclip_qkv   
kv_n_heads )__name__
__module____qualname____firstlineno____doc__base_config_keyr   floatint__annotations__r   r   __static_attributes__r       |/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/dbrx/configuration_dbrx.pyr   r      s9     $O!J!#'HcEkD 'Jr   r   z6This config is used to instantiate feedforward layers.c                      ^  \ rS rSr% SrSrSr\\S'   Sr	\
S-  \S'   Sr\\S	'   S
r\\S'   Sr\\S'   Sr\S-  \S'   Sr\\S'   Sr\S-  \S'   U 4S jrSrU =r$ )DbrxFFNConfig-   a  
ffn_act_fn (`dict`, *optional*, defaults to `None`):
    A dict specifying activation function for the FFN.
    The dict should have a key 'name' with the value being the name of the activation function along with
    any additional keyword arguments. If `None`, then set to `{"name": "silu"}`.
ffn_hidden_size (`int`, *optional*, defaults to 3584):
    The hidden size of the feedforward network.
moe_num_experts (`int`, *optional*, defaults to 4):
    The number of experts in the mixture of experts layer.
moe_top_k (`int`, *optional*, defaults to 1):
    The number of experts to use in the mixture of experts layer.
moe_jitter_eps (`float`, *optional*, defaults to `None`):
    If not `None`, the jitter epsilon for the mixture of experts layer.
moe_loss_weight (`float`, *optional*, defaults to 0.01):
    The loss weight for the mixture of experts layer.
moe_normalize_expert_weights (`float`, *optional*, defaults to 1.0):
    The normalization factor for the expert weights.

ffn_configi   hidden_sizeN
ffn_act_fni   ffn_hidden_size   moe_num_expertsr   	moe_top_kmoe_jitter_epsg{Gz?moe_loss_weightg      ?moe_normalize_expert_weightsc                    > U R                   c	  SS0U l         S H  nX!;   d  M
  UR                  U5        M     [        U5      S:w  a  [        SU< 35      e[        TU ]  " S0 UD6  g )Nnamesilu)
model_typeattn_implementationexperts_implementationtransformers_version_commit_hashtorch_dtypedtyper   zFound unknown kwargs=r   )r%   poplen
ValueErrorsuper__post_init__)selfkwargsk	__class__s      r   r;   DbrxFFNConfig.__post_init__Q   sh    ??"%v.DO
A {

1
 v;!5fY788''r   )r%   )r   r   r   r   r   r   r$   r   r   r%   dictr&   r(   r)   r*   r   r+   r,   r;   r   __classcell__r?   s   @r   r!   r!   -   sv    & #OK"Jt"OSOSIs#'NEDL'!OU!14 %$,4( (r   r!   )r	   c                     ^  \ rS rSr% SrSr\\S.rSSSSS	.r	S
r
\S-  \S'   Sr\S-  \S'   Sr\S-  \S'   S
r\S-  \S'   Sr\\S'   Sr\S-  \S'   Sr\S-  \S'   Sr\\-  S-  \S'   Sr\\-  S-  \S'   Sr\\S'   Sr\\S'   Sr\S-  \S'   Sr\\-  S-  \S'   Sr\S-  \S'   Sr\S-  \S'   Sr\\ \   -  S-  \S'   Sr!\\S'   U 4S  jr"S! r#S"r$U =r%$ )#
DbrxConfigf   a  
max_seq_len (`int`, *optional*, defaults to 2048):
    The maximum sequence length of the model.
attn_config (`dict`, *optional*):
    A dictionary used to configure the model's attention module.
ffn_config (`dict`, *optional*):
    A dictionary used to configure the model's FFN module.

Example:
```python
>>> from transformers import DbrxConfig, DbrxModel

>>> # Initializing a Dbrx configuration
>>> configuration = DbrxConfig(n_layers=2, d_model=256, n_heads=8, vocab_size=128)

>>> # Initializing a model (with random weights) from the configuration
>>> model = DbrxModel(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
```
dbrx)r   r#   n_headsd_modeln_layersmax_seq_len)num_attention_headsr$   num_hidden_layersmax_position_embeddingsi   N      i }  
vocab_sizer   resid_pdrop	emb_pdropr   r#   T	use_cacheg{Gz?initializer_rangeFoutput_router_logitsrope_parameterspad_token_idbos_token_ideos_token_idtie_word_embeddingsc                   > U R                   c  [        5       U l         O9[        U R                   [        5      (       a  [        S0 U R                   D6U l         U R                  c  [        5       U l        O9[        U R                  [        5      (       a  [        S0 U R                  D6U l        U R                   R                  U l        [        TU ]$  " S0 UD6  g )Nr   )
r   r   
isinstancerA   r#   r!   r   num_key_value_headsr:   r;   )r<   r=   r?   s     r   r;   DbrxConfig.__post_init__   s    #24D(($//2FT5E5EFD??"+oDO..+>doo>DO#'#3#3#>#> ''r   c                 <    U R                   (       a  [        S5      eg)zOPart of `@strict`-powered validation. Validates the architecture of the config.z5tie_word_embeddings is not supported for DBRX models.N)r[   r9   )r<   s    r   validate_architecture DbrxConfig.validate_architecture   s    ##TUU $r   )r   r#   r^   )&r   r   r   r   r   r0   r   r!   sub_configsattribute_maprI   r   r   rH   rJ   rK   rQ   rR   r   rS   r   rA   r#   rT   boolrU   rV   rW   r   rX   rY   rZ   listr[   r;   ra   r   rB   rC   s   @r   rE   rE   f   s<   . J"5]SK( '#0	M GS4ZGS4ZHcDj"Kt"J #K#!Iut|!59K$t+d29.2J$t+2It#u#(-$+-48O^d*T18#L#*##L#*#+/L#S	/D(/ %%(V Vr   rE   N)r   huggingface_hub.dataclassesr   configuration_utilsr   modeling_rope_utilsr   utilsr   r   r!   rE   __all__r   r   r   <module>rl      s     . 3 1 # G5* 	 
" I51($ 1(	 
1(h ABDV! DV  CDVN .r   