
    Z j                     p    S r SSKJr  SSKJr  SSKJr  SSKJr  \" SS9\ " S	 S
\5      5       5       r	S
/r
g)zOLMoE model configuration    )strict   )PreTrainedConfig)RopeParameters)auto_docstringzallenai/OLMoE-1B-7B-0924)
checkpointc                     ^  \ rS rSr% SrSrS/rSS0rSSSSS	S
SS.rSr	\
\S'   Sr\
\S'   Sr\
\S'   Sr\
\S'   Sr\
\S'   Sr\
S-  \S'   Sr\\S'   Sr\
\S'   Sr\\S'   Sr\\S'   Sr\\S '   S!r\
S-  \S"'   Sr\
S-  \S#'   S$r\
\\
   -  S-  \S%'   S&r\\S''   Sr\\ -  S-  \S('   S&r!\\S)'   S*r"\\
-  \S+'   Sr#\S-  \S,'   S-r$\
\S.'   S/r%\
\S'   S&r&\\S0'   S1r'\\S2'   S&r(\\S3'   U 4S4 jr)S5r*U =r+$ )6OlmoeConfig   a  
clip_qkv (`float`, *optional*):
    If not `None`, elements of query, key and value attention states are clipped so that their
    absolute value does not exceed this value.

```python
>>> from transformers import OlmoeModel, OlmoeConfig

>>> # Initializing a OLMoE 7B A1B style configuration
>>> configuration = OlmoeConfig()

>>> # Initializing a model from the OLMoE 7B A1B style configuration
>>> model = OlmoeModel(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
```
olmoepast_key_valuesnum_local_expertsnum_expertscolwise_gather_outputrowwise_split_inputpacked_colwiserowwisemoe_tp_experts)zlayers.*.self_attn.q_projzlayers.*.self_attn.k_projzlayers.*.self_attn.v_projzlayers.*.self_attn.o_projz!layers.*.mlp.experts.gate_up_projzlayers.*.mlp.experts.down_projzlayers.*.mlp.expertsi  
vocab_sizei   hidden_sizeintermediate_size   num_hidden_layersnum_attention_headsNnum_key_value_headssilu
hidden_acti   max_position_embeddingsg{Gz?initializer_rangegh㈵>rms_norm_epsT	use_cache   pad_token_idbos_token_idig  eos_token_idFtie_word_embeddingsrope_parametersattention_biasg        attention_dropoutclip_qkv   num_experts_per_tok@   output_router_logitsg{Gz?router_aux_loss_coefnorm_topk_probc                 b   > U R                   c  U R                  U l         [        TU ]  " S0 UD6  g )N )r   r   super__post_init__)selfkwargs	__class__s     ~/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/olmoe/configuration_olmoe.pyr4   OlmoeConfig.__post_init__S   s-    ##+'+'?'?D$''    )r   ),__name__
__module____qualname____firstlineno____doc__
model_typekeys_to_ignore_at_inferenceattribute_mapbase_model_tp_planr   int__annotations__r   r   r   r   r   r   strr   r   floatr    r!   boolr#   r$   r%   listr&   r'   r   dictr(   r)   r*   r,   r   r.   r/   r0   r4   __static_attributes____classcell__)r7   s   @r8   r
   r
      ss   & J#4"5(-8M &=%<%<%:-=*3 0 JK!s!s!!&*t*J#'S'#u#L%It L#* #L#*#+0L#S	/D(0 %%48O^d*T18 ND %(us{(!Hedl!  K!&$&"&%& ND ( (r:   r
   N)r?   huggingface_hub.dataclassesr   configuration_utilsr   modeling_rope_utilsr   utilsr   r
   __all__r2   r:   r8   <module>rR      sJ      . 3 1 # 56?(" ?(  7?(D /r:   