
    Z jd                     r    S r SSKJr  SSKJr  SSKJr  SSKJr  \\" SSS	9 " S
 S\5      5       5       r	S/r
g)zAFMoE model configuration    )strict   )PreTrainedConfig)RopeParameters)auto_docstringz
    AFMoE is an Adaptive Feedforward MoE (Mixture of Experts) model with token-choice routing, shared experts, and a
    hybrid attention mechanism combining sliding window and full attention patterns.
    zarcee-ai/Trinity-Mini)custom_intro
checkpointc                     ^  \ rS rSr% SrSrS/rS/S/4SS/S/4S/S/4S	.rS
r\	\
S'   Sr\	\
S'   Sr\	\
S'   Sr\	\
S'   Sr\	\
S'   Sr\	S-  \
S'   Sr\	\
S'   Sr\	S-  \
S'   Sr\	S-  \
S'   Sr\\
S'   Sr\	\
S'   S r\\
S!'   S"r\\
S#'   S$r\\
S%'   S&r\\
S''   Sr\\-  S-  \
S('   S)r\	S-  \
S*'   S+r \	S-  \
S,'   S-r!\	S-  \
S.'   S/r"\S-  \
S0'   S&r#\\
S1'   S2r$\	S-  \
S3'   S4r%\	S-  \
S5'   Sr&\'\   S-  \
S6'   S7r(\\	-  S-  \
S8'   S&r)\S-  \
S9'   Sr*\	\'\	   -  S-  \
S:'   Sr+\	S-  \
S;'   Sr,\	S-  \
S<'   S&r-\\
S='   U 4S> jr.S?r/U =r0$ )@AfmoeConfig   a  
global_attn_every_n_layers (`int`, *optional*, defaults to 4):
    The frequency of full attention layers. Every Nth layer will use full attention, while others use sliding
    window attention.
mup_enabled (`bool`, *optional*, defaults to `False`):
    Whether to enable muP (Maximal Update Parametrization) input scaling. When enabled, input embeddings
    are scaled by `sqrt(hidden_size)`.

Example:
```python
>>> from transformers import AfmoeModel, AfmoeConfig

>>> # Initializing an AFMoE configuration
>>> configuration = AfmoeConfig()

>>> # Initializing a model from the afmoe-small-sft-v1 style configuration
>>> model = AfmoeModel(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
```
afmoepast_key_values	input_idsinputs_embedshidden_statesattention_mask)embed_tokenslayersnormi  
vocab_sizei   hidden_sizei   intermediate_sizei  moe_intermediate_size    num_hidden_layers   Nnum_dense_layers   num_attention_headsnum_key_value_heads   head_dimsilu
hidden_acti @  max_position_embeddingsg{Gz?initializer_rangegh㈵>rms_norm_epsT	use_cacheFtie_word_embeddingsrope_parameters@   num_experts   num_experts_per_tok   num_shared_expertsg      ?route_scaleoutput_router_logits   global_attn_every_n_layersi   sliding_windowlayer_typesg        attention_dropoutmup_enabledeos_token_idpad_token_idbos_token_idattention_biasc                    > U R                   cM  [        U R                  5       Vs/ s H'  n[        US-   U R                  -  5      (       a  SOSPM)     snU l         U R
                  c  U R                  U l        [        TU ]   " S0 UD6  g s  snf )Nr   sliding_attentionfull_attention )	r6   ranger   boolr4   r    r   super__post_init__)selfkwargsi	__class__s      ~/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/afmoe/configuration_afmoe.pyrD   AfmoeConfig.__post_init__`   s    # t556 6A (,QUd6U6U,U'V'V#\ll6 D
 ##+'+'?'?D$'' s   .B)r6   r    )1__name__
__module____qualname____firstlineno____doc__
model_typekeys_to_ignore_at_inferencebase_model_pp_planr   int__annotations__r   r   r   r   r   r   r    r"   r$   strr%   r&   floatr'   r(   rB   r)   r*   r   dictr,   r.   r0   r1   r2   r4   r5   r6   listr7   r8   r9   r:   r;   r<   rD   __static_attributes____classcell__)rH   s   @rI   r   r      s   . J#4"5 &(9:#%568IJ!"_$56 JK!s!!%3%s#$cDj$!!&*t*HcDjJ#(S(#u#L%It %%48O^d*T18 Kt &'t'%&d
& #K#!&$&-.d
.!%NC$J%$(KcT!(,/us{T)/$K$+/L#S	/D(/#L#*##L#*# ND 
( 
(    r   N)rO   huggingface_hub.dataclassesr   configuration_utilsr   modeling_rope_utilsr   utilsr   r   __all__r@   r[   rI   <module>ra      sV      . 3 1 #  'K(" K( K(\ /r[   