
    Z j                     l    S SK Jr  SSKJr  SSKJr  SSKJr  \" SS9\ " S S	\5      5       5       rS	/r	g
)    )strict   )PreTrainedConfig)RopeParameters)auto_docstringzSmallDoge/Doge-320M)
checkpointc                   D  ^  \ rS rSr% SrSrS/rSSSSSSSSSSSS	.rS
/S/4SS/S/4S/S/4S.rSr	\
\S'   Sr\
\S'   Sr\
\S'   Sr\
\S'   Sr\\
-  \S'   Sr\\S'   Sr\\S'   Sr\\S'   Sr\\S '   S!r\\S"'   Sr\
\S#'   S$r\\-  S$-  \S%'   S&r\
\S''   S$r\
S$-  \S('   S!r\\S)'   Sr\S$-  \S*'   S!r \\S+'   S$r!\
S$-  \S,'   Sr"\
\S-'   S!r#\\S.'   S/r$\
\S0'   S1r%\
\S2'   S!r&\\S3'   S!r'\\S4'   S5r(\\S6'   S$r)\
S$-  \S7'   S$r*\
S$-  \S8'   S$r+\
\,\
   -  S$-  \S9'   U 4S: jr-S;r.U =r/$ )<
DogeConfig   a  
keep_window_size (`int`, *optional*, defaults to 2048):
    The window size of tokens that are not dynamically masked, and dynamic masking is only performed when the sequence length exceeds this value.
is_moe (`bool`, *optional*, defaults to `False`):
    Whether to use the Cross Domain Mixture of Experts, if `True`, the MoE will inherit the MLP to initialize.

```python
>>> from transformers import DogeConfig, DogeModel

>>> # Initializing a Doge-320M style configuration
>>> configuration = DogeConfig()

>>> # Initializing a model from the Doge-320M style configuration
>>> model = DogeModel(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
```dogepast_key_valuescolwiserowwisecolwise_gather_outputrowwise_split_input)zlayers.*.self_attn.q_projzlayers.*.self_attn.k_projzlayers.*.self_attn.v_projzlayers.*.self_attn.dt_projzlayers.*.self_attn.o_projzlayers.*.mlp.gate_projzlayers.*.mlp.up_projzlayers.*.mlp.down_projzlayers.*.mlp.router_gatezlayers.*.mlp.down_embedzlayers.*.mlp.up_embed	input_idsinputs_embedshidden_statesattention_mask)embed_tokenslayersnormi   
vocab_sizei   hidden_sizei   intermediate_size    num_hidden_layersg        hidden_dropoutsilu
hidden_actg{Gz?initializer_rangegư>rms_norm_epsT	use_cacheFtie_word_embeddingsmax_position_embeddingsNrope_parameters   num_attention_headsnum_key_value_headsattention_biasattention_dropoutmlp_biassliding_windowkeep_window_sizeis_moei @  num_experts@   num_experts_per_toknorm_topk_proboutput_router_logitsgMbP?router_aux_loss_coefpad_token_idbos_token_ideos_token_idc                 b   > U R                   c  U R                  U l         [        TU ]  " S0 UD6  g )N )r)   r(   super__post_init__)selfkwargs	__class__s     |/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/doge/configuration_doge.pyr<   DogeConfig.__post_init__f   s-    ##+'+'?'?D$''    )r)   )0__name__
__module____qualname____firstlineno____doc__
model_typekeys_to_ignore_at_inferencebase_model_tp_planbase_model_pp_planr   int__annotations__r   r   r   r   floatr    strr!   r"   r#   boolr$   r%   r&   r   dictr(   r)   r*   r+   r,   r-   r.   r/   r0   r2   r3   r4   r5   r6   r7   r8   listr<   __static_attributes____classcell__)r?   s   @r@   r
   r
      s   & J#4"5 &/%.%.&/%."+ )"+$;#8!6 &(9:#%568IJ!"_$56 JK!s!s"%NECK%J#u#L%It %%#'S'48O^d*T18  &*t* ND &)ut|)Hd!%NC$J% c FDK!! ND !&$&"'%'#L#*##L#*#+/L#S	/D(/( (rB   r
   N)
huggingface_hub.dataclassesr   configuration_utilsr   modeling_rope_utilsr   utilsr   r
   __all__r:   rB   r@   <module>rZ      sJ   , / 3 1 # 01L(! L(  2L(^ .rB   