
    Z j9                     l    S SK Jr  SSKJr  SSKJr  SSKJr  \" SS9\ " S S	\5      5       5       rS	/r	g
)    )strict   )PreTrainedConfig)RopeParameters)auto_docstringz!swiss-ai/Apertus-8B-Instruct-2509)
checkpointc            	         ^  \ rS rSr% SrSrS/rSrSSSSSSSSS	.rS
/S/4SS/S/4S/S/4S.r	Sr
\\S'   Sr\\S'   Sr\\S'   Sr\\S'   Sr\\S'   Sr\S-  \S'   Sr\\S'   Sr\\S'   Sr\\S'   S r\\S!'   S"r\\S#'   S$r\S-  \S%'   S&r\S-  \S''   S(r\\\   -  S-  \S)'   S*r\\S+'   Sr\ \!-  S-  \S,'   S*r"\\S-'   S.r#\\-  \S/'   U 4S0 jr$S1r%U =r&$ )2ApertusConfig   aV  
```python
>>> from transformers import ApertusModel, ApertusConfig

>>> # Initializing a Apertus-8B style configuration
>>> configuration = ApertusConfig()

>>> # Initializing a model from the Apertus-8B style configuration
>>> model = ApertusModel(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
```apertuspast_key_values    `fAcolwisereplicated_with_grad_allreducerowwise)zlayers.*.self_attn.q_projzlayers.*.self_attn.k_projzlayers.*.self_attn.v_projzlayers.*.self_attn.q_normzlayers.*.self_attn.k_normzlayers.*.self_attn.o_projzlayers.*.mlp.up_projzlayers.*.mlp.down_proj	input_idsinputs_embedshidden_statesattention_mask)embed_tokenslayersnormi   
vocab_sizei   hidden_sizei 8  intermediate_size    num_hidden_layersnum_attention_headsNnum_key_value_headsxielu
hidden_acti   max_position_embeddingsg{Gz?initializer_rangegh㈵>rms_norm_epsT	use_cacher   pad_token_id   bos_token_id   eos_token_idFtie_word_embeddingsrope_parametersattention_biasg        attention_dropoutc                    > U R                   c  U R                  U l         U R                  c  SSSSSSS.U l        [        TU ]  " S0 UD6  g )	Nllama3r   g       @i    g      ?g      @)	rope_type
rope_thetafactor original_max_position_embeddingslow_freq_factorhigh_freq_factor )r   r   r,   super__post_init__)selfkwargs	__class__s     ڂ/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/apertus/configuration_apertus.pyr9   ApertusConfig.__post_init__S   sW    ##+'+'?'?D$'%(48#&$'$D  	''    )r   r,   )'__name__
__module____qualname____firstlineno____doc__
model_typekeys_to_ignore_at_inferencedefault_thetabase_model_tp_planbase_model_pp_planr   int__annotations__r   r   r   r   r   r!   strr"   r#   floatr$   r%   boolr&   r(   r*   listr+   r,   r   dictr-   r.   r9   __static_attributes____classcell__)r<   s   @r=   r
   r
      sb    J#4"5M%.%.%.%E%E%. )"+	 &(9:#%568IJ!"_$56 JK"s"s!!&*t*J#(S(#u#L%It L#*  L#* +,L#S	/D(, %%48O^d*T18 ND %(us{(( (r?   r
   N)
huggingface_hub.dataclassesr   configuration_utilsr   modeling_rope_utilsr   utilsr   r
   __all__r7   r?   r=   <module>rX      sL   * / 3 1 # >?B($ B(  @B(J 
r?   