
    Z j                     |    S SK JrJr  S SKJr  SSKJr  SSKJr  SSK	J
r
  \
" SS9\ " S	 S
\5      5       5       rS
/rg)    )AnyLiteral)strict   )PreTrainedConfig)RopeParameters)auto_docstringzpoolside/laguna-XS.2)
checkpointc                     ^  \ rS rSr% SrSrS/r0 SS_SS_SS_S	S_S
S_SS_SS_SS_SS_SS_SS_SS_SS_SS_SS_SS_rS/S/4SS/S/4S/S/4S.rSr	\
\S '   S!r\
\S"'   S#r\
\S$'   S%r\
\S&'   S'r\
\S('   S)r\
\S*'   S+r\\S,'   S-r\
\S.'   S/r\\S0'   S1r\\S2'   S3r\\S4'   S5r\\S6'   S7r\\-  S7-  \S8'   S9r\
\S:'   S;r\\
-  \S<'   S9r\
\S='   S9r \
\S>'   S)r!\
\S?'   S@r"\
\SA'   S5r#\\SB'   SCr$\\SD'   S7r%\&\   S7-  \SE'   S7r'\
S7-  \SF'   S7r(\
S7-  \SG'   S7r)\
\&\
   -  S7-  \SH'   SIr*\
\SJ'   S5r+\\SK'   S7r,\&\
   S7-  \SL'   S7r-\&\   S7-  \SM'   SNr.\\SO'   S5r/\\SP'   S;r0\\SQ'   U 4SR jr1SS r2ST r3SUr4U =r5$ )VLagunaConfig   uB  
num_attention_heads_per_layer (`list[int]`, *optional*):
    Per-layer override for ``num_attention_heads``. Length must equal ``num_hidden_layers``.
mlp_layer_types (`list[str]`, *optional*):
    Per-layer MLP type — ``"dense"`` or ``"sparse"``. Length must equal
    ``num_hidden_layers``. Defaults to first layer dense, rest sparse.
moe_routed_scaling_factor (`float`, *optional*, defaults to 1.0):
    Scalar applied to routed-expert output before combining with the shared-expert output.
moe_apply_router_weight_on_input (`bool`, *optional*, defaults to `False`):
    Whether to apply router weights to the MoE input rather than the output. Not supported
    in transformers yet; ``True`` will raise a ``NotImplementedError`` for now.
moe_router_logit_softcapping (`float`, *optional*, defaults to 0.0):
    Softcap applied to the MoE router logits via the usual tanh formulation,
    ``logits = softcap * tanh(logits / softcap)``; ``0.0`` disables softcapping.

Example:

```python
>>> from transformers import LagunaModel, LagunaConfig

>>> configuration = LagunaConfig()
>>> model = LagunaModel(configuration)
>>> configuration = model.config
```
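
One hypothetical way to drive the per-layer overrides (a sketch: the 4-layer
shape and the head counts are illustrative, not shipped defaults; list lengths
must match ``num_hidden_layers``):

```python
>>> configuration = LagunaConfig(
...     num_hidden_layers=4,
...     mlp_layer_types=["dense", "sparse", "sparse", "sparse"],
...     num_attention_heads_per_layer=[48, 48, 48, 48],
... )
```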
    """

    model_type = "laguna"
    keys_to_ignore_at_inference = ["past_key_values"]
    # Repeated plan values were stored as back-references in the artifact; the
    # colwise/rowwise assignments below follow the recovered first occurrences.
    base_model_tp_plan = {
        "layers.*.self_attn.q_proj": "colwise",
        "layers.*.self_attn.k_proj": "colwise",
        "layers.*.self_attn.v_proj": "colwise",
        "layers.*.self_attn.g_proj": "colwise",
        "layers.*.self_attn.o_proj": "rowwise",
        "layers.*.self_attn.q_norm": "replicated_with_grad_allreduce",
        "layers.*.self_attn.k_norm": "replicated_with_grad_allreduce",
        "layers.*.mlp.gate_proj": "colwise",
        "layers.*.mlp.up_proj": "colwise",
        "layers.*.mlp.down_proj": "rowwise",
        "layers.*.mlp.experts.gate_up_proj": "packed_colwise",
        "layers.*.mlp.experts.down_proj": "rowwise",
        "layers.*.mlp.experts": "moe_tp_experts",
        "layers.*.mlp.shared_experts.gate_proj": "colwise",
        "layers.*.mlp.shared_experts.up_proj": "colwise",
        "layers.*.mlp.shared_experts.down_proj": "rowwise",
    }
    base_model_pp_plan = {
        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
        "norm": (["hidden_states"], ["hidden_states"]),
    }

    # NOTE: defaults tagged "placeholder" could not be recovered from the
    # artifact and are illustrative assumptions, not the shipped values.
    vocab_size: int = 131072  # placeholder
    hidden_size: int = 4096  # placeholder
    intermediate_size: int = 8192  # placeholder
    num_hidden_layers: int = 40
    num_attention_heads: int = 48
    num_key_value_heads: int = 8  # placeholder
    hidden_act: str = "silu"
    max_position_embeddings: int = 32768  # placeholder
    initializer_range: float = 0.02
    rms_norm_eps: float = 1e-6
    use_cache: bool = True
    tie_word_embeddings: bool = False
    rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None
    sliding_window: int = 4096  # placeholder
    attention_dropout: float = 0.0
    moe_intermediate_size: int = 1024  # placeholder
    shared_expert_intermediate_size: int = 1024  # placeholder
    num_experts_per_tok: int = 8  # placeholder
    num_experts: int = 64  # placeholder
    output_router_logits: bool = False
    router_aux_loss_coef: float = 0.001
    layer_types: list[str] | None = None
    pad_token_id: int | None = None
    bos_token_id: int | None = None
    eos_token_id: int | list[int] | None = None
    head_dim: int = 128  # placeholder
    attention_bias: bool = False
    num_attention_heads_per_layer: list[int] | None = None
    mlp_layer_types: list[Literal["dense", "sparse"]] | None = None
    moe_routed_scaling_factor: float = 1.0
    moe_apply_router_weight_on_input: bool = False
    moe_router_logit_softcapping: float = 0.0

    def __post_init__(self, **kwargs):
        if self.layer_types is None:
            self.layer_types = ["full_attention"] * self.num_hidden_layers
        if self.mlp_layer_types is None:
            self.mlp_layer_types = ["dense"] + ["sparse"] * (self.num_hidden_layers - 1)
        if self.num_attention_heads_per_layer is None:
            self.num_attention_heads_per_layer = [self.num_attention_heads] * self.num_hidden_layers
        # The rope thetas and rotary factors below are placeholders; the artifact
        # only pins the dict shape ("rope_type" / "rope_theta" /
        # "partial_rotary_factor" per layer type).
        default_rope_params = {
            "full_attention": {"rope_type": "default", "rope_theta": 10_000_000.0, "partial_rotary_factor": 1.0},
            "sliding_attention": {"rope_type": "default", "rope_theta": 10_000.0, "partial_rotary_factor": 1.0},
        }
        if self.rope_parameters is None:
            self.rope_parameters = default_rope_params
        # The exact ignored-key set is an assumption; the artifact only shows that
        # one rope key is excluded from validation.
        super().__post_init__(**kwargs, ignore_keys_at_rope_validation={"partial_rotary_factor"})

    def convert_rope_params_to_dict(self, **kwargs) -> dict[str, Any]:
        # Laguna already keeps rope parameters keyed by layer type, so this hook
        # is the trivial passthrough the artifact compiles to.
        return kwargs

    def validate_architecture(self):
        """Part of ``@strict``-powered validation."""
        if self.moe_apply_router_weight_on_input:
            raise NotImplementedError(
                "moe_apply_router_weight_on_input=True is not yet supported in the transformers implementation of"
                " Laguna."
            )
        if (
            self.num_attention_heads_per_layer is not None
            and len(self.num_attention_heads_per_layer) != self.num_hidden_layers
        ):
            raise ValueError(
                f"num_attention_heads_per_layer length ({len(self.num_attention_heads_per_layer)}) must equal"
                f" num_hidden_layers ({self.num_hidden_layers})."
            )
        if len(self.layer_types) != self.num_hidden_layers:
            raise ValueError(
                f"layer_types length ({len(self.layer_types)}) must equal num_hidden_layers"
                f" ({self.num_hidden_layers})."
            )
        if len(self.mlp_layer_types) != self.num_hidden_layers:
            raise ValueError(
                f"mlp_layer_types length ({len(self.mlp_layer_types)}) must equal num_hidden_layers"
                f" ({self.num_hidden_layers})."
            )


__all__ = ["LagunaConfig"]