
    Z jn                        S SK r S SKJr  S SK Jr  SSKJr  SSKJr  SSK	J
r
Jr  SS	KJr  SS
KJr  SSKJrJrJrJrJrJr  SSKJr  \R2                  " \5      r\
" SS9\ " S S\5      5       5       r " S S\5      r " S S\5      r " S S\5      r " S S\5      r  " S S\5      r! " S S\\5      r" " S S\5      r# " S S \5      r$/ S!Qr%g)"    N)strict)nn   )initialization)PreTrainedModel)auto_docstringlogging   )DeepseekV3Config)DeepseekV3Attention)LlamaDecoderLayerLlamaForCausalLM
LlamaModelLlamaPreTrainedModelLlamaRMSNormLlamaRotaryEmbedding)Qwen3MLPztencent/Youtu-LLM-2B)
checkpointc                     ^  \ rS rSr% SrSrSSSS.r0 rSr\	\
S'   S	r\	\
S
'   Sr\	\
S'   Sr\	\
S'   Sr\	\
S'   Sr\	\
S'   Sr\	\
S'   Sr\S-  \
S'   Sr\S-  \
S'   Sr\	S-  \
S'   Sr\	S-  \
S'   Sr\	\\	   -  S-  \
S'   Sr\\
S'   \" 5       r\" 5       r\" 5       r\" 5       r\" 5       r\" 5       r \" 5       r!\" 5       r"\" 5       r#\" 5       r$U 4S jr%Sr&U =r'$ ) YoutuConfig,   a  
rope_interleave (`bool`, *optional*, defaults to `True`):
    Whether to interleave the rotary position embeddings.
embedding_initializer_range (`float`, *optional*):
    The standard deviation of the truncated_normal_initializer for initializing all embedding matrices.

```python
>>> from transformers import YoutuModel, YoutuConfig
>>> # Initializing a Youtu-LLM-2B style configuration
>>> configuration = YoutuConfig()
>>> # Accessing the model configuration
>>> configuration = model.config
```youtucolwiserowwise)zlayers.*.mlp.gate_projzlayers.*.mlp.up_projzlayers.*.mlp.down_proji  
vocab_sizei   hidden_sizei   intermediate_size    num_hidden_layers   num_attention_headsnum_key_value_headsi   max_position_embeddingsNinitializer_rangeembedding_initializer_rangepad_token_idi  bos_token_idi eos_token_idTtie_word_embeddingsc                    > U R                   c2  U R                  S:w  a  SSU R                  -  S-  -  U l         OSU l         U R                  =(       d    SU R                   -  U l        [        TU ]  " S0 UD6  g )Nr   g       @g      @g      ?{Gz? )r$   r   r%   super__post_init__)selfkwargs	__class__s     x/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/youtu/modular_youtu.pyr.   YoutuConfig.__post_init___   sq    !!)1$),d6F6F0F3/N)N&)-&+/+K+K+ksUYUkUkOk(''    )r%   r$   )(__name__
__module____qualname____firstlineno____doc__
model_typebase_model_tp_planattribute_mapr   int__annotations__r   r   r   r!   r"   r#   r$   floatr%   r&   r'   r(   listr)   boolAttributeErrorn_shared_expertsn_routed_expertsrouted_scaling_factorn_group
topk_groupnum_experts_per_tokfirst_k_dense_replacenorm_topk_probpretraining_tpmoe_intermediate_sizer.   __static_attributes____classcell__)r1   s   @r2   r   r   ,   s0    J"+ )"+
 MJK!s!s!!!!#)S)&*ut|*044#L#*#%L#*%+1L#S	/D(1 $$ &'%'*,G!J(**,#%N#%N*,( (r4   r   c                       \ rS rSrSrg)YoutuRMSNormj   r,   Nr5   r6   r7   r8   rM   r,   r4   r2   rP   rP   j       r4   rP   c                       \ rS rSrSrg)YoutuRotaryEmbeddingn   r,   NrR   r,   r4   r2   rU   rU   n   rS   r4   rU   c                       \ rS rSrSrg)YoutuMLPr   r,   NrR   r,   r4   r2   rX   rX   r   rS   r4   rX   c                       \ rS rSrSrg)YoutuAttentionv   r,   NrR   r,   r4   r2   r[   r[   v   rS   r4   r[   c                       \ rS rSrSrg)YoutuDecoderLayerz   r,   NrR   r,   r4   r2   r^   r^   z   rS   r4   r^   c                   B    \ rS rSr\R
                  " 5       S 5       rSrg)YoutuPreTrainedModel~   c                    [         R                  " X5        [        U R                  SS5      n[        U R                  SSU-  5      n[	        U[
        R                  5      (       af  [        R                  " UR                  SUS9  UR                  b8  [        R                  " UR                  R                  UR                     5        g g g )Nr$   r+   r%   r
   g        )meanstd)r   _init_weightsgetattrconfig
isinstancer   	Embeddinginitnormal_weightpadding_idxzeros_data)r/   modulere   	embed_stds       r2   rf   "YoutuPreTrainedModel._init_weights   s    %%d3dkk#6=DKK)FCP	fbll++LLSi@!!-FMM..v/A/ABC . ,r4   r,   N)r5   r6   r7   r8   torchno_gradrf   rM   r,   r4   r2   ra   ra   ~   s    
]]_D Dr4   ra   c                       \ rS rSrSrg)
YoutuModel   r,   NrR   r,   r4   r2   rw   rw      rS   r4   rw   c                       \ rS rSrSrg)YoutuForCausalLM   r,   NrR   r,   r4   r2   rz   rz      rS   r4   rz   )r   ra   rw   rz   )&rt   huggingface_hub.dataclassesr   r    r   rk   modeling_utilsr   utilsr   r	   %deepseek_v3.configuration_deepseek_v3r    deepseek_v3.modeling_deepseek_v3r   llama.modeling_llamar   r   r   r   r   r   qwen3.modeling_qwen3r   
get_loggerr5   loggerr   rP   rU   rX   r[   r^   ra   rw   rz   __all__r,   r4   r2   <module>r      s   *  .  & - , D B  , 
		H	% 129(" 9(  39(x	< 		/ 		x 		( 		) 		D/ 	D	 		' 	r4   