
    Z jN                        S r SSKJr  SSKJrJrJr  SSKrSSKJ	s  J
r  SSKJr  SSKJ	r	  SSKJr  SS	KJrJr  SS
KJr  SSKJrJr  SSKJr  SSKJr  SSKJr  SSKJ r   SSK!J"r"  SSK#J$r$J%r%  SSK&J'r'  SSK(J)r)  SSK*J+r+  SSK,J-r-  SSK.J/r/J0r0  SSK1J2r2  SSK3J4r4J5r5J6r6J7r7  SSK8J9r9J:r:  SSK;J<r<J=r=  \%R|                  " \?5      r@\$" SS9\ " S S\25      5       5       rA " S  S!\75      rB " S" S#\+5      rC " S$ S%\55      rD " S& S'\95      rE " S( S)\<5      rF " S* S+\=5      rG " S, S-\)5      rH " S. S/\-5      rI " S0 S1\65      rJ " S2 S3\/5      rK " S4 S5\45      rL/ S6QrMg)7zPyTorch Laguna model.    )Callable)AnyLiteralOptionalN)strict)nn   )initialization)CacheDynamicCache)PreTrainedConfig)create_causal_mask!create_sliding_window_causal_mask)FlashAttentionKwargs)MoeModelOutputWithPast)ROPE_INIT_FUNCTIONS)ALL_ATTENTION_FUNCTIONS)Unpack)auto_docstringlogging)TransformersKwargs   )AfmoeAttention)Gemma3RotaryEmbedding)Glm4MoeLiteDecoderLayer)
LlamaModeleager_attention_forward)Qwen2MoeConfig)Qwen2MoeForCausalLMQwen2MoeMLPQwen2MoePreTrainedModelQwen2MoeRMSNorm)Qwen3_5MoeTopKRouterapply_rotary_pos_emb)Qwen3MoeExpertsQwen3MoeSparseMoeBlockzpoolside/laguna-XS.2)
checkpointc                      \ rS rSr% SrSr0 SS_SS_SS_SS_S	S
_SS_SS_SS_SS_SS
_SS_SS
_SS_SS_SS_SS
_rSr\\	S'   Sr
\\	S'   Sr\\	S'   Sr\\	S'   S r\\	S!'   S"r\\	S#'   S$r\\	S%'   S r\\	S&'   S'r\\	S('   S'r\\	S)'   S'r\\	S*'   S+r\\	S,'   S-r\\	S.'   S/r\\   S/-  \	S0'   S/r\\   S/-  \	S1'   S2r\\	S3'   S-r\\	S4'   S5r\\	S6'   \" 5       r \" 5       r!\" 5       r"\" 5       r#\" 5       r$\" 5       r%S7 r&S8 r'S9 r(S:r)g/);LagunaConfig0   uB  
num_attention_heads_per_layer (`list[int]`, *optional*):
    Per-layer override for ``num_attention_heads``. Length must equal ``num_hidden_layers``.
mlp_layer_types (`list[str]`, *optional*):
    Per-layer MLP type — ``"dense"`` or ``"sparse"``. Length must equal
    ``num_hidden_layers``. Defaults to first layer dense, rest sparse.
moe_routed_scaling_factor (`float`, *optional*, defaults to 1.0):
    Scalar applied to routed-expert output before combining with the shared-expert output.
moe_apply_router_weight_on_input (`bool`, *optional*, defaults to `False`):
    Whether to apply router weights to the MoE input rather than the output. Not supported
    in transformers yet; ``True`` will raise a ``NotImplementedError`` for now.
moe_router_logit_softcapping (`float`, *optional*, defaults to 0.0):
    Scaling factor when applying tanh softcapping on the logits of the MoE router logits.

Example:

```python
>>> from transformers import LagunaModel, LagunaConfig

>>> configuration = LagunaConfig()
>>> model = LagunaModel(configuration)
>>> configuration = model.config
```
lagunazlayers.*.self_attn.q_projcolwisezlayers.*.self_attn.k_projzlayers.*.self_attn.v_projzlayers.*.self_attn.g_projzlayers.*.self_attn.o_projrowwisezlayers.*.self_attn.q_normreplicated_with_grad_allreducezlayers.*.self_attn.k_normzlayers.*.mlp.gate_projzlayers.*.mlp.up_projzlayers.*.mlp.down_projz!layers.*.mlp.experts.gate_up_projpacked_colwisezlayers.*.mlp.experts.down_projzlayers.*.mlp.expertsmoe_tp_expertsz%layers.*.mlp.shared_experts.gate_projz#layers.*.mlp.shared_experts.up_projz%layers.*.mlp.shared_experts.down_proji  
vocab_sizei    intermediate_size(   num_hidden_layersnum_attention_heads   num_key_value_headsi   max_position_embeddings   num_expertsnum_experts_per_toki   moe_intermediate_sizeshared_expert_intermediate_sizesliding_window   head_dimFattention_biasNnum_attention_heads_per_layermlp_layer_types      ?moe_routed_scaling_factor moe_apply_router_weight_on_input        moe_router_logit_softcappingc                 l   U R                   c  S/U R                  -  U l         U R                  c  S/S/U R                  S-
  -  -   U l        U R                  c  U R                  /U R                  -  U l        SSSS.SS	S
S.S.nU R
                  c  X l        [        R                  " U 40 UDSSS10D6  g )Nfull_attentiondensesparse   defaultg    Ag      ?)	rope_type
rope_thetapartial_rotary_factorg     @rD   rJ   sliding_attentionignore_keys_at_rope_validationrS   )layer_typesr4   rC   rB   r5   rope_parametersr   __post_init__)selfkwargsdefault_rope_paramss      z/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/laguna/modular_laguna.pyrW   LagunaConfig.__post_init__   s    # 01D4J4JJD'$+9zT=S=SVW=W/X#XD --5262J2J1KdNdNd1dD. -6Xhkl/8jm!ne
 '#6  	&&	
	
<OQa;b	
    c                     U$ N )rX   rY   s     r[   convert_rope_params_to_dict(LagunaConfig.convert_rope_params_to_dict   s    r]   c                 B   U R                   (       a  [        S5      eU R                  bR  [        U R                  5      U R                  :w  a/  [        S[        U R                  5       SU R                   S35      e[        U R                  5      U R                  :w  a/  [        S[        U R                  5       SU R                   S35      e[        U R                  5      U R                  :w  a/  [        S[        U R                  5       SU R                   S35      eg)z'Part of ``@strict``-powered validation.zhmoe_apply_router_weight_on_input=True is not yet supported in the transformers implementation of Laguna.Nz&num_attention_heads_per_layer length (z ) must equal num_hidden_layers (z).zlayer_types length (zmlp_layer_types length ()rF   NotImplementedErrorrB   lenr4   
ValueErrorrU   rC   )rX   s    r[   validate_architecture"LagunaConfig.validate_architecture   s,   00%9 
 ..:D6674;Q;QQ8T=_=_9`8a b1151G1G0HL  t D$:$::&s4+;+;'<&= >1151G1G0HL  t##$(>(>>*3t/C/C+D*E F1151G1G0HL  ?r]   )rU   rC   rB   rV   )*__name__
__module____qualname____firstlineno____doc__
model_typebase_model_tp_planr1   int__annotations__r2   r4   r5   r7   r8   r:   r;   r<   r=   r>   r@   rA   boolrB   listrC   strrE   floatrF   rH   AttributeErrordecoder_sparse_stepmlp_only_layersqkv_biasnorm_topk_probuse_sliding_windowmax_window_layersrW   ra   rg   __static_attributes__r`   r]   r[   r)   r)   0   s   2 J#Y#Y 	$Y 	$Y	
 	$Y 	$%E 	$%E 	!) 		 	!) 	,-= 	)) 	 0 	0 	.y  	0!& J!s!s!!  #)S)K  !$3$+.#S.NC Hc ND 6:!49t#3:(,OT#Y%,'*u*-2$d2*- %- )*$&OH#%N')&(
(r]   r)   c                       \ rS rSrSrg)LagunaRMSNorm   r`   Nri   rj   rk   rl   r}   r`   r]   r[   r   r          r]   r   c                      ^  \ rS rSrS\4U 4S jjr\    SS\S-  S\S   S\S-  S\	S-  S	\
S
\4   4
S jj5       rSrU =r$ )LagunaRotaryEmbedding   configc                 $   > [         TU ]  U5        g r_   )super__init__rX   r   	__class__s     r[   r   LagunaRotaryEmbedding.__init__   s     r]   Ndeviceztorch.deviceseq_len
layer_typereturnztorch.Tensorc           	      v   U R                   U   S   nU R                   U   R                  SS5      n[        U SS5      =(       d    U R                  U R                  -  n[        Xe-  5      nSnSU[        R                  " SUS[        R                  S9R                  U[        R                  S	9U-  -  -  n	X4$ )
a  
Computes the inverse frequencies according to the original RoPE implementation
Args:
    config ([`~transformers.PreTrainedConfig`]):
        The model configuration.
    device (`torch.device`):
        The device to use for initialization of the inverse frequencies.
    seq_len (`int`, *optional*):
        The current sequence length. Unused for this type of RoPE.
    layer_type (`str`, *optional*):
        The current layer type if the model has different RoPE parameters per type.
        Should not be used unless `config.layer_types is not None`
Returns:
    Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
    post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
rP   rQ   rD   r@   Nr   r   )dtype)r   r   )rV   getgetattrhidden_sizer5   rp   torcharangeint64toru   )
r   r   r   r   baserQ   r@   dimattention_factorinv_freqs
             r[   compute_default_rope_parameters5LagunaRotaryEmbedding.compute_default_rope_parameters   s    . %%j1,? & 6 6z B F FG^`c d6:t4h8J8JfNhNh8h(23 U\\!S!5;;?BB&X]XcXcBdgjjk
 ))r]   r`   )NNNN)ri   rj   rk   rl   r)   r   staticmethodr   rp   rt   tupleru   r   r}   __classcell__r   s   @r[   r   r      sy    !| ! &*+/"!%	"*t#"*("* t"* $J	"*
 
~u$	%"* "*r]   r   c                       \ rS rSrSrg)	LagunaMLP   r`   Nr   r`   r]   r[   r   r      r   r]   r   c                      ^  \ rS rSrU 4S jrS\R                  S\\R                  \R                  \R                  4   4S jrSr	U =r
$ )LagunaTopKRouter   c                    > [         TU ]  5         [        R                  " [        R
                  " UR                  5      SS9U l        UR                  U l	        g )NF)requires_grad)
r   r   r   	Parameterr   zerosr:   e_score_correction_biasrH   router_logit_softcappingr   s     r[   r   LagunaTopKRouter.__init__   s?    ')||EKK@R@R4Sch'i$(.(K(K%r]   hidden_statesr   c                 V   UR                  SU R                  5      n[        R                  " XR                  5      R                  5       nU R                  S:  a/  [        R                  " X R                  -  5      U R                  -  n[        R                  " U5      nX0R                  R                  UR                  5      -   n[        R                  " X@R                  SS9u  pVUR                  SU5      nXwR!                  SSS9-  nUR                  UR                  5      nX'U4$ )NrG   )r   T)r   keepdim)reshape
hidden_dimFlinearweightru   r   r   tanhsigmoidr   r   r   topktop_kgathersum)rX   r   router_logitsrouting_scoresscores_for_selection_selected_expertsrouting_weightss           r[   forwardLagunaTopKRouter.forward   s     &--b$//B<BBD((3.!JJ}7T7T'TUX\XuXuuM}5-0L0L0O0OP^PdPd0ee#jj)=zzrR(//4DE),?,?BPT,?,UU),,]-@-@A/???r]   )r   r   )ri   rj   rk   rl   r   r   Tensorr   r   r}   r   r   s   @r[   r   r      sE    L
@||@ 
u||U\\5<<7	8@ @r]   r   c                       \ rS rSrSrg)LagunaExperts   r`   Nr   r`   r]   r[   r   r      r   r]   r   c                   j   ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ )LagunaSparseMoeBlocki  r   c                 v   > [         TU ]  U5        [        XR                  S9U l        UR
                  U l        g )Nr2   )r   r   r   r=   shared_expertsrE   routed_scaling_factorr   s     r[   r   LagunaSparseMoeBlock.__init__  s1     'BhBhi%+%E%E"r]   r   r   c                     UR                   u  p#nUR                  SU5      nU R                  U5      nU R                  U5      u  pgnU R	                  XU5      nXR
                  -  nX-   nUR                  X#U5      nU$ )Nr   )shapeviewr   gateexpertsr   r   )	rX   r   
batch_sizesequence_lengthr   shared_outputr   r   r   s	            r[   r   LagunaSparseMoeBlock.forward	  s    2?2E2E/
Z%**2z:++M:/3yy/G,,]oV%(B(BB%5%--j:Vr]   )r   r   )ri   rj   rk   rl   r)   r   r   r   r   r}   r   r   s   @r[   r   r     s1    F| F
U\\ ell  r]   r   c                     ^  \ rS rSrSrS\S\S\4U 4S jjr SS\R                  S	\
\R                  \R                  4   S
\R                  S-  S\S-  S\\   S\
\R                  \R                  S-  4   4S jjrSrU =r$ )LagunaAttentioni  zSAfmoe-style SWA/GQA attention with Laguna-specific gating and per-layer head count.r   	layer_idx	num_headsc                   > X0l         [        TU ]	  X5        U R                   UR                  -  U l        [
        R                  " UR                  U R                   U R                  -  UR                  S9U l
        [
        R                  " U R                   U R                  -  UR                  UR                  S9U l        U ?[
        R                  " UR                  U R                   SS9U l        g )N)biasF)r   r   r   r7   num_key_value_groupsr   Linearr   r@   rA   q_projo_proj	gate_projg_proj)rX   r   r   r   r   s       r[   r   LagunaAttention.__init__  s    "+$(NNf6P6P$P!ii 2 2DNNT]]4RY_YnYnoii >@R@RY_YnYno Nii 2 2DNNOr]   Nr   position_embeddingsattention_maskpast_key_valuesrY   r   c                    UR                   S S n/ UQSPU R                  P7nU R                  U5      R                  U5      nU R	                  U5      R                  U5      n	U R                  U5      R                  U5      n
U R                  U5      R                  SS5      nU R                  U	5      R                  SS5      n	U
R                  SS5      n
Uu  p[        XX5      u  pUb  UR                  XU R                  5      u  p[        R                  " U R                  R                  [         5      nU" U UU	U
U4U R"                  (       d  SOU R$                  U R&                  U R(                  S.UD6u  pUR*                  " / UQSP76 R-                  5       n[.        R0                  " U R3                  U5      R5                  5       5      R7                  UR8                  5      nUR                  " / UQSPU R                  P76 UR;                  S5      -  R                  " / UQSP76 nU R=                  U5      nX4$ )Nr   rM   r   rG   )dropoutscalingr>   )r   r@   r   r   k_projv_projq_norm	transposek_normr$   updater   r   get_interfacer   _attn_implementationr   trainingattention_dropoutr   r>   r   
contiguousr   softplusr   ru   r   r   	unsqueezer   )rX   r   r   r   r   rY   input_shapehidden_shapequery_states
key_statesvalue_statescossinattention_interfaceattn_outputattn_weightsr   s                    r[   r   LagunaAttention.forward)  s    $))#2.88b8$--8{{=166|D[[/44\B
{{=166|D{{<0::1a@[[,66q!<
#--a3&#7RU#[ &'6'='=jX\XfXf'g$J(?(M(MKK,,.E)
 %8
%
  $}}C$2H2HLL..
%
 
%
! "));;;;FFHzz$++m4::<=@@ARARS"''HHbH$--H4>>Z\K]]ccuepurtukk+.((r]   )r   r   r   r   r   r_   )ri   rj   rk   rl   rm   r)   rp   r   r   r   r   r   r   r   r   r}   r   r   s   @r[   r   r     s    ]P| P P P& )-.)||.) #5<<#=>.) t+	.)
 .) -..) 
u||U\\D00	1.) .)r]   r   c                   &    \ rS rSrS\S\4S jrSrg)LagunaDecoderLayeriZ  r   r   c                    [         R                  R                  U 5        UR                  U l        [	        XUR
                  U   5      U l        UR                  U   S:X  a  [        U5      U l	        O[        XR                  S9U l	        [        UR                  UR                  S9U l        [        UR                  UR                  S9U l        g )NrL   r   )eps)r   Moduler   r   r   rB   	self_attnrC   r   mlpr   r2   r   rms_norm_epsinput_layernormpost_attention_layernorm)rX   r   r   s      r[   r   LagunaDecoderLayer.__init__[  s    
		4 !--(F<`<`aj<kl!!),8+F3DH ;S;STDH,V-?-?VEXEXY(5f6H6HfNaNa(b%r]   )r   r  r  r  r  N)ri   rj   rk   rl   r)   rp   r   r}   r`   r]   r[   r  r  Z  s    	c| 	c 	cr]   r  c                   P   ^  \ rS rSr\R
                  " 5       U 4S j5       rSrU =r$ )LagunaPreTrainedModelig  c                 $  > [         TU ]  U5        [        U[        5      (       a4  [        R
                  R                  R                  UR                  5        g [        U[        5      (       a  UR                   H  nUR                  nUR                  U   S:w  a  [        UR                  U      nU" UR                  US9u  pE[        R                  " [!        X S35      U5        [        R                  " [!        X S35      U5        M     g g )NrN   )r   	_inv_freq_original_inv_freq)r   _init_weights
isinstancer   r   r   initzeros_r   r   rU   r   rO   r   r   copy_r   )rX   moduler   rope_init_fncurr_inv_freqr   r   s         r[   r  #LagunaPreTrainedModel._init_weightsh  s    f%f.//HHMM  !?!?@ 566$00
%EE##J/9<#6v7G7G
7S#TL#/*#U 

76\+CDmT

76\9K+LM}] 1 7r]   r`   )	ri   rj   rk   rl   r   no_gradr  r}   r   r   s   @r[   r  r  g  s    
]]_^ ^r]   r  c                       \ rS rSr      SS\R
                  S-  S\R                  S-  S\R
                  S-  S\S-  S\R                  S-  S\	S-  S	\
\   S
\4S jjrSrg)LagunaModeliw  N	input_idsr   position_idsr   inputs_embeds	use_cacherY   r   c           	        ^ US L US L-  (       a  [        S5      eUc  U R                  U5      nU(       a  Uc  [        U R                  S9nUcU  Ub  UR	                  5       OSn[
        R                  " UR                  S   UR                  S9U-   nUR                  S5      n[        U=n	[        5      (       dR  U R                  UUUUS.mU4S jU4S jS	.n
0 n	[        U R                  R                  5       H  nX   " 5       X'   M     Un0 n[        U R                  R                  5       H  nU R                  XU5      X'   M     [        U R                   S U R                  R"                   5       HE  u  pU" U4XR                  R                  U      XR                  R                  U      UUS
.UD6nMG     U R%                  U5      n['        UU(       a  US9$ S S9$ )Nz:You must specify exactly one of input_ids or inputs_embeds)r   r   rM   )r   )r   r%  r   r   r$  c                     > [        S0 T D6$ Nr`   )r   mask_kwargss   r[   <lambda>%LagunaModel.forward.<locals>.<lambda>  s    *<*K{*Kr]   c                     > [        S0 T D6$ r)  )r   r*  s   r[   r,  r-    s    -N-]Q\-]r]   rR   )r   r   r$  r   )last_hidden_stater   )rf   embed_tokensr   r   get_seq_lengthr   r   r   r   r   r  dictsetrU   
rotary_emb	enumeratelayersr4   normr   )rX   r#  r   r$  r   r%  r&  rY   past_seen_tokenscausal_mask_mappingmask_creation_functionsr   r   r   idecoder_layerr+  s                   @r[   r   LagunaModel.forwardx  s    -t";<YZZ  --i8M0*$++>OCRC^==?de <<(;(;A(>}G[G[\_ooL'11!4L?-FF++!."0#2 ,K #L%]'# #%!$++"9"9:
2I2U2W#/ ; & dkk556J.2oom[e.f+ 7 !*$++6U8U8U*V WA)2;;3J3J13MN$78O8OPQ8R$S) / M !X 		-0%+/8O
 	
>B
 	
r]   r`   )NNNNNN)ri   rj   rk   rl   r   
LongTensorr   r   FloatTensorrr   r   r   r   r   r}   r`   r]   r[   r"  r"  w  s     .2.204(,26!%<
##d*<
 t+<
 &&-	<

 <
 ((4/<
 $;<
 +,<
 
 <
 <
r]   r"  c                   (   ^  \ rS rSrU 4S jrSrU =r$ )LagunaForCausalLMi  c                 $   > [         TU ]  " S0 UD6$ )a  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
r`   )r   r   )rX   super_kwargsr   s     r[   r   LagunaForCausalLM.forward  s     w...r]   r`   )ri   rj   rk   rl   r   r}   r   r   s   @r[   rA  rA    s    / /r]   rA  )r)   rA  r"  r  )Nrm   collections.abcr   typingr   r   r   r   torch.nn.functionalr   
functionalr   huggingface_hub.dataclassesr    r
   r  cache_utilsr   r   configuration_utilsr   masking_utilsr   r   modeling_flash_attention_utilsr   modeling_outputsr   modeling_rope_utilsr   modeling_utilsr   processing_utilsr   utilsr   r   utils.genericr   afmoe.modeling_afmoer   gemma3.modeling_gemma3r   $glm4_moe_lite.modeling_glm4_moe_liter   llama.modeling_llamar   r   !qwen2_moe.configuration_qwen2_moer   qwen2_moe.modeling_qwen2_moer   r    r!   r"    qwen3_5_moe.modeling_qwen3_5_moer#   r$   qwen3_moe.modeling_qwen3_moer%   r&   
get_loggerri   loggerr)   r   r   r   r   r   r   r   r  r  r"  rA  __all__r`   r]   r[   <module>r`     s7    $ ) )    .  & . 3 R B 6 6 5 & , / 1 : J F > u u Y R 
		H	% 12}> }  3}@	O 	'*1 '*T	 	@+ @6	O 	1 *?)n ?)D
c0 
c^3 ^ =
* =
@/+ /r]   