
    Z j                     :   S SK Jr  S SKrS SKJr  SSKJr  SSKJr  SSKJ	r	  SSK
Jr  S	S
KJrJrJrJr  SSKJr  \R&                  " \5      r " S S\SS9r " S S\R.                  5      r " S S\5      r " S S\5      r " S S\5      r " S S\5      r/ SQrg)    )	TypedDictN)nn   )ACT2FN)Cache)Unpack)logging   )GraniteMoeDecoderLayerGraniteMoeForCausalLMGraniteMoeModelGraniteMoePreTrainedModel   )GraniteMoeSharedConfigc                       \ rS rSr% Sr\R                  \S'   \R                  \S'   \\S'   \\S'   \R                  \S'   Sr
g	)
GraniteFlashAttentionKwargs$   a   
Keyword arguments for advanced Flash Attention, causal-conv1d, and mamba_ssm kernel usage.
Use cases include padding-free training and fewer `torch.compile` graph breaks.

cu_seq_lens_q (`torch.LongTensor`):
    Gets cumulative sequence length for query state.
cu_seq_lens_k (`torch.LongTensor`):
    Gets cumulative sequence length for key state.
max_length_q (`int`):
    Maximum sequence length for query state.
max_length_k (`int`):
    Maximum sequence length for key state.
seq_idx (`torch.IntTensor):
    Index of each packed sequence.
cu_seq_lens_qcu_seq_lens_kmax_length_qmax_length_kseq_idx N)__name__
__module____qualname____firstlineno____doc__torch
LongTensor__annotations__int	IntTensor__static_attributes__r       ڎ/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/granitemoeshared/modular_granitemoeshared.pyr   r   $   s7      ######__r%   r   F)totalc                   n   ^  \ rS rSrSrS\4U 4S jjrS\R                  S\R                  4S jr	Sr
U =r$ )	GraniteMoeSharedMLP<   zj
MLP layer for shared experts

Args:
    config:
        Configuration object with model hyperparameters.
configc                 X  > [         TU ]  5         UR                  U l        UR                  U l        [
        UR                     U l        [        R                  " U R                  U R                  S-  SS9U l
        [        R                  " U R                  U R                  SS9U l        g )Nr
   F)bias)super__init__hidden_size
input_sizeshared_intermediate_sizer   
hidden_act
activationr   Linearinput_linearoutput_linearselfr+   	__class__s     r&   r/   GraniteMoeSharedMLP.__init__E   s     ,,!:: !2!23IIdoot7G7G!7KRWXYYt'7'7uUr%   hidden_statesreturnc                     U R                  U5      nUR                  SSS9nU R                  US   5      US   -  nU R                  U5      nU$ )Nr
   )dimr   r   )r6   chunkr4   r7   )r9   r<   chunked_hidden_statess      r&   forwardGraniteMoeSharedMLP.forwardN   s^    ))-8 - 3 3A2 3 >(=a(@ADYZ[D\\**=9r%   )r4   r0   r6   r1   r7   )r   r   r   r   r   r   r/   r   TensorrC   r$   __classcell__r:   s   @r&   r)   r)   <   s7    V5 VU\\ ell  r%   r)   c                   `  ^  \ rS rSrS\S\4U 4S jjr      SS\R                  S\R                  S-  S\R                  S-  S	\
S-  S
\S-  S\S-  S\\R                  \R                  4   S-  S\\   S\\R                  \\R                  \R                  4   S-  4   4S jjrSrU =r$ )GraniteMoeSharedDecoderLayerV   r+   	layer_idxc                 t   > [         TU ]  X5        UR                  S:X  a  S U l        g [        U5      U l        g )Nr   )r.   r/   r2   r)   
shared_mlpr9   r+   rK   r:   s      r&   r/   %GraniteMoeSharedDecoderLayer.__init__W   s1    +"("A"AQ"F$L_`fLgr%   Nr<   attention_maskposition_idspast_key_valuesoutput_attentions	use_cacheposition_embeddingskwargsr=   c                 6   Un	U R                  U5      nU R                  " SUUUUUUUS.UD6u  pXU R                  -  -   nUn	U R                  U5      nU R	                  U5      nU R
                  c  UnOXR                  U5      -   nXU R                  -  -   nU$ )N)r<   rP   rQ   rR   rS   rT   rU   r   )input_layernorm	self_attnresidual_multiplierpost_attention_layernormblock_sparse_moerM   )r9   r<   rP   rQ   rR   rS   rT   rU   rV   residual_moe_hidden_statess               r&   rC   $GraniteMoeSharedDecoderLayer.forward[   s     !,,];  >> 	
')%+/ 3	
 	
 !43K3K#KK 55mD 11-@??"-M-0NNM 43K3K#KKr%   )rM   )NNNFFN)r   r   r   r   r   r"   r/   r   rE   r    r   booltupler   r   FloatTensorrC   r$   rF   rG   s   @r&   rI   rI   V   s    h5 h# h /304(,).!&HL%||% t+% &&-	%
 %  $;% $;% #5<<#=>E% 45% 
u  %(9(95;L;L(L"MPT"TT	U% %r%   rI   c                   &    \ rS rSr% \\S'   S/rSrg)GraniteMoeSharedPreTrainedModel   r+   rI   r   N)r   r   r   r   r   r!   _no_split_modulesr$   r   r%   r&   re   re      s    ""78r%   re   c                   0   ^  \ rS rSrS\4U 4S jjrSrU =r$ )GraniteMoeSharedModel   r+   c           	         > [         TU ]  U5        [        R                  " [	        UR
                  5       Vs/ s H  n[        X5      PM     sn5      U l        g s  snf N)r.   r/   r   
ModuleListrangenum_hidden_layersrI   layersrN   s      r&   r/   GraniteMoeSharedModel.__init__   sI     mmNSTZTlTlNmnNm)&<Nmn
ns   A)rp   )r   r   r   r   r   r/   r$   rF   rG   s   @r&   ri   ri      s    
5 
 
r%   ri   c                   8   ^  \ rS rSrSS0rS\4U 4S jjrSrU =r$ )GraniteMoeSharedForCausalLM   zlm_head.weightzmodel.embed_tokens.weightr+   c                 d   > [         TU ]  U5        [        U5      U l        U R	                  5         g rl   )r.   r/   ri   model	post_initr8   s     r&   r/   $GraniteMoeSharedForCausalLM.__init__   s&     *62
r%   )rv   )	r   r   r   r   _tied_weights_keysr   r/   r$   rF   rG   s   @r&   rs   rs      s!    *,GH5  r%   rs   )rs   ri   re   )typingr   r   r   activationsr   cache_utilsr   processing_utilsr   utilsr	   granitemoe.modeling_granitemoer   r   r   r   configuration_granitemoesharedr   
get_loggerr   loggerr   Moduler)   rI   re   ri   rs   __all__r   r%   r&   <module>r      s       !   &   C 
		H	%)5 0")) 4*#9 *Z9&? 9

O 
"7  fr%   