
    Z j*                     D   S r SSKJr  SSKrSSKJr  SSKJr  SSKJ	r	J
r
  SSKJr  SS	KJr  SS
KJr  SSKJrJr  SSKJr  SSKJrJrJr  SSKJr  SSKJr  SSKJrJ r J!r!J"r"J#r#J$r$  SSK%J&r&J'r'J(r(  SSK)J*r*  SSK+J,r,  \RZ                  " \.5      r/ " S S\!5      r0 " S S\"5      r1 " S S\5      r2 " S S\5      r3 " S S\&5      r4 " S  S!\*5      r5 " S" S#\Rl                  5      r7 " S$ S%\ 5      r8\ " S& S'\5      5       r9\ " S( S)\(5      5       r: " S* S+\'\5      r;/ S,Qr<g)-zPyTorch OLMoE model.    )CallableN)nn   )initialization)CacheDynamicCache)GenerationMixin)create_causal_mask)MoeModelOutputWithPast)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringlogging)OutputRecorder   )GemmaMLP)LlamaAttentionLlamaDecoderLayerLlamaRMSNormLlamaRotaryEmbeddingapply_rotary_pos_embeager_attention_forward)MixtralExpertsMixtralForCausalLMMixtralModel)Qwen2MoeTopKRouter   )OlmoeConfigc                   ,   ^  \ rS rSrSU 4S jjrSrU =r$ )OlmoeRMSNorm-   c                 $   > [         TU ]  X5        g N)super__init__)selfhidden_sizeeps	__class__s      x/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/olmoe/modular_olmoe.pyr'   OlmoeRMSNorm.__init__.   s    *     )gh㈵>)__name__
__module____qualname____firstlineno__r'   __static_attributes____classcell__r+   s   @r,   r"   r"   -   s    + +r.   r"   c                       \ rS rSrSrg)OlmoeRotaryEmbedding2   r/   Nr0   r1   r2   r3   r4   r/   r.   r,   r8   r8   2       r.   r8   c                       \ rS rSrSrg)OlmoeMLP6   r/   Nr:   r/   r.   r,   r=   r=   6   r;   r.   r=   c                   ,  ^  \ rS rSrSS\S\S-  4U 4S jjjr SS\R                  S\	\R                  \R                  4   S\R                  S-  S	\
S-  S
\\   S\	\R                  \R                  S-  \	\R                     S-  4   4S jjrSrU =r$ )OlmoeAttention:   Nconfig	layer_idxc                    > [         TU ]  X5        [        UR                  UR                  S9U l        [        UR                  UR                  -  UR                  -  UR                  S9U l        g )Nr*   )	r&   r'   r"   r)   rms_norm_epsq_normnum_attention_headsnum_key_value_headsk_normr(   rB   rC   r+   s      r,   r'   OlmoeAttention.__init__;   s^    +"6#5#56;N;NO"6#=#==A[A[[agatat
r.   hidden_statesposition_embeddingsattention_maskpast_key_valueskwargsreturnc           
         UR                   S S n/ UQSPU R                  P7nU R                  U R                  U5      5      nU R	                  U R                  U5      5      n	U R                  U5      n
U R                  R                  b  UR                  U R                  R                  * U R                  R                  S9  U	R                  U R                  R                  * U R                  R                  S9  U
R                  U R                  R                  * U R                  R                  S9  UR                  " U6 R                  SS5      nU	R                  " U6 R                  SS5      n	U
R                  " U6 R                  SS5      n
Uu  p[        XX5      u  pUb  UR                  XU R                  5      u  p[        R                   " U R                  R"                  [$        5      nU" U UU	U
U4U R&                  (       d  SOU R(                  U R*                  [-        U R                  SS 5      S.UD6u  pUR.                  " / UQSP76 R1                  5       nU R3                  U5      nX4$ )N)minmaxr   r           sliding_window)dropoutscalingrX   )shapehead_dimrG   q_projrJ   k_projv_projrB   clip_qkvclamp_view	transposer   updaterC   r   get_interface_attn_implementationr   trainingattention_dropoutrZ   getattrreshape
contiguouso_proj)r(   rM   rN   rO   rP   rQ   input_shapehidden_shapequery_states
key_statesvalue_statescossinattention_interfaceattn_outputattn_weightss                   r,   forwardOlmoeAttention.forwardB   s9    $))#2.88b8$--8{{4;;}#=>[[]!;<
{{=1;;+T[[%9%9$9t{{?S?ST4;;#7#7"7T[[=Q=QRT[[%9%9$9t{{?S?ST#((,7AA!QG__l3==aC
#((,7AA!QG&#7RU#[ &'6'='=jX\XfXf'g$J(?(M(MKK,,.E)
 %8
%
  $}}C$2H2HLL"4;;0@$G
%
 
%
! "));;;;FFHkk+.((r.   )rJ   rG   r%   )r0   r1   r2   r3   r    intr'   torchTensortupler   r   r   rw   r4   r5   r6   s   @r,   r@   r@   :   s    
{ 
sTz 
 
 )-/)||/) #5<<#=>/) t+	/)
 /) +,/) 
u||U\\D0%2E2LL	M/) /)r.   r@   c                       \ rS rSrSrg)OlmoeExpertst   r/   Nr:   r/   r.   r,   r~   r~   t   r;   r.   r~   c                       \ rS rSrSrg)OlmoeTopKRouterx   r/   Nr:   r/   r.   r,   r   r   x   r;   r.   r   c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )OlmoeSparseMoeBlock|   c                 b   > [         TU ]  5         [        U5      U l        [	        U5      U l        g r%   )r&   r'   r   gater~   expertsr(   rB   r+   s     r,   r'   OlmoeSparseMoeBlock.__init__}   s&    #F+	#F+r.   rM   rR   c                     UR                   u  p#nUR                  SU5      nU R                  U5      u  pVnU R                  XU5      R	                  X#U5      nU$ )NrT   )r[   rb   r   r   rj   )	r(   rM   
batch_sizesequence_length
hidden_dim_top_k_weightstop_k_indexfinal_hidden_statess	            r,   rw   OlmoeSparseMoeBlock.forward   s`    2?2E2E/
Z%**2z:(,		-(@%+"ll=}U]]
 #"r.   )r   r   )
r0   r1   r2   r3   r'   rz   r{   rw   r4   r5   r6   s   @r,   r   r   |   s(    ,
#U\\ #ell # #r.   r   c                   4   ^  \ rS rSrS\S\4U 4S jjrSrU =r$ )OlmoeDecoderLayer   rB   rC   c                   > [         TU ]  X5        UR                  U l        [        XS9U l        [        U5      U l        [        UR                  UR                  S9U l	        [        UR                  UR                  S9U l
        g )N)rB   rC   rE   )r&   r'   r)   r@   	self_attnr   mlpr"   rF   input_layernormpost_attention_layernormrK   s      r,   r'   OlmoeDecoderLayer.__init__   sl    +!--'vK&v.+F,>,>FDWDWX(4V5G5GVM`M`(a%r.   )r)   r   r   r   r   )	r0   r1   r2   r3   r    ry   r'   r4   r5   r6   s   @r,   r   r      s    b{ bs b br.   r   c                       \ rS rSr% \\S'   SrSrS/rS/r	Sr
Sr\" \SS9\\S	.rSr\R&                  " 5       S
 5       rSrg)OlmoePreTrainedModel   rB   modelTr   rP   r   )index)router_logitsrM   
attentionsc                    [         R                  " X5        [        U[        5      (       ai  [        R
                  " UR                  SU R                  R                  S9  [        R
                  " UR                  SU R                  R                  S9  g [        U[        5      (       a5  [        R
                  " UR                  SU R                  R                  S9  g g )NrW   )meanstd)r   _init_weights
isinstancer~   initnormal_gate_up_projrB   initializer_range	down_projr   weight)r(   modules     r,   r   "OlmoePreTrainedModel._init_weights   s    %%d3fl++LL,,3DKK<Y<YZLL))9V9VW00LLSdkk6S6ST 1r.   r/   N)r0   r1   r2   r3   r    __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpar   r   r   r@   _can_record_outputs_supports_attention_backendrz   no_gradr   r4   r/   r.   r,   r   r      sk    &*#,-#4"5N'qA*$ #'
]]_U Ur.   r   c                      ^  \ rS rSrS\4U 4S jjr      SS\R                  S-  S\R                  S-  S\R                  S-  S\	S-  S	\R                  S-  S
\S-  S\\   S\4S jjrSrU =r$ )
OlmoeModel   rB   c           	        > [         TU ]  U5        [        R                  " UR                  UR
                  U R                  5      U l        [        R                  " [        UR                  5       Vs/ s H  n[        X5      PM     sn5      U l        [        UR
                  UR                  S9U l        [!        US9U l        g s  snf )NrE   rB   )r&   r'   r   	Embedding
vocab_sizer)   padding_idxembed_tokens
ModuleListrangenum_hidden_layersr   layersr"   rF   normr8   
rotary_embrK   s      r,   r'   OlmoeModel.__init__   s     LL):):F<N<NPTP`P`ammCHIaIaCbcCbiv1Cbc
 !!3!39L9LM	.f= ds   3CN	input_idsrO   position_idsrP   inputs_embeds	use_cacherQ   rR   c           
      B   US L US L-  (       a  [        S5      eU(       a  Uc  [        U R                  S9nUc  U R                  U5      nUcU  Ub  UR	                  5       OSn[
        R                  " UR                  S   UR                  S9U-   nUR                  S5      n[        U R                  UUUUS9n	Un
U R                  X5      nU R                  S U R                  R                    H  nU" U
4UU	UUUS.UD6n
M     U R                  U
5      n
[        U
US9$ )	Nz:You must specify exactly one of input_ids or inputs_embedsr   r   r   )device)rB   r   rO   rP   r   )rN   rO   r   rP   r   )last_hidden_staterP   )
ValueErrorr   rB   r   get_seq_lengthrz   aranger[   r   	unsqueezer
   r   r   r   r   r   )r(   r   rO   r   rP   r   r   rQ   past_seen_tokenscausal_maskrM   rN   decoder_layers                r,   rw   OlmoeModel.forward   sF    -t";<YZZ0*$++>O  --i8MCRC^==?de <<(;(;A(>}G[G[\_ooL'11!4L(;;')+%
 & #oomJ![[)H4;;+H+HIM)$7*) /# M J 		-0%++
 	
r.   )r   r   r   r   )NNNNNN)r0   r1   r2   r3   r    r'   rz   
LongTensorr{   r   FloatTensorboolr   r   r   rw   r4   r5   r6   s   @r,   r   r      s    >{ > .2.204(,26!%5
##d*5
 t+5
 &&-	5

 5
 ((4/5
 $;5
 +,5
 
 5
 5
r.   r   c                   <   ^  \ rS rSrSS0rU 4S jrU 4S jrSrU =r$ )OlmoeForCausalLM   zlm_head.weightzmodel.embed_tokens.weightc                 f   > [         TU ]  U5        [        U5      U l        UR                  U l        g r%   )r&   r'   r   r   num_expertsr   s     r,   r'   OlmoeForCausalLM.__init__   s*     '
!--r.   c                 $   > [         TU ]  " S0 UD6$ )u  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

Example:

```python
>>> from transformers import AutoTokenizer, OlmoeForCausalLM

>>> model = OlmoeForCausalLM.from_pretrained("allenai/OLMoE-1B-7B-0924")
>>> tokenizer = AutoTokenizer.from_pretrained("allenai/OLMoE-1B-7B-0924")

>>> prompt = "Hey, are you conscious? Can you talk to me?"
>>> inputs = tokenizer(prompt, return_tensors="pt")

>>> # Generate
>>> generate_ids = model.generate(inputs.input_ids, max_length=30)
>>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
'Hey, are you conscious? Can you talk to me?\nI’m not sure if you’re conscious of this, but I’m'
```
r/   )r&   rw   )r(   super_kwargsr+   s     r,   rw   OlmoeForCausalLM.forward   s    0 w...r.   )r   r   )	r0   r1   r2   r3   _tied_weights_keysr'   rw   r4   r5   r6   s   @r,   r   r      s    *,GH.
/ /r.   r   )r   r   r   )=__doc__collections.abcr   rz   r    r   r   cache_utilsr   r   
generationr	   masking_utilsr
   modeling_outputsr   modeling_utilsr   r   processing_utilsr   utilsr   r   r   utils.output_capturingr   gemma.modeling_gemmar   llama.modeling_llamar   r   r   r   r   r   mixtral.modeling_mixtralr   r   r   qwen2_moe.modeling_qwen2_moer   configuration_olmoer    
get_loggerr0   loggerr"   r8   r=   r@   r~   r   Moduler   r   r   r   r   __all__r/   r.   r,   <module>r      s    $   & . ) / 6 F & @ @ 4 +  X W = , 
		H	%+< +
	/ 		x 	7)^ 7)t	> 		( 	#")) # b) b U? U U4 ?
 ?
 ?
D /)?  /F Er.   