
    Z jR2                        S SK r S SK Jr  SSKJr  SSKJr  SSKJrJ	r	  SSK
Jr  SSKJrJr  SS	KJr  SS
KJr  SSKJrJr  SSKJrJr  SSKJr  SSKJrJr  SSKJrJ r   SSK!J"r"J#r#  SSK$J%r%J&r&J'r'J(r(  SSK)J*r*   " S S\5      r+ " S S\5      r, " S S\5      r- " S S\ 5      r. " S S\R^                  5      r0 " S S \"5      r1 " S! S"\%5      r2\ " S# S$\#\5      5       r3\ " S% S&\'5      5       r4 " S' S(\&5      r5/ S)Qr6g)*    N)nn   )initialization)ACT2FN)CacheDynamicCache)create_causal_mask)MoeCausalLMOutputWithPastMoeModelOutputWithPast)PreTrainedModel)Unpack)TransformersKwargsauto_docstring)can_return_tuplemerge_with_config_defaults)capture_outputs   )GraniteRMSNormGraniteRotaryEmbedding)JetMoeParallelExpertsJetMoeTopKGating)LlamaAttentionLlamaPreTrainedModel)MixtralDecoderLayerMixtralForCausalLMMixtralModelload_balancing_loss_func   )GraniteMoeConfigc                       \ rS rSrSrg)GraniteMoeRMSNorm$    N__name__
__module____qualname____firstlineno____static_attributes__r#       ڂ/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/granitemoe/modular_granitemoe.pyr!   r!   $       r*   r!   c                       \ rS rSrSrg)GraniteMoeRotaryEmbedding(   r#   Nr$   r#   r*   r+   r.   r.   (   r,   r*   r.   c                       \ rS rSrSrg)GraniteMoeParallelExperts,   r#   Nr$   r#   r*   r+   r1   r1   ,   r,   r*   r1   c                       \ rS rSrSrg)GraniteMoeTopKGating0   r#   Nr$   r#   r*   r+   r4   r4   0   r,   r*   r4   c                   :   ^  \ rS rSrSrS\4U 4S jjrS rSrU =r	$ )GraniteMoeMoE4   z
A Sparsely gated mixture of experts layer with 1-layer Feed-Forward networks as experts.

Args:
    config:
        Configuration object with model hyperparameters.
configc                   > [         TU ]  5         UR                  U l        UR                  U l        [
        UR                     U l        [        UR                  U R                  U R                  S-  5      U l
        [        UR                  U R                  U R                  5      U l        [        U R                  UR                  UR                  S9U l        g )Nr   )
input_sizenum_expertstop_k)super__init__hidden_sizer;   intermediate_sizer   
hidden_act
activationr1   num_local_expertsinput_linearoutput_linearr4   num_experts_per_tokrouterselfr9   	__class__s     r+   r?   GraniteMoeMoE.__init__=   s     ,,!33 !2!235f6N6NPTP_P_aeaqaqtuauv6v7O7OQUQaQacgcrcrs*00,,
r*   c                    UR                  5       u  p#nUR                  SU5      nU R                  U5      u  pVpxnX   n	U R                  X5      n
U
R	                  SSS9nU R                  US   5      US   -  n
U R                  X5      nXS S 2S 4   -  n[        R                  " X#-  U R                  4UR                  UR                  S9nUR                  SXl5      nUR                  X#U R                  5      nU$ )Nr   )dimr   r   )dtypedevice)sizereshaperH   rE   chunkrC   rF   torchzerosr;   rP   rQ   	index_addview)rJ   layer_inputbszlengthemb_size_batch_indexbatch_gatesexpert_sizeexpert_inputshidden_stateschunked_hidden_statesexpert_outputsrV   layer_outputs                  r+   forwardGraniteMoeMoE.forwardL   s    + 0 0 2X!))"h76:kk+6N3!#0))-E - 3 3A2 3 >(=a(@ADYZ[D\\++MG'ag*>>S\4??;>CWCW`n`u`uvq+F#((dooFr*   )rC   r@   rE   r;   rF   rH   )
r%   r&   r'   r(   __doc__r   r?   rf   r)   __classcell__rK   s   @r+   r7   r7   4   s    
/ 
 r*   r7   c                   4   ^  \ rS rSrS\S\4U 4S jjrSrU =r$ )GraniteMoeAttention_   r9   	layer_idxc                 H   > [         TU ]  XU5        UR                  U l        g N)r>   r?   attention_multiplierscalingrJ   r9   rn   rK   s      r+   r?   GraniteMoeAttention.__init__`   s    y122r*   )rr   )	r%   r&   r'   r(   r   intr?   r)   ri   rj   s   @r+   rl   rl   _   s    3/ 3C 3 3r*   rl   c                      ^  \ rS rSrS\S\4U 4S jjr   SS\R                  S\R                  S-  S\	S-  S	\
\R                  \R                  4   S-  S
\R                  4
S jjrSrU =r$ )GraniteMoeDecoderLayere   r9   rn   c                 2  > [         TU ]  X5        [        XS9U l        [	        U5      U l        [        UR                  UR                  S9U l	        [        UR                  UR                  S9U l
        U ?[	        U5      U l        UR                  U l        g )N)r9   rn   eps)r>   r?   rl   	self_attnr7   block_sparse_moer!   r@   rms_norm_epsinput_layernormpost_attention_layernormmlpresidual_multiplierrs   s      r+   r?   GraniteMoeDecoderLayer.__init__f   s    +,FP -f 501C1CI\I\](9&:L:LRXReRe(f%H -f 5#)#=#= r*   Nrb   attention_maskpast_key_valuesposition_embeddingsreturnc                     UnU R                  U5      nU R                  " SUUUUS.UD6u  pXaU R                  -  -   nUnU R                  U5      nU R	                  U5      nXaU R                  -  -   nU$ )N)rb   r   r   r   r#   )r   r|   r   r   r}   )rJ   rb   r   r   r   kwargsresidualr]   s           r+   rf   GraniteMoeDecoderLayer.forwardp   s     !,,];>> 
')+ 3	

 
 !43K3K#KK 55mD--m< 43K3K#KKr*   )r}   r   r   r   r|   )NNN)r%   r&   r'   r(   r   ru   r?   rU   Tensorr   tuplerf   r)   ri   rj   s   @r+   rw   rw   e   s    >/ >C > /3(,HL|| t+ 	
 #5<<#=>E 
 r*   rw   c                   n    \ rS rSr% \\S'   SrSrS/rS/r	Sr
SrSr\R                  " 5       S 5       rS	rg
)GraniteMoePreTrainedModel   r9   modelTrw   r   Fc                     [         R                  " X5        [        U[        5      (       a5  [        R
                  " UR                  SU R                  R                  S9  g g )Ng        )meanstd)	r   _init_weights
isinstancer1   initnormal_weightr9   initializer_range)rJ   modules     r+   r   'GraniteMoePreTrainedModel._init_weights   sA    %%d3f788LLSdkk6S6ST 9r*   r#   N)r%   r&   r'   r(   r   __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_can_compile_fullgraphrU   no_gradr   r)   r#   r*   r+   r   r      sQ    &*#12#4"5N"
]]_U Ur*   r   c                     ^  \ rS rSrS\4U 4S jjr\\\      SS\	R                  S-  S\	R                  S-  S\	R                  S-  S\S-  S	\	R                  S-  S
\S-  S\\   S\4S jj5       5       5       rSrU =r$ )GraniteMoeModel   r9   c           	      &  > [         TU ]  U5        [        R                  " [	        UR
                  5       Vs/ s H  n[        X5      PM     sn5      U l        [        UR                  UR                  S9U l        UR                  U l        g s  snf )Nrz   )r>   r?   r   
ModuleListrangenum_hidden_layersrw   layersr!   r@   r~   normembedding_multiplierrs   s      r+   r?   GraniteMoeModel.__init__   su     mmHMfNfNfHghHg9#F6Hgh
 &f&8&8f>Q>QR	$*$?$?! is   BN	input_idsr   position_idsr   inputs_embeds	use_cacher   r   c           
      ^   US L US L-  (       a  [        S5      eU(       a  Uc  [        U R                  S9nUc  U R                  U5      nUcU  Ub  UR	                  5       OSn[
        R                  " UR                  S   UR                  S9U-   nUR                  S5      n[        U R                  UUUUS9n	XPR                  -  nUn
U R                  X5      nU R                  S U R                  R                    H  nU" U
4UU	UUUS.UD6n
M     U R                  U
5      n
[!        U
US9$ )	Nz:You must specify exactly one of input_ids or inputs_embeds)r9   r   r   )rQ   )r9   r   r   r   r   )r   r   r   r   r   )last_hidden_stater   )
ValueErrorr   r9   embed_tokensget_seq_lengthrU   arangeshaperQ   	unsqueezer	   r   
rotary_embr   r   r   r   )rJ   r   r   r   r   r   r   r   past_seen_tokenscausal_maskrb   r   decoder_layers                r+   rf   GraniteMoeModel.forward   sT    -t";<YZZ0*$++>O  --i8MCRC^==?de <<(;(;A(>}G[G[\_ooL'11!4L(;;')+%
 &(A(AA% #oomJ![[)H4;;+H+HIM)$7*) /# M J 		-0%++
 	
r*   )r   r   r   )NNNNNN)r%   r&   r'   r(   r   r?   r   r   r   rU   
LongTensorr   r   FloatTensorboolr   r   r   rf   r)   ri   rj   s   @r+   r   r      s    @/ @   .2.204(,26!%5
##d*5
 t+5
 &&-	5

 5
 ((4/5
 $;5
 +,5
 
 5
    5
r*   r   c                   4  ^  \ rS rSrS\4U 4S jjr\\        SS\R                  S-  S\R                  S-  S\R                  S-  S\S-  S	\R                  S-  S
\R                  S-  S\S-  S\\R                  -  S\\-  4S jj5       5       rSrU =r$ )GraniteMoeForCausalLM   r9   c                 f   > [         TU ]  U5        [        U5      U l        UR                  U l        g rp   )r>   r?   r   r   logits_scalingrI   s     r+   r?   GraniteMoeForCausalLM.__init__   s*     $V,
$33r*   Nr   r   r   r   r   labelsoutput_router_logitslogits_to_keepr   c	           
         Ub  UOU R                   R                  nU R                  " SUUUUUS.U	D6n
U
R                  n[	        U[
        5      (       a  [        U* S5      OUnU R                  USS2USS24   5      nXR                   R                  -  nSnUb*  U R                  " UU4SU R                   R                  0U	D6nSnU(       aY  [        U
R                  U R                  U R                  U5      nUb*  XR                  UR!                  UR"                  5      -  -  n[%        UUUU
R&                  U
R(                  U
R*                  U
R                  S9$ )a  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

Example:

```python
>>> from transformers import AutoTokenizer, GraniteMoeForCausalLM

>>> model = GraniteMoeForCausalLM.from_pretrained("ibm/PowerMoE-3b")
>>> tokenizer = AutoTokenizer.from_pretrained("ibm/PowerMoE-3b")

>>> prompt = "Hey, are you conscious? Can you talk to me?"
>>> inputs = tokenizer(prompt, return_tensors="pt")

>>> # Generate
>>> generate_ids = model.generate(inputs.input_ids, max_length=30)
>>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
"Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
```N)r   r   r   r   r   
vocab_size)lossaux_losslogitsr   rb   
attentionsrouter_logitsr#   )r9   r   r   r   r   ru   slicelm_headr   loss_functionr   r   r   r<   rG   router_aux_loss_coeftorQ   r
   r   rb   r   )rJ   r   r   r   r   r   r   r   r   r   outputsrb   slice_indicesr   r   r   s                   r+   rf   GraniteMoeForCausalLM.forward   sw   J %9$D $++JjJj 	 ** 
)%+'
 
  118B>SV8W8W~ot4]kmA}a,?@A++444%%  ;;11 	D /%%  ((	H !11HKK4LLL(#33!//))!//
 	
r*   )r   r   )NNNNNNNr   )r%   r&   r'   r(   r   r?   r   r   rU   r   r   r   r   r   ru   r   r
   rf   r)   ri   rj   s   @r+   r   r      s    4/ 4
  .2.204(,26*.,0-.Q
##d*Q
 t+Q
 &&-	Q

 Q
 ((4/Q
   4'Q
 #TkQ
 ell*Q
 
*	*Q
  Q
r*   r   )r   r   r   )7rU   r    r   r   activationsr   cache_utilsr   r   masking_utilsr	   modeling_outputsr
   r   modeling_utilsr   processing_utilsr   utilsr   r   utils.genericr   r   utils.output_capturingr   granite.modeling_graniter   r   jetmoe.modeling_jetmoer   r   llama.modeling_llamar   r   mixtral.modeling_mixtralr   r   r   r   configuration_granitemoer   r!   r.   r1   r4   Moduler7   rl   rw   r   r   r   __all__r#   r*   r+   <module>r      s       & ! . / Q - & 7 I 5 M L G v v 6	 		 6 		 5 		+ 	(BII (V3. 3!0 !H U 4o U U" A
l A
 A
HY
. Y
x Tr*   