
    Z j	2                        S SK Jr  S SKrS SKJr  SSKJr  SSKJrJ	r	  SSK
Jr  SSKJrJr  SS	KJr  SS
KJr  SSKJrJrJr  SSKJr  SSKJr  SSKJr  SSKJrJr  SSK J!r!  SSK"J#r#J$r$J%r%J&r&J'r'J(r(J)r)J*r*J+r+J,r,  SSK-J.r.  \R^                  " \05      r1 " S S\$5      r2 " S S\5      r3 " S S\5      r4 " S S\'5      r5 " S S\!5      r6 " S S \)5      r7 " S! S"\%5      r8 " S# S$\*5      r9 " S% S&\(5      r: " S' S(\&5      r;/ S)Qr<g)*    )CallableN)nn   )initialization)CacheDynamicCache)create_causal_mask)BaseModelOutputWithPastMoeModelOutputWithPast)ALL_ATTENTION_FUNCTIONS)Unpack)TransformersKwargsauto_docstringlogging)merge_with_config_defaults)capture_outputs   )BambaConfig)
BambaMixerBambaRMSNormGated)Gemma2RotaryEmbedding)
GraniteFlashAttentionKwargsGraniteMoeSharedAttentionGraniteMoeSharedDecoderLayerGraniteMoeSharedForCausalLMGraniteMoeSharedMLPGraniteMoeSharedModelGraniteMoeSharedMoEGraniteMoeSharedPreTrainedModelapply_rotary_pos_embeager_attention_forward   )GraniteMoeHybridConfigc                     ^  \ rS rSrS\S\4U 4S jjr  SS\R                  S\R                  S-  S\	S-  S	\
\R                  \R                  4   S-  S
\\   S\
\R                  \R                  4   4S jjrSrU =r$ )GraniteMoeHybridAttention2   config	layer_idxc                 $   > [         TU ]  X5        g Nsuper__init__selfr'   r(   	__class__s      ڎ/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/granitemoehybrid/modular_granitemoehybrid.pyr-   "GraniteMoeHybridAttention.__init__3   s    +    Nhidden_statesattention_maskpast_key_valuesposition_embeddingskwargsreturnc                    UR                   S S n/ UQSPU R                  P7nU R                  U5      R                  U5      R	                  SS5      nU R                  U5      R                  U5      R	                  SS5      n	U R                  U5      R                  U5      R	                  SS5      n
Ub  Uu  p[        XX5      u  pUb  UR                  XU R                  5      u  p[        R                  " U R                  R                  [        5      nU" U UU	U
U4U R                  (       d  SOU R                   U R"                  S.UD6u  pUR$                  " / UQSP76 R'                  5       nU R)                  U5      nX4$ )Nr"   r   g        )dropoutscaling)shapehead_dimq_projview	transposek_projv_projr    updater(   r   get_interfacer'   _attn_implementationr!   trainingattention_dropoutr=   reshape
contiguouso_proj)r/   r4   r5   r6   r7   r8   input_shapehidden_shapequery_states
key_statesvalue_statescossinattention_interfaceattn_outputattn_weightss                   r1   forward!GraniteMoeHybridAttention.forward6   s    $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST**HC';LVY'_$L&'6'='=jX\XfXf'g$J(?(M(MKK,,.E)
 %8	%
  $}}C$2H2HLL	%
 	%
! "));;;;FFHkk+.((r3    )NN)__name__
__module____qualname____firstlineno__r#   intr-   torchTensorr   tupler   r   rW   __static_attributes____classcell__r0   s   @r1   r%   r%   2   s    ,5 ,# , )-HL')||') t+') 	')
 #5<<#=>E') +,') 
u||U\\)	*') ')r3   r%   c                   4   ^  \ rS rSrS\S\4U 4S jjrSrU =r$ )GraniteMoeHybridMambaLayer`   r'   r(   c                 8   > [         TU ]  [        U5      U5        g r*   )r,   r-   r   r.   s      r1   r-   #GraniteMoeHybridMambaLayer.__init__a   s    V,i8r3   rY   )	rZ   r[   r\   r]   r#   r^   r-   rb   rc   rd   s   @r1   rf   rf   `   s    95 9# 9 9r3   rf   c                   ,   ^  \ rS rSrSU 4S jjrSrU =r$ )GraniteMoeHybridRMSNormGatede   c                 $   > [         TU ]  X5        g r*   r+   )r/   hidden_sizeepsr0   s      r1   r-   %GraniteMoeHybridRMSNormGated.__init__f   s    *r3   rY   )gư>)rZ   r[   r\   r]   r-   rb   rc   rd   s   @r1   rk   rk   e   s    + +r3   rk   c                   0   ^  \ rS rSrS\4U 4S jjrSrU =r$ )GraniteMoeHybridMLPj   r'   c                 $   > [         TU ]  U5        g r*   r+   r/   r'   r0   s     r1   r-   GraniteMoeHybridMLP.__init__k   s     r3   rY   )rZ   r[   r\   r]   r#   r-   rb   rc   rd   s   @r1   rr   rr   j   s    !5 ! !r3   rr   c                       \ rS rSrSrg)GraniteMoeHybridRotaryEmbeddingo   rY   NrZ   r[   r\   r]   rb   rY   r3   r1   rx   rx   o       r3   rx   c                       \ rS rSrSrg)GraniteMoeHybridMoEs   rY   Nrz   rY   r3   r1   r}   r}   s   r{   r3   r}   c                   >  ^  \ rS rSrS\S\4U 4S jjr\    SS\R                  S\R                  S-  S\
S-  S	\S-  S
\\R                  \R                  4   S-  S\\   S\\R                  \\R                  \R                  4   S-  4   4S jj5       rSrU =r$ )GraniteMoeHybridDecoderLayerw   r'   r(   c                 ^  > [         TU ]  X5        [        U5      U l        S U l        S U l        UR                  U   S:X  a  [        X5      U l        O[        X5      U l        UR                  U   U l	        UR                  S:  a  [        U5      OS U l        [        USS5      S:  U l        g )Nmambar   num_local_experts)r,   r-   rr   
shared_mlp	self_attnr   layers_block_typerf   r%   
layer_typer   r}   block_sparse_moegetattrhas_expertsr.   s      r1   r-   %GraniteMoeHybridDecoderLayer.__init__x   s    +-f5
##I.'93FFDJ6vIDN 229= @F?W?WZ[?[ 3F ;ae #6+>BQFr3   Nr4   r5   r6   	use_cacher7   r8   r9   c           	         UnU R                  U5      nU R                  b  U R                  " SUUUS.UD6nOU R                  " SUUUUUS.UD6u  pXqU R                  -  -   nUnU R	                  U5      nU R
                  (       a%  U R                  U5      n	XR                  U5      -   nOU R                  U5      nXqU R                  -  -   nU$ )N)r4   cache_paramsr5   )r4   r5   r6   r   r7   rY   )input_layernormr   r   residual_multiplierpost_attention_layernormr   r   r   )
r/   r4   r5   r6   r   r7   r8   residual_moe_hidden_statess
             r1   rW   $GraniteMoeHybridDecoderLayer.forward   s     !,,];::! JJ +,- 	M  $~~  +- /#$7   M !43K3K#KK 55mD $ 5 5m D-0NNM OOM:M 43K3K#KKr3   )r   r   r   r   r   r   )NNFN)rZ   r[   r\   r]   r#   r^   r-   r   r_   r`   r   boolra   r   r   FloatTensorrW   rb   rc   rd   s   @r1   r   r   w   s    G5 G# G&  /3(,!&HL(||( t+( 	(
 $;( #5<<#=>E( 45( 
u  %(9(95;L;L(L"MPT"TT	U( (r3   r   c                   f   ^  \ rS rSr% \\S'   S/rSr\R                  " 5       U 4S j5       r
SrU =r$ )GraniteMoeHybridPreTrainedModel   r'   r   Tc           
        > [         TU ]  U5        [        U[        5      (       a  [        R
                  " UR                  5        [        R                  " UR                  [        R                  " [        R                  " SUR                  S-   5      5      5        [        R
                  " UR                  5        g [        U[        5      (       a!  [        R
                  " UR                  5        g g )Nr"   )r,   _init_weights
isinstancerf   initones_dt_biascopy_A_logr_   logarange	num_headsDrk   weight)r/   moduler0   s     r1   r   -GraniteMoeHybridPreTrainedModel._init_weights   s    f%f899JJv~~&JJv||UYYu||Av?O?ORS?S/T%UVJJvxx  <==JJv}}% >r3   rY   )rZ   r[   r\   r]   r#   __annotations___no_split_modules_is_statefulr_   no_gradr   rb   rc   rd   s   @r1   r   r      s/    ""78L
]]_& &r3   r   c                     ^  \ rS rSrS\4U 4S jjr\\\      SS\	R                  S-  S\	R                  S-  S\	R                  S-  S\S-  S	\	R                  S-  S
\S-  S\\   S\\-  4S jj5       5       5       rS rSrU =r$ )GraniteMoeHybridModel   r'   c           	      0  > [         TU ]  U5        [        R                  " [	        UR
                  5       Vs/ s H  n[        X5      PM     sn5      U l        UR                  U l        UR                  S:X  a  [        U5      U l        g S U l        g s  snf )Nrope)r,   r-   r   
ModuleListrangenum_hidden_layersr   layersembedding_multiplierposition_embedding_typerx   
rotary_embr.   s      r1   r-   GraniteMoeHybridModel.__init__   s~     mmNSTZTlTlNmnNm)&<Nmn
 %+$?$?!EKEcEcgmEm9&Asw os   BN	input_idsr5   position_idsr6   inputs_embedsr   r8   r9   c           	         US L US L-  (       a  [        S5      eUc  U R                  U5      nXPR                  -  nU(       a  Uc  [        U R                  S9nUcU  Ub  UR                  5       OSn[        R                  " UR                  S   UR                  S9U-   nUR                  S5      n0 n	[        U R                  R                  5       H6  n
SU
;   a  U R                  X$5      X'   M  [        U R                  UUUS9X'   M8     UnS nU R                  b  U R                  X5      n[!        U R"                  5       H,  u  pU" U4XR                  R                  U      UUUS.UD6nM.     U R%                  U5      n['        UUS	9$ )
Nz:You must specify exactly one of input_ids or inputs_embeds)r'   r   r"   )devicer   )r'   r   r5   r6   )r5   r6   r   r7   )last_hidden_stater6   )
ValueErrorembed_tokensr   r   r'   get_seq_lengthr_   r   r>   r   	unsqueezesetr   _update_mamba_maskr	   r   	enumerater   normr   )r/   r   r5   r   r6   r   r   r8   past_seen_tokenscausal_mask_mappingr   r4   r7   idecoder_layers                  r1   rW   GraniteMoeHybridModel.forward   s    -t";<YZZ  --i8M%(A(AA0*$++>OCRC^==?de <<(;(;A(>}G[G[\_ooL'11!4L dkk;;<J*$262I2I.2j#/2D;;"/#1$3	3#/	 = &"??&"&//-"N )$++ 6A)2;;3P3PQR3ST /#$7 M !7 		-0%++
 	
r3   c                     UnUb  UR                  5       (       d!  Ub   [        R                  " US:H  5      (       a  SnU$ )zV
No need for zeroing states when
    1. Cached forward
    2. Attending to all inputs
Nr"   )has_previous_stater_   all)r/   r5   r6   
mamba_masks       r1   r   (GraniteMoeHybridModel._update_mamba_mask  sA     $
'O,N,N,P,P&599^q5H+I+IJr3   )r   r   r   )NNNNNN)rZ   r[   r\   r]   r#   r-   r   r   r   r_   
LongTensorr`   r   r   r   r   r   ra   r
   rW   r   rb   rc   rd   s   @r1   r   r      s    x5 x  .2.204(,26!%:
##d*:
 t+:
 &&-	:

 :
 ((4/:
 $;:
 45:
 
(	(:
    :
x r3   r   c                   D   ^  \ rS rSrSS0rS\4U 4S jjrU 4S jrSrU =r	$ )GraniteMoeHybridForCausalLMi  zlm_head.weightzmodel.embed_tokens.weightr'   c                 d   > [         TU ]  U5        [        U5      U l        U R	                  5         g r*   )r,   r-   r   model	post_initru   s     r1   r-   $GraniteMoeHybridForCausalLM.__init__   s&     *62
r3   c                 $   > [         TU ]  " S0 UD6$ )a  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

Example:

```python
>>> from transformers import AutoTokenizer, GraniteMoeHybridForCausalLM

>>> model = GraniteMoeHybridForCausalLM.from_pretrained("ibm-granite/granite-4.0-h-tiny")
>>> tokenizer = AutoTokenizer.from_pretrained("ibm-granite/granite-4.0-h-tiny")

>>> prompt = "Hey, are you conscious? Can you talk to me?"
>>> inputs = tokenizer(prompt, return_tensors="pt")

>>> # Generate
>>> generate_ids = model.generate(inputs.input_ids, max_length=30)
>>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
"Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
```rY   )r,   rW   )r/   super_kwargsr0   s     r1   rW   #GraniteMoeHybridForCausalLM.forward&  s    . w...r3   )r   )
rZ   r[   r\   r]   _tied_weights_keysr#   r-   rW   rb   rc   rd   s   @r1   r   r     s&    *,GH5 / /r3   r   )r   r   r   )=collections.abcr   r_   r    r   r   cache_utilsr   r   masking_utilsr	   modeling_outputsr
   r   modeling_utilsr   processing_utilsr   utilsr   r   r   utils.genericr   utils.output_capturingr   bamba.configuration_bambar   bamba.modeling_bambar   r   gemma2.modeling_gemma2r   *granitemoeshared.modeling_granitemoesharedr   r   r   r   r   r   r   r   r    r!   configuration_granitemoehybridr#   
get_loggerrZ   loggerr%   rf   rk   rr   rx   r}   r   r   r   r   __all__rY   r3   r1   <module>r      s    %   & . / O 5 & @ @ 7 5 3 @ :   C 
		H	%+) 9 +)\9 9
+#4 +
!- !
	&; 		- 	=#? =@&&E & S1 Sl /"=  /F fr3   