
    Z j                        S SK Jr  S SKJrJr  S SKrS SKJr  S SKJr	  SSK
Jr  SSKJr  SS	KJrJr  SS
KJr  SSKJrJrJr  SSKJr  SSKJr  SSKJr  SSKJrJ r J!r!  SSK"J#r#J$r$  SSK%J&r&J'r'  SSK(J)r)  SSK*J+r+J,r,J-r-J.r.J/r/  SSK0J1r1J2r2  SSK3J4r4  SSK5J6r6  SSK7J8r8  \/Rr                  " \:5      r;S r<\" S5      SRS j5       r=S\R|                  S\?S\R|                  4S jr@ SSS \R                  S!\R|                  S"\R|                  S#\R|                  S$\R|                  S-  S%\BS&\BS'\)\+   4S( jjrC\" \=5       " S) S*\R                  5      5       rDS+\R|                  S,\?4S- jrES. rFS/ rGS0 rH " S1 S2\R                  5      rI " S3 S4\R                  R                  5      rJ " S5 S6\R                  5      rK " S7 S8\R                  5      rL " S9 S:\R                  5      rM " S; S<\R                  5      rN " S= S>\R                  5      rO " S? S@\SASB9rP\" SC5       " SD SE\R                  5      5       rQ " SF SG\5      rR\, " SH SI\'5      5       rS\, " SJ SK\S5      5       rT   STSL\R|                  \U\R|                     -  S-  SM\?S-  S$\R|                  S-  S\R|                  \?-  4SN jjrV\, " SO SP\S\5      5       rW/ SQQrXg)U    )Callable)Optional	TypedDictN)nn)
functional   )initialization)ACT2FN)CacheDynamicCache)GenerationMixin)use_kernel_forward_from_hubuse_kernel_func_from_hubuse_kernelized_func)lazy_load_kernel)create_causal_mask)GradientCheckpointingLayer)BaseModelOutputWithPastMoeCausalLMOutputWithPastMoeModelOutputWithPast)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tupleis_torchdynamo_compilinglogging)maybe_autocastmerge_with_config_defaults)resolve_internal_import)capture_outputs   )GraniteMoeHybridConfigc                     U SSU R                   S   S-  24   nU SU R                   S   S-  S24   n[        R                  " U* U4SS9$ )z*Rotates half the hidden dims of the input..N   dim)shapetorchcat)xx1x2s      ڏ/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/granitemoehybrid/modeling_granitemoehybrid.pyrotate_halfr3   2   sZ    	
3"!''"+"""	#B	
3q ""	#B99rc2YB''    rotary_pos_embc                     UR                  U5      nUR                  U5      nX-  [        U 5      U-  -   nX-  [        U5      U-  -   nXV4$ )aI  Applies Rotary Position Embedding to the query and key tensors.

Args:
    q (`torch.Tensor`): The query tensor.
    k (`torch.Tensor`): The key tensor.
    cos (`torch.Tensor`): The cosine part of the rotary embedding.
    sin (`torch.Tensor`): The sine part of the rotary embedding.
    unsqueeze_dim (`int`, *optional*, defaults to 1):
        The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
        sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
        that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
        k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
        cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
        the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
Returns:
    `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
)	unsqueezer3   )qkcossinunsqueeze_dimq_embedk_embeds          r2   apply_rotary_pos_embr?   9   sS    & --
&C
--
&Cw;q>C/0Gw;q>C/0Gr4   hidden_statesn_repreturnc                     U R                   u  p#pEUS:X  a  U $ U SS2SS2SSS2SS24   R                  X#XU5      n U R                  X#U-  XE5      $ )z
This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
r%   N)r,   expandreshape)r@   rA   batchnum_key_value_headsslenhead_dims         r2   	repeat_kvrJ   S   s_    
 2?1D1D.Ez!!Qa"23::5W\dlmM  e(CTTTr4   modulequerykeyvalueattention_maskscalingdropoutkwargsc                    [        X R                  5      n[        X0R                  5      n	[        R                  " XR	                  SS5      5      U-  n
Ub  X-   n
[
        R                  R                  U
S[        R                  S9R                  UR                  5      n
[
        R                  R                  XU R                  S9n
[        R                  " X5      nUR	                  SS5      R                  5       nX4$ )Nr)   r   r(   )r+   dtype)ptrainingr%   )rJ   num_key_value_groupsr-   matmul	transposer   r   softmaxfloat32torT   rQ   rV   
contiguous)rK   rL   rM   rN   rO   rP   rQ   rR   
key_statesvalue_statesattn_weightsattn_outputs               r2   eager_attention_forwardrb   _   s     3 ; ;<JU$?$?@L<<';';Aq'ABWLL!#4==((2U]](SVVW\WbWbcL==((6??([L,,|:K''1-88:K$$r4   c                     ^  \ rS rSrSrS\S\4U 4S jjr  SS\R                  S\R                  S-  S	\
S-  S
\\R                  \R                  4   S-  S\\   S\\R                  \R                  4   4S jjrSrU =r$ )GraniteMoeHybridAttentionx   z=Multi-headed attention from 'Attention Is All You Need' paperconfig	layer_idxc                 J  > [         TU ]  5         Xl        X l        [	        USUR
                  UR                  -  5      U l        UR                  UR                  -  U l	        UR                  U l        UR                  U l        SU l        [        R                  " UR
                  UR                  U R                  -  UR                   S9U l        [        R                  " UR
                  UR                  U R                  -  UR                   S9U l        [        R                  " UR
                  UR                  U R                  -  UR                   S9U l        [        R                  " UR                  U R                  -  UR
                  UR                   S9U l        g )NrI   Tbias)super__init__rf   rg   getattrhidden_sizenum_attention_headsrI   rG   rW   attention_multiplierrP   attention_dropout	is_causalr   Linearattention_biasq_projk_projv_projo_projselfrf   rg   	__class__s      r2   rl   "GraniteMoeHybridAttention.__init__|   sF   "
F4F4F&JdJd4de$*$>$>&B\B\$\!22!'!9!9ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii&&68J8JQWQfQf
r4   Nr@   rO   past_key_valuesposition_embeddingsrR   rB   c                    UR                   S S n/ UQSPU R                  P7nU R                  U5      R                  U5      R	                  SS5      nU R                  U5      R                  U5      R	                  SS5      n	U R                  U5      R                  U5      R	                  SS5      n
Ub  Uu  p[        XX5      u  pUb  UR                  XU R                  5      u  p[        R                  " U R                  R                  [        5      nU" U UU	U
U4U R                  (       d  SOU R                   U R"                  S.UD6u  pUR$                  " / UQSP76 R'                  5       nU R)                  U5      nX4$ )Nr(   r%   r)           )rQ   rP   )r,   rI   ru   viewrY   rv   rw   r?   updaterg   r   get_interfacerf   _attn_implementationrb   rV   rq   rP   rE   r]   rx   )rz   r@   rO   r}   r~   rR   input_shapehidden_shapequery_statesr^   r_   r:   r;   attention_interfacera   r`   s                   r2   forward!GraniteMoeHybridAttention.forward   s    $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST**HC';LVY'_$L&'6'='=jX\XfXf'g$J(?(M(MKK,,.E)
 %8	%
  $}}C$2H2HLL	%
 	%
! "));;;;FFHkk+.((r4   )rq   rf   rI   rr   rv   rg   rW   rx   ru   rP   rw   NN)__name__
__module____qualname____firstlineno____doc__r&   intrl   r-   Tensorr   tupler   r   r   __static_attributes____classcell__r{   s   @r2   rd   rd   x   s    G
5 
# 
6 )-HL')||') t+') 	')
 #5<<#=>E') +,') 
u||U\\)	*') ')r4   rd   input_tensorpad_sizec                     [        U R                  5      S:X  a
  SSSSSUSS4OSSSUSS4n[        R                  R                  R                  XSSS9$ )zv
Padding x tensor with `pad_size` on the seq_len dim (dim=1)

Assumes that we only have tensors of either size 4 or 3
   r   constant)moderN   )lenr,   r-   r   r   pad)r   r   	pad_shapes      r2   pad_tensor_by_sizer      sd     47|7I7I3Ja3OAq!Q!Q/VWYZ\]_gijlmUnI88""<ST"UUr4   c                    [        X5      n [        U R                  5      S:X  a-  U R                  U R                  S   SX R                  S   5      $ U R                  U R                  S   SX R                  S   U R                  S   5      $ )z
Padding input_tensor with `pad_size` on the seq_len dim (dim=1) and
simultaneously splitting it into chunk sequences.

Assumes that we only have tensors of either size 4 or 3
r   r   r(   r)   )r   r   r,   rE   )r   r   
chunk_sizes      r2   reshape_into_chunksr      s     &l=L
<!###L$6$6q$92zK]K]^_K`aa ##q!2z3E3Ea3H,J\J\]^J_
 	
r4   c           	      
   U R                  S5      nU S   R                  " / U R                  5       QUP76 n [        R                  " [        R                  " XU R
                  [        R                  S9SS9nU R                  U) S5      n [        R                  " U SS9n[        R                  " [        R                  " XU R
                  [        R                  S9SS9nUR                  U) [        R                  * 5      nU$ )zg
More stable segment sum calculation. Uses cumulative sums and masking instead of direct subtractions.
r(   .NdevicerT   )diagonalr   r*   )
sizerD   r-   trilonesr   boolmasked_fillcumsuminf)r   r   masktensor_segsums       r2   segment_sumr      s     ""2&J  	*11S<3D3D3FS
SL::ejj@S@S[`[e[efqstD++TE15LLL26M ::ejj@S@S[`[e[efqrsD!--teeiiZ@Mr4   c                     UbO  UR                   S   S:  a<  UR                   S   S:  a)  U R                  nXSS2SS2S4   -  R                  U5      n U $ )ze
Tunes out the hidden states for padding tokens, see https://github.com/state-spaces/mamba/issues/66
Nr%   r   )r,   rT   r\   )r@   rO   rT   s      r2   apply_mask_to_padding_statesr      s_    
 !n&:&:1&=&AnFZFZ[\F]`aFa##&1d
)CCGGNr4   c            
       >  ^  \ rS rSrSrS\S\4U 4S jjr   SS\R                  S\
S-  S	\R                  S-  S
\R                  S-  4S jjr  SS\
S-  S	\R                  S-  4S jjr   SS\
S-  S	\R                  S-  S
\R                  S-  4S jjrSrU =r$ )GraniteMoeHybridMambaLayer   u(  
Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`.
A, D are input independent (see Mamba paper [1] Section 3.5.2 "Interpretation of A" for why A isn't selective)
∆, B, C are input-dependent (this is a key difference between Mamba and the linear time invariant S4,
and is why Mamba is called **selective** state spaces)

The are a few differences between this and Mamba2Mixer:
- The variable use_precomputed_states is slightly different due to the hybrid cache structure
- There's a few non-obvious bugs fixed with batching in the slow path that exist in main
- Some extra variables that our layer doesn't need have been removed
- We ported most of the refactors in https://github.com/huggingface/transformers/pull/35154, which is (as of Dec 18, 2024) unmerged
rf   rg   c           	        > [         TU ]  5         UR                  U l        UR                  U l        UR
                  U l        UR                  U l        [        UR                  U R                  -  5      U l        X l        UR                  U l        UR                  U l        ["        UR                     U l        UR&                  U l        UR*                  U l        UR.                  U l        UR2                  U l        UR6                  U l        UR:                  U l        UR<                  U l        UR>                  U l        U R                  SU R0                  -  U R                  -  -   U l         [B        RD                  " U R@                  U R@                  UR                  U R                  U R@                  U R                  S-
  S9U l#        U R                  U R@                  -   U R                  -   n[B        RH                  " U R                  UU R(                  S9U l%        [B        RL                  " [N        RP                  " U R                  5      5      U l)        [N        RT                  " SU R                  S-   5      n[B        RL                  " [N        RV                  " U5      5      U l,        [[        U R                  U R,                  S9U l.        [B        RL                  " [N        RP                  " U R                  5      5      U l/        [B        RH                  " U R                  U R                  U R(                  S9U l0        [c        S5      n[e        USS 5      q3[e        USS 5      q4[c        S	5      n[k        US
S9q6[k        USS9q7[k        USS9q8[s        [l        [n        [p        [h        [f        45      q:[t        (       d  [v        Ry                  S5        g [v        Ry                  S5        g )Nr)   r%   )in_channelsout_channelsrj   kernel_sizegroupspaddingri   epszcausal-conv1dcausal_conv1d_updatecausal_conv1d_fnz	mamba-ssmz8ops.triton.selective_state_update.selective_state_update)chained_pathz1ops.triton.ssd_combined.mamba_chunk_scan_combinedz8ops.triton.ssd_combined.mamba_split_conv1d_scan_combineda  The fast path is not available because one of `(selective_state_update, causal_conv1d_fn, causal_conv1d_update)` is None. Falling back to the naive implementation. To install follow https://github.com/state-spaces/mamba/#installation and https://github.com/Dao-AILab/causal-conv1dzOThe fast path for GraniteMoeHybrid will be used when running the model on a GPU)=rk   rl   mamba_n_heads	num_headsrn   mamba_d_statessm_state_sizemamba_d_convconv_kernel_sizer   mamba_expandintermediate_sizerg   mamba_conv_biasuse_conv_bias
hidden_act
activationr
   actmamba_proj_biasuse_biasrms_norm_epslayer_norm_epsilonmamba_n_groupsn_groupsmamba_d_headrI   mamba_chunk_sizer   time_step_limittime_step_mintime_step_maxconv_dimr   Conv1dconv1drs   in_proj	Parameterr-   r   dt_biasarangelogA_logGraniteMoeHybridRMSNormGatednormDout_projr   rm   r   r   r#   selective_state_updatemamba_chunk_scan_combined mamba_split_conv1d_scan_combinedallis_fast_path_availableloggerwarning_once)rz   rf   rg   projection_sizeAcausal_conv1d	mamba_ssmr{   s          r2   rl   #GraniteMoeHybridMambaLayer.__init__  s   --!--$22 & 3 3!$V%8%84;K;K%K!L"#33 ++&++,.."("5"5--++ 11%55#11#11..T]]1BTEXEX1XXii''--==))A-
 004==@4>>Qyy
 ||EJJt~~$>? LLDNNQ./\\%))A,/
01G1GTMdMde	ejj89		$"8"8$:J:JQUQ^Q^_ )9&}6LdS"=2DdK %[1	!8$^"
 %<$W%
! ,C$^,
(
 "%&)0 $"
 &%>  qrr4   Nr@   cache_paramsrO   seq_idxc                    [        X5      nU R                  U5      nUR                  u  pgnU R                  U R                  -  n	US L=(       a'    UR                  U R                  5      =(       a    US:H  n
U
(       Ga  UR                  S5      R                  U R                  U R                  U R                  /SS9u  pn[        UUR                  U R                     R                  U R                  R                   R                  S5      U R                  R"                  U R$                  5      n[&        R                  " UU R                  X/SS9u  pn[&        R(                  " U R*                  R-                  5       5      * nUS S 2S S4   S S 2S S 2S 4   R/                  SU R0                  U R                  5      R3                  [&        R4                  S9nUS S 2S S 2S 4   R/                  SSU R0                  5      nU R6                  S S 2S S4   R/                  SU R0                  5      nU R8                  S S 2S S4   R/                  SU R0                  5      nUR;                  X`R                  UR                  S   U R                  -  5      nUR;                  X`R                  UR                  S   U R                  -  5      nUR;                  X`R                  U R0                  5      n[=        UR                  U R                     R>                  UUUUUUS USS9
nUR;                  X`R                  U R0                  -  5      nU RA                  X5      nU RC                  U5      S S 2S S4   nU$ [&        R(                  " U R*                  R-                  5       5      * nU RD                  S[-        S	5      4:X  a  0 OS
U RD                  0nU RF                  (       a  Uc  [I        UU R                  R                   R                  S5      U R                  R"                  U R6                  U4U R8                  U RJ                  UU R$                  U R@                  R                   U R@                  RL                  U RB                  R                   U RB                  R"                  U R0                  U R                  SSS.UD6nU$ UR                  U R                  U R                  U R                  /SS9u  pnUbj  URO                  SS5      n[P        RR                  RU                  UU RV                  UR                  S   -
  S45      nURY                  UU R                  5      nU R$                  S;  aH  U R[                  U R                  URO                  SS5      5      SS U24   RO                  SS5      5      nOn[]        URO                  SS5      U R                  R                   R                  S5      U R                  R"                  U R$                  US9RO                  SS5      n[        X5      n[&        R                  " UU R                  X/SS9u  pn[_        UR;                  XgSU R0                  5      UUUR;                  XgU R                  S5      UR;                  XgU R                  S5      4U RJ                  U R8                  S USU R6                  SS.UD6u  nnUb  Ub  URa                  UU R                  5      nUR;                  XgS5      nU RA                  UU5      nU RC                  U5      nU$ )Nr%   r(   r*   .rT   T)zr   dt_softplusr   r   dt_limitF)r   r   r   r   rmsnorm_weightrmsnorm_epsoutproj_weightoutproj_biasheaddimngroupsnorm_before_gatereturn_final_statesr)   r   )siluswish)r/   weightrj   r   r   )r   r   r   r   r  r   r   )1r   r   r,   r   r   has_previous_staterg   squeezesplitr   r   r   r   layersconv_statesr   r  rj   r   r-   expr   floatrD   rI   r\   r[   r   r   r   r   recurrent_statesr   r   r   rV   r   r   variance_epsilonrY   r   r   r   r   update_conv_stater   r   r   update_recurrent_state)rz   r@   r   rO   r   projected_states
batch_sizeseq_len_groups_time_state_sizeuse_precomputed_statesgatehidden_states_B_CdtBCr   r   r   hidden_states_reshapedoutdt_limit_kwargshidden_states_B_C_transposedr  scan_output	ssm_states                             r2   cuda_kernels_forward/GraniteMoeHybridMambaLayer.cuda_kernels_forwardj  s{    5]S<<6 "/!4!4
Q!%1D1D!D $i)H)H)Xi]dhi]i 	
 "*:*B*B1*E*K*K''GR +L +'DR
 !5!##DNN3??""**1-  ! #(++!'')?X#Ma 4::++-..A!T3,1d
+222t}}dFYFYZ]]didqdq]rAAq$J&&r2t}}=Bll1dC<077DMMJGq$|$++B>Az==!''!*2MNAz==!''!*2MNA%2%7%7
NNTXTaTa%b"2##DNN3DD& M *..z>>DMM;YZM IIm:M --.q$|<C| 
w 4::++-..A$($8$8S%,<O$ObV`bfbvbvUwO }}!56$KK&&..q1KK$$LL ff####'99#3#3 $		 : :#'==#7#7!%!3!3 MM MM%*(-#$ &%l 
A /?.D.D++T]]DNNKQS /E /+  + 4E3N3NqRS3T0"$--"3"34..1M1S1STV1WWYZ[#K #/"@"@dnn"]K??*;;(,$5$?$?1$EFsHWH}U__`acde)% )9+55a;#{{1199!<![[--#'?? ')  i1o & %AAR$c!&+kk%++-C\'#! *C!&&zBNFF:rBFF:rB*  $ff#(, LL $* &*&Y" (\-E , C CIt~~ ^I)..zBG"iiT: mmK0
r4   c                    UR                   u  pEnUR                  n[        X5      nU R                  U5      nUR	                  U R
                  U R                  U R                  /SS9u  pnU
R                  SS5      n
US L=(       a'    UR                  U R                  5      =(       a    US:H  nU(       a  UR                  XR                  5      n[        R                  " XR                  R                  R!                  S5      -  SS9n
U R"                  (       a  XR                  R$                  -   n
U R'                  U
5      n
OUbV  [(        R*                  R-                  XR.                  U
R                   S   -
  S45      nUR                  XR                  5      nU R'                  U R                  U
5      SS U24   R                  SS5      5      n
[        X5      n
[        R                  " U
U R
                  U R0                  U R2                  -  U R0                  U R2                  -  /SS9u  pn[        R4                  " U R6                  R9                  5       5      * nU(       Ga  UR:                  U R                     R<                  R>                  nUS S 2SS S 24   S S 2S S4   nUR                  SS5      RA                  XKR                   S   U RB                  5      nU RD                  S   RA                  U RD                  R                   S   U RB                  5      n[        R(                  R*                  RG                  UURI                  UR                  5      -   5      n[        RJ                  " XRL                  S   U RL                  S   5      nUS   RA                  U R                  U RB                  U R2                  5      RI                  [        RN                  S	9n[        R4                  " US   U-  5      RI                  US
9nURQ                  X@R0                  S5      SS S S 24   nURA                  X@R0                  U R                  U R0                  -  UR                   S   5      RS                  5       nURQ                  USUR                   S   5      nUS   USS S S 24   -  nURQ                  USU RB                  5      nUUS   -  RI                  US
9nUR:                  U R                     R<                  U-  U-   nURU                  UU R                  5      nURQ                  X@R0                  S5      SS S S 24   nURA                  X@R0                  U R                  U R0                  -  UR                   S   5      RS                  5       nURQ                  USUR                   S   5      nURI                  UR>                  UR                  S9nURW                  X@R                  -  U RB                  U R2                  5      nURW                  X@R                  -  U R2                  S5      n[        RX                  " UU5      nURW                  X@R                  U RB                  5      nU RZ                  S   RA                  U RZ                  R                   S   U RB                  5      nUUU-  -   RI                  UR                  5      nURQ                  US5      S S 2S S4   nGO[(        R*                  RG                  XRD                  -   5      n[        RJ                  " XRL                  S   U RL                  S   5      nURQ                  XESU RB                  5      R9                  5       nURQ                  XESU R2                  5      R9                  5       nURQ                  XESU R2                  5      R9                  5       nUR]                  U R                  U R0                  -  SU R                  S9nUR]                  U R                  U R0                  -  SU R                  S9nU R^                  XPR^                  -  -
  U R^                  -  nU RZ                  S   [a        UU5      -  nXS   -  nURI                  UR                  5      U-  nUUUU4 Vs/ s H  n[c        UUU R^                  5      PM     snu  nnnnURe                  SSSS5      n[        Rf                  " USS9n[        R4                  " [i        U5      5      n US S 2S S 2S S 2S S S 2S S 24   US S 2S S 2S S S 2S S 2S S 24   -  n!U!R                  SS9n"U"S   U Re                  SSSSS5      S   -  n#U#R                  SS9n$U$S   US S 2S S 2S 4   -  R                  SS9n%[        R4                  " US S 2S S 2S S 2SS 24   U-
  5      n&UU&Re                  SSSS5      S   -  n'U'SS S S 24   US   -  R                  SS9n([        Rj                  " U(S S 2S S24   5      n)[        Rl                  " U)U(/SS9n([        R4                  " [i        [(        R*                  R-                  US S 2S S 2S S 2S4   S5      5      5      n*U*R                  SS5      n*U*S   U(S S 2S S 2S S4   -  R                  SS9n+U+S S 2S S24   U+S S 2S4   n,n([        R4                  " U5      n-USS S S 24   U(S S 2S S 2S S4   -  n.U-Re                  SSSS5      n/U.R                  S5      U/S   -  n0U%U0-   nURQ                  USU R                  U RB                  5      nUU-   nUS:  a  US S 2S U2S S 2S S 24   nURQ                  XES5      nU,b  Ub  URU                  U,U R                  5      n,U Ro                  UU	5      n1U Rq                  U1RI                  U5      5      n2U2$ s  snf )Nr(   r*   r%   r)   r   .r   ).NNr   r   r   )r+   output_sizer   r   r   )r%   r   )9r,   rT   r   r   r	  r   r   r   rY   r  rg   r  r-   sumr   r  r  r   rj   r   r   r   r   r   r   r   r  r   r  r
  r  r   rD   rI   r   softplusr\   clampr   r[   rE   r]   r  r   bmmr   repeat_interleaver   r   r   permuter   r   
zeros_liker.   r   r   )3rz   input_statesr   rO   r  r  r  rT   r  r  r  r  r  r  r@   r  r  r   cache_devicer   dAdBdBx
ssm_statesssm_states_reshaped
C_reshapedyr   r   
D_residualtA_cumsumLG_intermediateGM_intermediateMY_diagdecay_statesB_decaystatesprevious_statesdecay_chunk
new_statesr"  state_decay_outC_times_statesstate_decay_out_permutedY_offr!  contextualized_statess3                                                      r2   torch_forward(GraniteMoeHybridMambaLayer.torch_forward  s
    ".!3!3
Q"" 4LQ<<5&6&<&<''GR '= '
# .77!<!-T!9!~l>]>]^b^l^l>m!~ry}~r~ "&889JNN[K %		kk0088;;! !!$58H8H$H! $): ; ' mm//%(=(=@Q@W@WXZ@[([]^'_ +<<[..Y $5F)GXgX)V)`)`abde)f g89J[#kk##T]]T5H5H%H$--Z^ZmZmJmn
! YYtzz'')**!'..t~~>OOVVL Aq!GQc\*Ba#**:xx|T]]SBll9-44T\\5G5G5JDMMZG$$--b7::bhh3G.GHBR!5!5a!8$:N:Nq:QRB/"))$..$--I\I\]``glgtgt`uA))ByMA-.22,2GB
 		*mmR8dAFA]]DNNdmm4SUVU\U\]_U`allnA		*b!''"+6AI3a<0B *11*b$--PMi0044L4IC &,,T^^<MMPRRUXXJ%<<ZXJ 		*mmR8dAFA]]DNNdmm4SUVU\U\]_U`allnA		*b!''"+6A $ahhaggFJ",//*~~2Mt}}^b^q^q"r
^^ ;T=P=PRSTJ		-z:Az>>4==AA y!((a$--HA]Q&&**1773A 		*b)!T3,7A ''\\(9:BR!5!5a!8$:N:Nq:QRB)11*r4==Y__aM		*r43F3FGMMOA		*r43F3FGMMOA##DNNdmm$CX\XfXf#gA##DNNdmm$CX\XfXf#gA'OO*CCtVH	*-?x-XXJ *yM9M](()B.A cpqrtuwxay%zay\]&9!Xt&Way%z"M1a 		!Q1%A||A2.H 		+a.)A q!Qa23a1dAq!8K6LLN""r"*A y\AIIaAq!,DY,OON""r"*A 	l]1a:%>>CCCJF !99XaArsl%;h%FGL,..q"b!<YGGGc4l+mI.FFKKPQKRF $..va!e}=OYY8a@F))K0A0A(1aQRTV;BWY_0`$abK%//15K%o61dC9PPUUZ[U\J *1crc6 2Jq"u4EIF $ii1OT1oq!T30GGN'6'>'>q!Q'J$#''+.Fy.QQE A		*b$..$--HAJA!|a'1a'(		*r2A $)A(??	4>>Z	ii4(
 !%knnU.C D$$A &{s   .!oc                    [         (       aO  SU R                  R                  R                  R                  ;   a!  [        5       (       d  U R                  XX45      $ Ub  [        S5      eUR                  nUbC  UR                  S   S:  a0  UR                  S   S:  a  XS S 2S S 2S 4   -  R                  U5      nU R                  XU5      $ )Ncudaz\`seq_idx` support requires fast path support. Please install `mamba_ssm` and `causal_conv1d`r%   r   )r   r   r  r   typer   r#  NotImplementedErrorrT   r,   r\   rL  )rz   r@   r   rO   r   rR   rT   s          r2   r   "GraniteMoeHybridMambaLayer.forward  s     "!f0C0C0J0J0O0O&OXpXrXr,,].bb%n  ##%.*>*>q*AA*E.J^J^_`JadeJe*Aq$J-GGKKERM!!-~NNr4   )r   r   r   r   r   r   r   r   r   rI   rn   r   r   rg   r   r   r   r   r   r   r   r   r   r   r   NNNr   )r   r   r   r   r   r&   r   rl   r-   r   r   	IntTensorr#  rL  r   r   r   r   s   @r2   r   r      s    Zs5 Zs# Zs~ &*.2*._||_ dl_ t+	_
 4'_J &*.2	z% dlz% t+	z%@ &*.2*.O dlO t+	O
 4'O Or4   r   c                   6   ^  \ rS rSrSU 4S jjrSS jrSrU =r$ )r   i  c                    > [         TU ]  5         [        R                  " [        R
                  " U5      5      U l        X l        g Nrk   rl   r   r   r-   r   r  r  rz   rn   r   r{   s      r2   rl   %GraniteMoeHybridRMSNormGated.__init__  s-    ll5::k#:; #r4   c                    UR                   nUR                  [        R                  5      nUb?  U[        R
                  R                  UR                  [        R                  5      5      -  nUR                  S5      R                  SSS9nU[        R                  " X@R                  -   5      -  nU R                  UR                  U5      -  $ Nr)   r(   T)keepdim)rT   r\   r-   r[   r   r   r  powmeanrsqrtr  r  )rz   r@   r  input_dtypevariances        r2   r   $GraniteMoeHybridRMSNormGated.forward  s    #))%((7)BMM,>,>twwu}}?U,VVM $$Q',,R,>%H?T?T4T(UU{{]--k:::r4   r  r  gư>rW  )r   r   r   r   rl   r   r   r   r   s   @r2   r   r     s    $
	; 	;r4   r   c                   n   ^  \ rS rSrSrS\4U 4S jjrS\R                  S\R                  4S jr	Sr
U =r$ )	GraniteMoeHybridMLPi  zj
MLP layer for shared experts

Args:
    config:
        Configuration object with model hyperparameters.
rf   c                 X  > [         TU ]  5         UR                  U l        UR                  U l        [
        UR                     U l        [        R                  " U R                  U R                  S-  SS9U l
        [        R                  " U R                  U R                  SS9U l        g )Nr)   Fri   )rk   rl   rn   
input_sizeshared_intermediate_sizer
   r   r   r   rs   input_linearoutput_linearrz   rf   r{   s     r2   rl   GraniteMoeHybridMLP.__init__  s     ,,!:: !2!23IIdoot7G7G!7KRWXYYt'7'7uUr4   r@   rB   c                     U R                  U5      nUR                  SSS9nU R                  US   5      US   -  nU R                  U5      nU$ )Nr)   r(   r*   r   r%   )rk  chunkr   rl  )rz   r@   chunked_hidden_statess      r2   r   GraniteMoeHybridMLP.forward  s^    ))-8 - 3 3A2 3 >(=a(@ADYZ[D\\**=9r4   )r   rn   rk  ri  rl  )r   r   r   r   r   r&   rl   r-   r   r   r   r   r   s   @r2   rg  rg    s7    V5 VU\\ ell  r4   rg  c                      ^  \ rS rSr% \R
                  \S'   SS\4U 4S jjjr\	   SS\S-  S\
S   S\S-  S	\S
\4   4S jj5       r\R                  " 5       \S 5       5       rSrU =r$ )GraniteMoeHybridRotaryEmbeddingi  inv_freqNrf   c                   > [         TU ]  5         UR                  U l        UR                  U l        Xl        U R
                  R                  S   U l        U R                  nU R                  S:w  a  [        U R                     nU" U R
                  U5      u  o@l
        U R                  SUSS9  U R                  SUR                  5       SS9  g )N	rope_typedefaultru  F)
persistentoriginal_inv_freq)rk   rl   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenrf   rope_parametersrw  compute_default_rope_parametersr   attention_scalingregister_bufferclone)rz   rf   r   rope_init_fnru  r{   s        r2   rl   (GraniteMoeHybridRotaryEmbedding.__init__  s    "("@"@$*$B$B!44[A!%!E!E>>Y&.t~~>L+7V+L((ZeD0(..2BuUr4   r   ztorch.devicer  rB   ztorch.Tensorc           	         U R                   S   n[        U SS5      =(       d    U R                  U R                  -  nSnSU[        R
                  " SUS[        R                  S9R                  U[        R                  S9U-  -  -  nXe4$ )	aH  
Computes the inverse frequencies according to the original RoPE implementation
Args:
    config ([`~transformers.PreTrainedConfig`]):
        The model configuration.
    device (`torch.device`):
        The device to use for initialization of the inverse frequencies.
    seq_len (`int`, *optional*):
        The current sequence length. Unused for this type of RoPE.
Returns:
    Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
    post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).

rope_thetarI   Ng      ?r   r)   r   r   )	r~  rm   rn   ro   r-   r   int64r\   r  )rf   r   r  baser+   attention_factorru  s          r2   r  ?GraniteMoeHybridRotaryEmbedding.compute_default_rope_parameters  s    & %%l3fj$/c63E3EIcIc3c U\\!S!5;;?BB&X]XcXcBdgjjk
 ))r4   c                 L   U R                   S S S 2S 4   R                  5       R                  UR                  S   SS5      R	                  UR
                  5      nUS S 2S S S 24   R                  5       n[        UR
                  R                  [        5      (       a0  UR
                  R                  S:w  a  UR
                  R                  OSn[        USS9   UR                  5       UR                  5       -  R                  SS5      n[        R                  " Xf4SS	9nUR                  5       U R                  -  nUR                  5       U R                  -  n	S S S 5        WR	                  UR                   S
9W	R	                  UR                   S
94$ ! , (       d  f       N@= f)Nr   r(   r%   mpscpuF)device_typeenabledr)   r*   r   )ru  r  rD   r,   r\   r   
isinstancerP  strr!   rY   r-   r.   r:   r  r;   rT   )
rz   r/   position_idsinv_freq_expandedposition_ids_expandedr  freqsembr:   r;   s
             r2   r   'GraniteMoeHybridRotaryEmbedding.forward<  sN    !MM$4-8>>@GGHZHZ[\H]_acdehhijiqiqr ,QaZ 8 > > @'1!((--'E'E!((--[`J`ahhmmfkUC&,,.1F1L1L1NNYYZ[]^_E))UN3C'')d444C'')d444C	 D vvAGGv$cff177f&;;; DCs   BF
F#)r  rf   r|  r}  rw  rW  rS  )r   r   r   r   r-   r   __annotations__r&   rl   staticmethodr   r   r   r  r  no_gradr   r   r   r   r   s   @r2   rt  rt    s    llV5 V V  04+/"*&-*(* t* 
~u$	%	* *: ]]_<  <r4   rt  c                   B   ^  \ rS rSrS\S\S\SS4U 4S jjrS rS	rU =r$ )
GraniteMoeHybridParallelExpertsiL  num_expertsri  r'  rB   Nc                    > [         TU ]  5         [        R                  " [        R
                  " XU5      5      U l        Xl        X l        X0l	        g)a]  
Initialize the GraniteMoeHybridParallelExperts module.
The experts weights are stored in [num_experts, output_size, input_size] format. Such that it's compatible with
many MoE libraries, such as [Megablock](https://github.com/databricks/megablocks) and
[ScatterMoE](https://github.com/shawntan/scattermoe), as well as the
[MoE kernel](https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/fused_moe/fused_moe.py)
used in vllm.

Args:
    num_experts (int):
        Number of experts.
    input_size (int):
        Size of the input.
    output_size (int):
        Size of the output.
N)
rk   rl   r   r   r-   emptyr  r  ri  r'  )rz   r  ri  r'  r{   s       r2   rl   (GraniteMoeHybridParallelExperts.__init__M  s<    " 	ll5;;{#TU&$&r4   c                     UR                  USS9n/ n[        U R                  5       H8  nUR                  [        R
                  " X5   U R                  U   5      5        M:     [        R                  " USS9nU$ )z
Forward pass of the GraniteMoeHybridParallelExperts module.

Args:
    inputs (Tensor):
        Input tensor.
    expert_size:
        Expert size information.

Returns:
    Tensor: Output tensor.
r   r*   )	r	  ranger  appendFlinearr  r-   r.   )rz   inputsexpert_size
input_listoutput_listiresultss          r2   r   'GraniteMoeHybridParallelExperts.forwardd  sh     \\+1\5
t''(Aqxx
t{{1~FG )))KQ/r4   )ri  r  r'  r  	r   r   r   r   r   rl   r   r   r   r   s   @r2   r  r  L  s.    'C 'S 's 't '. r4   r  c                   >   ^  \ rS rSrS\S\S\4U 4S jjrS rSrU =r$ )GraniteMoeHybridTopKGatingiy  ri  r  top_kc                 z   > [         TU ]  5         X l        Xl        X0l        [
        R                  " XSS9U l        g)z
Initialize the top-k gating mechanism.

Args:
    input_size (`int`):
        Size of the input.
    num_experts (`int`):
        Number of experts.
    top_k (`int`):
        Number of top experts to select.
Fri   N)rk   rl   r  ri  r  r   rs   layer)rz   ri  r  r  r{   s       r2   rl   #GraniteMoeHybridTopKGating.__init__z  s2     	&$
YYzUC
r4   c                 z   U R                  U5      R                  5       nUR                  U R                  SS9u  p4[        R
                  " USS9R                  U5      n[        R                  " UR                  S5      U R                  /UR                  UR                  S9nUR                  SUS5      nUR                  5       R                  S5      nUR                  5       nUR!                  5       n	U	R#                  S5      u  pUR%                  U R                  SS9nUR!                  5       nX[   nXXU4$ )Nr%   r*   r   rT   r   trunc)rounding_mode)r  r  topkr  r-   rZ   type_aszerosr   r  rT   r   scatterlongr(  tolistflattensortdiv)rz   r@   logitstop_k_logitstop_k_indicestop_k_gatesr  gatesr  top_k_expertsr  index_sorted_expertsbatch_indexbatch_gatess                 r2   r   "GraniteMoeHybridTopKGating.forward  s"   M*002&,kk$**!k&D#mmLa8@@O a $"2"23;;L;LU`UgUg
 a2jjl&&q) "((* &--/"/"4"4Q"7*..tzz.Q "))+!7#+FRRr4   )ri  r  r  r  r  r   s   @r2   r  r  y  s-    D3 DS D D(S Sr4   r  c                   :   ^  \ rS rSrSrS\4U 4S jjrS rSrU =r	$ )GraniteMoeHybridMoEi  z
A Sparsely gated mixture of experts layer with 1-layer Feed-Forward networks as experts.

Args:
    config:
        Configuration object with model hyperparameters.
rf   c                   > [         TU ]  5         UR                  U l        UR                  U l        [
        UR                     U l        [        UR                  U R                  U R                  S-  5      U l
        [        UR                  U R                  U R                  5      U l        [        U R                  UR                  UR                  S9U l        g )Nr)   )ri  r  r  )rk   rl   rn   ri  r   r
   r   r   r  num_local_expertsrk  rl  r  num_experts_per_tokrouterrm  s     r2   rl   GraniteMoeHybridMoE.__init__  s     ,,!33 !2!23;$$doot7G7G!7K
 =$$d&6&6
 100,,
r4   c                    UR                  5       u  p#nUR                  SU5      nU R                  U5      u  pVpxnX   n	U R                  X5      n
U
R	                  SSS9nU R                  US   5      US   -  n
U R                  X5      nXS S 2S 4   -  n[        R                  " X#-  U R                  4UR                  UR                  S9nUR                  SXl5      nUR                  X#U R                  5      nU$ )Nr(   r)   r*   r   r%   r  )r   rE   r  rk  rp  r   rl  r-   r  ri  rT   r   	index_addr   )rz   layer_inputbszlengthemb_sizer  r  r  r  expert_inputsr@   rq  expert_outputsr  layer_outputs                  r2   r   GraniteMoeHybridMoE.forward  s    + 0 0 2X!))"h76:kk+6N3!#0))-E - 3 3A2 3 >(=a(@ADYZ[D\\++MG'ag*>>S\4??;>CWCW`n`u`uvq+F#((dooFr4   )r   rn   rk  ri  rl  r  )
r   r   r   r   r   r&   rl   r   r   r   r   s   @r2   r  r    s    
5 
& r4   r  c                       \ rS rSr% Sr\R                  \S'   \R                  \S'   \\S'   \\S'   \R                  \S'   Sr
g	)
GraniteFlashAttentionKwargsi  a   
Keyword arguments for advanced Flash Attention, causal-conv1d, and mamba_ssm kernel usage.
Use cases include padding-free training and fewer `torch.compile` graph breaks.

cu_seq_lens_q (`torch.LongTensor`):
    Gets cumulative sequence length for query state.
cu_seq_lens_k (`torch.LongTensor`):
    Gets cumulative sequence length for key state.
max_length_q (`int`):
    Maximum sequence length for query state.
max_length_k (`int`):
    Maximum sequence length for key state.
seq_idx (`torch.IntTensor):
    Index of each packed sequence.
cu_seq_lens_qcu_seq_lens_kmax_length_qmax_length_kr    N)r   r   r   r   r   r-   
LongTensorr  r   rT  r   r  r4   r2   r  r    s7      ######__r4   r  F)totalRMSNormc                   x   ^  \ rS rSrS
S\SS4U 4S jjjrS\R                  S\R                  4S jrS r	S	r
U =r$ )GraniteMoeHybridRMSNormi  r   rB   Nc                    > [         TU ]  5         [        R                  " [        R
                  " U5      5      U l        X l        g)z6
GraniteMoeHybridRMSNorm is equivalent to T5LayerNorm
NrX  rY  s      r2   rl    GraniteMoeHybridRMSNorm.__init__  s/     	ll5::k#:; #r4   r@   c                    UR                   nUR                  [        R                  5      nUR	                  S5      R                  SSS9nU[        R                  " X0R                  -   5      -  nU R                  UR                  U5      -  $ r\  )	rT   r\   r-   r[   r^  r_  r`  r  r  )rz   r@   ra  rb  s       r2   r   GraniteMoeHybridRMSNorm.forward  sw    #))%((7 $$Q',,R,>%H?T?T4T(UU{{]--k:::r4   c                 ^    [        U R                  R                  5       SU R                   3$ )Nz, eps=)r   r  r,   r  )rz   s    r2   
extra_repr"GraniteMoeHybridRMSNorm.extra_repr  s*    ))*+6$2G2G1HIIr4   rd  re  )r   r   r   r   r  rl   r-   r   r   r  r   r   r   s   @r2   r  r    sB    $ $$ $ $;U\\ ;ell ;J Jr4   r  c                   >  ^  \ rS rSrS\S\4U 4S jjr\    SS\R                  S\R                  S-  S\
S-  S	\S-  S
\\R                  \R                  4   S-  S\\   S\\R                  \\R                  \R                  4   S-  4   4S jj5       rSrU =r$ )GraniteMoeHybridDecoderLayeri  rf   rg   c                 ,  > [         TU ]  5         UR                  U l        S U l        [	        UR                  UR
                  S9U l        [	        UR                  UR
                  S9U l        UR                  S:  a  [        U5      OS U l
        UR                  U l        [        U5      U l        S U l        UR                  U   S:X  a  [!        X5      U l        O[#        X5      U l        UR                  U   U l        ['        USS5      S:  U l        g )Nr   r   mambar  )rk   rl   rn   	self_attnr  r   input_layernormpost_attention_layernormr  r  block_sparse_moeresidual_multiplierrg  
shared_mlpr  layers_block_typer   rd   
layer_typerm   has_expertsry   s      r2   rl   %GraniteMoeHybridDecoderLayer.__init__  s    !--6v7I7IvObObc(?@R@RX^XkXk(l% @F?W?WZ[?[ 3F ;ae#)#=#= -f5
##I.'93FFDJ6vIDN 229= #6+>BQFr4   Nr@   rO   r}   	use_cacher~   rR   rB   c           	         UnU R                  U5      nU R                  b  U R                  " SUUUS.UD6nOU R                  " SUUUUUS.UD6u  pXqU R                  -  -   nUnU R	                  U5      nU R
                  (       a%  U R                  U5      n	XR                  U5      -   nOU R                  U5      nXqU R                  -  -   nU$ )N)r@   r   rO   )r@   rO   r}   r  r~   r  )r  r  r  r  r  r  r  r  )
rz   r@   rO   r}   r  r~   rR   residualr  moe_hidden_statess
             r2   r   $GraniteMoeHybridDecoderLayer.forward  s     !,,];::! JJ +,- 	M  $~~  +- /#$7   M !43K3K#KK 55mD $ 5 5m D-0NNM OOM:M 43K3K#KKr4   )
r  r  rn   r  r  r  r  r  r  r  )NNFN)r   r   r   r   r&   r   rl   r   r-   r   r   r   r   r   r  FloatTensorr   r   r   r   s   @r2   r  r    s    G5 G# G.  /3(,!&HL(||( t+( 	(
 $;( #5<<#=>E( 45( 
u  %(9(95;L;L(L"MPT"TT	U( (r4   r  c                      ^  \ rS rSr% \\S'   SrSrS/rS/r	Sr
SrSrSrSr\\S.rSr\R(                  " 5       U 4S	 j5       rS
rU =r$ )GraniteMoeHybridPreTrainedModeliJ  rf   modelTr  r}   F)r@   
attentionsc           
      |  > [         TU ]  U5        [        U[        5      (       a4  [        R
                  " UR                  SU R                  R                  S9  [        U[        5      (       a  [        R                  " UR                  5        [        R                  " UR                  [        R                  " [        R                   " SUR"                  S-   5      5      5        [        R                  " UR$                  5        g [        U[&        5      (       a!  [        R                  " UR                  5        g g )Nr   )r_  stdr%   )rk   _init_weightsr  r  initnormal_r  rf   initializer_ranger   ones_r   copy_r   r-   r   r   r   r   r   )rz   rK   r{   s     r2   r	  -GraniteMoeHybridPreTrainedModel._init_weights\  s    f%f=>>LLSdkk6S6STf899JJv~~&JJv||UYYu||Av?O?ORS?S/T%UVJJvxx  <==JJv}}% >r4   r  )r   r   r   r   r&   r  base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraph_supports_attention_backendr  rd   _can_record_outputs_is_statefulr-   r  r	  r   r   r   s   @r2   r  r  J  sq    ""&*#78#4"5N""&5/ L
]]_	& 	&r4   r  c                     ^  \ rS rSrS\4U 4S jjr\\\      SS\	R                  S-  S\	R                  S-  S\	R                  S-  S\S-  S	\	R                  S-  S
\S-  S\\   S\\-  4S jj5       5       5       rS rSrU =r$ )GraniteMoeHybridModelii  rf   c           	      R  > [         TU ]  U5        UR                  U l        UR                  U l        [
        R                  " UR                  UR                  U R                  5      U l        [
        R                  " [        UR                  5       Vs/ s H  n[        X5      PM     sn5      U l        [        UR                  UR                  S9U l        UR"                  S:X  a  [%        U5      OS U l        SU l        UR*                  U l        U R-                  5         g s  snf )Nr   ropeF)rk   rl   pad_token_idpadding_idx
vocab_sizer   	Embeddingrn   embed_tokens
ModuleListr  num_hidden_layersr  r
  r  r   r   position_embedding_typert  
rotary_embgradient_checkpointingembedding_multiplier	post_initry   s      r2   rl   GraniteMoeHybridModel.__init__k  s     !.. ++LL):):F<N<NPTP`P`ammNSTZTlTlNmnNm)&<Nmn
 ,F,>,>FDWDWX	EKEcEcgmEm9&Asw&+#$*$?$?! 	 os   D$N	input_idsrO   r  r}   inputs_embedsr  rR   rB   c           	         US L US L-  (       a  [        S5      eUc  U R                  U5      nXPR                  -  nU(       a  Uc  [        U R                  S9nUcU  Ub  UR                  5       OSn[        R                  " UR                  S   UR                  S9U-   nUR                  S5      n0 n	[        U R                  R                  5       H6  n
SU
;   a  U R                  X$5      X'   M  [        U R                  UUUS9X'   M8     UnS nU R                  b  U R                  X5      n[!        U R"                  5       H,  u  pU" U4XR                  R                  U      UUUS.UD6nM.     U R%                  U5      n['        UUS	9$ )
Nz:You must specify exactly one of input_ids or inputs_embeds)rf   r   r%   r&  r  )rf   r-  rO   r}   )rO   r}   r  r~   )last_hidden_stater}   )
ValueErrorr#  r)  r   rf   get_seq_lengthr-   r   r,   r   r7   setr  _update_mamba_maskr   r'  	enumerater
  r   r   )rz   r,  rO   r  r}   r-  r  rR   past_seen_tokenscausal_mask_mappingr  r@   r~   r  decoder_layers                  r2   r   GraniteMoeHybridModel.forward|  s    -t";<YZZ  --i8M%(A(AA0*$++>OCRC^==?de <<(;(;A(>}G[G[\_ooL'11!4L dkk;;<J*$262I2I.2j#/2D;;"/#1$3	3#/	 = &"??&"&//-"N )$++ 6A)2;;3P3PQR3ST /#$7 M !7 		-0%++
 	
r4   c                     UnUb  UR                  5       (       d!  Ub   [        R                  " US:H  5      (       a  SnU$ )zV
No need for zeroing states when
    1. Cached forward
    2. Attending to all inputs
Nr%   )r  r-   r   )rz   rO   r}   
mamba_masks       r2   r3  (GraniteMoeHybridModel._update_mamba_mask  sA     $
'O,N,N,P,P&599^q5H+I+IJr4   )r#  r)  r(  r
  r   r   r'  r!  )NNNNNN)r   r   r   r   r&   rl   r   r"   r$   r-   r  r   r   r  r   r   r  r   r   r   r3  r   r   r   s   @r2   r  r  i  s    5 "  .2.204(,26!%:
##d*:
 t+:
 &&-	:

 :
 ((4/:
 $;:
 45:
 
(	(:
    :
x r4   r  gate_logitsr  c                    U b  [        U [        5      (       d  g[        U [        5      (       aC  U S   R                  n[        R                  " U  Vs/ s H  oUR                  U5      PM     snSS9n[        R                  R                  R                  WSS9n[        R                  " XrSS9u  p[        R                  R                  R                  X5      n
Uc:  [        R                  " U
R                  5       SS9n[        R                  " USS9nGOUR                  u  pUR                  S   X-  -  nUSSS2SS2SS4   R                  XXU45      R                  SX!5      R                  W5      n[        R                   " U
R                  5       U-  SS9[        R                   " USS9-  nUSSS2SS2S4   R                  XX45      R                  SU5      R                  U5      n[        R                   " UU-  SS9[        R                   " USS9-  n[        R                   " XR#                  S5      -  5      nUU-  $ s  snf )ax  
Computes auxiliary load balancing loss as in Switch Transformer - implemented in Pytorch.

See Switch Transformer (https://huggingface.co/papers/2101.03961) for more details. This function implements the loss
function presented in equations (4) - (6) of the paper. It aims at penalizing cases where the routing between
experts is too unbalanced.

Args:
    gate_logits:
        Logits from the `gate`, should be a tuple of model.config.num_hidden_layers tensors of
        shape [batch_size X sequence_length, num_experts].
    num_experts:
        Number of experts
    top_k:
        The number of experts to route per-token, can be also interpreted as the `top-k` routing
        parameter.
    attention_mask (`torch.Tensor`, *optional*):
        The attention_mask used in forward function
        shape [batch_size X sequence_length] if not None.

Returns:
    The auxiliary loss.
Nr   r*   r(   )r  r   r   r-   r.   r\   r   r   rZ   r  one_hotr_  r  r,   rD   rE   r(  r7   )r<  r  r  rO   compute_device
layer_gateconcatenated_gate_logitsrouting_weightsr  selected_expertsexpert_masktokens_per_expertrouter_prob_per_expertr  sequence_lengthr%  expert_attention_mask router_per_expert_attention_maskoverall_losss                      r2   load_balancing_loss_funcrK    s+   : *[%"@"@+u%%$Q..#(99^i-j^iPZmmN.K^i-jpq#r hh))112JPR1SO**_DA((%%--.>LK!JJ{'8'8':B "'O!C&4&:&:#
4::1=*B^_ 4AtT12V&OKXYWR,R	 	 "IIk&7&7&9<Q&QWXY\a\e\e!q]
 
 4At+,V&OQRWR%R	 	) "'?=]+]cd!ehmhqhq,!i
 "
 99.1Q1QRS1TTUL+%%[ .ks   Ic                   T  ^  \ rS rSrSS0rSS0rSS/S/40rS\4U 4S	 jjr\	\
        SS\R                  S
-  S\R                  S
-  S\R                  S
-  S\S
-  S\R                  S
-  S\R                  S
-  S\S
-  S\\R                  -  S\\-  4S jj5       5       rSrU =r$ )GraniteMoeHybridForCausalLMi  zlm_head.weightzmodel.embed_tokens.weightlm_headcolwise_gather_outputr@   r  rf   c                 l  > [         TU ]  U5        [        U5      U l        UR                  U l        [
        R                  " UR                  UR                  SS9U l        UR                  U l	        UR                  U l        UR                  U l        UR                  U l        U R                  5         g )NFri   )rk   rl   r  r  r!  r   rs   rn   rN  router_aux_loss_coefr  r  r  logits_scalingr*  rm  s     r2   rl   $GraniteMoeHybridForCausalLM.__init__!  s     *62
 ++yy!3!3V5F5FUS$*$?$?!!33#)#=#= $33 	r4   Nr,  rO   r  r}   r-  labelsoutput_router_logitslogits_to_keeprB   c	           
         Ub  UOU R                   R                  nU R                  " SUUUUUS.U	D6n
U
R                  n[	        U[
        5      (       a  [        U* S5      OUnU R                  USS2USS24   5      nXR                   R                  -  nSnUb*  U R                  " UU4SU R                   R                  0U	D6nSnU(       aY  [        U
R                  U R                  U R                  U5      nUb*  XR                  UR!                  UR"                  5      -  -  n[%        UUUU
R&                  U
R(                  U
R*                  U
R                  S9$ )a  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

Example:

```python
>>> from transformers import AutoTokenizer, GraniteMoeHybridForCausalLM

>>> model = GraniteMoeHybridForCausalLM.from_pretrained("ibm-granite/granite-4.0-h-tiny")
>>> tokenizer = AutoTokenizer.from_pretrained("ibm-granite/granite-4.0-h-tiny")

>>> prompt = "Hey, are you conscious? Can you talk to me?"
>>> inputs = tokenizer(prompt, return_tensors="pt")

>>> # Generate
>>> generate_ids = model.generate(inputs.input_ids, max_length=30)
>>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
"Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
```N)r,  rO   r  r}   r-  r!  )lossaux_lossr  r}   r@   r  router_logitsr  )rf   rU  r  r/  r  r   slicerN  rR  loss_functionr!  rK  rZ  r  r  rQ  r\   r   r   r}   r@   r  )rz   r,  rO   r  r}   r-  rT  rU  rV  rR   outputsr@   slice_indicesr  rX  rY  s                   r2   r   #GraniteMoeHybridForCausalLM.forward.  sw   J %9$D $++JjJj 	 ** 
)%+'
 
  118B>SV8W8W~ot4]kmA}a,?@A++444%%  ;;11 	D /%%  ((	H !11HKK4LLL(#33!//))!//
 	
r4   )rN  rR  r  r  r  rQ  r!  )NNNNNNNr   )r   r   r   r   _tied_weights_keys_tp_plan_pp_planr&   rl   r   r   r-   r  r   r   r  r   r   r   r   r   r   r   r   s   @r2   rM  rM    s    *,GH23H_-z:;H5   .2.204(,26*.,0-.Q
##d*Q
 t+Q
 &&-	Q

 Q
 ((4/Q
   4'Q
 #TkQ
 ell*Q
 
*	*Q
  Q
r4   rM  )rM  r  r  )r%   )r   )Nr)   N)Ycollections.abcr   typingr   r   r-   r   torch.nnr   r   r	   r
  activationsr
   cache_utilsr   r   
generationr   integrationsr   r   r   integrations.hub_kernelsr   masking_utilsr   modeling_layersr   modeling_outputsr   r   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r    utils.genericr!   r"   utils.import_utilsr#   utils.output_capturingr$   configuration_granitemoehybridr&   
get_loggerr   r   r3   r?   r   r   rJ   Moduler  rb   rd   r   r   r   r   r   r   rg  rt  r  r  r  r  r  r  r  r  r   rK  rM  __all__r  r4   r2   <module>rz     s   * % &   $ & ! . ) f f 8 / 9 j j K F & l l G 9 5 B 
		H	%( *+ ,2	UU\\ 	U# 	U%,, 	U& %II%<<% 
% <<	%
 LL4'% % % '(%2 )*A)		 A) +A)NVU\\ VS V
((	\O \O~;588?? ;$")) 4><bii ><B*bii *Z.S .Sb,")) ,^)5 0 Y'Jbii J (J(A#= AH &o & &< \; \ \B #
*.	O&ell 33d:O&tO& LL4'	O&
 \\CO&d e
"A? e
 e
P fr4   