
    Z j                        S SK r S SKrS SKJr  SSKJr  SSKJrJrJ	r	  SSK
Jr  SSKJrJrJrJrJr  SS	KJr  \R(                  " \5      r " S
 S\R.                  5      r " S S\5      r " S S\5      rS rS!S jr " S S\5      r " S S\5      r " S S\5      r " S S\\5      r  " S S\5      r! " S S\5      r" " S S\	5      r#/ S Qr$g)"    N   )logging   )GemmaForCausalLMGemmaForSequenceClassificationGemmaForTokenClassification)GraniteAttention)LlamaDecoderLayerLlamaMLP
LlamaModelLlamaPreTrainedModelLlamaRotaryEmbedding   )HeliumConfigc                   8   ^  \ rS rSrSU 4S jjrS rS rSrU =r$ )HeliumRMSNorm   c                    > [         TU ]  5         [        R                  " [        R
                  " U5      5      U l        X l        g N)super__init__nn	Parametertorchonesweightvariance_epsilon)selfhidden_sizeeps	__class__s      z/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/helium/modular_helium.pyr   HeliumRMSNorm.__init__   s-    ll5::k#:; #    c                 V   UR                   nUR                  [        R                  5      nUR	                  S5      R                  SSS9nU[        R                  " X0R                  -   5      -  nU R                  R                  [        R                  5      U-  R                  U5      $ )Nr   T)keepdim)	dtypetor   float32powmeanrsqrtr   r   )r   hidden_statesinput_dtypevariances       r"   forwardHeliumRMSNorm.forward$   s    #))%((7 $$Q',,R,>%H?T?T4T(UUu}}-=AA+NNr$   c                 ^    [        U R                  R                  5       SU R                   3$ )Nz, eps=)tupler   shaper   )r   s    r"   
extra_reprHeliumRMSNorm.extra_repr+   s*    ))*+6$2G2G1HIIr$   )r   r   )gư>)	__name__
__module____qualname____firstlineno__r   r1   r6   __static_attributes____classcell__r!   s   @r"   r   r      s    $
OJ Jr$   r   c                       \ rS rSrSrg)HeliumRotaryEmbedding/    Nr8   r9   r:   r;   r<   rB   r$   r"   r@   r@   /       r$   r@   c                       \ rS rSrSrg)	HeliumMLP3   rB   NrC   rB   r$   r"   rF   rF   3   rD   r$   rF   c                 x    U SSSS24   nU SSSS24   n[         R                  " U* U4SS9R                  S5      $ )	z*Rotates half the hidden dims of the input..r   Nr   r   r&   dim)r   stackflatten)xx1x2s      r"   rotate_halfrQ   7   sJ    	
319B	
319B;;Ryb)11"55r$   c                 4   UR                  U5      nUR                  U5      nUSSUR                  S   S-  24   R                  SSS9nUSSUR                  S   S-  24   R                  SSS9nX-  [        U 5      U-  -   nX-  [        U5      U-  -   nXV4$ )aI  Applies Rotary Position Embedding to the query and key tensors.

Args:
    q (`torch.Tensor`): The query tensor.
    k (`torch.Tensor`): The key tensor.
    cos (`torch.Tensor`): The cosine part of the rotary embedding.
    sin (`torch.Tensor`): The sine part of the rotary embedding.
    unsqueeze_dim (`int`, *optional*, defaults to 1):
        The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
        sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
        that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
        k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
        cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
        the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
Returns:
    `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
.Nr&   r   rI   )	unsqueezer5   repeat_interleaverQ   )qkcossinunsqueeze_dimq_embedk_embeds          r"   apply_rotary_pos_embr\   >   s    $ --
&C
--
&C c'SYYr]a'''
(
:
:1"
:
EC
c'SYYr]a'''
(
:
:1"
:
ECw;q>C/0Gw;q>C/0Gr$   c                   >   ^  \ rS rSrSS\S\S-  4U 4S jjjrSrU =r$ )HeliumAttention]   Nconfig	layer_idxc                    > [         TU ]  X5        [        R                  " UR                  UR                  SS9U l        S[        R                  " U R                  5      -  U l	        g )NF)biasr   )
r   r   r   Linearr   o_projmathsqrthead_dimscalingr   r`   ra   r!   s      r"   r   HeliumAttention.__init__^   sI    +ii 2 2F4F4FUS499T]]33r$   )re   ri   r   	r8   r9   r:   r;   r   intr   r<   r=   r>   s   @r"   r^   r^   ]   s    4| 4d
 4 4r$   r^   c                   >   ^  \ rS rSrSS\S\S-  4U 4S jjjrSrU =r$ )HeliumDecoderLayerd   Nr`   ra   c                    > [         TU ]  X5        [        U5      U l        [	        UR
                  UR                  S9U l        [	        UR
                  UR                  S9U l        g )Nr    )	r   r   rF   mlpr   r   rms_norm_epsinput_layernormpost_attention_layernormrj   s      r"   r   HeliumDecoderLayer.__init__e   sR    +V$,V-?-?VEXEXY(5f6H6HfNaNa(b%r$   )ru   rs   rv   r   rl   r>   s   @r"   ro   ro   d   s#    c| cd
 c cr$   ro   c                       \ rS rSrSrg)HeliumPreTrainedModelm   rB   NrC   rB   r$   r"   ry   ry   m   rD   r$   ry   c                   0   ^  \ rS rSrS\4U 4S jjrSrU =r$ )HeliumModelq   r`   c           	      2  > [         TU ]  U5        [        R                  " [	        UR
                  5       Vs/ s H  n[        X5      PM     sn5      U l        [        UR                  UR                  S9U l        SU l        U R                  5         g s  snf )Nrr   F)r   r   r   
ModuleListrangenum_hidden_layersro   layersr   r   rt   normgradient_checkpointing	post_initrj   s      r"   r   HeliumModel.__init__r   s{     mmDI&JbJbDcdDcy2Dcd
 "&"4"4&:M:MN	&+# 	 es   B)r   r   r   )r8   r9   r:   r;   r   r   r<   r=   r>   s   @r"   r|   r|   q   s    	| 	 	r$   r|   c                       \ rS rSrSrg)HeliumForCausalLM~   rB   NrC   rB   r$   r"   r   r   ~   rD   r$   r   c                       \ rS rSrSrg)HeliumForSequenceClassification   rB   NrC   rB   r$   r"   r   r      rD   r$   r   c                       \ rS rSrSrg)HeliumForTokenClassification   rB   NrC   rB   r$   r"   r   r      rD   r$   r   )ry   r|   r   r   r   )r   )%rf   r   torch.nnr   utilsr   gemma.modeling_gemmar   r   r   granite.modeling_graniter	   llama.modeling_llamar
   r   r   r   r   configuration_heliumr   
get_loggerr8   loggerModuler   r@   rF   rQ   r\   r^   ro   ry   r|   r   r   r   __all__rB   r$   r"   <module>r      s        p p 7 v v . 
		H	%JBII J"	0 		 	6>4& 4c* c	0 	
' 
	( 		&D 		#> 	r$   