
    Z jZ"                        S SK Jr  S SKrS SKJr  S SKJr  S SKJr  SSK	J
r
  SSKJr  SSKJr  SS	KJrJr  S
SKJrJrJr  S
SKJr  S
SKJrJrJrJrJrJrJr  \R@                  " \!5      r"\" SS9\ " S S\5      5       5       r# " S S\5      r$ " S S\5      r%S r& " S S\5      r' " S S\5      r( " S S\5      r) " S S\5      r* " S S \5      r+ " S! S"\5      r,/ S#Qr-g)$    )CallableN)strict)TransformersKwargs   )Cache)ALL_ATTENTION_FUNCTIONS)Unpack)auto_docstringlogging   )LlamaPreTrainedModelLlamaRMSNormeager_attention_forward)
OlmoConfig)OlmoAttentionOlmoDecoderLayerOlmoForCausalLMOlmoForSequenceClassification	OlmoModelOlmoRotaryEmbeddingapply_rotary_pos_embzallenai/Olmo2-7B-1124-hf)
checkpointc                   t    \ rS rSr% SrSrSSSSSSSS.rS	/S
/4SS/S/4S/S/4S.rSr\	\
S'   \" 5       rSrg)Olmo2Config0   aU  
Example:

```python
>>> from transformers import Olmo2Model, Olmo2Config

>>> # Initializing a Olmo2 7B style configuration
>>> configuration = Olmo2Config()

>>> # Initializing a model from the Olmo2 7B style configuration
>>> model = Olmo2Model(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
```
olmo2colwise_gather_outputrowwise_split_inputcolwiserowwise)zlayers.*.self_attn.q_projzlayers.*.self_attn.k_projzlayers.*.self_attn.v_projzlayers.*.self_attn.o_projzlayers.*.mlp.gate_projzlayers.*.mlp.up_projzlayers.*.mlp.down_proj	input_idsinputs_embedshidden_statesattention_mask)embed_tokenslayersnormgh㈵>rms_norm_eps N)__name__
__module____qualname____firstlineno____doc__
model_typebase_model_tp_planbase_model_pp_planr(   float__annotations__AttributeErrorclip_qkv__static_attributes__r)       x/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/olmo2/modular_olmo2.pyr   r   0   su    " J%<%<%<%:"+ )"+ &(9:#%568IJ!"_$56 L%Hr7   r   c                       \ rS rSrS rSrg)Olmo2RMSNormZ   c                    UR                   nUR                  [        R                  5      nUR	                  S5      R                  SSS9nU[        R                  " X0R                  -   5      -  nU R                  U-  R                  U5      $ )Nr   T)keepdim)	dtypetotorchfloat32powmeanrsqrtvariance_epsilonweight)selfr#   input_dtypevariances       r8   forwardOlmo2RMSNorm.forward[   sw    #))%((7 $$Q',,R,>%H?T?T4T(UUm+//<<r7   r)   N)r*   r+   r,   r-   rK   r6   r)   r7   r8   r:   r:   Z   s    =r7   r:   c                       \ rS rSrSrg)Olmo2RotaryEmbeddingc   r)   Nr*   r+   r,   r-   r6   r)   r7   r8   rN   rN   c       r7   rN   c                     U SSU R                   S   S-  24   nU SU R                   S   S-  S24   n[        R                  " U* U4SS9$ )z*Rotates half the hidden dims of the input..Nr=   r   )dim)shaperA   cat)xx1x2s      r8   rotate_halfrY   g   sZ    	
3"!''"+"""	#B	
3q ""	#B99rc2YB''r7   c                   
  ^  \ rS rSrSS\S\S-  4U 4S jjjr SS\R                  S\	\R                  \R                  4   S\R                  S-  S	\
S-  S
\\   S\	\R                  \R                  S-  4   4S jjrSrU =r$ )Olmo2Attentionq   Nconfig	layer_idxc                    > [         TU ]  XS9  [        UR                  U R                  -  UR
                  5      U l        [        UR                  U R                  -  UR
                  5      U l        g )Nr^   )	super__init__r:   num_attention_headshead_dimr(   q_normnum_key_value_headsk_normrH   r]   r^   	__class__s      r8   rb   Olmo2Attention.__init__r   sY    5"6#=#=#MvObObc"6#=#=#MvObObcr7   r#   position_embeddingsr$   past_key_valueskwargsreturnc                 P   UR                   S S n/ UQSPU R                  P7nU R                  U R                  U5      5      nU R	                  U R                  U5      5      n	U R                  U5      n
UR                  U5      R                  SS5      nU	R                  U5      R                  SS5      n	U
R                  U5      R                  SS5      n
Uu  p[        XX5      u  pUb  UR                  XU R                  5      u  p[        R                  " U R                  R                  [         5      nU" U UU	U
U4U R"                  (       d  SOU R$                  U R&                  S.UD6u  pUR(                  " / UQSP76 R+                  5       nU R-                  U5      nX4$ )Nr=      r   g        )dropoutscaling)rT   rd   re   q_projrg   k_projv_projview	transposer   updater^   r   get_interfacer]   _attn_implementationr   trainingattention_dropoutrr   reshape
contiguouso_proj)rH   r#   rk   r$   rl   rm   input_shapehidden_shapequery_states
key_statesvalue_statescossinattention_interfaceattn_outputattn_weightss                   r8   rK   Olmo2Attention.forwardw   s    $))#2.88b8$--8{{4;;}#=>[[]!;<
{{=1#((6@@AF__\2<<QB
#((6@@AF&#7RU#[ &'6'='=jX\XfXf'g$J(?(M(MKK,,.E)
 %8	%
  $}}C$2H2HLL	%
 	%
! "));;;;FFHkk+.((r7   )rg   re   )N)r*   r+   r,   r-   r   intrb   rA   Tensortupler   r	   r   rK   r6   __classcell__ri   s   @r8   r[   r[   q   s    d{ dsTz d d )-*)||*) #5<<#=>*) t+	*)
 *) +,*) 
u||U\\D00	1*) *)r7   r[   c                     ^  \ rS rSrS\S\4U 4S jjr     SS\R                  S\R                  S-  S\R                  S-  S	\
S-  S
\S-  S\\R                  \R                  4   S-  S\\   S\R                  4S jjrSrU =r$ )Olmo2DecoderLayer   r]   r^   c                    > [         TU ]  XS9  [        UR                  UR                  S9U l        [        UR                  UR                  S9U l        [        XS9U l        U ?	g )Nr`   eps)r]   r^   )
ra   rb   r:   hidden_sizer(   post_attention_layernormpost_feedforward_layernormr[   	self_attninput_layernormrh   s      r8   rb   Olmo2DecoderLayer.__init__   s[    5(4V5G5GVM`M`(a%*6v7I7IvObOb*c''vK r7   Nr#   r$   position_idsrl   	use_cacherk   rm   rn   c           
          UnU R                   " SUUUUUUS.UD6u  pU R                  U5      nX-   nUnU R                  U5      nU R                  U5      nX-   nU$ )N)r#   r$   r   rl   r   rk   r)   )r   r   mlpr   )
rH   r#   r$   r   rl   r   rk   rm   residual_s
             r8   rK   Olmo2DecoderLayer.forward   s     !>> 
')%+ 3
 
 55mD 0 !/77F 0r7   )r   r   r   )NNNFN)r*   r+   r,   r-   r   r   rb   rA   r   
LongTensorr   boolr   r	   r   rK   r6   r   r   s   @r8   r   r      s    !{ !s ! /304(,!&HL|| t+ &&-	
  $; #5<<#=>E +, 
 r7   r   c                       \ rS rSrSrg)Olmo2PreTrainedModel   r)   NrP   r)   r7   r8   r   r      rQ   r7   r   c                   0   ^  \ rS rSrS\4U 4S jjrSrU =r$ )
Olmo2Model   r]   c           	        > [         TU ]  U5        [        UR                  UR                  S9U l        [        R                  " [        UR                  5       Vs/ s H  n[        X5      PM     sn5      U l        g s  snf )Nr   )ra   rb   r:   r   r(   r'   nn
ModuleListrangenum_hidden_layersr   r&   rh   s      r8   rb   Olmo2Model.__init__   sb      !3!39L9LM	mmCHIaIaCbcCbiv1Cbc
cs   A=)r&   r'   )r*   r+   r,   r-   r   rb   r6   r   r   s   @r8   r   r      s    
{ 
 
r7   r   c                       \ rS rSrSrg)Olmo2ForCausalLM   r)   NrP   r)   r7   r8   r   r      rQ   r7   r   c                       \ rS rSrSrg)Olmo2ForSequenceClassification   r)   NrP   r)   r7   r8   r   r      rQ   r7   r   )r   r   r   r   r   ).collections.abcr   rA   torch.nnr   huggingface_hub.dataclassesr   transformers.utils.genericr   cache_utilsr   modeling_utilsr   processing_utilsr	   utilsr
   r   llama.modeling_llamar   r   r   olmo.configuration_olmor   olmo.modeling_olmor   r   r   r   r   r   r   
get_loggerr*   loggerr   r:   rN   rY   r[   r   r   r   r   r   __all__r)   r7   r8   <module>r      s   ( %   . 9   5 & , ^ ^ 0   
		H	% 56# * #   7# P=< =	. 	(0)] 0)l$( $N	/ 	
 
	 		%B 	r7   