
    Z jz                        S SK Jr  S SKrS SKJr  S SKJs  Jr  SSKJ	r	  SSK
Jr  SSKJr  SSKJr  SSKJr  S	S
KJrJrJrJrJrJrJrJrJr  SSKJr  \R<                  " \5      r  " S S\RB                  5      r" " S S\5      r# " S S\5      r$SS jr% " S S\5      r& " S S\5      r' " S S\5      r( " S S\5      r) " S S\5      r*/ SQr+g)     )CallableN   )Cache)dynamic_rope_update)ALL_ATTENTION_FUNCTIONS)logging)maybe_autocast   )	LlamaAttentionLlamaDecoderLayerLlamaForCausalLMLlamaForSequenceClassificationLlamaMLP
LlamaModelLlamaRotaryEmbeddingeager_attention_forwardrotate_half   )
OlmoConfigc                   r   ^  \ rS rSrSrS\SS4U 4S jjrS\R                  S\R                  4S jr	S	r
U =r$ )
OlmoLayerNorm0   z/LayerNorm but with no learnable weight or bias.hidden_sizereturnNc                 2   > [         TU ]  5         U4U l        g N)super__init__normalized_shape)selfr   	__class__s     v/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/olmo/modular_olmo.pyr   OlmoLayerNorm.__init__3   s    !,    hidden_statesc                     UR                   n[        R                  " UR                  [        R
                  S9U R                  S S SS9R                  U5      $ )N)dtypegh㈵>)eps)r'   F
layer_normtotorchfloat32r   )r    r%   
orig_dtypes      r"   forwardOlmoLayerNorm.forward7   sO    "((
||M,,5==,A4CXCXZ^`djnorr
 	
r$   )r   )__name__
__module____qualname____firstlineno____doc__intr   r,   Tensorr/   __static_attributes____classcell__r!   s   @r"   r   r   0   s9    9/C /D /
U\\ 
ell 
 
r$   r   c                   (   ^  \ rS rSrU 4S jrSrU =r$ )OlmoMLP>   c                 >  > [         TU ]  U5        [        R                  " U R                  U R
                  SS9U l        [        R                  " U R                  U R
                  SS9U l        [        R                  " U R
                  U R                  SS9U l        g )NF)bias)	r   r   nnLinearr   intermediate_size	gate_projup_proj	down_proj)r    configr!   s     r"   r   OlmoMLP.__init__?   ss     4#3#3T5K5KRWXyy!1!143I3IPUV4#9#94;K;KRWXr$   )rE   rC   rD   )r1   r2   r3   r4   r   r8   r9   r:   s   @r"   r<   r<   >   s    Y Yr$   r<   c                   L    \ rS rSr\R
                  " 5       \S 5       5       rSrg)OlmoRotaryEmbeddingH   c                    U R                   S S S 2S 4   R                  5       R                  UR                  S   SS5      R	                  UR
                  5      nUS S 2S S S 24   R                  5       n[        UR
                  R                  [        5      (       a0  UR
                  R                  S:w  a  UR
                  R                  OSn[        USS9   UR                  5       UR                  5       -  R                  SS5      n[        R                  " Xf4SS	9nUR                  5       U R                  -  nUR                  5       U R                  -  n	S S S 5        X4$ ! , (       d  f       WW	4$ = f)
Nr   r   mpscpuF)device_typeenabledr
   )dim)inv_freqfloatexpandshaper+   device
isinstancetypestrr	   	transposer,   catcosattention_scalingsin)
r    xposition_idsinv_freq_expandedposition_ids_expandedrO   freqsembr\   r^   s
             r"   r/   OlmoRotaryEmbedding.forwardI   s7    !MM$4-8>>@GGHZHZ[\H]_acdehhijiqiqr ,QaZ 8 > > @'1!((--'E'E!((--[`J`ahhmmfkUC&,,.1F1L1L1NNYYZ[]^_E))UN3C'')d444C'')d444C	 D
 x DC
 Cxs   BE&&
E7 N)	r1   r2   r3   r4   r,   no_gradr   r/   r8   rf   r$   r"   rI   rI   H   s"    
]]_
  
r$   rI   c                    U R                   UR                   peUR                  U5      nUR                  U5      nX-  [        U 5      U-  -   nX-  [        U5      U-  -   nUR                  U5      UR                  U5      4$ )aI  Applies Rotary Position Embedding to the query and key tensors.

Args:
    q (`torch.Tensor`): The query tensor.
    k (`torch.Tensor`): The key tensor.
    cos (`torch.Tensor`): The cosine part of the rotary embedding.
    sin (`torch.Tensor`): The sine part of the rotary embedding.
    unsqueeze_dim (`int`, *optional*, defaults to 1):
        The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
        sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
        that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
        k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
        cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
        the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
Returns:
    `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
)r'   	unsqueezer   r+   )	qkr\   r^   unsqueeze_dimq_typek_typeq_embedk_embeds	            r"   apply_rotary_pos_embrq   X   sv    $ WWaggF
--
&C
--
&Cw;q>C/0Gw;q>C/0G::fwzz&111r$   c                       \ rS rSr S
S\R
                  S\\R
                  \R
                  4   S\R
                  S-  S\S-  S\\R
                  \R
                  S-  4   4
S jjrS	r	g)OlmoAttentionr   Nr%   position_embeddingsattention_maskpast_key_valuesr   c                    UR                   S S n/ UQSPU R                  P7nU R                  U5      nU R                  U5      n	U R	                  U5      n
U R
                  R                  b  UR                  U R
                  R                  * U R
                  R                  S9  U	R                  U R
                  R                  * U R
                  R                  S9  U
R                  U R
                  R                  * U R
                  R                  S9  UR                  U5      R                  SS5      nU	R                  U5      R                  SS5      n	U
R                  U5      R                  SS5      n
Uu  p[        XX5      u  pUb  UR                  XU R                  5      u  p[        R                  " U R
                  R                  [         5      nU" U UU	U
U4U R"                  (       d  SOU R$                  U R&                  S.UD6u  pUR(                  " / UQSP76 R+                  5       nU R-                  U5      nX4$ )NrL   )minmaxr   r
   g        )dropoutscaling)rU   head_dimq_projk_projv_projrF   clip_qkvclamp_viewrZ   rq   update	layer_idxr   get_interface_attn_implementationr   trainingattention_dropoutr|   reshape
contiguouso_proj)r    r%   ru   rv   rw   kwargsinput_shapehidden_shapequery_states
key_statesvalue_statesr\   r^   attention_interfaceattn_outputattn_weightss                   r"   r/   OlmoAttention.forwards   s    $))#2.88b8$--8{{=1[[/
{{=1;;+T[[%9%9$9t{{?S?ST4;;#7#7"7T[[=Q=QRT[[%9%9$9t{{?S?ST#((6@@AF__\2<<QB
#((6@@AF&#7RU#[ &'6'='=jX\XfXf'g$J(?(M(MKK,,.E)
 %8	%
  $}}C$2H2HLL	%
 	%
! "));;;;FFHkk+.((r$   rf   r   )
r1   r2   r3   r4   r,   r7   tupler   r/   r8   rf   r$   r"   rs   rs   r   s{     )-/)||/) #5<<#=>/) t+	/)
 /) 
u||U\\D00	1/) /)r$   rs   c                   4   ^  \ rS rSrS\S\4U 4S jjrSrU =r$ )OlmoDecoderLayer   rF   r   c                    > [         TU ]  X5        [        UR                  5      U l        [        UR                  5      U l        [        XS9U l        g )N)rF   r   )r   r   r   r   input_layernormpost_attention_layernormrs   	self_attnr    rF   r   r!   s      r"   r   OlmoDecoderLayer.__init__   sB    +,V-?-?@(5f6H6H(I%&fJr$   )r   r   r   )	r1   r2   r3   r4   r   r6   r   r8   r9   r:   s   @r"   r   r      s    Kz Kc K Kr$   r   c                   0   ^  \ rS rSrS\4U 4S jjrSrU =r$ )	OlmoModel   rF   c           	         > [         TU ]  U5        [        R                  " [	        UR
                  5       Vs/ s H  n[        X5      PM     sn5      U l        [        UR                  5      U l
        g s  snf r   )r   r   r@   
ModuleListrangenum_hidden_layersr   layersr   r   normr   s      r"   r   OlmoModel.__init__   s_     mmBGH`H`BabBaYf0Bab
 "&"4"45	 cs   A4)r   r   )r1   r2   r3   r4   r   r   r8   r9   r:   s   @r"   r   r      s    6z 6 6r$   r   c                       \ rS rSrSrg)OlmoForCausalLM   rf   Nr1   r2   r3   r4   r8   rf   r$   r"   r   r          r$   r   c                       \ rS rSrSrg)OlmoForSequenceClassification   rf   Nr   rf   r$   r"   r   r      r   r$   r   )r   r   r   OlmoPreTrainedModel)r   ),collections.abcr   r,   torch.nnr@   torch.nn.functional
functionalr)   cache_utilsr   modeling_rope_utilsr   modeling_utilsr   utilsr   utils.genericr	   llama.modeling_llamar   r   r   r   r   r   r   r   r   configuration_olmor   
get_loggerr1   loggerModuler   r<   rI   rq   rs   r   r   r   r   __all__rf   r$   r"   <module>r      s   ( %       6 5  +
 
 
 + 
		H	%
BII 
Yh Y.  240)N 0)fK( K6
 6	& 		$B 	r$   