
    Z j#                        S SK Jr  S SKrS SKJr  S SKJr  SSKJrJ	r	  SSK
JrJr  SSKJr  SSKJr  SS	KJr  SS
KJr  SSKJr  SSKJr  SSKJr  SSKJrJrJrJrJ r J!r!J"r"J#r#J$r$  \" SS9\ " S S\5      5       5       r% " S S\"5      r& " S S\5      r' " S S\5      r( " S S\5      r) " S S\!5      r* " S S\ 5      r+ " S  S!\5      r, " S" S#\5      r-/ S$Qr.g)%    )CallableN)strict   )CacheDynamicCache)create_causal_mask!create_sliding_window_causal_mask)BaseModelOutputWithPast)ALL_ATTENTION_FUNCTIONS)Unpack)auto_docstring)TransformersKwargs   )Gemma2RotaryEmbedding)Olmo2Config)	Olmo2AttentionOlmo2DecoderLayerOlmo2ForCausalLMOlmo2ForSequenceClassification
Olmo2ModelOlmo2PreTrainedModelOlmo2RMSNormapply_rotary_pos_embeager_attention_forwardzallenai/Olmo-3-7B-Instruct)
checkpointc                      ^  \ rS rSr% SrSrS/rSSSSSSSS	.rS
/S/4SS/S/4S/S/4S.rSr	\
S-  \S'   Sr\\   S-  \S'   U 4S jrSrU =r$ )Olmo3Config+   aU  
Example:

```python
>>> from transformers import Olmo3Model, Olmo3Config

>>> # Initializing a Olmo3 7B style configuration
>>> configuration = Olmo3Config()

>>> # Initializing a model from the Olmo3 7B style configuration
>>> model = Olmo3Model(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
```
olmo3past_key_valuescolwise_gather_outputrowwise_split_inputcolwiserowwise)zlayers.*.self_attn.q_projzlayers.*.self_attn.k_projzlayers.*.self_attn.v_projzlayers.*.self_attn.o_projzlayers.*.mlp.gate_projzlayers.*.mlp.up_projzlayers.*.mlp.down_proj	input_idsinputs_embedshidden_statesattention_mask)embed_tokenslayersnormi   Nsliding_windowlayer_typesc                    > U R                   c  U R                  U l         U R                  c8  [        U R                  5       Vs/ s H  o"S-   S-  S:w  a  SOSPM     snU l        [
        TU ]  " S0 UD6  g s  snf )N      r   sliding_attentionfull_attention )num_key_value_headsnum_attention_headsr-   rangenum_hidden_layerssuper__post_init__)selfkwargsi	__class__s      x/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/olmo3/modular_olmo3.pyr9   Olmo3Config.__post_init__S   s~    ##+'+'?'?D$#W\]a]s]sWt WtRSA{a'7#=MMWt D 	''	 s   A6)r-   r4   )__name__
__module____qualname____firstlineno____doc__
model_typekeys_to_ignore_at_inferencebase_model_tp_planbase_model_pp_planr,   int__annotations__r-   liststrr9   __static_attributes____classcell__r=   s   @r>   r   r   +   s    " J#4"5%<%<%<%:"+ )"+ &(9:#%568IJ!"_$56 "&NC$J%$(KcT!(	( 	(    r   c                       \ rS rSrSrg)Olmo3RMSNorm_   r3   Nr@   rA   rB   rC   rM   r3   rP   r>   rR   rR   _       rP   rR   c                      ^  \ rS rSrS\S\4U 4S jjr SS\R                  S\	\R                  \R                  4   S\R                  S-  S	\
S-  S
\\   S\	\R                  \R                  S-  4   4S jjrSrU =r$ )Olmo3Attentione   config	layer_idxc                    > [         TU ]  XS9  UR                  U   U l        U R                  S:X  a  UR                  U l        g S U l        g )N)rZ   r1   )r8   __init__r-   attention_typer,   r:   rY   rZ   r=   s      r>   r\   Olmo3Attention.__init__f   sJ    5$00;7;7J7JNa7af33gkrP   Nr'   position_embeddingsr(   r    r;   returnc                 f   UR                   S S n/ UQSPU R                  P7nU R                  U R                  U5      5      nU R	                  U R                  U5      5      n	U R                  U5      n
UR                  U5      R                  SS5      nU	R                  U5      R                  SS5      n	U
R                  U5      R                  SS5      n
Uu  p[        XX5      u  pUb  UR                  XU R                  5      u  p[        R                  " U R                  R                  [         5      nU" U UU	U
U4U R"                  (       d  SOU R$                  U R&                  U R(                  S.UD6u  pUR*                  " / UQSP76 R-                  5       nU R/                  U5      nX4$ )Nr/   r   g        )dropoutscalingr,   )shapehead_dimq_normq_projk_normk_projv_projview	transposer   updaterZ   r   get_interfacerY   _attn_implementationr   trainingattention_dropoutre   r,   reshape
contiguouso_proj)r:   r'   r`   r(   r    r;   input_shapehidden_shapequery_states
key_statesvalue_statescossinattention_interfaceattn_outputattn_weightss                   r>   forwardOlmo3Attention.forwardk   s    $))#2.88b8$--8{{4;;}#=>[[]!;<
{{=1#((6@@AF__\2<<QB
#((6@@AF&#7RU#[ &'6'='=jX\XfXf'g$J(?(M(MKK,,.E)
 %8
%
  $}}C$2H2HLL..
%
 
%
! "));;;;FFHkk+.((rP   )r]   r,   )N)r@   rA   rB   rC   r   rI   r\   torchTensortupler   r   r   r   rM   rN   rO   s   @r>   rW   rW   e   s    l{ ls l )-+)||+) #5<<#=>+) t+	+)
 +) +,+) 
u||U\\D00	1+) +)rP   rW   c                       \ rS rSrSrg)Olmo3DecoderLayer   r3   NrT   r3   rP   r>   r   r      rU   rP   r   c                       \ rS rSrSrg)Olmo3RotaryEmbedding   r3   NrT   r3   rP   r>   r   r      rU   rP   r   c                       \ rS rSrSrg)Olmo3PreTrainedModel   r3   NrT   r3   rP   r>   r   r      rU   rP   r   c                      ^  \ rS rSrS\4U 4S jjr      SS\R                  S-  S\R                  S-  S\R                  S-  S\	S-  S	\R                  S-  S
\S-  S\\   S\4S jjrSrU =r$ )
Olmo3Model   rY   c           	         > [         TU ]  U5        [        UR                  UR                  S9U l        [        R                  " [        UR                  5       Vs/ s H  n[        X5      PM     sn5      U l        [        US9U l        g s  snf )N)epsrY   )r8   r\   rR   hidden_sizerms_norm_epsr+   nn
ModuleListr6   r7   r   r*   r   
rotary_embr^   s      r>   r\   Olmo3Model.__init__   ss      !3!39L9LM	mmCHIaIaCbcCbiv1Cbc
 /f= ds   BNr%   r(   position_idsr    r&   	use_cacher;   ra   c           	         US L US L-  (       a  [        S5      eUc  U R                  U5      nU(       a  Uc  [        U R                  S9nUcU  Ub  UR	                  5       OSn[
        R                  " UR                  S   UR                  S9U-   nUR                  S5      n[        U=n	[        5      (       d)  U R                  UUUUS.n
[        S
0 U
D6[        S
0 U
D6S.n	UnU R                  X5      n[        U R                   S U R                  R"                   5       H,  u  pU" U4XR                  R$                  U      UUUS.UD6nM.     U R'                  U5      n[)        UUS	9$ )Nz:You must specify exactly one of input_ids or inputs_embedsr   r   r/   )device)rY   r&   r(   r    r   )r2   r1   )r(   r   r    r`   )last_hidden_stater    r3   )
ValueErrorr)   r   rY   get_seq_lengthr   arangerf   r   	unsqueeze
isinstancedictr   r	   r   	enumerater*   r7   r-   r+   r
   )r:   r%   r(   r   r    r&   r   r;   past_seen_tokenscausal_mask_mappingmask_kwargsr'   r`   r<   decoder_layers                  r>   r   Olmo3Model.forward   s    -t";<YZZ *.*;*;I*FM0*$++>OCRC^==?de <<(;(;A(>}G[G[\_ooL'11!4L ?-FF ++!."0#2 ,K #5"C{"C%F%U%U#
 &"oomJ )$++6U8U8U*V WA)2;;3J3J13MN) /$7 M !X 		-0&++
 	
rP   )r*   r+   r   )NNNNNN)r@   rA   rB   rC   r   r\   r   
LongTensorr   r   FloatTensorboolr   r   r
   r   rM   rN   rO   s   @r>   r   r      s    >{ > .2.204(,26!%9
##d*9
 t+9
 &&-	9

 9
 ((4/9
 $;9
 +,9
 
!9
 9
rP   r   c                       \ rS rSrSrg)Olmo3ForCausalLM   r3   NrT   r3   rP   r>   r   r      rU   rP   r   c                       \ rS rSrSrg)Olmo3ForSequenceClassification   r3   NrT   r3   rP   r>   r   r      rU   rP   r   )r   r   r   r   r   )/collections.abcr   r   torch.nnr   huggingface_hub.dataclassesr   cache_utilsr   r   masking_utilsr   r	   modeling_outputsr
   modeling_utilsr   processing_utilsr   utilsr   utils.genericr   gemma2.modeling_gemma2r   olmo2.configuration_olmo2r   olmo2.modeling_olmo2r   r   r   r   r   r   r   r   r   r   rR   rW   r   r   r   r   r   r   __all__r3   rP   r>   <module>r      s    %   . . R 7 5 & # / : 3
 
 
 78/(+ /(  9/(d	< 	1)^ 1)h	) 		0 		/ 	B
 B
J	' 		%C 	rP   