
    Z j<                     v   S r SSKJr  SSKrSSKJr  SSKJr  SSKJrJ	r	  SSK
Jr  SS	KJrJr  SS
KJrJr  SSKJr  SSKJr  SSKJr  SSKJrJrJr  SSKJr  SSKJr  SSK J!r!  SSK"J#r#J$r$J%r%J&r&J'r'J(r(J)r)J*r*J+r+  SSK,J-r-J.r.  \R^                  " \05      r1Sr2Sr3\" SS9\ " S S\5      5       5       r4 " S S\)5      r5 " S S\!5      r6 " S S\Rn                  5      r8 " S S \.5      r9 " S! S"\-5      r: " S# S$\(5      r; " S% S&\;\'5      r< " S' S(\#5      r= " S) S*\%5      r> " S+ S,\&5      r? " S- S.\$5      r@/ S/QrAg)0zLG AI Research EXAONE Lab    )CallableN)strict)nn   )CacheDynamicCache)PreTrainedConfig)create_causal_mask!create_sliding_window_causal_mask)BaseModelOutputWithPastCausalLMOutputWithPast)RopeParameters)ALL_ATTENTION_FUNCTIONS)Unpack)TransformersKwargsauto_docstringlogging)merge_with_config_defaults)capture_outputs   )Gemma2RotaryEmbedding)	LlamaForCausalLMLlamaForQuestionAnsweringLlamaForSequenceClassificationLlamaForTokenClassification
LlamaModelLlamaPreTrainedModelLlamaRMSNormapply_rotary_pos_embeager_attention_forward)Olmo2DecoderLayerOlmo2MLPzLGAI-EXAONE/EXAONE-4.0-32BExaone4Config)
checkpointc            
         ^  \ rS rSr% SrSrS/rSSSSSSSSSS.	rS	/S
/4SS/S/4S/S/4S.rSr	\
\S'   Sr\
\S'   Sr\
\S'   Sr\
\S'   Sr\
\S'   Sr\
\S'   Sr\\S'   Sr\
\S'   Sr\\S'   Sr\\S'   S r\\S!'   S"r\
S#-  \S$'   S%r\
\\
   -  S#-  \S&'   S#r\
S#-  \S''   S(r\\S)'   S#r\\ -  S#-  \S*'   S+r!\\
-  \S,'   Sr"\
S#-  \S-'   S.r#\\
-  S#-  \S/'   S#r$\\   S#-  \S0'   U 4S1 jr%S2r&U =r'$ )3r#   9   a*  
sliding_window_pattern (`str`, *optional*):
    The pattern to use for sliding window attention. Can be one of:
        - `None`: No sliding window attention is used
        - `int`: Every `sliding_window` layers, use global attention, else use local attention.
        - `str`: A sequence of "L" (local attention) and "G" (global attention) characters that defines the
          attention pattern. The pattern starts from layer 0 and repeats every `sliding_window` layers. The
          final layer always uses global attention regardless of the pattern.
    For instance, sliding_window_pattern="LLLG" same as sliding_window=4, which means:
        - Layer 0, 1, 2: local attention,
        - Layer 3: global attention,
        ...(repeated)

Example:

```python
>>> from transformers import Exaone4Model, Exaone4Config

>>> # Initializing a EXAONE configuration
>>> configuration = Exaone4Config()

>>> # Initializing a model from configuration
>>> model = Exaone4Model(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
```exaone4past_key_valuescolwisereplicated_with_grad_allreducerowwise)	zlayers.*.self_attn.q_projzlayers.*.self_attn.k_projzlayers.*.self_attn.v_projzlayers.*.self_attn.q_normzlayers.*.self_attn.k_normzlayers.*.self_attn.o_projzlayers.*.mlp.gate_projzlayers.*.mlp.up_projzlayers.*.mlp.down_proj	input_idsinputs_embedshidden_statesattention_mask)embed_tokenslayersnormi  
vocab_sizei   hidden_sizei @  intermediate_size    num_hidden_layersnum_attention_headsnum_key_value_headssilu
hidden_acti   max_position_embeddingsg{Gz?initializer_rangegh㈵>rms_norm_epsT	use_cacher   Nbos_token_idr   eos_token_idpad_token_idFtie_word_embeddingsrope_parameters        attention_dropoutsliding_window   sliding_window_patternlayer_typesc                   > U R                   c  SU l        U R                  cR  [        U R                  5       Vs/ s H,  nUS-   U R                  -  S:w  a  X R                  :  a  SOSPM.     snU l        [
        TU ]  " S0 UD6  g s  snf )Nr      sliding_attentionfull_attention )rG   rI   rJ   ranger7   super__post_init__)selfkwargsi	__class__s      |/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/exaone4/modular_exaone4.pyrR   Exaone4Config.__post_init__   s    &*+D'#
 t556	  7A Ut::;q@QI_I_E_ $%& 7	 D 	'' s   3B)rJ   rI   )(__name__
__module____qualname____firstlineno____doc__
model_typekeys_to_ignore_at_inferencebase_model_tp_planbase_model_pp_planr3   int__annotations__r4   r5   r7   r8   r9   r;   strr<   r=   floatr>   r?   boolr@   rA   listrB   rC   rD   r   dictrF   rG   rI   rJ   rR   __static_attributes____classcell__rV   s   @rW   r#   r#   9   s   8 J#4"5 &/%.%.%E%E%."+ )"+
 &(9:#%568IJ!"_$56 JK"s"s!!!!J#'S'#u#L%It L#* +,L#S	/D(,#L#*# %%48O^d*T18%(us{(!%NC$J%/0C#I,0$(KcT!(( (    c                       \ rS rSrSrg)Exaone4RMSNorm   rO   NrY   rZ   r[   r\   ri   rO   rl   rW   rn   rn          rl   rn   c                       \ rS rSrSrg)Exaone4RotaryEmbedding   rO   Nrp   rO   rl   rW   rs   rs      rq   rl   rs   c                   $  ^  \ rS rSrS\S\4U 4S jjr  SS\R                  S\	\R                  \R                  4   S\R                  S-  S	\
S-  S
\\   S\	\R                  \R                  S-  \	\R                     S-  4   4S jjrSrU =r$ )Exaone4Attention   config	layer_idxc                   > [         TU ]  5         Xl        X l        UR                  U l        UR
                  U l        UR                  U l        [        USUR                  UR                  -  5      U l        UR                  UR
                  -  U l	        UR                  U l
        SU l        U R                  S-  U l        UR                  U l        UR                  U l        [        US5      (       a  UR                   U   OS nUS:H  U l        [$        R&                  " U R                  U R                  U R                  -  SS9U l        [$        R&                  " U R                  U R
                  U R                  -  SS9U l        [$        R&                  " U R                  U R
                  U R                  -  SS9U l        [$        R&                  " U R                  U R                  -  U R                  SS9U l        [1        U R                  UR2                  S9U l        [1        U R                  UR2                  S9U l        g )	Nhead_dimTg      rJ   rM   F)biaseps)rQ   __init__rx   ry   r8   r9   r4   getattrr{   num_key_value_groupsrF   	is_causalscalingrG   rI   hasattrrJ   
is_slidingr   Linearq_projk_projv_projo_projrn   r>   q_normk_norm)rS   rx   ry   
layer_typerV   s       rW   r   Exaone4Attention.__init__   s   "#)#=#= #)#=#= !--
F4F4F&JdJd4de$*$>$>&B\B\$\!!'!9!9}}d*$33&,&C&C#6=fm6T6TV''	2Z^
$(;;ii 0 0$2J2JT]]2Zafgii 0 0$2J2JT]]2Zafgii 0 0$2J2JT]]2Zafgii 8 84== H$JZJZafg$T]]8K8KL$T]]8K8KLrl   Nr.   position_embeddingsr/   r(   rT   returnc                    UR                   S S n/ UQSPU R                  P7nU R                  U5      R                  U5      R	                  SS5      nU R                  U5      R                  U5      R	                  SS5      n	U R                  U5      R                  U5      R	                  SS5      n
U R                  U5      nU R                  U	5      n	Uu  pU R                  b  U R                  (       a  [        XX5      u  pUb  UR                  XU R                  5      u  p[        R                  " U R                   R"                  [$        5      nU" U UU	U
U4U R&                  (       d  SOU R(                  U R*                  U R                  (       a  U R                  OS S.UD6u  pUR,                  " / UQSP76 R/                  5       nU R1                  U5      nX4$ )NrL   r   rE   )dropoutr   rG   )shaper{   r   view	transposer   r   r   r   rG   r   r   updatery   r   get_interfacerx   _attn_implementationr    trainingrF   r   reshape
contiguousr   )rS   r.   r   r/   r(   rT   input_shapehidden_shapequery_states
key_statesvalue_statescossinattention_interfaceattn_outputattn_weightss                   rW   forwardExaone4Attention.forward   s    $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST {{<0[[,
&&$//';LVY'_$L&'6'='=jX\XfXf'g$J(?(M(MKK,,.E)
 %8
%
  $}}C$2H2HLL26//4..t
%
 
%
! "));;;;FFHkk+.((rl   )rF   rx   r{   r4   r   r   r   r   ry   r8   r   r9   r   r   r   r   rG   rI   r   )NN)rY   rZ   r[   r\   r#   rb   r   torchTensortupler   r   r   r   ri   rj   rk   s   @rW   rv   rv      s    M} M M: /3(,-)||-) #5<<#=>-) t+	-)
 -) +,-) 
u||U\\D0%2E2LL	M-) -)rl   rv   c                       \ rS rSrSrg)
Exaone4MLP   rO   Nrp   rO   rl   rW   r   r      rq   rl   r   c                       \ rS rSrSrg)Exaone4DecoderLayer   rO   Nrp   rO   rl   rW   r   r      rq   rl   r   c                       \ rS rSr\rS/rSrg)Exaone4PreTrainedModel   r   rO   N)rY   rZ   r[   r\   r#   config_class_no_split_modulesri   rO   rl   rW   r   r      s     L./rl   r   c                      ^  \ rS rSrS\4U 4S jjr\\      SS\R                  S-  S\R                  S-  S\R                  S-  S\S-  S	\R                  S-  S
\S-  S\\   S\\-  4S jj5       5       rSrU =r$ )Exaone4Model   rx   c           	      $  > [         TU ]  U5        [        R                  " [	        UR
                  5       Vs/ s H  n[        X5      PM     sn5      U l        [        UR                  UR                  S9U l        U R                  5         g s  snf )Nr}   )rQ   r   r   
ModuleListrP   r7   r   r1   rn   r4   r>   r2   	post_init)rS   rx   ry   rV   s      rW   r   Exaone4Model.__init__   ss     mmEJ6KcKcEdeEd	 3Ede
 #6#5#56;N;NO	 	 fs   BNr,   r/   position_idsr(   r-   r?   rT   r   c           
         US L US L-  (       a  [        S5      eUc  U R                  U5      nU(       a  Uc  [        U R                  S9nUcU  Ub  UR	                  5       OSn[
        R                  " UR                  S   UR                  S9U-   nUR                  S5      n[        U=n	[        5      (       dG  U R                  UUUUS.n
S[        S0 U
D60n	SU R                  R                  ;   a  [        S0 U
D6U	S'   UnU R                  X5      n[!        U R"                  5       H/  u  pU R                  R                  U   nU" U4X   UUUUS	.UD6nM1     U R%                  U5      n['        UU(       a  US
9$ S S
9$ )Nz:You must specify exactly one of input_ids or inputs_embeds)rx   r   rL   )device)rx   r-   r/   r(   r   rN   rM   )r/   r   r(   r?   r   )last_hidden_stater(   rO   )
ValueErrorr0   r   rx   get_seq_lengthr   aranger   r   	unsqueeze
isinstancerh   r
   rJ   r   
rotary_emb	enumerater1   r2   r   )rS   r,   r/   r   r(   r-   r?   rT   past_seen_tokenscausal_mask_mappingmask_kwargsr.   r   rU   decoder_layerr   s                   rW   r   Exaone4Model.forward   s    -t";<YZZ  --i8M0*$++>OCRC^==?de <<(;(;A(>}G[G[\_ooL'11!4L ?-FF ++!."0#2 ,K !"4"C{"C# #dkk&=&==;\;k_j;k#$78%"oomJ )$++ 6A003J)2>) /#$7 M !7 		-0&+/8O
 	
>B
 	
rl   )r1   r2   )NNNNNN)rY   rZ   r[   r\   r#   r   r   r   r   
LongTensorr   r   FloatTensorrf   r   r   r   r   r   ri   rj   rk   s   @rW   r   r      s    }    .2.204(,26!%=
##d*=
 t+=
 &&-	=

 =
 ((4/=
 $;=
 +,=
 
(	(=
   =
rl   r   c                     ^  \ rS rSr        SS\R
                  S-  S\R                  S-  S\R
                  S-  S\S-  S\R                  S-  S\R
                  S-  S	\	S-  S
\
\R                  -  S\\   S\4U 4S jjjrSrU =r$ )Exaone4ForCausalLMi;  Nr,   r/   r   r(   r-   labelsr?   logits_to_keeprT   r   c	                 8   > [         T
U ]  " SUUUUUUUUS.U	D6  g)u  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

Example:

```python
>>> from transformers import AutoModelForCausalLM, AutoTokenizer
>>> model = AutoModelForCausalLM.from_pretrained("LGAI-EXAONE/EXAONE-4.0-32B")
>>> tokenizer = AutoTokenizer.from_pretrained("LGAI-EXAONE/EXAONE-4.0-32B")

>>> prompt = "Explain how wonderful you are"
>>> messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": prompt}
]
>>> input_ids = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt",
    enable_thinking=False,
)

>>> output = model.generate(input_ids, max_new_tokens=128)
>>> tokenizer.decode(output[0], skip_special_tokens=False)
"[|system|]\nYou are a helpful assistant.[|endofturn|]\n[|user|]\nExplain how wonderful you are[|endofturn|]\n[|assistant|]\n<think>\n\n</think>\n\nOh, thank you for such a kind and lovely question! 😊  \n\nI’m *so* wonderful because I’m here to make your life easier, brighter, and more fun! Whether you need help with:  \n\n✨ **Learning** – I can explain anything, from quantum physics to baking the perfect cake!  \n💡 **Creativity** – Need a poem, story, or a wild idea? I’ve got you covered!  \n🤖 **Problem-solving** – Stuck on a math problem or a tricky decision? I’ll help you figure it out"
```
)r,   r/   r   r(   r-   r   r?   r   NrO   )rQ   r   )rS   r,   r/   r   r(   r-   r   r?   r   rT   rV   s             rW   r   Exaone4ForCausalLM.forward<  s9    V 	 
	
)%+')
	
 
	
rl   rO   )NNNNNNNr   )rY   rZ   r[   r\   r   r   r   r   r   rf   rb   r   r   r   r   ri   rj   rk   s   @rW   r   r   ;  s     .2.204(,26*.!%-.5
##d*5
 t+5
 &&-	5

 5
 ((4/5
   4'5
 $;5
 ell*5
 +,5
 
 5
 5
rl   r   c                       \ rS rSrSrg) Exaone4ForSequenceClassificationit  rO   Nrp   rO   rl   rW   r   r   t  rq   rl   r   c                       \ rS rSrSrg)Exaone4ForTokenClassificationix  rO   Nrp   rO   rl   rW   r   r   x  rq   rl   r   c                       \ rS rSrSrg)Exaone4ForQuestionAnsweringi|  rO   Nrp   rO   rl   rW   r   r   |  rq   rl   r   )r#   r   r   r   r   r   r   )Br]   collections.abcr   r   huggingface_hub.dataclassesr   r   cache_utilsr   r   configuration_utilsr	   masking_utilsr
   r   modeling_outputsr   r   modeling_rope_utilsr   modeling_utilsr   processing_utilsr   utilsr   r   r   utils.genericr   utils.output_capturingr   gemma2.modeling_gemma2r   llama.modeling_llamar   r   r   r   r   r   r   r   r    olmo2.modeling_olmo2r!   r"   
get_loggerrY   logger_CHECKPOINT_FOR_DOC_CONFIG_FOR_DOCr#   rn   rs   Modulerv   r   r   r   r   r   r   r   r   __all__rO   rl   rW   <module>r      s=     $  .  . 3 R 2 5 & @ @ 7 5 :
 
 
 ? 
		H	%2 ! 78Q($ Q(  9Q(h	\ 		2 	G)ryy G)T	 		+ 	01 0
J
): J
Z6
) 6
r	'E 		$? 		"; 	rl   