
    Z j!                        S SK r S SKJr  S SKrS SKJr  SSKJr  SSK	J
r
Jr  SSKJr  SSKJrJr  SSKJrJr  SS	KJr  SS
KJrJr  SSKJr  SSKJr  SSKJrJrJ r J!r!J"r"J#r#  SSK$J%r%  SSK&J'r'  SSK(J)r)   " S S\%5      r* " S S\!5      r+S r, " S S\'5      r- " S S\5      r. " S S\5      r/\ " S S\ 5      5       r0\ " S  S!\5      5       r1\ " S" S#\5      5       r2/ S$Qr3g)%    N)Callable   )initialization)CacheDynamicCache)create_causal_mask)BaseModelOutputWithPastCausalLMOutputWithPast)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstring   )CLIPMLP)Gemma2ForCausalLM)LlamaDecoderLayer
LlamaModelLlamaPreTrainedModelLlamaRotaryEmbeddingapply_rotary_pos_embeager_attention_forward)Llama4TextL2Norm)Qwen3Attention   )NanoChatConfigc                       \ rS rSrSrg)NanoChatRMSNorm+    N__name__
__module____qualname____firstlineno____static_attributes__r        ~/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/nanochat/modular_nanochat.pyr   r   +       r'   r   c                       \ rS rSrSrg)NanoChatRotaryEmbedding/   r    Nr!   r    r'   r(   r+   r+   /   r)   r'   r+   c                     U SSU R                   S   S-  24   nU SU R                   S   S-  S24   n[        R                  " X!* 4SS9$ )zJRotates half the hidden dims of the input with flipped signs for NanoChat..Nr   )dim)shapetorchcat)xx1x2s      r(   rotate_halfr6   3   sX    	
3"!''"+"""	#B	
3q ""	#B99b#YB''r'   c                   
  ^  \ rS rSrS\S\4U 4S jjr   SS\R                  S\	\R                  \R                  4   S-  S\R                  S-  S	\
S-  S
\\   S\	\R                  \R                  S-  4   4S jjrSrU =r$ )NanoChatAttention:   config	layer_idxc                    > [         TU ]  X5        U ?U ?[	        UR
                  S9U l        [	        UR
                  S9U l        g N)eps)super__init__sliding_window
layer_typer   rms_norm_epsq_normk_normselfr:   r;   	__class__s      r(   r@   NanoChatAttention.__init__;   s?    +O%&*=*=>%&*=*=>r'   Nhidden_statesposition_embeddingsattention_maskpast_key_valueskwargsreturnc                 L   UR                   S S n/ UQSPU R                  P7nU R                  U5      R                  U5      R	                  SS5      nU R                  U5      R                  U5      R	                  SS5      n	U R                  U5      R                  U5      R	                  SS5      n
Uu  p[        XX5      u  pU R                  U5      nU R                  U	5      n	Ub  UR                  XU R                  5      u  p[        R                  " U R                  R                  [         5      nU" U UU	U
U4U R"                  (       d  SOU R$                  U R&                  S.UD6u  pUR(                  " / UQSP76 R+                  5       nU R-                  U5      nX4$ )Nr.   r   r           )dropoutscaling)r0   head_dimq_projview	transposek_projv_projr   rD   rE   updater;   r   get_interfacer:   _attn_implementationr   trainingattention_dropoutrS   reshape
contiguouso_proj)rG   rJ   rK   rL   rM   rN   input_shapehidden_shapequery_states
key_statesvalue_statescossinattention_interfaceattn_outputattn_weightss                   r(   forwardNanoChatAttention.forwardC   s    $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST&#7RU#[  {{<0[[,
&'6'='=jX\XfXf'g$J(?(M(MKK,,.E)
 %8	%
  $}}C$2H2HLL	%
 	%
! "));;;;FFHkk+.((r'   )rE   rD   )NNN)r"   r#   r$   r%   r   intr@   r1   Tensortupler   r   r   rl   r&   __classcell__rH   s   @r(   r8   r8   :   s    ?~ ?# ? IM.2(,*)||*) #5<<#=>E*) t+	*)
 *) +,*) 
u||U\\D00	1*) *)r'   r8   c                   (   ^  \ rS rSrU 4S jrSrU =r$ )NanoChatMLPp   c                    > [         TU ]  U5        [        R                  " UR                  UR
                  SS9U l        [        R                  " UR
                  UR                  SS9U l        g )NF)bias)r?   r@   nnLinearhidden_sizeintermediate_sizefc1fc2rG   r:   rH   s     r(   r@   NanoChatMLP.__init__q   sR     99V//1I1IPUV99V55v7I7IPUVr'   )r|   r}   )r"   r#   r$   r%   r@   r&   rq   rr   s   @r(   rt   rt   p   s    W Wr'   rt   c                   4   ^  \ rS rSrS\S\4U 4S jjrSrU =r$ )NanoChatDecoderLayerw   r:   r;   c                    > [         TU ]  5         [        UR                  S9U l        [        UR                  S9U l        g r=   )r?   r@   r   rC   input_layernormpost_attention_layernormrF   s      r(   r@   NanoChatDecoderLayer.__init__x   s4    .63F3FG(7F<O<O(P%r'   )r   r   )	r"   r#   r$   r%   r   rn   r@   r&   rq   rr   s   @r(   r   r   w   s    Q~ Q# Q Qr'   r   c                   :    \ rS rSrS\R
                  SS4S jrSrg)NanoChatPreTrainedModel   modulerO   Nc           	      4   [         R                  " X5        [        U[        5      (       am  [        R
                  " UR                  R                  SU R                  R                  [        R                  " SU R                  R                  -  5      -  S9  g g )NrQ   r   )meanstd)r   _init_weights
isinstancer8   initnormal_ra   weightr:   initializer_rangemathsqrtnum_hidden_layers)rG   r   s     r(   r   %NanoChatPreTrainedModel._init_weights   si    %%d3f/00LL$$KK11DIIa$++B_B_>_4`` 1r'   r    )r"   r#   r$   r%   rx   Moduler   r&   r    r'   r(   r   r      s    BII $ r'   r   c                      ^  \ rS rSrS\4U 4S jjr      SS\R                  S-  S\R                  S-  S\R                  S-  S\	S-  S	\R                  S-  S
\S-  S\\   S\4S jjrSrU =r$ )NanoChatModel   r:   c                 T   > [         TU ]  U5        [        UR                  S9U l        g r=   )r?   r@   r   rC   normr~   s     r(   r@   NanoChatModel.__init__   s"     #(;(;<	r'   N	input_idsrL   position_idsrM   inputs_embeds	use_cacherN   rO   c           	      ^   US L US L-  (       a  [        S5      eUc  U R                  U5      nU(       a  Uc  [        U R                  S9nUcU  Ub  UR	                  5       OSn[
        R                  " UR                  S   UR                  S9U-   nUR                  S5      n[        U R                  UUUUS9n	Un
U R                  XS9nU R                  U
5      n
U R                  S U R                  R                    H  nU" U
4U	UUUS.UD6n
M     U R                  U
5      n
[        U
US	9$ )
Nz:You must specify exactly one of input_ids or inputs_embeds)r:   r   r   )device)r:   r   rL   rM   r   )r   )rL   rK   r   rM   )last_hidden_staterM   )
ValueErrorembed_tokensr   r:   get_seq_lengthr1   aranger0   r   	unsqueezer   
rotary_embr   layersr   r	   )rG   r   rL   r   rM   r   r   rN   past_seen_tokenscausal_maskrJ   rK   decoder_layers                r(   rl   NanoChatModel.forward   sQ    -t";<YZZ *.*;*;I*FM0*$++>OCRC^==?de <<(;(;A(>}G[G[\_ooL'11!4L(;;')+%
 &"oomoW		-0![[)H4;;+H+HIM)*$7) / M J 		-0&++
 	
r'   )r   )NNNNNN)r"   r#   r$   r%   r   r@   r1   
LongTensorro   r   FloatTensorboolr   r   r	   rl   r&   rq   rr   s   @r(   r   r      s    =~ = .2.204(,26!%2
##d*2
 t+2
 &&-	2

 2
 ((4/2
 $;2
 +,2
 
!2
 2
r'   r   c                   8   ^  \ rS rSrSS0rS\4U 4S jjrSrU =r$ )NanoChatForCausalLM   lm_headcolwise_gather_outputrO   c                 &   > [         TU ]  " S0 UD6  g)a  
Example:

```python
>>> from transformers import AutoTokenizer, AutoModelForCausalLM

>>> model = AutoModelForCausalLM.from_pretrained("karpathy/nanochat-d32")

>>> tokenizer = AutoTokenizer.from_pretrained("karpathy/nanochat-d32")

>>> conversation = [
        {"role": "user", "content": "What is the capital of France?"},
    ]

>>> inputs = tokenizer.apply_chat_template(
        conversation, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt"
    ).to(device)

>>> with torch.no_grad():
>>>     outputs = model.generate(**inputs, max_new_tokens=64, do_sample=False)

>>> generated_tokens = outputs[0, inputs["input_ids"].shape[1] :]
>>> output = tokenizer.decode(generated_tokens, skip_special_tokens=True)
```Nr    )r?   rl   )rG   super_kwargsrH   s     r(   rl   NanoChatForCausalLM.forward   s    2 	','r'   r    )	r"   r#   r$   r%   _tp_planr
   rl   r&   rq   rr   s   @r(   r   r      s     23H()? ( (r'   r   )r   r   r   )4r   collections.abcr   r1   torch.nnrx    r   r   cache_utilsr   r   masking_utilsr   modeling_outputsr	   r
   modeling_utilsr   r   processing_utilsr   utilsr   r   clip.modeling_clipr   gemma2.modeling_gemma2r   llama.modeling_llamar   r   r   r   r   r   llama4.modeling_llama4r   qwen3.modeling_qwen3r   configuration_nanochatr   r   r+   r6   r8   rt   r   r   r   r   __all__r    r'   r(   <module>r      s     $   & . / O F & 7 ( 6  6 1 2	& 		2 	(3) 3)lW' WQ, Q 2   8
J 8
 8
v (+ ( (>r'   