
    Z j                        S SK Jr  S SKrS SKJr  SSKJr  SSKJr  SSK	J
r
  SSKJr  SS	KJr  SS
KJr  SSKJrJr  SSKJrJrJrJrJrJrJrJrJr  SSKJrJ r   \RB                  " \"5      r#\" SS9\ " S S\5      5       5       r$ " S S\ 5      r% " S S\5      r& " S S\5      r' " S S\5      r( " S S\5      r) " S S\5      r* " S S \5      r+ " S! S"\5      r, " S# S$\5      r-/ S%Qr.g)&    )CallableN)strict   )Cache)PreTrainedConfig)FlashAttentionKwargs)RopeParameters)ALL_ATTENTION_FUNCTIONS)Unpack)auto_docstringlogging   )	LlamaAttentionLlamaDecoderLayerLlamaForCausalLMLlamaForQuestionAnsweringLlamaForSequenceClassificationLlamaForTokenClassificationLlamaPreTrainedModelapply_rotary_pos_embeager_attention_forward)
Qwen2ModelQwen2RotaryEmbeddingzHuggingFaceTB/SmolLM3-3B)
checkpointc                     ^  \ rS rSr% SrSrS/rSrSSSSSSSS.rS	/S
/4SS/S/4S/S/4S.r	Sr
\\S'   Sr\\S'   Sr\\S'   Sr\\S'   Sr\\S'   Sr\S-  \S'   Sr\\S'   Sr\\S'   Sr\\S '   S!r\\S"'   S#r\\S$'   S%r\S-  \S&'   S'r\S-  \S('   S)r\\\   -  S-  \S*'   Sr\\ -  S-  \S+'   S,r!\\S-'   Sr"\S-  \S.'   Sr#\\   S-  \S/'   Sr$\\S0'   Sr%\\   S-  \S1'   S,r&\\S2'   S3r'\\-  \S4'   S,r(\\S5'   S#r)\\S6'   U 4S7 jr*S8r+U =r,$ )9SmolLM3Config,   a  
no_rope_layers (`List[int]`, *optional*):
    List with at least the same length as the number of layers in the model.
    A `1` at an index position indicates that the corresponding layer will use RoPE,
    while a `0` indicates that it's a NoPE layer.
no_rope_layer_interval (`int`, *optional*, defaults to 4):
    If `no_rope_layers` is `None`, it will be created using a NoPE layer every
    `no_rope_layer_interval` layers.

```python
>>> from transformers import SmolLM3Model, SmolLM3Config

>>> # Initializing a SmolLM3 style configuration
>>> configuration = SmolLM3Config()

>>> # Initializing a model from the SmolLM3 style configuration
>>> model = SmolLM3Model(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
```smollm3past_key_valuesg    >Acolwiserowwise)zlayers.*.self_attn.q_projzlayers.*.self_attn.k_projzlayers.*.self_attn.v_projzlayers.*.self_attn.o_projzlayers.*.mlp.gate_projzlayers.*.mlp.up_projzlayers.*.mlp.down_proj	input_idsinputs_embedshidden_statesattention_mask)embed_tokenslayersnormi  
vocab_sizei   hidden_sizei +  intermediate_size$   num_hidden_layers   num_attention_heads   Nnum_key_value_headssilu
hidden_acti   max_position_embeddingsg{Gz?initializer_rangegư>rms_norm_epsT	use_cachei pad_token_idi  bos_token_idi eos_token_idrope_parametersFuse_sliding_windowsliding_windowno_rope_layersno_rope_layer_intervallayer_typesattention_bias        attention_dropoutmlp_biastie_word_embeddingsc                 L  > U R                   c  U R                  U l         U R                  cG  [        U R                  5       Vs/ s H!  n[        US-   U R                  -  S:g  5      PM#     snU l        U R                  c  / U l        [        U R                  5       Ho  nU R                  U   nU R                  (       a1  U R                  b$  U(       d  U R                  R                  S5        MT  U R                  R                  S5        Mq     [        TU ]0  " S0 UD6  g s  snf )N   r   sliding_attentionfull_attention )r1   r/   r>   ranger-   intr?   r@   r<   r=   appendsuper__post_init__)selfkwargs	layer_idxhas_rope	__class__s       |/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/smollm3/modular_smollm3.pyrO   SmolLM3Config.__post_init__q   s    ##+'+'?'?D$&Y^_c_u_uYv#YvIY]d&A&AAQFGYv#D #!D"4#9#9:	..y9**t/B/B/NW_$$++,?@$$++,<= ; 	''#s   (D!)r@   r>   r1   )-__name__
__module____qualname____firstlineno____doc__
model_typekeys_to_ignore_at_inferencedefault_thetabase_model_tp_planbase_model_pp_planr)   rL   __annotations__r*   r+   r-   r/   r1   r3   strr4   r5   floatr6   r7   boolr8   r9   r:   listr;   r	   dictr<   r=   r>   r?   r@   rA   rC   rD   rE   rO   __static_attributes____classcell__rT   s   @rU   r   r   ,   s   , J#4"5M &/%.%.%."+ )"+ &(9:#%568IJ!"_$56 JK"s"s!!&'t'J#(S(#u#L%It%L#*%%L#*%+1L#S	/D(148O^d*T18$$!%NC$J%'+NDI$+"#C#$(KcT!( ND %(us{(Hd $$( (    r   c                       \ rS rSrSrg)SmolLM3RotaryEmbedding   rJ   NrW   rX   rY   rZ   rg   rJ   rj   rU   rl   rl          rj   rl   c                      ^  \ rS rSrS\S\4U 4S jjr SS\R                  S\	\R                  \R                  4   S\R                  S-  S	\
S-  S
\\   S\	\R                  \R                  S-  4   4S jjrSrU =r$ )SmolLM3Attention   configrR   c                    > [         TU ]  X5        UR                  U   U l        UR                  (       a%  UR
                  U   S:X  a  UR                  U l        g S U l        g )NrH   )rN   __init__r>   use_roper<   r@   r=   )rP   rs   rR   rT   s      rU   ru   SmolLM3Attention.__init__   s`    +--i8 ((V-?-?	-JNa-a !! 	  	rj   Nr$   position_embeddingsr%   r   rQ   returnc                 @   UR                   S S n/ UQSPU R                  P7nU R                  U5      R                  U5      R	                  SS5      nU R                  U5      R                  U5      R	                  SS5      n	U R                  U5      R                  U5      R	                  SS5      n
U R                  (       a  Uu  p[        XX5      u  pUb  UR                  XU R                  5      u  p[        R                  " U R                  R                  [        5      nU" U UU	U
U4U R                   (       d  SOU R"                  U R$                  U R&                  S.UD6u  pUR(                  " / UQSP76 R+                  5       nU R-                  U5      nX4$ )NrG   r   rB   )dropoutscalingr=   )shapehead_dimq_projview	transposek_projv_projrv   r   updaterR   r
   get_interfacers   _attn_implementationr   trainingrC   r}   r=   reshape
contiguouso_proj)rP   r$   rx   r%   r   rQ   input_shapehidden_shapequery_states
key_statesvalue_statescossinattention_interfaceattn_outputattn_weightss                   rU   forwardSmolLM3Attention.forward   s    $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST==*HC';LVY'_$L&'6'='=jX\XfXf'g$J(?(M(MKK,,.E)
 %8
%
  $}}C$2H2HLL..
%
 
%
! "));;;;FFHkk+.((rj   )r=   rv   )N)rW   rX   rY   rZ   r   rL   ru   torchTensortupler   r   r   r   rg   rh   ri   s   @rU   rq   rq      s    
} 
 
 )-()||() #5<<#=>() t+	()
 () -.() 
u||U\\D00	1() ()rj   rq   c                       \ rS rSrSrg)SmolLM3DecoderLayer   rJ   Nrn   rJ   rj   rU   r   r      ro   rj   r   c                       \ rS rSrSrg)SmolLM3PreTrainedModel   rJ   Nrn   rJ   rj   rU   r   r      ro   rj   r   c                       \ rS rSrSrg)SmolLM3Model   rJ   Nrn   rJ   rj   rU   r   r      ro   rj   r   c                       \ rS rSrSrg)SmolLM3ForCausalLM   rJ   Nrn   rJ   rj   rU   r   r      ro   rj   r   c                       \ rS rSrSrg) SmolLM3ForSequenceClassification   rJ   Nrn   rJ   rj   rU   r   r      ro   rj   r   c                       \ rS rSrSrg)SmolLM3ForTokenClassification   rJ   Nrn   rJ   rj   rU   r   r      ro   rj   r   c                       \ rS rSrSrg)SmolLM3ForQuestionAnswering   rJ   Nrn   rJ   rj   rU   r   r      ro   rj   r   )r   r   r   r   r   r   r   )/collections.abcr   r   huggingface_hub.dataclassesr   cache_utilsr   configuration_utilsr   modeling_flash_attention_utilsr   modeling_rope_utilsr	   modeling_utilsr
   processing_utilsr   utilsr   r   llama.modeling_llamar   r   r   r   r   r   r   r   r   qwen2.modeling_qwen2r   r   
get_loggerrW   loggerr   rl   rq   r   r   r   r   r   r   r   __all__rJ   rj   rU   <module>r      s    %  .   3 B 1 5 & ,
 
 
 D 
		H	% 56U($ U(  7U(p	1 	3)~ 3)l	+ 		1 		: 		) 		'E 		$? 		"; 	rj   