
    Z j%                        S SK Jr  S SKrS SKJr  S SKJr  SSKJr  SSKJ	r	  SSK
Jr  SS	KJr  SS
KJr  SSKJr  SSKJrJrJr  SSKJrJrJrJrJrJrJrJrJrJ r   SSK!J"r"  \RF                  " \$5      r%\" SS9\ " S S\5      5       5       r& " S S\"5      r' " S S\5      r( " S S\5      r) " S S\5      r* " S S\5      r+ " S S\5      r, " S  S!\5      r- " S" S#\5      r. " S$ S%\5      r// S&Qr0g)'    )CallableN)strict)nn   )ACT2CLS)Cache)PreTrainedConfig)RopeParameters)ALL_ATTENTION_FUNCTIONS)Unpack)TransformersKwargsauto_docstringlogging   )
LlamaAttentionLlamaDecoderLayerLlamaForCausalLMLlamaForTokenClassification
LlamaModelLlamaPreTrainedModelLlamaRMSNormLlamaRotaryEmbeddingapply_rotary_pos_embeager_attention_forward)NemotronMLPz!swiss-ai/Apertus-8B-Instruct-2509)
checkpointc            	         ^  \ rS rSr% SrSrS/rSrSSSSSSSSS	.rS
/S/4SS/S/4S/S/4S.r	Sr
\\S'   Sr\\S'   Sr\\S'   Sr\\S'   Sr\\S'   Sr\S-  \S'   Sr\\S'   Sr\\S'   Sr\\S'   S r\\S!'   S"r\\S#'   S$r\S-  \S%'   S&r\S-  \S''   S(r\\\   -  S-  \S)'   S*r\\S+'   Sr\ \!-  S-  \S,'   S*r"\\S-'   S.r#\\-  \S/'   U 4S0 jr$S1r%U =r&$ )2ApertusConfig.   aV  
```python
>>> from transformers import ApertusModel, ApertusConfig

>>> # Initializing a Apertus-8B style configuration
>>> configuration = ApertusConfig()

>>> # Initializing a model from the Apertus-8B style configuration
>>> model = ApertusModel(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
```apertuspast_key_values    `fAcolwisereplicated_with_grad_allreducerowwise)zlayers.*.self_attn.q_projzlayers.*.self_attn.k_projzlayers.*.self_attn.v_projzlayers.*.self_attn.q_normzlayers.*.self_attn.k_normzlayers.*.self_attn.o_projzlayers.*.mlp.up_projzlayers.*.mlp.down_proj	input_idsinputs_embedshidden_statesattention_mask)embed_tokenslayersnormi   
vocab_sizei   hidden_sizei 8  intermediate_size    num_hidden_layersnum_attention_headsNnum_key_value_headsxielu
hidden_acti   max_position_embeddingsg{Gz?initializer_rangegh㈵>rms_norm_epsT	use_cacher   pad_token_id   bos_token_idr   eos_token_idFtie_word_embeddingsrope_parametersattention_bias        attention_dropoutc                    > U R                   c  U R                  U l         U R                  c  SSSSSSS.U l        [        TU ]  " S0 UD6  g )	Nllama3r"   g       @i    g      ?g      @)	rope_type
rope_thetafactor original_max_position_embeddingslow_freq_factorhigh_freq_factor )r3   r2   r?   super__post_init__)selfkwargs	__class__s     |/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/apertus/modular_apertus.pyrM   ApertusConfig.__post_init__e   sW    ##+'+'?'?D$'%(48#&$'$D  	''    )r3   r?   )'__name__
__module____qualname____firstlineno____doc__
model_typekeys_to_ignore_at_inferencedefault_thetabase_model_tp_planbase_model_pp_planr-   int__annotations__r.   r/   r1   r2   r3   r5   strr6   r7   floatr8   r9   boolr:   r<   r=   listr>   r?   r
   dictr@   rB   rM   __static_attributes____classcell__rP   s   @rQ   r   r   .   sb    J#4"5M%.%.%.%E%E%. )"+	 &(9:#%568IJ!"_$56 JK"s"s!!&*t*J#(S(#u#L%It L#*  L#* +,L#S	/D(, %%48O^d*T18 ND %(us{(( (rS   r   c                   (   ^  \ rS rSrU 4S jrSrU =r$ )
ApertusMLPu   c                 :  > [         TU ]  U5        [        R                  " U R                  U R
                  SS9U l        [        R                  " U R
                  U R                  SS9U l        UR                  S:X  a  [        S   " UR                  S9U l        g g )NF)biasr4   )dtype)rL   __init__r   Linearr.   r/   up_proj	down_projr5   r   rm   act_fn)rN   configrP   s     rQ   rn   ApertusMLP.__init__v   sz     yy!1!143I3IPUV4#9#94;K;KRWX'!'*>DK (rS   )rr   rq   rp   )rT   rU   rV   rW   rn   re   rf   rg   s   @rQ   ri   ri   u   s    ? ?rS   ri   c                       \ rS rSrSrg)ApertusRMSNorm~   rK   NrT   rU   rV   rW   re   rK   rS   rQ   rv   rv   ~       rS   rv   c                       \ rS rSrSrg)ApertusRotaryEmbedding   rK   Nrx   rK   rS   rQ   r{   r{      ry   rS   r{   c                     ^  \ rS rSrSS\S\S-  4U 4S jjjr SS\R                  S\	\R                  \R                  4   S\R                  S-  S	\
S-  S
\\   S\	\R                  \R                  4   4S jjrSrU =r$ )ApertusAttention   Nrs   	layer_idxc                    > [         TU ]  X5        [        U R                  UR                  5      U l        [        U R                  UR                  5      U l        g N)rL   rn   rv   head_dimr8   q_normk_normrN   rs   r   rP   s      rQ   rn   ApertusAttention.__init__   s@    +$T]]F4G4GH$T]]F4G4GHrS   r(   position_embeddingsr)   r!   rO   returnc                 L   UR                   S S n/ UQSPU R                  P7nU R                  U5      R                  U5      R	                  SS5      nU R                  U5      R                  U5      R	                  SS5      n	U R                  U5      R                  U5      R	                  SS5      n
U R                  U5      nU R                  U	5      n	Uu  p[        XX5      u  pUb  UR                  XU R                  5      u  p[        R                  " U R                  R                  [         5      nU" U UU	U
U4U R"                  (       d  SOU R$                  U R&                  S.UD6u  pUR(                  " / UQSP76 R+                  5       nU R-                  U5      nX4$ )Nr;   r   rA   )dropoutscaling)shaper   q_projview	transposek_projv_projr   r   r   updater   r   get_interfacers   _attn_implementationr   trainingrB   r   reshape
contiguouso_proj)rN   r(   r   r)   r!   rO   input_shapehidden_shapequery_states
key_statesvalue_statescossinattention_interfaceattn_outputattn_weightss                   rQ   forwardApertusAttention.forward   s    $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST{{<0[[,
&#7RU#[ &'6'='=jX\XfXf'g$J(?(M(MKK,,.E)
 %8	%
  $}}C$2H2HLL	%
 	%
! "));;;;FFHkk+.((rS   )r   r   r   )rT   rU   rV   rW   r   r^   rn   torchTensortupler   r   r   r   re   rf   rg   s   @rQ   r~   r~      s    I} It I I )-()||() #5<<#=>() t+	()
 () +,() 
u||U\\)	*() ()rS   r~   c                     ^  \ rS rSrS\S\4U 4S jjr     SS\R                  S\R                  S-  S\R                  S-  S	\
S-  S
\S-  S\\R                  \R                  4   S-  S\\   S\R                  4S jjrSrU =r$ )ApertusDecoderLayer   rs   r   c                    > [         TU ]  X5        [        UR                  UR                  S9U l        [        UR                  UR                  S9U l        U ?U ?g )N)eps)	rL   rn   rv   r.   r8   attention_layernormfeedforward_layernorminput_layernormpost_attention_layernormr   s      rQ   rn   ApertusDecoderLayer.__init__   sR    +#1&2D2D&J]J]#^ %3F4F4FFL_L_%`" )rS   Nr(   r)   position_idsr!   r9   r   rO   r   c           
          UnU R                  U5      nU R                  " SUUUUUUS.UD6u  pX-   nUnU R                  U5      nU R                  U5      nX-   nU$ )N)r(   r)   r   r!   r9   r   rK   )r   	self_attnr   mlp)
rN   r(   r)   r   r!   r9   r   rO   residual_s
             rQ   r   ApertusDecoderLayer.forward   s     !00?>> 
')%+ 3
 
 !0 !22=A/ 0rS   )r   r   )NNNFN)rT   rU   rV   rW   r   r^   rn   r   r   
LongTensorr   rb   r   r   r   r   re   rf   rg   s   @rQ   r   r      s    *} * * /304(,!&HL|| t+ &&-	
  $; #5<<#=>E +, 
 rS   r   c                       \ rS rSrSrg)ApertusPreTrainedModel   rK   Nrx   rK   rS   rQ   r   r      ry   rS   r   c                       \ rS rSrSrg)ApertusModel   rK   Nrx   rK   rS   rQ   r   r      ry   rS   r   c                   (   ^  \ rS rSrU 4S jrSrU =r$ )ApertusForCausalLM   c                 $   > [         TU ]  " S0 UD6$ )a
  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

Example:

```python
>>> from transformers import AutoTokenizer, ApertusForCausalLM

>>> model = ApertusForCausalLM.from_pretrained("swiss-ai/Apertus-8B-Instruct-2509")
>>> tokenizer = AutoTokenizer.from_pretrained("swiss-ai/Apertus-8B-Instruct-2509")

>>> prompt = "Hey, are you conscious? Can you talk to me?"
>>> inputs = tokenizer(prompt, return_tensors="pt")

>>> # Generate
>>> generate_ids = model.generate(inputs.input_ids, max_length=30)
>>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
"Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
```rK   )rL   r   )rN   super_kwargsrP   s     rQ   r   ApertusForCausalLM.forward   s    . w...rS   rK   )rT   rU   rV   rW   r   re   rf   rg   s   @rQ   r   r      s    / /rS   r   c                       \ rS rSrSrg)ApertusForTokenClassificationi  rK   Nrx   rK   rS   rQ   r   r     ry   rS   r   )r   r   r   r   r   )1collections.abcr   r   huggingface_hub.dataclassesr   r   activationsr   cache_utilsr   configuration_utilsr	   modeling_rope_utilsr
   modeling_utilsr   processing_utilsr   utilsr   r   r   llama.modeling_llamar   r   r   r   r   r   r   r   r   r   nemotron.modeling_nemotronr   
get_loggerrT   loggerr   ri   rv   r{   r~   r   r   r   r   r   __all__rK   rS   rQ   <module>r      s    %  .  "   3 1 5 & @ @   5 
		H	% >?B($ B(  @B(J? ?	\ 		1 	.)~ .)b%+ %P	1 		: 	/) /6	$? 	rS   