
    Z jO                        S r SSKJr  SSKrSSKJr  SSKJr  SSKJ	r	  SSK
Jr  SS	KJr  SS
KJrJr  SSKJr  SSKJr  SSKJrJrJrJrJrJrJrJr  SSKJr  \R@                  " \!5      r"Sr# " S S\5      r$ " S S\5      r% " S S\5      r& " S S\5      r' " S S\5      r( " S S\5      r) " S S\5      r* " S  S!\5      r+/ S"Qr,g)#zPyTorch Qwen3 model.    )CallableN   )Cache)FlashAttentionKwargs)CausalLMOutputWithPast)ALL_ATTENTION_FUNCTIONS)Unpack)TransformersKwargslogging   )GemmaMLP)LlamaAttention)Qwen2ForCausalLMQwen2ForQuestionAnsweringQwen2ForSequenceClassificationQwen2ForTokenClassificationQwen2RMSNormQwen2RotaryEmbeddingapply_rotary_pos_embeager_attention_forward   )Qwen3ConfigzQwen/Qwen3-8Bc                       \ rS rSrSrg)Qwen3RMSNorm0    N__name__
__module____qualname____firstlineno____static_attributes__r       x/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/qwen3/modular_qwen3.pyr   r   0       r#   r   c                       \ rS rSrSrg)Qwen3MLP4   r   Nr   r   r#   r$   r'   r'   4   r%   r#   r'   c                       \ rS rSrSrg)Qwen3RotaryEmbedding8   r   Nr   r   r#   r$   r*   r*   8   r%   r#   r*   c                      ^  \ rS rSrS\S\4U 4S jjr SS\R                  S\	\R                  \R                  4   S\R                  S-  S	\
S-  S
\\   S\	\R                  \R                  S-  4   4S jjrSrU =r$ )Qwen3Attention<   config	layer_idxc                 P  > [        US5      (       a  UR                  U   OS U l        [        TU ]  X5        [        U R                  UR                  S9U l        [        U R                  UR                  S9U l	        U R                  S:X  a  UR                  U l
        g S U l
        g )Nlayer_types)epssliding_attention)hasattrr2   
layer_typesuper__init__r   head_dimrms_norm_epsq_normk_normsliding_window)selfr/   r0   	__class__s      r$   r8   Qwen3Attention.__init__=   s    ;B6=;Y;Y&,,Y7_c+"4==f6I6IJ"4==f6I6IJ7;J]7]f33cgr#   Nhidden_statesposition_embeddingsattention_maskpast_key_valueskwargsreturnc                 Z   UR                   S S n/ UQSPU R                  P7nU R                  U R                  U5      R	                  U5      5      R                  SS5      nU R                  U R                  U5      R	                  U5      5      R                  SS5      n	U R                  U5      R	                  U5      R                  SS5      n
Uu  p[        XX5      u  pUb  UR                  XU R                  5      u  p[        R                  " U R                  R                  [         5      nU" U UU	U
U4U R"                  (       d  SOU R$                  U R&                  U R(                  S.UD6u  pUR*                  " / UQSP76 R-                  5       nU R/                  U5      nX4$ )Nr   r   g        )dropoutscalingr=   )shaper9   r;   q_projview	transposer<   k_projv_projr   updater0   r   get_interfacer/   _attn_implementationr   trainingattention_dropoutrJ   r=   reshape
contiguouso_proj)r>   rA   rB   rC   rD   rE   input_shapehidden_shapequery_states
key_statesvalue_statescossinattention_interfaceattn_outputattn_weightss                   r$   forwardQwen3Attention.forwardD   s    $))#2.88b8$--8{{4;;}#=#B#B<#PQ[[\]_`a[[]!;!@!@!NOYYZ[]^_
{{=166|DNNqRST&#7RU#[ &'6'='=jX\XfXf'g$J(?(M(MKK,,.E)
 %8
%
  $}}C$2H2HLL..
%
 
%
! "));;;;FFHkk+.((r#   )r<   r6   r;   r=   )N)r   r   r    r!   r   intr8   torchTensortupler   r	   r   rc   r"   __classcell__r?   s   @r$   r-   r-   <   s    h{ hs h )-')||') #5<<#=>') t+	')
 ') -.') 
u||U\\D00	1') ')r#   r-   c                   :   ^  \ rS rSrS\\   S\4U 4S jjrSrU =r	$ )Qwen3ForCausalLMn   super_kwargsrF   c                 $   > [         TU ]  " S0 UD6$ )a  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

Example:

```python
>>> from transformers import AutoTokenizer, Qwen3ForCausalLM

>>> model = Qwen3ForCausalLM.from_pretrained("Qwen/Qwen3-8B")
>>> tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-8B")

>>> prompt = "Hey, are you conscious? Can you talk to me?"
>>> inputs = tokenizer(prompt, return_tensors="pt")

>>> # Generate
>>> generate_ids = model.generate(inputs.input_ids, max_length=30)
>>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
"Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
```r   )r7   rc   )r>   rn   r?   s     r$   rc   Qwen3ForCausalLM.forwardo   s    4 w...r#   r   )
r   r   r    r!   r	   r
   r   rc   r"   ri   rj   s   @r$   rl   rl   n   s%    /12/ 
 / /r#   rl   c                       \ rS rSrSrg)Qwen3ForSequenceClassification   r   Nr   r   r#   r$   rr   rr      r%   r#   rr   c                       \ rS rSrSrg)Qwen3ForTokenClassification   r   Nr   r   r#   r$   ru   ru      r%   r#   ru   c                       \ rS rSrSrg)Qwen3ForQuestionAnswering   r   Nr   r   r#   r$   rx   rx      r%   r#   rx   )rl   rx   Qwen3PreTrainedModel
Qwen3Modelrr   ru   )-__doc__collections.abcr   rf   cache_utilsr   modeling_flash_attention_utilsr   modeling_outputsr   modeling_utilsr   processing_utilsr	   utilsr
   r   gemma.modeling_gemmar   llama.modeling_llamar   qwen2.modeling_qwen2r   r   r   r   r   r   r   r   configuration_qwen3r   
get_loggerr   logger_CHECKPOINT_FOR_DOCr   r'   r*   r-   rl   rr   ru   rx   __all__r   r#   r$   <module>r      s     $    B 6 5 & 0 +	 	 	 - 
		H	%% 	< 		x 		/ 	/)^ /)d/' /<	%C 		"= 		 9 	r#   