
    Z j*                     P   S r SSKJr  SSKrSSKJr  SSKJr  SSKJ	r	  SSK
Jr  SS	KJr  SS
KJr  SSKJr  SSKJrJrJrJrJrJrJr  SSKJr  \R8                  " \5      r " S S\5      r " S S\5      r  " S S\5      r! " S S\5      r" " S S\5      r# " S S\5      r$/ SQr%g)zPyTorch BitNet model.    )CallableN   )Cache)FlashAttentionKwargs)CausalLMOutputWithPast)ALL_ATTENTION_FUNCTIONS)Unpack)logging   )GemmaMLP)LlamaAttentionLlamaDecoderLayerLlamaForCausalLM
LlamaModelLlamaRMSNormapply_rotary_pos_embeager_attention_forward   )BitNetConfigc                       \ rS rSrSrg)BitNetRMSNorm)    N__name__
__module____qualname____firstlineno____static_attributes__r       z/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/bitnet/modular_bitnet.pyr   r   )       r    r   c                   6   ^  \ rS rSrS\4U 4S jjrS rSrU =r$ )	BitNetMLP-   configc                 j   > [         TU ]  U5        [        UR                  UR                  S9U l        g N)eps)super__init__r   intermediate_sizerms_norm_epsffn_sub_norm)selfr&   	__class__s     r!   r+   BitNetMLP.__init__.   s+     )&*B*BH[H[\r    c           	          U R                  U R                  U R                  U R                  U5      5      U R	                  U5      -  5      5      nU$ N)	down_projr.   act_fn	gate_projup_proj)r/   xr4   s      r!   forwardBitNetMLP.forward2   sF    NN4#4#4T[[PQAR5SVZVbVbcdVe5e#fg	r    )r.   )	r   r   r   r   r   r+   r9   r   __classcell__r0   s   @r!   r$   r$   -   s    ]| ] r    r$   c                      ^  \ rS rSrS\S\4U 4S jjr SS\R                  S\	\R                  \R                  4   S\R                  S-  S	\
S-  S
\\   S\	\R                  \R                  S-  4   4S jjrSrU =r$ )BitNetAttention7   r&   	layer_idxc                 j   > [         TU ]  X5        [        UR                  UR                  S9U l        g r(   )r*   r+   r   hidden_sizer-   attn_sub_norm)r/   r&   r@   r0   s      r!   r+   BitNetAttention.__init__8   s+    +*6+=+=6CVCVWr    Nhidden_statesposition_embeddingsattention_maskpast_key_valueskwargsreturnc                 *   UR                   S S n/ UQSPU R                  P7nU R                  U5      R                  U5      R	                  SS5      nU R                  U5      R                  U5      R	                  SS5      n	U R                  U5      R                  U5      R	                  SS5      n
Uu  p[        XX5      u  pUb  UR                  XU R                  5      u  p[        R                  " U R                  R                  [        5      nU" U UU	U
U4U R                  (       d  SOU R                   U R"                  S.UD6u  pUR$                  " / UQSP76 R'                  5       nU R)                  U5      nU R+                  U5      nX4$ )Nr   r   g        )dropoutscaling)shapehead_dimq_projview	transposek_projv_projr   updater@   r   get_interfacer&   _attn_implementationr   trainingattention_dropoutrN   reshape
contiguousrC   o_proj)r/   rE   rF   rG   rH   rI   input_shapehidden_shapequery_states
key_statesvalue_statescossinattention_interfaceattn_outputattn_weightss                   r!   r9   BitNetAttention.forward<   s    $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST&#7RU#[ &'6'='=jX\XfXf'g$J(?(M(MKK,,.E)
 %8	%
  $}}C$2H2HLL	%
 	%
! "));;;;FFH((5kk+.((r    )rC   r3   )r   r   r   r   r   intr+   torchTensortupler   r	   r   r9   r   r;   r<   s   @r!   r>   r>   7   s    X| X X )-')||') #5<<#=>') t+	')
 ') -.') 
u||U\\D00	1') ')r    r>   c                       \ rS rSrSrg)BitNetDecoderLayerf   r   Nr   r   r    r!   rn   rn   f   r"   r    rn   c                       \ rS rSrSrg)BitNetModelj   r   Nr   r   r    r!   rq   rq   j   r"   r    rq   c                   @   ^  \ rS rSrSS0rSrSrS\4U 4S jjrSr	U =r
$ )BitNetForCausalLMn   zlm_head.weightzmodel.embed_tokens.weightNrJ   c                 $   > [         TU ]  " S0 UD6$ )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, BitNetForCausalLM

        >>> model = BitNetForCausalLM.from_pretrained("microsoft/bitnet-b1.58-2B-4T")
        >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/bitnet-b1.58-2B-4T")

        >>> prompt = f'<|begin_of_text|>User: Hey, are you conscious? Can you talk to me?<|eot_id|>Assistant: '
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=100)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "User: Hey, are you conscious? Can you talk to me?Assistant: No, I'm not conscious. I'm an artificial intelligence designed to assist with information and tasks. How can I help you today?"
        ```"""
        return super().forward(**super_kwargs)


__all__ = ["BitNetForCausalLM", "BitNetModel", "BitNetPreTrainedModel"]
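

# ---------------------------------------------------------------------------
# Illustrative sketch only, not part of the upstream transformers module: a
# plain-PyTorch rendering of the "sub-norm" idea used by BitNetMLP and
# BitNetAttention above, i.e. an extra RMSNorm applied to the intermediate
# activation right before the output projection. The class name and sizes
# below are invented for the demo, and it assumes torch >= 2.4 for nn.RMSNorm.
if __name__ == "__main__":
    from torch import nn

    class TinySubNormMLP(nn.Module):
        """Gated MLP with an extra RMSNorm before down_proj (hypothetical sizes)."""

        def __init__(self, hidden_size: int = 64, intermediate_size: int = 128):
            super().__init__()
            self.gate_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
            self.up_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
            self.down_proj = nn.Linear(intermediate_size, hidden_size, bias=False)
            self.ffn_sub_norm = nn.RMSNorm(intermediate_size, eps=1e-6)
            self.act_fn = nn.SiLU()

        def forward(self, x: torch.Tensor) -> torch.Tensor:
            # Same ordering as BitNetMLP.forward above: gate/up projections,
            # gated activation, sub-norm, then the down projection.
            return self.down_proj(self.ffn_sub_norm(self.act_fn(self.gate_proj(x)) * self.up_proj(x)))

    out = TinySubNormMLP()(torch.randn(2, 5, 64))
    print(out.shape)  # torch.Size([2, 5, 64])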