
    Z j"                     B   S SK r S SK Jr  SSKJrJr  SSKJr  SSKJrJ	r	  SSK
Jr  SSKJrJrJrJr  SS	KJr  SS
KJr  SSKJrJrJrJrJr  SSKJr  \R:                  " \5      r " S S\5      r  " S S\5      r! " S S\5      r" " S S\5      r# " S S\5      r$/ SQr%g)    N)nn   )CacheDynamicCache)create_causal_mask)BaseModelOutputWithPastCausalLMOutputWithPast)Unpack)TransformersKwargsauto_docstringcan_return_tuplelogging)merge_with_config_defaults)capture_outputs   )LlamaAttentionLlamaDecoderLayerLlamaForCausalLM
LlamaModelLlamaPreTrainedModel   )GraniteConfigc                   B   ^  \ rS rSrSrSS\S\S-  4U 4S jjjrSrU =r	$ )	GraniteAttention'   z=Multi-headed attention from 'Attention Is All You Need' paperNconfig	layer_idxc                 F   > [         TU ]  X5        UR                  U l        g N)super__init__attention_multiplierscalingselfr   r   	__class__s      |/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/granite/modular_granite.pyr!   GraniteAttention.__init__*   s    +22    )r#   r   )
__name__
__module____qualname____firstlineno____doc__r   intr!   __static_attributes____classcell__r&   s   @r'   r   r   '   s"    G3} 3t 3 3r)   r   c                     ^  \ rS rSrS\S\4U 4S jjr     SS\R                  S\R                  S-  S\R                  S-  S	\
S-  S
\S-  S\\R                  \R                  4   S-  S\\   S\R                  4S jjrSrU =r$ )GraniteDecoderLayer/   r   r   c                 b   > [         TU ]  X5        UR                  U l        [        XS9U l        g )N)r   r   )r    r!   residual_multiplierr   	self_attnr$   s      r'   r!   GraniteDecoderLayer.__init__0   s*    +#)#=#= )Mr)   Nhidden_statesattention_maskposition_idspast_key_values	use_cacheposition_embeddingskwargsreturnc           
          UnU R                  U5      nU R                  " SUUUUUUS.UD6u  pXU R                  -  -   nUnU R                  U5      nU R	                  U5      nXU R                  -  -   nU$ )a  
Args:
    hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
    attention_mask (`torch.FloatTensor`, *optional*):
        attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
        query_sequence_length, key_sequence_length)` if default attention is used.
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
    use_cache (`bool`, *optional*):
        If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
        (see `past_key_values`).
    past_key_values (`Cache`, *optional*): cached past key and value projection states
    position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
        Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
        with `head_dim` being the embedding dimension of each attention head.
    kwargs (`dict`, *optional*):
        Arbitrary kwargs to be ignored, used for FSDP and other methods that injects code
        into the model
)r:   r;   r<   r=   r>   r?    )input_layernormr8   r7   post_attention_layernormmlp)
r%   r:   r;   r<   r=   r>   r?   r@   residual_s
             r'   forwardGraniteDecoderLayer.forward5   s    < !,,];>> 
')%+ 3
 
 !43K3K#KK 55mD/ 43K3K#KKr)   )r7   r8   )NNNFN)r*   r+   r,   r-   r   r/   r!   torchTensor
LongTensorr   booltupler
   r   rI   r0   r1   r2   s   @r'   r4   r4   /   s    N} N N /304(,!&HL2||2 t+2 &&-	2
 2 $;2 #5<<#=>E2 +,2 
2 2r)   r4   c                       \ rS rSr\\S.rSrg)GranitePreTrainedModelj   )r:   
attentionsrC   N)r*   r+   r,   r-   r4   r   _can_record_outputsr0   rC   r)   r'   rQ   rQ   j   s    ,&r)   rQ   c                     ^  \ rS rSrS\4U 4S jjr\\\      SS\	R                  S-  S\	R                  S-  S\	R                  S-  S\S-  S	\	R                  S-  S
\S-  S\\   S\4S jj5       5       5       rSrU =r$ )GraniteModelq   r   c           	         > [         TU ]  U5        UR                  U l        [        R                  " [        UR                  5       Vs/ s H  n[        X5      PM     sn5      U l        g s  snf r   )	r    r!   embedding_multiplierr   
ModuleListrangenum_hidden_layersr4   layersr$   s      r'   r!   GraniteModel.__init__r   sV     $*$?$?!mmEJ6KcKcEdeEd	 3Ede
es   	A+N	input_idsr;   r<   r=   inputs_embedsr>   r@   rA   c           
      Z   US L US L-  (       a  [        S5      eUc  U R                  U5      nXPR                  -  nU(       a  Uc  [        U R                  S9nUcU  Ub  UR                  5       OSn[        R                  " UR                  S   UR                  S9U-   nUR                  S5      n[        U R                  UUUUS9n	Un
U R                  XS9nU R                  S U R                  R                    H  nU" U
4U	UUUUS.UD6n
M     U R                  U
5      n
[!        U
US	9$ )
Nz:You must specify exactly one of input_ids or inputs_embeds)r   r   r   )device)r   r`   r;   r=   r<   )r<   )r;   r<   r=   r>   r?   )last_hidden_stater=   )
ValueErrorembed_tokensrY   r   r   get_seq_lengthrK   arangeshaperb   	unsqueezer   
rotary_embr]   r\   normr   )r%   r_   r;   r<   r=   r`   r>   r@   past_seen_tokenscausal_maskr:   r?   decoder_layers                r'   rI   GraniteModel.forwardy   sT    -t";<YZZ  --i8M%(A(AA0*$++>OCRC^==?de <<(;(;A(>}G[G[\_ooL'11!4L(;;')+%
 &"oomoW![[)H4;;+H+HIM)*) /#$7 M J 		-0&++
 	
r)   )rY   r]   )NNNNNN)r*   r+   r,   r-   r   r!   r   r   r   rK   rM   rL   r   FloatTensorrN   r
   r   r   rI   r0   r1   r2   s   @r'   rV   rV   q   s    
} 
   .2.204(,26!%5
##d*5
 t+5
 &&-	5

 5
 ((4/5
 $;5
 +,5
 
!5
    5
r)   rV   c                      \ rS rSr\\        SS\R                  S-  S\R                  S-  S\R                  S-  S\	S-  S\R                  S-  S\R                  S-  S	\S-  S
\\R                  -  S\\   S\4S jj5       5       rSrg)GraniteForCausalLM   Nr_   r;   r<   r=   r`   labelsr>   logits_to_keepr@   rA   c	           
         U R                   " SUUUUUUS.U	D6n
U
R                  n[        U[        5      (       a  [	        U* S 5      OUnU R                  US S 2US S 24   5      nXR                  R                  -  nS nUb)  U R                  " SXU R                  R                  S.U	D6n[        UUU
R                  U
R                  U
R                  S9$ )N)r_   r;   r<   r=   r`   r>   )logitsrt   
vocab_size)lossrw   r=   r:   rS   rC   )modelrc   
isinstancer/   slicelm_headr   logits_scalingloss_functionrx   r	   r=   r:   rS   )r%   r_   r;   r<   r=   r`   rt   r>   ru   r@   outputsr:   slice_indicesrw   ry   s                  r'   rI   GraniteForCausalLM.forward   s     ,0:: ,
)%+',
 ,
  118B>SV8W8W~ot4]kmA}a,?@A++444%%pVt{{OeOepiopD%#33!//))
 	
r)   rC   )NNNNNNNr   )r*   r+   r,   r-   r   r   rK   rM   rL   r   rp   rN   r/   r
   r   r	   rI   r0   rC   r)   r'   rr   rr      s     .2.204(,26*.!%-.%
##d*%
 t+%
 &&-	%

 %
 ((4/%
   4'%
 $;%
 ell*%
 +,%
 
 %
  %
r)   rr   )rr   rV   rQ   )&rK   r   cache_utilsr   r   masking_utilsr   modeling_outputsr   r	   processing_utilsr
   utilsr   r   r   r   utils.genericr   utils.output_capturingr   llama.modeling_llamar   r   r   r   r   configuration_graniter   
get_loggerr*   loggerr   r4   rQ   rV   rr   __all__rC   r)   r'   <module>r      s       . / O & R R 7 5  1 
		H	%3~ 38+ 8v1 @
: @
F(
) (
V Kr)   