
    Z jG                        S SK r S SKrS SKJr  SSKJr  SSKJrJr  SSK	J
r
Jr  SSKJr  SSKJr  S	S
KJr  S	SKJrJrJrJrJrJrJrJrJr  S	SKJr  SSKJr  \R@                  " \!5      r"Sr#Sr$ " S S\5      r%S r& " S S\5      r' " S S\RP                  5      r) " S S\)5      r* " S S\)5      r+\)\*\+S.r, " S S\5      r- " S S \5      r. " S! S"\5      r/ " S# S$\5      r0 " S% S&\5      r1 " S' S(\5      r2 " S) S*\5      r3/ S+Qr4g),    N)nn   )initialization)CacheStaticCache)_flash_attention_forward!flash_attn_supports_top_left_mask)PreTrainedModel)logging   )GemmaForCausalLM)	LlamaDecoderLayerLlamaForQuestionAnsweringLlamaForSequenceClassificationLlamaForTokenClassification
LlamaModelLlamaPreTrainedModelLlamaRotaryEmbeddingapply_rotary_pos_emb	repeat_kv)
from ..mistral.modeling_mistral import MistralMLP
from .configuration_diffllama import DiffLlamaConfig


logger = logging.get_logger(__name__)

_CHECKPOINT_FOR_DOC = "kajuma/DiffLlama-0.3B-handcut"
_CONFIG_FOR_DOC = "DiffLlamaConfig"


class DiffLlamaMLP(MistralMLP):
    pass


def lambda_init_fn(layer_idx):
    # Depth-dependent initial value of the differential weight lambda.
    return 0.8 - 0.6 * math.exp(-0.3 * layer_idx)


class DiffLlamaRotaryEmbedding(LlamaRotaryEmbedding):
    pass


class DiffLlamaAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config: DiffLlamaConfig, layer_idx: int | None = None):
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        if layer_idx is None:
            logger.warning_once(
                f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
                "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
                "when creating this class."
            )

        self.attention_dropout = config.attention_dropout
        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = getattr(config, "head_dim", self.hidden_size // self.num_heads)
        self.num_key_value_heads = config.num_key_value_heads
        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
        self.max_position_embeddings = config.max_position_embeddings
        self.is_causal = True

        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias)
        self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
        self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.attention_bias)

        self.lambda_init = lambda_init_fn(layer_idx)
        self.lambda_q1 = nn.Parameter(torch.normal(0, config.lambda_std_dev, size=(self.head_dim,)))
        self.lambda_k1 = nn.Parameter(torch.normal(0, config.lambda_std_dev, size=(self.head_dim,)))
        self.lambda_q2 = nn.Parameter(torch.normal(0, config.lambda_std_dev, size=(self.head_dim,)))
        self.lambda_k2 = nn.Parameter(torch.normal(0, config.lambda_std_dev, size=(self.head_dim,)))
        self.groupnorm = nn.RMSNorm(2 * self.head_dim, eps=config.rms_norm_eps, elementwise_affine=False)

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: tuple[torch.Tensor, torch.Tensor],
        attention_mask: torch.Tensor | None = None,
        position_ids: torch.LongTensor | None = None,
        past_key_values: Cache | None = None,
        use_cache: bool = False,
        **kwargs,
    ) -> tuple[torch.Tensor, torch.Tensor | None, tuple[torch.Tensor] | None]:
        bsz, target_len, _ = hidden_states.size()
        q_len = target_len

        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

        cos, sin = position_embeddings
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        if past_key_values is not None:
            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx)

        key_states = repeat_kv(key_states, self.num_key_value_groups)
        value_states = repeat_kv(value_states, self.num_key_value_groups)

        # Fold the two head groups of the values onto the feature dimension, then duplicate them across
        # the head dimension so that both halves of the attention maps attend to the same values.
        value_states = torch.cat(torch.chunk(value_states, 2, dim=1), dim=-1)
        value_states = value_states.repeat(1, 2, 1, 1)

        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)

        if attention_mask is not None:
            attn_weights = attn_weights + attention_mask

        # upcast attention to fp32
        attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
        attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)

        lambda_1 = torch.exp(torch.sum(self.lambda_q1 * self.lambda_k1, dim=-1, dtype=torch.float32)).to(
            query_states.dtype
        )
        lambda_2 = torch.exp(torch.sum(self.lambda_q2 * self.lambda_k2, dim=-1, dtype=torch.float32)).to(
            query_states.dtype
        )
        lambda_full = lambda_1 - lambda_2 + self.lambda_init

        attn_output = torch.matmul(attn_weights, value_states)
        attn_output1, attn_output2 = torch.chunk(attn_output, 2, dim=1)
        attn_output = attn_output1 - lambda_full * attn_output2
        attn_output = (1 - self.lambda_init) * self.groupnorm(attn_output)
        attn_output = attn_output.transpose(1, 2).contiguous()
        attn_output = attn_output.reshape(bsz, target_len, -1)

        attn_output = self.o_proj(attn_output)

        return attn_output, attn_weights
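

# Note on the differential attention implemented by the eager class above and the flash/SDPA variants
# below: the attention heads are split into two groups that produce two softmax maps A1 and A2 over the
# same values V, and the per-layer output is
#     (1 - lambda_init) * RMSNorm(A1 @ V - lambda_full * (A2 @ V)),
# with lambda_full = exp(lambda_q1 . lambda_k1) - exp(lambda_q2 . lambda_k2) + lambda_init and
# lambda_init = 0.8 - 0.6 * exp(-0.3 * layer_idx), following the Differential Transformer formulation
# that DiffLlama is based on. The three backends differ only in how A1 @ V and A2 @ V are computed.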


class DiffLlamaFlashAttention2(DiffLlamaAttention):
    """
    DiffLlama flash attention module. This module inherits from `DiffLlamaAttention` as the weights of the module stay
    untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
    flash attention and deal with padding tokens in case the input contains any of them.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # flash_attn<2.1 generates a top-left aligned causal mask, while flash_attn>=2.1 uses bottom-right alignment.
        self._flash_attn_uses_top_left_mask = flash_attn_supports_top_left_mask()

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: tuple[torch.Tensor, torch.Tensor],
        attention_mask: torch.Tensor | None = None,
        position_ids: torch.LongTensor | None = None,
        past_key_values: Cache | None = None,
        use_cache: bool = False,
        **kwargs,
    ) -> tuple[torch.Tensor, None]:
        if isinstance(past_key_values, StaticCache):
            raise ValueError(
                "`static` cache implementation is not compatible with `attn_implementation==flash_attention_2` "
                "make sure to use `sdpa` in the mean time, and open an issue at "
                "https://github.com/huggingface/transformers"
            )

        bsz, q_len, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

        cos, sin = position_embeddings
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        if past_key_values is not None:
            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx)

        # Flash attention expects the layout [batch_size, seq_len, num_heads, head_dim].
        query_states = query_states.transpose(1, 2)
        key_states = key_states.transpose(1, 2)
        value_states = value_states.transpose(1, 2)

        dropout_rate = self.attention_dropout if self.training else 0.0

        # In PEFT, the layer norms are usually cast to float32 for training stability, which silently upcasts
        # the hidden states. Cast the projections back to the expected compute dtype before calling flash attention.
        input_dtype = query_states.dtype
        device_type = query_states.device.type if query_states.device.type != "mps" else "cpu"
        if input_dtype == torch.float32:
            if torch.is_autocast_enabled():
                target_dtype = torch.get_autocast_dtype(device_type)
            # Handle the case where the model is quantized
            elif hasattr(self.config, "_pre_quantization_dtype"):
                target_dtype = self.config._pre_quantization_dtype
            else:
                target_dtype = self.q_proj.weight.dtype

            logger.warning_once(
                f"The input hidden states seems to be silently casted in float32, this might be related to the fact "
                f"you have upcasted embedding or layer norm layers in float32. We will cast back the input in "
                f"{target_dtype}."
            )

            query_states = query_states.to(target_dtype)
            key_states = key_states.to(target_dtype)
            value_states = value_states.to(target_dtype)

        # Run flash attention once per value half, then recombine for the differential output.
        value_states1, value_states2 = torch.chunk(value_states, 2, dim=2)
        value_states1 = value_states1.repeat(1, 1, 2, 1)
        value_states2 = value_states2.repeat(1, 1, 2, 1)

        attn_output1 = _flash_attention_forward(
            query_states,
            key_states,
            value_states1,
            attention_mask,
            q_len,
            position_ids=position_ids,
            dropout=dropout_rate,
            sliding_window=getattr(self, "sliding_window", None),
            use_top_left_mask=self._flash_attn_uses_top_left_mask,
            is_causal=self.is_causal,
        )

        attn_output2 = _flash_attention_forward(
            query_states,
            key_states,
            value_states2,
            attention_mask,
            q_len,
            position_ids=position_ids,
            dropout=dropout_rate,
            sliding_window=getattr(self, "sliding_window", None),
            use_top_left_mask=self._flash_attn_uses_top_left_mask,
            is_causal=self.is_causal,
        )

        attn_output = torch.cat([attn_output1, attn_output2], dim=-1)
        attn_output1, attn_output2 = torch.chunk(attn_output, 2, dim=2)

        lambda_1 = torch.exp(torch.sum(self.lambda_q1 * self.lambda_k1, dim=-1, dtype=torch.float32)).to(
            query_states.dtype
        )
        lambda_2 = torch.exp(torch.sum(self.lambda_q2 * self.lambda_k2, dim=-1, dtype=torch.float32)).to(
            query_states.dtype
        )
        lambda_full = lambda_1 - lambda_2 + self.lambda_init

        attn_output = attn_output1 - lambda_full * attn_output2
        attn_output = (1 - self.lambda_init) * self.groupnorm(attn_output)
        attn_output = attn_output.reshape(bsz, q_len, -1).contiguous()
        attn_output = self.o_proj(attn_output)

        return attn_output, None
`DiffLlamaAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to
SDPA API.
Nr\   r]   r^   r_   r`   ra   rb   c           	         UR                  5       u  pn
U R                  U5      nU R                  U5      nU R                  U5      nUR	                  XU R
                  U R                  5      R                  SS5      nUR	                  XU R                  U R                  5      R                  SS5      nUR	                  XU R                  U R                  5      R                  SS5      nUu  p[        XX5      u  pUb  UR                  XU R                  5      u  p[        XR                  5      n[        XR                  5      n[        R                  " [        R                   " USSS9SS9nUR#                  SSSS5      nUnUb  US S 2S S 2S S 2S UR$                  S   24   nUS L =(       a    U	S:  n[        R&                  R(                  R+                  UUUUU R,                  (       a  U R.                  OSUS9n[        R                   " USSS9u  nn[        R0                  " [        R2                  " U R4                  U R6                  -  S[        R8                  S95      R;                  UR<                  5      n[        R0                  " [        R2                  " U R>                  U R@                  -  S[        R8                  S95      R;                  UR<                  5      nUU-
  U RB                  -   nUUU-  -
  nSU RB                  -
  U RE                  U5      -  nUR                  SS5      RG                  5       nUR	                  XS5      nU RI                  U5      nUS 4$ )	Nr   r   rd   rf   r   )	attn_mask	dropout_prF   rg   )%r6   rI   rJ   rK   rk   rA   r4   rl   rC   r   rm   r*   r   rD   rO   rn   ro   rp   shaper   rs   scaled_dot_product_attentionrj   r>   r)   rx   rR   rS   ru   rv   rh   rT   rU   rM   rX   ry   rL   )rZ   r\   r]   r^   r_   r`   ra   r{   r|   r   r~   r   r   r   r   r   causal_maskrF   r   r   r   r   r   r   s                           r%   r   DiffLlamaSdpaAttention.forward(  s    &**,A{{=1[[/
{{=1#((T^^T]]S]]^_abc__S1I1I4==Yccdeghi
#((T5M5Mt}}]gghiklm&#7RU#[ &'6'='=jX\XfXf'g$Jz+D+DE
 /H/HIyy\1!!D"M#**1aA6$%%aA/E1A1A"1E/E&EFK  4'5EAI	hh))FF!04d,,3 G 
 &+[[aQ%G"l99UYYt~~'FBV[VcVcdehh
 99UYYt~~'FBV[VcVcdehh
 )D,<,<<"[<%??4+++t~~k/JJ!++Aq1<<>!&&s26kk+.D  r$   r   r   )r   r    r!   r"   r   rO   r   r   r   r   r   r   r#   r   r$   r%   r   r      s     /304(,?!||?! #5<<#=>?! t+	?!
 &&-?! ?! ?! 
u||U\\D0%2E2LL	M?! ?!r$   r   )eagerflash_attention_2sdpac                   4   ^  \ rS rSrS\S\4U 4S jjrSrU =r$ )DiffLlamaDecoderLayeriq  r2   r*   c                 \   > [         TU ]  X5        [        UR                     " XS9U l        g )N)r2   r*   )r9   r:   DIFFLLAMA_ATTENTION_CLASSES_attn_implementation	self_attnrY   s      r%   r:   DiffLlamaDecoderLayer.__init__r  s(    +4V5P5PQY_ur$   )r   )	r   r    r!   r"   r   r   r:   r#   r   r   s   @r%   r   r   q  s    v v3 v vr$   r   c                   J    \ rS rSrSrSr\R                  " 5       S 5       rSr	g)DiffLlamaPreTrainedModelix  Fc                    [         R                  " X5        [        U[        5      (       a  [        R
                  " UR                  SU R                  R                  5        [        R
                  " UR                  SU R                  R                  5        [        R
                  " UR                  SU R                  R                  5        [        R
                  " UR                  SU R                  R                  5        g g )Nr   )r
   _init_weightsr   r0   initnormal_rR   r2   rQ   rS   rT   rU   )rZ   modules     r%   r   &DiffLlamaPreTrainedModel._init_weights|  s    %%d3f011LL))1dkk.H.HILL))1dkk.H.HILL))1dkk.H.HILL))1dkk.H.HI	 2r$   r   N)
r   r    r!   r"   _supports_flex_attn_supports_attention_backendrO   no_gradr   r#   r   r$   r%   r   r   x  s(    "'
]]_J Jr$   r   c                       \ rS rSrSrg)DiffLlamaModeli  r   Nr   r   r$   r%   r   r     r&   r$   r   c                       \ rS rSrSrg)DiffLlamaForCausalLMi  r   Nr   r   r$   r%   r   r     r&   r$   r   c                       \ rS rSrSrg)"DiffLlamaForSequenceClassificationi  r   Nr   r   r$   r%   r   r     r&   r$   r   c                       \ rS rSrSrg)DiffLlamaForQuestionAnsweringi  r   Nr   r   r$   r%   r   r     r&   r$   r   c                       \ rS rSrSrg)DiffLlamaForTokenClassificationi  r   Nr   r   r$   r%   r   r     r&   r$   r   )r   r   r   r   r   r   )5r(   rO   r    r   r   cache_utilsr   r   modeling_flash_attention_utilsr   r	   modeling_utilsr
   utilsr   gemma.modeling_gemmar   llama.modeling_llamar   r   r   r   r   r   r   r   r   mistral.modeling_mistralr   configuration_diffllamar   
get_loggerr   r;   _CHECKPOINT_FOR_DOC_CONFIG_FOR_DOCr   r+   r-   Moduler0   r   r   r   r   r   r   r   r   r   r   __all__r   r$   r%   <module>r      s  "    & - i -  3
 
 
 2 4 
		H	%5 #	: 	2	3 	\) \)~A!1 A!HG!/ G!V  1" v- vJ3 J	Z 		+ 		)G 		$= 		&A 	r$   
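

# Illustrative usage sketch (commented out; assumes the surrounding transformers install registers the
# `diffllama` model type in its auto mappings, and uses the doc checkpoint referenced at the top of this file):
#
#     from transformers import AutoModelForCausalLM, AutoTokenizer
#
#     tokenizer = AutoTokenizer.from_pretrained("kajuma/DiffLlama-0.3B-handcut")
#     model = AutoModelForCausalLM.from_pretrained("kajuma/DiffLlama-0.3B-handcut")
#     inputs = tokenizer("Differential attention lets the model", return_tensors="pt")
#     output_ids = model.generate(**inputs, max_new_tokens=20)
#     print(tokenizer.decode(output_ids[0], skip_special_tokens=True))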