
    Z j|                     x   S SK r S SKJr  S SKJr  S SKrS SKJr  SSKJr	  SSK
Jr  SSKJrJrJr  SS	KJr  SS
KJrJr  SSKJr  SSKJrJr  SSKJrJrJrJr  SSKJ r J!r!  SSK"J#r#J$r$  SSK%J&r&  SSK'J(r(  SSK)J*r*J+r+J,r,J-r-  SSK.J/r/J0r0  SSK1J2r2  SSK3J4r4  \-Rj                  " \65      r7 " S S\Rp                  5      r9 " S S\Rp                  5      r:S r;\" S5      S<S j5       r<S\Rz                  S\>S \Rz                  4S! jr?S" r@ " S# S$\Rp                  5      rA " S% S&\A5      rB " S' S(\A5      rC\" S)5       " S* S+\Rp                  5      5       rD\A\B\CS,.rE " S- S.\5      rF\+ " S/ S0\&5      5       rG\+ " S1 S2\G5      5       rH\+ " S3 S4\G\5      5       rI " S5 S6\\G5      rJ " S7 S8\\G5      rK " S9 S:\\G5      rL/ S;QrMg)=    N)Callable)Optional)nn   )initialization)ACT2FN)CacheDynamicCacheStaticCache)GenerationMixin)use_kernel_forward_from_hubuse_kernel_func_from_hub)create_causal_mask)_flash_attention_forward!flash_attn_supports_top_left_mask)GenericForQuestionAnswering GenericForSequenceClassificationGenericForTokenClassificationGradientCheckpointingLayer)BaseModelOutputWithPastCausalLMOutputWithPast)ROPE_INIT_FUNCTIONSdynamic_rope_update)PreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuplelogging)maybe_autocastmerge_with_config_defaults)capture_outputs   )DiffLlamaConfigc                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )DiffLlamaMLP8   c                   > [         TU ]  5         Xl        UR                  U l        UR                  U l        [
        R                  " U R                  U R                  SS9U l        [
        R                  " U R                  U R                  SS9U l        [
        R                  " U R                  U R                  SS9U l	        [        UR                     U l        g NFbias)super__init__confighidden_sizeintermediate_sizer   Linear	gate_projup_proj	down_projr   
hidden_actact_fnselfr.   	__class__s     ځ/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/diffllama/modeling_diffllama.pyr-   DiffLlamaMLP.__init__9   s    !--!'!9!94#3#3T5K5KRWXyy!1!143I3IPUV4#9#94;K;KRWXV../    c                     U R                  U R                  U R                  U5      5      U R                  U5      -  5      nU$ N)r4   r6   r2   r3   )r8   xr4   s      r:   forwardDiffLlamaMLP.forwardC   s6    NN4;;t~~a/@#ADLLQRO#ST	r<   )r6   r.   r4   r2   r/   r0   r3   )__name__
__module____qualname____firstlineno__r-   r@   __static_attributes____classcell__r9   s   @r:   r&   r&   8   s    0 r<   r&   c                      ^  \ rS rSr% \R
                  \S'   SS\4U 4S jjjr\	   SS\S-  S\
S   S\S-  S	\S
\4   4S jj5       r\R                  " 5       \S 5       5       rSrU =r$ )DiffLlamaRotaryEmbeddingH   inv_freqNr.   c                   > [         TU ]  5         UR                  U l        UR                  U l        Xl        U R
                  R                  S   U l        U R                  nU R                  S:w  a  [        U R                     nU" U R
                  U5      u  o@l
        U R                  SUSS9  U R                  SUR                  5       SS9  g )N	rope_typedefaultrL   F)
persistentoriginal_inv_freq)r,   r-   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenr.   rope_parametersrN   compute_default_rope_parametersr   attention_scalingregister_bufferclone)r8   r.   devicerope_init_fnrL   r9   s        r:   r-   !DiffLlamaRotaryEmbedding.__init__K   s    "("@"@$*$B$B!44[A!%!E!E>>Y&.t~~>L+7V+L((ZeD0(..2BuUr<   rZ   ztorch.deviceseq_lenreturnztorch.Tensorc           	         U R                   S   n[        U SS5      =(       d    U R                  U R                  -  nSnSU[        R
                  " SUS[        R                  S9R                  U[        R                  S9U-  -  -  nXe4$ )	aH  
Computes the inverse frequencies according to the original RoPE implementation
Args:
    config ([`~transformers.PreTrainedConfig`]):
        The model configuration.
    device (`torch.device`):
        The device to use for initialization of the inverse frequencies.
    seq_len (`int`, *optional*):
        The current sequence length. Unused for this type of RoPE.
Returns:
    Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
    post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).

rope_thetahead_dimNg      ?r      dtype)rZ   rd   )	rU   getattrr/   num_attention_headstorcharangeint64tofloat)r.   rZ   r]   basedimattention_factorrL   s          r:   rV   8DiffLlamaRotaryEmbedding.compute_default_rope_parameters[   s    & %%l3fj$/c63E3EIcIc3c U\\!S!5;;?BB&X]XcXcBdgjjk
 ))r<   c                 L   U R                   S S S 2S 4   R                  5       R                  UR                  S   SS5      R	                  UR
                  5      nUS S 2S S S 24   R                  5       n[        UR
                  R                  [        5      (       a0  UR
                  R                  S:w  a  UR
                  R                  OSn[        USS9   UR                  5       UR                  5       -  R                  SS5      n[        R                  " Xf4SS	9nUR                  5       U R                  -  nUR                  5       U R                  -  n	S S S 5        WR	                  UR                   S
9W	R	                  UR                   S
94$ ! , (       d  f       N@= f)Nr   r#   mpscpuF)device_typeenabledrb   rm   rc   )rL   rk   expandshaperj   rZ   
isinstancetypestrr    	transposerg   catcosrW   sinrd   )
r8   r?   position_idsinv_freq_expandedposition_ids_expandedrt   freqsembr~   r   s
             r:   r@    DiffLlamaRotaryEmbedding.forwardy   sN    !MM$4-8>>@GGHZHZ[\H]_acdehhijiqiqr ,QaZ 8 > > @'1!((--'E'E!((--[`J`ahhmmfkUC&,,.1F1L1L1NNYYZ[]^_E))UN3C'')d444C'')d444C	 D vvAGGv$cff177f&;;; DCs   BF
F#)rW   r.   rS   rT   rN   r>   )NNN)rB   rC   rD   rE   rg   Tensor__annotations__r$   r-   staticmethodr   inttuplerk   rV   no_gradr   r@   rF   rG   rH   s   @r:   rJ   rJ   H   s    llV V V  )-+/"*$&*(* t* 
~u$	%	* *: ]]_<  <r<   rJ   c                     U SSU R                   S   S-  24   nU SU R                   S   S-  S24   n[        R                  " U* U4SS9$ )z*Rotates half the hidden dims of the input..Nrq   rb   rv   )rx   rg   r}   )r?   x1x2s      r:   rotate_halfr      sZ    	
3"!''"+"""	#B	
3q ""	#B99rc2YB''r<   rotary_pos_embc                     UR                  U5      nUR                  U5      nX-  [        U 5      U-  -   nX-  [        U5      U-  -   nXV4$ )aI  Applies Rotary Position Embedding to the query and key tensors.

Args:
    q (`torch.Tensor`): The query tensor.
    k (`torch.Tensor`): The key tensor.
    cos (`torch.Tensor`): The cosine part of the rotary embedding.
    sin (`torch.Tensor`): The sine part of the rotary embedding.
    unsqueeze_dim (`int`, *optional*, defaults to 1):
        The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
        sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
        that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
        k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
        cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
        the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
Returns:
    `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
)	unsqueezer   )qkr~   r   unsqueeze_dimq_embedk_embeds          r:   apply_rotary_pos_embr      sS    & --
&C
--
&Cw;q>C/0Gw;q>C/0Gr<   hidden_statesn_repr^   c                     U R                   u  p#pEUS:X  a  U $ U SS2SS2SSS2SS24   R                  X#XU5      n U R                  X#U-  XE5      $ )z
This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
r#   N)rx   rw   reshape)r   r   batchnum_key_value_headsslenra   s         r:   	repeat_kvr      s_    
 2?1D1D.Ez!!Qa"23::5W\dlmM  e(CTTTr<   c                 @    SS[         R                  " SU -  5      -  -
  $ )Ng?g333333?g333333ӿ)mathexp)	layer_idxs    r:   lambda_init_fnr      s     txxy 01111r<   c                   N  ^  \ rS rSrSrSS\S\S-  4U 4S jjjr    SS\R                  S\
\R                  \R                  4   S	\R                  S-  S
\R                  S-  S\S-  S\S\
\R                  \R                  S-  \
\R                     S-  4   4S jjrSrU =r$ )DiffLlamaAttention   z=Multi-headed attention from 'Attention Is All You Need' paperNr.   r   c                   > [         TU ]  5         Xl        X l        Uc-  [        R                  SU R                  R                   S35        UR                  U l        UR                  U l	        UR                  U l        [        USU R                  U R                  -  5      U l        UR                  U l        U R                  U R                  -  U l        UR                   U l        SU l        [$        R&                  " U R                  U R                  U R                  -  UR(                  S9U l        [$        R&                  " U R                  U R                  U R                  -  UR(                  S9U l        [$        R&                  " U R                  U R                  U R                  -  UR(                  S9U l        [$        R&                  " U R                  U R                  -  U R                  UR(                  S9U l        [3        U5      U l        [$        R6                  " [8        R:                  " SUR<                  U R                  4S95      U l        [$        R6                  " [8        R:                  " SUR<                  U R                  4S95      U l         [$        R6                  " [8        R:                  " SUR<                  U R                  4S95      U l!        [$        R6                  " [8        R:                  " SUR<                  U R                  4S95      U l"        [$        RF                  " SU R                  -  URH                  S	S
9U l%        g )NzInstantiating z without passing a `layer_idx` is not recommended and will lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` when creating this class.ra   Tr*   r   )sizerb   F)epselementwise_affine)&r,   r-   r.   r   loggerwarning_oncer9   rB   attention_dropoutr/   rf   	num_headsre   ra   r   num_key_value_groupsrR   	is_causalr   r1   attention_biasq_projk_projv_projo_projr   lambda_init	Parameterrg   normallambda_std_dev	lambda_q1	lambda_k1	lambda_q2	lambda_k2RMSNormrms_norm_eps	groupnormr8   r.   r   r9   s      r:   r-   DiffLlamaAttention.__init__   sm   " !8!8 9 :, , "(!9!9!--33
D4D4D4VW#)#=#= $(NNd6N6N$N!'-'E'E$ii 0 0$..4==2PW]WlWlmii 0 0$2J2JT]]2Zagavavwii 0 0$2J2JT]]2Zagavavwii >@P@PW]WlWlm))4ell1f6K6KSWS`S`Rb&cdell1f6K6KSWS`S`Rb&cdell1f6K6KSWS`S`Rb&cdell1f6K6KSWS`S`Rb&cdA$56;N;Nchir<   r   position_embeddingsattention_maskr   past_key_values	use_cacher^   c                    UR                  5       u  pn
U	nU R                  U5      nU R                  U5      nU R                  U5      nUR	                  XU R
                  U R                  5      R                  SS5      nUR	                  XU R                  U R                  5      R                  SS5      nUR	                  XU R                  U R                  5      R                  SS5      nUu  nn[        XUU5      u  pUb  UR                  XU R                  5      u  p[        XR                  5      n[        XR                  5      n[        R                  " [        R                   " USSS9SS9nUR#                  SSSS5      n[        R$                  " XR                  SS5      5      [&        R(                  " U R                  5      -  nUb  UU-   n[*        R,                  R/                  US[        R0                  S9R3                  UR4                  5      n[*        R,                  R7                  UU R8                  U R:                  S9n[        R<                  " [        R>                  " U R@                  U RB                  -  S[        R0                  S95      R3                  UR4                  5      n[        R<                  " [        R>                  " U RD                  U RF                  -  S[        R0                  S95      R3                  UR4                  5      nUU-
  U RH                  -   n[        R$                  " UU5      n[        R                   " USSS9u  nnUUU-  -
  nSU RH                  -
  U RK                  U5      -  nUR                  SS5      RM                  5       nURO                  XS5      nU RQ                  U5      nUU4$ )Nr#   rb   rv   rq   r   rm   rd   )ptraining))r   r   r   r   viewr   ra   r|   r   r   updater   r   r   rg   r}   chunkrepeatmatmulr   sqrtr   
functionalsoftmaxfloat32rj   rd   dropoutr   r   r   sumr   r   r   r   r   r   
contiguousr   r   )r8   r   r   r   r   r   r   kwargsbsz
target_len_q_lenquery_states
key_statesvalue_statesr~   r   attn_weightslambda_1lambda_2lambda_fullattn_outputattn_output1attn_output2s                           r:   r@   DiffLlamaAttention.forward   s1    +//1{{=1[[/
{{=1#((T^^T]]S]]^_abc__S1I1I4==Yccdeghi
#((T5M5Mt}}]gghiklm&S#7RUWZ#[ &'6'='=jX\XfXf'g$Jz+D+DE
 /H/HIyy\1!!D"M#**1aA6||L2F2Fq!2LMPTPYPYZ^ZgZgPhh%'.8L }},,\r,WZZ[g[m[mn}},,\T=S=S^b^k^k,l99UYYt~~'FBV[VcVcdehh
 99UYYt~~'FBV[VcVcdehh
 )D,<,<<ll<>%*[[aQ%G"l"[<%??4+++t~~k/JJ!++Aq1<<>!))#b9kk+.L((r<   )r   r.   r   ra   r/   r   r   r   r   r   r   r   r   rR   r   r   r   r   r   r   r>   NNNF)rB   rC   rD   rE   __doc__r$   r   r-   rg   r   r   
LongTensorr	   boolr@   rF   rG   rH   s   @r:   r   r      s    Gj j3: j jJ /304(,8)||8) #5<<#=>8) t+	8)
 &&-8) 8) 8) 
u||U\\D0%2E2LL	M8) 8)r<   r   c                      ^  \ rS rSrSrU 4S jr    SS\R                  S\\R                  \R                  4   S\R                  S-  S\R                  S-  S	\
S-  S
\S\\R                  S4   4S jjrSrU =r$ )DiffLlamaFlashAttention2i  a>  
DiffLlama flash attention module. This module inherits from `DiffLlamaAttention` as the weights of the module stays
untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
flash attention and deal with padding tokens in case the input contains any of them.
c                 D   > [         TU ]  " U0 UD6  [        5       U l        g r>   )r,   r-   r   _flash_attn_uses_top_left_mask)r8   argsr   r9   s      r:   r-   !DiffLlamaFlashAttention2.__init__   s#    $)&)
 /P.Q+r<   Nr   r   r   r   r   r   r^   c                 	   [        U[        5      (       a  [        S5      eUR                  5       u  pxn	U R	                  U5      n
U R                  U5      nU R                  U5      nU
R                  XxU R                  U R                  5      R                  SS5      n
UR                  XxU R                  U R                  5      R                  SS5      nUR                  XxU R                  U R                  5      R                  SS5      nUu  p[        XX5      u  pUb  UR                  XU R                  5      u  pU
R                  SS5      n
UR                  SS5      nUR                  SS5      nU R                  (       a  U R                   OSnU
R"                  nU
R$                  R&                  S:w  a  U
R$                  R&                  OSnU[(        R*                  :X  a  [(        R,                  " U5      (       a  [(        R.                  " U5      nOR[1        U R2                  S5      (       a  U R2                  R"                  nO U R                  R4                  R"                  n[6        R9                  SU S	35        U
R;                  U5      n
UR;                  U5      nUR;                  U5      n[(        R<                  " USSS
9u  nnUR?                  SSSS5      nUR?                  SSSS5      n[A        U
UUUUUU[C        U SS 5      U RD                  U RF                  S9
n[A        U
UUUUUU[C        U SS 5      U RD                  U RF                  S9
n[(        RH                  " UU/SS
9n[(        R<                  " USSS
9u  nn[(        RJ                  " [(        RL                  " U RN                  U RP                  -  S[(        R*                  S95      R;                  U
R"                  5      n[(        RJ                  " [(        RL                  " U RR                  U RT                  -  S[(        R*                  S95      R;                  U
R"                  5      nUU-
  U RV                  -   nUUU-  -
  nSU RV                  -
  U RY                  U5      -  nUR[                  XxS5      R]                  5       nU R_                  U5      nUS 4$ )Nz`static` cache implementation is not compatible with `attn_implementation==flash_attention_2` make sure to use `sdpa` in the mean time, and open an issue at https://github.com/huggingface/transformersr#   rb           rr   rs   _is_quantizedzThe input hidden states seems to be silently casted in float32, this might be related to the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in .rv   sliding_window)r   r   r   use_top_left_maskr   rq   r   )0ry   r   
ValueErrorr   r   r   r   r   r   ra   r|   r   r   r   r   r   r   rd   rZ   rz   rg   r   is_autocast_enabledget_autocast_dtypehasattrr.   weightr   r   rj   r   r   r   re   r   r   r}   r   r   r   r   r   r   r   r   r   r   r   )r8   r   r   r   r   r   r   r   r   r   r   r   r   r~   r   dropout_rateinput_dtypert   target_dtypevalue_states1value_states2r   r   r   r   r   r   s                              r:   r@    DiffLlamaFlashAttention2.forward(  s7    o{33} 
 &**,A{{=1[[/
{{=1
 $((T^^T]]S]]^_abc__S1I1I4==Yccdeghi
#((T5M5Mt}}]gghiklm&#7RU#[ &'6'='=jX\XfXf'g$J $--a3))!Q/
#--a315t--C #((2>2E2E2J2Je2Sl))..Y^%--'((55$77Do66#{{00#{{1177 >$ (??<8L#|4J'??<8L',{{<'J$}%,,Q1a8%,,Q1a8/% "4)94@"AAnn
 0% "4)94@"AAnn
 ii| <"E%*[[aQ%G"l99UYYt~~'FBV[VcVcdehh
 99UYYt~~'FBV[VcVcdehh
 )D,<,<<"[<%??4+++t~~k/JJ!))#b9DDFkk+.D  r<   )r   r   )rB   rC   rD   rE   r   r-   rg   r   r   r   r	   r   r@   rF   rG   rH   s   @r:   r   r     s    R 3704(,r!||r! #5<<#=>r! ((4/	r!
 &&-r! r! r! 
u||T!	"r! r!r<   r   c                   $   \ rS rSrSr    SS\R                  S\\R                  \R                  4   S\R                  S-  S\R                  S-  S\	S-  S	\
S
\\R                  \R                  S-  \\R                     S-  4   4S jjrSrg)DiffLlamaSdpaAttentioni  z
DiffLlama attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
`DiffLlamaAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to
SDPA API.
Nr   r   r   r   r   r   r^   c           	         UR                  5       u  pn
U R                  U5      nU R                  U5      nU R                  U5      nUR	                  XU R
                  U R                  5      R                  SS5      nUR	                  XU R                  U R                  5      R                  SS5      nUR	                  XU R                  U R                  5      R                  SS5      nUu  p[        XX5      u  pUb  UR                  XU R                  5      u  p[        XR                  5      n[        XR                  5      n[        R                  " [        R                   " USSS9SS9nUR#                  SSSS5      nUnUb  US S 2S S 2S S 2S UR$                  S   24   nUS L =(       a    U	S:  n[        R&                  R(                  R+                  UUUUU R,                  (       a  U R.                  OSUS9n[        R                   " USSS9u  nn[        R0                  " [        R2                  " U R4                  U R6                  -  S[        R8                  S95      R;                  UR<                  5      n[        R0                  " [        R2                  " U R>                  U R@                  -  S[        R8                  S95      R;                  UR<                  5      nUU-
  U RB                  -   nUUU-  -
  nSU RB                  -
  U RE                  U5      -  nUR                  SS5      RG                  5       nUR	                  XS5      nU RI                  U5      nUS 4$ )	Nr#   rb   rv   rq   r   )	attn_mask	dropout_pr   r   )%r   r   r   r   r   r   ra   r|   r   r   r   r   r   r   rg   r}   r   r   rx   r   r   scaled_dot_product_attentionr   r   r   r   r   r   r   rj   rd   r   r   r   r   r   r   )r8   r   r   r   r   r   r   r   r   r   r   r   r   r   r~   r   causal_maskr   r   r   r   r   r   r   s                           r:   r@   DiffLlamaSdpaAttention.forward  s    &**,A{{=1[[/
{{=1#((T^^T]]S]]^_abc__S1I1I4==Yccdeghi
#((T5M5Mt}}]gghiklm&#7RU#[ &'6'='=jX\XfXf'g$Jz+D+DE
 /H/HIyy\1!!D"M#**1aA6$%%aA/E1A1A"1E/E&EFK  4'5EAI	hh))FF!04d,,3 G 
 &+[[aQ%G"l99UYYt~~'FBV[VcVcdehh
 99UYYt~~'FBV[VcVcdehh
 )D,<,<<"[<%??4+++t~~k/JJ!++Aq1<<>!&&s26kk+.D  r<    r   )rB   rC   rD   rE   r   rg   r   r   r   r	   r   r@   rF   r  r<   r:   r  r    s     /304(,?!||?! #5<<#=>?! t+	?!
 &&-?! ?! ?! 
u||U\\D0%2E2LL	M?! ?!r<   r  r   c                   x   ^  \ rS rSrS
S\SS4U 4S jjjrS\R                  S\R                  4S jrS r	S	r
U =r$ )DiffLlamaRMSNormi  r   r^   Nc                    > [         TU ]  5         [        R                  " [        R
                  " U5      5      U l        X l        g)z/
DiffLlamaRMSNorm is equivalent to T5LayerNorm
N)r,   r-   r   r   rg   onesr   variance_epsilon)r8   r/   r   r9   s      r:   r-   DiffLlamaRMSNorm.__init__  s/     	ll5::k#:; #r<   r   c                    UR                   nUR                  [        R                  5      nUR	                  S5      R                  SSS9nU[        R                  " X0R                  -   5      -  nU R                  UR                  U5      -  $ )Nrb   rq   T)keepdim)	rd   rj   rg   r   powmeanrsqrtr  r   )r8   r   r   variances       r:   r@   DiffLlamaRMSNorm.forward  sw    #))%((7 $$Q',,R,>%H?T?T4T(UU{{]--k:::r<   c                 ^    [        U R                  R                  5       SU R                   3$ )Nz, eps=)r   r   rx   r  )r8   s    r:   
extra_reprDiffLlamaRMSNorm.extra_repr  s*    ))*+6$2G2G1HIIr<   )r  r   )gư>)rB   rC   rD   rE   rk   r-   rg   r   r@   r  rF   rG   rH   s   @r:   r  r    sB    $ $$ $ $;U\\ ;ell ;J Jr<   r  )eagerflash_attention_2sdpac                     ^  \ rS rSrS\S\4U 4S jjr     SS\R                  S\R                  S-  S\R                  S-  S	\
S-  S
\S-  S\\R                  \R                  4   S-  S\\   S\R                  4S jjrSrU =r$ )DiffLlamaDecoderLayeri  r.   r   c                 (  > [         TU ]  5         UR                  U l        [        UR                     " XS9U l        [        U5      U l        [        UR                  UR                  S9U l
        [        UR                  UR                  S9U l        g )N)r.   r   r   )r,   r-   r/   DIFFLLAMA_ATTENTION_CLASSES_attn_implementation	self_attnr&   mlpr  r   input_layernormpost_attention_layernormr   s      r:   r-   DiffLlamaDecoderLayer.__init__  sw    !--4V5P5PQY_u'/0B0BH[H[\(89K9KQWQdQd(e%r<   Nr   r   r   r   r   r   r   r^   c           
          UnU R                  U5      nU R                  " SUUUUUUS.UD6u  pX-   nUnU R                  U5      nU R                  U5      nX-   nU$ )N)r   r   r   r   r   r   r  )r)  r'  r*  r(  )
r8   r   r   r   r   r   r   r   residualr   s
             r:   r@   DiffLlamaDecoderLayer.forward  s     !,,];>> 
')%+ 3
 
 !0 !55mD/ 0r<   )r/   r)  r(  r*  r'  )NNNFN)rB   rC   rD   rE   r$   r   r-   rg   r   r   r	   r   r   r   r   r@   rF   rG   rH   s   @r:   r"  r"    s    f f3 f /304(,!&HL|| t+ &&-	
  $; #5<<#=>E +, 
 r<   r"  c                      ^  \ rS rSr% \\S'   SrSrS/rS/r	Sr
SrSrSrSr\\S.r\R&                  " 5       U 4S	 j5       rS
rU =r$ )DiffLlamaPreTrainedModeli.  r.   modelTr"  r   F)r   
attentionsc                    > [         TU ]  U5        [        U[        5      (       a  [        R
                  " UR                  SU R                  R                  5        [        R
                  " UR                  SU R                  R                  5        [        R
                  " UR                  SU R                  R                  5        [        R
                  " UR                  SU R                  R                  5        g g )Nr   )r,   _init_weightsry   r   initnormal_r   r.   r   r   r   r   )r8   moduler9   s     r:   r4  &DiffLlamaPreTrainedModel._init_weights@  s    f%f011LL))1dkk.H.HILL))1dkk.H.HILL))1dkk.H.HILL))1dkk.H.HI	 2r<   r  )rB   rC   rD   rE   r$   r   base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraph_supports_attention_backendr"  r   _can_record_outputsrg   r   r4  rF   rG   rH   s   @r:   r0  r0  .  sn    &*#01#4"5N!"'.(
 ]]_J Jr<   r0  c                     ^  \ rS rSrS\4U 4S jjr\\\      SS\	R                  S-  S\	R                  S-  S\	R                  S-  S\S-  S	\	R                  S-  S
\S-  S\\   S\4S jj5       5       5       rSrU =r$ )DiffLlamaModeliJ  r.   c           	        > [         TU ]  U5        UR                  U l        UR                  U l        [
        R                  " UR                  UR                  U R                  5      U l        [
        R                  " [        UR                  5       Vs/ s H  n[        X5      PM     sn5      U l        [        UR                  UR                  S9U l        [#        US9U l        SU l        U R)                  5         g s  snf )Nr$  r.   F)r,   r-   pad_token_idpadding_idx
vocab_sizer   	Embeddingr/   embed_tokens
ModuleListrangenum_hidden_layersr"  layersr  r   normrJ   
rotary_embgradient_checkpointing	post_initr   s      r:   r-   DiffLlamaModel.__init__L  s     !.. ++LL):):F<N<NPTP`P`ammGLVMeMeGfgGf)"65Gfg
 %V%7%7V=P=PQ	2&A&+# 	 hs   C?N	input_idsr   r   r   inputs_embedsr   r   r^   c           
      >   US L US L-  (       a  [        S5      eUc  U R                  U5      nU(       a  Uc  [        U R                  S9nUcU  Ub  UR	                  5       OSn[
        R                  " UR                  S   UR                  S9U-   nUR                  S5      n[        U R                  UUUUS9n	Un
U R                  XS9nU R                  S U R                  R                    H  nU" U
4U	UUUUS.UD6n
M     U R                  U
5      n
[        U
US	9$ )
Nz:You must specify exactly one of input_ids or inputs_embedsrF  r   r#   )rZ   )r.   rV  r   r   r   )r   )r   r   r   r   r   )last_hidden_stater   )r   rK  r
   r.   get_seq_lengthrg   rh   rx   rZ   r   r   rQ  rO  rN  rP  r   )r8   rU  r   r   r   rV  r   r   past_seen_tokensr  r   r   decoder_layers                r:   r@   DiffLlamaModel.forward\  sF    -t";<YZZ *.*;*;I*FM0*$++>OCRC^==?de <<(;(;A(>}G[G[\_ooL'11!4L(;;')+%
 &"oomoW![[)H4;;+H+HIM)*$7) /# M J 		-0&++
 	
r<   )rK  rR  rO  rP  rH  rQ  rI  )NNNNNN)rB   rC   rD   rE   r$   r-   r!   r"   r   rg   r   r   r	   FloatTensorr   r   r   r   r@   rF   rG   rH   s   @r:   rD  rD  J  s         .2.204(,26!%2
##d*2
 t+2
 &&-	2

 2
 ((4/2
 $;2
 +,2
 
!2
    2
r<   rD  c                   P  ^  \ rS rSrSS0rSS0rSS/S/40rU 4S jr\\	        SS
\
R                  S	-  S\
R                  S	-  S\
R                  S	-  S\S	-  S\
R                  S	-  S\
R                  S	-  S\S	-  S\\
R                  -  S\\   S\4S jj5       5       rSrU =r$ )DiffLlamaForCausalLMi  zlm_head.weightzmodel.embed_tokens.weightlm_headcolwise_gather_outputr   logitsc                    > [         TU ]  U5        [        U5      U l        UR                  U l        [
        R                  " UR                  UR                  SS9U l        U R                  5         g r)   )
r,   r-   rD  r1  rI  r   r1   r/   r`  rS  r7   s     r:   r-   DiffLlamaForCausalLM.__init__  sU     #F+
 ++yy!3!3V5F5FUS 	r<   NrU  r   r   r   rV  labelsr   logits_to_keepr   r^   c	           
      |   U R                   " SUUUUUUS.U	D6n
U
R                  n[        U[        5      (       a  [	        U* S5      OUnU R                  USS2USS24   5      nSnUb)  U R                  " SXU R                  R                  S.U	D6n[        UUU
R                  U
R                  U
R                  S9$ )a,  
Example:

```python
>>> from transformers import AutoTokenizer, DiffLlamaForCausalLM

>>> model = DiffLlamaForCausalLM.from_pretrained("google/diffllama-7b")
>>> tokenizer = AutoTokenizer.from_pretrained("google/diffllama-7b")

>>> prompt = "What is your favorite condiment?"
>>> inputs = tokenizer(prompt, return_tensors="pt")

>>> # Generate
>>> generate_ids = model.generate(inputs.input_ids, max_length=30)
>>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
"What is your favorite condiment?"
```)rU  r   r   r   rV  r   N)rb  re  rI  )lossrb  r   r   r2  r  )r1  rX  ry   r   slicer`  loss_functionr.   rI  r   r   r   r2  )r8   rU  r   r   r   rV  re  r   rf  r   outputsr   slice_indicesrb  rh  s                  r:   r@   DiffLlamaForCausalLM.forward  s    > ,0:: ,
)%+',
 ,
  118B>SV8W8W~ot4]kmA}a,?@A%%pVt{{OeOepiopD%#33!//))
 	
r<   )r`  r1  rI  )NNNNNNNr   )rB   rC   rD   rE   _tied_weights_keys_tp_plan_pp_planr-   r   r   rg   r   r   r	   r]  r   r   r   r   r   r@   rF   rG   rH   s   @r:   r_  r_    s   *,GH23H_-z:;H  .2.204(,26*.!%-.6
##d*6
 t+6
 &&-	6

 6
 ((4/6
   4'6
 $;6
 ell*6
 +,6
 
 6
  6
r<   r_  c                       \ rS rSrSrg)"DiffLlamaForSequenceClassificationi  r  NrB   rC   rD   rE   rF   r  r<   r:   rr  rr        r<   rr  c                       \ rS rSrSrSrg)DiffLlamaForQuestionAnsweringi  transformerr  N)rB   rC   rD   rE   r9  rF   r  r<   r:   rv  rv    s    %r<   rv  c                       \ rS rSrSrg)DiffLlamaForTokenClassificationi  r  Nrs  r  r<   r:   ry  ry    rt  r<   ry  )r0  rD  r_  rr  rv  ry  )r#   )Nr   collections.abcr   typingr   rg   r    r   r5  activationsr   cache_utilsr	   r
   r   
generationr   integrationsr   r   masking_utilsr   modeling_flash_attention_utilsr   r   modeling_layersr   r   r   r   modeling_outputsr   r   modeling_rope_utilsr   r   modeling_utilsr   processing_utilsr   utilsr   r   r   r   utils.genericr    r!   utils.output_capturingr"   configuration_diffllamar$   
get_loggerrB   r   Moduler&   rJ   r   r   r   r   r   r   r   r   r  r  r%  r"  r0  rD  r_  rr  rv  ry  __all__r  r<   r:   <module>r     s  .  $    & ! ; ; ) Q / i  P K - & R R G 5 4 
		H	%299  ><ryy ><B( *+ ,2	UU\\ 	U# 	U%,, 	U2\) \)~A!1 A!HG!/ G!T Y'Jryy J (J*  1" (6 (V J J J6 F
- F
 F
R F
3_ F
 F
R	)IKc 	&$?AY &	&CE] 	r<   