
    Z jB9                        S r SSKJrJr  SSKrSSKJr  SSKJrJ	r	  SSK
JrJrJrJr  \" S5      r\" 5       (       a   SS	KJr  SS
KJrJrJr  \(       a  SSKJr  OSr\	R.                  " \5      r " S S5      rS\S\\\\S   -  4   4S jr S%S\R>                  S\R>                  S\R>                  S\R>                  \ \R>                  \R>                  4   -  4S jjr!\R>                  \"-  r#     S&S\R>                  S\"S-  S\ \#\#4   S-  S\S-  SS4
S jjr$S\R>                  S\"S\R>                  4S jr%   S'S\RL                  RN                  S\R>                  S\R>                  S\R>                  S \\R>                  S4   S!\(S-  S"\(S-  S#\R>                  S-  S\ \R>                  \R>                  S-  4   4S$ jjr)g)(a7  
Partially inspired by torchtune's flex attention implementation

Citation:
@software{torchtune,
  title = {torchtune: PyTorch's finetuning library},
  author = {torchtune maintainers and contributors},
  url = {https//github.com/pytorch/torchtune},
  license = {BSD-3-Clause},
  month = apr,
  year = {2024}
}
    )OptionalUnionN)version   )is_torch_flex_attn_availablelogging)get_torch_versionis_torch_greater_or_equalis_torch_less_or_equalis_torchdynamo_compilingz2.9.0)_DEFAULT_SPARSE_BLOCK_SIZE)	BlockMaskcreate_block_maskflex_attention)
AuxRequestc                   |   ^  \ rS rSrSrSrSrSrU 4S jr\	R                  R                  SS9S 5       rS rS	rU =r$ )
WrappedFlexAttention;   z`
We are doing a singleton class so that flex attention is compiled once when it's first called.
NFc                 ^   > U R                   c  [        TU ]	  U 5      U l         U R                   $ N)	_instancesuper__new__)clsargskwargs	__class__s      y/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/integrations/flex_attention.pyr   WrappedFlexAttention.__new__D   s'    == !GOC0CM}}    )	recursivec                    U R                   (       a  XR                  :w  a  Xl        [        S5      (       a  [        R                  " [
        SS9U l        Or[        R                  " [        5       5      R                  S:X  a'  U(       a   [        R                  " [
        SSS9U l        O[        R                  " [
        5      U l        SU l         gg)	z.
Initialize or update the singleton instance.
2.5.1F)dynamicz2.6.0zmax-autotune-no-cudagraphs)r$   modeTN)_is_flex_compiledtrainingr   torchcompiler   _compiled_flex_attentionr   parser	   base_version)selfr'   s     r   __init__WrappedFlexAttention.__init__J   s    
 %%]])B$M%g..05nV[0\- 023@@GKPX05"E8T1-
 16n0M-%)D" *Cr    c                     U R                   $ r   )r*   )r-   s    r   __call__WrappedFlexAttention.__call__`   s    ,,,r    )r*   r&   r'   )__name__
__module____qualname____firstlineno____doc__r   r&   r*   r   r(   compilerdisabler.   r1   __static_attributes____classcell__)r   s   @r   r   r   ;   sP     I# ^^e,* -**- -r    r   
return_lsereturnr   c                 J    [         (       a  SU (       a
  [        SS90$ S0$ SU 0$ )aA  
Requests the LSE from flex_attention in a version-agnostic fashion.

Before torch 2.9, the LSE was requested via the boolean return_lse field. However, starting with
torch 2.9, an AuxRequest object must be passed via the aux_request field. This method conditionally
returns the correct form based on the python version.

return_auxT)lseNr<   )_TORCH_FLEX_USE_AUXr   )r<   s    r   get_flex_attention_lse_kwargsrB   d   s/     jjT2KKdKK*%%r    querykeyvaluec                 b    [        5       (       d  [        U5      " 5       O[        nU" U UU40 UD6$ r   )r   r   r   )rC   rD   rE   r'   r   flex_attention_compileds         r   compile_friendly_flex_attentionrH   r   s@     G_F`F`28<>ft" 	 r    attention_mask_2dattention_chunk_sizeoffsets	is_causalr   c                 h  ^ ^^^^^^ T R                   u  pgU(       d  UnU(       d  UnU[        -  S-   [        -  n[        R                  R                  R                  T SSX-
  4S9m T R                  n	T R                  5       mUb4  TR                  5       R                  S5      R                  S5      S-
  U-  mU U4S jmUU4S jn
U U4S jnU(       d  UmOUc  TOU
mUb1  US   R                  U	5      mUS   R                  U	5      mUUU4S	 jnOTn[        UUSUUU	[        S
5      (       + S9$ )a  
IMPORTANT NOTICE: This function is deprecated in favor of using the mask primitives in `masking_utils.py`,
and will be removed in a future version without warnings. New code should not use it. It is only kept here
for BC for now, while models using it are being patched accordingly.

Create a block (causal) document mask for a batch of sequences, both packed and unpacked.
Create Block (causal) logic and passing it into :func:`torch.nn.attention.flex_attention.create_block_mask`.
The resultant BlockMask is a compressed representation of the full (causal) block
mask. BlockMask is essential for performant computation of flex attention.
See: https://pytorch.org/blog/flexattention/

Args:
    attention_mask_2d (torch.Tensor): Attention mask for packed and padded sequences
    of shape (batch_size, total_seq_len). e.g.

    For unpacked sequence:
    [[1, 1, 1, 1, 0, 0, 0],
     [1, 1, 1, 1, 1, 0, 0]]

    For packed sequence:
    [[1, 1, 1, 2, 2, 2, 0],
     [1, 1, 2, 2, 2, 3, 3]]

Returns:
    BlockMask
   r   )rE   padNc                 J   > X#:  nT	X4   T	X4   :H  nTX4   S:  nXF-  U-  nU$ )z
Defines the logic of a block causal mask by combining both a standard causal mask
and a block diagonal document mask.
See :func:`~torchtune.modules.attention_utils.create_block_causal_mask`
for an illustration.
r    )
	batch_idxhead_idxq_idxkv_idxcausal_maskdocument_maskpadding_mask
final_maskrI   document_idss
           r   causal_mask_mod4make_flex_block_causal_mask.<locals>.causal_mask_mod   sK     o$Y%56,yGX:YY()9:Q> /-?
r    c                 8   > TX4   TX4   :H  nT" XX#5      nXE-  $ )zE
Combines the chunk mask with the causal mask for chunked attention.
rR   )rS   rT   rU   rV   
chunk_maskcausal_doc_maskr\   
chunk_idxss         r   chunk_causal_mask_mod:make_flex_block_causal_mask.<locals>.chunk_causal_mask_mod   s4      	 01Z	@Q5RR
))uM++r    c                 <   > TX4   TX4   :H  nTX4   S:  nXT-  nU$ )zX
Utilizes default attention mask to enable encoder and encoder-decoder
attention masks.
r   rR   )	rS   rT   rU   rV   rX   rY   rZ   rI   r[   s	          r   default_mask_mod5make_flex_block_causal_mask.<locals>.default_mask_mod   s?    
 %Y%56,yGX:YY():;a?!1
r    c                 *   > UT-   nUT-   nT" XXE5      $ r   rR   )	rS   rT   rU   rV   offset_q	offset_kv	kv_offsetmask_mod_maybe_combinedq_offsets	         r   mask_mod-make_flex_block_causal_mask.<locals>.mask_mod   s$    x'H*I*9TTr    r#   )rm   BHQ_LENKV_LENdevice_compile)shapeflex_default_block_sizer(   nn
functionalrO   rs   clonefill_cumsumtor   r   )rI   rJ   query_length
key_lengthrK   rL   
batch_sizetotal_seq_lenpad_lenrs   rb   re   rm   r\   ra   r[   rj   rk   rl   s   `            @@@@@@r   make_flex_block_causal_maskr      sD   D !2 7 7J"
$55:>UUG++//0AQRT[ThPi/j%%F$**,L'"((*003::2>BH\]
,	 "25I5Q/Wl1:==(AJMM&)		U 	U
 +

+G44	 	r    hidden_statesn_repc                     U R                   u  p#pEUS:X  a  U $ U SS2SS2SSS2SS24   R                  X#XU5      n U R                  X#U-  XE5      $ )z
This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
rN   N)ru   expandreshape)r   r   batchnum_key_value_headsslenhead_dims         r   	repeat_kvr      s_    
 2?1D1D.Ez!!Qa"23::5W\dlmM  e(CTTTr    moduleattention_maskscalingsoftcaps_auxc           
        ^^ UR                  SS5      S:  a  [        S5      eS n	S m[        U[        5      (       a  Un	OUmTb  TS S 2S S 2S S 2S UR                  S   24   mUU4S jn
SnUR                  S   nXS-
  -  S:w  aR  [        X!R                  S   UR                  S   -  5      n[        X1R                  S   UR                  S   -  5      nS	nUR                  S
5      nUR                  R                  S:g  nU(       d  Ub  [        S5      e[        UUU4U
U	UUUU R                  S.[        U5      D6nU(       a  [        (       a  Uu  nnUR                  nOUu  nnUR                  UR                  5      nUb  UR                  u  nnnnUR                  SSSS5      R!                  UUUS5      nUR#                  S5      n[$        R&                  " [$        R(                  " UU/SS9SSS9n[$        R*                  " UU-
  5      nUU-  nUR                  UR                  5      nOUnS nUR-                  SS5      R/                  5       nUU4$ )Ndropoutg        r   z`flex_attention` does not support `dropout`. Please use it with inference only (`model.eval()`) or turn off the attention dropout in the respective config.c                 n   > Tb  T[         R                  " U T-  5      -  n Tb  U TU   S   U   U   -   n U $ )Nr   )r(   tanh)scorerS   rT   rU   rV   
score_maskr   s        r   	score_mod)flex_attention_forward.<locals>.score_mod!  sK    ejj99E!Jy1!4U;FCCE r    TrN   Fkernel_optionscpuzhAttention sinks cannot be run on CPU with flex attention. Please switch to a different device, e.g. CUDA)r   
block_mask
enable_gqascaler   r'   rP   )dim)r   keepdimr   )get
ValueError
isinstancer   ru   r   rs   typerH   r'   rB   rA   r@   r|   dtypeviewr   	unsqueezer(   	logsumexpcatexp	transpose
contiguous)r   rC   rD   rE   r   r   r   r   r   r   r   r   num_local_query_headsr   r<   flex_attention_outputattention_outputauxr@   r   	num_heads	seq_len_q_sinkslse_expandedcombined_lserenorm_factorr   s         `                    @r   flex_attention_forwardr     s~    zz)S!A%a
 	

 JJ.),,#
#
1a399R= 89
 J!KKN 	!:;A[[^syy|;<%Q5;;q>!AB
ZZ 01N""e+J%+v
 	
 < %  (

3   $9!c''C$9!c ffU[[!2B2H2H/J	9aJJq"a+22:y)UVWE
 ==,L ??599lE5JPR+SY[eijL "IIl\&ABM/-?/225;;?0'11!Q7BBDS  r    )F)NNNNT)NNN)*r7   typingr   r   r(   	packagingr   utilsr   r   utils.import_utilsr	   r
   r   r   rA   !torch.nn.attention.flex_attentionr   rv   r   r   r   r   
get_loggerr3   loggerr   booldictstrrB   TensortuplerH   intOffsetr   r   rw   Modulefloatr   rR   r    r   <module>r      s=  8 #   9  08   !!g^^@
 
		H	%&- &-R&d &tCQ]H^A^<^7_ &$ 	<<	 << \\E%,,455$ 
	 (,,0!o||o*o
 66>"T)o d{o od	UU\\ 	U# 	U%,, 	U$ ! !%g!HHOOg!<<g! 
g! <<	g!
 %,,34g! T\g! T\g! <<$g! 5<<,,-g!r    