
    Z jSU                     t   S r SSKrSSKJr  SSKJr  SSKrSSKJrJ	r	  Sr
\ " S S	5      5       r SS
\R                  S-  S\R                  \-  \-  S\R                  S\S\S-  4
S jjr SS
\R                  S-  S\R                  \-  \-  S\R                  S\S\S-  4
S jjrSS\R                  S\R&                  S\S-  4S jjrSS\R                  S\R&                  S\S-  4S jjr  SS\R                  \-  \-  S\R&                  S\R,                  S\S\S-  S\R                  S-  4S jjrg)a  
IMPORTANT NOTICE: Every class and function in this file is deprecated in favor of using the much more general
`masking_utils.py` primitives. New code should not rely on it, it is only kept for backward compatibility for now,
and will be removed in the future.
    N)	dataclass)Union   )is_torchdynamo_compiling
is_tracingzThe attention mask API under `transformers.modeling_attn_mask_utils` (`AttentionMaskConverter`) is deprecated and will be removed in Transformers v5.10. Please use the new API in `transformers.masking_utils`.c                      \ rS rSr% Sr\\S'   \\S'   SS\S\S-  4S jjr S S\S\S	\S
\	R                  S\\	R                  S4   S\	R                  S-  4S jjr SS\	R                  S\S
\	R                  S	\S-  S\	R                  4
S jjr\  S!S\	R"                  S
\	R                  S\	R                  S\S\S-  4
S jj5       r\SS\	R                  S
\	R                  S\S-  4S jj5       r\S\	R(                  S\4S j5       r\  S"S\	R                  S-  S\	R                  S\S\S-  S\S\4S jj5       rSrg)#AttentionMaskConverter#   a  
    """
    A utility attention mask class that allows one to:
        - Create a causal 4d mask
        - Create a causal 4d mask with a sliding window
        - Convert a 2d attention mask (batch_size, query_length) to a 4d attention mask (batch_size, 1, query_length,
          key_value_length) that can be multiplied with attention scores

    Examples:

    ```python
    >>> import torch
    >>> from transformers.modeling_attn_mask_utils import AttentionMaskConverter

    >>> converter = AttentionMaskConverter(True)
    >>> converter.to_4d(torch.tensor([[0, 0, 0, 1, 1]]), 5, key_value_length=5, dtype=torch.float32)
    tensor([[[[-3.4028e+38, -3.4028e+38, -3.4028e+38, -3.4028e+38, -3.4028e+38],
            [-3.4028e+38, -3.4028e+38, -3.4028e+38, -3.4028e+38, -3.4028e+38],
            [-3.4028e+38, -3.4028e+38, -3.4028e+38, -3.4028e+38, -3.4028e+38],
            [-3.4028e+38, -3.4028e+38, -3.4028e+38,  0.0000e+00, -3.4028e+38],
            [-3.4028e+38, -3.4028e+38, -3.4028e+38,  0.0000e+00,  0.0000e+00]]]])
    ```

    Parameters:
        is_causal (`bool`):
            Whether the attention mask should be a uni-directional (causal) or bi-directional mask.

        sliding_window (`int`, *optional*):
            Sliding-window masks are created if `sliding_window` is set to a strictly positive integer.
    """
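
    # Illustrative sliding-window sketch (arbitrary example numbers): with `sliding_window=3`, each
    # query position attends to itself and to at most the 3 most recent previous positions; anything
    # older is filled with `torch.finfo(dtype).min` by `_make_causal_mask` below.
    #
    #     converter = AttentionMaskConverter(is_causal=True, sliding_window=3)
    #     banded = converter.to_causal_4d(
    #         batch_size=1, query_length=6, key_value_length=6, dtype=torch.float32
    #     )
    #     # banded.shape == (1, 1, 6, 6)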

    is_causal: bool
    sliding_window: int | None

    def __init__(self, is_causal: bool, sliding_window: int | None = None):
        warnings.warn(DEPRECATION_MESSAGE, FutureWarning)
        self.is_causal = is_causal
        self.sliding_window = sliding_window

        if self.sliding_window is not None and self.sliding_window <= 0:
            raise ValueError(
                "Make sure that when passing `sliding_window` that its value is a strictly positive integer, "
                f"not `{self.sliding_window}`"
            )

    def to_causal_4d(
        self,
        batch_size: int,
        query_length: int,
        key_value_length: int,
        dtype: torch.dtype,
        device: Union[torch.device, "str"] = "cpu",
    ) -> torch.Tensor | None:
        """
        Creates a causal 4D mask of (bsz, head_dim=1, query_length, key_value_length) shape and adds large negative
        bias to upper right hand triangular matrix (causal mask).
        """
        if not self.is_causal:
            raise ValueError(f"Please use `to_causal_4d` only if {self.__class__} has `is_causal` set to True.")

        input_shape = (batch_size, query_length)
        past_key_values_length = key_value_length - query_length

        # create the causal mask: [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
        causal_4d_mask = None
        if input_shape[-1] > 1 or self.sliding_window is not None:
            causal_4d_mask = self._make_causal_mask(
                input_shape,
                dtype,
                device=device,
                past_key_values_length=past_key_values_length,
                sliding_window=self.sliding_window,
            )

        return causal_4d_mask

    def to_4d(
        self,
        attention_mask_2d: torch.Tensor,
        query_length: int,
        dtype: torch.dtype,
        key_value_length: int | None = None,
    ) -> torch.Tensor:
        """
        Converts 2D attention mask to 4D attention mask by expanding mask to (bsz, head_dim=1, query_length,
        key_value_length) shape and by adding a large negative bias to not-attended positions. If attention_mask is
        causal, a causal mask will be added.
        """
        input_shape = (attention_mask_2d.shape[0], query_length)

        # create the causal part of the mask if needed
        causal_4d_mask = None
        if (input_shape[-1] > 1 or self.sliding_window is not None) and self.is_causal:
            if key_value_length is None:
                raise ValueError(
                    "This attention mask converter is causal. Make sure to pass `key_value_length` to correctly "
                    "create a causal mask."
                )

            past_key_values_length = key_value_length - query_length
            causal_4d_mask = self._make_causal_mask(
                input_shape,
                dtype,
                device=attention_mask_2d.device,
                past_key_values_length=past_key_values_length,
                sliding_window=self.sliding_window,
            )
        elif self.sliding_window is not None:
            raise NotImplementedError("Sliding window is currently only implemented for causal masking")

        # expand the 2D padding mask: [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
        expanded_attn_mask = self._expand_mask(attention_mask_2d, dtype, tgt_len=input_shape[-1]).to(
            attention_mask_2d.device
        )

        if causal_4d_mask is not None:
            expanded_attn_mask = causal_4d_mask.masked_fill(expanded_attn_mask.bool(), torch.finfo(dtype).min)

        # expanded_attn_mask + causal_4d_mask can cause some overflow
        expanded_4d_mask = expanded_attn_mask

        return expanded_4d_mask

    @staticmethod
    def _make_causal_mask(
        input_ids_shape: torch.Size,
        dtype: torch.dtype,
        device: torch.device,
        past_key_values_length: int = 0,
        sliding_window: int | None = None,
    ):
        """
        Make causal mask used for bi-directional self-attention.
        """
        warnings.warn(DEPRECATION_MESSAGE, FutureWarning)
        bsz, tgt_len = input_ids_shape
        mask = torch.full((tgt_len, tgt_len), torch.finfo(dtype).min, device=device)
        mask_cond = torch.arange(mask.size(-1), device=device)
        mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
        mask = mask.to(dtype)

        if past_key_values_length > 0:
            mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1)

        # add a lower-triangular sliding-window mask if necessary
        if sliding_window is not None:
            diagonal = past_key_values_length - sliding_window - 1

            context_mask = torch.tril(torch.ones_like(mask, dtype=torch.bool), diagonal=diagonal)
            # clone before the in-place fill when compiling, as mutation of copied tensors is disallowed there
            if is_torchdynamo_compiling():
                mask = mask.clone()
            mask.masked_fill_(context_mask, torch.finfo(dtype).min)

        return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)

    @staticmethod
    def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: int | None = None):
        """
        Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
        """
        warnings.warn(DEPRECATION_MESSAGE, FutureWarning)
        bsz, src_len = mask.size()
        tgt_len = tgt_len if tgt_len is not None else src_len

        expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
        inverted_mask = 1.0 - expanded_mask

        return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)

    @staticmethod
    def _unmask_unattended(expanded_mask: torch.FloatTensor, min_dtype: float):
        """
        Attend to all tokens in masked rows from the expanded attention mask, for example the relevant first rows when
        using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
        Details: https://github.com/pytorch/pytorch/issues/110213

        `expanded_mask` is [bsz, num_masks, tgt_seq_len, src_seq_len] or [bsz, tgt_seq_len, src_seq_len].
        `attention_mask` is [bsz, src_seq_len].

        The dimension num_masks of `expanded_mask` is most often 1, but it can also be the number of heads in the case
        of alibi attention bias.

        For example, if `expanded_mask` is (e.g. here left-padding case)
        ```
        [[[[0, 0, 0],
           [0, 0, 0],
           [0, 0, 1]]],
         [[[1, 0, 0],
           [1, 1, 0],
           [1, 1, 1]]],
         [[[0, 0, 0],
           [0, 1, 0],
           [0, 1, 1]]]]
        ```
        then the modified `expanded_mask` will be
        ```
        [[[[1, 1, 1],   <-- modified
           [1, 1, 1],   <-- modified
           [0, 0, 1]]],
         [[[1, 0, 0],
           [1, 1, 0],
           [1, 1, 1]]],
         [[[1, 1, 1],   <-- modified
           [0, 1, 0],
           [0, 1, 1]]]]
        ```
        """
        warnings.warn(DEPRECATION_MESSAGE, FutureWarning)
        if expanded_mask.dtype == torch.bool:
            raise ValueError(
                "AttentionMaskConverter._unmask_unattended expects a float `expanded_mask`, got a BoolTensor."
            )

        return expanded_mask.mul(~torch.all(expanded_mask == min_dtype, dim=-1, keepdim=True))

    @staticmethod
    def _ignore_causal_mask_sdpa(
        attention_mask: torch.Tensor | None,
        inputs_embeds: torch.Tensor,
        past_key_values_length: int,
        sliding_window: int | None = None,
        is_training: bool = False,
    ) -> bool:
        """
        Detects whether the optional user-specified attention_mask & the automatically created causal mask can be
        ignored in case PyTorch's SDPA is used, relying instead on SDPA's `is_causal` argument.

        In case no token is masked in the `attention_mask` argument, if `query_length == 1` or
        `key_value_length == query_length`, we rely instead on the SDPA `is_causal` argument to use causal/non-causal
        masks, allowing to dispatch to the flash attention kernel (that can otherwise not be used if a custom
        `attn_mask` is passed).
        """
        warnings.warn(DEPRECATION_MESSAGE, FutureWarning)
        _, query_length = inputs_embeds.shape[0], inputs_embeds.shape[1]
        key_value_length = query_length + past_key_values_length

        is_tracing_ = is_tracing(inputs_embeds)
        ignore_causal_mask = False

        if attention_mask is None:
            # Only drop the mask when it is safe to rely on SDPA's `is_causal`: during training, or when the causal
            # and bi-directional masks are equivalent. When tracing/exporting, `is_causal` would otherwise be
            # hard-coded from the example input (see https://github.com/pytorch/pytorch/issues/108108).
            if (
                (is_training or not is_tracing_)
                and (query_length == 1 or key_value_length == query_length)
            ) and (sliding_window is None or key_value_length < sliding_window):
                ignore_causal_mask = True
        elif sliding_window is None or key_value_length < sliding_window:
            if len(attention_mask.shape) == 4:
                return False
            elif not is_tracing_ and torch.all(attention_mask == 1):
                if query_length == 1 or key_value_length == query_length:
                    # For query_length == 1, causal attention and bi-directional attention are the same.
                    ignore_causal_mask = True

        return ignore_causal_mask


def _prepare_4d_causal_attention_mask(
    attention_mask: torch.Tensor | None,
    input_shape: torch.Size | tuple | list,
    inputs_embeds: torch.Tensor,
    past_key_values_length: int,
    sliding_window: int | None = None,
):
    """
    Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
    `(batch_size, key_value_length)`

    Args:
        attention_mask (`torch.Tensor` or `None`):
            A 2D attention mask of shape `(batch_size, key_value_length)`
        input_shape (`tuple(int)` or `list(int)` or `torch.Size`):
            The input shape should be a tuple that defines `(batch_size, query_length)`.
        inputs_embeds (`torch.Tensor`):
            The embedded inputs as a torch Tensor.
        past_key_values_length (`int`):
            The length of the key value cache.
        sliding_window (`int`, *optional*):
            If the model uses windowed attention, a sliding window should be passed.
    """
    attn_mask_converter = AttentionMaskConverter(is_causal=True, sliding_window=sliding_window)

    key_value_length = input_shape[-1] + past_key_values_length

    if attention_mask is not None and len(attention_mask.shape) == 2:
        # expand the 2D padding mask and combine it with the causal mask
        attention_mask = attn_mask_converter.to_4d(
            attention_mask, input_shape[-1], key_value_length=key_value_length, dtype=inputs_embeds.dtype
        )
    elif attention_mask is not None and len(attention_mask.shape) == 4:
        expected_shape = (input_shape[0], 1, input_shape[1], key_value_length)
        if tuple(attention_mask.shape) != expected_shape:
            raise ValueError(
                f"Incorrect 4D attention_mask shape: {tuple(attention_mask.shape)}; expected: {expected_shape}."
            )
        else:
            # if the 4D mask has the correct shape - invert it and fill with negative infinity
            inverted_mask = 1.0 - attention_mask
            attention_mask = inverted_mask.masked_fill(
                inverted_mask.to(torch.bool), torch.finfo(inputs_embeds.dtype).min
            )
    else:
        attention_mask = attn_mask_converter.to_causal_4d(
            input_shape[0], input_shape[-1], key_value_length, dtype=inputs_embeds.dtype, device=inputs_embeds.device
        )

    return attention_mask


def _prepare_4d_causal_attention_mask_for_sdpa(
    attention_mask: torch.Tensor | None,
    input_shape: torch.Size | tuple | list,
    inputs_embeds: torch.Tensor,
    past_key_values_length: int,
    sliding_window: int | None = None,
):
    """
    Prepares the correct `attn_mask` argument to be used by `torch.nn.functional.scaled_dot_product_attention`.

    In case no token is masked in the `attention_mask` argument, we simply set it to `None` for the cases
    `query_length == 1` and `key_value_length == query_length`, and rely instead on SDPA's `is_causal` argument to
    use causal/non-causal masks, allowing to dispatch to the flash attention kernel (that can otherwise not be used
    if a custom `attn_mask` is passed).
    """
    attn_mask_converter = AttentionMaskConverter(is_causal=True, sliding_window=sliding_window)

    key_value_length = input_shape[-1] + past_key_values_length

    # torch.jit.trace, symbolic_trace and torchdynamo with fullgraph=True cannot capture data-dependent control flow
    is_tracing_ = is_tracing(inputs_embeds)

    ignore_causal_mask = AttentionMaskConverter._ignore_causal_mask_sdpa(
        attention_mask=attention_mask,
        inputs_embeds=inputs_embeds,
        past_key_values_length=past_key_values_length,
        sliding_window=sliding_window,
    )

    if ignore_causal_mask:
        expanded_4d_mask = None
    elif attention_mask is None:
        expanded_4d_mask = attn_mask_converter.to_causal_4d(
            input_shape[0], input_shape[-1], key_value_length, dtype=inputs_embeds.dtype, device=inputs_embeds.device
        )
    else:
        if attention_mask.dim() == 4:
            expanded_4d_mask = attention_mask
        else:
            expanded_4d_mask = attn_mask_converter.to_4d(
                attention_mask,
                input_shape[-1],
                dtype=inputs_embeds.dtype,
                key_value_length=key_value_length,
            )

        # Attend to all tokens in fully masked rows (e.g. the first rows when using left padding), as required by
        # F.scaled_dot_product_attention's memory-efficient path. Details:
        # https://github.com/pytorch/pytorch/issues/110213
        if not is_tracing_ and expanded_4d_mask.device.type in ["cuda", "xpu"]:
            expanded_4d_mask = AttentionMaskConverter._unmask_unattended(
                expanded_4d_mask, min_dtype=torch.finfo(inputs_embeds.dtype).min
            )

    return expanded_4d_mask


def _prepare_4d_attention_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: int | None = None):
    """
    Creates a non-causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
    `(batch_size, key_value_length)`

    Args:
        mask (`torch.Tensor`):
            A 2D attention mask of shape `(batch_size, key_value_length)`
        dtype (`torch.dtype`):
            The torch dtype the created mask shall have.
        tgt_len (`int`):
            The target length or query length the created mask shall have.
    """
    return AttentionMaskConverter._expand_mask(mask=mask, dtype=dtype, tgt_len=tgt_len)


def _prepare_4d_attention_mask_for_sdpa(mask: torch.Tensor, dtype: torch.dtype, tgt_len: int | None = None):
    """
    Creates a non-causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
    `(batch_size, key_value_length)`

    Args:
        mask (`torch.Tensor`):
            A 2D attention mask of shape `(batch_size, key_value_length)`
        dtype (`torch.dtype`):
            The torch dtype the created mask shall have.
        tgt_len (`int`):
            The target length or query length the created mask shall have.
    """
    warnings.warn(DEPRECATION_MESSAGE, FutureWarning)
    _, key_value_length = mask.shape
    tgt_len = tgt_len if tgt_len is not None else key_value_length

    # torch.jit.trace, symbolic_trace and torchdynamo with fullgraph=True cannot capture data-dependent control flow
    if not is_tracing(mask) and torch.all(mask == 1):
        # no token is masked, so SDPA does not need an explicit attention mask
        return None
    else:
        return AttentionMaskConverter._expand_mask(mask=mask, dtype=dtype, tgt_len=tgt_len)


def _create_4d_causal_attention_mask(
    input_shape: torch.Size | tuple | list,
    dtype: torch.dtype,
    device: torch.device,
    past_key_values_length: int = 0,
    sliding_window: int | None = None,
) -> torch.Tensor | None:
    """
    Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)`

    Args:
        input_shape (`tuple(int)` or `list(int)` or `torch.Size`):
            The input shape should be a tuple that defines `(batch_size, query_length)`.
        dtype (`torch.dtype`):
            The torch dtype the created mask shall have.
        device (`torch.device`):
            The torch device the created mask shall have.
        past_key_values_length (`int`, *optional*, defaults to 0):
            The length of the key value cache.
        sliding_window (`int`, *optional*):
            If the model uses windowed attention, a sliding window should be passed.
    """
    attn_mask_converter = AttentionMaskConverter(is_causal=True, sliding_window=sliding_window)

    key_value_length = past_key_values_length + input_shape[-1]
    attention_mask = attn_mask_converter.to_causal_4d(
        input_shape[0], input_shape[-1], key_value_length, dtype=dtype, device=device
    )

    return attention_mask