
    Z j                        % S SK r S SKrS SKJr  S SKJr  S SKJrJrJ	r	  SSK
JrJr  \R                  " \5      r\" 5       (       a  S SKr\(       a  SSKJr  S r    S!S	\S
   S\S   S\S-  S\S-  S\S\4   4
S jjr     S"S	\S
   S\S   S\S-  S\S-  S\S\S\4   4S jjr    S!S	\S
   S\S   S\S-  S\S-  S\S\4   4
S jjr   S#S	S
S\S   S\S-  S\S-  S\S\4   4
S jjr   S#S	S
S\S   S\S-  S\S-  S\S\4   4
S jjr   S#S	S
S\S   S\S-  S\S-  S\S\4   4
S jjr\\\\\\S.r\\\S\S\4   4   4   \ S'    " S S\	5      r! " S S5      r"S$S	\"S\#S-  4S  jjr$g)%    N)Callablewraps)TYPE_CHECKINGOptional	TypedDict   )is_torch_availablelogging)PreTrainedConfigc                 P   ^ ^^ SS jmSS jm[        T 5      SUUU 4S jj5       nU$ )aD  
Decorator function to update the RoPE parameters in the forward pass, if the model is using a dynamic RoPE
(i.e. a RoPE implementation that may recompute its frequencies in the forward pass).

Args:
    rope_forward (Callable):
        The forward pass of the RoPE implementation.

Returns:
    The decorated forward pass.
c                 B   [         R                  " U5      S-   nUc4  U R                  nU R                  nSnU R                  R
                  S   nO>U R                  U   n[        X S35      nU S3nU R                  R
                  U   S   nXH:  aX  [        X S35      (       d!  [        U   n	U	" U R                  UUS-   US9u  pU R                  U S	3W
S
S9  [        X S3U
5        gUR                  U5      nU R                  U S	3US
S9  [        X S3U5        g)zbLongrope uses long factor if sequence is larger than original pretraining length, short otherwise.r	   N  original_max_position_embeddings_original_inv_freq__long_inv_freqseq_len
layer_typeinv_freqF
persistentlong_inv_freqoriginal_inv_freq)torchmax	rope_typer   configrope_parametersgetattrhasattrROPE_INIT_FUNCTIONSregister_buffersetattrto)selfposition_idsdevicer   r   r   r   prefixr   rope_init_fnr   r   s               q/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/modeling_rope_utils.pylongrope_frequency_update6dynamic_rope_update.<locals>.longrope_frequency_update/   sO   ))L)A-I $ 6 6F/3{{/J/JKm/n,z2I '<N.O P"|1%F/3{{/J/J:/V20, 54<~!>??29=#/KK<q@)	$    F88!4mPU VDHM2MB !2 4 4V <  F88!46GTY ZDH$568IJ    c                 p   [         R                  " U5      S-   nUc'  U R                  nU R                  nU R                  nSnO;U R                  U   n[        X S3U R                  5      n[        X S35      nU S3nXF:  aF  [        U   n	U	" U R                  UUUS9u  ol        U R                  U S3U
S	S
9  [        X S3U5        X@R                  :  a^  X`R                  :  aN  UR                  U5      nU R                  U S3US	S
9  [        X S3U5        [        X S3U R                  5        ggg)z
dynamic RoPE layers should recompute `inv_freq` in the following situations:
1 - growing beyond the cached sequence length (allow scaling)
2 - the current sequence length is in the original scale (avoid losing precision with small sequences)
r	   Nr   _max_seq_len_cachedr   r   r   r   Fr   r   )r   r   r   max_seq_len_cachedr   r!   r#   r   attention_scalingr$   r%   original_max_seq_lenr&   )r'   r(   r)   r   r   r   r2   r   r*   r+   r   s              r,   dynamic_frequency_update5dynamic_rope_update.<locals>.dynamic_frequency_updateR   si    ))L)A-I!%!8!8 $ 6 6Fz2I!(=P/QSWSjSj!k '<N.O P"|1%F'.y9L/;%	0,H,   F88!4h5 QDL(;<gF...3EHaHa3a !2 4 4V <  F88!46GTY ZDH$568IJDL(;<d>W>WX 4b.r/   c                    > Uc  U R                   OU R                   U   nUb  SU0O0 nSU;   a  T" X4SUR                  0UD6  OUS:X  a  T" X4SUR                  0UD6  T" XU40 UD6$ )Nr   dynamicr)   longrope)r   r)   )	r'   xr(   r   r   kwargsr5   r-   rope_forwards	         r,   wrapper$dynamic_rope_update.<locals>.wrapperx   s{    &0&8DNNdnnZ>X	/9/E,
+2	!$TSSFS*$%dTTVTD\<V<<r/   Nr   )r<   r=   r5   r-   s   ` @@r,   dynamic_rope_updater@   "   s6    !KF$YL <= = = Nr/   r   r   r)   ztorch.devicer   r   returnztorch.Tensorc           	         U R                  5         Ub  U R                  U   OU R                  nUS   nUS   nUR                  SS5      n[        U SS5      =(       d    U R                  U R
                  -  n[        X-  5      n	Sn
SU[        R                  " SU	S[        R                  S	9R                  U[        R                  S
9U	-  -  -  nX-  nX4$ )a  
Computes the inverse frequencies with linear scaling. Credits to the Reddit user /u/kaiokendev
Args:
    config ([`~transformers."PreTrainedConfig"`]):
        The model configuration. This function assumes that the config will provide at least the following
        properties:

        *   rope_theta (`float`, *optional*): The base wavelength from which the inverse frequencies will be derived. Defaults to `config.default_theta` if omitted.
        *   hidden_size (`int`): The numerator when deriving a head_dim, if not provided directly.
        *   num_attention_heads (`int`): The denominator when deriving a head_dim, if not provided directly.

        Additionally, this function will make use of the following properties if they are found in the config:

        *   head_dim (`int`, *optional*): The size of the key-value heads in the model. If None, this value will be
            derived as hidden_size // num_attention_heads.
        *   partial_rotary_factor (`float`, *optional*): If less than 1.0, inverse frequencies will be returned for
            the first fraction of the head_dim. Defaults to 1.0.
    device (`torch.device`):
        The device to use for initialization of the inverse frequencies.
    seq_len (`int`, *optional*):
        The current sequence length. Unused for this type of RoPE.

Returns:
    Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
    post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
Nfactor
rope_thetapartial_rotary_factor      ?head_dimr      dtyper)   rJ   )standardize_rope_paramsr    getr!   hidden_sizenum_attention_headsintr   arangeint64r&   float)r   r)   r   r   rope_parameters_dictrC   baserE   rG   dimattention_factorr   s               r,   '_compute_linear_scaling_rope_parametersrX      s    B ""$AKAW611*=]c]s]s!(+F  -D0445LcRvz40dF4F4F&JdJd4dH
h.
/C du||AsAU[[ILLTZbgbmbmLnqttuvH
 H%%r/   head_dim_keyc           	      h   U R                  5         Ub  U R                  U   OU R                  n[        XS5      =(       d    U R                  U R                  -  nUS   nUR                  SS5      nUR                  SS5      n	Sn
[        X-  S-  5      nSU[        R                  " SSU-  S[        R                  S9R                  U[        R                  S	9U-  -  -  nUS-  U-
  nUS:  a:  [        R                  " U[        R                  " U[        R                  US
94SS9nOUnX-  nX4$ )a  
Computes the inverse frequencies with proportional RoPE.

Args:
    config ([`~transformers.PretrainedConfig`]):
        The model configuration. This function assumes that the config will provide at least the following
        properties:

        *   rope_theta (`float`, *optional*): The base wavelength from which the inverse frequencies will be derived. Defaults to `config.default_theta` if omitted.
        *   hidden_size (`int`): The numerator when deriving a head_dim, if not provided directly.
        *   num_attention_heads (`int`): The denominator when deriving a head_dim, if not provided directly.

        Additionally, this function will make use of the following properties if they are found in the config:

        *   head_dim (`int`, *optional*): The size of the key-value heads in the model. If None, this value will be
            derived as hidden_size // num_attention_heads.
        *   partial_rotary_factor (`float`, *optional*, defaults to 1.0): The proportion of the embedding dimension
            to apply rotary positional encoding, e.g., [0.0, 0.25, 0.5, 0.75, 1.0]. Unlike other RoPE functions
            that use this parameter, proportional RoPE will always return an encoding that is the size of
            `head_dim`.
    device (`torch.device`):
        The device to use for initialization of the inverse frequencies.
    seq_len (`int`, *optional*):
        The current sequence length. Unused for this type of RoPE.

Returns:
    Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
    post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
NrD   rC   rF   rE   rH   r   rI   rK   rJ   r)   )rV   )rL   r    r!   rN   rO   rM   rP   r   rQ   rR   r&   rS   catzerosfloat32)r   r)   r   r   rY   rT   rG   rU   rC   rope_proportionrW   rope_anglesinv_freq_rotatednope_anglesr   s                  r,   %_compute_proportional_rope_parametersrc      sB   J ""$AKAW611*=]c]s]svT2ff6H6HFLfLf6fH-D!%%h4F*../FLOo0A56KLLAOQekkBEEV[`[f[fEgjrr	t
 a-+-KQ99 Ku}}VL 
 $H%%r/   c           	         U R                  5         Ub  U R                  U   OU R                  nUS   nUR                  SS5      n[        U SU R                  U R
                  -  5      n[        Xv-  5      nUS   n	Sn
Uc  U R                  nO~[        U[        R                  5      (       aJ  [        R                  " U[        R                  " U R                  UR                  UR                  S95      nO[        X R                  5      nXYU-  U R                  -  U	S-
  -
  XS-
  -  -  -  nSU[        R                   " S	US[        R"                  S
9R%                  U[        R&                  S9U-  -  -  nX4$ )a	  
Computes the inverse frequencies with NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla

Args:
    config ([`~transformers."PreTrainedConfig"`]):
        The model configuration. This function assumes that the config will provide at least the following
        properties:

        *   rope_theta (`float`, *optional*): The base wavelength from which the inverse frequencies will be derived. Defaults to `config.default_theta` if omitted.
        *   hidden_size (`int`): The numerator when deriving a head_dim, if not provided directly.
        *   num_attention_heads (`int`): The denominator when deriving a head_dim, if not provided directly.
        *   max_position_embeddings (`int`): The default sequence length used to update the dynamic RoPE at
            inference time
        *   rope_parameters (`dict[str, float]`): The standard RoPE scaling parameters, from which `factor`
            will be accessed. The value of `factor` is used to determine the new base frequency, along with the
            current sequence length (seq_len), the maximum positional embeddings (max_position_embeddings), and the
            computed dimensionality (dim) of the rotary embeddings. If seq_len <= max_position_embeddings, this
            factor has no effect. If seq_len <= max_position_embeddings, this factor effectively stretches the
            context window using an exponent derived from `dim`.

        Additionally, this function will make use of the following properties if they are found in the config:

        *   head_dim (`int`, *optional*): The size of the key-value heads in the model. If None, this value will be
            derived as hidden_size // num_attention_heads.
        *   partial_rotary_factor (`float`, *optional*): If less than 1.0, inverse frequencies will be returned for
            the first fraction of the head_dim. Defaults to 1.0.
    device (`torch.device`):
        The device to use for initialization of the inverse frequencies.
    seq_len (`int`, *optional*):
        The current sequence length, used to update the dynamic RoPE at inference time. If `None` or shorter than
        max_position_embeddings, this value will be overridden by max_position_embeddings.

Returns:
    Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
    post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
rD   rE   rF   rG   rC   r[   r	   rH   r   rI   rK   )rL   r    rM   r!   rN   rO   rP   max_position_embeddings
isinstancer   TensormaximumtensorrJ   r)   r   rQ   rR   r&   rS   )r   r)   r   r   rT   rU   rE   rG   rV   rC   rW   r   s               r,   _compute_dynamic_ntk_parametersrj     sm   V ""$AKAW611*=]c]s]s-D0445LcRvz6+=+=A[A[+[\H
h.
/C!(+F 00	GU\\	*	*--LL77w}}U\UcUcd

 g==> W$v'E'EE&ST*U[^hibi[jkkDdu||AsAU[[ILLTZbgbmbmLnqttuvH%%r/   c                   ^ U R                  5         Ub  U R                  U   OU R                  nUS   nUR                  SS5      n[        U SU R                  U R
                  -  5      n[        Xv-  5      nUS   n	UR                  S5      n
UR                  S5      nUR                  S5      nUS	   nU	c  U R                  U-  n	SS jnU
c1  U(       a"  U(       a  [        U" X5      U" X5      -  5      n
OU" U	5      n
UR                  S5      =(       d    SnUR                  S5      =(       d    S
nS mU4S jnS nU[        R                  " SUS5      R                  U[        R                  S9U-  -  nSU-  nSU	U-  -  nU R                  R                  SS5      nU" UUXUU5      u  nnS
U" UUUS-  5      R                  U[        R                  S9-
  nUS
U-
  -  UU-  -   nUU
4$ )a  
Computes the inverse frequencies with NTK scaling. Please refer to the
[original paper](https://huggingface.co/papers/2309.00071)

Args:
    config ([`~transformers."PreTrainedConfig"`]):
        The model configuration. This function assumes that the config will provide at least the following
        properties:

        *   rope_theta (`float`, *optional*): The base wavelength from which the inverse frequencies will be derived. Defaults to `config.default_theta` if omitted.
        *   hidden_size (`int`): The numerator when deriving a head_dim, if not provided directly.
        *   num_attention_heads (`int`): The denominator when deriving a head_dim, if not provided directly.
        *   max_position_embeddings (`int`): The maximum length of the positional embeddings.
        *   rope_parameters (`dict[str, float | int]`): The standard RoPE scaling parameters, from which the following
            keys will be accessed:
            *   `attention_factor` (`float`, *optional*): The scaling factor to be applied to the computed cos/sin.
                If None, the value is inferred from `factor`, `mscale`, and `mscale_all_dim` as available.
            *   `beta_fast` (`float`, *optional*, defaults to 32): Parameter to set the boundary for extrapolation
                (only) in the linear ramp function.
            *   `beta_slow` (`float`, *optional*, defaults to 1): Parameter to set the boundary for interpolation
                (only) in the linear ramp function.
            *   `factor` (`float`, *optional*): The scaling factor applied when interpolating the position IDs to
                extend the possible context length. Additionally, if `attention_factor` is None, the log of this
                value is used to compute a value for `attention_factor`, possibly in conjunciton with `mscale` and
                `mscale_all_dim`, if provided.
            *   `mscale` (`float`, *optional*): If `attention_factor` is None and both `mscale` and
                `mscale_all_dim` are provided, `mscale` acts scalar augmenting `log(factor)` when computing the
                numerator for the inferred value of `attention_factor`. If not provided, `attention_factor` will be
                calculated based on `factor` only.
            *   `mscale_all_dim` (`float`, *optional*): If `attention_factor` is None and both `mscale` and
                `mscale_all_dim` are provided, `mscale_all_dim` acts scalar augmenting `log(factor)` when computing
                the denominator for the inferred value of `attention_factor`. If not provided, `attention_factor`
                will be calculated based on `factor` only.
            *   `original_max_position_embeddings` (`int`): The original max position embeddings used during pretraining.
            *   `truncate` (`bool`, *optional*): Whether to truncate the correction range.

        Additionally, this function will make use of the following properties if they are found in the config:

        *   head_dim (`int`, *optional*): The size of the key-value heads in the model. If None, this value will be
            derived as hidden_size // num_attention_heads.
        *   partial_rotary_factor (`float`, *optional*, defaults to 1.0): If less than 1.0, inverse frequencies
            will be returned for the first fraction of the head_dim.
    device (`torch.device`):
        The device to use for initialization of the inverse frequencies.
    seq_len (`int`, *optional*):
        The current sequence length. Unused for this type of RoPE.

Returns:
    Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
    post-processing scaling factor applied to the computed cos/sin.
rD   rE   rF   rG   rC   rW   mscalemscale_all_dimr   r	   c                 N    U S::  a  gSU-  [         R                  " U 5      -  S-   $ )Nr	   rF   g?)mathlog)scalerl   s     r,   
get_mscale,_compute_yarn_parameters.<locals>.get_mscale  s(    A:V|dhhuo-33r/   	beta_fast    	beta_slowc                     U[         R                  " X0S-  [         R                  -  -  5      -  S[         R                  " U5      -  -  $ )zPInverse dimension formula to find the dimension based on the number of rotationsrH   )ro   rp   pi)num_rotationsrV   rU   re   s       r,   find_correction_dim5_compute_yarn_parameters.<locals>.find_correction_dim  s@    dhh6!:Kdgg:UVWW\]`d`h`him`n\noor/   c                    > T" XX45      nT" XX45      nU(       a,  [         R                  " U5      n[         R                  " U5      n[        US5      [	        XrS-
  5      4$ )z.Find dimension range bounds based on rotationsr   r	   )ro   floorceilr   min)	low_rothigh_rotrV   rU   re   truncatelowhighrz   s	           r,   find_correction_range7_compute_yarn_parameters.<locals>.find_correction_range  sR    !'N"8$P**S/C99T?D3{CAg...r/   c                     X:X  a  US-  n[         R                  " U[         R                  S9U -
  X-
  -  n[         R                  " USS5      nU$ )NgMbP?rI   r   r	   )r   rQ   r^   clamp)r   r   rV   linear_func	ramp_funcs        r,   linear_ramp_factor4_compute_yarn_parameters.<locals>.linear_ramp_factor  sH    :5LC||Cu}}=C	RKKQ2	r/   r   rH   rK   r   T)r	   )rL   r    rM   r!   rN   rO   rP   re   rS   r   rQ   r&   )r   r)   r   r   rT   rU   rE   rG   rV   rC   rW   rl   rm   r   rr   rt   rv   r   r   	pos_freqsinv_freq_extrapolationinv_freq_interpolationr   r   r   inv_freq_extrapolation_factorr   rz   s                              @r,   _compute_yarn_parametersr   G  s/   t ""$AKAW611*=]c]s]s-D0445LcRvz6+=+=A[A[+[\H
h.
/C!(+F+//0BC!%%h/F)--.>?N';<^'_$
 ~//2RR4 n$Z%?*VBd%de)&1 %((5;I$((5:Ip/ aa03363UX[[\I 9_ FY$67%%))*d;H%iCGgiqrIC %&(:3cQh(O(R(RZ`hmhshs(R(t$t!!&C"CD
 #@
@	A  %%%r/   c                 H   U R                  5         Ub  U R                  U   OU R                  nUS   nUR                  SS5      n[        U SU R                  U R
                  -  5      n[        Xv-  5      nUS   n	US   n
UR                  S5      nUR                  S5      nUS	   nUc  U R                  U-  nUcM  US::  a  SnOD[        R                  " S
[        R                  " U5      [        R                  " U5      -  -   5      nU(       a*  X-:  a%  [        R                  " U	[        R                  US9nO$[        R                  " U
[        R                  US9n[        R                  " SUS[        R                  US9R!                  5       U-  nSXU-  -  -  nUU4$ )ay  
Computes the inverse frequencies with LongRoPE scaling. Please refer to the
[original implementation](https://github.com/microsoft/LongRoPE)

Args:
    config ([`~transformers."PreTrainedConfig"`]):
        The model configuration. This function assumes that the config will provide at least the following
        properties:

        *   rope_theta (`float`, *optional*): The base wavelength from which the inverse frequencies will be derived. Defaults to `config.default_theta` if omitted.
        *   hidden_size (`int`): The numerator when deriving a head_dim, if not provided directly.
        *   num_attention_heads (`int`): The denominator when deriving a head_dim, if not provided directly.
        *   max_position_embeddings (`int`): The maximum length of the positional embeddings.
        *   original_max_position_embeddings (`int`, *optional*): The original max position embeddings used during
            pretraining. If not provided, defaults to `max_position_embeddings`.
        *   rope_parameters (`dict[str, float]`): The standard RoPE scaling parameters, from which the following keys
            will be accessed:
            *   `attention_factor` (`float`, *optional*): The scaling factor to be applied on the attention
                computation. If unspecified, it defaults to value recommended by the implementation, inferred from
                the value of `factor`.
            *   `factor` (`float`, *optional*): The scaling factor to apply to the RoPE embeddings. If both
                `max_position_embeddings` and `original_max_position_embeddings` are provided, this value will be
                overridden s the ratio between those values.
            *   `long_factor` (`float`, *optional*): The scale factor applied when computing the inverse
                frequencies if `seq_len` is provided and greater than `original_max_position_embeddings`.
            *   `short_factor` (`float`, *optional*): The scale factor applied when computing the inverse
                frequencies if `seq_len` is None or less-than-or-equal-to `original_max_position_embeddings`.

        Additionally, this function will make use of the following properties if they are found in the config:

        *   head_dim (`int`, *optional*): The size of the key-value heads in the model. If None, this value will be
            derived as hidden_size // num_attention_heads.
        *   partial_rotary_factor (`float`, *optional*, defaults to 1.0): If less than 1.0, inverse frequencies
            will be returned for the first fraction of the head_dim.
    device (`torch.device`):
        The device to use for initialization of the inverse frequencies.
    seq_len (`int`, *optional*):
        The current sequence length.

Returns:
    Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
    post-processing scaling factor applied to the computed cos/sin.
rD   rE   rF   rG   long_factorshort_factorrC   rW   r   r	   r[   r   rH   )rL   r    rM   r!   rN   rO   rP   re   ro   sqrtrp   r   ri   r^   rQ   rR   rS   )r   r)   r   r   rT   rU   rE   rG   rV   r   r   rC   rW   r   ext_factorsinv_freq_shaper   s                    r,   _compute_longrope_parametersr     s   d ""$AKAW611*=]c]s]s-D0445LcRvz6+=+=A[A[+[\H
h.
/C&}5K'7L!%%h/F+//0BC';<^'_$
 ~//2RR S="#yyTXXf-=Ii@j-j)jk 7=ll;emmFSll<u}}VT\\!S!5;;vNTTVY\\Nk.$889H%%%r/   c           	         U R                  5         Ub  U R                  U   OU R                  nUS   nUR                  SS5      n[        U SS5      =(       d    U R                  U R
                  -  n[        Xv-  5      nSn	SU[        R                  " SUS[        R                  S9R                  U[        R                  S	9U-  -  -  n
US
   nUS   nUS   nUS   nX-  nX-  nS[        R                  -  U
-  n[        R                  " UU:  X-  U
5      nUU-  U-
  X-
  -  nSU-
  U-  U-  UU-  -   nUU:  ) UU:  ) -  n[        R                  " UUU5      nUU	4$ )a,
  
Computes the inverse frequencies for llama 3.1.

Args:
    config ([`~transformers."PreTrainedConfig"`]):
        The model configuration. This function assumes that the config will provide at least the following
        properties:

        *   rope_theta (`float`, *optional*): The base wavelength from which the inverse frequencies will be derived. Defaults to `config.default_theta` if omitted.
        *   hidden_size (`int`): The numerator when deriving a head_dim, if not provided directly.
        *   num_attention_heads (`int`): The denominator when deriving a head_dim, if not provided directly.
        *   rope_parameters (`dict[str, float | int]`): The standard RoPE scaling parameters, from which the following
            keys will be accessed:
            *   `factor` (`float`, *optional*): The scaling factor applied to the inverse frequencies when 1) the
                wavelength is greater than `low_freq_wavelen` prior to smoothing, and 2) to all inverse frequencies
                during smoothing.
            *   `high_freq_factor` (`float`): The scale factor used to compute `high_freq_wavelen` and
                the value for the denominator of the smoothing factor prior to the `low_freq_factor` shift.
            *   `low_freq_factor` (`float`): The scale factor used to compute `low_freq_wavelen` and
                the shift applied to the numerator and denominator of the smoothing factor.
                frequencies if `seq_len` is None or less-than-or-equal-to `original_max_position_embeddings`.
            *   `original_max_position_embeddings` (`int`): The original max position embeddings used
                during pretraining. If not provided, the function falls back to `max_position_embeddings`.

        Additionally, this function will make use of the following properties if they are found in the config:

        *   head_dim (`int`, *optional*): The size of the key-value heads in the model. If None, this value will be
            derived as hidden_size // num_attention_heads.
        *   partial_rotary_factor (`float`, *optional*): If less than 1.0, inverse frequencies will be returned for
            the first fraction of the head_dim. Defaults to 1.0.
    device (`torch.device`):
        The device to use for initialization of the inverse frequencies.
    seq_len (`int`, *optional*):
        The current sequence length. Unused for this type of RoPE.
Returns:
    Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
    post-processing scaling factor applied to the computed cos/sin.
NrD   rE   rF   rG   r   rH   rI   rK   rC   low_freq_factorhigh_freq_factorr   r	   )rL   r    rM   r!   rN   rO   rP   r   rQ   rR   r&   rS   ro   rx   where)r   r)   r   r   rT   rU   rE   rG   rV   rW   r   rC   r   r   old_context_lenlow_freq_wavelenhigh_freq_wavelenwaveleninv_freq_llamasmooth_factorsmoothed_inv_freqis_medium_freqs                         r,   _compute_llama3_parametersr   &  s   Z ""$AKAW611*=]c]s]s  -D0445LcRvz40dF4F4F&JdJd4dH
h.
/C du||AsAU[[ILLTZbgbmbmLnqttuvH!(+F*+<=O+,>?*+MNO&8':$''kH$G [[+;!;X=NPXYN$w.@EUEghM]*n<vEXfHff!223BR8R6SSN[[1BNSN+++r/   )linearr8   yarnr9   llama3proportional.r#   c                       \ rS rSr% Sr\S-  \S'   \S-  \S'   \S-  \S'   \S-  \S'   \S-  \S'   \S-  \S	'   \S-  \S
'   \S-  \S'   \	\   S-  \S'   \	\   S-  \S'   \S-  \S'   \S-  \S'   Sr
g)RopeParametersi  uu
  
Args:
    rope_theta (`float`, *optional*, defaults to `RotaryEmbeddingConfigMixin.default_theta`):
        The base period of the RoPE embeddings. Optional in serialized configs — if omitted,
        the model's `default_theta` (typically 10000.0) is used.
    rope_type (`str`, *optional*, defaults to "default"):
        The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
        'llama3'], with 'default' being the original RoPE implementation.
    partial_rotary_factor (`float`, *optional*):
        The percentage of the query and key head embedding on which RoPE will be applied.
    factor (`float`, *optional*):
        Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
        most scaling types, a `factor` of x will enable the model to handle sequences of length x *
        original maximum pre-trained length.
    original_max_position_embeddings (`int`, *optional*):
        Used with 'yarn', 'longrope' and 'llama3'. The original max position embeddings used during
        pretraining.
    attention_factor (`float`, *optional*):
        Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
        computation. If unspecified, it defaults to value recommended by the implementation, using the
        `factor` field to infer the suggested value.
    beta_fast (`float`, *optional*):
        Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
        ramp function. If unspecified, it defaults to 32.
    beta_slow (`float`, *optional*):
        Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
        ramp function. If unspecified, it defaults to 1.
    short_factor (`list[float]`, *optional*):
        Only used with 'longrope'. The scaling factor to be applied to short contexts (<
        `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
        size divided by the number of attention heads divided by 2
    long_factor (`list[float]`, *optional*):
        Only used with 'longrope'. The scaling factor to be applied to long contexts (<
        `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
        size divided by the number of attention heads divided by 2
    low_freq_factor (`float`, *optional*):
        Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
    high_freq_factor (`float`, *optional*):
        Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
NrD   r   rE   rC   r   rW   rt   rv   r   r   r   r    )__name__
__module____qualname____firstlineno____doc__rS   __annotations__strrP   list__static_attributes__r   r/   r,   r   r     s    'R Tz 4<'DL&)Dj0dl"t|t|u+$$et##T\!dl"r/   r   c                   @   \ rS rSrSrSr\" 5       rS rS r	SS jr
SS\S	\S-  4S
 jjrSS\S	\S-  4S jjrSS\S	\S-  4S jjrSS\S	\S-  4S jjrSS\S	\S-  4S jjrSS\S	\S-  4S jjrSS\S	\S-  4S jjr\  SS\S\S\S\S-  S	\S-  4
S jj5       rSrg)RotaryEmbeddingConfigMixini  zS
A Mixin containing the functionality to standardize and validate RoPE parameters.
g     @c                    UR                  SS 5      nU=(       d    U R                  U l        U R                  b  U R                  O0 U l        UR                  S[        U SU R                  5      5      nU R                  R	                  SU5        UR                  S[        U SS 5      5      nUb1  U R                  R	                  SU5        U R                  S1-  U l        U R                  5         U$ )Nrope_scalingrD   rE   )popr    r!   default_theta
setdefaultrM   ignore_keys_at_rope_validationrL   )r'   r;   r   rD   rE   s        r,   convert_rope_params_to_dict6RotaryEmbeddingConfigMixin.convert_rope_params_to_dict  s    zz.$7+Ct/C/C7;7K7K7Wt33]_ ZZgdL$J\J\.]^
''jA &

+BGDRikoDp q ,  ++,CEZ[262U2UYpXq2qD/$$&r/   c                    [        U SS5      n[        U SS5      n[        U SS5      =(       d    0 n[        U SS5      nU(       d  U(       d  [        R                  S5        gUb3  U0 :X  d-  [        UR	                  5       5      R                  U5      (       d  UR                  SUR                  SS	5      5        UR                  SU5        Ub  X#S'   US   S
;   aQ  [        U S5      (       a  U R                  U R                  S'   OU R                  R                  SU R                  5        O[        U5       H}  nX5   R                  SX5   R                  SS	5      5        X5   R                  SU5        Ub  X#U   S'   X5   S   S
;   d  MT  U R                  U   R                  SU R                  5        M     X0l
        g)z
Helper to standardize the config's rope params field by ensuring the params are defined for each
later type. For old model the fn will duplicate a single rope param in each layer type (backward compatibility)
rD   NrE   r    layer_typeszG`standardize_rope_params` was called but no RoPE parameters were found.r   typedefault)r   r   r9   r   )r!   loggerwarningsetkeysissubsetr   rM   r"   r   r    re   )r'   rD   rE   r    r   r   s         r,   rL   2RotaryEmbeddingConfigMixin.standardize_rope_params  s    T<6
 '.Et L!$(94@FBdM48  :NNde Or$9_EYEYE[A\AeAefqArAr&&{O4G4GPY4Z[&&|Z@$0;P 78 {+/MM4!CDD PTOtOtD(()KL((334VX\XtXtu "+.
+66{OD_DcDcdjluDvw+66|ZP(4K`J/0GH".{;?]]((4??:D<X<X /  /r/   c                    [        U SS5      nU(       d  g[        U SS5      b8  [        UR                  5       5      R                  U R                  5      (       a  OSU0nUR                  5        Hh  nUR                  SUR                  SS5      5      n[        U SU S	3S5      nX2S'   Ub  U" X R                  S
9  MO  [        R                  SU S35        Mj     g)zI
Validate the RoPE config arguments, given a `"PreTrainedConfig"` object
r    Nr   full_attentionr   r   r   
_validate__rope_parametersignore_keyszMMissing validation function in 'RotaryEmbeddingConfigMixin' for 'rope_type'='')
r!   r   r   r   r   valuesrM   r   r   r   )r'   rT   r    r   validation_fns        r,   validate_rope(RotaryEmbeddingConfigMixin.validate_rope  s      't->E#4-9cBVB[B[B]>^>g>g?
 ?
 $46J#K 3::<O'++K9L9LVU^9_`I#DJykAQ*RTXYM+4K((o;^;^_cdmcnnop  =r/   Nr    r   c                 n    S1nS1n[        UR                  5       5      nUS   nU R                  XeX4US9  g )Nr   rD   optional_keysr   )r   r   _check_received_keys)r'   r    r   required_keysr   received_keysr   s          r,   !_validate_default_rope_parameters<RotaryEmbeddingConfigMixin._validate_default_rope_parameters$  sH    $%O0023#K0	!!m^i 	" 	
r/   c                     SS1nS1n[        UR                  5       5      nUS   nU R                  XeX4US9  US   nUb!  [        U[        [
        45      (       a  US:  a  [        R                  SU 35        g g Nr   rC   rD   r   rF   B`rope_parameters`'s factor field must be a float or int >= 1, got r   r   r   rf   rS   rP   r   r   r'   r    r   r   r   r   r   rC   s           r,    _validate_linear_rope_parameters;RotaryEmbeddingConfigMixin._validate_linear_rope_parameters-      $h/%O0023#K0	!!m^i 	" 	
 !*>FUCL!A!AVc\NN_`f_ghi FRr/   c                     SS1nS1n[        UR                  5       5      nUS   nU R                  XeX4US9  US   nUb!  [        U[        [
        45      (       a  US:  a  [        R                  SU 35        g g r   r   r   s           r,   !_validate_dynamic_rope_parameters<RotaryEmbeddingConfigMixin._validate_dynamic_rope_parameters:  r   r/   c           	         1 Skn1 Skn[        UR                  5       5      nUS   nU R                  XeX4US9  US   nUb!  [        U[        [
        45      (       a  US:  a  [        R                  SU 35        UR                  S5      nUb3  [        U[        5      (       a  US	:  a  [        R                  S
U 35        UR                  S5      n	U	b3  [        U	[        [
        45      (       d  [        R                  SU	 35        UR                  S5      n
U
b3  [        U
[        [
        45      (       d  [        R                  SU
 35        U	=(       d    SU
=(       d    S:  a  [        R                  SU	 SU
 S35        U R                  S   nU R                  U-  nX:w  a'  US:w  a   [        R                  SU SU SU S35        g g g )N>   rC   r   r   >   rl   r   rt   rv   rD   rm   rW   r   r   rC   rF   r   rW   r   zO`rope_parameters`'s attention_factor field must be a float greater than 0, got rt   z@`rope_parameters`'s beta_fast field must be a float or int, got rv   z@`rope_parameters`'s beta_slow field must be a float or int, got ru   r	   zR`rope_parameters`'s beta_fast field must be greater than beta_slow, got beta_fast=z( (defaults to 32 if None) and beta_slow=z (defaults to 1 if None)r   zKThe explicitly set RoPE scaling factor (config.rope_parameters['factor'] = z) does not match the ratio implicitly set by other parameters (implicit factor = post-yarn context length / pre-yarn context length = config.max_position_embeddings / config.rope_parameters['original_max_position_embeddings'] = z). Using the explicit factor (z) in YaRN. This may cause unexpected behaviour in model usage, please correct the 'original_max_position_embeddings' fields in the model config.)r   r   r   rf   rS   rP   r   r   rM   r    re   warning_once)r'   r    r   r   r   r   r   rC   rW   rt   rv   r   implicit_factors                r,   _validate_yarn_rope_parameters9RotaryEmbeddingConfigMixin._validate_yarn_rope_parametersG  s   S
 O0023#K0	!!)Mfq!r *>FUCL!A!AVc\NN_`f_ghi*../AB'<Le1T1TXhklXlNNabrast $''4	 Is|)L)LNN]^g]hij#''4	 Is|)L)LNN]^g]hijO	Q/NNdendo p::CD\^ ,0+?+?@b+c(669YY$A)=]^d]e fq ###A& J~	~ *>$r/   c                    1 Skn1 Skn[        UR                  5       5      nUS   nU R                  XeX4US9  UR                  SS5      n[	        U SU R
                  U R                  -  5      n[        X-  5      n	UR                  S5      n
[        U
[        5      (       a  [        S	 U
 5       5      (       d  [        R                  S
U
 35        [        U
5      U	S-  :w  a'  [        R                  SU	S-   S[        U
5       35        UR                  S5      n[        U[        5      (       a  [        S U 5       5      (       d  [        R                  SU 35        [        U5      U	S-  :w  a'  [        R                  SU	S-   S[        U5       35        UR                  S5      nUS   nUc  Ub  [        R                  S5        OUUc  Uc  [        R                  S5        O9[        U[        [        45      (       a  US:  a  [        R                  SU 35        UR                  S5      nUb;  [        U[        [        45      (       a  US:  a  [        R                  SU 35        g g g )N>   r   r   r   r   >   rC   rD   rW   r   r   rE   rF   rG   r   c              3   N   #    U  H  n[        U[        [        45      v   M     g 7fr?   rf   rP   rS   .0r:   s     r,   	<genexpr>PRotaryEmbeddingConfigMixin._validate_longrope_rope_parameters.<locals>.<genexpr>  s!     6i\hWXz!c5\7R7R\h   #%zF`rope_parameters`'s short_factor field must be a list of numbers, got rH   z8`rope_parameters`'s short_factor field must have length z, got r   c              3   N   #    U  H  n[        U[        [        45      v   M     g 7fr?   r   r   s     r,   r   r     s!     5g[fVWjS%L6Q6Q[fr   zE`rope_parameters`'s long_factor field must be a list of numbers, got z7`rope_parameters`'s long_factor field must have length rC   r   av  This model config has set a `rope_parameters['original_max_position_embeddings']` field, to be used together with `max_position_embeddings` to determine a scaling factor. Please set the `factor` field of `rope_parameters`with this ratio instead -- we recommend the use of this field over `original_max_position_embeddings`, as it is compatible with most model architectures.z4Missing required keys in `rope_parameters`: 'factor'r   rW   g        zV`rope_parameters`'s attention_factor field must be a float or int greater than 0, got )r   r   r   rM   r!   rN   rO   rP   rf   r   allr   r   lenr   rS   )r'   r    r   r   r   r   r   rE   rG   rV   r   r   rC   r   rW   s                  r,   "_validate_longrope_rope_parameters=RotaryEmbeddingConfigMixin._validate_longrope_rope_parameters{  sZ   hDO0023#K0	!!)Mfq!r / 3 34KS Q4T-=-=AYAY-YZ(23&**>:<..36i\h6i3i3iNNcdpcqrs|q(NNJ3RS8*TZ[^_k[lZmn &))-8;--#5g[f5g2g2gNNbcnbopq{sax'NNI#QR(SYZ]^iZjYkl !$$X.+:;]+^( >>JE ^ @ HNNQRFUCL11Vc\NN_`f_ghi*../AB'<LuVYl1[1[_oru_uNNhiyhz{ `v'r/   c                    1 SknUS   n[        UR                  5       5      nU R                  XEX2S9  US   nUb!  [        U[        [
        45      (       a  US:  a  [        R                  SU 35        US   nUS   nUb  [        U[        [
        45      (       d  [        R                  S	U 35        Ub  [        U[        [
        45      (       d  [        R                  S
U 35        X::  a  [        R                  SU SU 35        US   n	U	b  [        U	[
        5      (       d  [        R                  SU	 35        XR                  :  a&  [        R                  SU	 SU R                   35        g g )N>   rC   r   rD   r   r   r   r   r   rC   rF   r   r   r   zF`rope_parameters`'s low_freq_factor field must be a float, or int got zG`rope_parameters`'s high_freq_factor field must be a float or int, got zf`rope_parameters`'s high_freq_factor field must be greater than low_freq_factor, got high_freq_factor=z and low_freq_factor=r   zS`rope_parameters`'s original_max_position_embeddings field must be an integer, got zj`rope_parameters`'s original_max_position_embeddings field must be less than max_position_embeddings, got z and max_position_embeddings=)	r   r   r   rf   rS   rP   r   r   re   )
r'   r    r   r   r   r   rC   r   r   r   s
             r,    _validate_llama3_rope_parameters;RotaryEmbeddingConfigMixin._validate_llama3_rope_parameters  s   
 $K0	O0023!!)M!c *>FUCL!A!AVc\NN_`f_ghi)*;<*+=>"*_ucl*S*SNNcdsctuv#:6FPS+U+UNNYZjYkl .NNx#$$9/9JL
 ,;;]+^(+3:Ffhk;l;lNNe346 ,/K/KKNN|344QRVRnRnQoq Lr/   c                     SS1nUS   n[        UR                  5       5      nU R                  XEX2S9  UR                  S5      nUc  [        R                  S5        g g )Nr   rD   r   rE   z`rope_parameters`'s partial_rotary_factor is None. This will default to 1.0 in the computation, making this equivalent to the linear_scaling RoPE type. Provide a value in the range [0.0, 1.0) to make use of the proportional RoPE funcitonality.)r   r   r   rM   r   r   )r'   r    r   r   r   r   rE   s          r,   &_validate_proportional_rope_parametersARotaryEmbeddingConfigMixin._validate_proportional_rope_parameters  sk    $l3#K0	O0023!!)M!c / 3 34K L (NNC )r/   r   r   r   r   c                 @   SU;   a  US1-  nUR                  S5        U=(       d
    [        5       nSU;  a  UR                  S5        Ub  U[        U5      -  nX!-
  nU(       a  [        SU  SU 35      eX-
  U-
  nU(       a  [        R	                  SU  SU 35        gg)z\Compare the received keys in `config.rope_parameters` against the expected and optional keysr   r   rE   Nz<Missing required keys in `rope_parameters` for 'rope_type'='z': z8Unrecognized keys in `rope_parameters` for 'rope_type'=')addr   KeyErrorr   r   )r   r   r   r   r   missing_keysunused_keyss          r,   r   /RotaryEmbeddingConfigMixin._check_received_keys  s     ]"fX%Mk*%."-756 "S--M$4YZcYddghtguvww#3mCNNUV_U``cdocpqr r/   )r   r    )r'   r   r?   )NN)r   r   r   r   r   r   r   r   r   rL   r   dictr   r   r   r   r   r   r   staticmethodr   r   r   r   r/   r,   r   r     s>    M%(U"*./`:
 
TWZ^T^ 
j jSVY]S] jj jTWZ^T^ j2d 2QTW[Q[ 2h0$ 0UX[_U_ 0d) )SVY]S] )Vd Y\_cYc  
 %)"&sss s Tz	s
 4Zs sr/   r   r   c                 z    [         R                  " S[        5        U R                  5         U R	                  5         g)ze
This is a deprecated function.
It has been kept for backward compatibility with custom code models.
aX  `rope_config_validation` is deprecated and has been removed. Its functionality has been moved to RotaryEmbeddingConfigMixin.validate_rope method. PreTrainedConfig inherits this class, so please call self.validate_rope() instead. Also, make sure to use the new rope_parameters syntax. You can call self.standardize_rope_params() in the meantime.N)warningswarnFutureWarningrL   r   )r   r   s     r,   rope_config_validationr    s5    
 MM	G
 	 ""$
r/   )NNNN)NNNNrG   )NNNr?   )%ro   r	  collections.abcr   	functoolsr   typingr   r   r   utilsr
   r   
get_loggerr   r   r   configuration_utilsr   r@   rP   r   tuplerS   rX   rc   rj   r   r   r   r#   r  r   r   r   r   r  r   r/   r,   <module>r     s     $  5 5 . 
		H	% 5`H ,0'+!	3&'(3&^$3& 4Z3& d
	3&
 >5 !3&n ,0'+!"C&'(C&^$C& 4ZC& d
	C&
 C& >5 !C&N ,0'+!	C&'(C&^$C& 4ZC& d
	C&
 >5 !C&P (,!	D&D&^$D& 4ZD& d
	D&
 >5 !D&R (,!	U&U&^$U& 4ZU& d
	U&
 >5 !U&t (,!	L,L,^$L, 4ZL, d
	L,
 >5 !L,f 6.$,(9O T#xU>53H-I(IJJK 5#Y 5#pHs HsV
#= CRVJ r/   