
    Z j4              	       H   S SK r S SKrS SKJr  S SKrS SKJrJr  SSKJr  SSK	J
r
  SSKJr  \
R                  " \5      r\" S5       " S	 S
\R                   5      5       r\r\" S5       " S S\R                   5      5       r\" S5       " S S\R                   5      5       r\" S5       " S S\R                   5      5       r\" S5       " S S\R                   5      5       r\" S5       " S S\R                   5      5       r " S S\R                   5      r " S S\R                   5      r " S S\R                   5      r " S  S!\R                   5      r " S" S#\R                   5      r " S$ S%\R                   5      r " S& S'\R                   5      r " S( S)\5      r " S* S+\R                   5      r 0 S,\_S-\S.S/S0.4_S1\_S2\_S3\S4S504_S6\_S7\S8S504_S9\_S:\RB                  _S;\_S<\RD                  _S=\_S>\_S?\_S@\RF                  _SA\_SB\RH                  _\RJ                  \\\RL                  \RN                  \RP                  \ SC.Er)\" \)5      r*SD r+\+" S35      r,\+" S25      r-\+" S,5      r.\+" S15      r/\+" S65      r0\+" S?5      r1\+" SE5      r2\+" S>5      r3\+" S=5      r4g)F    N)OrderedDict)Tensornn   )use_kernel_forward_from_hub)logging)is_torchdynamo_compilingGeluTanhc                   \   ^  \ rS rSrSrS
S\4U 4S jjjrS\S\4S jrS\S\4S jr	S	r
U =r$ )GELUTanh   a  
A fast C implementation of the tanh approximation of the GeLU activation function. See
https://huggingface.co/papers/1606.08415.

This implementation is equivalent to NewGELU and FastGELU but much faster. However, it is not an exact numerical
match due to rounding errors.
use_gelu_tanh_pythonc                    > [         TU ]  5         U(       a  U R                  U l        g [        R
                  " [        R                  R                  SS9U l        g )Ntanh)approximate)	super__init___gelu_tanh_pythonact	functoolspartialr   
functionalgelu)selfr   	__class__s     i/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/activations.pyr   GELUTanh.__init__(   s<    --DH ((););PDH    inputreturnc                     US-  S[         R                  " [        R                  " S[        R                  -  5      US[         R
                  " US5      -  -   -  5      -   -  $ N      ?      ?       @Hm?g      @torchr   mathsqrtpipowr   r   s     r   r   GELUTanh._gelu_tanh_python/   sP    s{cEJJtyytww/G5S[^c^g^ghmor^sSsKs/t$uuvvr   c                 $    U R                  U5      $ Nr   r-   s     r   forwardGELUTanh.forward2       xxr   r1   F)__name__
__module____qualname____firstlineno____doc__boolr   r   r   r2   __static_attributes____classcell__r   s   @r   r   r      sJ    QT Q Qwv w& wV   r   r   NewGELUc                   *    \ rS rSrSrS\S\4S jrSrg)NewGELUActivation:   z
Implementation of the GELU activation function currently in Google BERT repo (identical to OpenAI GPT). Also see
the Gaussian Error Linear Units paper: https://huggingface.co/papers/1606.08415
r   r    c                     SU-  S[         R                  " [        R                  " S[        R                  -  5      US[         R
                  " US5      -  -   -  5      -   -  $ r"   r'   r-   s     r   r2   NewGELUActivation.forwardA   sP    U{cEJJtyytww/G5S[^c^g^ghmor^sSsKs/t$uuvvr    Nr6   r7   r8   r9   r:   r   r2   r<   rE   r   r   rA   rA   :   s    
wV w wr   rA   GeLUc                   \   ^  \ rS rSrSrS
S\4U 4S jjjrS\S\4S jrS\S\4S jr	S	r
U =r$ )GELUActivationE   a  
Original Implementation of the GELU activation function in Google BERT repo when initially created. For
information: OpenAI GPT's GELU is slightly different (and gives slightly different results): 0.5 * x * (1 +
torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) This is now written in C in nn.functional
Also see the Gaussian Error Linear Units paper: https://huggingface.co/papers/1606.08415
use_gelu_pythonc                    > [         TU ]  5         U(       a  U R                  U l        g [        R
                  R                  U l        g r0   )r   r   _gelu_pythonr   r   r   r   )r   rK   r   s     r   r   GELUActivation.__init__N   s/    ((DH}}))DHr   r   r    c                 n    US-  S[         R                  " U[        R                  " S5      -  5      -   -  $ )Nr#   r$   r%   )r(   erfr)   r*   r-   s     r   rM   GELUActivation._gelu_pythonU   s,    s{cEIIediin.D$EEFFr   c                 $    U R                  U5      $ r0   r1   r-   s     r   r2   GELUActivation.forwardX   r4   r   r1   r5   )r6   r7   r8   r9   r:   r;   r   r   rM   r2   r<   r=   r>   s   @r   rI   rI   E   sG    * * *G& GV GV   r   rI   SiLUc                   *    \ rS rSrSrS\S\4S jrSrg)SiLUActivation\   a  
See Gaussian Error Linear Units (Hendrycks et al., https://arxiv.org/abs/1606.08415) where the SiLU (Sigmoid Linear
Unit) was originally introduced and coined, and see Sigmoid-Weighted Linear Units for Neural Network Function
Approximation in Reinforcement Learning (Elfwing et al., https://arxiv.org/abs/1702.03118) and Swish: a Self-Gated
Activation Function (Ramachandran et al., https://arxiv.org/abs/1710.05941v1) where the SiLU was experimented with
later.
r   r    c                 @    [         R                  R                  U5      $ r0   )r   r   silur-   s     r   r2   SiLUActivation.forwardf   s    }}!!%((r   rE   NrF   rE   r   r   rV   rV   \   s    )V ) )r   rV   FastGELUc                   *    \ rS rSrSrS\S\4S jrSrg)FastGELUActivationj   zu
Applies GELU approximation that is slower than QuickGELU but more accurate. See: https://github.com/hendrycks/GELUs
r   r    c                 ^    SU-  S[         R                  " US-  SSU-  U-  -   -  5      -   -  $ )Nr#   r$   g3E?r&   )r(   r   r-   s     r   r2   FastGELUActivation.forwardp   s:    U{cEJJu|/CsXX]M]`eMeGe/f$gghhr   rE   NrF   rE   r   r   r]   r]   j   s    iV i ir   r]   	QuickGELUc                   *    \ rS rSrSrS\S\4S jrSrg)QuickGELUActivationt   zj
Applies GELU approximation that is fast but somewhat inaccurate. See: https://github.com/hendrycks/GELUs
r   r    c                 :    U[         R                  " SU-  5      -  $ )NgZd;?)r(   sigmoidr-   s     r   r2   QuickGELUActivation.forwardz   s    u}}UU]333r   rE   NrF   rE   r   r   rc   rc   t   s    4V 4 4r   rc   c                   J   ^  \ rS rSrSrS\S\4U 4S jjrS\S\4S jrS	r	U =r
$ )
ClippedGELUActivation~   ar  
Clip the range of possible GeLU outputs between [min, max]. This is especially useful for quantization purpose, as
it allows mapping negatives values in the GeLU spectrum. For more information on this trick, please refer to
https://huggingface.co/papers/2004.09602.

Gaussian Error Linear Unit. Original Implementation of the gelu activation function in Google Bert repo when
initially created.

For information: OpenAI GPT's gelu is slightly different (and gives slightly different results): 0.5 * x * (1 +
torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))). See https://huggingface.co/papers/1606.08415
minmaxc                 h   > X:  a  [        SU SU S35      e[        TU ]	  5         Xl        X l        g )Nzmin should be < max (got min: z, max: ))
ValueErrorr   r   rk   rl   )r   rk   rl   r   s      r   r   ClippedGELUActivation.__init__   s8    9=cU'#aPQQr   xr    c                 l    [         R                  " [        U5      U R                  U R                  5      $ r0   )r(   clipr   rk   rl   )r   rq   s     r   r2   ClippedGELUActivation.forward   s!    zz$q'488TXX66r   )rl   rk   )r6   r7   r8   r9   r:   floatr   r   r2   r<   r=   r>   s   @r   ri   ri   ~   s3    
E  7 7F 7 7r   ri   c                   >   ^  \ rS rSrSrU 4S jrS\S\4S jrSrU =r	$ )AccurateGELUActivation   z
Applies GELU approximation that is faster than default and more accurate than QuickGELU. See:
https://github.com/hendrycks/GELUs

Implemented along with MEGA (Moving Average Equipped Gated Attention)
c                 z   > [         TU ]  5         [        R                  " S[        R                  -  5      U l        g )N   )r   r   r)   r*   r+   precomputed_constantr   r   s    r   r   AccurateGELUActivation.__init__   s'    $(IIa$''k$:!r   r   r    c                     SU-  S[         R                  " U R                  US[         R                  " US5      -  -   -  5      -   -  $ )Nr#   r   r&      )r(   r   r{   r,   r-   s     r   r2   AccurateGELUActivation.forward   sE    U{a%**T-F-F%RZ]b]f]fglno]pRpJp-q"rrssr   )r{   )
r6   r7   r8   r9   r:   r   r   r2   r<   r=   r>   s   @r   rw   rw      s)    ;tV t t tr   rw   c                   P   ^  \ rS rSrSrU 4S jrS\S\4S jrS\S\4S jrSr	U =r
$ )	MishActivation   z
See Mish: A Self-Regularized Non-Monotonic Activation Function (Misra., https://huggingface.co/papers/1908.08681). Also
visit the official repository for the paper: https://github.com/digantamisra98/Mish
c                 `   > [         TU ]  5         [        R                  R                  U l        g r0   )r   r   r   r   mishr   r|   s    r   r   MishActivation.__init__   s    ==%%r   r   r    c                 n    U[         R                  " [        R                  R	                  U5      5      -  $ r0   )r(   r   r   r   softplusr-   s     r   _mish_pythonMishActivation._mish_python   s%    uzz"--"8"8"?@@@r   c                 $    U R                  U5      $ r0   r1   r-   s     r   r2   MishActivation.forward   r4   r   r1   )r6   r7   r8   r9   r:   r   r   r   r2   r<   r=   r>   s   @r   r   r      s;    
&A& AV AV   r   r   c                   *    \ rS rSrSrS\S\4S jrSrg)LinearActivation   zS
Applies the linear activation function, i.e. forwarding input directly to output.
r   r    c                     U$ r0   rE   r-   s     r   r2   LinearActivation.forward   s    r   rE   NrF   rE   r   r   r   r      s    V  r   r   c                   "    \ rS rSrSrSS jrSrg)LaplaceActivation   z
Applies elementwise activation based on Laplace function, introduced in MEGA as an attention activation. See
https://huggingface.co/papers/2209.10655

Inspired by squared relu, but with bounded range and gradient for better stability
c                     X-
  R                  U[        R                  " S5      -  5      nSS[        R                  " U5      -   -  $ )Nr%   r#   r$   )divr)   r*   r(   rP   )r   r   musigmas       r   r2   LaplaceActivation.forward   s:      3!78cEIIe,,--r   rE   N)g۞?g ^/?r6   r7   r8   r9   r:   r2   r<   rE   r   r   r   r      s    .r   r   c                       \ rS rSrSrS rSrg)ReLUSquaredActivation   zV
Applies the relu^2 activation introduced in https://huggingface.co/papers/2109.08668
c                 p    [         R                  R                  U5      n[        R                  " U5      nU$ r0   )r   r   relur(   square)r   r   relu_appliedsquareds       r   r2   ReLUSquaredActivation.forward   s)    }}))%0,,|,r   rE   Nr   rE   r   r   r   r      s    r   r   c                       \ rS rSrSrS rSrg)SqrtSoftplusActivation   uF   sqrt(softplus(x)) — the router scoring function used by DeepSeek V4.c                 \    [         R                  R                  U5      R                  5       $ r0   )r   r   r   r*   r-   s     r   r2   SqrtSoftplusActivation.forward   s     }}%%e,1133r   rE   Nr   rE   r   r   r   r      s
    P4r   r   c                   (   ^  \ rS rSrU 4S jrSrU =r$ )ClassInstantier   c                 l   > [         TU ]  U5      n[        U[        5      (       a  UOU0 4u  p4U" S0 UD6$ )NrE   )r   __getitem__
isinstancetuple)r   keycontentclskwargsr   s        r   r   ClassInstantier.__getitem__   s7    '%c*!+GU!;!;g'2}V}r   rE   )r6   r7   r8   r9   r   r<   r=   r>   s   @r   r   r      s     r   r   c                      ^  \ rS rSrSrSSSS\R                  S4U 4S jjrS\S	\4S
 jr	S\S	\4S jr
S\S	\4S jrSrU =r$ )XIELUActivation   z
Applies the xIELU activation function introduced in https://arxiv.org/abs/2411.13010

If the user has installed the nickjbrowning/XIELU wheel, we import xIELU CUDA
Otherwise, we emit a single warning and use xIELU Python
g?r#   gưFc                 <  > [         TU ]  5         [        R                  " [        R
                  " [        R                  " [        R                  " XS95      5      R                  S5      5      U l	        [        R                  " [        R
                  " [        R                  " [        R                  " X#-
  US95      5      R                  S5      5      U l
        U R                  S[        R                  " X5S95        U R                  S[        R                  " XES95        X`l        [        U5      U l        [        U5      U l        S U l         SS Kn[        R$                  R&                  R)                  5       U l        Sn SSKJn	  U	" U R.                  5      U l        US-  n[4        R7                  U5        g ! [2         a$  n
USU
 S	3-  nU R.                  U l         S n
A
N?S n
A
ff = f! [2         a#  n
[4        R7                  S
U
 S35         S n
A
g S n
A
ff = f)N)dtyper   betaepszUsing experimental xIELU CUDA.)allow_in_graphz& Enabled torch._dynamo for xIELU CUDA.z+ Could not enable torch._dynamo for xIELU (z*) - this may result in slower performance.z CUDA-fused xIELU not available (u   ) – falling back to a Python version.
For CUDA xIELU (experimental), `pip install git+https://github.com/nickjbrowning/XIELU`)r   r   r   	Parameterr(   logexpm1tensor	unsqueezealpha_palpha_nregister_bufferwith_vector_loadsru   _beta_scalar_eps_scalar_xielu_cuda_obj	xielu.opsclassesxieluXIELUtorch.compilerr   _xielu_cuda_xielu_cuda_fn	Exceptionloggerwarning_once)r   alpha_p_initalpha_n_initr   r   r   r   r   msgr   errr   s              r   r   XIELUActivation.__init__   s    	||EIIekk%,,|:a.b$c$m$mno$pq||IIekk%,,|/B%"PQR\\]^_
 	VU\\$%DEUELL$BC!2!$K :#	#(==#6#6#<#<#>D 2C79&4T5E5E&F#?? $  7DSEIstt&*&6&6##7  	23% 8j j 	sB   3G. "F= 'G. =
G+G&!G. &G++G. .
H8HHrq   r    c           
         [         R                  R                  U R                  5      nU R                  [         R                  R                  U R
                  5      -   n[        R                  " US:  X!-  U-  U R                  U-  -   [        R                  " [        R                  " XR                  5      5      U-
  U-  U R                  U-  -   5      $ )Nr   )r   r   r   r   r   r   r(   wherer   rk   r   )r   rq   r   r   s       r   _xielu_pythonXIELUActivation._xielu_python  s    --((6))bmm44T\\BB{{EK!Odii!m+[[1hh/014?$))a-O
 	
r   c                    UR                   nUR                  5       S:  a'  UR                  S5      nUR                  5       S:  a  M'  UR                  5       S:  a"  UR                  SSUR	                  S5      5      nX!R                   :w  a!  [
        R                  SUUR                   5        U R                  R                  UU R                  R                  UR                  5      U R                  R                  UR                  5      U R                  U R                  U R                  5      nUR                  U5      $ )zDFirewall function to prevent torch.compile from seeing .item() callsr   r   r   z_Warning: xIELU input tensor expects 3 dimensions but got (shape: %s). Reshaping to (shape: %s).)shapedimr   viewsizer   r   r   r2   r   tor   r   r   r   r   )r   rq   original_shaperesults       r   r   XIELUActivation._xielu_cuda"  s    eegkAA eegk557Q;r1affRj)AWW$q
 %%--LLOOAGG$LLOOAGG$""
 {{>**r   r   c                     U R                   bF  UR                  (       a5  [        5       (       d  U R                  U5      $ [        R                  S5        U R                  U5      $ )Nz:torch._dynamo is compiling, using Python version of xIELU.)r   is_cudar	   r   r   r   r   r-   s     r   r2   XIELUActivation.forward;  sN    ++--**511##$`a!!%((r   )r   r   r   r   r   r   r   )r6   r7   r8   r9   r:   r(   bfloat16r   r   r   r   r2   r<   r=   r>   s   @r   r   r      sd     nn(T
v 
& 
+V + +2)V ) ) )r   r   r   gelu_10i
   )rk   rl   	gelu_fastgelu_newgelu_pythonrK   Tgelu_pytorch_tanhgelu_python_tanhr   gelu_accurate	hardswishlaplace
leaky_relulinearr   
quick_gelur   relu2relu6)rf   rY   sqrtsoftplusswishr   prelur   c           	          U [         ;   a	  [         U    $ [        SU  S[        [         R                  5       5       35      e)Nz	function z not found in ACT2FN mapping )ACT2FNKeyErrorlistkeys)activation_strings    r   get_activationr  a  sB    F"'((#4"55RSWX^XcXcXeSfRghiir   rY   )5r   r)   collectionsr   r(   r   r   integrations.hub_kernelsr   utilsr   utils.import_utilsr	   
get_loggerr6   r   Moduler   PytorchGELUTanhrA   rI   rV   r]   rc   ri   rw   r   r   r   r   r   r   r   	Hardswish	LeakyReLUReLUReLU6SigmoidrT   TanhPReLUACT2CLSr   r  r   r   r   r   r   r   rY   r   
linear_actrE   r   r   <module>r     sJ     #   A  8 
		H	% Z(ryy  )0  Y'w		 w (w V$RYY  %, V$
)RYY 
) %
) Z(i i )i [)4")) 4 *47BII 72tRYY t RYY "ryy 
.		 
.BII 4RYY 4k Z)bii Z)z
N%s2'>? # !	
 N%6$=>  $:D#AB +    ",,  N % BGG  "!" RXX#$ zz*WWGGXX14 
	!j ]+*%f;'	"#67 L)
ffH%
r   