
    Z jP                        S r SSKJr  SSKJr  SSKrSSKJr  SSKJrJ	r	J
r
  SSKJr  SS	KJrJr  \" 5       (       a  SS
KJr  SSKJr  SSKJr  SSKJrJrJrJrJrJrJrJrJ r   SSK!J"r"  SSK#J$r$  SSKJ%r%  SSK&J'r'  \%RP                  " \)5      r*S r+S r,S r- " S S\R\                  5      r/ " S S\R\                  5      r0 " S S\R\                  5      r1 " S S\R\                  5      r2 " S S\R\                  5      r3 " S  S!\R\                  5      r4 " S" S#\5      r5 " S$ S%\R\                  5      r6 " S& S'\R\                  5      r7 " S( S)\R\                  5      r8 " S* S+\R\                  5      r9 " S, S-\R\                  5      r: " S. S/\R\                  5      r; " S0 S1\R\                  5      r<\ " S2 S3\"5      5       r=\" S4S59\ " S6 S7\5      5       5       r>\ " S8 S9\=5      5       r?\" S:S59 " S; S<\=5      5       r@\ " S= S>\=5      5       rA\" S?S59 " S@ SA\=5      5       rB\" SBS59 " SC SD\=5      5       rC\ " SE SF\=5      5       rD\ " SG SH\=5      5       rE\ " SI SJ\=5      5       rF/ SKQrGg)LzPyTorch FNet model.    )	dataclass)partialN)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )initialization)auto_docstringis_scipy_available)linalg)ACT2FN)GradientCheckpointingLayer)	BaseModelOutputBaseModelOutputWithPoolingMaskedLMOutputModelOutputMultipleChoiceModelOutputNextSentencePredictorOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel)apply_chunking_to_forward)logging   )
FNetConfigc                     U R                   S   nUSU2SU24   nU R                  [        R                  5      n [        R                  " SXU5      $ )z4Applies 2D matrix multiplication to 3D input arrays.r   Nzbij,jk,ni->bnk)shapetypetorch	complex64einsum)xmatrix_dim_onematrix_dim_two
seq_lengths       w/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/fnet/modeling_fnet.py_two_dim_matmulr)   5   sL    J#KZK*$<=N	uA<<(!^LL    c                     [        XU5      $ N)r)   )r$   r%   r&   s      r(   two_dim_matmulr-   >   s    1n==r*   c                     U n[        [        U R                  5      SS 5       H   n[        R                  R	                  XS9nM"     U$ )z
Applies n-dimensional Fast Fourier Transform (FFT) to input array.

Args:
    x: Input n-dimensional array.

Returns:
    n-dimensional Fourier transform of input n-dimensional array.
r   N)axis)reversedrangendimr!   fft)r$   outr/   s      r(   fftnr5   C   s@     Cqvvqr*+iimmCm+ ,Jr*   c                   6   ^  \ rS rSrSrU 4S jrSS jrSrU =r$ )FNetEmbeddingsS   zGConstruct the embeddings from word, position and token_type embeddings.c                 j  > [         TU ]  5         [        R                  " UR                  UR
                  UR                  S9U l        [        R                  " UR                  UR
                  5      U l	        [        R                  " UR                  UR
                  5      U l        [        R                  " UR
                  UR                  S9U l        [        R                  " UR
                  UR
                  5      U l        [        R                   " UR"                  5      U l        U R'                  S[(        R*                  " UR                  5      R-                  S5      SS9  U R'                  S[(        R.                  " U R0                  R3                  5       [(        R4                  S9SS9  g )	N)padding_idxepsposition_idsr   F)
persistenttoken_type_idsdtype)super__init__r   	Embedding
vocab_sizehidden_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsLinear
projectionDropouthidden_dropout_probdropoutregister_bufferr!   arangeexpandzerosr=   sizelongselfconfig	__class__s     r(   rE   FNetEmbeddings.__init__V   s:   !||F,=,=v?Q?Q_e_r_rs#%<<0N0NPVPbPb#c %'\\&2H2H&J\J\%]"f&8&8f>S>ST))F$6$68J8JKzz&"<"<= 	ELL)G)GHOOPWXej 	 	
 	ekk$*;*;*@*@*B%**Ubg 	 	
r*   c                 b   Ub  UR                  5       nOUR                  5       S S nUS   nUc  U R                  S S 2S U24   nUcv  [        U S5      (       a-  U R                  S S 2S U24   nUR	                  US   U5      nUnO8[
        R                  " U[
        R                  U R                  R                  S9nUc  U R                  U5      nU R                  U5      n	XI-   n
U R                  U5      nX-  n
U R                  U
5      n
U R                  U
5      n
U R                  U
5      n
U
$ )Nr?   r   rA   r   rC   device)rZ   r=   hasattrrA   rX   r!   rY   r[   rc   rJ   rN   rL   rO   rR   rU   )r]   	input_idsrA   r=   inputs_embedsinput_shaper'   buffered_token_type_ids buffered_token_type_ids_expandedrN   
embeddingsrL   s               r(   forwardFNetEmbeddings.forwardj   s:    #..*K',,.s3K ^
,,Q^<L
 !t-..*.*=*=a*n*M'3J3Q3QR]^_R`bl3m0!A!&[

SWSdSdSkSk!l  00;M $ : :> J":
"66|D)
^^J/
__Z0
\\*-
r*   )rO   rU   rL   rR   rN   rJ   )NNNN)	__name__
__module____qualname____firstlineno____doc__rE   rk   __static_attributes____classcell__r_   s   @r(   r7   r7   S   s    Q
(! !r*   r7   c                   4   ^  \ rS rSrU 4S jrS rS rSrU =r$ )FNetBasicFourierTransform   c                 D   > [         TU ]  5         U R                  U5        g r,   )rD   rE   _init_fourier_transformr\   s     r(   rE   "FNetBasicFourierTransform.__init__   s    $$V,r*   c                    UR                   (       d(  [        [        R                  R                  SS9U l        g UR                  S::  a  [        5       (       a  U R                  S[        R                  " [        R                  " UR                  5      [        R                  S95        U R                  S[        R                  " [        R                  " UR                  5      [        R                  S95        [        [        U R                   U R"                  S9U l        g [$        R&                  " S5        [        U l        g [        U l        g )	N)r      dim   dft_mat_hiddenrB   dft_mat_seq)r%   r&   zpSciPy is needed for DFT matrix calculation and is not found. Using TPU optimized fast fourier transform instead.)use_tpu_fourier_optimizationsr   r!   r3   r5   fourier_transformrK   r   rV   tensorr   dftrH   r"   tpu_short_seq_lengthr-   r   r   r   warning)r]   r^   s     r(   ry   1FNetBasicFourierTransform._init_fourier_transform   s    33%,UYY^^%HD"++t3!##$$$ell6::f>P>P3QY^YhYh&i $$!5<<

6;V;V0W_d_n_n#o *1"43C3CTXTgTg*& * *.&%)D"r*   c                 >    U R                  U5      R                  nU4$ r,   )r   real)r]   hidden_statesoutputss      r(   rk   !FNetBasicFourierTransform.forward   s"     ((7<<zr*   )r   )	rm   rn   ro   rp   rE   ry   rk   rr   rs   rt   s   @r(   rv   rv      s    -*. r*   rv   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )FNetBasicOutput   c                 ~   > [         TU ]  5         [        R                  " UR                  UR
                  S9U l        g Nr;   )rD   rE   r   rO   rH   rP   r\   s     r(   rE   FNetBasicOutput.__init__   s,    f&8&8f>S>STr*   c                 ,    U R                  X!-   5      nU$ r,   rO   r]   r   input_tensors      r(   rk   FNetBasicOutput.forward   s    |'CDr*   r   rm   rn   ro   rp   rE   rk   rr   rs   rt   s   @r(   r   r      s    U r*   r   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )FNetFourierTransform   c                 b   > [         TU ]  5         [        U5      U l        [	        U5      U l        g r,   )rD   rE   rv   r]   r   outputr\   s     r(   rE   FNetFourierTransform.__init__   s&    -f5	%f-r*   c                 X    U R                  U5      nU R                  US   U5      nU4nU$ Nr   )r]   r   )r]   r   self_outputsfourier_outputr   s        r(   rk   FNetFourierTransform.forward   s1    yy/\!_mD!#r*   )r   r]   r   rt   s   @r(   r   r      s    .
 r*   r   c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )FNetIntermediate   c                   > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        UR                  [        5      (       a  [        UR                     U l        g UR                  U l        g r,   )rD   rE   r   rQ   rH   intermediate_sizedense
isinstance
hidden_actstrr   intermediate_act_fnr\   s     r(   rE   FNetIntermediate.__init__   s`    YYv1163K3KL
f''--'-f.?.?'@D$'-'8'8D$r*   r   returnc                 J    U R                  U5      nU R                  U5      nU$ r,   r   r   r]   r   s     r(   rk   FNetIntermediate.forward   s&    

=100?r*   r   
rm   rn   ro   rp   rE   r!   Tensorrk   rr   rs   rt   s   @r(   r   r      s(    9U\\ ell  r*   r   c                   z   ^  \ rS rSrU 4S jrS\R                  S\R                  S\R                  4S jrSrU =r	$ )
FNetOutput   c                 (  > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        R                  " UR
                  UR                  S9U l        [        R                  " UR                  5      U l        g r   )rD   rE   r   rQ   r   rH   r   rO   rP   rS   rT   rU   r\   s     r(   rE   FNetOutput.__init__   s`    YYv779K9KL
f&8&8f>S>STzz&"<"<=r*   r   r   r   c                 p    U R                  U5      nU R                  U5      nU R                  X-   5      nU$ r,   )r   rU   rO   r   s      r(   rk   FNetOutput.forward   s5    

=1]3}'CDr*   )rO   r   rU   r   rt   s   @r(   r   r      s6    >U\\  RWR^R^  r*   r   c                   4   ^  \ rS rSrU 4S jrS rS rSrU =r$ )	FNetLayer   c                    > [         TU ]  5         UR                  U l        SU l        [	        U5      U l        [        U5      U l        [        U5      U l	        g Nr   )
rD   rE   chunk_size_feed_forwardseq_len_dimr   fourierr   intermediater   r   r\   s     r(   rE   FNetLayer.__init__   sI    '-'E'E$+F3,V4 (r*   c                     U R                  U5      nUS   n[        U R                  U R                  U R                  U5      nU4nU$ r   )r   r   feed_forward_chunkr   r   )r]   r   self_fourier_outputsr   layer_outputr   s         r(   rk   FNetLayer.forward   sO    #||M:-a00##T%A%A4CSCSUc
  /r*   c                 J    U R                  U5      nU R                  X!5      nU$ r,   )r   r   )r]   r   intermediate_outputr   s       r(   r   FNetLayer.feed_forward_chunk   s(    "//?{{#6Gr*   )r   r   r   r   r   )	rm   rn   ro   rp   rE   rk   r   rr   rs   rt   s   @r(   r   r      s    )
 r*   r   c                   2   ^  \ rS rSrU 4S jrSS jrSrU =r$ )FNetEncoderi  c                    > [         TU ]  5         Xl        [        R                  " [        UR                  5       Vs/ s H  n[        U5      PM     sn5      U l        SU l	        g s  snf )NF)
rD   rE   r^   r   
ModuleListr1   num_hidden_layersr   layergradient_checkpointing)r]   r^   _r_   s      r(   rE   FNetEncoder.__init__  sR    ]]uVE]E]?^#_?^!If$5?^#_`
&+# $`s   A&c                     U(       a  SOS n[        U R                  5       H  u  pVU(       a  XA4-   nU" U5      nUS   nM      U(       a  XA4-   nU(       d  [        S X4 5       5      $ [        XS9$ )N r   c              3   .   #    U  H  oc  M  Uv   M     g 7fr,   r   ).0vs     r(   	<genexpr>&FNetEncoder.forward.<locals>.<genexpr>  s     X$Fq$Fs   	)last_hidden_stater   )	enumerater   tupler   )r]   r   output_hidden_statesreturn_dictall_hidden_statesilayer_modulelayer_outputss           r(   rk   FNetEncoder.forward  sw    "6BD(4OA#$58H$H!(7M)!,M  5   14D DX]$FXXX``r*   )r^   r   r   )FTr   rt   s   @r(   r   r     s    ,a ar*   r   c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )
FNetPooleri!  c                    > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " 5       U l        g r,   )rD   rE   r   rQ   rH   r   Tanh
activationr\   s     r(   rE   FNetPooler.__init__"  s9    YYv1163E3EF
'')r*   r   r   c                 \    US S 2S4   nU R                  U5      nU R                  U5      nU$ r   )r   r   )r]   r   first_token_tensorpooled_outputs       r(   rk   FNetPooler.forward'  s6     +1a40

#566r*   )r   r   r   rt   s   @r(   r   r   !  s(    $
U\\ ell  r*   r   c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )FNetPredictionHeadTransformi1  c                 p  > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        UR                  [        5      (       a  [        UR                     U l
        OUR                  U l
        [        R                  " UR                  UR                  S9U l        g r   )rD   rE   r   rQ   rH   r   r   r   r   r   transform_act_fnrO   rP   r\   s     r(   rE   $FNetPredictionHeadTransform.__init__2  s~    YYv1163E3EF
f''--$*6+<+<$=D!$*$5$5D!f&8&8f>S>STr*   r   r   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ r,   )r   r   rO   r   s     r(   rk   #FNetPredictionHeadTransform.forward;  s4    

=1--m<}5r*   )rO   r   r   r   rt   s   @r(   r   r   1  s)    UU\\ ell  r*   r   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )FNetLMPredictionHeadiB  c                   > [         TU ]  5         [        U5      U l        [        R
                  " UR                  UR                  5      U l        [        R                  " [        R                  " UR                  5      5      U l        g r,   )rD   rE   r   	transformr   rQ   rH   rG   decoder	Parameterr!   rY   biasr\   s     r(   rE   FNetLMPredictionHead.__init__C  sW    4V<yy!3!3V5F5FGLLV->->!?@	r*   c                 J    U R                  U5      nU R                  U5      nU$ r,   )r   r   r   s     r(   rk   FNetLMPredictionHead.forwardI  s$    }5]3r*   )r  r   r   r   rt   s   @r(   r   r   B  s    A r*   r   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )FNetOnlyMLMHeadiO  c                 B   > [         TU ]  5         [        U5      U l        g r,   )rD   rE   r   predictionsr\   s     r(   rE   FNetOnlyMLMHead.__init__P  s    /7r*   c                 (    U R                  U5      nU$ r,   r  )r]   sequence_outputprediction_scoress      r(   rk   FNetOnlyMLMHead.forwardT  s     ,,_=  r*   r  r   rt   s   @r(   r  r  O  s    8! !r*   r  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )FNetOnlyNSPHeadiZ  c                 n   > [         TU ]  5         [        R                  " UR                  S5      U l        g Nr|   )rD   rE   r   rQ   rH   seq_relationshipr\   s     r(   rE   FNetOnlyNSPHead.__init__[  s'     "		&*<*<a @r*   c                 (    U R                  U5      nU$ r,   r  )r]   r   seq_relationship_scores      r(   rk   FNetOnlyNSPHead.forward_  s    !%!6!6}!E%%r*   r  r   rt   s   @r(   r  r  Z  s    A& &r*   r  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )FNetPreTrainingHeadsie  c                    > [         TU ]  5         [        U5      U l        [        R
                  " UR                  S5      U l        g r  )rD   rE   r   r  r   rQ   rH   r  r\   s     r(   rE   FNetPreTrainingHeads.__init__f  s4    /7 "		&*<*<a @r*   c                 L    U R                  U5      nU R                  U5      nX44$ r,   r  r  )r]   r  r   r  r  s        r(   rk   FNetPreTrainingHeads.forwardk  s-     ,,_=!%!6!6}!E 88r*   r  r   rt   s   @r(   r  r  e  s    A
9 9r*   r  c                   <   ^  \ rS rSr% \\S'   SrSrU 4S jrSr	U =r
$ )FNetPreTrainedModeliq  r^   fnetTc                 F  > [         TU ]  U5        [        U[        5      (       a|  [        R
                  " UR                  [        R                  " UR                  R                  S   5      R                  S5      5        [        R                  " UR                  5        g g )Nr?   r>   )rD   _init_weightsr   r7   initcopy_r=   r!   rW   r   rX   zeros_rA   )r]   moduler_   s     r(   r$  !FNetPreTrainedModel._init_weightsw  so    f%fn--JJv**ELL9L9L9R9RSU9V,W,^,^_f,ghKK--. .r*   r   )rm   rn   ro   rp   r   __annotations__base_model_prefixsupports_gradient_checkpointingr$  rr   rs   rt   s   @r(   r!  r!  q  s    &*#/ /r*   r!  z0
    Output type of [`FNetForPreTraining`].
    )custom_introc                       \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Sr
\R                  S-  \S'   Sr\\R                     S-  \S'   Srg)	FNetForPreTrainingOutputi~  ar  
loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
    Total loss as the sum of the masked language modeling loss and the next sequence prediction
    (classification) loss.
prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
    Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
seq_relationship_logits (`torch.FloatTensor` of shape `(batch_size, 2)`):
    Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
    before SoftMax).
Nlossprediction_logitsseq_relationship_logitsr   r   )rm   rn   ro   rp   rq   r0  r!   FloatTensorr*  r1  r2  r   r   rr   r   r*   r(   r/  r/  ~  sd    	 &*D%

d
")26u((4/68<U..5<59M5**+d29r*   r/  c                      ^  \ rS rSrSrSU 4S jjrS rS r\      SS\	R                  S-  S\	R                  S-  S	\	R                  S-  S
\	R                  S-  S\S-  S\S-  S\\-  4S jj5       rSrU =r$ )	FNetModeli  z

The model can behave as an encoder, following the architecture described in [FNet: Mixing Tokens with Fourier
Transforms](https://huggingface.co/papers/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon.

c                    > [         TU ]  U5        Xl        [        U5      U l        [        U5      U l        U(       a  [        U5      OSU l        U R                  5         g)z^
add_pooling_layer (bool, *optional*, defaults to `True`):
    Whether to add a pooling layer
N)
rD   rE   r^   r7   rj   r   encoderr   pooler	post_init)r]   r^   add_pooling_layerr_   s      r(   rE   FNetModel.__init__  sK    
 	 (0"6*,=j(4 	r*   c                 .    U R                   R                  $ r,   rj   rJ   r]   s    r(   get_input_embeddingsFNetModel.get_input_embeddings  s    ...r*   c                 $    XR                   l        g r,   r=  )r]   values     r(   set_input_embeddingsFNetModel.set_input_embeddings  s    */'r*   Nre   rA   r=   rf   r   r   r   c                    Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  Ub  [        S5      eUb  UR	                  5       nUu  pO&Ub  UR	                  5       S S nUu  pO[        S5      eU R                   R
                  (       a+  U
S::  a%  U R                   R                  U
:w  a  [        S5      eUb  UR                  OUR                  nUcr  [        U R                  S5      (       a3  U R                  R                  S S 2S U
24   nUR                  X5      nUnO$[        R                  " U[        R                  US9nU R                  UUUUS9nU R                  UUUS	9nUS
   nU R                   b  U R!                  U5      OS nU(       d
  UU4USS  -   $ [#        UUUR$                  S9$ )NzDYou cannot specify both input_ids and inputs_embeds at the same timer?   z5You have to specify either input_ids or inputs_embedsr   zThe `tpu_short_seq_length` in FNetConfig should be set equal to the sequence length being passed to the model when using TPU optimizations.rA   rb   )re   r=   rA   rf   )r   r   r   r   )r   pooler_outputr   )r^   r   r   
ValueErrorrZ   r   r   rc   rd   rj   rA   rX   r!   rY   r[   r7  r8  r   r   )r]   re   rA   r=   rf   r   r   kwargsrg   
batch_sizer'   rc   rh   ri   embedding_outputencoder_outputsr  rF  s                     r(   rk   FNetModel.forward  s    %9$D $++JjJj 	 &1%<k$++BYBY ]%>cdd"#..*K%0"J
&',,.s3K%0"J
TUU KK55d"00J>; 
 &/%:!!@T@T!t(899*.//*H*HKZK*X'3J3Q3QR\3i0!A!&[

SY!Z??%)'	 + 
 ,,!5# ' 

 *!,8<8OO4UY#]3oab6III)-')77
 	
r*   )r^   rj   r7  r8  )T)NNNNNN)rm   rn   ro   rp   rq   rE   r?  rC  r   r!   
LongTensorr3  boolr   r   rk   rr   rs   rt   s   @r(   r5  r5    s     /0  .2260426,0#'D
##d*D
 ((4/D
 &&-	D

 ((4/D
 #TkD
 D[D
 
	 D
 D
r*   r5  z
    FNet Model with two heads on top as done during the pretraining: a `masked language modeling` head and a `next
    sentence prediction (classification)` head.
    c                   8  ^  \ rS rSrSSS.rU 4S jrS rS r\        SS	\	R                  S-  S
\	R                  S-  S\	R                  S-  S\	R                  S-  S\	R                  S-  S\	R                  S-  S\S-  S\S-  S\\-  4S jj5       rSrU =r$ )FNetForPreTrainingi  cls.predictions.bias&fnet.embeddings.word_embeddings.weightzcls.predictions.decoder.biaszcls.predictions.decoder.weightc                    > [         TU ]  U5        [        U5      U l        [	        U5      U l        U R                  5         g r,   )rD   rE   r5  r"  r  clsr9  r\   s     r(   rE   FNetForPreTraining.__init__	  s4     f%	'/ 	r*   c                 B    U R                   R                  R                  $ r,   rU  r  r   r>  s    r(   get_output_embeddings(FNetForPreTraining.get_output_embeddings      xx##+++r*   c                     XR                   R                  l        UR                  U R                   R                  l        g r,   rU  r  r   r  r]   new_embeddingss     r(   set_output_embeddings(FNetForPreTraining.set_output_embeddings  *    '5$$2$7$7!r*   Nre   rA   r=   rf   labelsnext_sentence_labelr   r   r   c	           	         Ub  UOU R                   R                  nU R                  UUUUUUS9n
U
SS u  pU R                  X5      u  pSnUbv  Ubs  [	        5       nU" UR                  SU R                   R                  5      UR                  S5      5      nU" UR                  SS5      UR                  S5      5      nUU-   nU(       d  X4U
SS -   nUb  U4U-   $ U$ [        UUUU
R                  S9$ )a  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
    config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
    loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
next_sentence_label (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair
    (see `input_ids` docstring) Indices should be in `[0, 1]`:

    - 0 indicates sequence B is a continuation of sequence A,
    - 1 indicates sequence B is a random sequence.

Example:

```python
>>> from transformers import AutoTokenizer, FNetForPreTraining
>>> import torch

>>> tokenizer = AutoTokenizer.from_pretrained("google/fnet-base")
>>> model = FNetForPreTraining.from_pretrained("google/fnet-base")
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
>>> outputs = model(**inputs)
>>> prediction_logits = outputs.prediction_logits
>>> seq_relationship_logits = outputs.seq_relationship_logits
```NrA   r=   rf   r   r   r|   r?   )r0  r1  r2  r   )	r^   r   r"  rU  r   viewrG   r/  r   )r]   re   rA   r=   rf   rc  rd  r   r   rH  r   r  r   r  r  
total_lossloss_fctmasked_lm_lossnext_sentence_lossr   s                       r(   rk   FNetForPreTraining.forward  s+   L &1%<k$++BYBY)))%'!5#  
 *1!&48HH_4\1
"5"A')H%&7&<&<RAWAW&XZ`ZeZefhZijN!)*@*E*Eb!*LNaNfNfgiNj!k'*<<J'@712;NF/9/EZMF*Q6Q'/$:!//	
 	
r*   rU  r"  NNNNNNNN)rm   rn   ro   rp   _tied_weights_keysrE   rY  r`  r   r!   r   rN  r   r/  rk   rr   rs   rt   s   @r(   rP  rP    s     )?*R
,8  *..2,0-1&*37,0#'C
<<$&C
 t+C
 llT)	C

 ||d*C
 t#C
 #\\D0C
 #TkC
 D[C
 
)	)C
 C
r*   rP  c                     ^  \ rS rSrSSS.rU 4S jrS rS r\       SS	\	R                  S-  S
\	R                  S-  S\	R                  S-  S\	R                  S-  S\	R                  S-  S\S-  S\S-  S\\-  4S jj5       rSrU =r$ )FNetForMaskedLMi`  rQ  rR  rS  c                    > [         TU ]  U5        [        U5      U l        [	        U5      U l        U R                  5         g r,   )rD   rE   r5  r"  r  rU  r9  r\   s     r(   rE   FNetForMaskedLM.__init__g  4     f%	"6* 	r*   c                 B    U R                   R                  R                  $ r,   rX  r>  s    r(   rY  %FNetForMaskedLM.get_output_embeddingsp  r[  r*   c                     XR                   R                  l        UR                  U R                   R                  l        g r,   r]  r^  s     r(   r`  %FNetForMaskedLM.set_output_embeddingss  rb  r*   Nre   rA   r=   rf   rc  r   r   r   c           	         Ub  UOU R                   R                  nU R                  UUUUUUS9n	U	S   n
U R                  U
5      nSnUbF  [	        5       nU" UR                  SU R                   R                  5      UR                  S5      5      nU(       d  U4U	SS -   nUb  U4U-   $ U$ [        XU	R                  S9$ )a{  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
    config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
    loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
Nrf  r   r?   r|   r0  logitsr   )	r^   r   r"  rU  r   rg  rG   r   r   )r]   re   rA   r=   rf   rc  r   r   rH  r   r  r  rj  ri  r   s                  r(   rk   FNetForMaskedLM.forwardw  s    $ &1%<k$++BYBY)))%'!5#  
 "!* HH_5')H%&7&<&<RAWAW&XZ`ZeZefhZijN')GABK7F3A3M^%.YSYY>[b[p[pqqr*   rm  NNNNNNN)rm   rn   ro   rp   ro  rE   rY  r`  r   r!   r   rN  r   r   rk   rr   rs   rt   s   @r(   rq  rq  `  s     )?*R
,8  *..2,0-1&*,0#'(r<<$&(r t+(r llT)	(r
 ||d*(r t#(r #Tk(r D[(r 
	(r (rr*   rq  zT
    FNet Model with a `next sentence prediction (classification)` head on top.
    c                     ^  \ rS rSrU 4S jr\       SS\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S	\S-  S
\S-  S\	\
-  4S jj5       rSrU =r$ )FNetForNextSentencePredictioni  c                    > [         TU ]  U5        [        U5      U l        [	        U5      U l        U R                  5         g r,   )rD   rE   r5  r"  r  rU  r9  r\   s     r(   rE   &FNetForNextSentencePrediction.__init__  rt  r*   Nre   rA   r=   rf   rc  r   r   r   c           	      Z   Ub  UOU R                   R                  nU R                  UUUUUUS9n	U	S   n
U R                  U
5      nSnUb2  [	        5       nU" UR                  SS5      UR                  S5      5      nU(       d  U4U	SS -   nUb  U4U-   $ U$ [        UUU	R                  S9$ )a  
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair
    (see `input_ids` docstring). Indices should be in `[0, 1]`:

    - 0 indicates sequence B is a continuation of sequence A,
    - 1 indicates sequence B is a random sequence.

Example:

```python
>>> from transformers import AutoTokenizer, FNetForNextSentencePrediction
>>> import torch

>>> tokenizer = AutoTokenizer.from_pretrained("google/fnet-base")
>>> model = FNetForNextSentencePrediction.from_pretrained("google/fnet-base")
>>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
>>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
>>> encoding = tokenizer(prompt, next_sentence, return_tensors="pt")
>>> outputs = model(**encoding, labels=torch.LongTensor([1]))
>>> logits = outputs.logits
>>> assert logits[0, 0] < logits[0, 1]  # next sentence was random
```Nrf  r   r?   r|   rz  )r^   r   r"  rU  r   rg  r   r   )r]   re   rA   r=   rf   rc  r   r   rH  r   r   seq_relationship_scoresrk  ri  r   s                  r(   rk   %FNetForNextSentencePrediction.forward  s    H &1%<k$++BYBY)))%'!5#  
  
"&((="9!')H!)*A*F*Fr1*Mv{{[]!_-/'!"+=F7I7U')F2a[aa*#*!//
 	
r*   rm  r}  )rm   rn   ro   rp   rE   r   r!   r   rN  r   r   rk   rr   rs   rt   s   @r(   r  r    s      *..2,0-1&*,0#'?
<<$&?
 t+?
 llT)	?

 ||d*?
 t#?
 #Tk?
 D[?
 
,	,?
 ?
r*   r  z
    FNet Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
    output) e.g. for GLUE tasks.
    c                     ^  \ rS rSrU 4S jr\       SS\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S	\S-  S
\S-  S\	\
-  4S jj5       rSrU =r$ )FNetForSequenceClassificationi  c                 0  > [         TU ]  U5        UR                  U l        [        U5      U l        [
        R                  " UR                  5      U l        [
        R                  " UR                  UR                  5      U l        U R                  5         g r,   rD   rE   
num_labelsr5  r"  r   rS   rT   rU   rQ   rH   
classifierr9  r\   s     r(   rE   &FNetForSequenceClassification.__init__  si      ++f%	zz&"<"<=))F$6$68I8IJ 	r*   Nre   rA   r=   rf   rc  r   r   r   c           	      4   Ub  UOU R                   R                  nU R                  UUUUUUS9n	U	S   n
U R                  U
5      n
U R	                  U
5      nSnUGb  U R                   R
                  c  U R                  S:X  a  SU R                   l        OoU R                  S:  aN  UR                  [        R                  :X  d  UR                  [        R                  :X  a  SU R                   l        OSU R                   l        U R                   R
                  S:X  aI  [        5       nU R                  S:X  a&  U" UR                  5       UR                  5       5      nOU" X5      nOU R                   R
                  S:X  a=  [        5       nU" UR                  SU R                  5      UR                  S5      5      nO,U R                   R
                  S:X  a  [        5       nU" X5      nU(       d  U4U	SS -   nUb  U4U-   $ U$ [!        XU	R"                  S	9$ )
ae  
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
Nrf  r   
regressionsingle_label_classificationmulti_label_classificationr?   r|   rz  )r^   r   r"  rU   r  problem_typer  rC   r!   r[   intr   squeezer   rg  r   r   r   )r]   re   rA   r=   rf   rc  r   r   rH  r   r   r{  r0  ri  r   s                  r(   rk   %FNetForSequenceClassification.forward  s   $ &1%<k$++BYBY)))%'!5#  
  
]3/{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#FNN$4fnn6FGD#F3D))-JJ+-B @&++b/R))-II,./Y,F)-)9TGf$EvE'TPWPePeffr*   r  rU   r"  r  r}  )rm   rn   ro   rp   rE   r   r!   r   rN  r   r   rk   rr   rs   rt   s   @r(   r  r    s    	  *..2,0-1&*,0#':g<<$&:g t+:g llT)	:g
 ||d*:g t#:g #Tk:g D[:g 
)	):g :gr*   r  c                     ^  \ rS rSrU 4S jr\       SS\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S	\S-  S
\S-  S\	\
-  4S jj5       rSrU =r$ )FNetForMultipleChoiceiE  c                    > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  5      U l        [        R                  " UR                  S5      U l
        U R                  5         g r   )rD   rE   r5  r"  r   rS   rT   rU   rQ   rH   r  r9  r\   s     r(   rE   FNetForMultipleChoice.__init__G  sV     f%	zz&"<"<=))F$6$6: 	r*   Nre   rA   r=   rf   rc  r   r   r   c           	         Ub  UOU R                   R                  nUb  UR                  S   OUR                  S   n	Ub!  UR                  SUR	                  S5      5      OSnUb!  UR                  SUR	                  S5      5      OSnUb!  UR                  SUR	                  S5      5      OSnUb1  UR                  SUR	                  S5      UR	                  S5      5      OSnU R                  UUUUUUS9n
U
S   nU R                  U5      nU R                  U5      nUR                  SU	5      nSnUb  [        5       nU" X5      nU(       d  U4U
SS -   nUb  U4U-   $ U$ [        XU
R                  S9$ )a  
input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
    Indices of input sequence tokens in the vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are input IDs?](../glossary#input-ids)
token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
    Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
    1]`:

    - 0 corresponds to a *sentence A* token,
    - 1 corresponds to a *sentence B* token.

    [What are token type IDs?](../glossary#token-type-ids)
position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
    Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
    config.max_position_embeddings - 1]`.

    [What are position IDs?](../glossary#position-ids)
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
    Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
    is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
    model's internal embedding lookup matrix.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
    num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
    `input_ids` above)
Nr   r?   rf  r|   rz  )r^   r   r   rg  rZ   r"  rU   r  r   r   r   )r]   re   rA   r=   rf   rc  r   r   rH  num_choicesr   r   r{  reshaped_logitsr0  ri  r   s                    r(   rk   FNetForMultipleChoice.forwardQ  s   T &1%<k$++BYBY,5,Aiooa(}GZGZ[\G]>G>SINN2y~~b'9:Y]	M[Mg,,R1D1DR1HImqGSG_|((\->->r-BCei ( r=#5#5b#9=;M;Mb;QR 	 )))%'!5#  
  
]3/ ++b+6')HO4D%''!"+5F)-)9TGf$EvE(dZaZoZoppr*   )r  rU   r"  r}  )rm   rn   ro   rp   rE   r   r!   r   rN  r   r   rk   rr   rs   rt   s   @r(   r  r  E  s      *..2,0-1&*,0#'Mq<<$&Mq t+Mq llT)	Mq
 ||d*Mq t#Mq #TkMq D[Mq 
*	*Mq Mqr*   r  c                     ^  \ rS rSrU 4S jr\       SS\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S	\S-  S
\S-  S\	\
-  4S jj5       rSrU =r$ )FNetForTokenClassificationi  c                 0  > [         TU ]  U5        UR                  U l        [        U5      U l        [
        R                  " UR                  5      U l        [
        R                  " UR                  UR                  5      U l        U R                  5         g r,   r  r\   s     r(   rE   #FNetForTokenClassification.__init__  si      ++f%	zz&"<"<=))F$6$68I8IJ 	r*   Nre   rA   r=   rf   rc  r   r   r   c           	         Ub  UOU R                   R                  nU R                  UUUUUUS9n	U	S   n
U R                  U
5      n
U R	                  U
5      nSnUb<  [        5       nU" UR                  SU R                  5      UR                  S5      5      nU(       d  U4U	SS -   nUb  U4U-   $ U$ [        XU	R                  S9$ )z
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
Nrf  r   r?   r|   rz  )
r^   r   r"  rU   r  r   rg  r  r   r   )r]   re   rA   r=   rf   rc  r   r   rH  r   r  r{  r0  ri  r   s                  r(   rk   "FNetForTokenClassification.forward  s      &1%<k$++BYBY)))%'!5#  
 "!*,,71')HFKKDOO<fkk"oNDY,F)-)9TGf$EvE$$WMbMbccr*   r  r}  )rm   rn   ro   rp   rE   r   r!   r   rN  r   r   rk   rr   rs   rt   s   @r(   r  r    s    
  *..2,0-1&*,0#')d<<$&)d t+)d llT)	)d
 ||d*)d t#)d #Tk)d D[)d 
&	&)d )dr*   r  c                   "  ^  \ rS rSrU 4S jr\        SS\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S	\R                  S-  S
\S-  S\S-  S\	\
-  4S jj5       rSrU =r$ )FNetForQuestionAnsweringi  c                    > [         TU ]  U5        UR                  U l        [        U5      U l        [
        R                  " UR                  UR                  5      U l        U R                  5         g r,   )
rD   rE   r  r5  r"  r   rQ   rH   
qa_outputsr9  r\   s     r(   rE   !FNetForQuestionAnswering.__init__  sS      ++f%	))F$6$68I8IJ 	r*   Nre   rA   r=   rf   start_positionsend_positionsr   r   r   c	           	         Ub  UOU R                   R                  nU R                  UUUUUUS9n
U
S   nU R                  U5      nUR	                  SSS9u  pUR                  S5      R                  5       nUR                  S5      R                  5       nS nUb  Ub  [        UR                  5       5      S:  a  UR                  S5      n[        UR                  5       5      S:  a  UR                  S5      nUR                  S5      nUR                  SU5      nUR                  SU5      n[        US9nU" X5      nU" X5      nUU-   S-  nU(       d  X4U
SS  -   nUb  U4U-   $ U$ [        XXR                  S9$ )	Nrf  r   r   r?   r}   )ignore_indexr|   )r0  start_logits
end_logitsr   )r^   r   r"  r  splitr  
contiguouslenrZ   clampr   r   r   )r]   re   rA   r=   rf   r  r  r   r   rH  r   r  r{  r  r  rh  ignored_indexri  
start_lossend_lossr   s                        r(   rk    FNetForQuestionAnswering.forward  s    &1%<k$++BYBY)))%'!5#  
 "!*1#)<<r<#: #++B/::<''+668

&=+D?'')*Q."1"9"9""==%%'(1, - 5 5b 9(--a0M-33A}EO)//=AM']CH!,@J
:H$x/14J"/'!"+=F/9/EZMF*Q6Q+:]r]r
 	
r*   )r"  r  r  rn  )rm   rn   ro   rp   rE   r   r!   r   rN  r   r   rk   rr   rs   rt   s   @r(   r  r    s    	  *..2,0-1/3-1,0#'5
<<$&5
 t+5
 llT)	5

 ||d*5
 ,5
 ||d*5
 #Tk5
 D[5
 
-	-5
 5
r*   r  )
rq  r  r  rP  r  r  r  r   r5  r!  )Hrq   dataclassesr   	functoolsr   r!   r   torch.nnr   r   r    r
   r%  utilsr   r   scipyr   activationsr   modeling_layersr   modeling_outputsr   r   r   r   r   r   r   r   r   modeling_utilsr   pytorch_utilsr   r   configuration_fnetr   
get_loggerrm   loggerr)   r-   r5   Moduler7   rv   r   r   r   r   r   r   r   r   r   r  r  r  r!  r/  r5  rP  rq  r  r  r  r  r  __all__r   r*   r(   <module>r     s    !    A A & 7  ! 9
 
 
 . 6  * 
		H	%M>
 8RYY 8v#		 #Lbii 
299 
ryy   * 6a")) a8  ")) "
299 
!bii !&bii &	9299 	9 	// 	/ 	/ 
 :{ : :$ c
# c
 c
L Z
, Z
Z
z ?r) ?r ?rD 
J
$7 J

J
Z Gg$7 GgGgT Yq/ Yq Yqx 7d!4 7d 7dt B
2 B
 B
Jr*   