
    Z jJK                       S r SSKrSSKJr  SSKJr  SSKrSSKJr  SSK	J
r
  SSKJr  SSKJr  SS	KJr  SS
KJr  SSKJr  SSKJrJrJr  SSKJrJrJr  SSKJr  \R@                  " \!5      r" " S S\RF                  5      r$ " S S\RF                  5      r% " S S\RF                  5      r& " S S\RF                  5      r' " S S\RF                  5      r( " S S\RF                  5      r)  SzS\RF                  S\RT                  S\RT                  S\RT                  S \RT                  S-  S!\+S-  S"\+S#\\   4S$ jjr, " S% S&\RF                  5      r- " S' S(\RF                  5      r. " S) S*\RF                  5      r/ " S+ S,\RF                  5      r0 " S- S.\RF                  5      r1 " S/ S0\RF                  5      r2 " S1 S2\RF                  5      r3\ " S3 S4\
5      5       r4 " S5 S6\RF                  5      r5   S{S7\RT                  S8\+S9\6S-  S:\7S;\84
S< jjr9  S|S7\RT                  S=\6\8-  S9\6S-  S;\84S> jjr: " S? S@\RF                  5      r; " SA SB\RF                  5      r< " SC SD\RF                  5      r= " SE SF\RF                  5      r> " SG SH\RF                  5      r?\" SISJ9\ " SK SL\5      5       5       r@ " SM SN\45      rA\" SOSJ9\ " SP SQ\5      5       5       rB\" SRSJ9 " SS ST\45      5       rC\" SUSJ9\ " SV SW\5      5       5       rD\" SXSJ9 " SY SZ\45      5       rE\" S[SJ9\ " S\ S]\5      5       5       rF\" S^SJ9\ " S_ S`\5      5       5       rG\" S^SJ9\ " Sa Sb\5      5       5       rHSc\R                  R                  Sd\RT                  Se\RT                  4Sf jrKS}Sg\RT                  Sh\RT                  S-  Se\RT                  4Si jjrL " Sj Sk\45      rM\" SlSJ9\ " Sm Sn\5      5       5       rN " So Sp\45      rO\" SqSJ9\ " Sr Ss\5      5       5       rP " St Su\RF                  5      rQ\" SvSJ9 " Sw Sx\45      5       rR/ SyQrSg)~zPyTorch PatchTSMixer model.    N)Callable)	dataclass)PreTrainedModel)ModelOutput   )initialization)FlashAttentionKwargs)ALL_ATTENTION_FUNCTIONS)Unpack)NegativeBinomialOutputNormalOutputStudentTOutput)TransformersKwargsauto_docstringlogging   )PatchTSMixerConfigc                   >   ^  \ rS rSrSrS\S\4U 4S jjrS rSrU =r	$ )PatchTSMixerGatedAttention&   z
Module that applies gated attention to input data.

Args:
    in_size (`int`): The input size.
    out_size (`int`): The output size.
in_sizeout_sizec                    > [         TU ]  5         [        R                  " X5      U l        [        R
                  " SS9U l        g )Ndim)super__init__nnLinear
attn_layerSoftmaxattn_softmax)selfr   r   	__class__s      ڇ/root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/patchtsmixer/modeling_patchtsmixer.pyr   #PatchTSMixerGatedAttention.__init__/   s/    ))G6JJ2.    c                 N    U R                  U R                  U5      5      nX-  nU$ N)r#   r!   )r$   inputsattn_weights      r&   forward"PatchTSMixerGatedAttention.forward4   s(    ''(?@%r(   )r!   r#   )
__name__
__module____qualname____firstlineno____doc__intr   r-   __static_attributes____classcell__r%   s   @r&   r   r   &   s%    / /s /
 r(   r   c                   V   ^  \ rS rSrSrS\4U 4S jjrS\R                  4S jr	Sr
U =r$ )PatchTSMixerBatchNorm;   zH
Compute batch normalization over the sequence length (time) dimension.
configc                 ~   > [         TU ]  5         [        R                  " UR                  UR
                  S9U l        g )Neps)r   r   r   BatchNorm1dd_modelnorm_eps	batchnormr$   r;   r%   s     r&   r   PatchTSMixerBatchNorm.__init__@   s(    FOOLr(   r+   c                 l    UR                  SS5      nU R                  U5      nUR                  SS5      $ )z
Parameters:
    inputs (`torch.Tensor` of shape `(batch_size, sequence_length, d_model)`):
        input for Batch norm calculation
Returns:
    `torch.Tensor` of shape `(batch_size, sequence_length, d_model)`
r      )	transposerB   )r$   r+   outputs      r&   r-   PatchTSMixerBatchNorm.forwardD   s7     !!!Q''1%%r(   )rB   r/   r0   r1   r2   r3   r   r   torchTensorr-   r5   r6   r7   s   @r&   r9   r9   ;   s,    M1 M
&ell 
& 
&r(   r9   c                      ^  \ rS rSrSrS\4U 4S jjr\S\S\R                  4S j5       r
S\R                  4S jrS	rU =r$ )
PatchTSMixerPositionalEncodingQ   z
Class for positional encoding
r;   c                    > [         TU ]  5         UR                  (       a  U R                  U5      U l        g [
        R                  " [        R                  " UR                  UR                  5      5      U l        g r*   )r   r   use_positional_encoding_init_peposition_encr   	ParameterrK   zerosnum_patchesr@   rC   s     r&   r   'PatchTSMixerPositionalEncoding.__init__V   sN    )) $f 5D "U[[9K9KV^^-\ ]Dr(   returnc                 d   U R                   S:X  a@  [        R                  " [        R                  " U R
                  U R                  5      SS9nU$ U R                   S:X  Ga8  [        R                  " U R
                  U R                  5      n[        R                  " SU R
                  5      R                  S5      n[        R                  " [        R                  " SU R                  S5      [        R                  " S5      U R                  -  * -  5      n[        R                  " X#-  5      US S 2SS S24'   [        R                  " X#-  5      US S 2SS S24'   XR                  5       -
  nXR!                  5       S	-  -  n[        R                  " US
S9nU$ [#        U R                    S35      e)NrandomTrequires_gradsincosr   r   rF   g     @
   FzN is not a valid positional encoder. Available types are 'random' and 'sincos'.)positional_encoding_typer   rT   rK   randnrV   r@   rU   arange	unsqueezeexpmathlogsincosmeanstd
ValueError)r;   rS   positiondiv_terms       r&   rR   'PatchTSMixerPositionalEncoding._init_pe^   sn    **h6<<F4F4F(WgklL  ,,8 ;;v'9'96>>JL||Av'9'9:DDQGHyya!CQXHY\b\j\jHjFk!klH$)IIh.A$BLADqD!$)IIh.A$BLADqD!'*;*;*==L'+;+;+=+BCL<<EJL
  223  4B  C r(   patch_inputc                 "    XR                   -   nU$ r*   rS   )r$   rn   hidden_states      r&   r-   &PatchTSMixerPositionalEncoding.forwardr   s    "%6%66r(   rp   )r/   r0   r1   r2   r3   r   r   staticmethodr   rT   rR   rK   rL   r-   r5   r6   r7   s   @r&   rN   rN   Q   sS    ^1 ^ +   &5<<  r(   rN   c                   V   ^  \ rS rSrSrS\4U 4S jjrS\R                  4S jr	Sr
U =r$ )PatchTSMixerNormLayerx   zUNormalization block

Args:
    config (`PatchTSMixerConfig`):
        Configuration.
r;   c                    > [         TU ]  5         UR                  U l        SUR                  R                  5       ;   a  [	        U5      U l        g [        R                  " UR                  UR                  S9U l        g )Nbatchr=   )
r   r   norm_mlplowerr9   normr   	LayerNormr@   rA   rC   s     r&   r   PatchTSMixerNormLayer.__init__   sT    foo++---f5DIV^^IDIr(   r+   c                 l   SU R                   R                  5       ;   a  [        R                  " UUR                  S   UR                  S   -  UR                  S   UR                  S   45      nU R                  U5      n[        R                  " X!R                  5      nU$ U R                  U5      nU$ )z
Args:
    inputs (`torch.Tensor` of shape `((batch_size, num_channels, num_patches, d_model))`):
        Input to the normalization layer.
Returns:
    `torch.Tensor` of shape `((batch_size, num_channels, num_patches, d_model))`
rx   r   r   rF   r   )ry   rz   rK   reshapeshaper{   )r$   r+   inputs_reshapeds      r&   r-   PatchTSMixerNormLayer.forward   s     dmm))++#mmLLOfll1o5LLOLLOO #ii8O ]]?LLAF
  YYv&Fr(   )r{   ry   rJ   r7   s   @r&   ru   ru   x   s,    J1 Jell  r(   ru   c                   J   ^  \ rS rSrU 4S jrS\R                  4S jrSrU =r	$ )PatchTSMixerMLP   c                 >  > [         TU ]  5         XR                  -  n[        R                  " X5      U l        [        R                  " UR                  5      U l        [        R                  " XB5      U l	        [        R                  " UR                  5      U l
        g r*   )r   r   expansion_factorr   r    fc1Dropoutdropoutdropout1fc2dropout2)r$   in_featuresout_featuresr;   
num_hiddenr%   s        r&   r   PatchTSMixerMLP.__init__   sd     #:#::
99[5

6>>299Z6

6>>2r(   r+   c                     U R                  [        R                  R                  U R	                  U5      5      5      nU R                  U5      nU R                  U5      nU$ )z
Args:
    inputs (`torch.Tensor` of shape `((batch_size, num_channels, num_patches, d_model))`):
        Input to the MLP layer.
Returns:
    `torch.Tensor` of the same shape as `inputs`
)r   r   
functionalgelur   r   r   )r$   r+   s     r&   r-   PatchTSMixerMLP.forward   sK     r}}11$((62BCD&!v&r(   )r   r   r   r   )
r/   r0   r1   r2   r   rK   rL   r-   r5   r6   r7   s   @r&   r   r      s    3ell  r(   r   c                   V   ^  \ rS rSrSrS\4U 4S jjrS\R                  4S jr	Sr
U =r$ )$PatchTSMixerChannelFeatureMixerBlock   zzThis module mixes the features in the channel dimension.

Args:
    config (`PatchTSMixerConfig`):
        Configuration.
r;   c                   > [         TU ]  5         [        U5      U l        UR                  U l        [        UR                  UR                  US9U l        UR                  (       a$  [        UR                  UR                  S9U l	        g g Nr   r   r;   r   r   )
r   r   ru   r{   
gated_attnr   num_input_channelsmlpr   gating_blockrC   s     r&   r   -PatchTSMixerChannelFeatureMixerBlock.__init__   sv    )&1	 ++"1122
  :11F<U<U!D r(   r+   c                     UnU R                  U5      nUR                  SSSS5      nU R                  (       a  U R                  U5      nU R	                  U5      nUR                  SSSS5      nX-   nU$ )z
Args:
    inputs (`torch.Tensor` of shape `((batch_size, num_channels, num_patches, d_model))`):
        input to the MLP layer
Returns:
    `torch.Tensor` of the same shape as `inputs`
r   r   rF   r   )r{   permuter   r   r   )r$   r+   residualouts       r&   r-   ,PatchTSMixerChannelFeatureMixerBlock.forward   sq     6"1a+??&&v.F&!1a+
r(   r   r   r   r{   rJ   r7   s   @r&   r   r      s*    1  ell  r(   r   modulequerykeyvalueattention_maskscalingr   kwargsc                    Uc  UR                  S5      S-  n[        R                  " XR                  SS5      5      U-  nUb  X-   n[        R
                  R                  USS9n[        R
                  R                  XU R                  S9n[        R                  " X5      n	U	R                  SS5      R                  5       n	X4$ )Nr         rF   r   r   )ptrainingr   )
sizerK   matmulrG   r   r   softmaxr   r   
contiguous)
r   r   r   r   r   r   r   r   attn_weightsattn_outputs
             r&   eager_attention_forwardr      s     **R.D( <<}}Q':;gEL!#4==((2(>L==((6??([L,,|3K''1-88:K$$r(   c                   :  ^  \ rS rSrSr     SS\S\S\S\S\S	\S
\S-  4U 4S jjjr	   SS\
R                  S\
R                  S-  S\
R                  S-  S\S-  S\\   S\\
R                  \
R                  S-  \\
R                     S-  4   4S jjrSrU =r$ )PatchTSMixerAttentioni  z=Multi-headed attention from 'Attention Is All You Need' paperN	embed_dim	num_headsr   
is_decoderbias	is_causalr;   c                   > [         TU ]  5         Xl        X l        X0l        X-  U l        Xpl        U R
                  U-  U R                  :w  a  [        SU R                   SU S35      eU R
                  S-  U l        X@l	        X`l
        [        R                  " XUS9U l        [        R                  " XUS9U l        [        R                  " XUS9U l        [        R                  " XUS9U l        g )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: ).r   )r   )r   r   r   r   r   head_dimr;   rj   r   r   r   r   r    k_projv_projq_projout_proj)	r$   r   r   r   r   r   r   r;   r%   s	           r&   r   PatchTSMixerAttention.__init__  s     	""!.MMI%$..8MdnnM]$YKr3  }}d*$"ii	4@ii	4@ii	4@		)TBr(   hidden_stateskey_value_statesr   output_attentionsr   rX   c                    USLnUR                   SS n/ UQSPU R                  P7nU R                  U5      R                  U5      R	                  SS5      n	U(       a  UOUn
/ U
R                   SS QSPU R                  P7nU R                  U
5      R                  U5      R	                  SS5      nU R                  U
5      R                  U5      R	                  SS5      n[        R                  " U R                  R                  [        5      nU" U U	UUU4U R                  (       d  SOU R                  U R                  US.UD6u  nnUR                  " / UQSP76 R!                  5       nU R#                  U5      nUUS4$ )z#Input shape: Batch x Time x ChannelNr   r   rF           )r   r   r   )r   r   r   viewrG   r   r   r
   get_interfacer;   _attn_implementationr   r   r   r   r   r   r   )r$   r   r   r   r   r   is_cross_attentioninput_shapehidden_shapequery_statescurrent_stateskv_shape
key_statesvalue_statesattention_interfacer   r   s                    r&   r-   PatchTSMixerAttention.forward0  s    .T9 $))#2.88b8$--8 {{=166|DNNqRST-?)]B^))#2.BBDMMB[[055h?II!QO
{{>277AKKAqQ(?(M(MKK,,.E)
 %8
%
  $}}C$,,LL/
%
 
%
!\ "));;;;FFHmmK0L$..r(   )r;   r   r   r   r   r   r   r   r   r   r   r   )r   FTFN)NNF)r/   r0   r1   r2   r3   r4   floatboolr   r   rK   rL   r   r	   tupler-   r5   r6   r7   s   @r&   r   r     s
   G  ,0CC C 	C
 C C C #T)C CD 15.2).0/||0/  ,,-0/ t+	0/
  $;0/ -.0/ 
u||U\\D0%2E2LL	M0/ 0/r(   r   c                   :   ^  \ rS rSrSrS\4U 4S jjrS rSrU =r	$ )PatchMixerBlockic  zhThis module mixes the patch dimension.

Args:
    config (`PatchTSMixerConfig`):
        Configuration.
r;   c                   > [         TU ]  5         [        U5      U l        UR                  U l        UR
                  U l        [        UR                  UR                  US9U l        UR
                  (       a#  [        UR                  UR                  S9U l
        UR                  (       a@  [        UR                  UR                  UR                  US9U l        [        U5      U l        g g )Nr   r   )r   r   r   r;   )r   r   ru   r{   	self_attnr   r   rV   r   r   r   r   r@   self_attn_headsr   self_attn_layer	norm_attnrC   s     r&   r   PatchMixerBlock.__init__k  s    )&1	)) ++"**++
  :6CUCU`f`r`r sD#8 .. 00	$D  36:DN r(   c                    UnU R                  U5      nU R                  (       aI  UR                  u  p4pVUR                  X4-  XV5      nU R	                  USS9u  n  n	UR                  X4XV5      nUR                  SS5      nU R                  U5      nU R                  (       a  U R                  U5      nUR                  SS5      nU R                  (       a  U R                  UW-   5      nX-   n
U
$ )zj
Args:
    hidden_state (`torch.Tensor`): Input tensor.

Returns:
    `torch.Tensor`: Transformed tensor.
F)r   rF   r   )
r{   r   r   r   r   rG   r   r   r   r   )r$   rq   r   
batch_sizen_varsrV   r@   hidden_state_reshapedx_attn_r   s              r&   r-   PatchMixerBlock.forward  s      yy.>>7C7I7I4J$0$8$89Lk$c!//0EY^/_LFAq^^JMF $--a3xx-??,,\:L $--a3>>>>,*?@L%
r(   )r   r   r   r{   r   r   r   
r/   r0   r1   r2   r3   r   r   r-   r5   r6   r7   s   @r&   r   r   c  s    ;1 ;4! !r(   r   c                   V   ^  \ rS rSrSrS\4U 4S jjrS\R                  4S jr	Sr
U =r$ )FeatureMixerBlocki  zrThis module mixes the hidden feature dimension.

Args:
    config (`PatchTSMixerConfig`):
        Configuration.

r;   c                   > [         TU ]  5         [        U5      U l        UR                  U l        [        UR                  UR                  US9U l        UR                  (       a$  [        UR                  UR                  S9U l	        g g r   )
r   r   ru   r{   r   r   r@   r   r   r   rC   s     r&   r   FeatureMixerBlock.__init__  sn    )&1	 ++"
  :6>>\b\j\j kD r(   hiddenc                     UnU R                  U5      nU R                  U5      nU R                  (       a  U R                  U5      nX-   nU$ )
Args:
    hidden (`torch.Tensor` of shape `(batch_size, num_patches, d_model)`):
        Input tensor to the layer.

Returns:
    `torch.Tensor`: Transformed tensor.
)r{   r   r   r   )r$   r   r   r   s       r&   r-   FeatureMixerBlock.forward  sI     6"&!??&&v.F
r(   r   rJ   r7   s   @r&   r   r     s,    l1 l ell  r(   r   c                   V   ^  \ rS rSrSrS\4U 4S jjrS\R                  4S jr	Sr
U =r$ )PatchTSMixerLayeri  z
The `PatchTSMixer` layer that does all three kinds of mixing.

Args:
    config (`PatchTSMixerConfig`):
        Configuration.

r;   c                    > [         TU ]  5         [        US9U l        [	        US9U l        UR                  U l        UR                  S:X  a  [        US9U l        g g )Nr;   mix_channel)	r   r   r   patch_mixerr   feature_mixermoder   channel_feature_mixerrC   s     r&   r   PatchTSMixerLayer.__init__  sR    *&9.f=KK	;;-')MU[)\D& (r(   r   c                     U R                   S:X  a  U R                  U5      nU R                  U5      nU R                  U5      nU$ )r   r   )r   r   r   r   )r$   r   s     r&   r-   PatchTSMixerLayer.forward  sE     99%//7F!!&)##F+r(   )r   r   r   r   rJ   r7   s   @r&   r   r     s,    	]1 	]ell  r(   r   c                   F   ^  \ rS rSrSrS\4U 4S jjrSS\4S jjrSr	U =r
$ )	PatchTSMixerBlocki  z{The main computing framework of the `PatchTSMixer` model.

Args:
    config (`PatchTSMixerConfig`):
        Configuration.
r;   c           	         > [         TU ]  5         UR                  n[        R                  " [        U5       Vs/ s H  n[        US9PM     sn5      U l        g s  snf Nr   )r   r   
num_layersr   
ModuleListranger   mixers)r$   r;   r  r   r%   s       r&   r   PatchTSMixerBlock.__init__  sI    &&
mmuU_O`$aO`!%6f%EO`$ab$as   Aoutput_hidden_statesc                     / nUnU R                    H%  nU" U5      nU(       d  M  UR                  U5        M'     U(       a  XC4$ US4$ )a3  
Args:
    hidden_state (`torch.Tensor`): The input tensor.
    output_hidden_states (`bool`, *optional*, defaults to False.):
        Whether to output the hidden states as well.

Returns:
    `torch.Tensor`: The embedding. `list`: List of all hidden states if `output_hidden_states` is set to
    `True`.
N)r	  append)r$   rq   r  all_hidden_states	embeddingmods         r&   r-   PatchTSMixerBlock.forward  sR      	;;CII##!((3 
  //d?"r(   )r	  F)r/   r0   r1   r2   r3   r   r   r   r-   r5   r6   r7   s   @r&   r  r    s(    c1 c#$ # #r(   r  c                   >   ^  \ rS rSrSrSS\4U 4S jjjrS rSrU =r	$ )PatchTSMixerForPredictionHeadi%  zaPrediction Head for Forecasting

Args:
    config (`PatchTSMixerConfig`):
        Configuration.
r;   c                   > [         TU ]  5         UR                  U l        U R                  b  U R                  R                  5         [        R
                  " UR                  5      U l        Uc>  [        R                  " UR                  UR                  -  UR                  5      U l        O-UR                  UR                  UR                  -  5      U l        [        R                  " SS9U l        g )N	start_dim)r   r   prediction_channel_indicessortr   r   head_dropoutdropout_layerr    rV   r@   prediction_lengthbase_forecast_blockget_parameter_projectionFlattenflatten)r$   r;   distribution_outputr%   s      r&   r   &PatchTSMixerForPredictionHead.__init__-  s    *0*K*K'**6++002ZZ(;(;<&')yy&2D2Dv~~2UX^XpXp'qD$':'S'S""V^^3(D$ zzB/r(   c                 v  ^  T R                  U5      nT R                  U5      nT R                  U5      n[        U[        5      (       a  [	        S U 5       5      nOUR                  SS5      nT R                  b=  [        U[        5      (       a  [	        U 4S jU 5       5      nU$ UST R                  4   nU$ )a:  

Args:
    hidden_features (`torch.Tensor` of shape `(batch_size, num_patch, d_model)` in `flatten` mode
        or `(batch_size, n_vars, num_patch, d_model)` in `common_channel`/`mix_channel` mode.): Input hidden
        features.

Returns:
    `torch.Tensor` of shape `(batch_size, prediction_length, nvars)`.

c              3   D   #    U  H  oR                  S S5      v   M     g7f)r   r  N)rG   ).0zs     r&   	<genexpr>8PatchTSMixerForPredictionHead.forward.<locals>.<genexpr>P  s     C(Q[[R00(s    r   r  c              3   D   >#    U  H  oS TR                   4   v   M     g7f).N)r  )r&  r'  r$   s     r&   r(  r)  V  s!      [RZQ3(G(G#G!HRZs    .)r!  r  r  
isinstancer   rG   r  r$   hidden_featuresforecasts   `  r&   r-   %PatchTSMixerForPredictionHead.forward?  s     ,,7,,_=++O<h&&C(CCH))"b1H**6(E**  [RZ [[  $C)H)H$HIr(   )r  r  r!  r  r*   r   r7   s   @r&   r  r  %  s$    01 0 0$ r(   r  c                   >   ^  \ rS rSrSrSS\4U 4S jjjrS rSrU =r	$ )PatchTSMixerLinearHeadi]  zpLinear head for Classification and Regression.

Args:
    config (`PatchTSMixerConfig`):
        Configuration.
r;   c                 \  > [         TU ]  5         UR                  U l        UR                  U l        UR                  c  UR                  nOSnX l        UcA  [        R                  " UR                  UR                  -  U-  UR                  5      U l        O0UR                  UR                  UR                  -  U-  5      U l        UR                  c  [        R                  " SS9U l        O[        R                  " SS9U l        [        R                  " UR                   5      U l        g )Nr   r  r  )r   r   head_aggregationoutput_rangerV   r"  r   r    r@   r   num_targets
projectionr  r   r!  r   r  r   )r$   r;   r"  
mul_factorr%   s       r&   r   PatchTSMixerLinearHead.__init__e  s     & 7 7"//""*++JJ#6 & ii!:!::ZG""DO
 2JJ!:!::ZGDO ""*::3DL::3DLzz&"5"56r(   c                 0   UR                  SS5      nU R                  S:X  a  US   nOIU R                  S:X  a  UR                  SS9R                  nOU R                  S:X  a  UR	                  SS9nU R
                  (       a  U R                  U5      nU R                  U5      nU R                  U5      nU R                  cS  U R                  bF  [        R                  " U5      U R                  S   U R                  S	   -
  -  U R                  S	   -   nU$ )
a1  
Args:
    hidden_features (`torch.Tensor` of shape `(batch_size x num_patch x d_model)` in `flatten` mode
        or `(batch_size x n_vars x num_patch x d_model)` in `common_channel`/`mix_channel` mode.): Input hidden
        features.

Returns:
    `torch.Tensor` of shape `(batch_size x num_targets)`.
r   r  use_last).r   max_poolr   avg_poolr   r   )rG   r4  maxvaluesrh   r!  r   r7  r"  r5  rK   sigmoid)r$   r-  s     r&   r-   PatchTSMixerLinearHead.forward  s
    *33B;  J.-g6O""j0-11b19@@O""j0-22r2:O<<"ll?;O,,7///:$$,43D3D3Po.$2C2CA2FIZIZ[\I]2]^aeararstauu  r(   )r"  r   r!  r4  r5  r7  r*   r   r7   s   @r&   r1  r1  ]  s$    71 7 78   r(   r1  c                   ^    \ rS rSr% \\S'   SrSrSrSr	\
R                  " 5       S 5       rSrg	)
PatchTSMixerPreTrainedModeli  r;   modelpast_values)timeFc                 <   [        U[        5      (       a<  U R                  R                  S:X  a!  [        R
                  " UR                  SSS9  gg[        U[        R                  [        R                  45      (       a  [        R                  " UR                  5        [        R                  " UR                  5        [        USS5      ba  [        R                  " UR                  5        [        R                  " UR                   5        [        R                  " UR"                  5        gg[        U[$        5      (       aU  [        R                  " UR&                  R                  5        [        R                  " UR&                  R                  5        g[        U[        R(                  5      (       ac  [        R
                  " UR                  SU R                  R*                  S9  UR                  b!  [        R                  " UR                  5        ggg)zInitialize weightsrZ   r   g?)rh   ri   running_meanN)r+  rN   r;   r_   initnormal_rS   r   r|   r?   zeros_r   ones_weightgetattrrH  running_varnum_batches_trackedr9   rB   r    init_std)r$   r   s     r&   _init_weights)PatchTSMixerPreTrainedModel._init_weights  sX    f<=={{33x?V00sD @r~~ >??KK$JJv}}%v~t4@F//0

6--.F667 A  566KK((--.JJv''../		**LLSdkk6J6JK{{&FKK( ' +r(    N)r/   r0   r1   r2   r   __annotations__base_model_prefixmain_input_nameinput_modalitiessupports_gradient_checkpointingrK   no_gradrR  r5   rT  r(   r&   rC  rC    s9     #O &+#
]]_) )r(   rC  c                   :   ^  \ rS rSrSrS\4U 4S jjrS rSrU =r	$ )PatchTSMixerPretrainHeadi  zSPretraining head.

Args:
    config (`PatchTSMixerConfig`):
        Configuration.
r;   c                    > [         TU ]  5         [        R                  " UR                  5      U l        [        R                  " UR                  UR                  5      U l	        g r*   )
r   r   r   r   r  r  r    r@   patch_lengthbase_pt_blockrC   s     r&   r   !PatchTSMixerPretrainHead.__init__  sB    ZZ(;(;<YYv~~v7J7JKr(   c                 J    U R                  U5      nU R                  U5      nU$ )aG  
Args:
    hidden_features (`torch.Tensor` of shape `(batch_size x num_patch x d_model)` in `flatten` mode
        or `(batch_size x n_vars x num_patch x d_model)` in `common_channel`/`mix_channel` mode.): Input hidden
        features.

Returns:
    `torch.Tensor` of shape `(batch_size x n_vars x num_patch x patch_length)`.
)r  r_  r,  s      r&   r-    PatchTSMixerPretrainHead.forward  s)     ,,_=%%o6r(   )r_  r  r   r7   s   @r&   r\  r\    s!    L1 L r(   r\  r+   
mask_ratiounmasked_channel_indiceschannel_consistent_masking
mask_valuec                    US:  d  US:  a  [        SU S35      eU R                  u  pVpxU R                  n	[        USU-
  -  5      n
U(       a*  [        R
                  " USXyS9nUR                  SUS5      nO[        R
                  " XVXyS9n[        R                  " XVXyS9nSUSS2SS2SU
24'   [        R                  " USS9n[        R                  " USS9n[        R                  " USUS	9nUR                  S5      R                  SSSU5      nUb  SUSS2USS2SS24'   U R                  UR                  5       U5      nXS
   4$ )a  random_masking: Mask the input considering the control variables.

Args:
    inputs (`torch.Tensor` of shape `(batch_size, num_channels, sequence_length, num_features)`):
        The input tensor to mask.
    mask_ratio (`float`):
        Masking ratio applied to mask the input data during random pretraining. It is the number between 0 and 1.
    unmasked_channel_indices (list, *optional*):
        Indices of channels that will not be masked.
    channel_consistent_masking (bool, *optional*, defaults to `False`):
        When true, masking will be same across all channels of a timeseries. Otherwise, masking positions will vary
        across channels.
    mask_value (int, *optional*, defaults to 0):
        Define the value of masked patches for pretraining.

Returns:
    `tuple(torch.Tensor)`: inputs_mask, masked input, same shape as input Tensor and mask tensor of shape [bs x c x
    n]
r   r   zMask ratio z has to be between 0 and 1.deviceNr   r   )r   index.r   )rj   r   ri  r4   rK   randrepeatonesargsortgatherrb   masked_fillr   )r+   rc  rd  re  rf  r   num_channelssequence_lengthnum_featuresri  len_keepnoisemaskids_shuffleids_restoreinputs_masks                   r&   random_maskingr{    sA   4 A~q;zl2MNOO>Dll;Jo]]F?a*n56H!

:q/IQa0 

:_T ::jODDAyy --2.K--4K<<"K8D>>"$$Q1l;D+23Q(!Q./$$TYY[*=KV$$r(   num_forecast_mask_patchesc                 P   [        U[        5      (       a  U/nU Vs/ s H  nSPM     nnU R                  u  pgp[        R                  " XgXR
                  S9n
/ nSn[        U5      n[        X5       HG  u  pUS::  d  X:  a  [        SU S35      e[        Xo-  U-  5      nUR                  XU/5        UU-  nMI     [        US S9nX:  a  US   S   Xl-
  -   US   S'   OX:  a  US	   S   X-
  -   US	   S'   SnU H  u  nnnUU-   nSU
UU2S
S
2U* S
24'   UnM     [        R                  " U
R                  S   5      nU
U   n
U
R                  S	5      R                  SSSU	5      n
Ub  SU
S
S
2US
S
2S
S
24'   U R                  U
R                  5       U5      nUU
S   4$ s  snf )ai  Forecast masking that masks the last K patches where K is from the num_forecast_mask_patches.
If num_forecast_mask_patches is a list, samples in the batch will be randomly masked by numbers defined in the list.

Parameters:
    inputs (`torch.Tensor`):
        Input of shape `(bs, num_channels, num_patch, patch_length)`
    num_forecast_mask_patches (`list`):
        Number of patches to be masked at the end of each batch sample. e.g. 4 or [3, 5].
    unmasked_channel_indices (`list`, *optional*):
        Indices of channels that are not masked.
    mask_value (`int`, *optional*, defaults to 0):
        Values in the masked patches will be filled by `mask_value`.

Returns:
    `tuple(torch.Tensor)`: inputs_mask, masked input, same shape as inputs Tensor and Mask tensor of shape `(bs,
    num_channels , num_patch)` or `(bs, tsg1, tsg2, num_channels, num_patch)`
r   rh  r   znum_forecast_mask_patches z6 should be greater than 0 and less than total patches.c                     U S   $ NrF   rT  )xs    r&   <lambda>"forecast_masking.<locals>.<lambda>J  s    !A$r(   )r   rF   r   Nrk  )r+  r4   r   rK   rU   ri  sumziprj   r  sortedrandpermrb   rm  rq  r   )r+   r|  rd  rf  r   forecast_mask_ratiosr   rr  rs  rt  rw  t_listtotal_lengthtotal_ratior^  ratiotemp_lenbatch1	patch_lenbatch2permrz  s                         r&   forecast_maskingr    s   0 +S11%>$?!'@A'@!A'@A>Dll;Jo;;zWDFL*+K"#<S1 ?,\N:pq  z)K78|H56   T F/F ay|z'@Aq	!		"r
1)BCr
1F"(	1h("./VF]A	z{*+ #)
 >>$**Q-(D:D>>"$$Q1l;D+23Q(!Q./$$TYY[*=KV$$O Bs   F#c                   V   ^  \ rS rSrSrS\4U 4S jjrS\R                  4S jr	Sr
U =r$ )PatchTSMixerPatchifyic  z
A class to patchify the time series sequence into different patches

Returns:
    `torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`
r;   c                   > [         TU ]  5         UR                  U l        UR                  U l        UR
                  U l        U R                  U R                  ::  a&  [        SU R                   SU R                   S35      e[        U R                  U R                  5      U R                  -
  U R
                  -  S-   U l        U R                  U R
                  U R                  S-
  -  -   nU R                  U-
  U l	        g )NzSequence length (z+) has to be greater than the patch length ()r   )
r   r   context_lengthrs  r^  patch_striderj   r>  rV   sequence_start)r$   r;   new_sequence_lengthr%   s      r&   r   PatchTSMixerPatchify.__init__k  s    %44"//"//4#4#44#D$8$8#99deievevdwwxy 
   4 4d6G6GH4K\K\\aeararruvv"//$2C2CtGWGWZ[G[2\\"225HHr(   rE  c                 4   UR                   S   nX R                  :w  a  [        SU SU R                   S35      eUSS2U R                  S2SS24   nUR	                  SU R
                  U R                  S9nUR                  SS5      R                  5       nU$ )z
Parameters:
    past_values (`torch.Tensor` of shape `(batch_size, sequence_length, num_channels)`, *required*):
        Input for patchification

Returns:
    `torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`
r  zInput sequence length (z%) doesn't match model configuration (r   N)	dimensionr   stepr3  )	r   rs  rj   r  unfoldr^  r  rG   r   )r$   rE  rs  rH   s       r&   r-   PatchTSMixerPatchify.forward|  s     &++B/222)/)::_`d`t`t_uuwx  Q 3 3 5q89$2C2C$J[J[\!!"b)446r(   )rV   r^  r  rs  r  rJ   r7   s   @r&   r  r  c  s,    I1 I"5<<  r(   r  c                   V   ^  \ rS rSrSrS\4U 4S jjrS\R                  4S jr	Sr
U =r$ )PatchTSMixerMaskingi  ap  
Class to perform random or forecast masking.

Parameters:
    config (`PatchTSMixerConfig`): model config
Returns:
    x_mask (`torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`)
        Masked patched input
    mask (`torch.Tensor` of shape `(batch_size, num_channels, num_patches)`)
        Bool tensor indicating True on masked points
r;   c                 >  > [         TU ]  5         UR                  U l        UR                  U l        UR                  U l        UR
                  U l        UR                  U l        UR                  U l        U R                  b  [        U R                  5      U l        g g r*   )	r   r   random_mask_ratiore  	mask_typer|  rd  rf  r  rC   s     r&   r   PatchTSMixerMasking.__init__  s    !'!9!9*0*K*K')))/)I)I&(.(G(G% ++((4,243P3P,QD) 5r(   rn   c                 d   U R                   S:X  a8  [        UU R                  U R                  U R                  U R
                  S9u  p#OVU R                   S:X  a-  [        UU R                  U R                  U R
                  S9u  p#O[        SU R                    S35      eUR                  5       nX#4$ )a  
Parameters:
    patch_input (`torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`, *required*):
        Patch input

Return:
    masked_input (`torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`)
        Masked patched input
    mask (`torch.Tensor` of shape `(batch_size, num_channels, num_patches)`)
        Bool tensor indicating True on masked points

rZ   )r+   rc  rd  re  rf  r.  )r+   r|  rd  rf  zInvalid mask type .)
r  r{  r  rd  re  rf  r  r|  rj   r   )r$   rn   masked_inputrw  s       r&   r-   PatchTSMixerMasking.forward  s     >>X%!/"11)-)F)F+/+J+J??"L$ ^^z)!1"*.*H*H)-)F)F??	"L$ 1$..1ACDD yy{!!r(   )re  r  rf  r|  r  rd  rJ   r7   s   @r&   r  r    s,    
	R1 	R!"5<< !" !"r(   r  c            	          ^  \ rS rSrSrS\4U 4S jjrS\R                  S\R                  S\	\R                  \R                  \R                  4   4S jr
S	rU =r$ )
PatchTSMixerStdScaleri  z
Standardize features by calculating the mean and scaling along the first dimension, and then normalizes it by
subtracting from the mean and dividing by the standard deviation.
r;   c                   > [         TU ]  5         [        US5      (       a  UR                  OSU l        [        US5      (       a  UR
                  OSU l        [        US5      (       a  UR                  U l        g SU l        g )Nscaling_dimr   keepdimTminimum_scalegh㈵>)r   r   hasattrr  r   r  r  rC   s     r&   r   PatchTSMixerStdScaler.__init__  sd    )0)G)G6%%Q)0)C)Cv~~5<V_5U5UV11[_r(   dataobserved_indicatorrX   c                 r   UR                  U R                  U R                  S9nUR                  S5      nX-  R                  U R                  U R                  S9U-  nX-
  U-  S-  R                  U R                  U R                  S9U-  n[        R
                  " XPR                  -   5      nX-
  U-  XF4$ )  
Parameters:
    data (`torch.Tensor` of shape `(batch_size, sequence_length, num_input_channels)`):
        input for Batch norm calculation
    observed_indicator (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`):
        Calculating the scale on the observed indicator.
Returns:
    tuple of `torch.Tensor` of shapes
        (`(batch_size, sequence_length, num_input_channels)`,`(batch_size, 1, num_input_channels)`,
        `(batch_size, 1, num_input_channels)`)
r        ?rF   )r  r   r  	clamp_minrK   sqrtr  )r$   r  r  denominatorlocvariancescales          r&   r-   PatchTSMixerStdScaler.forward  s     ),,TXXt||,L!++C0(--dhh-MP[[j$661<AA$((TXT`T`Aadoo

8&8&889
e#S//r(   )r   r  r  r/   r0   r1   r2   r3   r   r   rK   rL   r   r-   r5   r6   r7   s   @r&   r  r    sY    
`1 `0LL06;ll0	u||U\\5<<7	80 0r(   r  c            	          ^  \ rS rSrSrS\4U 4S jjrS\R                  S\R                  S\	\R                  \R                  \R                  4   4S jr
S	rU =r$ )
PatchTSMixerMeanScaleri  z~
Computes a scaling factor as the weighted average absolute value along the first dimension, and scales the data
accordingly.
r;   c                 N  > [         TU ]  5         [        US5      (       a  UR                  OSU l        [        US5      (       a  UR
                  OSU l        [        US5      (       a  UR                  OSU l        [        US5      (       a  UR                  U l        g S U l        g )Nr  r   r  Tr  绽|=default_scale)r   r   r  r  r   r  r  r  rC   s     r&   r   PatchTSMixerMeanScaler.__init__  s    )0)G)G6%%Q)0)C)Cv~~5<V_5U5UV11[`5<V_5U5UV11[_r(   r  r  rX   c                    X-  R                  5       R                  U R                  SS9nUR                  U R                  SS9nU[        R                  " USS9-  nU R
                  cL  UR                  SS9n[        R                  " UR                  S5      SS9n[        R                  " Xg-  5      nO#U R
                  [        R                  " U5      -  n[        R                  " US:  XX5      n[        R                  " XPR                  S9nX-  n	U R                  (       d  UR                  U R                  S9nU	[        R                  " U5      U4$ )r  Tr  r   minr   r   )absr  r   rK   clampr  squeeze	ones_likewherer  r  
zeros_like)
r$   r  r  ts_sumnum_observedr  	batch_sumbatch_observationsr  scaled_datas
             r&   r-   PatchTSMixerMeanScaler.forward  s"    +00266txx6N)--dhh-E\q99 %

q
)I!&\-=-=a-@a!H!MM)*HIM ..1GGM L1,eC E'9'9:l||MMdhhM/EE,,U3U::r(   )r  r   r  r  r  r7   s   @r&   r  r    sY    
`1 `&;LL&;6;ll&;	u||U\\5<<7	8&; &;r(   r  c            
          ^  \ rS rSrSrS\4U 4S jjr SS\R                  S\R                  S-  S\	\R                  \R                  \R                  4   4S	 jjr
S
rU =r$ )PatchTSMixerNOPScaleri,  zt
Assigns a scaling factor equal to 1 along the first dimension, and therefore applies no scaling to the input data.
r;   c                    > [         TU ]  5         [        US5      (       a  UR                  OSU l        [        US5      (       a  UR
                  U l        g SU l        g )Nr  r   r  T)r   r   r  r  r   r  rC   s     r&   r   PatchTSMixerNOPScaler.__init__1  sF    )0)G)G6%%Q)0)C)Cv~~r(   Nr  r  rX   c                     [         R                  " USS9R                  U R                  U R                  S9n[         R
                  " USS9R                  U R                  U R                  S9nXU4$ )aP  
Parameters:
    data (`torch.Tensor` of shape `(batch_size, sequence_length, num_input_channels)`):
        input for Batch norm calculation
Returns:
    tuple of `torch.Tensor` of shapes
        (`(batch_size, sequence_length, num_input_channels)`,`(batch_size, 1, num_input_channels)`,
        `(batch_size, 1, num_input_channels)`)
Fr[   r   r  )rK   r  rh   r   r  r  )r$   r  r  r  r  s        r&   r-   PatchTSMixerNOPScaler.forward6  sg     E:??DHHVZVbVb?ct59>>488UYUaUa>b%r(   r  r*   r  r7   s   @r&   r  r  ,  se    N1 N MQ LL 6;llT6I 	u||U\\5<<7	8   r(   r  zS
    Base class for `PatchTSMixerEncoderOutput`, with potential hidden states.
    )custom_introc                   p    \ rS rSr% SrSr\R                  S-  \S'   Sr	\
\R                     S-  \S'   Srg)PatchTSMixerEncoderOutputiG  a  
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_channels, num_patches, d_model)`):
    Hidden-state at the output of the last layer of the model.
hidden_states (`tuple(torch.FloatTensor)`, *optional*):
    Hidden-states of the model at the output of each layer.
Nlast_hidden_stater   rT  )r/   r0   r1   r2   r3   r  rK   FloatTensorrU  r   r   r5   rT  r(   r&   r  r  G  s9     37u((4/659M5**+d29r(   r  c                      ^  \ rS rSrSrS\4U 4S jjr\  SS\R                  S\
S-  S\
S-  S	\\-  4S
 jj5       rSrU =r$ )PatchTSMixerEncoderiY  z
Encoder for PatchTSMixer which inputs patched time-series and outputs patched embeddings.

Args:
    config (`PatchTSMixerConfig`):
        Configuration.
r;   c                 0  > [         TU ]  U5        UR                  U l        [        R                  " UR
                  UR                  5      U l        UR                  (       a  [        US9U l
        OS U l
        [        US9U l        U R                  5         g r  )r   r   return_dictr   r    r^  r@   patcherrQ   rN   positional_encoderr  mlp_mixer_encoder	post_initrC   s     r&   r   PatchTSMixerEncoder.__init__b  sq     !--yy!4!4fnnE))&DF&SD#&*D#!2&!A 	r(   NrE  r  r  rX   c                     Ub  UOU R                   nU R                  U5      nU R                  b  U R                  U5      nU R                  XRS9u  pgU(       d  [	        S UU4 5       5      $ [        XgS9$ )a  
past_values (`torch.FloatTensor` of shape `(batch_size, seq_length, num_input_channels)`):
    Context values of the time series. For a pretraining task, this denotes the input time series to
    predict the masked portion. For a forecasting task, this denotes the history/past time series values.
    Similarly, for classification or regression tasks, it denotes the appropriate context values of the
    time series.

    For univariate time series, `num_input_channels` dimension should be 1. For multivariate time series,
    it is greater than 1.

Returns:
    `torch.FloatTensor` of shape `(batch_size, n_vars, num_patches, d_model)`
)r  c              3   &   #    U  H  nUv   M	     g 7fr*   rT  r&  vs     r&   r(  .PatchTSMixerEncoder.forward.<locals>.<genexpr>  s      A    )r  r   )r  r  r  r  r   r  )r$   rE  r  r  r   patchesr  r   s           r&   r-   PatchTSMixerEncoder.forwardq  s    , &1%<k$BRBR ,,{+ "".--g6G+/+A+A'+A+u(  &!   );Ljjr(   )r  r  r  r  )FN)r/   r0   r1   r2   r3   r   r   r   rK   rL   r   r   r  r-   r5   r6   r7   s   @r&   r  r  Y  so    1   -2#'	)k\\)k #Tk)k D[	)k 
*	*)k )kr(   r  zG
    Base class for model's outputs, with potential hidden states.
    c                      \ rS rSr% SrSr\R                  S-  \S'   Sr	\
\R                     S-  \S'   Sr\R                  S-  \S'   Sr\R                  S-  \S'   Sr\R                  S-  \S'   Sr\R                  S-  \S	'   S
rg)PatchTSMixerModelOutputi  a  
last_hidden_state (`torch.FloatTensor`  of shape `(batch_size, num_channels, num_patches, d_model)`):
    Hidden-state at the output of the last layer of the model.
hidden_states (`tuple(torch.FloatTensor)`, *optional*):
    Hidden-states of the model at the output of each layer.
patch_input (`torch.FloatTensor` of shape `(batch_size, num_channels, num_patches, patch_length)`):
    Patched input data to the model.
mask (`torch.FloatTensor` of shape `(batch_size, num_channels, num_patches)`, *optional*):
    Bool Tensor indicating True in masked patches and False otherwise.
loc (`torch.FloatTensor` of shape `(batch_size, 1, num_channels)`, *optional*):
    Gives the mean of the context window per channel. Used for revin denorm outside the model, if revin
    enabled.
scale (`torch.FloatTensor` of shape `(batch_size, 1, num_channels)`, *optional*):
    Gives the std dev of the context window per channel. Used for revin denorm outside the model, if revin
    enabled.
Nr  r   rn   rw  r  r  rT  )r/   r0   r1   r2   r3   r  rK   r  rU  r   r   rn   rw  r  r  r5   rT  r(   r&   r  r    s    " 37u((4/659M5**+d29,0K""T)0%)D%

d
")$(C		T	!(&*E5t#*r(   r  z=
    The PatchTSMixer Model for time-series forecasting.
    c                      ^  \ rS rSrSS\S\4U 4S jjjr\   SS\R                  S\R                  S-  S\S-  S	\S-  S
\
4
S jj5       rSrU =r$ )PatchTSMixerModeli  r;   
mask_inputc                   > [         TU ]  U5        UR                  U l        [        U5      U l        [        U5      U l        USL a  [        U5      U l        OSU l        UR                  S:X  a  [        U5      U l        O@UR                  S:X  d  UR                  SL a  [        U5      U l        O[        U5      U l        U R                  5         g)z}
mask_input (bool, *optional*, defaults to `False`):
    Whether to mask the input using the [`PatchTSMixerMasking`] module.
TNrh   ri   )r   r   r  r  encoderr  patchingr  maskingr   r  scalerr  r  r  )r$   r;   r  r%   s      r&   r   PatchTSMixerModel.__init__  s    
 	 !--*62,V4.v6DLDL>>V#08DK^^u$$(>/7DK/7DK 	r(   NrE  observed_maskr  r  rX   c           	         Ub  UOU R                   nSnUc  [        R                  " U5      nU R                  X5      u  pxn	U R	                  U5      n
U
nU R
                  b  U R                  U
5      u  pU R                  UUUS9n[        U[        5      (       a  [        U6 nU(       d,  [        S UR                  UR                  U
UUU	4 5       5      $ [        UR                  UR                  U
UUU	S9$ )a  
past_values (`torch.FloatTensor` of shape `(batch_size, seq_length, num_input_channels)`):
    Context values of the time series. For a pretraining task, this denotes the input time series to predict
    the masked portion. For a forecasting task, this denotes the history/past time series values. Similarly,
    for classification or regression tasks, it denotes the appropriate context values of the time series.

    For univariate time series, `num_input_channels` dimension should be 1. For multivariate time series, it is
    greater than 1.
observed_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_input_channels)`, *optional*):
    Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected
    in `[0, 1]`:
    - 1 for values that are **observed**,
    - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).
Nr  r  c              3   &   #    U  H  nUv   M	     g 7fr*   rT  r  s     r&   r(  ,PatchTSMixerModel.forward.<locals>.<genexpr>        
A r  )r  r   rn   rw  r  r  )r  rK   r  r  r  r  r  r+  r   r  r  r   r  )r$   rE  r  r  r  r   rw  scaled_past_valuesr  r  	patched_x	enc_inputencoder_outputs                r&   r-   PatchTSMixerModel.forward  s   . &1%<k$BRBR !OOK8M)-[)P&MM"45		<<#"ll95OI !5# & 
 ne,,6GN 
 #44"00
 
 
 ',>>(66!
 	
r(   )r  r  r  r  r  r  )NFN)r/   r0   r1   r2   r   r   r   r   rK   rL   r  r-   r5   r6   r7   s   @r&   r  r    s    1 t  4  .2,1#'B
\\B
 ||d*B
 #Tk	B

 D[B
 
!B
 B
r(   r  z>
    Output type of [`PatchTSMixerForPreTrainingOutput`].
    c                       \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Sr
\R                  S-  \S'   Sr\\R                     S-  \S'   Srg)	 PatchTSMixerForPreTrainingOutputi$  a  
loss (*optional*, returned when `y` is provided, `torch.FloatTensor` of shape `()`):
    Total loss
prediction_outputs (`torch.FloatTensor` of shape `(batch_size, num_input_channels, num_patches, patch_length)`):
    Prediction output from the pretrain head.
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_input_channels, num_patches, d_model)`):
    Backbone embeddings before passing through the head.
hidden_states (`tuple(torch.FloatTensor)`, *optional*):
    Hidden-states of the model at the output of each layer.
Nlossprediction_outputsr  r   rT  r/   r0   r1   r2   r3   r
  rK   r  rU  r  r  r   r   r5   rT  r(   r&   r	  r	  $  d    	 &*D%

d
")37))D0726u((4/659M5**+d29r(   r	  z.
    `PatchTSMixer` for mask pretraining.
    c                      ^  \ rS rSrS\4U 4S jjr\    SS\R                  S\R                  S-  S\	S-  S\	S	\	S-  S
\
4S jj5       rSrU =r$ )PatchTSMixerForPretrainingi<  r;   c                    > [         TU ]  U5        [        USS9U l        [	        US9U l        UR                  U l        UR                  U l        U R                  5         g )NT)r  r   )	r   r   r  rD  r\  headmasked_lossr  r  rC   s     r&   r   #PatchTSMixerForPretraining.__init__B  sR     &v$?
,F;	!--!-- 	r(   NrE  r  r  return_lossr  rX   c                    Ub  UOU R                   nU R                  SL a  [        R                  R	                  SS9nO[        R                  R	                  SS9nU R                  UUUUS9n[        U[        5      (       a  [        U6 nU R                  UR                  5      n	USL a  U" XR                  5      n
OSn
U R                  SL aK  U
bH  U
R                  SS9UR                  -  R                  5       UR                  R                  5       S	-   -  n
U(       d*  [        S
 U
U	UR                  UR                  4 5       5      $ [!        U
U	UR                  UR                  S9$ )a  
past_values (`torch.FloatTensor` of shape `(batch_size, seq_length, num_input_channels)`):
    Context values of the time series. For a pretraining task, this denotes the input time series to predict
    the masked portion. For a forecasting task, this denotes the history/past time series values. Similarly,
    for classification or regression tasks, it denotes the appropriate context values of the time series.

    For univariate time series, `num_input_channels` dimension should be 1. For multivariate time series, it is
    greater than 1.
observed_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_input_channels)`, *optional*):
    Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected
    in `[0, 1]`:
    - 1 for values that are **observed**,
    - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).
return_loss (`bool`,  *optional*):
    Whether to return the loss in the `forward` call.
NTnone	reductionrh   r  r  r  r   r   r  c              3   &   #    U  H  nUv   M	     g 7fr*   rT  r  s     r&   r(  5PatchTSMixerForPretraining.forward.<locals>.<genexpr>        A r  r
  r  r  r   )r  r  rK   r   MSELossrD  r+  r   r  r  r  rn   rh   rw  r  r   r	  )r$   rE  r  r  r  r  r   r
  model_outputx_hatloss_vals              r&   r-   "PatchTSMixerForPretraining.forwardL  sp   4 &1%<k$BRBRt#88##f#5D88##f#5D zz'!5#	 " 
 lE**2LAL		,889$E#;#;<HH t#(< "-0A0AAFFHLL]L]LaLaLcfkLklH   22 ..	   0$*<<&44	
 	
r(   )r  r  rD  r  NFTN)r/   r0   r1   r2   r   r   r   rK   rL   r   r	  r-   r5   r6   r7   s   @r&   r  r  <  s    1   .2,1 #'E
\\E
 ||d*E
 #Tk	E

 E
 D[E
 
*E
 E
r(   r  z=
    Output type of [`PatchTSMixerForPredictionOutput`].
    c                      \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Sr
\R                  S-  \S'   Sr\\R                     S-  \S'   Sr\R                  S-  \S'   Sr\R                  S-  \S	'   S
rg)PatchTSMixerForPredictionOutputi  a  
loss (*optional*, returned when `y` is provided, `torch.FloatTensor` of shape `()`):
    Total loss.
prediction_outputs (`torch.FloatTensor` of shape `(batch_size, prediction_length, num_input_channels)`):
    Prediction output from the forecast head.
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_input_channels, num_patches, d_model)`):
    Backbone embeddings before passing through the head.
hidden_states (`tuple(torch.FloatTensor)`, *optional*):
    Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
loc (`torch.FloatTensor`, *optional* of shape `(batch_size, 1, num_input_channels)`):
    Input mean
scale (`torch.FloatTensor`, *optional* of shape `(batch_size, 1, num_input_channels)`):
    Input std dev
Nr
  r  r  r   r  r  rT  )r/   r0   r1   r2   r3   r
  rK   r  rU  r  r  r   r   r  r  r5   rT  r(   r&   r%  r%    s     &*D%

d
")37))D0726u((4/659M5**+d29$(C		T	!(&*E5t#*r(   r%  z
    Base class for time series model's predictions outputs that contains the sampled values from the chosen
    distribution.
    c                   B    \ rS rSr% SrSr\R                  S-  \S'   Sr	g)"SamplePatchTSMixerPredictionOutputi  
sequences (`torch.FloatTensor` of shape `(batch_size, num_samples, prediction_length, number_channels)`):
    Sampled values from the chosen distribution.
N	sequencesrT  
r/   r0   r1   r2   r3   r)  rK   r  rU  r5   rT  r(   r&   r'  r'        
 +/Iu  4'.r(   r'  c                   B    \ rS rSr% SrSr\R                  S-  \S'   Sr	g)"SamplePatchTSMixerRegressionOutputi  r(  Nr)  rT  r*  rT  r(   r&   r-  r-    r+  r(   r-  inputtargetrX   c                 &    U R                  U5      * $ )z[
Computes the negative log likelihood loss from input distribution with respect to target.
)log_prob)r.  r/  s     r&   nllr2    s     NN6"""r(   input_tensorweightsc                 R   Ub  [         R                  " US:g  X-  [         R                  " U 5      5      n[         R                  " U(       a  UR	                  US9OUR	                  5       SS9nU(       a  UR	                  US9U-  $ UR	                  5       U-  $ U R                  US9$ )a:  
Computes the weighted average of a given tensor across a given `dim`, masking values associated with weight zero,
meaning instead of `nan * 0 = nan` you will get `0 * 0 = 0`.

Args:
    input_tensor (`torch.FloatTensor`):
        Input tensor, of which the average must be computed.
    weights (`torch.FloatTensor`, *optional*):
        Weights tensor, of the same shape as `input_tensor`.
    dim (`int`, *optional*):
        The dim along which to average `input_tensor`.

Returns:
    `torch.FloatTensor`: The tensor with values averaged along the specified `dim`.
r   r   r  r  )rK   r  r  r  r  rh   )r3  r4  r   weighted_tensorsum_weightss        r&   weighted_averager8    s      ++glL4JEL\L\]iLjkkk#'++#+"67;;=VYZ03###,R]]]9L9L9NR]]]  S ))r(   c                   8  ^  \ rS rSrSrS\4U 4S jjr\     SS\R                  S\R                  S-  S\R                  S-  S	\
S-  S
\
S\
S-  S\4S jj5       r\R                  " 5        SS\R                  S\R                  S-  S\4S jj5       rSrU =r$ )PatchTSMixerForPredictioni  z
`PatchTSMixer` for forecasting application.

Args:
    config (`PatchTSMixerConfig`):
        Configuration.

Returns:
    `None`.
r;   c                   > [         TU ]  U5        UR                  U l        UR                  U l        UR                  U l        UR
                  U l        UR                  S:X  a  S U l        O`UR                  n[        [        [        S.nUR                  UR                  5      nUb  U" US9U l        O[        SUR                   35      e[        U5      U l        [        UU R                  S9U l        U R#                  5         g )Nmse	student_tnormalnegative_binomialr   Unknown distribution output r;   r"  )r   r   r
  r  r  num_parallel_samplesr"  r  r   r   r   getrj   r  rD  r  r  r  )r$   r;   r   distribution_output_mapoutput_classr%   s        r&   r   "PatchTSMixerForPrediction.__init__   s     KK	!--*0*K*K'$*$?$?!;;%'+D$**C+&%;'#
 366v7Q7QRL'+7C+@( #?@Z@Z?[!\]]&v.
1 $ 8 8
	 	r(   NrE  r  future_valuesr  r  r  rX   c           	         U R                   S:X  a  [        R                  " SS9nO"U R                   S:X  a  [        nO[	        S5      eUb  UOU R
                  nU R                  UUUUS9n	[        U	[        5      (       a  [        U	6 n	U R                  U	R                  5      n
SnU R                  b  U R                  (       ay  U R                  R                  U
U	R                  SU R                  4   U	R                   SU R                  4   S	9nUb(  US
L a#  U" UUSU R                  4   5      n[#        U5      nOXR                   SU R                  4   -  U	R                  SU R                  4   -   n
Ub  US
L a  U" XSU R                  4   5      nOU R                  (       aJ  U R                  R                  XR                  U	R                   S	9nUb  US
L a  U" X5      n[#        U5      nO+XR                   -  U	R                  -   n
Ub  US
L a  U" X5      nU R                  b7  U	R                  SU R                  4   nU	R                   SU R                  4   nOU	R                  nU	R                   nU(       d,  [        S UU
U	R                  U	R$                  UU4 5       5      $ ['        UU
U	R                  U	R$                  UUS9$ )a  
past_values (`torch.FloatTensor` of shape `(batch_size, seq_length, num_input_channels)`):
    Context values of the time series. For a pretraining task, this denotes the input time series to predict
    the masked portion. For a forecasting task, this denotes the history/past time series values. Similarly,
    for classification or regression tasks, it denotes the appropriate context values of the time series.

    For univariate time series, `num_input_channels` dimension should be 1. For multivariate time series, it is
    greater than 1.
observed_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_input_channels)`, *optional*):
    Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected
    in `[0, 1]`:
    - 1 for values that are **observed**,
    - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).
future_values (`torch.FloatTensor` of shape `(batch_size, target_len, num_input_channels)` for forecasting,:
    `(batch_size, num_targets)` for regression, or `(batch_size,)` for classification, *optional*):
    Target values of the time series, that serve as labels for the model. The `future_values` is what the
    Transformer needs during training to learn to output, given the `past_values`. Note that, this is NOT
    required for a pretraining task.

    For a forecasting task, the shape is be `(batch_size, target_len, num_input_channels)`. Even if we want
    to forecast only specific channels by setting the indices in `prediction_channel_indices` parameter,
    pass the target data with all channels, as channel Filtering for both prediction and target will be
    manually applied before the loss computation.
return_loss (`bool`,  *optional*):
    Whether to return the loss in the `forward` call.
r<  rh   r  r2  2Invalid loss function: Allowed values: mse and nllNr  .r  r  Tc              3   &   #    U  H  nUv   M	     g 7fr*   rT  r  s     r&   r(  4PatchTSMixerForPrediction.forward.<locals>.<genexpr>  r  r  )r
  r  r  r   r  r  )r
  r   r  r2  rj   r  rD  r+  r   r  r  r  r  r"  distributionr  r  r8  r   r%  )r$   rE  r  rH  r  r  r  r   r
  r  y_hatr!  rN  r  r  s                  r&   r-   !PatchTSMixerForPrediction.forward  s   J 99::/DYY%DQRR%0%<k$BRBR zz'!5#	 " 
 lE**2LAL 		,889**6''#77DD$((d.M.M)MN&,,S$2Q2Q-QR  E  
 !,1D#$%c4+J+J&JK H
  09H ..sD4S4S/STT"&&sD,K,K'KLM  !,1D#Ed>]>]9]+^_H''#77DD//|7I7I  E   !,1D#L@H/9H 2 22\5E5EE ,1D#E9H**6""3(G(G#GHC &&sD,K,K'KLE""C &&E 
  22 ..
 
 
 /$*<<&44
 	
r(   c                 4   U R                   nU " USUSS9nU R                  R                  UR                  UR                  UR
                  S9n[        U5       Vs/ s H  oeR                  5       PM     nn[        R                  " USS9n[        US9$ s  snf )aX  
Generate sequences of sample predictions from a model with a probability distribution head.

Args:
    past_values (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_input_channels)`):
        Past values of the time series that serves as context in order to predict the future.

    observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`, *optional*):
        Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected
        in `[0, 1]`:

        - 1 for values that are **observed**,
        - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).

Return:
    [`SamplePatchTSMixerPredictionOutput`] where the outputs `sequences` tensor will have shape `(batch_size,
    number of samples, prediction_length, num_input_channels)`.
NF)rE  rH  r  r  rK  r   r   r)  )rC  r"  rN  r  r  r  r  samplerK   stackr'  )r$   rE  r  rC  outputsrN  r   sampless           r&   generate"PatchTSMixerForPrediction.generate  s    2  $88 #'!&	
 //<<&&GKKw}} = 

 388L2MN2MQ&&(2MN ++g1-1GDD	 Os   B)r"  r  r
  rD  rC  r  r  )NNFTNr*   )r/   r0   r1   r2   r3   r   r   r   rK   rL   r   r%  r-   rZ  r'  rW  r5   r6   r7   s   @r&   r:  r:    s    	1 >  .2-1,1 #'x
\\x
 ||d*x
 ||d*	x

 #Tkx
 x
 D[x
 
)x
 x
t ]]_ .2-E\\-E ||d*-E 
,	-E -Er(   r:  zK
    Output type of [`PatchTSMixerForTimeSeriesClassificationOutput`].
    c                       \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Sr
\R                  S-  \S'   Sr\\R                     S-  \S'   Srg)	-PatchTSMixerForTimeSeriesClassificationOutputi  a,  
loss (*optional*, returned when `y` is provided, `torch.FloatTensor` of shape `()`):
    Total loss.
prediction_outputs (`torch.FloatTensor` of shape `(batch_size, num_labels)`):
    Prediction output from the classification head.
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_input_channels, num_patches, d_model)`):
    Backbone embeddings before passing through the head.
hidden_states (`tuple(torch.FloatTensor)`, *optional*):
    Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
Nr
  r  r  r   rT  r  rT  r(   r&   rZ  rZ    r  r(   rZ  c                      ^  \ rS rSrSrS\4U 4S jjr\    SS\R                  S\R                  S-  S\
S-  S	\
S
\
S-  S\4S jj5       rSrU =r$ )'PatchTSMixerForTimeSeriesClassificationi  z
`PatchTSMixer` for classification application.

Args:
    config (`PatchTSMixerConfig`):
        Configuration.

Returns:
    `None`.
r;   c                   > [         TU ]  U5        [        U5      U l        [	        US9U l        UR                  U l        UR                  S;   a$  [        UR                  UR                  S9U l        OS U l        U R                  5         g )Nr   ri   rh   Tr@   rV   )r   r   r  rD  r1  r  r  r   InjectScalerStatistics4Dr@   rV   inject_scaler  rC   s     r&   r   0PatchTSMixerForTimeSeriesClassification.__init__  sv     &v.
*
	 "-->>22 8]c]o]o pD $D 	r(   NrE  target_valuesr  r  r  rX   c                 <   [         R                  R                  5       nUb  UOU R                  nU R	                  UUUS9n[        U[        5      (       a  [        U6 nU R                  b4  U R                  UR                  UR                  UR                  S9Ul	        U R                  UR                  5      n	Ub  USL a	  U" X5      n
OSn
U(       d*  [        S U
U	UR                  UR                  4 5       5      $ [        U
U	UR                  UR                  S9$ )aH  
past_values (`torch.FloatTensor` of shape `(batch_size, seq_length, num_input_channels)`):
    Context values of the time series. For a pretraining task, this denotes the input time series to predict
    the masked portion. For a forecasting task, this denotes the history/past time series values. Similarly,
    for classification or regression tasks, it denotes the appropriate context values of the time series.

    For univariate time series, `num_input_channels` dimension should be 1. For multivariate time series, it is
    greater than 1.
target_values (`torch.FloatTensor` of shape `(batch_size, target_len, num_input_channels)` for forecasting,
    `(batch_size, num_targets)` for regression, or `(batch_size,)` for classification, *optional*):
    Target
    values of the time series, that serve as labels for the model. The `target_values` is what the
    Transformer needs during training to learn to output, given the `past_values`. Note that, this is NOT
    required for a pretraining task.

    For a forecasting task, the shape is be `(batch_size, target_len, num_input_channels)`. Even if we want
    to forecast only specific channels by setting the indices in `prediction_channel_indices` parameter,
    pass the target data with all channels, as channel Filtering for both prediction and target will be
    manually applied before the loss computation.

    For a classification task, it has a shape of `(batch_size,)`.

    For a regression task, it has a shape of `(batch_size, num_targets)`.
return_loss (`bool`, *optional*):
    Whether to return the loss in the `forward` call.
Nr  rK  Tc              3   &   #    U  H  nUv   M	     g 7fr*   rT  r  s     r&   r(  BPatchTSMixerForTimeSeriesClassification.forward.<locals>.<genexpr>?  r  r  r  )rK   r   CrossEntropyLossr  rD  r+  r   r  ra  r  r  r  r  r   rZ  )r$   rE  rc  r  r  r  r   r
  r  rO  r!  s              r&   r-   /PatchTSMixerForTimeSeriesClassification.forward  s/   J xx((*%0%<k$BRBRzz!5# " 

 lE**2LAL(-1->->.. $$"(( .? .L* 		,889$)<E1HH   22 ..	   =$*<<&44	
 	
r(   )r  ra  rD  r  r#  )r/   r0   r1   r2   r3   r   r   r   rK   rL   r   rZ  r-   r5   r6   r7   s   @r&   r\  r\    s    	1    .2,1 #'N
\\N
 ||d*N
 #Tk	N

 N
 D[N
 
7N
 N
r(   r\  z=
    Output type of [`PatchTSMixerForRegressionOutput`].
    c                       \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Sr
\R                  S-  \S'   Sr\\R                     S-  \S'   Srg)	PatchTSMixerForRegressionOutputiQ  a)  
loss (*optional*, returned when `y` is provided, `torch.FloatTensor` of shape `()`):
    Total loss.
regression_outputs (`torch.FloatTensor` of shape `(batch_size, num_targets)`):
    Prediction output from the regression head.
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_input_channels, num_patches, d_model)`):
    Backbone embeddings before passing through the head.
hidden_states (`tuple(torch.FloatTensor)`, *optional*):
    Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
Nr
  regression_outputsr  r   rT  )r/   r0   r1   r2   r3   r
  rK   r  rU  rk  r  r   r   r5   rT  r(   r&   rj  rj  Q  r  r(   rj  c                      ^  \ rS rSrSS\S\S\4U 4S jjjrS\R                  S\R                  S\R                  4S	 jrS
r	U =r
$ )r`  ii  r@   rV   	expansionc                 (  > [         TU ]  5         [        R                  " US-   X1-  5      U l        [        R                  " X1-  U5      U l        [        R                  " SSU-  5      U l        [        R                  " SU-  S5      U l        X l        g r  )	r   r   r   r    inverse_trans_expansioninverse_trans_compressionmap_scale_expansionmap_scale_compressionrV   )r$   r@   rV   rm  r%   s       r&   r   !InjectScalerStatistics4D.__init__j  sr    ')yy1i>Q'R$)+93F)P&#%99QI#> %'YYq9}a%@"&r(   r+   r  r  c                    UR                  SS5      nUR                  S5      nUR                  SSU R                  S5      nUR                  SS5      nUR                  S5      nUR                  SSU R                  S5      n[        R
                  " XE/SS9nU R                  U5      nU R                  U5      n[        R
                  " X/SS9nU R                  U5      nU R                  U5      nU$ )aQ  
Args:
    inputs (`torch.Tensor` of shape `(batch_size, num_input_channels, num_patch, d_model)`)
    loc (`torch.Tensor` of shape `(batch_size, 1, num_input_channels)`)
    scale (`torch.Tensor` of shape `(batch_size, 1, num_input_channels)`)
Returns:
    `torch.Tensor` of shape `(batch_size, num_input_channels, num_patch, d_model)`
r   r  r   r   )
rG   rb   rm  rV   rK   catrq  rr  ro  rp  )r$   r+   r  r  rh   stdevconcat_statss          r&   r-    InjectScalerStatistics4D.forwards  s     }}R$~~b!{{1a!1!115B'#Q4#3#3Q7yy$B7//=11,?F1r:--f5//7r(   )rp  ro  rr  rq  rV   )rF   )r/   r0   r1   r2   r4   r   rK   rL   r-   r5   r6   r7   s   @r&   r`  r`  i  sM    ' '# '# ' 'ell  ell  r(   r`  z4
    `PatchTSMixer` for regression application.
    c                      ^  \ rS rSrS\4U 4S jjr\    SS\R                  S\R                  S-  S\	S-  S\	S	\	S-  S
\
4S jj5       r\R                  " 5       S\R                  S
\4S j5       rSrU =r$ )PatchTSMixerForRegressioni  r;   c                   > [         TU ]  U5        [        U5      U l        UR                  U l        UR
                  U l        UR                  U l        UR                  U l        UR                  S:X  a  S U l        O^[        [        [        S.nUR                  UR
                  5      nUb  U" UR                  S9U l        O[        SUR
                   35      eUR                  S;   a$  [        UR                   UR"                  S9U l        OS U l        ['        UU R
                  S9U l        U R+                  5         g )Nr<  r=  r   rA  r^  r_  rB  )r   r   r  rD  r
  r"  r  rC  r   r   r   rD  r6  rj   r   r`  r@   rV   ra  r1  r  r  )r$   r;   rE  rF  r%   s       r&   r   "PatchTSMixerForRegression.__init__  s    &v.
KK	#)#=#= !--$*$?$?!;;%'+D$ ,&%;'#
 366v7Q7QRL'+7F<N<N+O( #?@Z@Z?[!\]]>>22 8]c]o]o pD $D* $ 8 8
	 	r(   NrE  rc  r  r  r  rX   c                   ^  T R                   S:X  a  [        R                  " SS9nO"T R                   S:X  a  [        nO[	        S5      eUb  UOT R
                  nT R                  UUUS9n[        U[        5      (       a  [        U6 nT R                  b4  T R                  UR                  UR                  UR                  S9Ul        T R                  UR                  5      n	Ub  US	L a  T R                  (       a}  T R                  S
:X  a)  [         R"                  " US:  5      (       a  [%        S5      eT R                  R'                  U	5      n
[        U 4S jU	 5       5      n	U" X5      n[)        U5      nOU" X5      nOSnU(       d*  [        S UU	UR                  UR*                  4 5       5      $ [-        UU	UR                  UR*                  S9$ )aD  
past_values (`torch.FloatTensor` of shape `(batch_size, seq_length, num_input_channels)`):
    Context values of the time series. For a pretraining task, this denotes the input time series to predict
    the masked portion. For a forecasting task, this denotes the history/past time series values. Similarly,
    for classification or regression tasks, it denotes the appropriate context values of the time series.

    For univariate time series, `num_input_channels` dimension should be 1. For multivariate time series, it is
    greater than 1.
target_values (`torch.FloatTensor` of shape `(batch_size, target_len, num_input_channels)` for forecasting,
    `(batch_size, num_targets)` for regression, or `(batch_size,)` for classification, *optional*):
    Target values of the time series, that serve as labels for the model. The `target_values` is what the
    Transformer needs during training to learn to output, given the `past_values`. Note that, this is NOT
    required for a pretraining task.

    For a forecasting task, the shape is be `(batch_size, target_len, num_input_channels)`. Even if we want
    to forecast only specific channels by setting the indices in `prediction_channel_indices` parameter,
    pass the target data with all channels, as channel Filtering for both prediction and target will be
    manually applied before the loss computation.

    For a classification task, it has a shape of `(batch_size,)`.

    For a regression task, it has a shape of `(batch_size, num_targets)`.
return_loss (`bool`, *optional*):
    Whether to return the loss in the `forward` call.
r<  rh   r  r2  rJ  Nr  rK  Tr@  r   zDtarget_values cannot be negative for negative_binomial distribution.c              3   n   >#    U  H*  oR                  S TR                  R                  5      v   M,     g7f)r   N)r   r;   r6  )r&  itemr$   s     r&   r(  4PatchTSMixerForRegression.forward.<locals>.<genexpr>   s)     WQViiDKK,C,CDDQVs   25c              3   &   #    U  H  nUv   M	     g 7fr*   rT  r  s     r&   r(  r  
  r  r  )r
  rk  r  r   )r
  r   r  r2  rj   r  rD  r+  r   r  ra  r  r  r  r  r"  rK   any	ExceptionrN  r8  r   rj  )r$   rE  rc  r  r  r  r   r
  r  rO  rN  r!  s   `           r&   r-   !PatchTSMixerForRegression.forward  s   H 99::/DYY%DQRR%0%<k$BRBRzz!5# " 

 lE**2LAL(-1->->.. $$"(( .? .L* 		,889$)<''++/BBuyyQ^abQbGcGc#$jkk#77DDUKWQVWW<+H55H   22 ..	   /$*<<&44	
 	
r(   c                 R   U R                   nU " USSS9nU R                  R                  UR                  5      n[	        U5       Vs/ s H  oTR                  5       PM     nn[        R                  " USS9R                  SX R                  R                  5      n[        US9$ s  snf )a  
Generate sequences of sample predictions from a model with a probability distribution head.

Args:
    past_values (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_input_channels)`):
        Past values of the time series that serves as context in order to predict the target values.

Return:
    [`SamplePatchTSMixerRegressionOutput`] where the outputs `sequences` tensor will have shape `(batch_size,
    number of samples, num_targets)`.
NF)rE  rc  r  r   r   r   rR  )rC  r"  rN  rk  r  rS  rK   rT  r   r;   r6  r-  )r$   rE  rC  rU  rN  r   rV  s          r&   rW  "PatchTSMixerForRegression.generate  s    "  $88 #!&
 //<<W=W=WX ,11E+F
+Fa!+F 	 

 ++g1-2227K[[MdMde1GDD
s   B$)r"  r  ra  r
  rD  rC  r  r#  )r/   r0   r1   r2   r   r   r   rK   rL   r   rj  r-   rZ  r-  rW  r5   r6   r7   s   @r&   rz  rz    s    $1 $L  .2,1 #'[
\\[
 ||d*[
 #Tk	[

 [
 D[[
 
)[
 [
z ]]_#E\\#E 
,#E #Er(   rz  )rC  r  r  r:  r\  rz  )Nr   )NFr   )Nr   )NN)Tr3   rd   collections.abcr   dataclassesr   rK   torch.nnr   transformers.modeling_utilsr   transformers.utilsr    r   rI  modeling_flash_attention_utilsr	   modeling_utilsr
   processing_utilsr   time_series_utilsr   r   r   utilsr   r   r   configuration_patchtsmixerr   
get_loggerr/   loggerModuler   r9   rN   ru   r   r   rL   r   r   r   r   r   r   r  r  r1  rC  r\  listr   r4   r{  r  r  r  r  r  r  r  r  r  r  r	  r  r%  r'  r-  distributionsDistributionr2  r8  r:  rZ  r\  rj  r`  rz  __all__rT  r(   r&   <module>r     sS   "  $ !   7 * & B 5 & U U @ @ : 
		H	% *&BII &,$RYY $N.BII .bbii .-299 -n !%II%<<% 
% <<	%
 LL4'% T\% % '(%:R/BII R/jCbii CL*		 *Z#		 #L&#		 &#R5BII 5pDRYY DN )/ ) )>ryy D -1',7%LL7%7% #Tk7% !%	7%
 7%| -1	A%LLA%#czA% #TkA% 	A%J-299 -b9"")) 9"z 0BII  0H3;RYY 3;n BII  6 
 	: 	: 	:Bk5 BkJ 
 +k + +4 
^
3 ^

^
B 
 :{ : :$ 
Q
!< Q

Q
h 
 +k + +0  / / /  / / /#u""// # #%,, #*5<< *%,,:M *chcoco *0TE ; TEn 
 :K : :$k
.I k
\ 
 :k : :$%ryy %P 
iE ; iE
iEXr(   