
    Z jK                       S r SSKrSSKJr  SSKJr  SSKrSSKJr  SSKJ	r
  SSKJr  SS	KJr  SS
KJr  SSKJr  SSKJrJr  SSKJr  SSKJrJrJr  SSKJrJrJrJ r   SSK!J"r"  \ RF                  " \$5      r%  SoS\RL                  S\RN                  S\RN                  S\RN                  S\RN                  S-  S\(S-  S\(S\\   4S jjr) " S S\RL                  5      r* " S S\RL                  5      r+   SpS\RN                  S \(S!\,S-  S"\-S#\.4
S$ jjr/  SqS\RN                  S%\,\.-  S!\,S-  S#\.4S& jjr0 " S' S(\RL                  5      r1 " S) S*\RL                  5      r2 " S+ S,\RL                  5      r3\ " S- S.\5      5       r4 " S/ S0\RL                  5      r5 " S1 S2\RL                  5      r6 " S3 S4\45      r7\" S5S69\ " S7 S8\5      5       5       r8\" S9S69\ " S: S;\5      5       5       r9\" S<S69\ " S= S>\5      5       5       r:\" S?S69\ " S@ SA\5      5       5       r;\" SBS69\ " SC SD\5      5       5       r<\" SES69\ " SF SG\5      5       5       r=SH\R|                  R~                  SI\RN                  SJ\RN                  4SK jr@SrSL\RN                  SM\RN                  S-  SJ\RN                  4SN jjrA " SO SP\RL                  5      rB " SQ SR\RL                  5      rC " SS ST\RL                  5      rD " SU SV\RL                  5      rE\ " SW SX\45      5       rF " SY SZ\RL                  5      rG\" S[S69 " S\ S]\45      5       rH " S^ S_\RL                  5      rI\" S`S69 " Sa Sb\45      5       rJ\" ScS69 " Sd Se\RL                  5      5       rK\" SfS69 " Sg Sh\45      5       rL " Si Sj\RL                  5      rM\" SkS69 " Sl Sm\45      5       rN/ SnQrOg)szPyTorch PatchTST model.    N)Callable)	dataclass)nn   )initialization)ACT2CLS)is_deepspeed_zero3_enabled)FlashAttentionKwargs)BaseModelOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)NegativeBinomialOutputNormalOutputStudentTOutput)ModelOutputTransformersKwargsauto_docstringlogging   )PatchTSTConfigmodulequerykeyvalueattention_maskscalingdropoutkwargsc                    Uc  UR                  S5      S-  n[        R                  " XR                  SS5      5      U-  nUb  X-   n[        R
                  R                  USS9n[        R
                  R                  XU R                  S9n[        R                  " X5      n	U	R                  SS5      R                  5       n	X4$ )N         r   dim)ptrainingr   )
sizetorchmatmul	transposer   
functionalsoftmaxr   r'   
contiguous)
r   r   r   r   r   r   r   r   attn_weightsattn_outputs
             /root/GenerationalWealth/GenerationalWealth/venv/lib/python3.13/site-packages/transformers/models/patchtst/modeling_patchtst.pyeager_attention_forwardr2   '   s     **R.D( <<}}Q':;gEL!#4==((2(>L==((6??([L,,|3K''1-88:K$$    c                   :  ^  \ rS rSrSr     SS\S\S\S\S\S	\S
\S-  4U 4S jjjr	   SS\
R                  S\
R                  S-  S\
R                  S-  S\S-  S\\   S\\
R                  \
R                  S-  \\
R                     S-  4   4S jjrSrU =r$ )PatchTSTAttentionD   z=Multi-headed attention from 'Attention Is All You Need' paperN	embed_dim	num_headsr   
is_decoderbias	is_causalconfigc                   > [         TU ]  5         Xl        X l        X0l        X-  U l        Xpl        U R
                  U-  U R                  :w  a  [        SU R                   SU S35      eU R
                  S-  U l        X@l	        X`l
        [        R                  " XUS9U l        [        R                  " XUS9U l        [        R                  " XUS9U l        [        R                  " XUS9U l        g )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: ).r"   r:   )super__init__r7   r8   r   head_dimr<   
ValueErrorr   r9   r;   r   Lineark_projv_projq_projout_proj)	selfr7   r8   r   r9   r:   r;   r<   	__class__s	           r1   rA   PatchTSTAttention.__init__G   s     	""!.MMI%$..8MdnnM]$YKr3  }}d*$"ii	4@ii	4@ii	4@		)TBr3   hidden_stateskey_value_statesr   output_attentionsr   returnc                    USLnUR                   SS n/ UQSPU R                  P7nU R                  U5      R                  U5      R	                  SS5      n	U(       a  UOUn
/ U
R                   SS QSPU R                  P7nU R                  U
5      R                  U5      R	                  SS5      nU R                  U
5      R                  U5      R	                  SS5      n[        R                  " U R                  R                  [        5      nU" U U	UUU4U R                  (       d  SOU R                  U R                  US.UD6u  nnUR                  " / UQSP76 R!                  5       nU R#                  U5      nUUS4$ )z#Input shape: Batch x Time x ChannelNr!   r   r#           )r   r   rN   )shaperB   rG   viewr+   rE   rF   r   get_interfacer<   _attn_implementationr2   r'   r   r   reshaper.   rH   )rI   rL   rM   r   rN   r   is_cross_attentioninput_shapehidden_shapequery_statescurrent_stateskv_shape
key_statesvalue_statesattention_interfacer0   r/   s                    r1   forwardPatchTSTAttention.forwardf   s    .T9 $))#2.88b8$--8 {{=166|DNNqRST-?)]B^))#2.BBDMMB[[055h?II!QO
{{>277AKKAqQ(?(M(MKK,,.E)
 %8
%
  $}}C$,,LL/
%
 
%
!\ "));;;;FFHmmK0L$..r3   )r<   r   r7   rB   r;   r9   rE   r8   rH   rG   r   rF   )rQ   FTFN)NNF)__name__
__module____qualname____firstlineno____doc__intfloatboolr   rA   r)   Tensorr   r
   tupler`   __static_attributes____classcell__rJ   s   @r1   r5   r5   D   s
   G  (,CC C 	C
 C C C %C CD 15.2).0/||0/  ,,-0/ t+	0/
  $;0/ -.0/ 
u||U\\D0%2E2LL	M0/ 0/r3   r5   c                   V   ^  \ rS rSrSrS\4U 4S jjrS\R                  4S jr	Sr
U =r$ )PatchTSTBatchNorm   zH
Compute batch normalization over the sequence length (time) dimension.
r<   c                 ~   > [         TU ]  5         [        R                  " UR                  UR
                  S9U l        g )Neps)r@   rA   r   BatchNorm1dd_modelnorm_eps	batchnormrI   r<   rJ   s     r1   rA   PatchTSTBatchNorm.__init__   s(    FOOLr3   inputsc                 l    UR                  SS5      nU R                  U5      nUR                  SS5      $ )z
Parameters:
    inputs (`torch.Tensor` of shape `(batch_size, sequence_length, d_model)`):
        input for Batch norm calculation
Returns:
    `torch.Tensor` of shape `(batch_size, sequence_length, d_model)`
r   r#   )r+   rx   )rI   r{   outputs      r1   r`   PatchTSTBatchNorm.forward   s7     !!!Q''1%%r3   )rx   rb   rc   rd   re   rf   r   rA   r)   rj   r`   rl   rm   rn   s   @r1   rp   rp      s+    M~ M
&ell 
& 
&r3   rp   r{   
mask_ratiounmasked_channel_indiceschannel_consistent_masking
mask_valuec                    US:  d  US:  a  [        SU S35      eU R                  u  pVpxU R                  n	[        USU-
  -  5      n
U(       a*  [        R
                  " USXyS9nUR                  SUS5      nO[        R
                  " XVXyS9n[        R                  " XVXyS9nSUSS2SS2SU
24'   [        R                  " USS9n[        R                  " USS9n[        R                  " USUS	9nUR                  S5      R                  SSSU5      nUb  SUSS2USS2SS24'   U R                  UR                  5       U5      nXS
   4$ )a  random_masking: Mask the input considering the control variables.

Args:
    inputs (`torch.Tensor` of shape `(batch_size, num_channels, sequence_length, num_features)`):
        The input tensor to mask.
    mask_ratio (`float`):
        Masking ratio applied to mask the input data during random pretraining. It is the number between 0 and 1.
    unmasked_channel_indices (list, *optional*):
        Indices of channels that will not be masked.
    channel_consistent_masking (bool, *optional*, defaults to `False`):
        When true, masking will be same across all channels of a timeseries. Otherwise, masking positions will vary
        across channels.
    mask_value (int, *optional*, defaults to 0):
        Define the value of masked patches for pretraining.

Returns:
    `tuple(torch.Tensor)`: inputs_mask, masked input, same shape as input Tensor and mask tensor of shape [bs x c x
    n]
r   r   zMask ratio z has to be between 0 and 1.deviceNr!   r$   )r%   index.r   )rC   rR   r   rg   r)   randrepeatonesargsortgather	unsqueezemasked_fillri   )r{   r   r   r   r   
batch_sizenum_channelssequence_lengthnum_featuresr   len_keepnoisemaskids_shuffleids_restoreinputs_masks                   r1   random_maskingr      sA   4 A~q;zl2MNOO>Dll;Jo]]F?a*n56H!

:q/IQa0 

:_T ::jODDAyy --2.K--4K<<"K8D>>"$$Q1l;D+23Q(!Q./$$TYY[*=KV$$r3   num_forecast_mask_patchesc                 P   [        U[        5      (       a  U/nU Vs/ s H  nSPM     nnU R                  u  pgp[        R                  " XgXR
                  S9n
/ nSn[        U5      n[        X5       HG  u  pUS::  d  X:  a  [        SU S35      e[        Xo-  U-  5      nUR                  XU/5        UU-  nMI     [        US S9nX:  a  US   S   Xl-
  -   US   S'   OX:  a  US	   S   X-
  -   US	   S'   SnU H  u  nnnUU-   nSU
UU2S
S
2U* S
24'   UnM     [        R                  " U
R                  S   5      nU
U   n
U
R                  S	5      R                  SSSU	5      n
Ub  SU
S
S
2US
S
2S
S
24'   U R                  U
R                  5       U5      nUU
S   4$ s  snf )ai  Forecast masking that masks the last K patches where K is from the num_forecast_mask_patches.
If num_forecast_mask_patches is a list, samples in the batch will be randomly masked by numbers defined in the list.

Parameters:
    inputs (`torch.Tensor`):
        Input of shape `(bs, num_channels, num_patch, patch_length)`
    num_forecast_mask_patches (`list`):
        Number of patches to be masked at the end of each batch sample. e.g. 4 or [3, 5].
    unmasked_channel_indices (`list`, *optional*):
        Indices of channels that are not masked.
    mask_value (`int`, *optional*, defaults to 0):
        Values in the masked patches will be filled by `mask_value`.

Returns:
    `tuple(torch.Tensor)`: inputs_mask, masked input, same shape as inputs Tensor and Mask tensor of shape `(bs,
    num_channels , num_patch)` or `(bs, tsg1, tsg2, num_channels, num_patch)`
r   r   r   znum_forecast_mask_patches z6 should be greater than 0 and less than total patches.c                     U S   $ )Nr#    )xs    r1   <lambda>"forecast_masking.<locals>.<lambda>  s    !A$r3   )r   r#   r!   Nr   )
isinstancerg   rR   r)   zerosr   sumziprC   appendsortedrandpermr   r   r   ri   )r{   r   r   r   _forecast_mask_ratiosr   r   r   r   r   t_listtotal_lengthtotal_ratiopatch_lengthratiotemp_lenbatch1	patch_lenbatch2permr   s                         r1   forecast_maskingr      s   0 +S11%>$?!'@A'@!A'@A>Dll;Jo;;zWDFL*+K"#<S1 ?,\N:pq  z)K78|H56   T F/F ay|z'@Aq	!		"r
1)BCr
1F"(	1h("./VF]A	z{*+ #)
 >>$**Q-(D:D>>"$$Q1l;D+23Q(!Q./$$TYY[*=KV$$O Bs   F#c                   V   ^  \ rS rSrSrS\4U 4S jjrS\R                  4S jr	Sr
U =r$ )PatchTSTPatchifyi-  z
A class to patchify the time series sequence into different patches

Returns:
    `torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`
r<   c                   > [         TU ]  5         UR                  U l        UR                  U l        UR
                  U l        U R                  U R                  ::  a&  [        SU R                   SU R                   S35      e[        U R                  U R                  5      U R                  -
  U R
                  -  S-   U l        U R                  U R
                  U R                  S-
  -  -   nU R                  U-
  U l	        g )NzSequence length (z+) has to be greater than the patch length ()r   )
r@   rA   context_lengthr   r   patch_striderC   maxnum_patchessequence_start)rI   r<   new_sequence_lengthrJ   s      r1   rA   PatchTSTPatchify.__init__5  s    %44"//"//4#4#44#D$8$8#99deievevdwwxy 
   4 4d6G6GH4K\K\\aeararruvv"//$2C2CtGWGWZ[G[2\\"225HHr3   past_valuesc                 4   UR                   S   nX R                  :w  a  [        SU SU R                   S35      eUSS2U R                  S2SS24   nUR	                  SU R
                  U R                  S9nUR                  SS5      R                  5       nU$ )z
Parameters:
    past_values (`torch.Tensor` of shape `(batch_size, sequence_length, num_channels)`, *required*):
        Input for patchification

Returns:
    `torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`
zInput sequence length (z%) doesn't match model configuration (r>   N)	dimensionr(   step)	rR   r   rC   r   unfoldr   r   r+   r.   )rI   r   r   r}   s       r1   r`   PatchTSTPatchify.forwardF  s     &++B/222)/)::_`d`t`t_uuwx  Q 3 3 5q89$2C2C$J[J[\!!"b)446r3   )r   r   r   r   r   r   rn   s   @r1   r   r   -  s+    I~ I"5<<  r3   r   c                   V   ^  \ rS rSrSrS\4U 4S jjrS\R                  4S jr	Sr
U =r$ )PatchTSTMaskingi]  al  
Class to perform random or forecast masking.

Parameters:
    config (`PatchTSTConfig`): model config
Returns:
    x_mask (`torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`)
        Masked patched input
    mask (`torch.Tensor` of shape `(batch_size, num_channels, num_patches)`)
        Bool tensor indicating True on masked points
r<   c                 >  > [         TU ]  5         UR                  U l        UR                  U l        UR                  U l        UR
                  U l        UR                  U l        UR                  U l        U R                  b  [        U R                  5      U l        g g N)	r@   rA   random_mask_ratior   	mask_typer   r   r   r   ry   s     r1   rA   PatchTSTMasking.__init__j  s    !'!9!9*0*K*K')))/)I)I&(.(G(G% ++((4,243P3P,QD) 5r3   patch_inputc                 d   U R                   S:X  a8  [        UU R                  U R                  U R                  U R
                  S9u  p#OVU R                   S:X  a-  [        UU R                  U R                  U R
                  S9u  p#O[        SU R                    S35      eUR                  5       nX#4$ )a  
Parameters:
    patch_input (`torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`, *required*):
        Patch input

Return:
    masked_input (`torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`)
        Masked patched input
    mask (`torch.Tensor` of shape `(batch_size, num_channels, num_patches)`)
        Bool tensor indicating True on masked points

random)r{   r   r   r   r   forecast)r{   r   r   r   zInvalid mask type .)
r   r   r   r   r   r   r   r   rC   ri   )rI   r   masked_inputr   s       r1   r`   PatchTSTMasking.forwardu  s     >>X%!/"11)-)F)F+/+J+J??"L$ ^^z)!1"*.*H*H)-)F)F??	"L$ 1$..1ACDD yy{!!r3   )r   r   r   r   r   r   r   rn   s   @r1   r   r   ]  s+    
	R~ 	R!"5<< !" !"r3   r   c                   d   ^  \ rS rSrSrS\4U 4S jjrS
S\R                  S\	S-  4S jjr
S	rU =r$ )PatchTSTEncoderLayeri  z
PatchTST encoder layer
r<   c                 (  > [         TU ]  5         UR                  U l        [        UR                  UR
                  UR                  US9U l        UR                  S:  a   [        R                  " UR                  5      O[        R                  " 5       U l        UR                  S:X  a  [        U5      U l        OWUR                  S:X  a/  [        R                   " UR                  UR"                  S9U l        O[%        UR                   S35      eU R                  (       a  UR                  S:  a   [        R                  " UR                  5      O[        R                  " 5       U l        UR                  S:X  a  [        U5      U l        OWUR                  S:X  a/  [        R                   " UR                  UR"                  S9U l        O[%        UR                   S35      e[        R*                  " [        R,                  " UR                  UR.                  UR0                  S9[2        UR4                     " 5       UR6                  S:  a   [        R                  " UR6                  5      O[        R                  " 5       [        R,                  " UR.                  UR                  UR0                  S95      U l        UR                  S:  a   [        R                  " UR                  5      O[        R                  " 5       U l        UR                  S:X  a  [        U5      U l        OWUR                  S:X  a/  [        R                   " UR                  UR"                  S9U l        O[%        UR                   S35      eUR>                  U l        g )N)r7   r8   r   r<   r   rx   	layernormrs   z$ is not a supported norm layer type.r?   ) r@   rA   channel_attentionr5   rv   num_attention_headsattention_dropout	self_attnpath_dropoutr   DropoutIdentitydropout_path1	norm_typerp   norm_sublayer1	LayerNormrw   rC   dropout_path2norm_sublayer2
SequentialrD   ffn_dimr:   r   activation_function
ff_dropoutffdropout_path3norm_sublayer3pre_normry   s     r1   rA   PatchTSTEncoderLayer.__init__  s   !'!9!9*nn00,,	
 AG@S@SVW@WRZZ(;(;<]_]h]h]j{*"3F";D,"$,,v~~6??"SD 0 011UVWW !!DJDWDWZ[D[F,?,?!@acalalanD;.&7&?#!![0&(ll6>>v&W# F$4$4#55Y!Z[[ --IIfnnfnn6;;GF../1-3->->-BBJJv(()IIfnnfnn6;;G	
 AG@S@SVW@WRZZ(;(;<]_]h]h]j{*"3F";D,"$,,v~~6??"SD 0 011UVWWr3   Nhidden_staterN   c                    UR                   u  p4pVUR                  X4-  XV5      nU R                  (       a6  U R                  U R	                  U5      US9u  pxn	XR                  U5      -   nO4U R                  XS9u  pxn	U R	                  XR                  U5      -   5      nUR                  X4XV5      nU R                  (       a  UR                  SS5      R                  5       nUR                  X5-  XF5      nU R                  (       a6  U R                  U R                  U5      US9u  pzn	XR                  U5      -   nO4U R                  XS9u  pzn	U R                  XR                  U5      -   5      nUR                  X5XF5      nUR                  SS5      R                  5       nUR                  X4-  XV5      nU R                  (       a2  XR                  U R                  U R                  U5      5      5      -   nO1U R                  XR                  U R                  U5      5      -   5      nUR                  X4XV5      nU4nU(       a  XR                  (       a  UW
4OU4-  nU$ )ao  
Parameters:
    hidden_state (`torch.Tensor` of shape `(batch_size, num_channels, sequence_length, d_model)`, *required*):
        Past values of the time series
    output_attentions (`bool`, *optional*):
        Whether or not to return the output attention of all layers
Return:
    `torch.Tensor` of shape `(batch_size, num_channels, sequence_length, d_model)`

)rL   rN   r#   r   )rR   rS   r   r   r   r   rV   r   r+   r.   r   r   r   r   r   )rI   r   rN   r   num_input_channelsr   rv   r0   r/   r   channel_attn_weightsoutputss               r1   r`   PatchTSTEncoderLayer.forward  si    DPCUCU@
 $(()H/c==+/>>"11,?Sd ,: ,(Kq (*<*<[*IIL ,0>>* ,: ,(Kq  ..|>P>PQ\>]/]^L $++JOe !!'11!Q7BBDL',,Z-IK]gL}}7;~~"&"5"5l"CWh 8F 841  ,.@.@.MM 8<~~". 8F 841  $22<BTBTU`Ba3ab (//
M_iL'11!Q7BBDL $(()H/c== (*<*<TWWTEXEXYeEf=g*hhL  ..|>P>PQUQXQXYeQf>g/ghL $++JOe/?U?U&:;\h[jjGr3   )
r   r   r   r   r   r   r   r   r   r   r   )rb   rc   rd   re   rf   r   rA   r)   rj   ri   r`   rl   rm   rn   s   @r1   r   r     s9    0(~ 0(dQELL QTD[ Q Qr3   r   c                       \ rS rSr% \\S'   SrSrSrSr	Sr
SrSr\R                  " 5       S\R                   4S	 j5       rSS
 jrSrg)PatchTSTPreTrainedModeli$  r<   modelr   )timeFTr   c                    [        U[        5      (       Gab  [        U R                  R                  U R                  R
                  5      U R                  R
                  -
  U R                  R                  -  S-   nU R                  R                  (       a$  [        R                  " UR                  SS9  US-  nUR                  U R                  U5      n[        5       (       aq  SSKnUR                  R                  UR                   SS9   UR                   R#                  5       S:  a!  [        R$                  " UR                   U5        SSS5        g[        R$                  " UR                   U5        g[        U[&        R(                  [&        R*                  45      (       a  [        R,                  " UR.                  5        [        R0                  " UR2                  5        [5        USS5      ba  [        R,                  " UR6                  5        [        R0                  " UR8                  5        [        R,                  " UR:                  5        gg[        U[&        R<                  5      (       ac  [        R                  " UR2                  SU R                  R>                  S	9  UR.                  b!  [        R,                  " UR.                  5        ggg! , (       d  f       g= f)
z
Initialize weights
r   g{Gz?)stdr   N)modifier_rankrunning_meanrQ   )meanr   ) r   PatchTSTPositionalEncodingr   r<   r   r   r   use_cls_tokeninitnormal_	cls_token_init_per	   	deepspeedzeroGatheredParametersposition_encnumelcopy_r   r   ru   zeros_r:   ones_weightgetattrr   running_varnum_batches_trackedrD   init_std)rI   r   r   r	  r  s        r1   _init_weights%PatchTSTPreTrainedModel._init_weights/  s   
 f899 DKK..0H0HIDKKLdLdd))*,-.K {{((V--48q !??4;;DL)++ ^^66v7J7JZ^6_**002Q6

6#6#6E `_ 

6..=r~~ >??KK$JJv}}%v~t4@F//0

6--.F667 A 		**LLSdkk6J6JK{{&FKK( ' + `_s   A K
K)c                 <    [        U[        5      (       a  X!l        g g r   )r   PatchTSTEncodergradient_checkpointing)rI   r   r   s      r1   _set_gradient_checkpointing3PatchTSTPreTrainedModel._set_gradient_checkpointingS  s    f00,1) 1r3   r   N)F)rb   rc   rd   re   r   __annotations__base_model_prefixmain_input_nameinput_modalitiessupports_gradient_checkpointing_supports_flash_attn_supports_sdpa_supports_flex_attnr)   no_gradr   Moduler  r  rl   r   r3   r1   r   r   $  sY    #O &+#N
]]_!)BII !) !)F2r3   r   c                   R   ^  \ rS rSrS\4U 4S jjrS\R                  4S jrSr	U =r
$ )PatchTSTEmbeddingiX  r<   c                   > [         TU ]  5         UR                  U l        UR                  U l        U R                  (       a1  [        R
                  " UR                  UR                  5      U l        g [        R                  " 5       U l        [        UR                  5       HG  nU R                  R                  [        R
                  " UR                  UR                  5      5        MI     g r   )r@   rA   r   share_embeddingr   rD   r   rv   input_embedding
ModuleListranger   )rI   r<   r   rJ   s      r1   rA   PatchTSTEmbedding.__init__Y  s    "(";";%55#%99V-@-@&..#QD #%==?D 6445$$++BIIf6I6I6>>,Z[ 6r3   r   c                 j   UR                   S   nX R                  :w  a  [        SU R                   SU S35      eU R                  (       a  U R	                  U5      nU$ [        U5       Vs/ s H$  o@R                  U   " USS2USS2SS24   5      PM&     nn[        R                  " USS9nU$ s  snf )z
Parameters:
    patch_input (`torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`, *required*):
        Patch input for embedding
return:
    `torch.Tensor` of shape `(batch_size, num_channels, num_patches, d_model)`
r   z&The defined number of input channels (zQ) in the config has to be the same as the number of channels in the batch input (r   Nr$   )rR   r   rC   r'  r(  r*  r)   stack)rI   r   r   
embeddingsis        r1   r`   PatchTSTEmbedding.forwarde  s     )..q1!8!8889P9P8Q RTTfSgghj  --k:J  UZZlTmnTmq..q1+aAqj2IJTmJnZQ7J os   ,+B0)r(  r   r'  rb   rc   rd   re   r   rA   r)   rj   r`   rl   rm   rn   s   @r1   r%  r%  X  s&    
\~ 
\5<<  r3   r%  c                      ^  \ rS rSrSrS\S\4U 4S jjr\S\S\S\	R                  4S j5       rS\R                  4S	 jrS
rU =r$ )r   i|  z
Class for positional encoding
r<   r   c                   > [         TU ]  5         UR                  U l        UR                  U l        UR                  (       aA  [        R
                  " [        R                  " SSSUR                  5      5      U l	        US-  nU R                  X5      U l        UR                  S:  a&  [        R                  " UR                  5      U l        g [        R                  " 5       U l        g )Nr   r   )r@   rA   r  r   r   	Parameterr)   r   rv   r  r  r	  positional_dropoutr   r   rI   r<   r   rJ   s      r1   rA   #PatchTSTPositionalEncoding.__init__  s    #11"(";";\\%++aAv~~*NODN1K MM&> 6<5N5NQR5RBJJv001 	XZXcXcXe 	r3   rO   c                 $   U R                   S:X  a5  [        R                  " [        R                  " XR
                  5      SS9nU$ U R                   S:X  Ga#  [        R                  " XR
                  5      n[        R                  " SU5      R                  S5      n[        R                  " [        R                  " SU R
                  S5      [        R                  " S5      U R
                  -  * -  5      n[        R                  " X4-  5      US S 2SS S24'   [        R                  " X4-  5      US S 2SS S24'   X"R                  5       -
  nX"R                  5       S	-  -  n[        R                  " US
S9nU$ [!        U R                    S35      e)Nr   Trequires_gradsincosr   r   r#   g     @
   FzN is not a valid positional encoder. Available types are 'random' and 'sincos'.)positional_encoding_typer   r4  r)   randnrv   r   aranger   expmathlogsincosr   r   rC   )r<   r   r	  positiondiv_terms        r1   r  #PatchTSTPositionalEncoding._init_pe  sX    **h6<<K(P`deL  ,,8 ;;{NNCL||A{3==a@Hyya!CQXHY\b\j\jHjFk!klH$)IIh.A$BLADqD!$)IIh.A$BLADqD!'*;*;*==L'+;+;+=+BCL<<EJL
  223  4B  C r3   r   c                 x   U R                   (       a  U R                  XR                  SS 2S S 24   -   5      nU R                  U R                  S S2S S 24   -   nUR	                  UR
                  S   U R                  SS5      n[        R                  " X14SS9nU$ U R                  XR                  -   5      nU$ )Nr   r   r!   r#   r$   )	r  r5  r	  r  expandrR   r   r)   cat)rI   r   r  
cls_tokensr   s        r1   r`   "PatchTSTPositionalEncoding.forward  s    11+@Q@QRSRTVWRW@X2XYK):):2A2q5)AAI"))+*;*;A*>@W@WY[]_`J 99j%>AFL   22;ARAR3RSLr3   )r  r   r	  r5  r  )rb   rc   rd   re   rf   r   rg   rA   staticmethodr   r4  r  r)   rj   r`   rl   rm   rn   s   @r1   r   r   |  s]    
~ 
C 
  c bll  &5<<  r3   r   c            	       z   ^  \ rS rSrSrS\S\4U 4S jjr  SS\R                  S\
S-  S	\
S-  S
\4S jjrSrU =r$ )r  i  z
PatchTST Encoder
r<   r   c                 ,  > [         TU ]  U5        SU l        [        U5      U l        [        X5      U l        [        R                  " [        UR                  5       Vs/ s H  n[        U5      PM     sn5      U l        U R                  5         g s  snf )NF)r@   rA   r  r%  embedderr   positional_encoderr   r)  r*  num_hidden_layersr   layers	post_init)rI   r<   r   r/  rJ   s       r1   rA   PatchTSTEncoder.__init__  sx     &+# *&1"<V"Qmm5QWQiQiKj$kKja%9&%AKj$kl 	 %ls   BNr   output_hidden_statesrN   rO   c                 h   Ub  UOU R                   R                  nUb  UOU R                   R                  nU R                  U5      nU R	                  U5      nU(       a  SOSnU(       a  SOSnU R
                   H+  nU(       a  Xe4-   nU" XSS9n	U	S   nU(       d  M#  XyS   4-   nM-     [        XVUS9$ )ar  
Parameters:
    patch_input (`torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`, *required*):
        Past values of the time series
    output_hidden_states (bool, optional): Indicates if hidden states should be outputted.
    output_attentions (bool, optional): Indicates if attentions should be outputted.

return:
    `BaseModelOutput`
Nr   )r   rN   r   r   )last_hidden_staterL   
attentions)r<   rN   rV  rP  rQ  rS  r   )
rI   r   rV  rN   r   r   encoder_statesall_attentionsencoder_layerlayer_outputss
             r1   r`   PatchTSTEncoder.forward  s    " 2C1N-TXT_T_TqTq$8$D $++JjJj 	
 mmK0..{;30d![[M#!//!A)|iM )+L  !/3C2E!E ) hvwwr3   )rP  r  rS  rQ  NN)rb   rc   rd   re   rf   r   rg   rA   r)   rj   ri   r   r`   rl   rm   rn   s   @r1   r  r    se    ~ C " -1)-	)x\\)x #Tk)x  $;	)x 
)x )xr3   r  zG
    Base class for model's outputs, with potential hidden states.
    )custom_introc                   >   \ rS rSr% SrSr\R                  S-  \S'   Sr	\
\R                     S-  \S'   Sr\
\R                     S-  \S'   Sr\R                  S-  \S'   Sr\R                  S-  \S'   Sr\R                  S-  \S	'   Sr\R                  S-  \S
'   Srg)PatchTSTModelOutputi  a  
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_channels, num_patches, patch_length)`):
    Sequence of hidden-states at the output of the last layer of the model.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
    Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
    one for the output of each layer) of shape `(batch_size, num_channels, height, width)`. Hidden-states of
    the model at the output of each layer plus the optional initial embedding outputs.
mask (`torch.FloatTensor` of shape `(batch_size, num_channels, num_patches)`, *optional*):
    Bool masked tensor indicating which patches are masked
loc (`torch.FloatTensor` of shape `(batch_size, 1, num_channels)`, *optional*):
    Mean of the input data (batch_size, sequence_length, num_channels) over the sequence_length
scale (`torch.FloatTensor` of shape `(batch_size, 1, num_channels)`, *optional*):
    Std of the input data (batch_size, sequence_length, num_channels) over the sequence_length
patch_input (`torch.FloatTensor` of shape `(batch_size, num_channels, num_patches, patch_length)`):
    Patched input to the Transformer
NrX  rL   rY  r   locscaler   r   )rb   rc   rd   re   rf   rX  r)   FloatTensorr  rL   rk   rY  r   rc  rd  r   rl   r   r3   r1   rb  rb    s    " 37u((4/659M5**+d2926Je''(4/6%)D%

d
")$(C		T	!(&*E5t#*,0K""T)0r3   rb  z4
    Output type of [`PatchTSTForPretraining`].
    c                       \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Sr
\\R                     S-  \S'   Sr\\R                     S-  \S'   Srg)	PatchTSTForPretrainingOutputi  a
  
loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
    MSE loss.
prediction_output (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
    Prediction outputs of the time series modeling heads.
Nlossprediction_outputrL   rY  r   )rb   rc   rd   re   rf   rh  r)   re  r  ri  rL   rk   rY  rl   r   r3   r1   rg  rg    sh     &*D%

d
")26u((4/659M5**+d2926Je''(4/6r3   rg  z3
    Output type of [`PatchTSTForRegression`].
    c                       \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Sr
\\R                     S-  \S'   Sr\\R                     S-  \S'   Srg)	PatchTSTForRegressionOutputi(  z
loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
    MSE loss.
regression_outputs (`torch.FloatTensor` of shape `(batch_size, num_targets)`):
    Regression outputs of the time series modeling heads.
Nrh  regression_outputsrL   rY  r   )rb   rc   rd   re   rf   rh  r)   re  r  rl  rL   rk   rY  rl   r   r3   r1   rk  rk  (  sh     &*D%

d
")37))D0759M5**+d2926Je''(4/6r3   rk  z3
    Output type of [`PatchTSTForPrediction`].
    c                      \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Sr
\\R                     S-  \S'   Sr\\R                     S-  \S'   Sr\R                  S-  \S'   Sr\R                  S-  \S	'   S
rg)PatchTSTForPredictionOutputi<  a  
loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
    MSE loss.
prediction_outputs (`torch.FloatTensor` of shape `(batch_size, prediction_length, -1)`):
    Prediction outputs of the time series modeling heads.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
    Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
    sequence_length)`.

    Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
    heads.
loc: (`torch.FloatTensor` of shape `(batch_size, 1, num_channels)`, *optional*)
    Mean of the input data (batch_size, sequence_length, num_channels) over the sequence_length
scale: (`torch.FloatTensor` of shape `(batch_size, 1, num_channels)`, *optional*)
    Std of the input data (batch_size, sequence_length, num_channels) over the sequence_length
Nrh  prediction_outputsrL   rY  rc  rd  r   )rb   rc   rd   re   rf   rh  r)   re  r  ro  rL   rk   rY  rc  rd  rl   r   r3   r1   rn  rn  <  s    " &*D%

d
")37))D0759M5**+d2926Je''(4/6$(C		T	!(&*E5t#*r3   rn  z7
    Output type of [`PatchTSTForClassification`].
    c                       \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Sr
\\R                     S-  \S'   Sr\\R                     S-  \S'   Srg)	PatchTSTForClassificationOutputi\  as  
loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
    Total loss as the sum of the masked language modeling loss and the next sequence prediction
    (classification) loss.
prediction_logits (`torch.FloatTensor` of shape `(batch_size, num_targets)`):
    Prediction scores of the PatchTST modeling head (scores before SoftMax).
Nrh  prediction_logitsrL   rY  r   )rb   rc   rd   re   rf   rh  r)   re  r  rr  rL   rk   rY  rl   r   r3   r1   rq  rq  \  sh     &*D%

d
")26u((4/659M5**+d2926Je''(4/6r3   rq  z
    Base class for time series model's predictions outputs that contains the sampled values from the chosen
    distribution.
    c                   B    \ rS rSr% SrSr\R                  S-  \S'   Sr	g)SamplePatchTSTOutputiq  z
sequences (`torch.FloatTensor` of shape `(batch_size, num_samples, prediction_length, num_targets)`):
    Sampled values from the chosen distribution.
N	sequencesr   )
rb   rc   rd   re   rf   ru  r)   re  r  rl   r   r3   r1   rt  rt  q  s    
 +/Iu  4'.r3   rt  inputtargetrO   c                 &    U R                  U5      * $ )z[
Computes the negative log likelihood loss from input distribution with respect to target.
)log_prob)rv  rw  s     r1   nllrz    s     NN6"""r3   input_tensorweightsc                 R   Ub  [         R                  " US:g  X-  [         R                  " U 5      5      n[         R                  " U(       a  UR	                  US9OUR	                  5       SS9nU(       a  UR	                  US9U-  $ UR	                  5       U-  $ U R                  US9$ )a:  
Computes the weighted average of a given tensor across a given `dim`, masking values associated with weight zero,
meaning instead of `nan * 0 = nan` you will get `0 * 0 = 0`.

Args:
    input_tensor (`torch.FloatTensor`):
        Input tensor, of which the average must be computed.
    weights (`torch.FloatTensor`, *optional*):
        Weights tensor, of the same shape as `input_tensor`.
    dim (`int`, *optional*):
        The dim along which to average `input_tensor`.

Returns:
    `torch.FloatTensor`: The tensor with values averaged along the specified `dim`.
r   r$         ?min)r)   where
zeros_likeclampr   r   )r{  r|  r%   weighted_tensorsum_weightss        r1   weighted_averager    s      ++glL4JEL\L\]iLjkkk#'++#+"67;;=VYZ03###,R]]]9L9L9NR]]]  S ))r3   c            	          ^  \ rS rSrSrS\4U 4S jjrS\R                  S\R                  S\	\R                  \R                  \R                  4   4S jr
S	rU =r$ )
PatchTSTStdScaleri  z
Standardize features by calculating the mean and scaling along the first dimension, and then normalizes it by
subtracting from the mean and dividing by the standard deviation.
r<   c                   > [         TU ]  5         [        US5      (       a  UR                  OSU l        [        US5      (       a  UR
                  OSU l        [        US5      (       a  UR                  U l        g SU l        g )Nscaling_dimr   keepdimTminimum_scalegh㈵>)r@   rA   hasattrr  r%   r  r  ry   s     r1   rA   PatchTSTStdScaler.__init__  sd    )0)G)G6%%Q)0)C)Cv~~5<V_5U5UV11[_r3   dataobserved_indicatorrO   c                 r   UR                  U R                  U R                  S9nUR                  S5      nX-  R                  U R                  U R                  S9U-  nX-
  U-  S-  R                  U R                  U R                  S9U-  n[        R
                  " XPR                  -   5      nX-
  U-  XF4$ )  
Parameters:
    data (`torch.Tensor` of shape `(batch_size, sequence_length, num_input_channels)`):
        input for Batch norm calculation
    observed_indicator (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`):
        Calculating the scale on the observed indicator.
Returns:
    tuple of `torch.Tensor` of shapes
        (`(batch_size, sequence_length, num_input_channels)`,`(batch_size, 1, num_input_channels)`,
        `(batch_size, 1, num_input_channels)`)
r  r~  r#   )r   r%   r  	clamp_minr)   sqrtr  )rI   r  r  denominatorrc  variancerd  s          r1   r`   PatchTSTStdScaler.forward  s     ),,TXXt||,L!++C0(--dhh-MP[[j$661<AA$((TXT`T`Aadoo

8&8&889
e#S//r3   )r%   r  r  rb   rc   rd   re   rf   r   rA   r)   rj   rk   r`   rl   rm   rn   s   @r1   r  r    sX    
`~ `0LL06;ll0	u||U\\5<<7	80 0r3   r  c            	          ^  \ rS rSrSrS\4U 4S jjrS\R                  S\R                  S\	\R                  \R                  \R                  4   4S jr
S	rU =r$ )
PatchTSTMeanScaleri  z~
Computes a scaling factor as the weighted average absolute value along the first dimension, and scales the data
accordingly.
r<   c                 N  > [         TU ]  5         [        US5      (       a  UR                  OSU l        [        US5      (       a  UR
                  OSU l        [        US5      (       a  UR                  OSU l        [        US5      (       a  UR                  U l        g S U l        g )Nr  r   r  Tr  绽|=default_scale)r@   rA   r  r  r%   r  r  r  ry   s     r1   rA   PatchTSTMeanScaler.__init__  s    )0)G)G6%%Q)0)C)Cv~~5<V_5U5UV11[`5<V_5U5UV11[_r3   r  r  rO   c                    X-  R                  5       R                  U R                  SS9nUR                  U R                  SS9nU[        R                  " USS9-  nU R
                  cL  UR                  SS9n[        R                  " UR                  S5      SS9n[        R                  " Xg-  5      nO#U R
                  [        R                  " U5      -  n[        R                  " US:  XX5      n[        R                  " XPR                  S9nX-  n	U R                  (       d  UR                  U R                  S9nU	[        R                  " U5      U4$ )r  Tr  r   r  r   r$   )absr   r%   r)   r  r  squeeze	ones_liker  r  r  r  )
rI   r  r  ts_sumnum_observedrd  	batch_sumbatch_observationsr  scaled_datas
             r1   r`   PatchTSTMeanScaler.forward  s"    +00266txx6N)--dhh-E\q99 %

q
)I!&\-=-=a-@a!H!MM)*HIM ..1GGM L1,eC E'9'9:l||MMdhhM/EE,,U3U::r3   )r  r%   r  r  r  rn   s   @r1   r  r    sX    
`~ `&;LL&;6;ll&;	u||U\\5<<7	8&; &;r3   r  c            
          ^  \ rS rSrSrS\4U 4S jjr SS\R                  S\R                  S-  S\	\R                  \R                  \R                  4   4S	 jjr
S
rU =r$ )PatchTSTNOPScaleri  zt
Assigns a scaling factor equal to 1 along the first dimension, and therefore applies no scaling to the input data.
r<   c                    > [         TU ]  5         [        US5      (       a  UR                  OSU l        [        US5      (       a  UR
                  U l        g SU l        g )Nr  r   r  T)r@   rA   r  r  r%   r  ry   s     r1   rA   PatchTSTNOPScaler.__init__  sF    )0)G)G6%%Q)0)C)Cv~~r3   Nr  r  rO   c                     [         R                  " USS9R                  U R                  U R                  S9n[         R
                  " USS9R                  U R                  U R                  S9nXU4$ )aP  
Parameters:
    data (`torch.Tensor` of shape `(batch_size, sequence_length, num_input_channels)`):
        input for Batch norm calculation
Returns:
    tuple of `torch.Tensor` of shapes
        (`(batch_size, sequence_length, num_input_channels)`,`(batch_size, 1, num_input_channels)`,
        `(batch_size, 1, num_input_channels)`)
Fr9  r%   r  )r)   r  r   r%   r  r  )rI   r  r  rd  rc  s        r1   r`   PatchTSTNOPScaler.forward  sg     E:??DHHVZVbVb?ct59>>488UYUaUa>b%r3   r  r   r  rn   s   @r1   r  r    sd    N~ N MQ LL 6;llT6I 	u||U\\5<<7	8   r3   r  c            	          ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  S\\R                  \R                  \R                  4   4S jr	Sr
U =r$ )	PatchTSTScaleri  r<   c                    > [         TU ]  5         UR                  S:X  d  UR                  SL a  [        U5      U l        g UR                  S:X  a  [        U5      U l        g [        U5      U l        g )Nr   Tr   )r@   rA   r   r  scalerr  r  ry   s     r1   rA   PatchTSTScaler.__init__  sU    >>V#v~~'=,V4DK^^u$+F3DK+F3DKr3   r  r  rO   c                 2    U R                  X5      u  pnXU4$ )a  
Parameters:
    data (`torch.Tensor` of shape `(batch_size, sequence_length, num_input_channels)`):
        Input for scaler calculation
    observed_indicator (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`):
        Calculating the scale on the observed indicator.
Returns:
    tuple of `torch.Tensor` of shapes
        (`(batch_size, sequence_length, num_input_channels)`,`(batch_size, 1, num_input_channels)`,
        `(batch_size, 1, um_input_channels)`)
r  )rI   r  r  rc  rd  s        r1   r`   PatchTSTScaler.forward#  s"      ;;t@5%r3   r  )rb   rc   rd   re   r   rA   r)   rj   rk   r`   rl   rm   rn   s   @r1   r  r    sQ    4~ 4 LL 6;ll 	u||U\\5<<7	8   r3   r  c                      ^  \ rS rSrS\4U 4S jjr     SS\R                  S\R                  S-  S\R                  S-  S\S-  S	\S-  S
\S-  S\	\
-  4S jjrSrU =r$ )PatchTSTModeli5  r<   c                 f  > [         TU ]  U5        [        U5      U l        [	        U5      U l        UR                  U l        U R
                  R                  nU R                  (       a  [        U5      U l	        O[        R                  " 5       U l	        [        XS9U l        U R                  5         g )N)r   )r@   rA   r  r  r   
patchifierdo_mask_inputr   r   maskingr   r   r  encoderrT  r6  s      r1   rA   PatchTSTModel.__init__7  s     $V,*62#11oo11*62DL;;=DL&vG 	r3   Nr   past_observed_maskfuture_valuesrV  rN   return_dictrO   c           
         Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nUc  [        R
                  " U5      nU R                  X5      u  pn
U R                  U5      nU R                  (       a  U R                  U5      u  pOU R                  U5      SpU R                  XUS9nU(       d<  UR                  UR                  UR                  4nXXU4-   n[        S U 5       5      $ [        UR                  UR                  UR                  UU	U
US9$ )a  
Parameters:
    past_values (`torch.Tensor` of shape `(bs, sequence_length, num_input_channels)`, *required*):
        Input sequence to the model
    past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`, *optional*):
        Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected
        in `[0, 1]`:

        - 1 for values that are **observed**,
        - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).
    future_values (`torch.BoolTensor` of shape `(batch_size, prediction_length, num_input_channels)`, *optional*):
        Future target values associated with the `past_values`
    output_hidden_states (`bool`, *optional*):
        Whether or not to return the hidden states of all layers
    output_attentions (`bool`, *optional*):
        Whether or not to return the output attention of all layers
    return_dict (`bool`, *optional*):
        Whether or not to return a `ModelOutput` instead of a plain tuple.

Returns:
    `PatchTSTModelOutput` or tuple of `torch.Tensor` (if `return_dict`=False or `config.return_dict`=False)

Examples:

```python
>>> from huggingface_hub import hf_hub_download
>>> import torch
>>> from transformers import PatchTSTModel

>>> file = hf_hub_download(
...     repo_id="hf-internal-testing/etth1-hourly-batch", filename="train-batch.pt", repo_type="dataset"
... )
>>> batch = torch.load(file)

>>> model = PatchTSTModel.from_pretrained("namctin/patchtst_etth1_pretrain")

>>> # during training, one provides both past and future values
>>> outputs = model(
...     past_values=batch["past_values"],
...     future_values=batch["future_values"],
... )

>>> last_hidden_state = outputs.last_hidden_state
```N)r   rV  rN   c              3   .   #    U  H  oc  M  Uv   M     g 7fr   r   ).0vs     r1   	<genexpr>(PatchTSTModel.forward.<locals>.<genexpr>  s     =GqGs   	)rX  rL   rY  r   rc  rd  r   )r<   r  rN   rV  r)   r  r  r  r  r  r  rX  rL   rY  rk   rb  )rI   r   r  r  rV  rN   r  r   scaled_past_valuesrc  rd  patched_valuesmasked_valuesr   encoder_outputr   s                   r1   r`   PatchTSTModel.forwardI  sK   n &1%<k$++BYBY1B1N-TXT_T_TqTq$8$D $++JjJj 	 %!&!= *.[)U& );<"&,,~">M4"&,,~">4%du & 
 %779U9UWeWpWpqGs> BBG=G===",>>(66%00&
 	
r3   )r  r  r  r  r  NNNNN)rb   rc   rd   re   r   rA   r)   rj   ri   rk   rb  r`   rl   rm   rn   s   @r1   r  r  5  s    ~ * 37-1,0)-#'[
\\[
 "LL4/[
 ||d*	[

 #Tk[
  $;[
 D[[
 
$	$[
 [
r3   r  c                   n   ^  \ rS rSrSrS\4U 4S jjrS\R                  S\R                  4S jr	Sr
U =r$ )	PatchTSTMaskPretrainHeadi  z%
Pretraining head for mask modelling
r<   c                 8  > [         TU ]  5         UR                  S:  a   [        R                  " UR                  5      O[        R
                  " 5       U l        [        R                  " UR                  UR                  5      U l
        UR                  U l        g Nr   )r@   rA   head_dropoutr   r   r   r   rD   rv   r   linearr  ry   s     r1   rA   !PatchTSTMaskPretrainHead.__init__  sh    :@:M:MPQ:Qrzz&"5"56WYWbWbWdii0C0CD#11r3   	embeddingrO   c                     U R                  U R                  U5      5      nU R                  (       a  USS2SS2SS2SS24   nU$ )a  
Parameters:
    embedding (`torch.Tensor` of shape `(bs, num_channels, num_patches, d_model)` or
            `(bs, num_channels, num_patches+1, d_model)` if `cls_token` is set to True, *required*):
        Embedding from the model
Returns:
    `torch.Tensor` of shape `(bs, num_channels, num_patches, d_model)` or
                    `(bs, num_channels, num_patches+1, d_model)` if `cls_token` is set to True

Nr   )r  r   r  )rI   r  s     r1   r`    PatchTSTMaskPretrainHead.forward  s>     KKY 78	!!QA+.Ir3   )r   r  r  r   rn   s   @r1   r  r    s4    2~ 2 %,,  r3   r  z*
    The PatchTST for pretrain model.
    c                      ^  \ rS rSrS\4U 4S jjr    SS\R                  S\R                  S-  S\S-  S\S-  S	\S-  S
\	\
-  4S jjrSrU =r$ )PatchTSTForPretrainingi  r<   c                    > [         TU ]  U5        SUl        [        US9U l        [        U5      U l        U R                  5         g )NT)r<   )r@   rA   r  r  r   r  headrT  ry   s     r1   rA   PatchTSTForPretraining.__init__  s<     #"&1
,V4	 	r3   Nr   r  rV  rN   r  rO   c                    Ub  UOU R                   R                  nU R                  UUUUSS9nU R                  UR                  5      n[
        R                  " SS9n	U	" XR                  5      n
U
R                  SS9UR                  -  R                  5       UR                  R                  5       S-   -  nUR                  nU(       d  U4USS	 -   nUb  U4U-   nU$ UnU$ [        XXR                  S
9$ )a  
Parameters:
    past_values (`torch.Tensor` of shape `(bs, sequence_length, num_input_channels)`, *required*):
        Input sequence to the model
    past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`, *optional*):
        Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected
        in `[0, 1]`:

        - 1 for values that are **observed**,
        - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).
    output_hidden_states (`bool`, *optional*):
        Whether or not to return the hidden states of all layers
    output_attentions (`bool`, *optional*):
        Whether or not to return the output attention of all layers
    return_dict (`bool`, *optional*): Whether or not to return a `ModelOutput` instead of a plain tuple.

Returns:
    `PatchTSTForPretrainingOutput` or tuple of `torch.Tensor` (if `return_dict`=False or
    `config.return_dict`=False)

Examples:

```python
>>> from huggingface_hub import hf_hub_download
>>> import torch
>>> from transformers import PatchTSTConfig, PatchTSTForPretraining

>>> file = hf_hub_download(
...     repo_id="hf-internal-testing/etth1-hourly-batch", filename="train-batch.pt", repo_type="dataset"
... )
>>> batch = torch.load(file)

>>> # Config for random mask pretraining
>>> config = PatchTSTConfig(
...     num_input_channels=7,
...     context_length=512,
...     patch_length=12,
...     stride=12,
...     mask_type='random',
...     random_mask_ratio=0.4,
...     use_cls_token=True,
... )
>>> # Config for forecast mask pretraining
>>> config = PatchTSTConfig(
...     num_input_channels=7,
...     context_length=512,
...     patch_length=12,
...     stride=12,
...     mask_type='forecast',
...     num_forecast_mask_patches=5,
...     use_cls_token=True,
... )
>>> model = PatchTSTForPretraining(config)

>>> # during training, one provides both past and future values
>>> outputs = model(past_values=batch["past_values"])

>>> loss = outputs.loss
>>> loss.backward()
```Tr   r  rV  rN   r  none	reductionr!   r$   r  r   )rh  ri  rL   rY  )r<   r  r   r  rX  r   MSELossr   r   r   r   rL   rg  rY  )rI   r   r  rV  rN   r  r   model_outputx_hatrh  loss_valmasked_lossrZ  r   s                 r1   r`   PatchTSTForPretraining.forward  s   L &1%<k$++BYBY zz#1!5/ " 
 		,889 zzF+778}}},|/@/@@EEG<K\K\K`K`KbejKjk%33ha!33G2=2I{nw.GN PWGN+^`w`w
 	
r3   r  r   )NNNN)rb   rc   rd   re   r   rA   r)   rj   ri   rk   rg  r`   rl   rm   rn   s   @r1   r  r    s    ~  37,0)-#'b
\\b
 "LL4/b
 #Tk	b

  $;b
 D[b
 
-	-b
 b
r3   r  c                   R   ^  \ rS rSrS\4U 4S jjrS\R                  4S jrSr	U =r
$ )PatchTSTClassificationHeadi8  r<   c                   > [         TU ]  5         UR                  U l        UR                  U l        [        R
                  " SS9U l        UR                  S:  a   [        R                  " UR                  5      O[        R                  " 5       U l
        [        R                  " UR                  UR                  -  UR                  5      U l        g Nr   	start_dimr   )r@   rA   r  pooling_typer   Flattenflattenr  r   r   r   rD   r   rv   num_targetsr  ry   s     r1   rA   #PatchTSTClassificationHead.__init__9  s    #11"//zzA.:@:M:MPQ:Qrzz&"5"56WYWbWbWdii 9 9FNN JFL^L^_r3   r  c                 p   U R                   (       a  USS2SS2SSS24   nOcU R                  S:X  a  UR                  SS9nOCU R                  S:X  a  UR                  SS9R                  nO[        SU R                   S35      eU R                  U5      nU R                  U R                  U5      5      nU$ )	a#  
Parameters:
    embedding (`torch.Tensor` of shape `(bs, num_channels, num_patches, d_model)` or
             `(bs, num_channels, num_patches+1, d_model)` if `cls_token` is set to True, *required*):
        Embedding from the model
Returns:
    `torch.Tensor` of shape `(bs, num_targets)`

Nr   r   r#   r$   r   pooling operator  is not implemented yet)	r  r  r   r   valuesrC   r  r  r   rI   r  pooled_embeddingr}   s       r1   r`   "PatchTSTClassificationHead.forwardA  s     (Aq!4&((~~!~4%'(}}}3::01B1B0CCZ[\\<<(89T\\*:;<r3   )r   r  r  r  r  r1  rn   s   @r1   r  r  8  s&    `~ `  r3   r  z0
    The PatchTST for classification model.
    c                      ^  \ rS rSrS\4U 4S jjr\     SS\R                  S\R                  S-  S\	S-  S\	S-  S	\	S-  S
\	S-  S\
\-  4S jj5       rSrU =r$ )PatchTSTForClassificationi]  r<   c                    > [         TU ]  U5        UR                  (       a  [        R	                  S5        SUl        [        U5      U l        [        U5      U l        U R                  5         g )N+Setting `do_mask_input` parameter to False.F)
r@   rA   r  loggerwarningr  r   r  r  rT  ry   s     r1   rA   "PatchTSTForClassification.__init__c  sT      NNHI#(F "6*
.v6	 	r3   Nr   target_valuesr  rV  rN   r  rO   c                 V   Ub  UOU R                   R                  nU R                  UUUUSS9nU R                  UR                  5      n	Sn
Ub  [
        R                  " 5       nU" X5      n
U(       d  U	4USS -   nU
b  U
4U-   nU$ UnU$ [        U
U	UR                  UR                  S9$ )a  
past_values (`torch.Tensor` of shape `(bs, sequence_length, num_input_channels)`, *required*):
    Input sequence to the model
target_values (`torch.Tensor`, *optional*):
    Labels associates with the `past_values`
past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`, *optional*):
    Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected
    in `[0, 1]`:

    - 1 for values that are **observed**,
    - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).

Examples:

```python
>>> from transformers import PatchTSTConfig, PatchTSTForClassification

>>> # classification task with two input channel2 and 3 classes
>>> config = PatchTSTConfig(
...     num_input_channels=2,
...     num_targets=3,
...     context_length=512,
...     patch_length=12,
...     stride=12,
...     use_cls_token=True,
... )
>>> model = PatchTSTForClassification(config=config)

>>> # during inference, one only provides past values
>>> past_values = torch.randn(20, 512, 2)
>>> outputs = model(past_values=past_values)
>>> labels = outputs.prediction_logits
```NTr  r   r   )rh  rr  rL   rY  )
r<   r  r   r  rX  r   CrossEntropyLossrq  rL   rY  )rI   r   r  r  rV  rN   r  r   r  y_hatr  rh  r   s                r1   r`   !PatchTSTForClassification.forwardq  s    Z &1%<k$++BYBYzz#1!5/ " 
 		,889$&&(DE1Hha!33G/7/CxkG+GN JQGN.#&44#..	
 	
r3   r  r  )rb   rc   rd   re   r   rA   r   r)   rj   ri   rk   rq  r`   rl   rm   rn   s   @r1   r  r  ]  s    ~   .2*.,0)-#'E
\\E
 ||d*E
 !4K	E

 #TkE
  $;E
 D[E
 
0	0E
 E
r3   r  z,
    The PatchTST for regression Model.
    c                   Z   ^  \ rS rSrSS\S\4U 4S jjjrS\R                  4S jr	Sr
U =r$ )	PatchTSTPredictionHeadi  r<   r   c                 H  > [         TU ]  5         UR                  U l        UR                  U l        UR                  U l        UR
                  U l        U R
                  (       d  U R                  (       a  UR                  nOUR                  U-  nU R                  (       Gd]  [        R                  " 5       U l	        [        R                  " 5       U l
        [        R                  " 5       U l        [        U R                  5       H  nU R                  R                  [        R                  " SS95        Uc:  U R                  R                  [        R                  " XAR                   5      5        O*U R                  R                  UR#                  U5      5        U R                  R                  UR$                  S:  a   [        R&                  " UR$                  5      O[        R(                  " 5       5        M     g[        R                  " SS9U l        Uc&  [        R                  " XAR                   5      U l        OUR#                  U5      U l        UR$                  S:  a   [        R&                  " UR$                  5      O[        R(                  " 5       U l        g)z
num_patches (`int`):
    The number of patches in the input sequence.
distribution_output (`DistributionOutput`, *optional*):
    The distribution output layer for probabilistic forecasting. If None, a linear output layer is used.
r#   r  Nr   )r@   rA   share_projectionr   r  r  rv   r   r)  projectionsdropoutsflattensr*  r   r  rD   prediction_lengthget_parameter_projectionr  r   r   r  
projectionr   )rI   r<   r   distribution_outputrB   r/  rJ   s         r1   rA   PatchTSTPredictionHead.__init__  s    	 & 7 7"(";";#11"// 2 2~~H~~3H$$$!}}DMMODMMMODM4223$$RZZ!%<=&.$$++BIIh@X@X,YZ $$++,?,X,XYa,bc$$H[H[^_H_RZZ0C0C%Degepepers 4 ::2DL"*"$))H6N6N"O #6"N"Nx"X>D>Q>QTU>U2::f&9&9:[][f[f[hDLr3   r  c                    U R                   (       a  USS2SS2SSS24   nOLU R                  S:X  a  UR                  SS9nO,U R                  S:X  a  UR                  SS9R                  nOUnU R
                  (       d  / n[        U R                  5       H]  nU R                  U   " USS2USS24   5      nU R                  U   " U5      nU R                  U   " U5      nUR                  U5        M_     [        R                  " USS9nO3U R                  U5      nU R                  U5      nU R!                  U5      n[#        U[$        5      (       a  [%        S U 5       5      nU$ UR'                  SS5      nU$ )	a2  
Parameters:
    embedding (`torch.Tensor` of shape `(bs, num_channels, num_patches, d_model)` or
             `(bs, num_channels, num_patches+1, d_model)` if `cls_token` is set to True, *required*):
        Embedding from the model
Returns:
    `torch.Tensor` of shape `(bs, forecast_len, num_channels)`

Nr   r   r#   r$   r   r   c              3   D   #    U  H  oR                  S S5      v   M     g7f)r#   r   N)r+   )r  zs     r1   r  1PatchTSTPredictionHead.forward.<locals>.<genexpr>  s     =f;;q!,,fs    )r  r  r   r   r  r  r*  r   r  r  r  r   r)   r-  r  r   r  r   rk   r+   )rI   r  r  r}   r/  s        r1   r`   PatchTSTPredictionHead.forward  sl    (Aq!4  F*#,>>a>#8 ""e+#,==Q=#7#>#>  $- $$F4223#'==#34DQ1W4M#N #'==#34D#E  $(#3#3A#67G#H ./ 4 [[Q/F  $||,<=#||,<= __%56Ffe$$=f==F  %%a+Fr3   )
r   r  r  r  r   r  r  r  r  r  r   )rb   rc   rd   re   r   rg   rA   r)   rj   r`   rl   rm   rn   s   @r1   r  r    s5    )i~ )iC )i )iV1 1 1r3   r  z,
    The PatchTST for prediction model.
    c                   6  ^  \ rS rSrS\4U 4S jjr     SS\R                  S\R                  S-  S\R                  S-  S\S-  S	\S-  S
\S-  S\	\
-  4S jjr\R                  " 5        SS\R                  S\R                  S-  S\4S jj5       rSrU =r$ )PatchTSTForPredictioni  r<   c                   > [         TU ]  U5        UR                  (       a  [        R	                  S5        SUl        [        U5      U l        UR                  S:X  a  S U l        OUR                  S:X  a  [        UR                  S9U l        OjUR                  S:X  a  [        UR                  S9U l        OAUR                  S:X  a  [        UR                  S9U l        O[        SUR                   35      e[        XR                  R                  R                   U R                  S	9U l        U R%                  5         g )
Nr  Fmse	student_tr$   normalnegative_binomialUnknown distribution output )r  )r@   rA   r  r  r  r  r   rh  r  r   r  r   r   rC   r  r  r   r  rT  ry   s     r1   rA   PatchTSTForPrediction.__init__%  s     NNHI#(F "6*
;;%'+D$))[8+9f>V>V+W(++x7+7F<T<T+U(++/BB+AfF^F^+_( #?@Z@Z?[!\]]*JJ))554KcKc
	
 	r3   Nr   r  r  rV  rN   r  rO   c           	         Ub  UOU R                   R                  nU R                  UUUUSS9nU R                  UR                  5      n	Sn
U R
                  (       a  U	nOXR                  -  UR                  -   nUbr  U R
                  (       aE  U R
                  R                  XR                  UR                  S9n[        X5      n
[        U
5      n
O[        R                  " SS9nU" X5      n
UR                  nUR                  nU(       d  U4USS -   nU
b  U
4U-   nU$ UnU$ [        U
UUR                  UR                  UUS	9$ )
a  
Parameters:
    past_values (`torch.Tensor` of shape `(bs, sequence_length, num_input_channels)`, *required*):
        Input sequence to the model
    past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`, *optional*):
        Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected
        in `[0, 1]`:

        - 1 for values that are **observed**,
        - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).
    future_values (`torch.Tensor` of shape `(bs, forecast_len, num_input_channels)`, *optional*):
        Future target values associated with the `past_values`
    output_hidden_states (`bool`, *optional*):
        Whether or not to return the hidden states of all layers
    output_attentions (`bool`, *optional*):
        Whether or not to return the output attention of all layers
    return_dict (`bool`, *optional*):
        Whether or not to return a `ModelOutput` instead of a plain tuple.

Returns:
    `PatchTSTForPredictionOutput` or tuple of `torch.Tensor` (if `return_dict`=False or
    `config.return_dict`=False)

Examples:

```python
>>> from huggingface_hub import hf_hub_download
>>> import torch
>>> from transformers import PatchTSTConfig, PatchTSTForPrediction

>>> file = hf_hub_download(
...     repo_id="hf-internal-testing/etth1-hourly-batch", filename="train-batch.pt", repo_type="dataset"
... )
>>> batch = torch.load(file)

>>> # Prediction task with 7 input channels and prediction length is 96
>>> model = PatchTSTForPrediction.from_pretrained("namctin/patchtst_etth1_forecast")

>>> # during training, one provides both past and future values
>>> outputs = model(
...     past_values=batch["past_values"],
...     future_values=batch["future_values"],
... )

>>> loss = outputs.loss
>>> loss.backward()

>>> # during inference, one only provides past values, the model outputs future values
>>> outputs = model(past_values=batch["past_values"])
>>> prediction_outputs = outputs.prediction_outputs
```NTr  rc  rd  r   r  r   r!   )rh  ro  rL   rY  rc  rd  )r<   r  r   r  rX  r  rd  rc  distributionrz  r  r   r  rn  rL   rY  )rI   r   r  r  rV  rN   r  r   r  r  r  	y_hat_outr&  rh  rc  rd  r   s                    r1   r`   PatchTSTForPrediction.forwardB  sf   | &1%<k$++BYBY zz#1!5/ " 
 		,889##I 2 22\5E5EEI$''#77DD//|7I7I  E   |;+H5zzF3	9"" l\!B%77G/7/CxkG+GN JQGN*(&44#..
 	
r3   c                    U R                   R                  nU " USUSS9nU R                  (       aw  U R                  R                  UR                  UR
                  UR                  S9n[        U5       Vs/ s H  oeR                  5       PM     nn[        R                  " USS9nOUR                  R                  S5      n[        US9$ s  snf )a  
Generate sequences of sample predictions from a model with a probability distribution head.

Parameters:
    past_values (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_input_channels)`):
        Past values of the time series that serves as context in order to predict the future.
    past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`, *optional*):
        Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected
        in `[0, 1]`:

        - 1 for values that are **observed**,
        - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).

Return:
    [`SamplePatchTSTOutput`] where the outputs `sequences` tensor will have shape `(batch_size, number of
    samples, prediction_length, 1)` or `(batch_size, number of samples, prediction_length, num_input_channels)`
    for multivariate predictions.
NF)r   r  r  rV  r%  r   r$   ru  )r<   num_parallel_samplesr  r&  ro  rc  rd  r*  sampler)   r-  r   rt  rI   r   r  r+  r   r&  r   sampless           r1   generatePatchTSTForPrediction.generate  s    2  ${{?? #1!&	
 ##33@@**7== A L 7<<P6QR6Q**,6QGRkk'q1G00::1=G#g66 Ss   7Cr  r  r   r  r   )rb   rc   rd   re   r   rA   r)   rj   ri   rk   rn  r`   r"  rt  r/  rl   rm   rn   s   @r1   r  r    s    ~ @ 37-1,0)-#'l
\\l
 "LL4/l
 ||d*	l

 #Tkl
  $;l
 D[l
 
,	,l
\ ]]_ 37-7\\-7 "LL4/-7 
	-7 -7r3   r  c                   Z   ^  \ rS rSrSrSS\4U 4S jjjrS\R                  4S jr	Sr
U =r$ )	PatchTSTRegressionHeadi  z
Regression head
r<   c                 
  > [         TU ]  5         UR                  U l        UR                  U l        UR
                  U l        X l        UR                  UR                  -  n[        R                  " SS9U l        UR                  S:  a   [        R                  " UR                  5      O[        R                  " 5       U l        Uc&  [        R                   " X1R"                  5      U l        g UR'                  U5      U l        g r  )r@   rA   output_rangey_ranger  r  r  r   rv   r   r  r  r  r   r   r   rD   r  r  r  )rI   r<   r  rB   rJ   s       r1   rA   PatchTSTRegressionHead.__init__  s    **#11"//#6 ,,v~~=zzA.:@:M:MPQ:Qrzz&"5"56WYWbWbWd& ii2D2DEDO1JJ8TDOr3   r  c                 @   U R                   (       a  USS2SS2SSS24   nOcU R                  S:X  a  UR                  SS9nOCU R                  S:X  a  UR                  SS9R                  nO[        SU R                   S35      eU R                  U R                  U5      5      nU R                  U5      nU R                  SL U R                  SL-  (       aF  [        R                  " U5      U R                  S	   U R                  S   -
  -  U R                  S   -   nU$ )
a!  
Parameters:
    embedding (`torch.Tensor` of shape `(bs, num_channels, num_patches, d_model)` or
            `(bs, num_channels, num_patches+1, d_model)` if `cls_token` is set to True, *required*):
        Embedding from the model
Returns:
    `torch.Tensor` of shape `(bs, output_dim)`

Nr   r   r#   r$   r   r  r  r   )r  r  r   r   r  rC   r   r  r  r  r6  r)   sigmoidr  s       r1   r`   PatchTSTRegressionHead.forward  s    (Aq!4&((~~!~4%'(}}}3::01B1B0CCZ[\\  <<5E(FG !12$$,T1IJ]]6*dll1oQ.OPSWS_S_`aSbbFr3   )r  r   r  r  r  r  r6  r   r   rn   s   @r1   r3  r3    s1    U~ U U"  r3   r3  z,
    The PatchTST for regression model.
    c                   @  ^  \ rS rSrS\4U 4S jjr\     SS\R                  S\R                  S-  S\R                  S-  S\	S-  S	\	S-  S
\	S-  S\
\-  4S jj5       r\R                  " 5        SS\R                  S\R                  S-  S\4S jj5       rSrU =r$ )PatchTSTForRegressioni  r<   c                 H  > [         TU ]  U5        UR                  (       a  [        R	                  S5        SUl        [        U5      U l        UR                  S:X  a  S U l        OUR                  S:X  a  [        UR                  S9U l        OjUR                  S:X  a  [        UR                  S9U l        OAUR                  S:X  a  [        UR                  S9U l        O[        SUR                   35      e[        XR                  5      U l        U R!                  5         g )	Nr  Fr  r  r$   r   r!  r"  )r@   rA   r  r  r  r  r   rh  r  r   r  r   r   rC   r3  r  rT  ry   s     r1   rA   PatchTSTForRegression.__init__  s      NNHI#(F "6*
;;%'+D$))[8+9f>P>P+Q(++x7+7F<N<N+O(++/BB+AfFXFX+Y( #?@Z@Z?[!\]]*63K3KL	 	r3   Nr   r  r  rV  rN   r  rO   c                   ^  Ub  UOT R                   R                  nT R                  UUUUSS9nT R                  UR                  5      n	Sn
Ubt  T R
                  (       aG  T R
                  R                  U	5      n[        U 4S jU	 5       5      n	[        X5      n
[        U
5      n
O[        R                  " SS9n
U
" X5      n
U(       d  U	4USS -   nU
b  U
4U-   nU$ UnU$ [        U
U	UR                  UR                  S	9$ )
a  
past_values (`torch.Tensor` of shape `(bs, sequence_length, num_input_channels)`, *required*):
    Input sequence to the model
target_values (`torch.Tensor` of shape `(bs, num_input_channels)`):
    Target values associates with the `past_values`
past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`, *optional*):
    Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected
    in `[0, 1]`:

    - 1 for values that are **observed**,
    - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).
    Whether or not to return a `ModelOutput` instead of a plain tuple.

Examples:

```python
>>> from transformers import PatchTSTConfig, PatchTSTForRegression

>>> # Regression task with 6 input channels and regress 2 targets
>>> model = PatchTSTForRegression.from_pretrained("namctin/patchtst_etth1_regression")

>>> # during inference, one only provides past values, the model outputs future values
>>> past_values = torch.randn(20, 512, 6)
>>> outputs = model(past_values=past_values)
>>> regression_outputs = outputs.regression_outputs
```NTr  c              3   n   >#    U  H*  oR                  S TR                  R                  5      v   M,     g7f)r!   N)rS   r<   r  )r  itemrI   s     r1   r  0PatchTSTForRegression.forward.<locals>.<genexpr>o  s)     WQViiDKK,C,CDDQVs   25r   r  r   r   )rh  rl  rL   rY  )r<   r  r   r  rX  r  r&  rk   rz  r  r   r  rk  rL   rY  )rI   r   r  r  rV  rN   r  r   r  r  rh  r&  r   s   `            r1   r`   PatchTSTForRegression.forward8  s   L &1%<k$++BYBYzz#1!5/ " 
 		,889$''#77DDUKWQVWW<7'-zzF3E1ha!33G+/+;tg'GN BIGN*$&44#..	
 	
r3   c                 h   U R                   R                  nU " USUSS9nU R                  R                  UR                  5      n[        U5       Vs/ s H  oeR                  5       PM     nn[        R                  " USS9R                  SX0R                   R                  5      n[        US9$ s  snf )a:  
Generate sequences of sample predictions from a model with a probability distribution head.

Parameters:
    past_values (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_input_channels)`):
        Past values of the time series that serves as context in order to predict the future.
    past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`, *optional*):
        Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected
        in `[0, 1]`:

        - 1 for values that are **observed**,
        - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).

Return:
    [`SamplePatchTSTOutput`] where the outputs `sequences` tensor will have shape `(batch_size, number of
    samples, num_targets)`.
NF)r   r  r  rV  r   r$   r!   r*  )r<   r+  r  r&  rl  r*  r,  r)   r-  rS   r  rt  r-  s           r1   r/  PatchTSTForRegression.generate  s    0  ${{?? #1!&	
 //<<W=W=WX278L2MN2MQ&&(2MN++g1-2227K[[MdMde#g66 Os   B/r1  r  r   )rb   rc   rd   re   r   rA   r   r)   rj   ri   rk   rk  r`   r"  rt  r/  rl   rm   rn   s   @r1   r<  r<    s    ~ 4  .226,0)-#'H
\\H
 ||d*H
 "LL4/	H

 #TkH
  $;H
 D[H
 
,	,H
 H
T ]]_ 37'7\\'7 "LL4/'7 
	'7 '7r3   r<  )r  r   r  r  r<  r  )NrQ   )NFr   r  r_  )Prf   rA  collections.abcr   dataclassesr   r)   r    r   r  activationsr   integrations.deepspeedr	   modeling_flash_attention_utilsr
   modeling_outputsr   modeling_utilsr   r   processing_utilsr   time_series_utilsr   r   r   utilsr   r   r   r   configuration_patchtstr   
get_loggerrb   r  r#  rj   rh   r2   r5   rp   listri   rg   r   r   r   r   r   r   r%  r   r  rb  rg  rk  rn  rq  rt  distributionsDistributionrz  r  r  r  r  r  r  r  r  r  r  r  r  r3  r<  __all__r   r3   r1   <module>rW     s     $ !   & " @ B / F & U U M M 2 
		H	% !%II%<<% 
% <<	%
 LL4'% T\% % '(%:R/		 R/j&		 &2 -1',7%LL7%7% #Tk7% !%	7%
 7%z -1	A%LLA%#czA% #TkA% 	A%H-ryy -`9"bii 9"xH299 HV 02o 02 02f!		 !H5 5p<x- <x~ 
 1+ 1 16 
 7; 7 7 
 7+ 7 7 
 ++ + +4 
 7k 7 7  /; / /#u""// # #%,, #*5<< *%,,:M *chcoco *2 0		  0H3; 3;n 		  6 RYY  8 n
+ n
 n
bryy 8 
m
4 m

m
`" "J 
U
 7 U

U
p 
]RYY ]
]@ 
z73 z7
z7z4RYY 4n 
N73 N7
N7br3   