From cfe164511d24c4b64dff12dc1fa086ac67f0ac71 Mon Sep 17 00:00:00 2001
From: lx <arthurlx94@163.com>
Date: Tue, 3 Dec 2024 13:44:01 +0800
Subject: [PATCH] Remove BigST and BigSTPreprocess baselines

---
 baselines/BigST/PEMS04.py                     | 162 -------------
 baselines/BigST/arch/__init__.py              |   3 -
 baselines/BigST/arch/bigst_arch.py            | 139 -----------
 baselines/BigST/arch/linear_conv.py           |  99 --------
 baselines/BigST/arch/pipeline.py              |  67 ------
 baselines/BigST/arch/preprocess/metrics.py    |  53 -----
 baselines/BigST/arch/preprocess/model.py      | 206 ----------------
 baselines/BigST/arch/preprocess/pipeline.py   |  38 ---
 baselines/BigST/arch/preprocess/preprocess.py | 127 ----------
 baselines/BigST/arch/preprocess/util.py       | 147 ------------
 baselines/BigST/arch/random_map.py            |  81 -------
 baselines/BigST/loss/__init__.py              |   1 -
 baselines/BigST/loss/loss.py                  |  35 ---
 baselines/BigSTPreprocess/PEMS08.py           | 153 ------------
 baselines/BigSTPreprocess/arch/__init__.py    |   3 -
 .../arch/bigst_preprocess_arch.py             | 220 ------------------
 baselines/BigSTPreprocess/runner/__init__.py  |   1 -
 .../runner/bigstpreprocess_runner.py          |  49 ----
 18 files changed, 1584 deletions(-)
 delete mode 100644 baselines/BigST/PEMS04.py
 delete mode 100644 baselines/BigST/arch/__init__.py
 delete mode 100644 baselines/BigST/arch/bigst_arch.py
 delete mode 100644 baselines/BigST/arch/linear_conv.py
 delete mode 100644 baselines/BigST/arch/pipeline.py
 delete mode 100644 baselines/BigST/arch/preprocess/metrics.py
 delete mode 100644 baselines/BigST/arch/preprocess/model.py
 delete mode 100644 baselines/BigST/arch/preprocess/pipeline.py
 delete mode 100644 baselines/BigST/arch/preprocess/preprocess.py
 delete mode 100644 baselines/BigST/arch/preprocess/util.py
 delete mode 100644 baselines/BigST/arch/random_map.py
 delete mode 100644 baselines/BigST/loss/__init__.py
 delete mode 100644 baselines/BigST/loss/loss.py
 delete mode 100644 baselines/BigSTPreprocess/PEMS08.py
 delete mode 100644 baselines/BigSTPreprocess/arch/__init__.py
 delete mode 100644 baselines/BigSTPreprocess/arch/bigst_preprocess_arch.py
 delete mode 100644 baselines/BigSTPreprocess/runner/__init__.py
 delete mode 100644 baselines/BigSTPreprocess/runner/bigstpreprocess_runner.py

diff --git a/baselines/BigST/PEMS04.py b/baselines/BigST/PEMS04.py
deleted file mode 100644
index 6dcc698a..00000000
--- a/baselines/BigST/PEMS04.py
+++ /dev/null
@@ -1,162 +0,0 @@
-import os
-import sys
-import torch
-from easydict import EasyDict
-sys.path.append(os.path.abspath(__file__ + '/../../..'))
-
-from basicts.metrics import masked_mae, masked_mape, masked_rmse
-from basicts.data import TimeSeriesForecastingDataset
-from basicts.runners import SimpleTimeSeriesForecastingRunner
-from basicts.scaler import ZScoreScaler
-from basicts.utils import get_regular_settings, load_adj
-
-from .arch import BigST
-from .loss import bigst_loss
-
-############################## Hot Parameters ##############################
-# Dataset & Metrics configuration
-DATA_NAME = 'PEMS04'  # Dataset name
-regular_settings = get_regular_settings(DATA_NAME)
-INPUT_LEN = regular_settings['INPUT_LEN']  # Length of input sequence
-OUTPUT_LEN = regular_settings['OUTPUT_LEN']  # Length of output sequence
-TRAIN_VAL_TEST_RATIO = regular_settings['TRAIN_VAL_TEST_RATIO']  # Train/Validation/Test split ratios
-NORM_EACH_CHANNEL = regular_settings['NORM_EACH_CHANNEL'] # Whether to normalize each channel of the data
-RESCALE = regular_settings['RESCALE'] # Whether to rescale the data
-NULL_VAL = regular_settings['NULL_VAL'] # Null value in the data
-# Model architecture and parameters
-MODEL_ARCH = BigST
-adj_mx, _ = load_adj("datasets/" + DATA_NAME +
-                     "/adj_mx.pkl", "doubletransition")
-MODEL_PARAM = {
-    "num_nodes": 307,
-    "seq_num": INPUT_LEN,
-    "in_dim": 3,
-    "out_dim": OUTPUT_LEN, 
-    "hid_dim": 32,
-    "tau" : 0.25,
-    "random_feature_dim": 64,
-    "node_emb_dim": 32,
-    "time_emb_dim": 32,
-    "use_residual": True,
-    "use_bn": True,
-    "use_spatial": True,
-    "use_long": False,
-    "dropout": 0.3,
-    "supports": [torch.tensor(i) for i in adj_mx],
-    "time_of_day_size": 288, 
-    "day_of_week_size": 7,
-}
-
-NUM_EPOCHS = 100
-
-############################## General Configuration ##############################
-CFG = EasyDict()
-# General settings
-CFG.DESCRIPTION = 'An Example Config'
-CFG.GPU_NUM = 1 # Number of GPUs to use (0 for CPU mode)
-# Runner
-CFG.RUNNER = SimpleTimeSeriesForecastingRunner
-
-############################## Environment Configuration ##############################
-
-CFG.ENV = EasyDict() # Environment settings. Default: None
-CFG.ENV.SEED = 0 # Random seed. Default: None
-
-############################## Dataset Configuration ##############################
-CFG.DATASET = EasyDict()
-# Dataset settings
-CFG.DATASET.NAME = DATA_NAME
-CFG.DATASET.TYPE = TimeSeriesForecastingDataset
-CFG.DATASET.PARAM = EasyDict({
-    'dataset_name': DATA_NAME,
-    'train_val_test_ratio': TRAIN_VAL_TEST_RATIO,
-    'input_len': INPUT_LEN,
-    'output_len': OUTPUT_LEN,
-    # 'mode' is automatically set by the runner
-})
-
-############################## Scaler Configuration ##############################
-CFG.SCALER = EasyDict()
-# Scaler settings
-CFG.SCALER.TYPE = ZScoreScaler # Scaler class
-CFG.SCALER.PARAM = EasyDict({
-    'dataset_name': DATA_NAME,
-    'train_ratio': TRAIN_VAL_TEST_RATIO[0],
-    'norm_each_channel': NORM_EACH_CHANNEL,
-    'rescale': RESCALE,
-})
-
-############################## Model Configuration ##############################
-CFG.MODEL = EasyDict()
-# Model settings
-CFG.MODEL.NAME = MODEL_ARCH.__name__
-CFG.MODEL.ARCH = MODEL_ARCH
-CFG.MODEL.PARAM = MODEL_PARAM
-CFG.MODEL.FORWARD_FEATURES = [0, 1, 2]
-CFG.MODEL.TARGET_FEATURES = [0]
-
-############################## Metrics Configuration ##############################
-
-CFG.METRICS = EasyDict()
-# Metrics settings
-CFG.METRICS.FUNCS = EasyDict({
-                                'MAE': masked_mae,
-                                'MAPE': masked_mape,
-                                'RMSE': masked_rmse,
-                            })
-CFG.METRICS.TARGET = 'MAE'
-CFG.METRICS.NULL_VAL = NULL_VAL
-
-############################## Training Configuration ##############################
-CFG.TRAIN = EasyDict()
-CFG.TRAIN.NUM_EPOCHS = NUM_EPOCHS
-CFG.TRAIN.CKPT_SAVE_DIR = os.path.join(
-    'checkpoints',
-    MODEL_ARCH.__name__,
-    '_'.join([DATA_NAME, str(CFG.TRAIN.NUM_EPOCHS), str(INPUT_LEN), str(OUTPUT_LEN)])
-)
-CFG.TRAIN.LOSS = bigst_loss
-# Optimizer settings
-CFG.TRAIN.OPTIM = EasyDict()
-CFG.TRAIN.OPTIM.TYPE = "AdamW"
-CFG.TRAIN.OPTIM.PARAM = {
-    "lr": 0.002,
-    "weight_decay": 0.0001,
-}
-# Learning rate scheduler settings
-CFG.TRAIN.LR_SCHEDULER = EasyDict()
-CFG.TRAIN.LR_SCHEDULER.TYPE = "MultiStepLR"
-CFG.TRAIN.LR_SCHEDULER.PARAM = {
-    "milestones": [1, 50],
-    "gamma": 0.5
-}
-# Train data loader settings
-CFG.TRAIN.DATA = EasyDict()
-CFG.TRAIN.DATA.BATCH_SIZE = 64
-CFG.TRAIN.DATA.SHUFFLE = True
-# Gradient clipping settings
-CFG.TRAIN.CLIP_GRAD_PARAM = {
-    "max_norm": 5.0
-}
-
-############################## Validation Configuration ##############################
-CFG.VAL = EasyDict()
-CFG.VAL.INTERVAL = 1
-CFG.VAL.DATA = EasyDict()
-CFG.VAL.DATA.BATCH_SIZE = 64
-
-############################## Test Configuration ##############################
-CFG.TEST = EasyDict()
-CFG.TEST.INTERVAL = 1
-CFG.TEST.DATA = EasyDict()
-CFG.TEST.DATA.BATCH_SIZE = 64
-
-############################## Evaluation Configuration ##############################
-
-CFG.EVAL = EasyDict()
-
-# Evaluation parameters
-CFG.EVAL.HORIZONS = [3, 6, 12] # Prediction horizons for evaluation. Default: []
-CFG.EVAL.USE_GPU = True # Whether to use GPU for evaluation. Default: True
-
-
diff --git a/baselines/BigST/arch/__init__.py b/baselines/BigST/arch/__init__.py
deleted file mode 100644
index 7cb17069..00000000
--- a/baselines/BigST/arch/__init__.py
+++ /dev/null
@@ -1,3 +0,0 @@
-from .bigst_arch import BigST
-
-__all__ = ["BigST"]
diff --git a/baselines/BigST/arch/bigst_arch.py b/baselines/BigST/arch/bigst_arch.py
deleted file mode 100644
index 5e8c6034..00000000
--- a/baselines/BigST/arch/bigst_arch.py
+++ /dev/null
@@ -1,139 +0,0 @@
-import math
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from .linear_conv import *
-from torch.autograd import Variable
-import pdb
-
-class BigST(nn.Module):
-    """
-    Paper: BigST: Linear Complexity Spatio-Temporal Graph Neural Network for Traffic Forecasting on Large-Scale Road Networks
-    Link: https://dl.acm.org/doi/10.14778/3641204.3641217
-    Official Code: https://github.com/usail-hkust/BigST?tab=readme-ov-file
-    Venue: VLDB 2024
-    Task: Spatial-Temporal Forecasting
-    """
-    def __init__(self, seq_num, in_dim, out_dim, hid_dim, num_nodes, tau, random_feature_dim, node_emb_dim, time_emb_dim, \
-                 use_residual, use_bn, use_spatial, use_long, dropout, time_of_day_size, day_of_week_size, supports=None, edge_indices=None):
-        super(BigST, self).__init__()
-        self.tau = tau
-        self.layer_num = 3
-        self.in_dim = in_dim
-        self.random_feature_dim = random_feature_dim
-        
-        self.use_residual = use_residual
-        self.use_bn = use_bn
-        self.use_spatial = use_spatial
-        self.use_long = use_long
-        
-        self.dropout = dropout
-        self.activation = nn.ReLU()
-        self.supports = supports
-        
-        self.time_num = time_of_day_size
-        self.week_num = day_of_week_size
-        
-        # node embedding layer
-        self.node_emb_layer = nn.Parameter(torch.empty(num_nodes, node_emb_dim))
-        nn.init.xavier_uniform_(self.node_emb_layer)
-        
-        # time embedding layer
-        self.time_emb_layer = nn.Parameter(torch.empty(self.time_num, time_emb_dim))
-        nn.init.xavier_uniform_(self.time_emb_layer)
-        self.week_emb_layer = nn.Parameter(torch.empty(self.week_num, time_emb_dim))
-        nn.init.xavier_uniform_(self.week_emb_layer)
-
-        # embedding layer
-        self.input_emb_layer = nn.Conv2d(seq_num*in_dim, hid_dim, kernel_size=(1, 1), bias=True)
-        
-        self.W_1 = nn.Conv2d(node_emb_dim+time_emb_dim*2, hid_dim, kernel_size=(1, 1), bias=True)
-        self.W_2 = nn.Conv2d(node_emb_dim+time_emb_dim*2, hid_dim, kernel_size=(1, 1), bias=True)
-        
-        self.linear_conv = nn.ModuleList()
-        self.bn = nn.ModuleList()
-        
-        self.supports_len = 0
-        if supports is not None:
-            self.supports_len += len(supports)
-        
-        for i in range(self.layer_num):
-            self.linear_conv.append(linearized_conv(hid_dim*4, hid_dim*4, self.dropout, self.tau, self.random_feature_dim))
-            self.bn.append(nn.LayerNorm(hid_dim*4))
-        
-        if self.use_long:
-            self.regression_layer = nn.Conv2d(hid_dim*4*2+hid_dim+seq_num, out_dim, kernel_size=(1, 1), bias=True)
-        else:
-            self.regression_layer = nn.Conv2d(hid_dim*4*2, out_dim, kernel_size=(1, 1), bias=True)
-
-    # def forward(self, x, feat=None):
-    def forward(self, history_data: torch.Tensor, future_data: torch.Tensor, batch_seen: int, epoch: int, train: bool, **kwargs) -> torch.Tensor:
-        x = history_data[:, :, :, range(self.in_dim)]         # (batch_size, in_len, data_dim)
-        x = x.transpose(1,2)
-        # input: (B, N, T, D)
-        B, N, T, D = x.size()
-        
-        time_emb = self.time_emb_layer[(x[:, :, -1, 1]*self.time_num).type(torch.LongTensor)]
-        week_emb = self.week_emb_layer[(x[:, :, -1, 2]).type(torch.LongTensor)]
-        
-        # input embedding
-        x = x.contiguous().view(B, N, -1).transpose(1, 2).unsqueeze(-1) # (B, D*T, N, 1)
-        input_emb = self.input_emb_layer(x)
-
-        # node embeddings
-        node_emb = self.node_emb_layer.unsqueeze(0).expand(B, -1, -1).transpose(1, 2).unsqueeze(-1) # (B, dim, N, 1)
-
-        # time embeddings
-        time_emb = time_emb.transpose(1, 2).unsqueeze(-1) # (B, dim, N, 1)
-        week_emb = week_emb.transpose(1, 2).unsqueeze(-1) # (B, dim, N, 1)
-        
-        x_g = torch.cat([node_emb, time_emb, week_emb], dim=1) # (B, dim*4, N, 1)
-        x = torch.cat([input_emb, node_emb, time_emb, week_emb], dim=1) # (B, dim*4, N, 1)
-
-        # linearized spatial convolution
-        x_pool = [x] # (B, dim*4, N, 1)
-        node_vec1 = self.W_1(x_g) # (B, dim, N, 1)
-        node_vec2 = self.W_2(x_g) # (B, dim, N, 1)
-        node_vec1 = node_vec1.permute(0, 2, 3, 1) # (B, N, 1, dim)
-        node_vec2 = node_vec2.permute(0, 2, 3, 1) # (B, N, 1, dim)
-        for i in range(self.layer_num):
-            if self.use_residual:
-                residual = x
-            x, node_vec1_prime, node_vec2_prime = self.linear_conv[i](x, node_vec1, node_vec2)
-            
-            if self.use_residual:
-                x = x+residual 
-                
-            if self.use_bn:
-                x = x.permute(0, 2, 3, 1) # (B, N, 1, dim*4)
-                x = self.bn[i](x)
-                x = x.permute(0, 3, 1, 2)
-
-        x_pool.append(x)
-        x = torch.cat(x_pool, dim=1) # (B, dim*4, N, 1)
-        
-        x = self.activation(x) # (B, dim*4, N, 1)
-        
-        if self.use_long:
-            feat = feat.permute(0, 2, 1).unsqueeze(-1) # (B, F, N, 1)
-            x = torch.cat([x, feat], dim=1)
-            x = self.regression_layer(x) # (B, N, T)
-            x = x.squeeze(-1).permute(0, 2, 1)
-        else:
-            x = self.regression_layer(x) # (B, N, T)
-            x = x.squeeze(-1).permute(0, 2, 1)
-        
-        # if self.use_spatial:
-
-        #     supports = [support.to(x.device) for support in self.supports]
-        #     edge_indices = torch.nonzero(supports[0] > 0)
-
-        #     # s_loss = spatial_loss(node_vec1_prime, node_vec2_prime, supports, edge_indices)
-        #     return x.transpose(1,2).unsqueeze(-1), s_loss
-        # else:
-        #     return x.transpose(1,2).unsqueeze(-1), 0
-        return {"prediction": x.transpose(1,2).unsqueeze(-1)
-              , "node_vec1": node_vec1_prime
-              , "node_vec2": node_vec2_prime
-              , "supports": self.supports
-              , 'use_spatial': self.use_spatial}
\ No newline at end of file
diff --git a/baselines/BigST/arch/linear_conv.py b/baselines/BigST/arch/linear_conv.py
deleted file mode 100644
index 34d84eab..00000000
--- a/baselines/BigST/arch/linear_conv.py
+++ /dev/null
@@ -1,99 +0,0 @@
-import math
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from torch.autograd import Variable
-
-from .random_map import *
-
-def linear_kernel(x, node_vec1, node_vec2):
-    # x: [B, N, 1, nhid] node_vec1: [B, N, 1, r], node_vec2: [B, N, 1, r]
-    node_vec1 = node_vec1.permute(1, 0, 2, 3) # [N, B, 1, r]
-    node_vec2 = node_vec2.permute(1, 0, 2, 3) # [N, B, 1, r]
-    x = x.permute(1, 0, 2, 3) # [N, B, 1, nhid]
-    
-    v2x = torch.einsum("nbhm,nbhd->bhmd", node_vec2, x)
-    out1 = torch.einsum("nbhm,bhmd->nbhd", node_vec1, v2x) # [N, B, 1, nhid]
-    
-    one_matrix = torch.ones([node_vec2.shape[0]]).to(node_vec1.device)
-    node_vec2_sum = torch.einsum("nbhm,n->bhm", node_vec2, one_matrix)
-    out2 = torch.einsum("nbhm,bhm->nbh", node_vec1, node_vec2_sum) # [N, 1]
-
-    out1 = out1.permute(1, 0, 2, 3)  # [B, N, 1, nhid]
-    out2 = out2.permute(1, 0, 2)
-    out2 = torch.unsqueeze(out2, len(out2.shape))
-    out = out1 / out2 # [B, N, 1, nhid]
-
-    return out
-
-# def spatial_loss(node_vec1, node_vec2, supports, edge_indices):
-#     B = node_vec1.size(0)
-#     node_vec1 = node_vec1.permute(1, 0, 2, 3) # [N, B, 1, r]
-#     node_vec2 = node_vec2.permute(1, 0, 2, 3) # [N, B, 1, r]
-    
-#     node_vec1_end, node_vec2_start = node_vec1[edge_indices[:, 0]], node_vec2[edge_indices[:, 1]] # [E, B, 1, r]
-#     attn1 = torch.einsum("ebhm,ebhm->ebh", node_vec1_end, node_vec2_start) # [E, B, 1]
-#     attn1 = attn1.permute(1, 0, 2) # [B, E, 1]
-
-#     one_matrix = torch.ones([node_vec2.shape[0]]).to(node_vec1.device)
-#     node_vec2_sum = torch.einsum("nbhm,n->bhm", node_vec2, one_matrix)
-#     attn_norm = torch.einsum("nbhm,bhm->nbh", node_vec1, node_vec2_sum)
-    
-#     attn2 = attn_norm[edge_indices[:, 0]]  # [E, B, 1]
-#     attn2 = attn2.permute(1, 0, 2) # [B, E, 1]
-#     attn_score = attn1 / attn2 # [B, E, 1]
-    
-#     d_norm = supports[0][edge_indices[:, 0], edge_indices[:, 1]]
-#     d_norm = d_norm.reshape(1, -1, 1).repeat(B, 1, attn_score.shape[-1])
-#     spatial_loss = torch.mean(attn_score.log() * d_norm)
-    
-#     return spatial_loss
-
-class conv_approximation(nn.Module):
-    def __init__(self, dropout, tau, random_feature_dim):
-        super(conv_approximation, self).__init__()
-        self.tau = tau
-        self.random_feature_dim = random_feature_dim
-        self.activation = nn.ReLU()
-        self.dropout = dropout
-
-    def forward(self, x, node_vec1, node_vec2):
-        B = x.size(0) # (B, N, 1, nhid)
-        dim = node_vec1.shape[-1] # (N, 1, d)
-        
-        random_seed = torch.ceil(torch.abs(torch.sum(node_vec1) * 1e8)).to(torch.int32)
-        random_matrix = create_random_matrix(self.random_feature_dim, dim, seed=random_seed).to(node_vec1.device) # (d, r)
-        
-        node_vec1 = node_vec1 / math.sqrt(self.tau)
-        node_vec2 = node_vec2 / math.sqrt(self.tau)
-        node_vec1_prime = random_feature_map(node_vec1, True, random_matrix) # [B, N, 1, r]
-        node_vec2_prime = random_feature_map(node_vec2, False, random_matrix) # [B, N, 1, r]
-        
-        x = linear_kernel(x, node_vec1_prime, node_vec2_prime)
-        
-        return x, node_vec1_prime, node_vec2_prime
-
-class linearized_conv(nn.Module):
-    def __init__(self, in_dim, hid_dim, dropout, tau=1.0, random_feature_dim=64):
-        super(linearized_conv, self).__init__()
-        
-        self.dropout = dropout
-        self.tau = tau
-        self.random_feature_dim = random_feature_dim
-        
-        self.input_fc = nn.Conv2d(in_channels=in_dim, out_channels=hid_dim, kernel_size=(1, 1), bias=True)
-        self.activation = nn.ReLU()
-        self.dropout_layer = nn.Dropout(p=dropout)
-        
-        self.conv_app_layer = conv_approximation(self.dropout, self.tau, self.random_feature_dim)
-        
-    def forward(self, input_data, node_vec1, node_vec2):
-        x = self.input_fc(input_data)
-        x = self.activation(x)
-        x = self.dropout_layer(x)
-        
-        x = x.permute(0, 2, 3, 1) # (B, N, 1, dim*4)
-        x, node_vec1_prime, node_vec2_prime = self.conv_app_layer(x, node_vec1, node_vec2)
-        x = x.permute(0, 3, 1, 2) # (B, dim*4, N, 1)
-        
-        return x, node_vec1_prime, node_vec2_prime
diff --git a/baselines/BigST/arch/pipeline.py b/baselines/BigST/arch/pipeline.py
deleted file mode 100644
index fd4122cb..00000000
--- a/baselines/BigST/arch/pipeline.py
+++ /dev/null
@@ -1,67 +0,0 @@
-import torch
-import torch.nn as nn
-import torch.optim as optim
-import torch.nn.functional as F
-from torch.autograd import Variable
-
-import metrics
-from bigst import bigst
-
-class train_pipeline():
-    def __init__(self, scaler, seq_num, in_dim, hid_dim, num_nodes, tau, random_feature_dim, node_emb_dim, time_emb_dim, \
-                 use_residual, use_bn, use_spatial, use_long, dropout, lrate, wdecay, device, supports, edge_indices):
-        self.model = bigst(device, seq_num, in_dim, hid_dim, num_nodes, tau, random_feature_dim, node_emb_dim, time_emb_dim, \
-                           use_residual, use_bn, use_spatial, use_long, dropout, supports=supports, edge_indices=edge_indices)
-        self.model.to(device)
-        self.optimizer = optim.Adam(self.model.parameters(), lr=lrate, weight_decay=wdecay)
-        self.loss = metrics.masked_mae
-        self.scaler = scaler
-        self.use_spatial = use_spatial
-        self.clip = 5
-
-    def train(self, input, real_val, feat=None):
-        self.model.train()
-        self.optimizer.zero_grad()
-        
-        if self.use_spatial:
-            output, spatial_loss = self.model(input, feat)
-            real = self.scaler.inverse_transform(real_val)
-            predict = self.scaler.inverse_transform(output)
-            loss = self.loss(predict, real, 0.0)-0.3*spatial_loss
-        else:
-            output, _ = self.model(input, feat)
-            real = self.scaler.inverse_transform(real_val)
-            predict = self.scaler.inverse_transform(output)
-            loss = self.loss(predict, real, 0.0)
-        
-        loss.backward()
-        if self.clip is not None:
-            torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.clip)
-        self.optimizer.step()
-        mape = metrics.masked_mape(predict,real,0.0).item()
-        rmse = metrics.masked_rmse(predict,real,0.0).item()
-        return loss.item(), mape, rmse
-    
-    def eval(self, input, real_val, feat=None, flag='overall'):
-        if flag=='overall':
-            self.model.eval()
-            output, _ = self.model(input, feat)
-            real = self.scaler.inverse_transform(real_val)
-            predict = self.scaler.inverse_transform(output)
-            loss = self.loss(predict, real, 0.0)
-            mape = metrics.masked_mape(predict,real,0.0).item()
-            rmse = metrics.masked_rmse(predict,real,0.0).item()
-            return loss.item(), mape, rmse
-        elif flag=='horizon':
-            self.model.eval()
-            output, _ = self.model(input, feat)
-            real = self.scaler.inverse_transform(real_val)
-            predict = self.scaler.inverse_transform(output)
-            loss = []
-            mape = []
-            rmse = []
-            for i in range(12):
-                loss.append(self.loss(predict[..., i], real[..., i], 0.0).item())
-                mape.append(metrics.masked_mape(predict[..., i], real[..., i], 0.0).item())
-                rmse.append(metrics.masked_rmse(predict[..., i], real[..., i], 0.0).item())
-            return loss, mape, rmse
diff --git a/baselines/BigST/arch/preprocess/metrics.py b/baselines/BigST/arch/preprocess/metrics.py
deleted file mode 100644
index aac0af60..00000000
--- a/baselines/BigST/arch/preprocess/metrics.py
+++ /dev/null
@@ -1,53 +0,0 @@
-import torch
-import numpy as np
-
-def masked_mse(preds, labels, null_val=np.nan):
-    if np.isnan(null_val):
-        mask = ~torch.isnan(labels)
-    else:
-        mask = (labels!=null_val)
-    mask = mask.float()
-    mask /= torch.mean((mask))
-    mask = torch.where(torch.isnan(mask), torch.zeros_like(mask), mask)
-    loss = (preds-labels)**2
-    loss = loss * mask
-    loss = torch.where(torch.isnan(loss), torch.zeros_like(loss), loss)
-    return torch.mean(loss)
-
-def masked_rmse(preds, labels, null_val=np.nan):
-    return torch.sqrt(masked_mse(preds=preds, labels=labels, null_val=null_val))
-
-def masked_mae(preds, labels, null_val=np.nan):
-    if np.isnan(null_val):
-        mask = ~torch.isnan(labels)
-    else:
-        mask = (labels!=null_val)
-    mask = mask.float()
-    mask /=  torch.mean((mask))
-    mask = torch.where(torch.isnan(mask), torch.zeros_like(mask), mask)
-    loss = torch.abs(preds-labels)
-    loss = loss * mask
-    loss = torch.where(torch.isnan(loss), torch.zeros_like(loss), loss)
-    return torch.mean(loss)
-
-
-def masked_mape(preds, labels, null_val=np.nan):
-    labels = torch.where(labels<0.01, torch.zeros_like(labels), labels)
-    if np.isnan(null_val):
-        mask = ~torch.isnan(labels)
-    else:
-        mask = (labels!=null_val)
-    mask = mask.float()
-    mask /=  torch.mean((mask))
-    mask = torch.where(torch.isnan(mask), torch.zeros_like(mask), mask)
-    loss = torch.abs(preds-labels)/labels
-    loss = loss * mask
-    loss = torch.where(torch.isnan(loss), torch.zeros_like(loss), loss)
-    return torch.mean(loss)
-
-
-def metric(pred, real):
-    mae = masked_mae(pred,real,0.0).item()
-    mape = masked_mape(pred,real,0.0).item()
-    rmse = masked_rmse(pred,real,0.0).item()
-    return mae,mape,rmse
\ No newline at end of file
diff --git a/baselines/BigST/arch/preprocess/model.py b/baselines/BigST/arch/preprocess/model.py
deleted file mode 100644
index 44bd07c1..00000000
--- a/baselines/BigST/arch/preprocess/model.py
+++ /dev/null
@@ -1,206 +0,0 @@
-import math
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from torch.autograd import Variable
-import sys
-
-def create_projection_matrix(m, d, seed=0, scaling=0, struct_mode=False):
-    nb_full_blocks = int(m/d)
-    block_list = []
-    current_seed = seed
-    for _ in range(nb_full_blocks):
-        torch.manual_seed(current_seed)
-        if struct_mode:
-            q = create_products_of_givens_rotations(d, current_seed)
-        else:
-            unstructured_block = torch.randn((d, d))
-            q, _ = torch.qr(unstructured_block)
-            q = torch.t(q)
-        block_list.append(q)
-        current_seed += 1
-    remaining_rows = m - nb_full_blocks * d
-    if remaining_rows > 0:
-        torch.manual_seed(current_seed)
-        if struct_mode:
-            q = create_products_of_givens_rotations(d, current_seed)
-        else:
-            unstructured_block = torch.randn((d, d))
-            q, _ = torch.qr(unstructured_block)
-            q = torch.t(q)
-        block_list.append(q[0:remaining_rows])
-    final_matrix = torch.vstack(block_list)
-
-    current_seed += 1
-    torch.manual_seed(current_seed)
-    if scaling == 0:
-        multiplier = torch.norm(torch.randn((m, d)), dim=1)
-    elif scaling == 1:
-        multiplier = torch.sqrt(torch.tensor(float(d))) * torch.ones(m)
-    else:
-        raise ValueError("Scaling must be one of {0, 1}. Was %s" % scaling)
-
-    return torch.matmul(torch.diag(multiplier), final_matrix)
-
-def create_products_of_givens_rotations(dim, seed):
-    nb_givens_rotations = dim * int(math.ceil(math.log(float(dim))))
-    q = np.eye(dim, dim)
-    np.random.seed(seed)
-    for _ in range(nb_givens_rotations):
-        random_angle = math.pi * np.random.uniform()
-        random_indices = np.random.choice(dim, 2)
-        index_i = min(random_indices[0], random_indices[1])
-        index_j = max(random_indices[0], random_indices[1])
-        slice_i = q[index_i]
-        slice_j = q[index_j]
-        new_slice_i = math.cos(random_angle) * slice_i + math.cos(random_angle) * slice_j
-        new_slice_j = -math.sin(random_angle) * slice_i + math.cos(random_angle) * slice_j
-        q[index_i] = new_slice_i
-        q[index_j] = new_slice_j
-    return torch.tensor(q, dtype=torch.float32)
-
-def softmax_kernel_transformation(data, is_query, projection_matrix=None, numerical_stabilizer=0.000001):
-    data_normalizer = 1.0 / torch.sqrt(torch.sqrt(torch.tensor(data.shape[-1], dtype=torch.float32)))
-    data = data_normalizer * data
-    ratio = 1.0 / torch.sqrt(torch.tensor(projection_matrix.shape[0], dtype=torch.float32))
-    data_dash = torch.einsum("bnhd,md->bnhm", data, projection_matrix)
-    diag_data = torch.square(data)
-    diag_data = torch.sum(diag_data, dim=len(data.shape)-1)
-    diag_data = diag_data / 2.0
-    diag_data = torch.unsqueeze(diag_data, dim=len(data.shape)-1)
-    last_dims_t = len(data_dash.shape) - 1
-    attention_dims_t = len(data_dash.shape) - 3
-    if is_query:
-        data_dash = ratio * (
-            torch.exp(data_dash - diag_data - torch.max(data_dash, dim=last_dims_t, keepdim=True)[0]) + numerical_stabilizer
-        )
-    else:
-        data_dash = ratio * (
-            torch.exp(data_dash - diag_data - torch.max(torch.max(data_dash, dim=last_dims_t, keepdim=True)[0],
-                    dim=attention_dims_t, keepdim=True)[0]) + numerical_stabilizer
-        )
-    return data_dash
-
-def numerator(qs, ks, vs):
-    kvs = torch.einsum("nbhm,nbhd->bhmd", ks, vs) # kvs refers to U_k in the paper
-    return torch.einsum("nbhm,bhmd->nbhd", qs, kvs)
-
-def denominator(qs, ks):
-    all_ones = torch.ones([ks.shape[0]]).to(qs.device)
-    ks_sum = torch.einsum("nbhm,n->bhm", ks, all_ones) # ks_sum refers to O_k in the paper
-    return torch.einsum("nbhm,bhm->nbh", qs, ks_sum)
-
-def linearized_softmax(x, query, key):
-    # x: [B, N, H, D] query: [B, N, H, m], key: [B, N, H, m]
-    query = query.permute(1, 0, 2, 3) # [N, B, H, m]
-    key = key.permute(1, 0, 2, 3) # [N, B, H, m]
-    x = x.permute(1, 0, 2, 3) # [N, B, H, D]
-
-    z_num = numerator(query, key, x) # [N, B, H, D]
-    z_den = denominator(query, key) # [N, H]
-
-    z_num = z_num.permute(1, 0, 2, 3)  # [B, N, H, D]
-    z_den = z_den.permute(1, 0, 2)
-    z_den = torch.unsqueeze(z_den, len(z_den.shape))
-    z_output = z_num / z_den # # [B, N, H, D]
-
-    return z_output
-
-class linearized_attention(nn.Module):
-    def __init__(self, c_in, c_out, dropout, random_feature_dim=30, tau=1.0, num_heads=4):
-        super(linearized_attention, self).__init__()
-        self.Wk = nn.Linear(c_in, c_out * num_heads)
-        self.Wq = nn.Linear(c_in, c_out * num_heads)
-        self.Wv = nn.Linear(c_in, c_out * num_heads)
-        self.Wo = nn.Linear(c_out * num_heads, c_out)
-        self.c_in = c_in
-        self.c_out = c_out
-        self.num_heads = num_heads
-        self.tau = tau
-        self.random_feature_dim = random_feature_dim
-        self.activation = nn.ReLU
-        self.dropout = dropout
-        
-    def reset_parameters(self):
-        self.Wk.reset_parameters()
-        self.Wq.reset_parameters()
-        self.Wv.reset_parameters()
-        self.Wo.reset_parameters()
-
-    def forward(self, x):
-        B, T = x.size(0), x.size(1) # (B, T, D)
-        query = self.Wq(x).reshape(-1, T, self.num_heads, self.c_out) # (B, T, H, D)
-        key = self.Wk(x).reshape(-1, T, self.num_heads, self.c_out) # (B, T, H, D)
-        x = self.Wv(x).reshape(-1, T, self.num_heads, self.c_out) # (B, T, H, D)
-        
-        dim = query.shape[-1] # (B, T, H, D)
-        seed = torch.ceil(torch.abs(torch.sum(query) * 1e8)).to(torch.int32)
-        projection_matrix = create_projection_matrix(self.random_feature_dim, dim, seed=seed).to(query.device) # (d, m)
-        query = query / math.sqrt(self.tau)
-        key = key / math.sqrt(self.tau)
-        query = softmax_kernel_transformation(query, True, projection_matrix) # [B, T, H, m]
-        key = softmax_kernel_transformation(key, False, projection_matrix) # [B, T, H, m]
-        
-        x = linearized_softmax(x, query, key)
-        
-        x = self.Wo(x.flatten(-2, -1)) # (B, T, D)
-        
-        return x
-
-class linear_transformer(nn.Module):
-    def __init__(self, input_length, output_length, in_dim, num_nodes, nhid, dropout=0.3):
-        super(linear_transformer, self).__init__()
-        self.tau = 1.0
-        self.layer_num = 3
-        self.random_feature_dim = nhid*2
-        
-        self.use_residual = True
-        self.use_bn = False
-        self.use_act = True
-        
-        self.dropout = dropout
-        self.activation = nn.ReLU()
-        
-        self.fc_convs = nn.ModuleList()
-        self.transformer_layer = nn.ModuleList()
-        self.bn = nn.ModuleList()
-        self.context_conv = nn.Conv2d(in_channels=in_dim, out_channels=nhid, kernel_size=(12, 1), stride=(12, 1))
-        
-        self.temporal_embedding = nn.Parameter(torch.empty(int(input_length/12), nhid), requires_grad=True) # (C, nhid)
-        nn.init.xavier_uniform_(self.temporal_embedding)
-        
-        for i in range(self.layer_num):
-            self.transformer_layer.append(linearized_attention(nhid, nhid, self.dropout, self.random_feature_dim, self.tau))
-            self.bn.append(nn.LayerNorm(nhid))
-        
-        self.regression_layer = nn.Linear(nhid, output_length)
-
-    def forward(self, x):
-        # input: (1, 9638, 2016, 3) (B, N, T, D)
-        B, N, T, D = x.size()
-        pe = self.temporal_embedding.unsqueeze(0).expand(B*N, -1, -1) # (B*N, T/12, nhid)
-        
-        x = x.reshape(B*N, T, D)
-        x = x.permute(0, 2, 1).unsqueeze(-1) # (B*N, T, D) -> (B*N, D, T, 1)
-
-        # convolution layer
-        x = self.context_conv(x) # (B*N, D, T, 1) -> (B*N, nhid, T/12, 1)
-        x = x.squeeze(-1) # (B*N, nhid, T/12)
-
-        # temporal embedding layer
-        x = x.permute(0, 2, 1) # (B*N, T/12, nhid)
-        x = x+pe # (B*N, T/12, nhid)
-
-        # linearized attention
-        for num in range(self.layer_num):
-            residual = x # (B*N, T/12, nhid)
-            x = self.transformer_layer[num](x) # (B*N, T/12, nhid)
-            x = self.bn[num](x)
-            x = x+residual # (B*N, T/12, nhid)
-
-        x = self.activation(x) # (B*N, T/12, nhid)
-        x = x[:, -1, :]
-        # x = torch.sum(x, dim=1) # (B*N, nhid)
-        feat = x.view(B, N, -1) # (B, N, nhid)
-        x = self.regression_layer(feat) # (B, N, output_length)
-        return x, feat
diff --git a/baselines/BigST/arch/preprocess/pipeline.py b/baselines/BigST/arch/preprocess/pipeline.py
deleted file mode 100644
index 46499b73..00000000
--- a/baselines/BigST/arch/preprocess/pipeline.py
+++ /dev/null
@@ -1,38 +0,0 @@
-import torch.optim as optim
-from model import *
-import metrics
-
-class train_pipeline():
-    def __init__(self, scaler, input_length, output_length, in_dim, num_nodes, nhid, dropout, lrate, wdecay, device):
-        self.model = linear_transformer(input_length, output_length, in_dim, num_nodes, nhid, dropout)
-        self.model.to(device)
-        self.optimizer = optim.Adam(self.model.parameters(), lr=lrate, weight_decay=wdecay)
-        self.loss = metrics.masked_mae
-        self.scaler = scaler
-        self.clip = 5
-
-    def train(self, input, real_val):
-        self.model.train()
-        self.optimizer.zero_grad()
-        output, _ = self.model(input)
-        real = self.scaler.inverse_transform(real_val)
-        predict = self.scaler.inverse_transform(output)
-
-        loss = self.loss(predict, real, 0.0)
-        loss.backward()
-        if self.clip is not None:
-            torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.clip)
-        self.optimizer.step()
-        mape = metrics.masked_mape(predict,real,0.0).item()
-        rmse = metrics.masked_rmse(predict,real,0.0).item()
-        return loss.item(), mape, rmse
-
-    def eval(self, input, real_val):
-        self.model.eval()
-        output, _ = self.model(input)
-        real = self.scaler.inverse_transform(real_val)
-        predict = self.scaler.inverse_transform(output)
-        loss = self.loss(predict, real, 0.0)
-        mape = metrics.masked_mape(predict,real,0.0).item()
-        rmse = metrics.masked_rmse(predict,real,0.0).item()
-        return loss.item(), mape, rmse
diff --git a/baselines/BigST/arch/preprocess/preprocess.py b/baselines/BigST/arch/preprocess/preprocess.py
deleted file mode 100644
index feb795e3..00000000
--- a/baselines/BigST/arch/preprocess/preprocess.py
+++ /dev/null
@@ -1,127 +0,0 @@
-import torch
-import numpy as np
-import argparse
-import time
-import util
-from pipeline import train_pipeline
-
-parser = argparse.ArgumentParser()
-parser.add_argument('--device',type=str,default='cuda:0',help='')
-parser.add_argument('--data',type=str,default='/data/pems_data/pems_vldb/long_term',help='data path')
-parser.add_argument('--input_length',type=int,default=2016,help='')
-parser.add_argument('--output_length',type=int,default=12,help='')
-parser.add_argument('--nhid',type=int,default=32,help='')
-parser.add_argument('--in_dim',type=int,default=3,help='inputs dimension')
-parser.add_argument('--num_nodes',type=int,default=9638,help='number of nodes')
-parser.add_argument('--batch_size',type=int,default=1,help='batch size')
-parser.add_argument('--tiny_batch_size',type=int,default=256,help='tiny batch size')
-parser.add_argument('--learning_rate',type=float,default=0.001,help='learning rate')
-parser.add_argument('--dropout',type=float,default=0.3,help='dropout rate')
-parser.add_argument('--weight_decay',type=float,default=0.0001,help='weight decay rate')
-parser.add_argument('--epochs',type=int,default=100,help='')
-parser.add_argument('--print_every',type=int,default=1,help='')
-#parser.add_argument('--seed',type=int,default=99,help='random seed')
-parser.add_argument('--save',type=str,default='checkpoint/',help='save path')
-parser.add_argument('--expid',type=int,default=1,help='experiment id')
-
-args = parser.parse_args()
-
-def main():
-    # set seed
-    # torch.manual_seed(args.seed)
-    # np.random.seed(args.seed)
-    # load data
-    device = torch.device(args.device)
-    dataloader = util.load_dataset(args.data, args.batch_size, args.batch_size, args.batch_size, 
-                                   args.input_length, args.output_length)
-    scaler = dataloader['scaler']
-    tiny_batch_size = args.tiny_batch_size
-
-    print(args)
-
-    trainer = train_pipeline(scaler, args.input_length, args.output_length, args.in_dim, args.num_nodes, 
-                             args.nhid, args.dropout, args.learning_rate, args.weight_decay, device)
-
-    print("start training...",flush=True)
-    his_loss =[]
-    train_time = []
-    val_time = []
-    
-    for i in range(1, args.epochs+1):
-        # train
-        train_loss = []
-        train_mape = []
-        train_rmse = []
-        t1 = time.time()
-        dataloader['train_loader'].shuffle()
-        for iter, (x, y) in enumerate(dataloader['train_loader'].get_iterator()):
-            B, T, N, F = x.shape
-            batch_num = int(B * N / tiny_batch_size)
-            idx_perm = np.random.permutation([i for i in range(B*N)])
-            for j in range(batch_num):
-                if j==batch_num-1:
-                    x_ = x[:, :, idx_perm[(j+1)*tiny_batch_size:], :]
-                    y_ = y[:, :, idx_perm[(j+1)*tiny_batch_size:], :]
-                else:
-                    x_ = x[:, :, idx_perm[j*tiny_batch_size:(j+1)*tiny_batch_size], :]
-                    y_ = y[:, :, idx_perm[j*tiny_batch_size:(j+1)*tiny_batch_size], :]
-
-                trainx = torch.Tensor(x_).to(device) # (B, T, N, F)
-                trainx = trainx.transpose(1, 2) # (B, N, T, F)
-                trainy = torch.Tensor(y_).to(device) # (B, T, N, F)
-                trainy = trainy.transpose(1, 2) # (B, N, T, F)
-                metrics = trainer.train(trainx, trainy[:,:,:,0])
-                train_loss.append(metrics[0])
-                train_mape.append(metrics[1])
-                train_rmse.append(metrics[2])
-                t2 = time.time()
-                train_time.append(t2-t1)
-
-            if iter % args.print_every == 0:
-                log = 'Iter: {:03d}, Train Loss: {:.4f}, Train MAPE: {:.4f}, Train RMSE: {:.4f}'
-                print(log.format(iter, train_loss[-1], train_mape[-1], train_rmse[-1]),flush=True)
-                # Save the model parameters for subsequent preprocessing
-                torch.save(trainer.model.state_dict(), args.save+"linear_transformer.pth")
-
-        # validation
-        valid_loss = []
-        valid_mape = []
-        valid_rmse = []
-
-        s1 = time.time()
-        for iter, (x, y) in enumerate(dataloader['val_loader'].get_iterator()):
-            B, T, N, F = x.shape
-            batch_num = int(B*N/tiny_batch_size)
-            for k in range(batch_num):
-                if k==batch_num-1:
-                    x_ = x[:, :, (k+1)*tiny_batch_size:, :]
-                    y_ = y[:, :, (k+1)*tiny_batch_size:, :]
-                else:
-                    x_ = x[:, :, k*tiny_batch_size:(k+1)*tiny_batch_size, :]
-                    y_ = y[:, :, k*tiny_batch_size:(k+1)*tiny_batch_size, :]
-            testx = torch.Tensor(x).to(device)
-            testx = testx.transpose(1, 2)
-            testy = torch.Tensor(y).to(device)
-            testy = testy.transpose(1, 2)
-            metrics = trainer.eval(testx, testy[:,:,:,0])
-            valid_loss.append(metrics[0])
-            valid_mape.append(metrics[1])
-            valid_rmse.append(metrics[2])
-        s2 = time.time()
-        mvalid_loss = np.mean(valid_loss)
-        mvalid_mape = np.mean(valid_mape)
-        mvalid_rmse = np.mean(valid_rmse)
-        log = 'Epoch: {:03d}, Validation Inference Time: {:.4f} secs'
-        print(log.format(i,(s2-s1)))
-        log = 'Valid MAE: {:.4f}, Valid MAPE: {:.4f}, Valid RMSE: {:.4f}'
-        print(log.format(mvalid_loss, mvalid_mape, mvalid_rmse), flush=True)
-        val_time.append(s2-s1)
-           
-    print("Average Training Time: {:.4f} secs/epoch".format(np.mean(train_time)))
-    print("Average Inference Time: {:.4f} secs".format(np.mean(val_time)))
-
-if __name__ == "__main__":
-    t1 = time.time()
-    main()
-    t2 = time.time()
-    print("Total time spent: {:.4f}".format(t2-t1))
diff --git a/baselines/BigST/arch/preprocess/util.py b/baselines/BigST/arch/preprocess/util.py
deleted file mode 100644
index 81bf2cd7..00000000
--- a/baselines/BigST/arch/preprocess/util.py
+++ /dev/null
@@ -1,147 +0,0 @@
-import pickle
-import numpy as np
-import os
-import scipy.sparse as sp
-import torch
-from scipy.sparse import linalg
-
-class DataLoader(object):
-    def __init__(self, data, batch_size, input_length, output_length):
-        self.seq_length_x = input_length
-        self.seq_length_y = output_length
-        self.y_start = 1
-        self.batch_size = batch_size
-        self.current_ind = 0
-        self.x_offsets = np.sort(np.concatenate((np.arange(-(self.seq_length_x - 1), 1, 1),)))
-        self.y_offsets = np.sort(np.arange(self.y_start, (self.seq_length_y + 1), 1))
-        self.min_t = abs(min(self.x_offsets))
-        self.max_t = abs(data.shape[0] - abs(max(self.y_offsets)))
-        mod = (self.max_t-self.min_t) % batch_size
-        if mod != 0:
-            self.data = data[:-mod]
-        else:
-            self.data = data
-        self.max_t = abs(self.data.shape[0] - abs(max(self.y_offsets)))
-        self.permutation = [i for i in range(self.min_t, self.max_t)]
-
-    def shuffle(self):
-        self.permutation = np.random.permutation([i for i in range(self.min_t, self.max_t)])
-
-    def get_iterator(self):
-        self.current_ind = 0
-
-        def _wrapper():
-            while self.current_ind < len(self.permutation):
-                if self.batch_size > 1:
-                    x_batch = []
-                    y_batch = []
-                    for i in range(self.batch_size):  
-                        x_i = self.data[self.permutation[self.current_ind+i] + self.x_offsets, ...]
-                        y_i = self.data[self.permutation[self.current_ind+i] + self.y_offsets, ...]
-                        x_batch.append(x_i)
-                        y_batch.append(y_i)
-
-                    x_batch = np.stack(x_batch, axis=0)
-                    y_batch = np.stack(y_batch, axis=0)
-                else:
-                    x_batch = self.data[self.permutation[self.current_ind] + self.x_offsets, ...]
-                    y_batch = self.data[self.permutation[self.current_ind] + self.y_offsets, ...]
-                    x_batch = np.expand_dims(x_batch, axis=0)
-                    y_batch = np.expand_dims(y_batch, axis=0)
-                yield (x_batch, y_batch)
-                self.current_ind += self.batch_size
-
-        return _wrapper()
-
-class StandardScaler():
-    """
-    Standard the input
-    """
-
-    def __init__(self, mean, std):
-        self.mean = mean
-        self.std = std
-
-    def transform(self, data):
-        return (data - self.mean) / self.std
-
-    def inverse_transform(self, data):
-        return (data * self.std) + self.mean
-
-def sym_adj(adj):
-    """Symmetrically normalize adjacency matrix."""
-    adj = sp.coo_matrix(adj)
-    rowsum = np.array(adj.sum(1))
-    d_inv_sqrt = np.power(rowsum, -0.5).flatten()
-    d_inv_sqrt[np.isinf(d_inv_sqrt)] = 0.
-    d_mat_inv_sqrt = sp.diags(d_inv_sqrt)
-    return adj.dot(d_mat_inv_sqrt).transpose().dot(d_mat_inv_sqrt).astype(np.float32).todense()
-
-def asym_adj(adj):
-    adj = sp.coo_matrix(adj)
-    rowsum = np.array(adj.sum(1)).flatten()
-    d_inv = np.power(rowsum, -1).flatten()
-    d_inv[np.isinf(d_inv)] = 0.
-    d_mat= sp.diags(d_inv)
-    return d_mat.dot(adj).astype(np.float32).todense()
-
-def calculate_normalized_laplacian(adj):
-    """
-    # L = D^-1/2 (D-A) D^-1/2 = I - D^-1/2 A D^-1/2
-    # D = diag(A 1)
-    :param adj:
-    :return:
-    """
-    adj = sp.coo_matrix(adj)
-    d = np.array(adj.sum(1))
-    d_inv_sqrt = np.power(d, -0.5).flatten()
-    d_inv_sqrt[np.isinf(d_inv_sqrt)] = 0.
-    d_mat_inv_sqrt = sp.diags(d_inv_sqrt)
-    normalized_laplacian = sp.eye(adj.shape[0]) - adj.dot(d_mat_inv_sqrt).transpose().dot(d_mat_inv_sqrt).tocoo()
-    return normalized_laplacian
-
-def calculate_scaled_laplacian(adj_mx, lambda_max=2, undirected=True):
-    if undirected:
-        adj_mx = np.maximum.reduce([adj_mx, adj_mx.T])
-    L = calculate_normalized_laplacian(adj_mx)
-    if lambda_max is None:
-        lambda_max, _ = linalg.eigsh(L, 1, which='LM')
-        lambda_max = lambda_max[0]
-    L = sp.csr_matrix(L)
-    M, _ = L.shape
-    I = sp.identity(M, format='csr', dtype=L.dtype)
-    L = (2 / lambda_max * L) - I
-    return L.astype(np.float32).todense()
-
-def load_pickle(pickle_file):
-    try:
-        with open(pickle_file, 'rb') as f:
-            pickle_data = pickle.load(f)
-    except UnicodeDecodeError as e:
-        with open(pickle_file, 'rb') as f:
-            pickle_data = pickle.load(f, encoding='latin1')
-    except Exception as e:
-        print('Unable to load data ', pickle_file, ':', e)
-        raise
-    return pickle_data
-
-def load_adj(adj_filename, adjtype):
-    adj_mx = np.load(adj_filename)
-    print('adj_mx: ', adj_mx.shape)
-    adj = [asym_adj(adj_mx)]
-    return adj
-
-def load_dataset(dataset_dir, batch_size, valid_batch_size, test_batch_size, input_length, output_length):
-    data = {}
-    for category in ['train', 'val', 'test']:
-        data[category] = np.load(os.path.join(dataset_dir, category + '.npy'))
-        print('*'*10, category, data[category].shape, '*'*10)
-    scaler = StandardScaler(mean=data['train'][..., 0].mean(), std=data['train'][..., 0].std())
-    # Data format
-    for category in ['train', 'val', 'test']:
-        data[category][..., 0] = scaler.transform(data[category][..., 0])
-    data['train_loader'] = DataLoader(data['train'], batch_size, input_length, output_length)
-    data['val_loader'] = DataLoader(data['val'], valid_batch_size, input_length, output_length)
-    data['test_loader'] = DataLoader(data['test'], test_batch_size, input_length, output_length)
-    data['scaler'] = scaler
-    return data
diff --git a/baselines/BigST/arch/random_map.py b/baselines/BigST/arch/random_map.py
deleted file mode 100644
index ea7e49d4..00000000
--- a/baselines/BigST/arch/random_map.py
+++ /dev/null
@@ -1,81 +0,0 @@
-import math
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from torch.autograd import Variable
-
-def create_products_of_givens_rotations(dim, seed):
-    nb_givens_rotations = dim * int(math.ceil(math.log(float(dim))))
-    q = np.eye(dim, dim)
-    np.random.seed(seed)
-    for _ in range(nb_givens_rotations):
-        random_angle = math.pi * np.random.uniform()
-        random_indices = np.random.choice(dim, 2)
-        index_i = min(random_indices[0], random_indices[1])
-        index_j = max(random_indices[0], random_indices[1])
-        slice_i = q[index_i]
-        slice_j = q[index_j]
-        new_slice_i = math.cos(random_angle) * slice_i + math.cos(random_angle) * slice_j
-        new_slice_j = -math.sin(random_angle) * slice_i + math.cos(random_angle) * slice_j
-        q[index_i] = new_slice_i
-        q[index_j] = new_slice_j
-    return torch.tensor(q, dtype=torch.float32)
-
-def create_random_matrix(m, d, seed=0, scaling=0, struct_mode=False):
-    nb_full_blocks = int(m/d)
-    block_list = []
-    current_seed = seed
-    for _ in range(nb_full_blocks):
-        torch.manual_seed(current_seed)
-        if struct_mode:
-            q = create_products_of_givens_rotations(d, current_seed)
-        else:
-            unstructured_block = torch.randn((d, d))
-            q, _ = torch.qr(unstructured_block)
-            q = torch.t(q)
-        block_list.append(q)
-        current_seed += 1
-    remaining_rows = m - nb_full_blocks * d
-    if remaining_rows > 0:
-        torch.manual_seed(current_seed)
-        if struct_mode:
-            q = create_products_of_givens_rotations(d, current_seed)
-        else:
-            unstructured_block = torch.randn((d, d))
-            q, _ = torch.qr(unstructured_block)
-            q = torch.t(q)
-        block_list.append(q[0:remaining_rows])
-    final_matrix = torch.vstack(block_list)
-
-    current_seed += 1
-    torch.manual_seed(current_seed)
-    if scaling == 0:
-        multiplier = torch.norm(torch.randn((m, d)), dim=1)
-    elif scaling == 1:
-        multiplier = torch.sqrt(torch.tensor(float(d))) * torch.ones(m)
-    else:
-        raise ValueError("Scaling must be one of {0, 1}. Was %s" % scaling)
-
-    return torch.matmul(torch.diag(multiplier), final_matrix)
-
-def random_feature_map(data, is_query, projection_matrix=None, numerical_stabilizer=0.000001):
-    data_normalizer = 1.0 / torch.sqrt(torch.sqrt(torch.tensor(data.shape[-1], dtype=torch.float32)))
-    data = data_normalizer * data
-    ratio = 1.0 / torch.sqrt(torch.tensor(projection_matrix.shape[0], dtype=torch.float32))
-    data_dash = torch.einsum("bnhd,md->bnhm", data, projection_matrix)
-    diag_data = torch.square(data)
-    diag_data = torch.sum(diag_data, dim=len(data.shape)-1)
-    diag_data = diag_data / 2.0
-    diag_data = torch.unsqueeze(diag_data, dim=len(data.shape)-1)
-    last_dims_t = len(data_dash.shape) - 1
-    attention_dims_t = len(data_dash.shape) - 3
-    if is_query:
-        data_dash = ratio * (
-            torch.exp(data_dash - diag_data - torch.max(data_dash, dim=last_dims_t, keepdim=True)[0]) + numerical_stabilizer
-        )
-    else:
-        data_dash = ratio * (
-            torch.exp(data_dash - diag_data - torch.max(torch.max(data_dash, dim=last_dims_t, keepdim=True)[0],
-                    dim=attention_dims_t, keepdim=True)[0]) + numerical_stabilizer
-        )
-    return data_dash
diff --git a/baselines/BigST/loss/__init__.py b/baselines/BigST/loss/__init__.py
deleted file mode 100644
index c22530d7..00000000
--- a/baselines/BigST/loss/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-from .loss import bigst_loss
\ No newline at end of file
diff --git a/baselines/BigST/loss/loss.py b/baselines/BigST/loss/loss.py
deleted file mode 100644
index 831f541e..00000000
--- a/baselines/BigST/loss/loss.py
+++ /dev/null
@@ -1,35 +0,0 @@
-import torch
-import numpy as np
-from basicts.metrics import masked_mae
-
-def spatial_loss(node_vec1, node_vec2, supports, edge_indices):
-    B = node_vec1.size(0)
-    node_vec1 = node_vec1.permute(1, 0, 2, 3) # [N, B, 1, r]
-    node_vec2 = node_vec2.permute(1, 0, 2, 3) # [N, B, 1, r]
-    
-    node_vec1_end, node_vec2_start = node_vec1[edge_indices[:, 0]], node_vec2[edge_indices[:, 1]] # [E, B, 1, r]
-    attn1 = torch.einsum("ebhm,ebhm->ebh", node_vec1_end, node_vec2_start) # [E, B, 1]
-    attn1 = attn1.permute(1, 0, 2) # [B, E, 1]
-
-    one_matrix = torch.ones([node_vec2.shape[0]]).to(node_vec1.device)
-    node_vec2_sum = torch.einsum("nbhm,n->bhm", node_vec2, one_matrix)
-    attn_norm = torch.einsum("nbhm,bhm->nbh", node_vec1, node_vec2_sum)
-    
-    attn2 = attn_norm[edge_indices[:, 0]]  # [E, B, 1]
-    attn2 = attn2.permute(1, 0, 2) # [B, E, 1]
-    attn_score = attn1 / attn2 # [B, E, 1]
-    
-    d_norm = supports[0][edge_indices[:, 0], edge_indices[:, 1]]
-    d_norm = d_norm.reshape(1, -1, 1).repeat(B, 1, attn_score.shape[-1])
-    spatial_loss = torch.mean(attn_score.log() * d_norm)
-    
-    return spatial_loss
-
-def bigst_loss(prediction, target, node_vec1, node_vec2, supports, use_spatial):
-    if use_spatial:
-        supports = [support.to(prediction.device) for support in supports]
-        edge_indices = torch.nonzero(supports[0] > 0)
-        s_loss = spatial_loss(node_vec1, node_vec2, supports, edge_indices)
-        return masked_mae(prediction, target, 0.0) - 0.3 * s_loss # 源代码：pipline.py line30
-    else:
-        return masked_mae(prediction, target, 0.0)
\ No newline at end of file
diff --git a/baselines/BigSTPreprocess/PEMS08.py b/baselines/BigSTPreprocess/PEMS08.py
deleted file mode 100644
index 39d7f4b9..00000000
--- a/baselines/BigSTPreprocess/PEMS08.py
+++ /dev/null
@@ -1,153 +0,0 @@
-import os
-import sys
-import torch
-from easydict import EasyDict
-sys.path.append(os.path.abspath(__file__ + '/../../..'))
-
-from basicts.metrics import masked_mae, masked_mape, masked_rmse
-from basicts.data import TimeSeriesForecastingDataset
-from basicts.runners import SimpleTimeSeriesForecastingRunner
-from basicts.scaler import ZScoreScaler
-from basicts.utils import get_regular_settings, load_adj
-
-from .arch import BigSTPreprocess
-from .runner import BigSTPreprocessRunner
-
-############################## Hot Parameters ##############################
-# Dataset & Metrics configuration
-DATA_NAME = 'PEMS08'  # Dataset name
-regular_settings = get_regular_settings(DATA_NAME)
-INPUT_LEN = 2016 
-OUTPUT_LEN = 12
-TRAIN_VAL_TEST_RATIO = regular_settings['TRAIN_VAL_TEST_RATIO']  # Train/Validation/Test split ratios
-NORM_EACH_CHANNEL = regular_settings['NORM_EACH_CHANNEL'] # Whether to normalize each channel of the data
-RESCALE = regular_settings['RESCALE'] # Whether to rescale the data
-NULL_VAL = regular_settings['NULL_VAL'] # Null value in the data
-# Model architecture and parameters
-MODEL_ARCH = BigSTPreprocess
-adj_mx, _ = load_adj("datasets/" + DATA_NAME +
-                     "/adj_mx.pkl", "doubletransition")
-MODEL_PARAM = {
-    "num_nodes": 170,
-    "in_dim": 3,
-    "dropout": 0.3,
-    "input_length": INPUT_LEN,
-    "output_length": OUTPUT_LEN,
-    "nhid": 32,
-    "tiny_batch_size": 64,
-
-}
-
-NUM_EPOCHS = 100
-
-############################## General Configuration ##############################
-CFG = EasyDict()
-# General settings
-CFG.DESCRIPTION = 'An Example Config'
-CFG.GPU_NUM = 1 # Number of GPUs to use (0 for CPU mode)
-# Runner
-CFG.RUNNER = BigSTPreprocessRunner
-
-############################## Environment Configuration ##############################
-
-CFG.ENV = EasyDict() # Environment settings. Default: None
-CFG.ENV.SEED = 0 # Random seed. Default: None
-
-############################## Dataset Configuration ##############################
-CFG.DATASET = EasyDict()
-# Dataset settings
-CFG.DATASET.NAME = DATA_NAME
-CFG.DATASET.TYPE = TimeSeriesForecastingDataset
-CFG.DATASET.PARAM = EasyDict({
-    'dataset_name': DATA_NAME,
-    'train_val_test_ratio': TRAIN_VAL_TEST_RATIO,
-    'input_len': INPUT_LEN,
-    'output_len': OUTPUT_LEN,
-    # 'mode' is automatically set by the runner
-})
-
-############################## Scaler Configuration ##############################
-CFG.SCALER = EasyDict()
-# Scaler settings
-CFG.SCALER.TYPE = ZScoreScaler # Scaler class
-CFG.SCALER.PARAM = EasyDict({
-    'dataset_name': DATA_NAME,
-    'train_ratio': TRAIN_VAL_TEST_RATIO[0],
-    'norm_each_channel': NORM_EACH_CHANNEL,
-    'rescale': RESCALE,
-})
-
-############################## Model Configuration ##############################
-CFG.MODEL = EasyDict()
-# Model settings
-CFG.MODEL.NAME = MODEL_ARCH.__name__
-CFG.MODEL.ARCH = MODEL_ARCH
-CFG.MODEL.PARAM = MODEL_PARAM
-CFG.MODEL.FORWARD_FEATURES = [0, 1, 2]
-CFG.MODEL.TARGET_FEATURES = [0]
-
-############################## Metrics Configuration ##############################
-
-CFG.METRICS = EasyDict()
-# Metrics settings
-CFG.METRICS.FUNCS = EasyDict({
-                                'MAE': masked_mae,
-                                'MAPE': masked_mape,
-                                'RMSE': masked_rmse,
-                            })
-CFG.METRICS.TARGET = 'MAE'
-CFG.METRICS.NULL_VAL = NULL_VAL
-
-############################## Training Configuration ##############################
-CFG.TRAIN = EasyDict()
-CFG.TRAIN.NUM_EPOCHS = NUM_EPOCHS
-CFG.TRAIN.CKPT_SAVE_DIR = os.path.join(
-    'checkpoints',
-    MODEL_ARCH.__name__,
-    '_'.join([DATA_NAME, str(CFG.TRAIN.NUM_EPOCHS), str(INPUT_LEN), str(OUTPUT_LEN)])
-)
-CFG.TRAIN.LOSS = masked_mae
-# Optimizer settings
-CFG.TRAIN.OPTIM = EasyDict()
-CFG.TRAIN.OPTIM.TYPE = "AdamW"
-CFG.TRAIN.OPTIM.PARAM = {
-    "lr": 0.002,
-    "weight_decay": 0.0001,
-}
-# Learning rate scheduler settings
-CFG.TRAIN.LR_SCHEDULER = EasyDict()
-CFG.TRAIN.LR_SCHEDULER.TYPE = "MultiStepLR"
-CFG.TRAIN.LR_SCHEDULER.PARAM = {
-    "milestones": [1, 50],
-    "gamma": 0.5
-}
-# Train data loader settings
-CFG.TRAIN.DATA = EasyDict()
-CFG.TRAIN.DATA.BATCH_SIZE = 1
-CFG.TRAIN.DATA.SHUFFLE = True
-# Gradient clipping settings
-CFG.TRAIN.CLIP_GRAD_PARAM = {
-    "max_norm": 5.0
-}
-
-############################## Validation Configuration ##############################
-CFG.VAL = EasyDict()
-CFG.VAL.INTERVAL = 1
-CFG.VAL.DATA = EasyDict()
-CFG.VAL.DATA.BATCH_SIZE = 1
-
-############################## Test Configuration ##############################
-CFG.TEST = EasyDict()
-CFG.TEST.INTERVAL = 1
-CFG.TEST.DATA = EasyDict()
-CFG.TEST.DATA.BATCH_SIZE = 1
-
-############################## Evaluation Configuration ##############################
-
-CFG.EVAL = EasyDict()
-
-# Evaluation parameters
-CFG.EVAL.HORIZONS = [3, 6, 12] # Prediction horizons for evaluation. Default: []
-CFG.EVAL.USE_GPU = True # Whether to use GPU for evaluation. Default: True
-
-
diff --git a/baselines/BigSTPreprocess/arch/__init__.py b/baselines/BigSTPreprocess/arch/__init__.py
deleted file mode 100644
index b56180dd..00000000
--- a/baselines/BigSTPreprocess/arch/__init__.py
+++ /dev/null
@@ -1,3 +0,0 @@
-from .bigst_preprocess_arch import BigSTPreprocess
-
-__all__ = ["BigSTPreprocess"]
diff --git a/baselines/BigSTPreprocess/arch/bigst_preprocess_arch.py b/baselines/BigSTPreprocess/arch/bigst_preprocess_arch.py
deleted file mode 100644
index 4a38990a..00000000
--- a/baselines/BigSTPreprocess/arch/bigst_preprocess_arch.py
+++ /dev/null
@@ -1,220 +0,0 @@
-import math
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from torch.autograd import Variable
-import sys
-import numpy as np 
-import pdb
-
-def create_projection_matrix(m, d, seed=0, scaling=0, struct_mode=False):
-    nb_full_blocks = int(m/d)
-    block_list = []
-    current_seed = seed
-    for _ in range(nb_full_blocks):
-        torch.manual_seed(current_seed)
-        if struct_mode:
-            q = create_products_of_givens_rotations(d, current_seed)
-        else:
-            unstructured_block = torch.randn((d, d))
-            q, _ = torch.qr(unstructured_block)
-            q = torch.t(q)
-        block_list.append(q)
-        current_seed += 1
-    remaining_rows = m - nb_full_blocks * d
-    if remaining_rows > 0:
-        torch.manual_seed(current_seed)
-        if struct_mode:
-            q = create_products_of_givens_rotations(d, current_seed)
-        else:
-            unstructured_block = torch.randn((d, d))
-            q, _ = torch.qr(unstructured_block)
-            q = torch.t(q)
-        block_list.append(q[0:remaining_rows])
-    final_matrix = torch.vstack(block_list)
-
-    current_seed += 1
-    torch.manual_seed(current_seed)
-    if scaling == 0:
-        multiplier = torch.norm(torch.randn((m, d)), dim=1)
-    elif scaling == 1:
-        multiplier = torch.sqrt(torch.tensor(float(d))) * torch.ones(m)
-    else:
-        raise ValueError("Scaling must be one of {0, 1}. Was %s" % scaling)
-
-    return torch.matmul(torch.diag(multiplier), final_matrix)
-
-def create_products_of_givens_rotations(dim, seed):
-    nb_givens_rotations = dim * int(math.ceil(math.log(float(dim))))
-    q = np.eye(dim, dim)
-    np.random.seed(seed)
-    for _ in range(nb_givens_rotations):
-        random_angle = math.pi * np.random.uniform()
-        random_indices = np.random.choice(dim, 2)
-        index_i = min(random_indices[0], random_indices[1])
-        index_j = max(random_indices[0], random_indices[1])
-        slice_i = q[index_i]
-        slice_j = q[index_j]
-        new_slice_i = math.cos(random_angle) * slice_i + math.cos(random_angle) * slice_j
-        new_slice_j = -math.sin(random_angle) * slice_i + math.cos(random_angle) * slice_j
-        q[index_i] = new_slice_i
-        q[index_j] = new_slice_j
-    return torch.tensor(q, dtype=torch.float32)
-
-def softmax_kernel_transformation(data, is_query, projection_matrix=None, numerical_stabilizer=0.000001):
-    data_normalizer = 1.0 / torch.sqrt(torch.sqrt(torch.tensor(data.shape[-1], dtype=torch.float32)))
-    data = data_normalizer * data
-    ratio = 1.0 / torch.sqrt(torch.tensor(projection_matrix.shape[0], dtype=torch.float32))
-    data_dash = torch.einsum("bnhd,md->bnhm", data, projection_matrix)
-    diag_data = torch.square(data)
-    diag_data = torch.sum(diag_data, dim=len(data.shape)-1)
-    diag_data = diag_data / 2.0
-    diag_data = torch.unsqueeze(diag_data, dim=len(data.shape)-1)
-    last_dims_t = len(data_dash.shape) - 1
-    attention_dims_t = len(data_dash.shape) - 3
-    if is_query:
-        data_dash = ratio * (
-            torch.exp(data_dash - diag_data - torch.max(data_dash, dim=last_dims_t, keepdim=True)[0]) + numerical_stabilizer
-        )
-    else:
-        data_dash = ratio * (
-            torch.exp(data_dash - diag_data - torch.max(torch.max(data_dash, dim=last_dims_t, keepdim=True)[0],
-                    dim=attention_dims_t, keepdim=True)[0]) + numerical_stabilizer
-        )
-    return data_dash
-
-def numerator(qs, ks, vs):
-    kvs = torch.einsum("nbhm,nbhd->bhmd", ks, vs) # kvs refers to U_k in the paper
-    return torch.einsum("nbhm,bhmd->nbhd", qs, kvs)
-
-def denominator(qs, ks):
-    all_ones = torch.ones([ks.shape[0]]).to(qs.device)
-    ks_sum = torch.einsum("nbhm,n->bhm", ks, all_ones) # ks_sum refers to O_k in the paper
-    return torch.einsum("nbhm,bhm->nbh", qs, ks_sum)
-
-def linearized_softmax(x, query, key):
-    # x: [B, N, H, D] query: [B, N, H, m], key: [B, N, H, m]
-    query = query.permute(1, 0, 2, 3) # [N, B, H, m]
-    key = key.permute(1, 0, 2, 3) # [N, B, H, m]
-    x = x.permute(1, 0, 2, 3) # [N, B, H, D]
-
-    z_num = numerator(query, key, x) # [N, B, H, D]
-    z_den = denominator(query, key) # [N, H]
-
-    z_num = z_num.permute(1, 0, 2, 3)  # [B, N, H, D]
-    z_den = z_den.permute(1, 0, 2)
-    z_den = torch.unsqueeze(z_den, len(z_den.shape))
-    z_output = z_num / z_den # # [B, N, H, D]
-
-    return z_output
-
-class linearized_attention(nn.Module):
-    def __init__(self, c_in, c_out, dropout, random_feature_dim=30, tau=1.0, num_heads=4):
-        super(linearized_attention, self).__init__()
-        self.Wk = nn.Linear(c_in, c_out * num_heads)
-        self.Wq = nn.Linear(c_in, c_out * num_heads)
-        self.Wv = nn.Linear(c_in, c_out * num_heads)
-        self.Wo = nn.Linear(c_out * num_heads, c_out)
-        self.c_in = c_in
-        self.c_out = c_out
-        self.num_heads = num_heads
-        self.tau = tau
-        self.random_feature_dim = random_feature_dim
-        self.activation = nn.ReLU
-        self.dropout = dropout
-        
-    def reset_parameters(self):
-        self.Wk.reset_parameters()
-        self.Wq.reset_parameters()
-        self.Wv.reset_parameters()
-        self.Wo.reset_parameters()
-
-    def forward(self, x):
-        B, T = x.size(0), x.size(1) # (B, T, D)
-        query = self.Wq(x).reshape(-1, T, self.num_heads, self.c_out) # (B, T, H, D)
-        key = self.Wk(x).reshape(-1, T, self.num_heads, self.c_out) # (B, T, H, D)
-        x = self.Wv(x).reshape(-1, T, self.num_heads, self.c_out) # (B, T, H, D)
-        
-        dim = query.shape[-1] # (B, T, H, D)
-        seed = torch.ceil(torch.abs(torch.sum(query) * 1e8)).to(torch.int32)
-        projection_matrix = create_projection_matrix(self.random_feature_dim, dim, seed=seed).to(query.device) # (d, m)
-        query = query / math.sqrt(self.tau)
-        key = key / math.sqrt(self.tau)
-        query = softmax_kernel_transformation(query, True, projection_matrix) # [B, T, H, m]
-        key = softmax_kernel_transformation(key, False, projection_matrix) # [B, T, H, m]
-        
-        x = linearized_softmax(x, query, key)
-        
-        x = self.Wo(x.flatten(-2, -1)) # (B, T, D)
-        
-        return x
-
-
-class BigSTPreprocess(nn.Module):
-    """
-    Paper: BigST: Linear Complexity Spatio-Temporal Graph Neural Network for Traffic Forecasting on Large-Scale Road Networks
-    Link: https://dl.acm.org/doi/10.14778/3641204.3641217
-    Official Code: https://github.com/usail-hkust/BigST?tab=readme-ov-file
-    Venue: VLDB 2024
-    Task: Spatial-Temporal Forecasting
-    """
-    def __init__(self, input_length, output_length, in_dim, num_nodes, nhid, tiny_batch_size, dropout=0.3):
-    # def __init__(self, **model_kwargs):
-        super(BigSTPreprocess, self).__init__()
-        self.tau = 1.0
-        self.layer_num = 3
-        self.random_feature_dim = nhid*2
-        
-        self.use_residual = True
-        self.use_bn = False
-        self.use_act = True
-        
-        self.dropout = dropout
-        self.activation = nn.ReLU()
-        
-        self.fc_convs = nn.ModuleList()
-        self.transformer_layer = nn.ModuleList()
-        self.bn = nn.ModuleList()
-        self.context_conv = nn.Conv2d(in_channels=in_dim, out_channels=nhid, kernel_size=(12, 1), stride=(12, 1))
-        
-        self.temporal_embedding = nn.Parameter(torch.empty(int(input_length/12), nhid), requires_grad=True) # (C, nhid)
-        nn.init.xavier_uniform_(self.temporal_embedding)
-        
-        for i in range(self.layer_num):
-            self.transformer_layer.append(linearized_attention(nhid, nhid, self.dropout, self.random_feature_dim, self.tau))
-            self.bn.append(nn.LayerNorm(nhid))
-        
-        self.regression_layer = nn.Linear(nhid, output_length)
-
-        self.tiny_batch_size = tiny_batch_size
-
-    def forward(self, history_data: torch.Tensor, future_data: torch.Tensor, batch_seen: int, epoch: int, train: bool, **kwargs) -> torch.Tensor:
-        x = history_data.transpose(1,2)
-        # input: (1, 9638, 2016, 3) (B, N, T, D)
-        B, N, T, D = x.size()
-        pe = self.temporal_embedding.unsqueeze(0).expand(B*N, -1, -1) # (B*N, T/12, nhid)
-        
-        x = x.reshape(B*N, T, D)
-        x = x.permute(0, 2, 1).unsqueeze(-1) # (B*N, T, D) -> (B*N, D, T, 1)
-
-        # convolution layer
-        x = self.context_conv(x) # (B*N, D, T, 1) -> (B*N, nhid, T/12, 1)
-        x = x.squeeze(-1) # (B*N, nhid, T/12)
-
-        # temporal embedding layer
-        x = x.permute(0, 2, 1) # (B*N, T/12, nhid)
-        x = x+pe # (B*N, T/12, nhid)
-
-        # linearized attention
-        for num in range(self.layer_num):
-            residual = x # (B*N, T/12, nhid)
-            x = self.transformer_layer[num](x) # (B*N, T/12, nhid)
-            x = self.bn[num](x)
-            x = x+residual # (B*N, T/12, nhid)
-
-        x = self.activation(x) # (B*N, T/12, nhid)
-        x = x[:, -1, :]
-        # x = torch.sum(x, dim=1) # (B*N, nhid)
-        feat = x.view(B, N, -1) # (B, N, nhid)
-        x = self.regression_layer(feat) # (B, N, output_length)
-        return {'prediction': x.transpose(1,2).unsqueeze(-1), 'feat':feat}
\ No newline at end of file
diff --git a/baselines/BigSTPreprocess/runner/__init__.py b/baselines/BigSTPreprocess/runner/__init__.py
deleted file mode 100644
index 2a0ecce8..00000000
--- a/baselines/BigSTPreprocess/runner/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-from .bigstpreprocess_runner import BigSTPreprocessRunner
\ No newline at end of file
diff --git a/baselines/BigSTPreprocess/runner/bigstpreprocess_runner.py b/baselines/BigSTPreprocess/runner/bigstpreprocess_runner.py
deleted file mode 100644
index 74b9a607..00000000
--- a/baselines/BigSTPreprocess/runner/bigstpreprocess_runner.py
+++ /dev/null
@@ -1,49 +0,0 @@
-from typing import Tuple, Union, Dict
-import torch
-import numpy as np
-import wandb
-import pdb
-import os
-
-from basicts.runners import SimpleTimeSeriesForecastingRunner
-
-
-class BigSTPreprocessRunner(SimpleTimeSeriesForecastingRunner):
-    def __init__(self, cfg: dict):
-        super().__init__(cfg)
-        
-        self.tiny_batch_size = cfg.MODEL.PARAM.tiny_batch_size
-   
-    def preprocessing(self, input_data: Dict) -> Dict:
-        """Preprocess data.
-
-        Args:
-            input_data (Dict): Dictionary containing data to be processed.
-
-        Returns:
-            Dict: Processed data.
-        """
-
-        input_data = super().preprocessing(input_data)
-        
-        x = input_data['inputs']
-        y = input_data['target']
-        
-
-        B, T, N, F = x.shape
-        batch_num = int(B * N / self.tiny_batch_size) # 似乎要确保不能等于0
-        idx_perm = np.random.permutation([i for i in range(B*N)])
-
-        for j in range(batch_num):
-            if j==batch_num-1:
-                x_ = x[:, :, idx_perm[(j+1)*self.tiny_batch_size:], :]
-                y_ = y[:, :, idx_perm[(j+1)*self.tiny_batch_size:], :]
-            else:
-                x_ = x[:, :, idx_perm[j*self.tiny_batch_size:(j+1)*self.tiny_batch_size], :]
-                y_ = y[:, :, idx_perm[j*self.tiny_batch_size:(j+1)*self.tiny_batch_size], :]
-
-        input_data['inputs'] = x_
-        input_data['target'] = y_
-        return input_data
-
-   
\ No newline at end of file