diff --git a/baselines/BigST/PEMS08.py b/baselines/BigST/PEMS08.py
new file mode 100644
index 00000000..9c9a4b2e
--- /dev/null
+++ b/baselines/BigST/PEMS08.py
@@ -0,0 +1,182 @@
+import os
+import sys
+import torch
+from easydict import EasyDict
+sys.path.append(os.path.abspath(__file__ + '/../../..'))
+
+from basicts.metrics import masked_mae, masked_mape, masked_rmse
+from basicts.data import TimeSeriesForecastingDataset
+from basicts.runners import SimpleTimeSeriesForecastingRunner
+from basicts.scaler import ZScoreScaler
+from basicts.utils import get_regular_settings, load_adj
+
+from .arch import BigST
+from .loss import bigst_loss
+
+############################## Hot Parameters ##############################
+# Dataset & Metrics configuration
+DATA_NAME = 'PEMS08'  # Dataset name
+regular_settings = get_regular_settings(DATA_NAME)
+INPUT_LEN = 2016  # Length of input sequence; deliberately overrides regular_settings['INPUT_LEN']
+OUTPUT_LEN = 12  # Length of output sequence; overrides regular_settings['OUTPUT_LEN']
+TRAIN_VAL_TEST_RATIO = regular_settings['TRAIN_VAL_TEST_RATIO']  # Train/Validation/Test split ratios
+NORM_EACH_CHANNEL = regular_settings['NORM_EACH_CHANNEL']  # Whether to normalize each channel of the data
+RESCALE = regular_settings['RESCALE']  # Whether to rescale the data
+NULL_VAL = regular_settings['NULL_VAL']  # Null value in the data
+# Model architecture and parameters
+# Checkpoint produced by the BigSTPreprocess stage (see PreprocessPEMS08.py);
+# the hash segment in the path differs per run.
+PREPROCESSED_FILE = "checkpoints/BigSTPreprocess/PEMS08_100_2016_12/db8308a2c87de35e5f3db6177c5714ff/BigSTPreprocess_best_val_MAE.pt"
+MODEL_ARCH = BigST
+
+adj_mx, _ = load_adj("datasets/" + DATA_NAME + "/adj_mx.pkl", "doubletransition")
+MODEL_PARAM = {
+    "bigst_args": {
+        "num_nodes": 170,
+        "seq_num": 12,
+        "in_dim": 3,
+        "out_dim": OUTPUT_LEN,  # the original code hard-codes this to 12
+        "hid_dim": 32,
+        "tau": 0.25,
+        "random_feature_dim": 64,
+        "node_emb_dim": 32,
+        "time_emb_dim": 32,
+        "use_residual": True,
+        "use_bn": True,
+        "use_long": True,
+        "use_spatial": True,
+        "dropout": 0.3,
+        "supports": [torch.tensor(i) for i in adj_mx],
+        "time_of_day_size": 288,
+        "day_of_week_size": 7
+    },
+    "preprocess_path": PREPROCESSED_FILE,
+    "preprocess_args": {
+        "num_nodes": 170,
+        "in_dim": 3,
+        "dropout": 0.3,
+        "input_length": 2016,
+        "output_length": 12,
+        "nhid": 32,
+        "tiny_batch_size": 64,
+    }
+}
+
+NUM_EPOCHS = 100
+
+############################## General Configuration ##############################
+CFG = EasyDict()
+# General settings
+CFG.DESCRIPTION = 'An Example Config'
+CFG.GPU_NUM = 1  # Number of GPUs to use (0 for CPU mode)
+# Runner
+CFG.RUNNER = SimpleTimeSeriesForecastingRunner
+
+############################## Environment Configuration ##############################
+
+CFG.ENV = EasyDict()  # Environment settings. Default: None
+CFG.ENV.SEED = 0  # Random seed. Default: None
+
+############################## Dataset Configuration ##############################
+CFG.DATASET = EasyDict()
+# Dataset settings
+CFG.DATASET.NAME = DATA_NAME
+CFG.DATASET.TYPE = TimeSeriesForecastingDataset
+CFG.DATASET.PARAM = EasyDict({
+    'dataset_name': DATA_NAME,
+    'train_val_test_ratio': TRAIN_VAL_TEST_RATIO,
+    'input_len': INPUT_LEN,
+    'output_len': OUTPUT_LEN,
+    # 'mode' is automatically set by the runner
+})
+
+############################## Scaler Configuration ##############################
+CFG.SCALER = EasyDict()
+# Scaler settings
+CFG.SCALER.TYPE = ZScoreScaler  # Scaler class
+CFG.SCALER.PARAM = EasyDict({
+    'dataset_name': DATA_NAME,
+    'train_ratio': TRAIN_VAL_TEST_RATIO[0],
+    'norm_each_channel': NORM_EACH_CHANNEL,
+    'rescale': RESCALE,
+})
+
+############################## Model Configuration ##############################
+CFG.MODEL = EasyDict()
+# Model settings
+CFG.MODEL.NAME = MODEL_ARCH.__name__
+CFG.MODEL.ARCH = MODEL_ARCH
+CFG.MODEL.PARAM = MODEL_PARAM
+CFG.MODEL.FORWARD_FEATURES = [0, 1, 2]
+CFG.MODEL.TARGET_FEATURES = [0]
+
+############################## Metrics Configuration ##############################
+
+CFG.METRICS = EasyDict()
+# Metrics settings
+CFG.METRICS.FUNCS = EasyDict({
+    'MAE': masked_mae,
+    'MAPE': masked_mape,
+    'RMSE': masked_rmse,
+})
+CFG.METRICS.TARGET = 'MAE'
+CFG.METRICS.NULL_VAL = NULL_VAL
+
+############################## Training Configuration ##############################
+CFG.TRAIN = EasyDict()
+CFG.TRAIN.NUM_EPOCHS = NUM_EPOCHS
+CFG.TRAIN.CKPT_SAVE_DIR = os.path.join(
+    'checkpoints',
+    MODEL_ARCH.__name__,
+    '_'.join([DATA_NAME, str(CFG.TRAIN.NUM_EPOCHS), str(INPUT_LEN), str(OUTPUT_LEN)])
+)
+# The spatial regularization term only exists when 'use_spatial' is enabled
+CFG.TRAIN.LOSS = bigst_loss if MODEL_PARAM['bigst_args']['use_spatial'] else masked_mae
+# Optimizer settings
+CFG.TRAIN.OPTIM = EasyDict()
+CFG.TRAIN.OPTIM.TYPE = "AdamW"
+CFG.TRAIN.OPTIM.PARAM = {
+    "lr": 0.002,
+    "weight_decay": 0.0001,
+}
+# Learning rate scheduler settings
+CFG.TRAIN.LR_SCHEDULER = EasyDict()
+CFG.TRAIN.LR_SCHEDULER.TYPE = "MultiStepLR"
+CFG.TRAIN.LR_SCHEDULER.PARAM = {
+    "milestones": [1, 50],
+    "gamma": 0.5
+}
+# Train data loader settings
+CFG.TRAIN.DATA = EasyDict()
+CFG.TRAIN.DATA.BATCH_SIZE = 64
+CFG.TRAIN.DATA.SHUFFLE = True
+# Gradient clipping settings
+CFG.TRAIN.CLIP_GRAD_PARAM = {
+    "max_norm": 5.0
+}
+
+############################## Validation Configuration ##############################
+CFG.VAL = EasyDict()
+CFG.VAL.INTERVAL = 1
+CFG.VAL.DATA = EasyDict()
+CFG.VAL.DATA.BATCH_SIZE = 64
+
+############################## Test Configuration ##############################
+CFG.TEST = EasyDict()
+CFG.TEST.INTERVAL = 1
+CFG.TEST.DATA = EasyDict()
+CFG.TEST.DATA.BATCH_SIZE = 64
+
+############################## Evaluation Configuration ##############################
+CFG.EVAL = EasyDict()
+
+# Evaluation parameters
+CFG.EVAL.HORIZONS = [3, 6, 12]  # Prediction horizons for evaluation. Default: []
+CFG.EVAL.USE_GPU = True  # Whether to use GPU for evaluation. Default: True
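
BigST is trained in two stages: the long-sequence feature extractor (BigSTPreprocess, configured in PreprocessPEMS08.py below) is trained first, and the BigST config above then points PREPROCESSED_FILE at the resulting checkpoint. A minimal launch sketch, assuming BasicTS's usual launch_training entry point (the hash segment in the checkpoint path differs per run, so PREPROCESSED_FILE must be updated accordingly):

from basicts import launch_training

# Stage 1: pre-train the long-term feature extractor.
launch_training('baselines/BigST/PreprocessPEMS08.py', gpus='0')

# Stage 2: train BigST itself; PREPROCESSED_FILE above must point at the
# best checkpoint written by stage 1.
launch_training('baselines/BigST/PEMS08.py', gpus='0')
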
diff --git a/baselines/BigST/PreprocessPEMS08.py b/baselines/BigST/PreprocessPEMS08.py
new file mode 100644
index 00000000..39d7f4b9
--- /dev/null
+++ b/baselines/BigST/PreprocessPEMS08.py
@@ -0,0 +1,153 @@
+import os
+import sys
+from easydict import EasyDict
+sys.path.append(os.path.abspath(__file__ + '/../../..'))
+
+from basicts.metrics import masked_mae, masked_mape, masked_rmse
+from basicts.data import TimeSeriesForecastingDataset
+from basicts.scaler import ZScoreScaler
+from basicts.utils import get_regular_settings
+
+from .arch import BigSTPreprocess
+from .runner import BigSTPreprocessRunner
+
+############################## Hot Parameters ##############################
+# Dataset & Metrics configuration
+DATA_NAME = 'PEMS08'  # Dataset name
+regular_settings = get_regular_settings(DATA_NAME)
+INPUT_LEN = 2016  # Length of input sequence; overrides regular_settings['INPUT_LEN']
+OUTPUT_LEN = 12  # Length of output sequence
+TRAIN_VAL_TEST_RATIO = regular_settings['TRAIN_VAL_TEST_RATIO']  # Train/Validation/Test split ratios
+NORM_EACH_CHANNEL = regular_settings['NORM_EACH_CHANNEL']  # Whether to normalize each channel of the data
+RESCALE = regular_settings['RESCALE']  # Whether to rescale the data
+NULL_VAL = regular_settings['NULL_VAL']  # Null value in the data
+# Model architecture and parameters
+MODEL_ARCH = BigSTPreprocess
+MODEL_PARAM = {
+    "num_nodes": 170,
+    "in_dim": 3,
+    "dropout": 0.3,
+    "input_length": INPUT_LEN,
+    "output_length": OUTPUT_LEN,
+    "nhid": 32,
+    "tiny_batch_size": 64,
+}
+
+NUM_EPOCHS = 100
+
+############################## General Configuration ##############################
+CFG = EasyDict()
+# General settings
+CFG.DESCRIPTION = 'An Example Config'
+CFG.GPU_NUM = 1  # Number of GPUs to use (0 for CPU mode)
+# Runner
+CFG.RUNNER = BigSTPreprocessRunner
+
+############################## Environment Configuration ##############################
+
+CFG.ENV = EasyDict()  # Environment settings. Default: None
+CFG.ENV.SEED = 0  # Random seed. Default: None
+
+############################## Dataset Configuration ##############################
+CFG.DATASET = EasyDict()
+# Dataset settings
+CFG.DATASET.NAME = DATA_NAME
+CFG.DATASET.TYPE = TimeSeriesForecastingDataset
+CFG.DATASET.PARAM = EasyDict({
+    'dataset_name': DATA_NAME,
+    'train_val_test_ratio': TRAIN_VAL_TEST_RATIO,
+    'input_len': INPUT_LEN,
+    'output_len': OUTPUT_LEN,
+    # 'mode' is automatically set by the runner
+})
+
+############################## Scaler Configuration ##############################
+CFG.SCALER = EasyDict()
+# Scaler settings
+CFG.SCALER.TYPE = ZScoreScaler  # Scaler class
+CFG.SCALER.PARAM = EasyDict({
+    'dataset_name': DATA_NAME,
+    'train_ratio': TRAIN_VAL_TEST_RATIO[0],
+    'norm_each_channel': NORM_EACH_CHANNEL,
+    'rescale': RESCALE,
+})
+
+############################## Model Configuration ##############################
+CFG.MODEL = EasyDict()
+# Model settings
+CFG.MODEL.NAME = MODEL_ARCH.__name__
+CFG.MODEL.ARCH = MODEL_ARCH
+CFG.MODEL.PARAM = MODEL_PARAM
+CFG.MODEL.FORWARD_FEATURES = [0, 1, 2]
+CFG.MODEL.TARGET_FEATURES = [0]
+
+############################## Metrics Configuration ##############################
+
+CFG.METRICS = EasyDict()
+# Metrics settings
+CFG.METRICS.FUNCS = EasyDict({
+    'MAE': masked_mae,
+    'MAPE': masked_mape,
+    'RMSE': masked_rmse,
+})
+CFG.METRICS.TARGET = 'MAE'
+CFG.METRICS.NULL_VAL = NULL_VAL
+
+############################## Training Configuration ##############################
+CFG.TRAIN = EasyDict()
+CFG.TRAIN.NUM_EPOCHS = NUM_EPOCHS
+CFG.TRAIN.CKPT_SAVE_DIR = os.path.join(
+    'checkpoints',
+    MODEL_ARCH.__name__,
+    '_'.join([DATA_NAME, str(CFG.TRAIN.NUM_EPOCHS), str(INPUT_LEN), str(OUTPUT_LEN)])
+)
+CFG.TRAIN.LOSS = masked_mae
+# Optimizer settings
+CFG.TRAIN.OPTIM = EasyDict()
+CFG.TRAIN.OPTIM.TYPE = "AdamW"
+CFG.TRAIN.OPTIM.PARAM = {
+    "lr": 0.002,
+    "weight_decay": 0.0001,
+}
+# Learning rate scheduler settings
+CFG.TRAIN.LR_SCHEDULER = EasyDict()
+CFG.TRAIN.LR_SCHEDULER.TYPE = "MultiStepLR"
+CFG.TRAIN.LR_SCHEDULER.PARAM = {
+    "milestones": [1, 50],
+    "gamma": 0.5
+}
+# Train data loader settings
+CFG.TRAIN.DATA = EasyDict()
+CFG.TRAIN.DATA.BATCH_SIZE = 1
+CFG.TRAIN.DATA.SHUFFLE = True
+# Gradient clipping settings
+CFG.TRAIN.CLIP_GRAD_PARAM = {
+    "max_norm": 5.0
+}
+
+############################## Validation Configuration ##############################
+CFG.VAL = EasyDict()
+CFG.VAL.INTERVAL = 1
+CFG.VAL.DATA = EasyDict()
+CFG.VAL.DATA.BATCH_SIZE = 1
+
+############################## Test Configuration ##############################
+CFG.TEST = EasyDict()
+CFG.TEST.INTERVAL = 1
+CFG.TEST.DATA = EasyDict()
+CFG.TEST.DATA.BATCH_SIZE = 1
+
+############################## Evaluation Configuration ##############################
+
+CFG.EVAL = EasyDict()
+
+# Evaluation parameters
+CFG.EVAL.HORIZONS = [3, 6, 12]  # Prediction horizons for evaluation. Default: []
+CFG.EVAL.USE_GPU = True  # Whether to use GPU for evaluation. Default: True
diff --git a/baselines/BigST/arch/__init__.py b/baselines/BigST/arch/__init__.py
new file mode 100644
index 00000000..e2d419fd
--- /dev/null
+++ b/baselines/BigST/arch/__init__.py
@@ -0,0 +1,5 @@
+from .bigst_arch import BigST
+from .preprocess import BigSTPreprocess
+
+
+__all__ = ["BigST", "BigSTPreprocess"]
diff --git a/baselines/BigST/arch/bigst_arch.py b/baselines/BigST/arch/bigst_arch.py
new file mode 100644
index 00000000..dd3d0342
--- /dev/null
+++ b/baselines/BigST/arch/bigst_arch.py
@@ -0,0 +1,78 @@
+import torch
+import torch.nn as nn
+
+from .preprocess import BigSTPreprocess
+from .model import Model
+
+
+def sample_period(x, time_num):
+    # x: (B, N, T, F); average the first 12 steps of every period (e.g. every day)
+    history_length = x.shape[-2]
+    idx_list = [i for i in range(history_length)]
+    period_list = [idx_list[i:i+12] for i in range(0, history_length, time_num)]
+    period_feat = [x[:, :, sublist, 0] for sublist in period_list]
+    period_feat = torch.stack(period_feat)  # (num_periods, B, N, 12)
+    period_feat = torch.mean(period_feat, dim=0)  # (B, N, 12)
+
+    return period_feat
+
+
+class BigST(nn.Module):
+    """
+    Paper: BigST: Linear Complexity Spatio-Temporal Graph Neural Network for Traffic Forecasting on Large-Scale Road Networks
+    Link: https://dl.acm.org/doi/10.14778/3641204.3641217
+    Official Code: https://github.com/usail-hkust/BigST?tab=readme-ov-file
+    Venue: VLDB 2024
+    Task: Spatial-Temporal Forecasting
+    """
+
+    def __init__(self, bigst_args, preprocess_path, preprocess_args):
+        super(BigST, self).__init__()
+
+        self.use_long = bigst_args['use_long']
+        self.in_dim = bigst_args['in_dim']
+        self.out_dim = bigst_args['out_dim']
+        self.time_num = bigst_args['time_of_day_size']
+        self.bigst = Model(**bigst_args)
+
+        if self.use_long:
+            self.feat_extractor = BigSTPreprocess(**preprocess_args)
+            self.load_pre_trained_model(preprocess_path)
+
+    def load_pre_trained_model(self, preprocess_path):
+        """Load the pre-trained long-term feature extractor."""
+
+        # load parameters
+        checkpoint_dict = torch.load(preprocess_path, map_location='cpu')
+        self.feat_extractor.load_state_dict(checkpoint_dict["model_state_dict"])
+        # freeze parameters
+        for param in self.feat_extractor.parameters():
+            param.requires_grad = False
+
+        self.feat_extractor.eval()
+
+    def forward(self, history_data: torch.Tensor, future_data: torch.Tensor, batch_seen: int, epoch: int, train: bool, **kwargs) -> torch.Tensor:
+        history_data = history_data.transpose(1, 2)  # (B, N, T, D)
+        x = history_data[:, :, -self.out_dim:]  # (B, N, out_dim, D): the last out_dim steps as short-term input
+
+        if self.use_long:
+            # extract the long-term feature sample by sample to bound memory usage
+            feat = []
+            for i in range(history_data.shape[0]):
+                with torch.no_grad():
+                    feat_sample = self.feat_extractor(history_data[[i], :, :, :], future_data, batch_seen, epoch, train)
+                    feat.append(feat_sample['feat'])
+
+            feat = torch.cat(feat, dim=0)
+            feat_period = sample_period(history_data, self.time_num)
+            feat = torch.cat([feat, feat_period], dim=2)
+
+            return self.bigst(x, feat)
+
+        else:
+            return self.bigst(x)
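
With INPUT_LEN = 2016 and time_of_day_size = 288 (one week of 5-minute steps), sample_period slices out the first 12 steps of each day and averages them across the seven days. A small shape check, assuming the module path used in this diff:

import torch
from baselines.BigST.arch.bigst_arch import sample_period

B, N, T = 2, 4, 2016
x = torch.randn(B, N, T, 3)       # (B, N, T, F)
out = sample_period(x, time_num=288)
print(out.shape)                  # torch.Size([2, 4, 12]): the same 12-step
                                  # window averaged over the 7 days
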
diff --git a/baselines/BigST/arch/linear_conv.py b/baselines/BigST/arch/linear_conv.py
new file mode 100644
index 00000000..34d84eab
--- /dev/null
+++ b/baselines/BigST/arch/linear_conv.py
@@ -0,0 +1,99 @@
+import math
+
+import torch
+import torch.nn as nn
+
+from .random_map import create_random_matrix, random_feature_map
+
+
+def linear_kernel(x, node_vec1, node_vec2):
+    # x: [B, N, 1, nhid], node_vec1: [B, N, 1, r], node_vec2: [B, N, 1, r]
+    node_vec1 = node_vec1.permute(1, 0, 2, 3)  # [N, B, 1, r]
+    node_vec2 = node_vec2.permute(1, 0, 2, 3)  # [N, B, 1, r]
+    x = x.permute(1, 0, 2, 3)  # [N, B, 1, nhid]
+
+    # numerator: aggregate x with the implicit attention matrix node_vec1 @ node_vec2^T
+    v2x = torch.einsum("nbhm,nbhd->bhmd", node_vec2, x)
+    out1 = torch.einsum("nbhm,bhmd->nbhd", node_vec1, v2x)  # [N, B, 1, nhid]
+
+    # denominator: row sums of the implicit attention matrix
+    one_matrix = torch.ones([node_vec2.shape[0]]).to(node_vec1.device)
+    node_vec2_sum = torch.einsum("nbhm,n->bhm", node_vec2, one_matrix)
+    out2 = torch.einsum("nbhm,bhm->nbh", node_vec1, node_vec2_sum)  # [N, B, 1]
+
+    out1 = out1.permute(1, 0, 2, 3)  # [B, N, 1, nhid]
+    out2 = out2.permute(1, 0, 2).unsqueeze(-1)  # [B, N, 1, 1]
+    out = out1 / out2  # [B, N, 1, nhid]
+
+    return out
+
+
+class conv_approximation(nn.Module):
+    def __init__(self, dropout, tau, random_feature_dim):
+        super(conv_approximation, self).__init__()
+        self.tau = tau
+        self.random_feature_dim = random_feature_dim
+        self.activation = nn.ReLU()
+        self.dropout = dropout
+
+    def forward(self, x, node_vec1, node_vec2):
+        B = x.size(0)  # (B, N, 1, nhid)
+        dim = node_vec1.shape[-1]  # feature dimension d of the node vectors
+
+        # derive a data-dependent seed so both feature maps share one random matrix
+        random_seed = torch.ceil(torch.abs(torch.sum(node_vec1) * 1e8)).to(torch.int32)
+        random_matrix = create_random_matrix(self.random_feature_dim, dim, seed=random_seed).to(node_vec1.device)  # (r, d)
+
+        node_vec1 = node_vec1 / math.sqrt(self.tau)
+        node_vec2 = node_vec2 / math.sqrt(self.tau)
+        node_vec1_prime = random_feature_map(node_vec1, True, random_matrix)  # [B, N, 1, r]
+        node_vec2_prime = random_feature_map(node_vec2, False, random_matrix)  # [B, N, 1, r]
+
+        x = linear_kernel(x, node_vec1_prime, node_vec2_prime)
+
+        return x, node_vec1_prime, node_vec2_prime
+
+
+class linearized_conv(nn.Module):
+    def __init__(self, in_dim, hid_dim, dropout, tau=1.0, random_feature_dim=64):
+        super(linearized_conv, self).__init__()
+
+        self.dropout = dropout
+        self.tau = tau
+        self.random_feature_dim = random_feature_dim
+
+        self.input_fc = nn.Conv2d(in_channels=in_dim, out_channels=hid_dim, kernel_size=(1, 1), bias=True)
+        self.activation = nn.ReLU()
+        self.dropout_layer = nn.Dropout(p=dropout)
+
+        self.conv_app_layer = conv_approximation(self.dropout, self.tau, self.random_feature_dim)
+
+    def forward(self, input_data, node_vec1, node_vec2):
+        x = self.input_fc(input_data)
+        x = self.activation(x)
+        x = self.dropout_layer(x)
+
+        x = x.permute(0, 2, 3, 1)  # (B, N, 1, hid_dim)
+        x, node_vec1_prime, node_vec2_prime = self.conv_app_layer(x, node_vec1, node_vec2)
+        x = x.permute(0, 3, 1, 2)  # (B, hid_dim, N, 1)
+
+        return x, node_vec1_prime, node_vec2_prime
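
The two einsums in linear_kernel avoid ever materializing the N x N attention matrix, so the aggregation is linear in N. A quick self-check (a sketch, assuming the module path in this diff) compares it against the dense row-normalized attention it computes implicitly:

import torch
from baselines.BigST.arch.linear_conv import linear_kernel

B, N, H, r, d = 2, 5, 1, 4, 8
x = torch.randn(B, N, H, d)
v1 = torch.rand(B, N, H, r)   # stands in for node_vec1_prime (positive features)
v2 = torch.rand(B, N, H, r)   # stands in for node_vec2_prime

A = torch.einsum("bnhr,bmhr->bhnm", v1, v2)   # dense attention, [B, H, N, N]
ref = torch.einsum("bhnm,bmhd->bnhd", A / A.sum(-1, keepdim=True), x)

print(torch.allclose(linear_kernel(x, v1, v2), ref, atol=1e-5))  # True
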
diff --git a/baselines/BigST/arch/model.py b/baselines/BigST/arch/model.py
new file mode 100644
index 00000000..31063da6
--- /dev/null
+++ b/baselines/BigST/arch/model.py
@@ -0,0 +1,122 @@
+import torch
+import torch.nn as nn
+
+from .linear_conv import linearized_conv
+
+
+class Model(nn.Module):
+    def __init__(self, seq_num, in_dim, out_dim, hid_dim, num_nodes, tau, random_feature_dim, node_emb_dim, time_emb_dim,
+                 use_residual, use_bn, use_spatial, use_long, dropout, time_of_day_size, day_of_week_size, supports=None, edge_indices=None):
+        super(Model, self).__init__()
+
+        self.tau = tau
+        self.layer_num = 3
+        self.in_dim = in_dim
+        self.random_feature_dim = random_feature_dim
+
+        self.use_residual = use_residual
+        self.use_bn = use_bn
+        self.use_spatial = use_spatial
+        self.use_long = use_long
+
+        self.dropout = dropout
+        self.activation = nn.ReLU()
+        self.supports = supports
+
+        self.time_num = time_of_day_size
+        self.week_num = day_of_week_size
+
+        # node embedding layer
+        self.node_emb_layer = nn.Parameter(torch.empty(num_nodes, node_emb_dim))
+        nn.init.xavier_uniform_(self.node_emb_layer)
+
+        # time embedding layers
+        self.time_emb_layer = nn.Parameter(torch.empty(self.time_num, time_emb_dim))
+        nn.init.xavier_uniform_(self.time_emb_layer)
+        self.week_emb_layer = nn.Parameter(torch.empty(self.week_num, time_emb_dim))
+        nn.init.xavier_uniform_(self.week_emb_layer)
+
+        # input embedding layer
+        self.input_emb_layer = nn.Conv2d(seq_num*in_dim, hid_dim, kernel_size=(1, 1), bias=True)
+
+        self.W_1 = nn.Conv2d(node_emb_dim+time_emb_dim*2, hid_dim, kernel_size=(1, 1), bias=True)
+        self.W_2 = nn.Conv2d(node_emb_dim+time_emb_dim*2, hid_dim, kernel_size=(1, 1), bias=True)
+
+        self.linear_conv = nn.ModuleList()
+        self.bn = nn.ModuleList()
+
+        for i in range(self.layer_num):
+            self.linear_conv.append(linearized_conv(hid_dim*4, hid_dim*4, self.dropout, self.tau, self.random_feature_dim))
+            self.bn.append(nn.LayerNorm(hid_dim*4))
+
+        # the regression head sees the first and the last block of the residual pool
+        if self.use_long:
+            self.regression_layer = nn.Conv2d(hid_dim*4*2+hid_dim+seq_num, out_dim, kernel_size=(1, 1), bias=True)
+        else:
+            self.regression_layer = nn.Conv2d(hid_dim*4*2, out_dim, kernel_size=(1, 1), bias=True)
+
+    def forward(self, x, feat=None):
+        # x: (B, N, T, D)
+        B, N, T, D = x.size()
+
+        time_emb = self.time_emb_layer[(x[:, :, -1, 1] * self.time_num).long()]
+        week_emb = self.week_emb_layer[(x[:, :, -1, 2]).long()]
+
+        # input embedding
+        x = x.contiguous().view(B, N, -1).transpose(1, 2).unsqueeze(-1)  # (B, T*D, N, 1)
+        input_emb = self.input_emb_layer(x)  # (B, hid_dim, N, 1)
+
+        # node embeddings
+        node_emb = self.node_emb_layer.unsqueeze(0).expand(B, -1, -1).transpose(1, 2).unsqueeze(-1)  # (B, node_emb_dim, N, 1)
+
+        # time embeddings
+        time_emb = time_emb.transpose(1, 2).unsqueeze(-1)  # (B, time_emb_dim, N, 1)
+        week_emb = week_emb.transpose(1, 2).unsqueeze(-1)  # (B, time_emb_dim, N, 1)
+
+        x_g = torch.cat([node_emb, time_emb, week_emb], dim=1)  # (B, node_emb_dim+2*time_emb_dim, N, 1)
+        x = torch.cat([input_emb, node_emb, time_emb, week_emb], dim=1)  # (B, hid_dim*4, N, 1)
+
+        # linearized spatial convolution
+        x_pool = [x]  # (B, hid_dim*4, N, 1)
+        node_vec1 = self.W_1(x_g)  # (B, hid_dim, N, 1)
+        node_vec2 = self.W_2(x_g)  # (B, hid_dim, N, 1)
+        node_vec1 = node_vec1.permute(0, 2, 3, 1)  # (B, N, 1, hid_dim)
+        node_vec2 = node_vec2.permute(0, 2, 3, 1)  # (B, N, 1, hid_dim)
+        for i in range(self.layer_num):
+            if self.use_residual:
+                residual = x
+            x, node_vec1_prime, node_vec2_prime = self.linear_conv[i](x, node_vec1, node_vec2)
+
+            if self.use_residual:
+                x = x + residual
+
+            if self.use_bn:
+                x = x.permute(0, 2, 3, 1)  # (B, N, 1, hid_dim*4)
+                x = self.bn[i](x)
+                x = x.permute(0, 3, 1, 2)
+
+        # pool the input block and the final block; this matches the
+        # regression head's hid_dim*4*2 input channels
+        x_pool.append(x)
+        x = torch.cat(x_pool, dim=1)  # (B, hid_dim*4*2, N, 1)
+
+        x = self.activation(x)
+
+        if self.use_long:
+            feat = feat.permute(0, 2, 1).unsqueeze(-1)  # (B, nhid+seq_num, N, 1)
+            x = torch.cat([x, feat], dim=1)
+            x = self.regression_layer(x)  # (B, out_dim, N, 1)
+            x = x.squeeze(-1).permute(0, 2, 1)  # (B, N, out_dim)
+        else:
+            x = self.regression_layer(x)  # (B, out_dim, N, 1)
+            x = x.squeeze(-1).permute(0, 2, 1)  # (B, N, out_dim)
+
+        return {'prediction': x.transpose(1, 2).unsqueeze(-1),  # (B, out_dim, N, 1)
+                'node_vec1': node_vec1_prime,
+                'node_vec2': node_vec2_prime,
+                'supports': self.supports,
+                'use_spatial': self.use_spatial}
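
A minimal smoke test with toy dimensions (a sketch, assuming the module path in this diff; use_long=False so no pre-trained features are needed) confirms the output contract expected by BasicTS:

import torch
from baselines.BigST.arch.model import Model

model = Model(seq_num=12, in_dim=3, out_dim=12, hid_dim=8, num_nodes=5,
              tau=0.25, random_feature_dim=16, node_emb_dim=8, time_emb_dim=8,
              use_residual=True, use_bn=True, use_spatial=False, use_long=False,
              dropout=0.1, time_of_day_size=288, day_of_week_size=7)
x = torch.rand(2, 5, 12, 3)     # (B, N, T, D); feature 1: time of day in [0, 1),
                                # feature 2: day-of-week index
out = model(x)
print(out['prediction'].shape)  # torch.Size([2, 12, 5, 1])
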
diff --git a/baselines/BigST/arch/pipeline.py b/baselines/BigST/arch/pipeline.py
new file mode 100644
index 00000000..fd4122cb
--- /dev/null
+++ b/baselines/BigST/arch/pipeline.py
@@ -0,0 +1,67 @@
+# Stand-alone training pipeline kept from the official BigST implementation for
+# reference; it is not imported by this baseline (BasicTS drives training through
+# the configs and the runner instead), so its `metrics` and `bigst` imports only
+# resolve inside the official repository.
+import torch
+import torch.optim as optim
+
+import metrics
+from bigst import bigst
+
+
+class train_pipeline():
+    def __init__(self, scaler, seq_num, in_dim, hid_dim, num_nodes, tau, random_feature_dim, node_emb_dim, time_emb_dim,
+                 use_residual, use_bn, use_spatial, use_long, dropout, lrate, wdecay, device, supports, edge_indices):
+        self.model = bigst(device, seq_num, in_dim, hid_dim, num_nodes, tau, random_feature_dim, node_emb_dim, time_emb_dim,
+                           use_residual, use_bn, use_spatial, use_long, dropout, supports=supports, edge_indices=edge_indices)
+        self.model.to(device)
+        self.optimizer = optim.Adam(self.model.parameters(), lr=lrate, weight_decay=wdecay)
+        self.loss = metrics.masked_mae
+        self.scaler = scaler
+        self.use_spatial = use_spatial
+        self.clip = 5
+
+    def train(self, input, real_val, feat=None):
+        self.model.train()
+        self.optimizer.zero_grad()
+
+        if self.use_spatial:
+            output, spatial_loss = self.model(input, feat)
+            real = self.scaler.inverse_transform(real_val)
+            predict = self.scaler.inverse_transform(output)
+            loss = self.loss(predict, real, 0.0) - 0.3 * spatial_loss
+        else:
+            output, _ = self.model(input, feat)
+            real = self.scaler.inverse_transform(real_val)
+            predict = self.scaler.inverse_transform(output)
+            loss = self.loss(predict, real, 0.0)
+
+        loss.backward()
+        if self.clip is not None:
+            torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.clip)
+        self.optimizer.step()
+        mape = metrics.masked_mape(predict, real, 0.0).item()
+        rmse = metrics.masked_rmse(predict, real, 0.0).item()
+        return loss.item(), mape, rmse
+
+    def eval(self, input, real_val, feat=None, flag='overall'):
+        self.model.eval()
+        with torch.no_grad():
+            output, _ = self.model(input, feat)
+        real = self.scaler.inverse_transform(real_val)
+        predict = self.scaler.inverse_transform(output)
+        if flag == 'overall':
+            loss = self.loss(predict, real, 0.0).item()
+            mape = metrics.masked_mape(predict, real, 0.0).item()
+            rmse = metrics.masked_rmse(predict, real, 0.0).item()
+            return loss, mape, rmse
+        elif flag == 'horizon':
+            loss, mape, rmse = [], [], []
+            for i in range(12):
+                loss.append(self.loss(predict[..., i], real[..., i], 0.0).item())
+                mape.append(metrics.masked_mape(predict[..., i], real[..., i], 0.0).item())
+                rmse.append(metrics.masked_rmse(predict[..., i], real[..., i], 0.0).item())
+            return loss, mape, rmse
diff --git a/baselines/BigST/arch/preprocess.py b/baselines/BigST/arch/preprocess.py
new file mode 100644
index 00000000..02926a47
--- /dev/null
+++ b/baselines/BigST/arch/preprocess.py
@@ -0,0 +1,219 @@
+import math
+
+import numpy as np
+import torch
+import torch.nn as nn
+
+
+def create_projection_matrix(m, d, seed=0, scaling=0, struct_mode=False):
+    # stack of orthogonal random blocks, as in Performer-style random features
+    nb_full_blocks = int(m/d)
+    block_list = []
+    current_seed = seed
+    for _ in range(nb_full_blocks):
+        torch.manual_seed(current_seed)
+        if struct_mode:
+            q = create_products_of_givens_rotations(d, current_seed)
+        else:
+            unstructured_block = torch.randn((d, d))
+            q, _ = torch.linalg.qr(unstructured_block)  # torch.qr is deprecated
+            q = torch.t(q)
+        block_list.append(q)
+        current_seed += 1
+    remaining_rows = m - nb_full_blocks * d
+    if remaining_rows > 0:
+        torch.manual_seed(current_seed)
+        if struct_mode:
+            q = create_products_of_givens_rotations(d, current_seed)
+        else:
+            unstructured_block = torch.randn((d, d))
+            q, _ = torch.linalg.qr(unstructured_block)
+            q = torch.t(q)
+        block_list.append(q[0:remaining_rows])
+    final_matrix = torch.vstack(block_list)
+
+    current_seed += 1
+    torch.manual_seed(current_seed)
+    if scaling == 0:
+        multiplier = torch.norm(torch.randn((m, d)), dim=1)
+    elif scaling == 1:
+        multiplier = torch.sqrt(torch.tensor(float(d))) * torch.ones(m)
+    else:
+        raise ValueError("Scaling must be one of {0, 1}. Was %s" % scaling)
+
+    return torch.matmul(torch.diag(multiplier), final_matrix)
+
+
+def create_products_of_givens_rotations(dim, seed):
+    nb_givens_rotations = dim * int(math.ceil(math.log(float(dim))))
+    q = np.eye(dim, dim)
+    np.random.seed(seed)
+    for _ in range(nb_givens_rotations):
+        random_angle = math.pi * np.random.uniform()
+        random_indices = np.random.choice(dim, 2)
+        index_i = min(random_indices[0], random_indices[1])
+        index_j = max(random_indices[0], random_indices[1])
+        slice_i = q[index_i]
+        slice_j = q[index_j]
+        # Givens rotation of the (i, j) plane
+        new_slice_i = math.cos(random_angle) * slice_i + math.sin(random_angle) * slice_j
+        new_slice_j = -math.sin(random_angle) * slice_i + math.cos(random_angle) * slice_j
+        q[index_i] = new_slice_i
+        q[index_j] = new_slice_j
+    return torch.tensor(q, dtype=torch.float32)
+
+
+def softmax_kernel_transformation(data, is_query, projection_matrix=None, numerical_stabilizer=0.000001):
+    # positive random-feature map approximating the softmax kernel
+    data_normalizer = 1.0 / torch.sqrt(torch.sqrt(torch.tensor(data.shape[-1], dtype=torch.float32)))
+    data = data_normalizer * data
+    ratio = 1.0 / torch.sqrt(torch.tensor(projection_matrix.shape[0], dtype=torch.float32))
+    data_dash = torch.einsum("bnhd,md->bnhm", data, projection_matrix)
+    diag_data = torch.square(data)
+    diag_data = torch.sum(diag_data, dim=len(data.shape)-1)
+    diag_data = diag_data / 2.0
+    diag_data = torch.unsqueeze(diag_data, dim=len(data.shape)-1)
+    last_dims_t = len(data_dash.shape) - 1
+    attention_dims_t = len(data_dash.shape) - 3
+    if is_query:
+        data_dash = ratio * (
+            torch.exp(data_dash - diag_data - torch.max(data_dash, dim=last_dims_t, keepdim=True)[0]) + numerical_stabilizer
+        )
+    else:
+        data_dash = ratio * (
+            torch.exp(data_dash - diag_data - torch.max(torch.max(data_dash, dim=last_dims_t, keepdim=True)[0],
+                                                        dim=attention_dims_t, keepdim=True)[0]) + numerical_stabilizer
+        )
+    return data_dash
+
+
+def numerator(qs, ks, vs):
+    kvs = torch.einsum("nbhm,nbhd->bhmd", ks, vs)  # kvs refers to U_k in the paper
+    return torch.einsum("nbhm,bhmd->nbhd", qs, kvs)
+
+
+def denominator(qs, ks):
+    all_ones = torch.ones([ks.shape[0]]).to(qs.device)
+    ks_sum = torch.einsum("nbhm,n->bhm", ks, all_ones)  # ks_sum refers to O_k in the paper
+    return torch.einsum("nbhm,bhm->nbh", qs, ks_sum)
+
+
+def linearized_softmax(x, query, key):
+    # x: [B, N, H, D], query: [B, N, H, m], key: [B, N, H, m]
+    query = query.permute(1, 0, 2, 3)  # [N, B, H, m]
+    key = key.permute(1, 0, 2, 3)  # [N, B, H, m]
+    x = x.permute(1, 0, 2, 3)  # [N, B, H, D]
+
+    z_num = numerator(query, key, x)  # [N, B, H, D]
+    z_den = denominator(query, key)  # [N, B, H]
+
+    z_num = z_num.permute(1, 0, 2, 3)  # [B, N, H, D]
+    z_den = z_den.permute(1, 0, 2)
+    z_den = torch.unsqueeze(z_den, len(z_den.shape))
+    z_output = z_num / z_den  # [B, N, H, D]
+
+    return z_output
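
softmax_kernel_transformation together with numerator/denominator implements Performer-style positive random features: with enough random features m, the linearized result approaches exact softmax attention over scores q.k/sqrt(d). A rough numerical sketch (assuming the module path in this diff; the gap shrinks as m grows):

import torch
from baselines.BigST.arch.preprocess import (
    create_projection_matrix, softmax_kernel_transformation, linearized_softmax)

B, T, H, d, m = 1, 8, 1, 4, 512
q, k, v = (torch.randn(B, T, H, d) for _ in range(3))

proj = create_projection_matrix(m, d, seed=0)
approx = linearized_softmax(v,
                            softmax_kernel_transformation(q, True, proj),
                            softmax_kernel_transformation(k, False, proj))

scores = torch.einsum("bthd,bshd->bhts", q, k) / d ** 0.5
exact = torch.einsum("bhts,bshd->bthd", torch.softmax(scores, dim=-1), v)
print((approx - exact).abs().max())  # small, and shrinking as m grows
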
torch.einsum("nbhm,bhmd->nbhd", qs, kvs) + +def denominator(qs, ks): + all_ones = torch.ones([ks.shape[0]]).to(qs.device) + ks_sum = torch.einsum("nbhm,n->bhm", ks, all_ones) # ks_sum refers to O_k in the paper + return torch.einsum("nbhm,bhm->nbh", qs, ks_sum) + +def linearized_softmax(x, query, key): + # x: [B, N, H, D] query: [B, N, H, m], key: [B, N, H, m] + query = query.permute(1, 0, 2, 3) # [N, B, H, m] + key = key.permute(1, 0, 2, 3) # [N, B, H, m] + x = x.permute(1, 0, 2, 3) # [N, B, H, D] + + z_num = numerator(query, key, x) # [N, B, H, D] + z_den = denominator(query, key) # [N, H] + + z_num = z_num.permute(1, 0, 2, 3) # [B, N, H, D] + z_den = z_den.permute(1, 0, 2) + z_den = torch.unsqueeze(z_den, len(z_den.shape)) + z_output = z_num / z_den # # [B, N, H, D] + + return z_output + +class linearized_attention(nn.Module): + def __init__(self, c_in, c_out, dropout, random_feature_dim=30, tau=1.0, num_heads=4): + super(linearized_attention, self).__init__() + self.Wk = nn.Linear(c_in, c_out * num_heads) + self.Wq = nn.Linear(c_in, c_out * num_heads) + self.Wv = nn.Linear(c_in, c_out * num_heads) + self.Wo = nn.Linear(c_out * num_heads, c_out) + self.c_in = c_in + self.c_out = c_out + self.num_heads = num_heads + self.tau = tau + self.random_feature_dim = random_feature_dim + self.activation = nn.ReLU + self.dropout = dropout + + def reset_parameters(self): + self.Wk.reset_parameters() + self.Wq.reset_parameters() + self.Wv.reset_parameters() + self.Wo.reset_parameters() + + def forward(self, x): + B, T = x.size(0), x.size(1) # (B, T, D) + query = self.Wq(x).reshape(-1, T, self.num_heads, self.c_out) # (B, T, H, D) + key = self.Wk(x).reshape(-1, T, self.num_heads, self.c_out) # (B, T, H, D) + x = self.Wv(x).reshape(-1, T, self.num_heads, self.c_out) # (B, T, H, D) + + dim = query.shape[-1] # (B, T, H, D) + seed = torch.ceil(torch.abs(torch.sum(query) * 1e8)).to(torch.int32) + projection_matrix = create_projection_matrix(self.random_feature_dim, dim, seed=seed).to(query.device) # (d, m) + query = query / math.sqrt(self.tau) + key = key / math.sqrt(self.tau) + query = softmax_kernel_transformation(query, True, projection_matrix) # [B, T, H, m] + key = softmax_kernel_transformation(key, False, projection_matrix) # [B, T, H, m] + + x = linearized_softmax(x, query, key) + + x = self.Wo(x.flatten(-2, -1)) # (B, T, D) + + return x + + +class BigSTPreprocess(nn.Module): + """ + Paper: BigST: Linear Complexity Spatio-Temporal Graph Neural Network for Traffic Forecasting on Large-Scale Road Networks + Link: https://dl.acm.org/doi/10.14778/3641204.3641217 + Official Code: https://github.com/usail-hkust/BigST?tab=readme-ov-file + Venue: VLDB 2024 + Task: Spatial-Temporal Forecasting + """ + def __init__(self, input_length, output_length, in_dim, num_nodes, nhid, tiny_batch_size, dropout=0.3): + super(BigSTPreprocess, self).__init__() + self.tau = 1.0 + self.layer_num = 3 + self.random_feature_dim = nhid*2 + + self.use_residual = True + self.use_bn = False + self.use_act = True + + self.dropout = dropout + self.activation = nn.ReLU() + + self.fc_convs = nn.ModuleList() + self.transformer_layer = nn.ModuleList() + self.bn = nn.ModuleList() + self.context_conv = nn.Conv2d(in_channels=in_dim, out_channels=nhid, kernel_size=(12, 1), stride=(12, 1)) + + self.temporal_embedding = nn.Parameter(torch.empty(int(input_length/12), nhid), requires_grad=True) # (C, nhid) + nn.init.xavier_uniform_(self.temporal_embedding) + + for i in range(self.layer_num): + 
diff --git a/baselines/BigST/arch/random_map.py b/baselines/BigST/arch/random_map.py
new file mode 100644
index 00000000..ea7e49d4
--- /dev/null
+++ b/baselines/BigST/arch/random_map.py
@@ -0,0 +1,81 @@
+import math
+
+import numpy as np
+import torch
+
+
+def create_products_of_givens_rotations(dim, seed):
+    nb_givens_rotations = dim * int(math.ceil(math.log(float(dim))))
+    q = np.eye(dim, dim)
+    np.random.seed(seed)
+    for _ in range(nb_givens_rotations):
+        random_angle = math.pi * np.random.uniform()
+        random_indices = np.random.choice(dim, 2)
+        index_i = min(random_indices[0], random_indices[1])
+        index_j = max(random_indices[0], random_indices[1])
+        slice_i = q[index_i]
+        slice_j = q[index_j]
+        # Givens rotation of the (i, j) plane
+        new_slice_i = math.cos(random_angle) * slice_i + math.sin(random_angle) * slice_j
+        new_slice_j = -math.sin(random_angle) * slice_i + math.cos(random_angle) * slice_j
+        q[index_i] = new_slice_i
+        q[index_j] = new_slice_j
+    return torch.tensor(q, dtype=torch.float32)
+
+
+def create_random_matrix(m, d, seed=0, scaling=0, struct_mode=False):
+    nb_full_blocks = int(m/d)
+    block_list = []
+    current_seed = seed
+    for _ in range(nb_full_blocks):
+        torch.manual_seed(current_seed)
+        if struct_mode:
+            q = create_products_of_givens_rotations(d, current_seed)
+        else:
+            unstructured_block = torch.randn((d, d))
+            q, _ = torch.linalg.qr(unstructured_block)  # torch.qr is deprecated
+            q = torch.t(q)
+        block_list.append(q)
+        current_seed += 1
+    remaining_rows = m - nb_full_blocks * d
+    if remaining_rows > 0:
+        torch.manual_seed(current_seed)
+        if struct_mode:
+            q = create_products_of_givens_rotations(d, current_seed)
+        else:
+            unstructured_block = torch.randn((d, d))
+            q, _ = torch.linalg.qr(unstructured_block)
+            q = torch.t(q)
+        block_list.append(q[0:remaining_rows])
+    final_matrix = torch.vstack(block_list)
+
+    current_seed += 1
+    torch.manual_seed(current_seed)
+    if scaling == 0:
+        multiplier = torch.norm(torch.randn((m, d)), dim=1)
+    elif scaling == 1:
+        multiplier = torch.sqrt(torch.tensor(float(d))) * torch.ones(m)
+    else:
+        raise ValueError("Scaling must be one of {0, 1}. Was %s" % scaling)
+
+    return torch.matmul(torch.diag(multiplier), final_matrix)
+
+
+def random_feature_map(data, is_query, projection_matrix=None, numerical_stabilizer=0.000001):
+    # positive random-feature map shared by queries and keys (Performer-style)
+    data_normalizer = 1.0 / torch.sqrt(torch.sqrt(torch.tensor(data.shape[-1], dtype=torch.float32)))
+    data = data_normalizer * data
+    ratio = 1.0 / torch.sqrt(torch.tensor(projection_matrix.shape[0], dtype=torch.float32))
+    data_dash = torch.einsum("bnhd,md->bnhm", data, projection_matrix)
+    diag_data = torch.square(data)
+    diag_data = torch.sum(diag_data, dim=len(data.shape)-1)
+    diag_data = diag_data / 2.0
+    diag_data = torch.unsqueeze(diag_data, dim=len(data.shape)-1)
+    last_dims_t = len(data_dash.shape) - 1
+    attention_dims_t = len(data_dash.shape) - 3
+    if is_query:
+        data_dash = ratio * (
+            torch.exp(data_dash - diag_data - torch.max(data_dash, dim=last_dims_t, keepdim=True)[0]) + numerical_stabilizer
+        )
+    else:
+        data_dash = ratio * (
+            torch.exp(data_dash - diag_data - torch.max(torch.max(data_dash, dim=last_dims_t, keepdim=True)[0],
+                                                        dim=attention_dims_t, keepdim=True)[0]) + numerical_stabilizer
+        )
+    return data_dash
Was %s" % scaling) + + return torch.matmul(torch.diag(multiplier), final_matrix) + +def random_feature_map(data, is_query, projection_matrix=None, numerical_stabilizer=0.000001): + data_normalizer = 1.0 / torch.sqrt(torch.sqrt(torch.tensor(data.shape[-1], dtype=torch.float32))) + data = data_normalizer * data + ratio = 1.0 / torch.sqrt(torch.tensor(projection_matrix.shape[0], dtype=torch.float32)) + data_dash = torch.einsum("bnhd,md->bnhm", data, projection_matrix) + diag_data = torch.square(data) + diag_data = torch.sum(diag_data, dim=len(data.shape)-1) + diag_data = diag_data / 2.0 + diag_data = torch.unsqueeze(diag_data, dim=len(data.shape)-1) + last_dims_t = len(data_dash.shape) - 1 + attention_dims_t = len(data_dash.shape) - 3 + if is_query: + data_dash = ratio * ( + torch.exp(data_dash - diag_data - torch.max(data_dash, dim=last_dims_t, keepdim=True)[0]) + numerical_stabilizer + ) + else: + data_dash = ratio * ( + torch.exp(data_dash - diag_data - torch.max(torch.max(data_dash, dim=last_dims_t, keepdim=True)[0], + dim=attention_dims_t, keepdim=True)[0]) + numerical_stabilizer + ) + return data_dash diff --git a/baselines/BigST/loss/__init__.py b/baselines/BigST/loss/__init__.py new file mode 100644 index 00000000..c22530d7 --- /dev/null +++ b/baselines/BigST/loss/__init__.py @@ -0,0 +1 @@ +from .loss import bigst_loss \ No newline at end of file diff --git a/baselines/BigST/loss/loss.py b/baselines/BigST/loss/loss.py new file mode 100644 index 00000000..831f541e --- /dev/null +++ b/baselines/BigST/loss/loss.py @@ -0,0 +1,35 @@ +import torch +import numpy as np +from basicts.metrics import masked_mae + +def spatial_loss(node_vec1, node_vec2, supports, edge_indices): + B = node_vec1.size(0) + node_vec1 = node_vec1.permute(1, 0, 2, 3) # [N, B, 1, r] + node_vec2 = node_vec2.permute(1, 0, 2, 3) # [N, B, 1, r] + + node_vec1_end, node_vec2_start = node_vec1[edge_indices[:, 0]], node_vec2[edge_indices[:, 1]] # [E, B, 1, r] + attn1 = torch.einsum("ebhm,ebhm->ebh", node_vec1_end, node_vec2_start) # [E, B, 1] + attn1 = attn1.permute(1, 0, 2) # [B, E, 1] + + one_matrix = torch.ones([node_vec2.shape[0]]).to(node_vec1.device) + node_vec2_sum = torch.einsum("nbhm,n->bhm", node_vec2, one_matrix) + attn_norm = torch.einsum("nbhm,bhm->nbh", node_vec1, node_vec2_sum) + + attn2 = attn_norm[edge_indices[:, 0]] # [E, B, 1] + attn2 = attn2.permute(1, 0, 2) # [B, E, 1] + attn_score = attn1 / attn2 # [B, E, 1] + + d_norm = supports[0][edge_indices[:, 0], edge_indices[:, 1]] + d_norm = d_norm.reshape(1, -1, 1).repeat(B, 1, attn_score.shape[-1]) + spatial_loss = torch.mean(attn_score.log() * d_norm) + + return spatial_loss + +def bigst_loss(prediction, target, node_vec1, node_vec2, supports, use_spatial): + if use_spatial: + supports = [support.to(prediction.device) for support in supports] + edge_indices = torch.nonzero(supports[0] > 0) + s_loss = spatial_loss(node_vec1, node_vec2, supports, edge_indices) + return masked_mae(prediction, target, 0.0) - 0.3 * s_loss # 源代码:pipline.py line30 + else: + return masked_mae(prediction, target, 0.0) \ No newline at end of file diff --git a/baselines/BigST/runner/__init__.py b/baselines/BigST/runner/__init__.py new file mode 100644 index 00000000..2a0ecce8 --- /dev/null +++ b/baselines/BigST/runner/__init__.py @@ -0,0 +1 @@ +from .bigstpreprocess_runner import BigSTPreprocessRunner \ No newline at end of file diff --git a/baselines/BigST/runner/bigstpreprocess_runner.py b/baselines/BigST/runner/bigstpreprocess_runner.py new file mode 100644 index 00000000..fbff8c45 --- 
diff --git a/baselines/BigST/runner/__init__.py b/baselines/BigST/runner/__init__.py
new file mode 100644
index 00000000..2a0ecce8
--- /dev/null
+++ b/baselines/BigST/runner/__init__.py
@@ -0,0 +1 @@
+from .bigstpreprocess_runner import BigSTPreprocessRunner
diff --git a/baselines/BigST/runner/bigstpreprocess_runner.py b/baselines/BigST/runner/bigstpreprocess_runner.py
new file mode 100644
index 00000000..fbff8c45
--- /dev/null
+++ b/baselines/BigST/runner/bigstpreprocess_runner.py
@@ -0,0 +1,48 @@
+from typing import Dict
+
+import numpy as np
+
+from basicts.runners import SimpleTimeSeriesForecastingRunner
+
+
+class BigSTPreprocessRunner(SimpleTimeSeriesForecastingRunner):
+    def __init__(self, cfg: dict):
+        super().__init__(cfg)
+
+        self.tiny_batch_size = cfg.MODEL.PARAM['tiny_batch_size']
+
+    def preprocessing(self, input_data: Dict) -> Dict:
+        """Preprocess data: sub-sample a random tiny batch of nodes.
+
+        Args:
+            input_data (Dict): Dictionary containing data to be processed.
+
+        Returns:
+            Dict: Processed data.
+        """
+
+        input_data = super().preprocessing(input_data)
+
+        x = input_data['inputs']
+        y = input_data['target']
+
+        B, T, N, _ = x.shape
+        # The permutation spans B * N entries, so this assumes B == 1 (as
+        # configured in PreprocessPEMS08.py). Only one tiny batch can be
+        # returned per call; a fresh permutation on every call covers the
+        # remaining nodes over the course of training.
+        idx_perm = np.random.permutation(B * N)
+        idx = idx_perm[:self.tiny_batch_size]
+
+        x_ = x[:, :, idx, :]
+        y_ = y[:, :, idx, :]
+
+        input_data['inputs'] = x_.transpose(1, 2)  # (B, N_tiny, T, D)
+        input_data['target'] = y_
+        return input_data
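
For reference, the node sub-sampling above reduces each full-graph sample to a tiny batch before it reaches BigSTPreprocess. A shape walk-through with the PEMS08 numbers (a sketch, assuming B == 1 as configured):

import numpy as np
import torch

B, T, N, D, tiny = 1, 2016, 170, 3, 64
x = torch.randn(B, T, N, D)                # one dataloader sample
idx = np.random.permutation(B * N)[:tiny]  # random node subset
x_tiny = x[:, :, idx, :].transpose(1, 2)
print(x_tiny.shape)                        # torch.Size([1, 64, 2016, 3])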