
FIX: Insample predictions with series of varying lengths #1246

Merged (14 commits) on Feb 18, 2025
Adjust tests and allow many forecasts per timestep
marcopeix committed Jan 13, 2025

commit 330c7f89a6a987ea90405715a77671ce8d59af23
176 changes: 51 additions & 125 deletions nbs/core.ipynb
@@ -1,5 +1,23 @@
{
"cells": [
+{
+"cell_type": "code",
+"execution_count": null,
+"id": "f54b70ca",
+"metadata": {},
+"outputs": [
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"env: PYTORCH_ENABLE_MPS_FALLBACK=1\n"
+]
+}
+],
+"source": [
+"%set_env PYTORCH_ENABLE_MPS_FALLBACK=1"
+]
+},
{
"cell_type": "code",
"execution_count": null,
@@ -44,8 +62,8 @@
"text": [
"/Users/marcopeix/miniconda3/envs/neuralforecast/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n",
"2025-01-10 14:12:15,552\tINFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.\n",
"2025-01-10 14:12:15,609\tINFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.\n"
"2025-01-13 15:36:00,304\tINFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.\n",
"2025-01-13 15:36:00,365\tINFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.\n"
]
}
],
@@ -1439,11 +1457,6 @@
" fcsts_df[invert_cols].to_numpy(),\n",
" indptr\n",
" )\n",
" # Drop duplicates when step_size < h\n",
" if isinstance(fcsts_df, polars.DataFrame):\n",
" fcsts_df = fcsts_df.unique(subset=[self.id_col, self.time_col], keep='first')\n",
" else:\n",
" fcsts_df = fcsts_df.drop_duplicates(subset=[self.id_col, self.time_col], keep='first') \n",
" return fcsts_df\n",
"\n",
" # Save list of models with pytorch lightning save_checkpoint function\n",
@@ -2555,98 +2568,27 @@
"id": "340dd8a9",
"metadata": {},
"outputs": [
-{
-"data": {
-"text/html": [],
-"text/plain": [
-"<IPython.core.display.HTML object>"
-]
-},
-"metadata": {},
-"output_type": "display_data"
-},
-{
-"name": "stderr",
-"output_type": "stream",
-"text": [
-"\u001b[36m(_train_tune pid=3565)\u001b[0m /Users/marcopeix/miniconda3/envs/neuralforecast/lib/python3.10/site-packages/ray/tune/integration/pytorch_lightning.py:198: `ray.tune.integration.pytorch_lightning.TuneReportCallback` is deprecated. Use `ray.tune.integration.pytorch_lightning.TuneReportCheckpointCallback` instead.\n",
-"\u001b[36m(_train_tune pid=3565)\u001b[0m Seed set to 1\n",
-"\u001b[36m(_train_tune pid=3565)\u001b[0m GPU available: True (mps), used: True\n",
-"\u001b[36m(_train_tune pid=3565)\u001b[0m TPU available: False, using: 0 TPU cores\n",
-"\u001b[36m(_train_tune pid=3565)\u001b[0m HPU available: False, using: 0 HPUs\n",
-"\u001b[36m(_train_tune pid=3565)\u001b[0m `Trainer(val_check_interval=1)` was configured so validation will run after every batch.\n",
-"\u001b[36m(_train_tune pid=3565)\u001b[0m \n",
-"\u001b[36m(_train_tune pid=3565)\u001b[0m | Name | Type | Params | Mode \n",
-"\u001b[36m(_train_tune pid=3565)\u001b[0m -------------------------------------------------------\n",
-"\u001b[36m(_train_tune pid=3565)\u001b[0m 0 | loss | MAE | 0 | train\n",
-"\u001b[36m(_train_tune pid=3565)\u001b[0m 1 | padder_train | ConstantPad1d | 0 | train\n",
-"\u001b[36m(_train_tune pid=3565)\u001b[0m 2 | scaler | TemporalNorm | 0 | train\n",
-"\u001b[36m(_train_tune pid=3565)\u001b[0m 3 | mlp | ModuleList | 18.2 K | train\n",
-"\u001b[36m(_train_tune pid=3565)\u001b[0m 4 | out | Linear | 1.5 K | train\n",
-"\u001b[36m(_train_tune pid=3565)\u001b[0m -------------------------------------------------------\n",
-"\u001b[36m(_train_tune pid=3565)\u001b[0m 19.7 K Trainable params\n",
-"\u001b[36m(_train_tune pid=3565)\u001b[0m 0 Non-trainable params\n",
-"\u001b[36m(_train_tune pid=3565)\u001b[0m 19.7 K Total params\n",
-"\u001b[36m(_train_tune pid=3565)\u001b[0m 0.079 Total estimated model params size (MB)\n",
-"\u001b[36m(_train_tune pid=3565)\u001b[0m 7 Modules in train mode\n",
-"\u001b[36m(_train_tune pid=3565)\u001b[0m 0 Modules in eval mode\n"
-]
-},
-{
-"name": "stdout",
-"output_type": "stream",
-"text": [
-"Sanity Checking DataLoader 0: 0%| | 0/1 [00:00<?, ?it/s]\n",
-"Epoch 0: 0%| | 0/1 [00:00<?, ?it/s] \n"
-]
-},
-{
-"name": "stderr",
-"output_type": "stream",
-"text": [
-"2025-01-10 10:23:03,337\tINFO tune.py:1009 -- Wrote the latest version of all result files and experiment state to '/Users/marcopeix/ray_results/_train_tune_2025-01-10_10-22-58' in 0.0020s.\n",
-"Seed set to 1\n",
-"Seed set to 1\n"
-]
-},
-{
-"name": "stdout",
-"output_type": "stream",
-"text": [
-"Epoch 0: 100%|██████████| 1/1 [00:00<00:00, 5.81it/s, v_num=0, train_loss_step=408.0]ain_tune pid=3565)\u001b[0m \n",
-"Validation: | | 0/? [00:00<?, ?it/s]\u001b[A\n",
-"Validation: 0%| | 0/1 [00:00<?, ?it/s]\u001b[A\n",
-"Validation DataLoader 0: 0%| | 0/1 [00:00<?, ?it/s]\u001b[A\n",
-"Validation DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 144.15it/s]\u001b[A\n",
-"Epoch 0: 100%|██████████| 1/1 [00:00<00:00, 5.22it/s, v_num=0, train_loss_step=408.0, valid_loss=538.0, train_loss_epoch=408.0]\n"
-]
-},
-{
-"name": "stderr",
-"output_type": "stream",
-"text": [
-"\u001b[36m(_train_tune pid=3565)\u001b[0m `Trainer.fit` stopped: `max_steps=1` reached.\n",
-"\u001b[36m(_train_tune pid=3565)\u001b[0m /Users/marcopeix/miniconda3/envs/neuralforecast/lib/python3.10/multiprocessing/resource_tracker.py:224: UserWarning: resource_tracker: There appear to be 1 leaked semaphore objects to clean up at shutdown\n",
-"\u001b[36m(_train_tune pid=3565)\u001b[0m warnings.warn('resource_tracker: There appear to be %d '\n"
-]
-},
-{
-"name": "stdout",
-"output_type": "stream",
-"text": [
-"Epoch 0: 100%|██████████| 1/1 [00:00<00:00, 7.18it/s, v_num=691, train_loss_step=421.0, train_loss_epoch=421.0]\n",
-"Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 73.13it/s]\n",
-"Epoch 0: 100%|██████████| 1/1 [00:00<00:00, 2.87it/s, v_num=693, train_loss_step=403.0, train_loss_epoch=403.0]\n",
-"Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 37.62it/s]\n",
-"Epoch 0: 100%|██████████| 1/1 [00:00<00:00, 2.89it/s, v_num=714, train_loss_step=25.70, train_loss_epoch=25.70]\n",
-"Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 22.69it/s]\n",
-"Epoch 0: 100%|██████████| 1/1 [00:00<00:00, 2.39it/s, v_num=716, train_loss_step=403.0, train_loss_epoch=403.0]\n",
-"Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 38.80it/s]\n",
-"WARNING: Predict insample might not provide accurate predictions for recurrent model RNN class yet due to scaling.\n",
-"Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 19.11it/s]\n",
-"Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 130.86it/s]\n",
-"Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 46.63it/s]\n",
-"Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 301.40it/s]\n",
-"Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 14.78it/s]\n",
-"Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 46.05it/s]\n",
-"Original cols: ['NHITS-median', 'NHITS-lo-80', 'NHITS-hi-80', 'AutoMLP', 'RNN']\n",
-"Selected cols: ['NHITS-median', 'AutoMLP', 'RNN']\n",
-"Shape of fcsts: (2616, 3)\n"
+"Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 17.53it/s]\n",
+"Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 112.65it/s]\n",
+"Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 14.57it/s]\n",
+"Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 42.03it/s]\n"
]
}
],
@@ -2657,15 +2599,16 @@
"n_series = 2\n",
"h = 12\n",
"\n",
"config = {'input_size': tune.choice([12, 24]), \n",
" 'hidden_size': 128,\n",
" 'max_steps': 1,\n",
" 'val_check_steps': 1,\n",
" 'step_size': 12}\n",
"\n",
"def get_expected_size(df, h, test_size, step_size):\n",
" expected_size = 0\n",
" uids = df['unique_id'].unique()\n",
" for uid in uids:\n",
" input_len = len(df[df['unique_id'] == uid])\n",
" expected_size += ((input_len - test_size - h) / step_size + 1)*h\n",
" return expected_size\n",
" \n",
"models = [\n",
" NHITS(h=h, input_size=24, loss=MQLoss(level=[80]), max_steps=1, alias='NHITS', scaler_type=None),\n",
" AutoMLP(h=h, config=config, cpus=1, num_samples=1),\n",
" RNN(h=h, input_size=-1, loss=MAE(), max_steps=1, alias='RNN', scaler_type=None),\n",
" ]\n",
"\n",
@@ -2674,7 +2617,7 @@
"\n",
"forecasts = nf.predict_insample(step_size=1)\n",
"\n",
"expected_size = len(AirPassengersPanel_train) - (n_series*test_size)\n",
"expected_size = get_expected_size(AirPassengersPanel_train, h, test_size, 1)\n",
"assert len(forecasts) == expected_size, f'Shape mismatch in predict_insample: {len(forecasts)=}, {expected_size=}'"
]
},
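The new get_expected_size helper replaces the old flat count len(AirPassengersPanel_train) - n_series * test_size, which implicitly assumed every series has the same length. Per series there are (input_len - test_size - h) / step_size + 1 cutoffs, and each cutoff contributes h rows. A toy check of that arithmetic with made-up lengths, not data from the test:

    # Hypothetical series lengths; h, test_size and step_size chosen freely.
    h, test_size, step_size = 12, 0, 1
    for input_len in (72, 48):
        n_cutoffs = (input_len - test_size - h) // step_size + 1
        print(input_len, n_cutoffs * h)  # 72 -> 732 rows, 48 -> 444 rows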
@@ -2695,9 +2638,9 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Epoch 0: 100%|██████████| 1/1 [00:00<00:00, 5.15it/s, v_num=701, train_loss_step=46.40, train_loss_epoch=46.40]\n",
"Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 22.26it/s]\n",
"Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 164.86it/s]"
"Epoch 0: 100%|██████████| 1/1 [00:00<00:00, 4.99it/s, v_num=722, train_loss_step=46.40, train_loss_epoch=46.40]\n",
"Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 21.14it/s]\n",
"Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 160.05it/s]"
]
},
{
@@ -2712,12 +2655,9 @@
"output_type": "stream",
"text": [
"\n",
"Original cols: ['NHITS']\n",
"Selected cols: ['NHITS']\n",
"Shape of fcsts: (432, 1)\n",
"Epoch 0: 100%|██████████| 1/1 [00:00<00:00, 23.24it/s, v_num=704, train_loss_step=46.40, train_loss_epoch=46.40]\n",
"Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 17.64it/s]\n",
"Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 52.31it/s]"
"Epoch 0: 100%|██████████| 1/1 [00:00<00:00, 23.43it/s, v_num=725, train_loss_step=46.40, train_loss_epoch=46.40]\n",
"Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 15.93it/s]\n",
"Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 85.76it/s]\n"
]
},
{
@@ -2731,13 +2671,9 @@
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Original cols: ['NHITS']\n",
"Selected cols: ['NHITS']\n",
"Shape of fcsts: (336, 1)\n",
"Epoch 0: 100%|██████████| 1/1 [00:00<00:00, 19.88it/s, v_num=707, train_loss_step=46.40, train_loss_epoch=46.40]\n",
"Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 18.46it/s]\n",
"Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 116.28it/s]\n"
"Epoch 0: 100%|██████████| 1/1 [00:00<00:00, 19.91it/s, v_num=728, train_loss_step=46.40, train_loss_epoch=46.40]\n",
"Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 15.95it/s]\n",
"Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 141.58it/s]\n"
]
},
{
@@ -2751,15 +2687,9 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Original cols: ['NHITS']\n",
"Selected cols: ['NHITS']\n",
"Shape of fcsts: (408, 1)\n",
"Epoch 0: 100%|██████████| 1/1 [00:00<00:00, 21.43it/s, v_num=710, train_loss_step=46.40, train_loss_epoch=46.40]\n",
"Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 18.80it/s]\n",
"Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 98.27it/s] \n",
"Original cols: ['NHITS']\n",
"Selected cols: ['NHITS']\n",
"Shape of fcsts: (312, 1)\n"
"Epoch 0: 100%|██████████| 1/1 [00:00<00:00, 17.76it/s, v_num=731, train_loss_step=46.40, train_loss_epoch=46.40]\n",
"Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 14.18it/s]\n",
"Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 103.60it/s]\n"
]
}
],
@@ -3576,13 +3506,9 @@
"\n",
"def assert_equal_dfs(pandas_df, polars_df):\n",
" mapping = {k: v for k, v in inverse_renamer.items() if k in polars_df}\n",
" polars_df = polars_df.rename(mapping).to_pandas()\\\n",
" .sort_values(['unique_id', 'ds'], ascending=True)\\\n",
" .reset_index(drop=True)\n",
" pandas_df = pandas_df.reset_index(drop=True)\n",
" pd.testing.assert_frame_equal(\n",
" pandas_df,\n",
" polars_df,\n",
" polars_df.rename(mapping).to_pandas(),\n",
" )\n",
"\n",
"assert_equal_dfs(preds, preds_pl)\n",
19 changes: 5 additions & 14 deletions neuralforecast/core.py
@@ -3,7 +3,7 @@
# %% auto 0
__all__ = ['NeuralForecast']

-# %% ../nbs/core.ipynb 4
+# %% ../nbs/core.ipynb 5
import pickle
import warnings
from copy import deepcopy
@@ -71,7 +71,7 @@
from .common._base_auto import BaseAuto, MockTrial
from .utils import PredictionIntervals, get_prediction_interval_method

-# %% ../nbs/core.ipynb 5
+# %% ../nbs/core.ipynb 6
# this disables warnings about the number of workers in the dataloaders
# which the user can't control
warnings.filterwarnings("ignore", category=pl.utilities.warnings.PossibleUserWarning)
@@ -129,7 +129,7 @@ def _insample_times(
out = ufp.assign_columns(out, "cutoff", actual_cutoffs)
return out

-# %% ../nbs/core.ipynb 7
+# %% ../nbs/core.ipynb 8
MODEL_FILENAME_DICT = {
"autoformer": Autoformer,
"autoautoformer": Autoformer,
@@ -196,7 +196,7 @@ def _insample_times(
"autormok": RMoK,
}

-# %% ../nbs/core.ipynb 8
+# %% ../nbs/core.ipynb 9
_type2scaler = {
"standard": LocalStandardScaler,
"robust": lambda: LocalRobustScaler(scale="mad"),
@@ -205,7 +205,7 @@ def _insample_times(
"boxcox": lambda: LocalBoxCoxScaler(method="loglik", lower=0.0),
}

-# %% ../nbs/core.ipynb 9
+# %% ../nbs/core.ipynb 10
class NeuralForecast:

def __init__(
@@ -1415,15 +1415,6 @@ def predict_insample(self, step_size: int = 1):
fcsts_df[invert_cols] = self._scalers_target_inverse_transform(
fcsts_df[invert_cols].to_numpy(), indptr
)
-# Drop duplicates when step_size < h
-if isinstance(fcsts_df, polars.DataFrame):
-fcsts_df = fcsts_df.unique(
-subset=[self.id_col, self.time_col], keep="first"
-)
-else:
-fcsts_df = fcsts_df.drop_duplicates(
-subset=[self.id_col, self.time_col], keep="first"
-)
return fcsts_df

# Save list of models with pytorch lightning save_checkpoint function
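Taken together, the two deletions (one in the notebook source, one in the generated neuralforecast/core.py) remove the post-hoc duplicate filter from predict_insample, and the per-series size computation lets the test pass on panels whose series have different lengths, the case named in the PR title. A minimal end-to-end sketch of that case, with synthetic data and a tiny max_steps so it runs fast; not taken from the PR:

    import numpy as np
    import pandas as pd
    from neuralforecast import NeuralForecast
    from neuralforecast.models import NHITS

    # Two series with different lengths.
    long_s = pd.DataFrame({
        "unique_id": "long",
        "ds": pd.date_range("2000-01-01", periods=72, freq="MS"),
        "y": np.random.rand(72),
    })
    short_s = pd.DataFrame({
        "unique_id": "short",
        "ds": pd.date_range("2002-01-01", periods=48, freq="MS"),
        "y": np.random.rand(48),
    })
    df = pd.concat([long_s, short_s], ignore_index=True)

    nf = NeuralForecast(models=[NHITS(h=12, input_size=24, max_steps=1)], freq="MS")
    nf.fit(df=df)
    insample = nf.predict_insample(step_size=1)  # one row per window and horizon step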