
FIX: Insample predictions with series of varying lengths #1246

Merged (14 commits) on Feb 18, 2025
Adjust tests and allow many forecasts per timestep
marcopeix committed Jan 13, 2025

commit 330c7f89a6a987ea90405715a77671ce8d59af23
176 changes: 51 additions & 125 deletions nbs/core.ipynb
@@ -1,5 +1,23 @@
{
"cells": [
+{
+"cell_type": "code",
+"execution_count": null,
+"id": "f54b70ca",
+"metadata": {},
+"outputs": [
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"env: PYTORCH_ENABLE_MPS_FALLBACK=1\n"
+]
+}
+],
+"source": [
+"%set_env PYTORCH_ENABLE_MPS_FALLBACK=1"
+]
+},
{
"cell_type": "code",
"execution_count": null,
@@ -44,8 +62,8 @@
"text": [
"/Users/marcopeix/miniconda3/envs/neuralforecast/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n",
"2025-01-10 14:12:15,552\tINFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.\n",
"2025-01-10 14:12:15,609\tINFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.\n"
"2025-01-13 15:36:00,304\tINFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.\n",
"2025-01-13 15:36:00,365\tINFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.\n"
]
}
],
@@ -1439,11 +1457,6 @@
" fcsts_df[invert_cols].to_numpy(),\n",
" indptr\n",
" )\n",
" # Drop duplicates when step_size < h\n",
" if isinstance(fcsts_df, polars.DataFrame):\n",
" fcsts_df = fcsts_df.unique(subset=[self.id_col, self.time_col], keep='first')\n",
" else:\n",
" fcsts_df = fcsts_df.drop_duplicates(subset=[self.id_col, self.time_col], keep='first') \n",
" return fcsts_df\n",
"\n",
" # Save list of models with pytorch lightning save_checkpoint function\n",
@@ -2555,98 +2568,27 @@
"id": "340dd8a9",
"metadata": {},
"outputs": [
-{
-"data": {
-"text/html": [],
-"text/plain": [
-"<IPython.core.display.HTML object>"
-]
-},
-"metadata": {},
-"output_type": "display_data"
-},
-{
-"name": "stderr",
-"output_type": "stream",
-"text": [
-"\u001b[36m(_train_tune pid=3565)\u001b[0m /Users/marcopeix/miniconda3/envs/neuralforecast/lib/python3.10/site-packages/ray/tune/integration/pytorch_lightning.py:198: `ray.tune.integration.pytorch_lightning.TuneReportCallback` is deprecated. Use `ray.tune.integration.pytorch_lightning.TuneReportCheckpointCallback` instead.\n",
-"\u001b[36m(_train_tune pid=3565)\u001b[0m Seed set to 1\n",
-"\u001b[36m(_train_tune pid=3565)\u001b[0m GPU available: True (mps), used: True\n",
-"\u001b[36m(_train_tune pid=3565)\u001b[0m TPU available: False, using: 0 TPU cores\n",
-"\u001b[36m(_train_tune pid=3565)\u001b[0m HPU available: False, using: 0 HPUs\n",
-"\u001b[36m(_train_tune pid=3565)\u001b[0m `Trainer(val_check_interval=1)` was configured so validation will run after every batch.\n",
-"\u001b[36m(_train_tune pid=3565)\u001b[0m \n",
-"\u001b[36m(_train_tune pid=3565)\u001b[0m | Name | Type | Params | Mode \n",
-"\u001b[36m(_train_tune pid=3565)\u001b[0m -------------------------------------------------------\n",
-"\u001b[36m(_train_tune pid=3565)\u001b[0m 0 | loss | MAE | 0 | train\n",
-"\u001b[36m(_train_tune pid=3565)\u001b[0m 1 | padder_train | ConstantPad1d | 0 | train\n",
-"\u001b[36m(_train_tune pid=3565)\u001b[0m 2 | scaler | TemporalNorm | 0 | train\n",
-"\u001b[36m(_train_tune pid=3565)\u001b[0m 3 | mlp | ModuleList | 18.2 K | train\n",
-"\u001b[36m(_train_tune pid=3565)\u001b[0m 4 | out | Linear | 1.5 K | train\n",
-"\u001b[36m(_train_tune pid=3565)\u001b[0m -------------------------------------------------------\n",
-"\u001b[36m(_train_tune pid=3565)\u001b[0m 19.7 K Trainable params\n",
-"\u001b[36m(_train_tune pid=3565)\u001b[0m 0 Non-trainable params\n",
-"\u001b[36m(_train_tune pid=3565)\u001b[0m 19.7 K Total params\n",
-"\u001b[36m(_train_tune pid=3565)\u001b[0m 0.079 Total estimated model params size (MB)\n",
-"\u001b[36m(_train_tune pid=3565)\u001b[0m 7 Modules in train mode\n",
-"\u001b[36m(_train_tune pid=3565)\u001b[0m 0 Modules in eval mode\n"
-]
-},
-{
-"name": "stdout",
-"output_type": "stream",
-"text": [
-"Sanity Checking DataLoader 0: 0%| | 0/1 [00:00<?, ?it/s]\n",
-"Epoch 0: 0%| | 0/1 [00:00<?, ?it/s] \n"
-]
-},
-{
-"name": "stderr",
-"output_type": "stream",
-"text": [
-"2025-01-10 10:23:03,337\tINFO tune.py:1009 -- Wrote the latest version of all result files and experiment state to '/Users/marcopeix/ray_results/_train_tune_2025-01-10_10-22-58' in 0.0020s.\n",
-"Seed set to 1\n",
-"Seed set to 1\n"
-]
-},
-{
-"name": "stdout",
-"output_type": "stream",
-"text": [
-"Epoch 0: 100%|██████████| 1/1 [00:00<00:00, 5.81it/s, v_num=0, train_loss_step=408.0]ain_tune pid=3565)\u001b[0m \n",
-"Validation: | | 0/? [00:00<?, ?it/s]\u001b[A\n",
-"Validation: 0%| | 0/1 [00:00<?, ?it/s]\u001b[A\n",
-"Validation DataLoader 0: 0%| | 0/1 [00:00<?, ?it/s]\u001b[A\n",
-"Validation DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 144.15it/s]\u001b[A\n",
-"Epoch 0: 100%|██████████| 1/1 [00:00<00:00, 5.22it/s, v_num=0, train_loss_step=408.0, valid_loss=538.0, train_loss_epoch=408.0]\n"
-]
-},
-{
-"name": "stderr",
-"output_type": "stream",
-"text": [
-"\u001b[36m(_train_tune pid=3565)\u001b[0m `Trainer.fit` stopped: `max_steps=1` reached.\n",
-"\u001b[36m(_train_tune pid=3565)\u001b[0m /Users/marcopeix/miniconda3/envs/neuralforecast/lib/python3.10/multiprocessing/resource_tracker.py:224: UserWarning: resource_tracker: There appear to be 1 leaked semaphore objects to clean up at shutdown\n",
-"\u001b[36m(_train_tune pid=3565)\u001b[0m warnings.warn('resource_tracker: There appear to be %d '\n"
-]
-},
-{
-"name": "stdout",
-"output_type": "stream",
-"text": [
-"Epoch 0: 100%|██████████| 1/1 [00:00<00:00, 7.18it/s, v_num=691, train_loss_step=421.0, train_loss_epoch=421.0]\n",
-"Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 73.13it/s]\n",
-"Epoch 0: 100%|██████████| 1/1 [00:00<00:00, 2.87it/s, v_num=693, train_loss_step=403.0, train_loss_epoch=403.0]\n",
-"Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 37.62it/s]\n",
-"Epoch 0: 100%|██████████| 1/1 [00:00<00:00, 2.89it/s, v_num=714, train_loss_step=25.70, train_loss_epoch=25.70]\n",
-"Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 22.69it/s]\n",
-"Epoch 0: 100%|██████████| 1/1 [00:00<00:00, 2.39it/s, v_num=716, train_loss_step=403.0, train_loss_epoch=403.0]\n",
-"Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 38.80it/s]\n",
-"WARNING: Predict insample might not provide accurate predictions for recurrent model RNN class yet due to scaling.\n",
-"Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 19.11it/s]\n",
-"Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 130.86it/s]\n",
-"Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 46.63it/s]\n",
-"Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 301.40it/s]\n",
-"Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 14.78it/s]\n",
-"Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 46.05it/s]\n",
-"Original cols: ['NHITS-median', 'NHITS-lo-80', 'NHITS-hi-80', 'AutoMLP', 'RNN']\n",
-"Selected cols: ['NHITS-median', 'AutoMLP', 'RNN']\n",
-"Shape of fcsts: (2616, 3)\n"
+"Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 17.53it/s]\n",
+"Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 112.65it/s]\n",
+"Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 14.57it/s]\n",
+"Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 42.03it/s]\n"
]
}
],
@@ -2657,15 +2599,16 @@
"n_series = 2\n",
"h = 12\n",
"\n",
"config = {'input_size': tune.choice([12, 24]), \n",
" 'hidden_size': 128,\n",
" 'max_steps': 1,\n",
" 'val_check_steps': 1,\n",
" 'step_size': 12}\n",
"\n",
"def get_expected_size(df, h, test_size, step_size):\n",
" expected_size = 0\n",
" uids = df['unique_id'].unique()\n",
" for uid in uids:\n",
" input_len = len(df[df['unique_id'] == uid])\n",
" expected_size += ((input_len - test_size - h) / step_size + 1)*h\n",
" return expected_size\n",
" \n",
"models = [\n",
" NHITS(h=h, input_size=24, loss=MQLoss(level=[80]), max_steps=1, alias='NHITS', scaler_type=None),\n",
" AutoMLP(h=h, config=config, cpus=1, num_samples=1),\n",
" RNN(h=h, input_size=-1, loss=MAE(), max_steps=1, alias='RNN', scaler_type=None),\n",
" ]\n",
"\n",
@@ -2674,7 +2617,7 @@
"\n",
"forecasts = nf.predict_insample(step_size=1)\n",
"\n",
"expected_size = len(AirPassengersPanel_train) - (n_series*test_size)\n",
"expected_size = get_expected_size(AirPassengersPanel_train, h, test_size, 1)\n",
"assert len(forecasts) == expected_size, f'Shape mismatch in predict_insample: {len(forecasts)=}, {expected_size=}'"
]
},
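The new get_expected_size helper replaces the old flat count len(AirPassengersPanel_train) - n_series * test_size, which implicitly assumed every series has the same length. Per series there are (input_len - test_size - h) / step_size + 1 cutoffs, and each cutoff contributes h rows. A toy check of that arithmetic with made-up lengths, not data from the test:

    # Hypothetical series lengths; h, test_size and step_size chosen freely.
    h, test_size, step_size = 12, 0, 1
    for input_len in (72, 48):
        n_cutoffs = (input_len - test_size - h) // step_size + 1
        print(input_len, n_cutoffs * h)  # 72 -> 732 rows, 48 -> 444 rows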
@@ -2695,9 +2638,9 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Epoch 0: 100%|██████████| 1/1 [00:00<00:00, 5.15it/s, v_num=701, train_loss_step=46.40, train_loss_epoch=46.40]\n",
"Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 22.26it/s]\n",
"Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 164.86it/s]"
"Epoch 0: 100%|██████████| 1/1 [00:00<00:00, 4.99it/s, v_num=722, train_loss_step=46.40, train_loss_epoch=46.40]\n",
"Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 21.14it/s]\n",
"Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 160.05it/s]"
]
},
{
@@ -2712,12 +2655,9 @@
"output_type": "stream",
"text": [
"\n",
"Original cols: ['NHITS']\n",
"Selected cols: ['NHITS']\n",
"Shape of fcsts: (432, 1)\n",
"Epoch 0: 100%|██████████| 1/1 [00:00<00:00, 23.24it/s, v_num=704, train_loss_step=46.40, train_loss_epoch=46.40]\n",
"Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 17.64it/s]\n",
"Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 52.31it/s]"
"Epoch 0: 100%|██████████| 1/1 [00:00<00:00, 23.43it/s, v_num=725, train_loss_step=46.40, train_loss_epoch=46.40]\n",
"Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 15.93it/s]\n",
"Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 85.76it/s]\n"
]
},
{
@@ -2731,13 +2671,9 @@
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Original cols: ['NHITS']\n",
"Selected cols: ['NHITS']\n",
"Shape of fcsts: (336, 1)\n",
"Epoch 0: 100%|██████████| 1/1 [00:00<00:00, 19.88it/s, v_num=707, train_loss_step=46.40, train_loss_epoch=46.40]\n",
"Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 18.46it/s]\n",
"Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 116.28it/s]\n"
"Epoch 0: 100%|██████████| 1/1 [00:00<00:00, 19.91it/s, v_num=728, train_loss_step=46.40, train_loss_epoch=46.40]\n",
"Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 15.95it/s]\n",
"Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 141.58it/s]\n"
]
},
{
@@ -2751,15 +2687,9 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Original cols: ['NHITS']\n",
"Selected cols: ['NHITS']\n",
"Shape of fcsts: (408, 1)\n",
"Epoch 0: 100%|██████████| 1/1 [00:00<00:00, 21.43it/s, v_num=710, train_loss_step=46.40, train_loss_epoch=46.40]\n",
"Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 18.80it/s]\n",
"Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 98.27it/s] \n",
"Original cols: ['NHITS']\n",
"Selected cols: ['NHITS']\n",
"Shape of fcsts: (312, 1)\n"
"Epoch 0: 100%|██████████| 1/1 [00:00<00:00, 17.76it/s, v_num=731, train_loss_step=46.40, train_loss_epoch=46.40]\n",
"Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 14.18it/s]\n",
"Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 103.60it/s]\n"
]
}
],
@@ -3576,13 +3506,9 @@
"\n",
"def assert_equal_dfs(pandas_df, polars_df):\n",
" mapping = {k: v for k, v in inverse_renamer.items() if k in polars_df}\n",
" polars_df = polars_df.rename(mapping).to_pandas()\\\n",
" .sort_values(['unique_id', 'ds'], ascending=True)\\\n",
" .reset_index(drop=True)\n",
" pandas_df = pandas_df.reset_index(drop=True)\n",
" pd.testing.assert_frame_equal(\n",
" pandas_df,\n",
" polars_df,\n",
" polars_df.rename(mapping).to_pandas(),\n",
" )\n",
"\n",
"assert_equal_dfs(preds, preds_pl)\n",
19 changes: 5 additions & 14 deletions neuralforecast/core.py
@@ -3,7 +3,7 @@
# %% auto 0
__all__ = ['NeuralForecast']

-# %% ../nbs/core.ipynb 4
+# %% ../nbs/core.ipynb 5
import pickle
import warnings
from copy import deepcopy
@@ -71,7 +71,7 @@
from .common._base_auto import BaseAuto, MockTrial
from .utils import PredictionIntervals, get_prediction_interval_method

-# %% ../nbs/core.ipynb 5
+# %% ../nbs/core.ipynb 6
# this disables warnings about the number of workers in the dataloaders
# which the user can't control
warnings.filterwarnings("ignore", category=pl.utilities.warnings.PossibleUserWarning)
@@ -129,7 +129,7 @@ def _insample_times(
out = ufp.assign_columns(out, "cutoff", actual_cutoffs)
return out

-# %% ../nbs/core.ipynb 7
+# %% ../nbs/core.ipynb 8
MODEL_FILENAME_DICT = {
"autoformer": Autoformer,
"autoautoformer": Autoformer,
@@ -196,7 +196,7 @@ def _insample_times(
"autormok": RMoK,
}

-# %% ../nbs/core.ipynb 8
+# %% ../nbs/core.ipynb 9
_type2scaler = {
"standard": LocalStandardScaler,
"robust": lambda: LocalRobustScaler(scale="mad"),
@@ -205,7 +205,7 @@ def _insample_times(
"boxcox": lambda: LocalBoxCoxScaler(method="loglik", lower=0.0),
}

-# %% ../nbs/core.ipynb 9
+# %% ../nbs/core.ipynb 10
class NeuralForecast:

def __init__(
@@ -1415,15 +1415,6 @@ def predict_insample(self, step_size: int = 1):
fcsts_df[invert_cols] = self._scalers_target_inverse_transform(
fcsts_df[invert_cols].to_numpy(), indptr
)
-# Drop duplicates when step_size < h
-if isinstance(fcsts_df, polars.DataFrame):
-fcsts_df = fcsts_df.unique(
-subset=[self.id_col, self.time_col], keep="first"
-)
-else:
-fcsts_df = fcsts_df.drop_duplicates(
-subset=[self.id_col, self.time_col], keep="first"
-)
return fcsts_df

# Save list of models with pytorch lightning save_checkpoint function
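Taken together, the two deletions (one in the notebook source, one in the generated neuralforecast/core.py) remove the post-hoc duplicate filter from predict_insample, and the per-series size computation lets the test pass on panels whose series have different lengths, the case named in the PR title. A minimal end-to-end sketch of that case, with synthetic data and a tiny max_steps so it runs fast; not taken from the PR:

    import numpy as np
    import pandas as pd
    from neuralforecast import NeuralForecast
    from neuralforecast.models import NHITS

    # Two series with different lengths.
    long_s = pd.DataFrame({
        "unique_id": "long",
        "ds": pd.date_range("2000-01-01", periods=72, freq="MS"),
        "y": np.random.rand(72),
    })
    short_s = pd.DataFrame({
        "unique_id": "short",
        "ds": pd.date_range("2002-01-01", periods=48, freq="MS"),
        "y": np.random.rand(48),
    })
    df = pd.concat([long_s, short_s], ignore_index=True)

    nf = NeuralForecast(models=[NHITS(h=12, input_size=24, max_steps=1)], freq="MS")
    nf.fit(df=df)
    insample = nf.predict_insample(step_size=1)  # one row per window and horizon step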