From fd00d552b152f454bd5c2a778cbbc83d2d36f639 Mon Sep 17 00:00:00 2001 From: Olivier Sprangers <45119856+elephaint@users.noreply.github.com> Date: Thu, 16 Jan 2025 19:20:53 +0100 Subject: [PATCH] fix(pandas): use arrays for values and indices in time_features (#143) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: José Morales --- nbs/feature_engineering.ipynb | 64 ++++++++++++++++++---------- settings.ini | 2 +- utilsforecast/__init__.py | 2 +- utilsforecast/feature_engineering.py | 4 +- 4 files changed, 45 insertions(+), 27 deletions(-) diff --git a/nbs/feature_engineering.ipynb b/nbs/feature_engineering.ipynb index f3cc951..a630404 100644 --- a/nbs/feature_engineering.ipynb +++ b/nbs/feature_engineering.ipynb @@ -849,7 +849,7 @@ " if isinstance(times, pd.DatetimeIndex):\n", " if feature in (\"week\", \"weekofyear\"):\n", " times = times.isocalendar()\n", - " feat_vals = getattr(times, feature)\n", + " feat_vals = getattr(times, feature).to_numpy()\n", " else:\n", " feat_vals = getattr(times.dt, feature)()\n", " return feat_name, feat_vals\n", @@ -864,7 +864,7 @@ " if isinstance(df, pd.DataFrame):\n", " times = pd.Index(unique_times)\n", " time2pos = {time: i for i, time in enumerate(times)}\n", - " restore_idxs = df[time_col].map(time2pos)\n", + " restore_idxs = df[time_col].map(time2pos).to_numpy()\n", " for feature in features:\n", " name, vals = _compute_time_feature(times, feature)\n", " df[name] = vals[restore_idxs]\n", @@ -971,6 +971,7 @@ " y\n", " month\n", " day\n", + " week\n", " \n", " \n", " \n", @@ -981,6 +982,7 @@ " 0.428973\n", " 10\n", " 5\n", + " 40\n", " \n", " \n", " 1\n", @@ -989,6 +991,7 @@ " 1.423626\n", " 10\n", " 6\n", + " 40\n", " \n", " \n", " 2\n", @@ -997,6 +1000,7 @@ " 2.311782\n", " 10\n", " 7\n", + " 40\n", " \n", " \n", " 3\n", @@ -1005,6 +1009,7 @@ " 3.192191\n", " 10\n", " 8\n", + " 40\n", " \n", " \n", " 4\n", @@ -1013,6 +1018,7 @@ " 4.148767\n", " 10\n", " 9\n", + " 41\n", " \n", " \n", " ...\n", @@ -1021,6 +1027,7 @@ " ...\n", " ...\n", " ...\n", + " ...\n", " \n", " \n", " 1096\n", @@ -1029,6 +1036,7 @@ " 4.058910\n", " 5\n", " 10\n", + " 19\n", " \n", " \n", " 1097\n", @@ -1037,6 +1045,7 @@ " 5.178157\n", " 5\n", " 11\n", + " 19\n", " \n", " \n", " 1098\n", @@ -1045,6 +1054,7 @@ " 6.133142\n", " 5\n", " 12\n", + " 19\n", " \n", " \n", " 1099\n", @@ -1053,6 +1063,7 @@ " 0.403709\n", " 5\n", " 13\n", + " 19\n", " \n", " \n", " 1100\n", @@ -1061,27 +1072,28 @@ " 1.081779\n", " 5\n", " 14\n", + " 20\n", " \n", " \n", "\n", - "

1101 rows × 5 columns

\n", + "

1101 rows × 6 columns

\n", "" ], "text/plain": [ - " unique_id ds y month day\n", - "0 0 2000-10-05 0.428973 10 5\n", - "1 0 2000-10-06 1.423626 10 6\n", - "2 0 2000-10-07 2.311782 10 7\n", - "3 0 2000-10-08 3.192191 10 8\n", - "4 0 2000-10-09 4.148767 10 9\n", - "... ... ... ... ... ...\n", - "1096 4 2001-05-10 4.058910 5 10\n", - "1097 4 2001-05-11 5.178157 5 11\n", - "1098 4 2001-05-12 6.133142 5 12\n", - "1099 4 2001-05-13 0.403709 5 13\n", - "1100 4 2001-05-14 1.081779 5 14\n", + " unique_id ds y month day week\n", + "0 0 2000-10-05 0.428973 10 5 40\n", + "1 0 2000-10-06 1.423626 10 6 40\n", + "2 0 2000-10-07 2.311782 10 7 40\n", + "3 0 2000-10-08 3.192191 10 8 40\n", + "4 0 2000-10-09 4.148767 10 9 41\n", + "... ... ... ... ... ... ...\n", + "1096 4 2001-05-10 4.058910 5 10 19\n", + "1097 4 2001-05-11 5.178157 5 11 19\n", + "1098 4 2001-05-12 6.133142 5 12 19\n", + "1099 4 2001-05-13 0.403709 5 13 19\n", + "1100 4 2001-05-14 1.081779 5 14 20\n", "\n", - "[1101 rows x 5 columns]" + "[1101 rows x 6 columns]" ] }, "execution_count": null, @@ -1090,7 +1102,7 @@ } ], "source": [ - "transformed_df, future_df = time_features(series, freq='D', features=['month', 'day'], h=1)\n", + "transformed_df, future_df = time_features(series, freq='D', features=['month', 'day', 'week'], h=1)\n", "transformed_df" ] }, @@ -1125,6 +1137,7 @@ " ds\n", " month\n", " day\n", + " week\n", " \n", " \n", " \n", @@ -1134,6 +1147,7 @@ " 2001-05-15\n", " 5\n", " 15\n", + " 20\n", " \n", " \n", " 1\n", @@ -1141,6 +1155,7 @@ " 2001-05-15\n", " 5\n", " 15\n", + " 20\n", " \n", " \n", " 2\n", @@ -1148,6 +1163,7 @@ " 2001-05-15\n", " 5\n", " 15\n", + " 20\n", " \n", " \n", " 3\n", @@ -1155,6 +1171,7 @@ " 2001-05-15\n", " 5\n", " 15\n", + " 20\n", " \n", " \n", " 4\n", @@ -1162,18 +1179,19 @@ " 2001-05-15\n", " 5\n", " 15\n", + " 20\n", " \n", " \n", "\n", "" ], "text/plain": [ - " unique_id ds month day\n", - "0 0 2001-05-15 5 15\n", - "1 1 2001-05-15 5 15\n", - "2 2 2001-05-15 5 15\n", - "3 3 2001-05-15 5 15\n", - "4 4 2001-05-15 5 15" + " unique_id ds month day week\n", + "0 0 2001-05-15 5 15 20\n", + "1 1 2001-05-15 5 15 20\n", + "2 2 2001-05-15 5 15 20\n", + "3 3 2001-05-15 5 15 20\n", + "4 4 2001-05-15 5 15 20" ] }, "execution_count": null, diff --git a/settings.ini b/settings.ini index 1132ea8..92978fc 100644 --- a/settings.ini +++ b/settings.ini @@ -1,7 +1,7 @@ [DEFAULT] repo = utilsforecast lib_name = utilsforecast -version = 0.2.10 +version = 0.2.11 min_python = 3.8 license = apache2 black_formatting = True diff --git a/utilsforecast/__init__.py b/utilsforecast/__init__.py index 6232f7a..5635676 100644 --- a/utilsforecast/__init__.py +++ b/utilsforecast/__init__.py @@ -1 +1 @@ -__version__ = "0.2.10" +__version__ = "0.2.11" diff --git a/utilsforecast/feature_engineering.py b/utilsforecast/feature_engineering.py index 9502136..bb3c085 100644 --- a/utilsforecast/feature_engineering.py +++ b/utilsforecast/feature_engineering.py @@ -212,7 +212,7 @@ def _compute_time_feature( if isinstance(times, pd.DatetimeIndex): if feature in ("week", "weekofyear"): times = times.isocalendar() - feat_vals = getattr(times, feature) + feat_vals = getattr(times, feature).to_numpy() else: feat_vals = getattr(times.dt, feature)() return feat_name, feat_vals @@ -228,7 +228,7 @@ def _add_time_features( if isinstance(df, pd.DataFrame): times = pd.Index(unique_times) time2pos = {time: i for i, time in enumerate(times)} - restore_idxs = df[time_col].map(time2pos) + restore_idxs = df[time_col].map(time2pos).to_numpy() for feature in features: name, vals = _compute_time_feature(times, feature) df[name] = vals[restore_idxs]