Refactor to use validation module #771

Merged · 8 commits · Aug 22, 2023
1 change: 1 addition & 0 deletions RELEASE_NOTES.md
@@ -19,6 +19,7 @@ instead of `pyam.to_list()`.

## Individual updates

- [#771](https://github.com/IAMconsortium/pyam/pull/771) Refactor to start a separate validation module
- [#764](https://github.com/IAMconsortium/pyam/pull/764) Clean-up exposing internal methods and attributes
- [#763](https://github.com/IAMconsortium/pyam/pull/763) Implement a fix against carrying over unused levels when initializing from an indexed pandas object
- [#759](https://github.com/IAMconsortium/pyam/pull/759) Excise "exclude" column from meta and add an own attribute
52 changes: 13 additions & 39 deletions pyam/core.py
@@ -30,6 +30,7 @@
read_file,
read_pandas,
format_data,
make_index,
merge_meta,
merge_exclude,
pattern_match,
@@ -67,6 +68,7 @@
)
from pyam.time import swap_time_for_year, swap_year_for_time
from pyam.logging import raise_data_error, deprecation_warning
from pyam.validation import _exclude_on_fail

logger = logging.getLogger(__name__)

@@ -178,7 +180,7 @@
self._data, index, self.time_col, self.extra_cols = _data

# define `meta` dataframe for categorization & quantitative indicators
_index = _make_index(self._data, cols=index)
_index = make_index(self._data, cols=index)
self.meta = pd.DataFrame(index=_index)
self.exclude = False

@@ -948,7 +950,7 @@
run_control().update({kind: {name: {value: arg}}})
# find all data that matches categorization
rows = _apply_criteria(self._data, criteria, in_range=True, return_test="all")
idx = _make_index(rows, cols=self.index.names)
idx = make_index(rows, cols=self.index.names)

if len(idx) == 0:
logger.info("No scenarios satisfy the criteria")
@@ -1037,7 +1039,7 @@
index_missing_required = pd.concat([index_none, index_incomplete])
if not index_missing_required.empty:
if exclude_on_fail:
self._exclude_on_fail(index_missing_required)
_exclude_on_fail(self, index_missing_required)
return index_missing_required

def require_variable(self, variable, unit=None, year=None, exclude_on_fail=False):
@@ -1082,7 +1084,7 @@
)

if exclude_on_fail:
self._exclude_on_fail(idx)
_exclude_on_fail(self, idx)

logger.info(msg.format(n, variable))
return pd.DataFrame(index=idx).reset_index()
@@ -1117,7 +1119,7 @@
logger.info(msg.format(len(df), len(self.data)))

if exclude_on_fail and len(df) > 0:
self._exclude_on_fail(df)
_exclude_on_fail(self, df)
return df.reset_index()

def rename(
@@ -1193,7 +1195,7 @@

# renaming is only applied where a filter matches for all given columns
rows = ret._apply_filters(**filters)
idx = ret.meta.index.isin(_make_index(ret._data[rows], cols=meta_idx))
idx = ret.meta.index.isin(make_index(ret._data[rows], cols=meta_idx))

# apply renaming changes (for `data` only on the index)
_data_index = ret._data.index
@@ -1486,7 +1488,7 @@
logger.info(msg.format(variable, sum(rows), len(df_var)))

if exclude_on_fail:
self._exclude_on_fail(_meta_idx(df_var[rows].reset_index()))
_exclude_on_fail(self, _meta_idx(df_var[rows].reset_index()))

return pd.concat(
[df_var[rows], df_components[rows]],
@@ -1641,7 +1643,7 @@
logger.info(msg.format(variable, sum(rows), len(df_region)))

if exclude_on_fail:
self._exclude_on_fail(_meta_idx(df_region[rows].reset_index()))
_exclude_on_fail(self, _meta_idx(df_region[rows].reset_index()))

Codecov / codecov/patch annotation on pyam/core.py#L1646: Added line #L1646 was not covered by tests.

_df = pd.concat(
[
@@ -1822,19 +1824,6 @@
]
]

def _exclude_on_fail(self, df):
"""Assign a selection of scenarios as `exclude: True`"""
idx = (
df
if isinstance(df, pd.MultiIndex)
else _make_index(df, cols=self.index.names)
)
self.exclude[idx] = True
n = len(idx)
logger.info(
f"{n} scenario{s(n)} failed validation and will be set as `exclude=True`."
)

def slice(self, keep=True, **kwargs):
"""Return a (filtered) slice object of the IamDataFrame timeseries data index

@@ -1908,7 +1897,7 @@
logger.info(msg)

# downselect `meta` dataframe
idx = _make_index(ret._data, cols=self.index.names)
idx = make_index(ret._data, cols=self.index.names)
if len(idx) == 0:
logger.warning("Filtered IamDataFrame is empty!")
ret.meta = ret.meta.loc[idx]
Expand Down Expand Up @@ -1944,7 +1933,7 @@
f"Filter by `exclude` requires a boolean, found: {values}"
)
exclude_index = (self.exclude[self.exclude == values]).index
keep_col = _make_index(
keep_col = make_index(
self._data, cols=self.index.names, unique=False
).isin(exclude_index)

@@ -1953,7 +1942,7 @@
self.meta[col], values, regexp=regexp, has_nan=True
)
cat_idx = self.meta[matches].index
keep_col = _make_index(
keep_col = make_index(
self._data, cols=self.index.names, unique=False
).isin(cat_idx)

@@ -2741,21 +2730,6 @@
return df


def _make_index(df, cols=META_IDX, unique=True):
"""Create an index from the columns/index of a dataframe or series"""

def _get_col(c):
try:
return df.index.get_level_values(c)
except KeyError:
return df[c]

index = list(zip(*[_get_col(col) for col in cols]))
if unique:
index = pd.unique(index)
return pd.MultiIndex.from_tuples(index, names=tuple(cols))


def _empty_iamframe(index):
"""Return an empty IamDataFrame with the correct index columns"""
return IamDataFrame(pd.DataFrame([], columns=index))
15 changes: 15 additions & 0 deletions pyam/utils.py
@@ -520,6 +520,21 @@ def merge_exclude(left, right, ignore_conflict=False):
return pd.concat([left, right.loc[diff]], sort=False)


def make_index(df, cols=META_IDX, unique=True):
"""Create an index from the columns/index of a dataframe or series"""

def _get_col(c):
try:
return df.index.get_level_values(c)
except KeyError:
return df[c]

index = list(zip(*[_get_col(col) for col in cols]))
if unique:
index = pd.unique(index)
return pd.MultiIndex.from_tuples(index, names=tuple(cols))


def pattern_match(
data, values, level=None, regexp=False, has_nan=False, return_codes=False
):
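Since `make_index` is now exposed in `pyam.utils`, a quick usage sketch may help; the data below is made up for illustration, and the import path assumes a pyam version that includes this PR:

```python
import pandas as pd

from pyam.utils import make_index  # available at this path after this PR

# Illustrative long-format data; the rows and values are made up.
df = pd.DataFrame(
    {
        "model": ["model_a", "model_a", "model_b"],
        "scenario": ["scen_a", "scen_a", "scen_b"],
        "value": [1.0, 2.0, 3.0],
    }
)

# Build a (model, scenario) MultiIndex from the columns; with the
# default unique=True, the duplicate model_a/scen_a rows collapse.
idx = make_index(df, cols=["model", "scenario"])
print(idx)
# MultiIndex([('model_a', 'scen_a'), ('model_b', 'scen_b')],
#            names=['model', 'scenario'])
```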
19 changes: 19 additions & 0 deletions pyam/validation.py
@@ -0,0 +1,19 @@
import logging
import pandas as pd

from pyam.utils import make_index, s
Contributor: I'd suggest renaming the function s into something more descriptive.

Member (Author): Any good ideas? The point of that method is to have something short that can be used in a docstring...

Contributor: Maybe plural_s? Although I typed that before I'd seen the use in a docstring. There it's quite readable, so it's ok from my side to keep it this way if we don't have any good ideas.
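The body of `s` is not shown in this diff; as context for the thread above, here is a minimal sketch of what such a pluralization helper could look like (an assumption for illustration, not necessarily the actual `pyam.utils.s` implementation):

```python
def s(n):
    """Return a plural "s" if `n` is not 1, for concise log messages.

    Sketch only; the actual pyam.utils.s may differ.
    """
    return "s" if n != 1 else ""


# Usage mirroring the log message in _exclude_on_fail below:
for n in (1, 3):
    print(f"{n} scenario{s(n)} failed validation.")
# 1 scenario failed validation.
# 3 scenarios failed validation.
```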


logger = logging.getLogger(__name__)


def _exclude_on_fail(df, index):
"""Assign a selection of scenarios as `exclude: True`"""

if not isinstance(index, pd.MultiIndex):
index = make_index(index, cols=df.index.names)

df.exclude[index] = True
n = len(index)
logger.info(
f"{n} scenario{s(n)} failed validation and will be set as `exclude=True`."
)
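To illustrate what the relocated function does, here is a small self-contained sketch of the `exclude` update logic, using plain pandas stand-ins rather than an actual IamDataFrame:

```python
import pandas as pd

# Stand-in for IamDataFrame.exclude: a boolean Series indexed by
# (model, scenario), matching the meta index that pyam uses.
index = pd.MultiIndex.from_tuples(
    [("model_a", "scen_a"), ("model_a", "scen_b")],
    names=["model", "scenario"],
)
exclude = pd.Series([False, False], index=index, name="exclude")

# _exclude_on_fail flips the flag for the failing scenarios; the
# core operation is equivalent to a .loc assignment on that index.
failed = pd.MultiIndex.from_tuples(
    [("model_a", "scen_b")], names=["model", "scenario"]
)
exclude.loc[failed] = True

print(exclude)
# model    scenario
# model_a  scen_a      False
#          scen_b       True
# Name: exclude, dtype: bool
```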
1 change: 0 additions & 1 deletion tests/test_core.py
@@ -1,6 +1,5 @@
import logging
import pytest
import re
import datetime

import numpy as np
28 changes: 23 additions & 5 deletions tests/test_feature_validation.py
@@ -3,43 +3,61 @@
import pytest

from pyam import IamDataFrame, validate, categorize, require_variable
from pyam.utils import META_IDX
from pyam.utils import IAMC_IDX, META_IDX

from .conftest import TEST_YEARS


# add a row for `Primary Energy|Gas` to test all permutations
DATA_GAS = pd.DataFrame(
[
["model_a", "scen_a", "World", "Primary Energy|Gas", "EJ/yr", 0.5, 3],
["model_a", "scen_b", "World", "Primary Energy|Gas", "EJ/yr", 1, 2],
],
columns=IAMC_IDX + TEST_YEARS,
)


@pytest.mark.parametrize(
"kwargs",
(
dict(),
dict(variable="Primary Energy"),
dict(variable="Primary Energy", year=[2005, 2010]),
dict(variable=["Primary Energy"], year=[2005, 2010]),
dict(variable=["Primary Energy", "Primary Energy|Gas"], year=[2005, 2010]),
),
)
def test_require_data_pass(test_df_year, kwargs):
# check that IamDataFrame with all required data returns None
assert test_df_year.require_data(**kwargs) is None
df = test_df_year.append(IamDataFrame(DATA_GAS))
assert df.require_data(**kwargs) is None


@pytest.mark.parametrize(
"kwargs",
(
dict(variable="Primary Energy|Coal"),
dict(variable="Primary Energy", year=[2005, 2010]),
dict(variable=["Primary Energy"], year=[2005, 2010]),
dict(variable=["Primary Energy", "Primary Energy|Gas"], year=[2005, 2010]),
),
)
@pytest.mark.parametrize("exclude_on_fail", (False, True))
def test_require_data(test_df_year, kwargs, exclude_on_fail):
# check different ways of failing when not all required data is present

test_df_year._data = test_df_year._data[0:5] # remove value for scen_b & 2010
df = test_df_year.append(IamDataFrame(DATA_GAS))

obs = test_df_year.require_data(**kwargs, exclude_on_fail=exclude_on_fail)
obs = df.require_data(**kwargs, exclude_on_fail=exclude_on_fail)
exp = pd.DataFrame([["model_a", "scen_b"]], columns=["model", "scenario"])
pdt.assert_frame_equal(obs, exp)

if exclude_on_fail:
assert list(test_df_year.exclude) == [False, True]
assert list(df.exclude) == [False, True]
else:
assert list(test_df_year.exclude) == [False, False]
assert list(df.exclude) == [False, False]


def test_require_variable_pass(test_df):