Skip to content

Commit

Permalink
Add option to filter by measurand (#868)
Browse files Browse the repository at this point in the history
Co-authored-by: Philip Hackstock <[email protected]>
  • Loading branch information
danielhuppmann and phackstock authored Aug 5, 2024
1 parent 0ce8ce9 commit 9e2117e
Show file tree
Hide file tree
Showing 8 changed files with 609 additions and 878 deletions.
5 changes: 5 additions & 0 deletions RELEASE_NOTES.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
# Next release

- [#868](https://github.com/IAMconsortium/pyam/pull/868) Support filtering by a `measurand` argument with tuples of
  variable and unit

# Release v2.2.3

Bump **ixmp4** minimum dependency to latest version and pin **numpy** to <2.0.
Expand Down
1,376 changes: 524 additions & 852 deletions poetry.lock

Large diffs are not rendered by default.

35 changes: 19 additions & 16 deletions pyam/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,9 @@
from pyam.compute import IamComputeAccessor
from pyam.filter import (
datetime_match,
filter_by_col,
filter_by_dt_arg,
filter_by_measurand,
filter_by_time_domain,
filter_by_year,
)
Expand Down Expand Up @@ -306,8 +308,8 @@ def print_meta_row(m, t, lst):
[
print_meta_row(m, t, self.meta[m].unique())
for m, t in zip(
self.meta.columns[0:meta_rows], self.meta.dtypes[0:meta_rows]
)
self.meta.columns[0:meta_rows], self.meta.dtypes[0:meta_rows]
)
]
)
# print `...` if more than `meta_rows` columns
Expand Down Expand Up @@ -1851,7 +1853,8 @@ def slice(self, *, keep=True, **kwargs):
The following arguments are available for filtering:
- 'model', 'scenario', 'region', 'variable', 'unit':
string or list of strings, where `*` can be used as a wildcard
string or list of strings
- 'measurand': a tuple (or list of tuples) of 'variable' and 'unit'
- 'meta' columns: mapping of column name to allowed values
- 'exclude': values of :attr:`exclude`
- 'index': list of model, scenario 2-tuples or :class:`pandas.MultiIndex`
Expand All @@ -1865,6 +1868,8 @@ def slice(self, *, keep=True, **kwargs):
('month', 'hour', 'time')
- 'regexp=True' disables pseudo-regexp syntax in `pattern_match()`
In any string filters, `*` is interpreted as wildcard.
"""

_keep = self._apply_filters(**kwargs)
Expand Down Expand Up @@ -1929,6 +1934,9 @@ def _apply_filters(self, level=None, **filters): # noqa: C901
regexp = filters.pop("regexp", False)
keep = np.ones(len(self), dtype=bool)

if "variable" in filters and "measurand" in filters:
raise ValueError("Filter by `variable` and `measurand` not supported")

# filter by columns and list of values
for col, values in filters.items():
# treat `_apply_filters(col=None)` as no filter applied
Expand Down Expand Up @@ -1998,25 +2006,20 @@ def _apply_filters(self, level=None, **filters): # noqa: C901

keep_col = datetime_match(self.get_data_column("time"), values)

elif col in self.dimensions:
levels, codes = get_index_levels_codes(self._data, col)
elif col == "measurand":
keep_col = filter_by_measurand(self._data, values, regexp, level)

matches = pattern_match(
levels,
values,
regexp=regexp,
level=level if col == "variable" else None,
has_nan=True,
return_codes=True,
)
keep_col = get_keep_col(codes, matches)
elif col in self.dimensions:
_level = level if col == "variable" else None
keep_col = filter_by_col(self._data, col, values, regexp, _level)

else:
raise ValueError(f"Filter by `{col}` not supported!")
raise ValueError(f"Filter by `{col}` not supported")

keep = np.logical_and(keep, keep_col)

if level is not None and "variable" not in filters:
if level is not None and not ("variable" in filters or "measurand" in filters):
# if level and variable/measurand is given, level-filter is applied there
col = "variable"
lvl_index, lvl_codes = get_index_levels_codes(self._data, col)
matches = find_depth(lvl_index, level=level)
Expand Down
33 changes: 31 additions & 2 deletions pyam/filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,44 @@
import numpy as np
import pandas as pd

from pyam.index import get_keep_col
from pyam.utils import to_list
from pyam.index import get_index_levels_codes, get_keep_col
from pyam.utils import is_str, pattern_match, to_list

FILTER_DATETIME_ATTRS = {
"month": (["%b", "%B"], "tm_mon", "months"),
"day": (["%a", "%A"], "tm_wday", "days"),
}


def filter_by_col(data, col, values, regexp, level=None):
    """Return a boolean mask for rows of `data` whose index level `col` matches `values`.

    Pattern matching is delegated to :func:`pattern_match` on the unique level
    values; the resulting matches are mapped back to rows via the level codes.
    """
    index_levels, index_codes = get_index_levels_codes(data, col)
    matched_codes = pattern_match(
        index_levels,
        values,
        regexp=regexp,
        level=level,
        has_nan=True,
        return_codes=True,
    )
    return get_keep_col(index_codes, matched_codes)


def filter_by_measurand(data, values, regexp, level=None):
    """Return a boolean mask for rows matching a measurand (variable-unit pair).

    Parameters
    ----------
    data
        Indexed data with 'variable' and 'unit' index levels
        (presumably a :class:`pandas.Series` — confirm against callers).
    values : tuple of str, or iterable of such tuples
        A single measurand ``(variable, unit)`` or an iterable of measurands.
    regexp : bool
        Passed through to the pattern matching; `True` disables the
        pseudo-regexp syntax.
    level : int, optional
        Depth filter, applied to the 'variable' dimension only.
    """
    # Base case: a single measurand, i.e., a (variable, unit) pair of strings
    if len(values) == 2 and is_str(values[0]) and is_str(values[1]):
        variable, unit = values
        return np.logical_and(
            filter_by_col(data, "variable", variable, regexp, level),
            filter_by_col(data, "unit", unit, regexp),
        )
    # Otherwise, `values` is an iterable of measurands; keep rows matching any
    # of them. NOTE: checking the structure explicitly (above) instead of
    # unpacking `variable, unit = values` fixes a ValueError ("too many values
    # to unpack") that the unconditional unpacking raised for three or more
    # measurands before this loop could run.
    keep_col = np.zeros(len(data), dtype=bool)
    for measurand in values:
        keep_col = np.logical_or(
            keep_col, filter_by_measurand(data, measurand, regexp, level)
        )
    return keep_col


def filter_by_time_domain(values, levels, codes):
"""Internal implementation to filter by time domain"""

Expand Down
2 changes: 1 addition & 1 deletion pyam/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
REQUIRED_COLS = ["region", "variable", "unit"]

# illegal terms for data/meta column names to prevent attribute conflicts
ILLEGAL_COLS = ["data", "meta", "level", "exclude", ""]
ILLEGAL_COLS = ["data", "meta", "level", "exclude", "measurand", ""]

# dictionary to translate column count to Excel column names
NUMERIC_TO_STR = dict(
Expand Down
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,8 @@ classifiers = [
python = ">=3.10, <3.13"
iam-units = ">=2020.4.21"
ixmp4 = ">=0.9.0"
matplotlib = ">=3.6.0"
matplotlib = ">=3.6.0, <=3.9.0" # quickfix due to failing tests on Windows
# see github.com/matplotlib/matplotlib/issues/28551
numpy = ">=1.26.0, <2.0"
openpyxl = ">=3.1.2"
pandas = ">=2.1.2"
Expand Down
31 changes: 26 additions & 5 deletions tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -431,14 +431,35 @@ def test_filter_empty_df():
assert len(obs) == 0


def test_filter_variable_and_depth(test_df):
obs = test_df.filter(variable="*rimary*C*", level=0).variable
def test_variable_and_measurand_raises(test_df):
    """Filtering by both `variable` and `measurand` must raise a ValueError."""
    with pytest.raises(ValueError):
        test_df.filter(variable="foo", measurand=("foo", "bar"))


@pytest.mark.parametrize(
    "filter_args",
    (dict(variable="*rimary*C*"), dict(measurand=("*rimary*C*", "EJ/*"))),
)
def test_filter_variable_and_depth(test_df, filter_args):
    """Variable- and measurand-filters honor the `level` (depth) argument."""
    # level=0 keeps only the top-level match; level=1 matches nothing here
    for level, expected in ((0, ["Primary Energy|Coal"]), (1, [])):
        assert test_df.filter(**filter_args, level=level).variable == expected


def test_filter_measurand_list(test_df):
    """A list of measurand tuples keeps rows matching any of the pairs."""
    # rename one variable and one unit so each measurand matches a distinct row
    _data = test_df.data
    _data.loc[4, "variable"] = "foo"
    _data.loc[5, "unit"] = "bar"

    obs = IamDataFrame(_data).filter(
        measurand=(("foo", "EJ/yr"), ("Primary Energy", "bar"))
    )

    assert set(obs.variable) == {"Primary Energy", "foo"}
    assert set(obs.unit) == {"EJ/yr", "bar"}
    assert obs.scenario == ["scen_b"]


def test_variable_depth_0_keep_false(test_df):
    # keep=False inverts the filter: depth-0 variables are dropped,
    # leaving only the deeper "Primary Energy|Coal"
    obs = test_df.filter(level=0, keep=False).variable
    assert obs == ["Primary Energy|Coal"]
Expand Down Expand Up @@ -545,13 +566,13 @@ def test_meta_idx(test_df):
assert len(_meta_idx(test_df.data)) == 2


def test_filter_by_bool(test_df):
def test_filter_meta_by_bool(test_df):
    """Filtering by a boolean meta column keeps only the matching scenario."""
    test_df.set_meta([True, False], name="meta_bool")
    assert test_df.filter(meta_bool=True).scenario == ["scen_a"]


def test_filter_by_int(test_df):
def test_filter_meta_by_int(test_df):
    """Filtering by an integer meta column treats the values as a whitelist."""
    test_df.set_meta([1, 2], name="meta_int")
    assert test_df.filter(meta_int=[1, 3]).scenario == ["scen_a"]
Expand Down
2 changes: 1 addition & 1 deletion tests/test_feature_interpolate.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def test_interpolate_with_list(test_df_year):


def test_interpolate_with_numpy_list(test_df_year):
    """Interpolation accepts a numpy range (np.r_) as the list of years."""
    test_df_year.interpolate(np.r_[2007 : 2008 + 1], inplace=True)
    obs = test_df_year.filter(year=[2007, 2008])._data.values
    npt.assert_allclose(obs, [3, 4, 1.5, 2, 4, 5])

Expand Down

0 comments on commit 9e2117e

Please sign in to comment.