Skip to content

Commit

Permalink
Add option to filter by measurand (#868)
Browse files Browse the repository at this point in the history
Co-authored-by: Philip Hackstock <[email protected]>
  • Loading branch information
danielhuppmann and phackstock authored Aug 5, 2024
1 parent 0ce8ce9 commit 9e2117e
Show file tree
Hide file tree
Showing 8 changed files with 609 additions and 878 deletions.
5 changes: 5 additions & 0 deletions RELEASE_NOTES.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
# Next release

- [#868](https://github.com/IAMconsortium/pyam/pull/868) Support filtering by a `measurand` argument with tuples of
  variable and unit

# Release v2.2.3

Bump **ixmp4** minimum dependency to latest version and pin **numpy** to <2.0.
Expand Down
1,376 changes: 524 additions & 852 deletions poetry.lock

Large diffs are not rendered by default.

35 changes: 19 additions & 16 deletions pyam/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,9 @@
from pyam.compute import IamComputeAccessor
from pyam.filter import (
datetime_match,
filter_by_col,
filter_by_dt_arg,
filter_by_measurand,
filter_by_time_domain,
filter_by_year,
)
Expand Down Expand Up @@ -306,8 +308,8 @@ def print_meta_row(m, t, lst):
[
print_meta_row(m, t, self.meta[m].unique())
for m, t in zip(
self.meta.columns[0:meta_rows], self.meta.dtypes[0:meta_rows]
)
self.meta.columns[0:meta_rows], self.meta.dtypes[0:meta_rows]
)
]
)
# print `...` if more than `meta_rows` columns
Expand Down Expand Up @@ -1851,7 +1853,8 @@ def slice(self, *, keep=True, **kwargs):
The following arguments are available for filtering:
- 'model', 'scenario', 'region', 'variable', 'unit':
string or list of strings, where `*` can be used as a wildcard
string or list of strings
- 'measurand': a tuple (or list of tuples) of 'variable' and 'unit'
- 'meta' columns: mapping of column name to allowed values
- 'exclude': values of :attr:`exclude`
- 'index': list of model, scenario 2-tuples or :class:`pandas.MultiIndex`
Expand All @@ -1865,6 +1868,8 @@ def slice(self, *, keep=True, **kwargs):
('month', 'hour', 'time')
- 'regexp=True' disables pseudo-regexp syntax in `pattern_match()`
In any string filters, `*` is interpreted as wildcard.
"""

_keep = self._apply_filters(**kwargs)
Expand Down Expand Up @@ -1929,6 +1934,9 @@ def _apply_filters(self, level=None, **filters): # noqa: C901
regexp = filters.pop("regexp", False)
keep = np.ones(len(self), dtype=bool)

if "variable" in filters and "measurand" in filters:
raise ValueError("Filter by `variable` and `measurand` not supported")

# filter by columns and list of values
for col, values in filters.items():
# treat `_apply_filters(col=None)` as no filter applied
Expand Down Expand Up @@ -1998,25 +2006,20 @@ def _apply_filters(self, level=None, **filters): # noqa: C901

keep_col = datetime_match(self.get_data_column("time"), values)

elif col in self.dimensions:
levels, codes = get_index_levels_codes(self._data, col)
elif col == "measurand":
keep_col = filter_by_measurand(self._data, values, regexp, level)

matches = pattern_match(
levels,
values,
regexp=regexp,
level=level if col == "variable" else None,
has_nan=True,
return_codes=True,
)
keep_col = get_keep_col(codes, matches)
elif col in self.dimensions:
_level = level if col == "variable" else None
keep_col = filter_by_col(self._data, col, values, regexp, _level)

else:
raise ValueError(f"Filter by `{col}` not supported!")
raise ValueError(f"Filter by `{col}` not supported")

keep = np.logical_and(keep, keep_col)

if level is not None and "variable" not in filters:
if level is not None and not ("variable" in filters or "measurand" in filters):
# if level and variable/measurand is given, level-filter is applied there
col = "variable"
lvl_index, lvl_codes = get_index_levels_codes(self._data, col)
matches = find_depth(lvl_index, level=level)
Expand Down
33 changes: 31 additions & 2 deletions pyam/filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,44 @@
import numpy as np
import pandas as pd

from pyam.index import get_keep_col
from pyam.utils import to_list
from pyam.index import get_index_levels_codes, get_keep_col
from pyam.utils import is_str, pattern_match, to_list

FILTER_DATETIME_ATTRS = {
"month": (["%b", "%B"], "tm_mon", "months"),
"day": (["%a", "%A"], "tm_wday", "days"),
}


def filter_by_col(data, col, values, regexp, level=None):
    """Return a boolean mask for rows of `data` whose index level `col` matches `values`.

    Pattern matching is delegated to :func:`pattern_match` on the unique level
    values; the resulting matches are mapped back to rows via the level codes.
    """
    index_levels, index_codes = get_index_levels_codes(data, col)
    matched_codes = pattern_match(
        index_levels,
        values,
        regexp=regexp,
        level=level,
        has_nan=True,
        return_codes=True,
    )
    return get_keep_col(index_codes, matched_codes)


def filter_by_measurand(data, values, regexp, level=None):
    """Return a boolean mask for rows matching a measurand (variable-unit pair).

    Parameters
    ----------
    data
        Indexed data with 'variable' and 'unit' index levels
        (presumably a :class:`pandas.Series` — confirm against callers).
    values : tuple of str, or iterable of such tuples
        A single measurand ``(variable, unit)`` or an iterable of measurands.
    regexp : bool
        Passed through to the pattern matching; `True` disables the
        pseudo-regexp syntax.
    level : int, optional
        Depth filter, applied to the 'variable' dimension only.
    """
    # Base case: a single measurand, i.e., a (variable, unit) pair of strings
    if len(values) == 2 and is_str(values[0]) and is_str(values[1]):
        variable, unit = values
        return np.logical_and(
            filter_by_col(data, "variable", variable, regexp, level),
            filter_by_col(data, "unit", unit, regexp),
        )
    # Otherwise, `values` is an iterable of measurands; keep rows matching any
    # of them. NOTE: checking the structure explicitly (above) instead of
    # unpacking `variable, unit = values` fixes a ValueError ("too many values
    # to unpack") that the unconditional unpacking raised for three or more
    # measurands before this loop could run.
    keep_col = np.zeros(len(data), dtype=bool)
    for measurand in values:
        keep_col = np.logical_or(
            keep_col, filter_by_measurand(data, measurand, regexp, level)
        )
    return keep_col


def filter_by_time_domain(values, levels, codes):
"""Internal implementation to filter by time domain"""

Expand Down
2 changes: 1 addition & 1 deletion pyam/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
REQUIRED_COLS = ["region", "variable", "unit"]

# illegal terms for data/meta column names to prevent attribute conflicts
ILLEGAL_COLS = ["data", "meta", "level", "exclude", ""]
ILLEGAL_COLS = ["data", "meta", "level", "exclude", "measurand", ""]

# dictionary to translate column count to Excel column names
NUMERIC_TO_STR = dict(
Expand Down
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,8 @@ classifiers = [
python = ">=3.10, <3.13"
iam-units = ">=2020.4.21"
ixmp4 = ">=0.9.0"
matplotlib = ">=3.6.0"
matplotlib = ">=3.6.0, <=3.9.0" # quickfix due to failing tests on Windows
# see github.com/matplotlib/matplotlib/issues/28551
numpy = ">=1.26.0, <2.0"
openpyxl = ">=3.1.2"
pandas = ">=2.1.2"
Expand Down
31 changes: 26 additions & 5 deletions tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -431,14 +431,35 @@ def test_filter_empty_df():
assert len(obs) == 0


def test_filter_variable_and_depth(test_df):
obs = test_df.filter(variable="*rimary*C*", level=0).variable
def test_variable_and_measurand_raises(test_df):
    """Filtering by both `variable` and `measurand` must raise a ValueError."""
    with pytest.raises(ValueError):
        test_df.filter(variable="foo", measurand=("foo", "bar"))


@pytest.mark.parametrize(
    "filter_args",
    (dict(variable="*rimary*C*"), dict(measurand=("*rimary*C*", "EJ/*"))),
)
def test_filter_variable_and_depth(test_df, filter_args):
    """Variable- and measurand-filters honor the `level` (depth) argument."""
    # level=0 keeps only the top-level match; level=1 matches nothing here
    for level, expected in ((0, ["Primary Energy|Coal"]), (1, [])):
        assert test_df.filter(**filter_args, level=level).variable == expected


def test_filter_measurand_list(test_df):
    """A list of measurand tuples keeps rows matching any of the pairs."""
    # rename one variable and one unit so each measurand matches a distinct row
    _data = test_df.data
    _data.loc[4, "variable"] = "foo"
    _data.loc[5, "unit"] = "bar"

    obs = IamDataFrame(_data).filter(
        measurand=(("foo", "EJ/yr"), ("Primary Energy", "bar"))
    )

    assert set(obs.variable) == {"Primary Energy", "foo"}
    assert set(obs.unit) == {"EJ/yr", "bar"}
    assert obs.scenario == ["scen_b"]


def test_variable_depth_0_keep_false(test_df):
    # keep=False inverts the filter: depth-0 variables are dropped,
    # leaving only the deeper "Primary Energy|Coal"
    obs = test_df.filter(level=0, keep=False).variable
    assert obs == ["Primary Energy|Coal"]
Expand Down Expand Up @@ -545,13 +566,13 @@ def test_meta_idx(test_df):
assert len(_meta_idx(test_df.data)) == 2


def test_filter_by_bool(test_df):
def test_filter_meta_by_bool(test_df):
    """Filtering by a boolean meta column keeps only the matching scenario."""
    test_df.set_meta([True, False], name="meta_bool")
    assert test_df.filter(meta_bool=True).scenario == ["scen_a"]


def test_filter_by_int(test_df):
def test_filter_meta_by_int(test_df):
    """Filtering by an integer meta column treats the values as a whitelist."""
    test_df.set_meta([1, 2], name="meta_int")
    assert test_df.filter(meta_int=[1, 3]).scenario == ["scen_a"]
Expand Down
2 changes: 1 addition & 1 deletion tests/test_feature_interpolate.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def test_interpolate_with_list(test_df_year):


def test_interpolate_with_numpy_list(test_df_year):
    """Interpolation accepts a numpy range (np.r_) as the list of years."""
    test_df_year.interpolate(np.r_[2007 : 2008 + 1], inplace=True)
    obs = test_df_year.filter(year=[2007, 2008])._data.values
    npt.assert_allclose(obs, [3, 4, 1.5, 2, 4, 5])

Expand Down

0 comments on commit 9e2117e

Please sign in to comment.