Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add option to filter by measurand #868

Merged
merged 10 commits into from
Aug 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions RELEASE_NOTES.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
# Next release

- [#868](https://github.com/IAMconsortium/pyam/pull/868) Support filtering by a `measurand` argument with tuples of
variable and units

# Release v2.2.3

Bump **ixmp4** minimum dependency to latest version and pin **numpy** to <2.0.
Expand Down
1,376 changes: 524 additions & 852 deletions poetry.lock

Large diffs are not rendered by default.

35 changes: 19 additions & 16 deletions pyam/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,9 @@
from pyam.compute import IamComputeAccessor
from pyam.filter import (
datetime_match,
filter_by_col,
filter_by_dt_arg,
filter_by_measurand,
filter_by_time_domain,
filter_by_year,
)
Expand Down Expand Up @@ -306,8 +308,8 @@ def print_meta_row(m, t, lst):
[
print_meta_row(m, t, self.meta[m].unique())
for m, t in zip(
self.meta.columns[0:meta_rows], self.meta.dtypes[0:meta_rows]
)
self.meta.columns[0:meta_rows], self.meta.dtypes[0:meta_rows]
)
]
)
# print `...` if more than `meta_rows` columns
Expand Down Expand Up @@ -1851,7 +1853,8 @@ def slice(self, *, keep=True, **kwargs):
The following arguments are available for filtering:

- 'model', 'scenario', 'region', 'variable', 'unit':
string or list of strings, where `*` can be used as a wildcard
string or list of strings
- 'measurand': a tuple (or list of tuples) of 'variable' and 'unit'
- 'meta' columns: mapping of column name to allowed values
- 'exclude': values of :attr:`exclude`
- 'index': list of model, scenario 2-tuples or :class:`pandas.MultiIndex`
Expand All @@ -1865,6 +1868,8 @@ def slice(self, *, keep=True, **kwargs):
('month', 'hour', 'time')
- 'regexp=True' disables pseudo-regexp syntax in `pattern_match()`

In any string filters, `*` is interpreted as wildcard.

"""

_keep = self._apply_filters(**kwargs)
Expand Down Expand Up @@ -1929,6 +1934,9 @@ def _apply_filters(self, level=None, **filters): # noqa: C901
regexp = filters.pop("regexp", False)
keep = np.ones(len(self), dtype=bool)

if "variable" in filters and "measurand" in filters:
raise ValueError("Filter by `variable` and `measurand` not supported")

# filter by columns and list of values
for col, values in filters.items():
# treat `_apply_filters(col=None)` as no filter applied
Expand Down Expand Up @@ -1998,25 +2006,20 @@ def _apply_filters(self, level=None, **filters): # noqa: C901

keep_col = datetime_match(self.get_data_column("time"), values)

elif col in self.dimensions:
levels, codes = get_index_levels_codes(self._data, col)
elif col == "measurand":
keep_col = filter_by_measurand(self._data, values, regexp, level)

matches = pattern_match(
levels,
values,
regexp=regexp,
level=level if col == "variable" else None,
has_nan=True,
return_codes=True,
)
keep_col = get_keep_col(codes, matches)
elif col in self.dimensions:
_level = level if col == "variable" else None
keep_col = filter_by_col(self._data, col, values, regexp, _level)

else:
raise ValueError(f"Filter by `{col}` not supported!")
raise ValueError(f"Filter by `{col}` not supported")

keep = np.logical_and(keep, keep_col)

if level is not None and "variable" not in filters:
if level is not None and not ("variable" in filters or "measurand" in filters):
# if level and variable/measurand is given, level-filter is applied there
col = "variable"
lvl_index, lvl_codes = get_index_levels_codes(self._data, col)
matches = find_depth(lvl_index, level=level)
Expand Down
33 changes: 31 additions & 2 deletions pyam/filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,44 @@
import numpy as np
import pandas as pd

from pyam.index import get_keep_col
from pyam.utils import to_list
from pyam.index import get_index_levels_codes, get_keep_col
from pyam.utils import is_str, pattern_match, to_list

FILTER_DATETIME_ATTRS = {
"month": (["%b", "%B"], "tm_mon", "months"),
"day": (["%a", "%A"], "tm_wday", "days"),
}


def filter_by_col(data, col, values, regexp, level=None):
    """Internal implementation to filter `data` by `values` along the index level `col`

    Parameters
    ----------
    data : object
        Indexed data passed to `get_index_levels_codes()`
        (presumably a :class:`pandas.Series` with a MultiIndex - verify against caller).
    col : str
        Name of the index level to filter on.
    values : str or list of str
        Value(s) or pattern(s) forwarded to `pattern_match()`.
    regexp : bool
        Forwarded to `pattern_match()` as `regexp`.
    level : optional
        Forwarded to `pattern_match()` as `level`.

    Returns
    -------
    The result of `get_keep_col()` for the codes of `col` and the matched codes.
    """
    col_levels, col_codes = get_index_levels_codes(data, col)

    # options are fixed here so that `pattern_match()` returns codes (not a mask)
    match_options = dict(regexp=regexp, level=level, has_nan=True, return_codes=True)
    matched_codes = pattern_match(col_levels, values, **match_options)

    return get_keep_col(col_codes, matched_codes)


def filter_by_measurand(data, values, regexp, level=None):
    """Internal implementation to filter `data` by one or several measurands

    A measurand is a pair of a 'variable' and a 'unit' string. A row is kept
    if it matches the variable *and* the unit of at least one measurand.

    Parameters
    ----------
    data : object
        Indexed data passed on to `filter_by_col()`.
    values : tuple of (str, str) or iterable of such tuples
        A single measurand or any number of measurands.
    regexp : bool
        Forwarded to `filter_by_col()`.
    level : optional
        Forwarded to `filter_by_col()` for the 'variable' column only.

    Returns
    -------
    numpy.ndarray of bool, or the combined result of `filter_by_col()`
        Mask of length `len(data)`.

    Raises
    ------
    ValueError
        If `values` (or one of its items) is a plain string instead of a
        (variable, unit) pair.
    """
    # a bare string is iterable, but iterating over its characters is never
    # what the caller meant (and would recurse on every single character)
    if is_str(values):
        raise ValueError(
            f"Measurand must be a tuple of 'variable' and 'unit', found '{values}'"
        )

    # materialize once so that length can be checked and generators are supported
    values = list(values)

    # check the shape *before* unpacking: unpacking first fails for any
    # iterable of measurands whose length is not exactly two
    if len(values) == 2 and is_str(values[0]) and is_str(values[1]):
        variable, unit = values
        return np.logical_and(
            filter_by_col(data, "variable", variable, regexp, level),
            filter_by_col(data, "unit", unit, regexp),
        )

    # values is an iterable of measurands, keep rows matching any of them
    keep_col = np.zeros(len(data), dtype=bool)
    for measurand in values:
        keep_col = np.logical_or(
            keep_col, filter_by_measurand(data, measurand, regexp, level)
        )
    return keep_col


def filter_by_time_domain(values, levels, codes):
"""Internal implementation to filter by time domain"""

Expand Down
2 changes: 1 addition & 1 deletion pyam/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
REQUIRED_COLS = ["region", "variable", "unit"]

# illegal terms for data/meta column names to prevent attribute conflicts
ILLEGAL_COLS = ["data", "meta", "level", "exclude", ""]
ILLEGAL_COLS = ["data", "meta", "level", "exclude", "measurand", ""]

# dictionary to translate column count to Excel column names
NUMERIC_TO_STR = dict(
Expand Down
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,8 @@ classifiers = [
python = ">=3.10, <3.13"
iam-units = ">=2020.4.21"
ixmp4 = ">=0.9.0"
matplotlib = ">=3.6.0"
matplotlib = ">=3.6.0, <=3.9.0" # quickfix due to failing tests on Windows
# see github.com/matplotlib/matplotlib/issues/28551
numpy = ">=1.26.0, <2.0"
openpyxl = ">=3.1.2"
pandas = ">=2.1.2"
Expand Down
31 changes: 26 additions & 5 deletions tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -431,14 +431,35 @@ def test_filter_empty_df():
assert len(obs) == 0


def test_filter_variable_and_depth(test_df):
obs = test_df.filter(variable="*rimary*C*", level=0).variable
def test_variable_and_measurand_raises(test_df):
    """Passing both `variable` and `measurand` to `filter()` raises a ValueError"""
    conflicting_filters = dict(variable="foo", measurand=("foo", "bar"))
    with pytest.raises(ValueError):
        test_df.filter(**conflicting_filters)


@pytest.mark.parametrize(
"filter_args",
(dict(variable="*rimary*C*"), dict(measurand=("*rimary*C*", "EJ/*"))),
)
def test_filter_variable_and_depth(test_df, filter_args):
obs = test_df.filter(**filter_args, level=0).variable
assert obs == ["Primary Energy|Coal"]

obs = test_df.filter(variable="*rimary*C*", level=1).variable
obs = test_df.filter(**filter_args, level=1).variable
assert obs == []


def test_filter_measurand_list(test_df):
    """Filtering by several measurands keeps rows matching any (variable, unit) pair"""
    # change the variable in row 4 and the unit in row 5 of the test data
    modified = test_df.data
    modified.loc[4, "variable"] = "foo"
    modified.loc[5, "unit"] = "bar"

    measurands = (("foo", "EJ/yr"), ("Primary Energy", "bar"))
    obs = IamDataFrame(modified).filter(measurand=measurands)

    assert set(obs.variable) == {"Primary Energy", "foo"}
    assert set(obs.unit) == {"EJ/yr", "bar"}
    assert obs.scenario == ["scen_b"]


def test_variable_depth_0_keep_false(test_df):
obs = test_df.filter(level=0, keep=False).variable
assert obs == ["Primary Energy|Coal"]
Expand Down Expand Up @@ -545,13 +566,13 @@ def test_meta_idx(test_df):
assert len(_meta_idx(test_df.data)) == 2


def test_filter_by_bool(test_df):
def test_filter_meta_by_bool(test_df):
test_df.set_meta([True, False], name="meta_bool")
obs = test_df.filter(meta_bool=True)
assert obs.scenario == ["scen_a"]


def test_filter_by_int(test_df):
def test_filter_meta_by_int(test_df):
test_df.set_meta([1, 2], name="meta_int")
obs = test_df.filter(meta_int=[1, 3])
assert obs.scenario == ["scen_a"]
Expand Down
2 changes: 1 addition & 1 deletion tests/test_feature_interpolate.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def test_interpolate_with_list(test_df_year):


def test_interpolate_with_numpy_list(test_df_year):
test_df_year.interpolate(np.r_[2007: 2008 + 1], inplace=True)
test_df_year.interpolate(np.r_[2007 : 2008 + 1], inplace=True)
obs = test_df_year.filter(year=[2007, 2008])._data.values
npt.assert_allclose(obs, [3, 4, 1.5, 2, 4, 5])

Expand Down