Refactor to use validation module #771

Merged · 8 commits · Aug 22, 2023
1 change: 1 addition & 0 deletions RELEASE_NOTES.md
@@ -19,6 +19,7 @@ instead of `pyam.to_list()`.

## Individual updates

- [#771](https://github.com/IAMconsortium/pyam/pull/771) Refactor to start a separate validation module
- [#764](https://github.com/IAMconsortium/pyam/pull/764) Clean-up exposing internal methods and attributes
- [#763](https://github.com/IAMconsortium/pyam/pull/763) Implement a fix against carrying over unused levels when initializing from an indexed pandas object
- [#759](https://github.com/IAMconsortium/pyam/pull/759) Excise "exclude" column from meta and add an own attribute
52 changes: 13 additions & 39 deletions pyam/core.py
@@ -30,6 +30,7 @@
read_file,
read_pandas,
format_data,
make_index,
merge_meta,
merge_exclude,
pattern_match,
@@ -67,6 +68,7 @@
)
from pyam.time import swap_time_for_year, swap_year_for_time
from pyam.logging import raise_data_error, deprecation_warning
from pyam.validation import _exclude_on_fail

logger = logging.getLogger(__name__)

@@ -178,7 +180,7 @@
self._data, index, self.time_col, self.extra_cols = _data

# define `meta` dataframe for categorization & quantitative indicators
_index = _make_index(self._data, cols=index)
_index = make_index(self._data, cols=index)
self.meta = pd.DataFrame(index=_index)
self.exclude = False

@@ -948,7 +950,7 @@
run_control().update({kind: {name: {value: arg}}})
# find all data that matches categorization
rows = _apply_criteria(self._data, criteria, in_range=True, return_test="all")
idx = _make_index(rows, cols=self.index.names)
idx = make_index(rows, cols=self.index.names)

if len(idx) == 0:
logger.info("No scenarios satisfy the criteria")
@@ -1037,7 +1039,7 @@
index_missing_required = pd.concat([index_none, index_incomplete])
if not index_missing_required.empty:
if exclude_on_fail:
self._exclude_on_fail(index_missing_required)
_exclude_on_fail(self, index_missing_required)
return index_missing_required

def require_variable(self, variable, unit=None, year=None, exclude_on_fail=False):
@@ -1082,7 +1084,7 @@
)

if exclude_on_fail:
self._exclude_on_fail(idx)
_exclude_on_fail(self, idx)

logger.info(msg.format(n, variable))
return pd.DataFrame(index=idx).reset_index()
@@ -1117,7 +1119,7 @@
logger.info(msg.format(len(df), len(self.data)))

if exclude_on_fail and len(df) > 0:
self._exclude_on_fail(df)
_exclude_on_fail(self, df)
return df.reset_index()

def rename(
@@ -1193,7 +1195,7 @@

# renaming is only applied where a filter matches for all given columns
rows = ret._apply_filters(**filters)
idx = ret.meta.index.isin(_make_index(ret._data[rows], cols=meta_idx))
idx = ret.meta.index.isin(make_index(ret._data[rows], cols=meta_idx))

# apply renaming changes (for `data` only on the index)
_data_index = ret._data.index
@@ -1486,7 +1488,7 @@
logger.info(msg.format(variable, sum(rows), len(df_var)))

if exclude_on_fail:
self._exclude_on_fail(_meta_idx(df_var[rows].reset_index()))
_exclude_on_fail(self, _meta_idx(df_var[rows].reset_index()))

return pd.concat(
[df_var[rows], df_components[rows]],
@@ -1641,7 +1643,7 @@
logger.info(msg.format(variable, sum(rows), len(df_region)))

if exclude_on_fail:
self._exclude_on_fail(_meta_idx(df_region[rows].reset_index()))
_exclude_on_fail(self, _meta_idx(df_region[rows].reset_index()))

Codecov / codecov/patch annotation on pyam/core.py#L1646: Added line #L1646 was not covered by tests.

_df = pd.concat(
[
@@ -1822,19 +1824,6 @@
]
]

def _exclude_on_fail(self, df):
"""Assign a selection of scenarios as `exclude: True`"""
idx = (
df
if isinstance(df, pd.MultiIndex)
else _make_index(df, cols=self.index.names)
)
self.exclude[idx] = True
n = len(idx)
logger.info(
f"{n} scenario{s(n)} failed validation and will be set as `exclude=True`."
)

def slice(self, keep=True, **kwargs):
"""Return a (filtered) slice object of the IamDataFrame timeseries data index

@@ -1908,7 +1897,7 @@
logger.info(msg)

# downselect `meta` dataframe
idx = _make_index(ret._data, cols=self.index.names)
idx = make_index(ret._data, cols=self.index.names)
if len(idx) == 0:
logger.warning("Filtered IamDataFrame is empty!")
ret.meta = ret.meta.loc[idx]
Expand Down Expand Up @@ -1944,7 +1933,7 @@
f"Filter by `exclude` requires a boolean, found: {values}"
)
exclude_index = (self.exclude[self.exclude == values]).index
keep_col = _make_index(
keep_col = make_index(
self._data, cols=self.index.names, unique=False
).isin(exclude_index)

@@ -1953,7 +1942,7 @@
self.meta[col], values, regexp=regexp, has_nan=True
)
cat_idx = self.meta[matches].index
keep_col = _make_index(
keep_col = make_index(
self._data, cols=self.index.names, unique=False
).isin(cat_idx)

@@ -2741,21 +2730,6 @@
return df


def _make_index(df, cols=META_IDX, unique=True):
"""Create an index from the columns/index of a dataframe or series"""

def _get_col(c):
try:
return df.index.get_level_values(c)
except KeyError:
return df[c]

index = list(zip(*[_get_col(col) for col in cols]))
if unique:
index = pd.unique(index)
return pd.MultiIndex.from_tuples(index, names=tuple(cols))


def _empty_iamframe(index):
"""Return an empty IamDataFrame with the correct index columns"""
return IamDataFrame(pd.DataFrame([], columns=index))
15 changes: 15 additions & 0 deletions pyam/utils.py
@@ -520,6 +520,21 @@ def merge_exclude(left, right, ignore_conflict=False):
return pd.concat([left, right.loc[diff]], sort=False)


def make_index(df, cols=META_IDX, unique=True):
"""Create an index from the columns/index of a dataframe or series"""

def _get_col(c):
try:
return df.index.get_level_values(c)
except KeyError:
return df[c]

index = list(zip(*[_get_col(col) for col in cols]))
if unique:
index = pd.unique(index)
return pd.MultiIndex.from_tuples(index, names=tuple(cols))


def pattern_match(
data, values, level=None, regexp=False, has_nan=False, return_codes=False
):
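Since `make_index` is now exposed in `pyam.utils`, a quick usage sketch may help; the data below is made up for illustration, and the import path assumes a pyam version that includes this PR:

```python
import pandas as pd

from pyam.utils import make_index  # available at this path after this PR

# Illustrative long-format data; the rows and values are made up.
df = pd.DataFrame(
    {
        "model": ["model_a", "model_a", "model_b"],
        "scenario": ["scen_a", "scen_a", "scen_b"],
        "value": [1.0, 2.0, 3.0],
    }
)

# Build a (model, scenario) MultiIndex from the columns; with the
# default unique=True, the duplicate model_a/scen_a rows collapse.
idx = make_index(df, cols=["model", "scenario"])
print(idx)
# MultiIndex([('model_a', 'scen_a'), ('model_b', 'scen_b')],
#            names=['model', 'scenario'])
```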
19 changes: 19 additions & 0 deletions pyam/validation.py
@@ -0,0 +1,19 @@
import logging
import pandas as pd

from pyam.utils import make_index, s
Contributor: I'd suggest renaming the function s into something more descriptive.

Member (Author): Any good ideas? The point of that method is to have something short that can be used in a docstring...

Contributor: Maybe plural_s? Although I typed that before I'd seen the use in a docstring. There it's quite readable, so it's ok from my side to keep it this way if we don't have any good ideas.
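The body of `s` is not shown in this diff; as context for the thread above, here is a minimal sketch of what such a pluralization helper could look like (an assumption for illustration, not necessarily the actual `pyam.utils.s` implementation):

```python
def s(n):
    """Return a plural "s" if `n` is not 1, for concise log messages.

    Sketch only; the actual pyam.utils.s may differ.
    """
    return "s" if n != 1 else ""


# Usage mirroring the log message in _exclude_on_fail below:
for n in (1, 3):
    print(f"{n} scenario{s(n)} failed validation.")
# 1 scenario failed validation.
# 3 scenarios failed validation.
```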


logger = logging.getLogger(__name__)


def _exclude_on_fail(df, index):
"""Assign a selection of scenarios as `exclude: True`"""

if not isinstance(index, pd.MultiIndex):
index = make_index(index, cols=df.index.names)

df.exclude[index] = True
n = len(index)
logger.info(
f"{n} scenario{s(n)} failed validation and will be set as `exclude=True`."
)
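To illustrate what the relocated function does, here is a small self-contained sketch of the `exclude` update logic, using plain pandas stand-ins rather than an actual IamDataFrame:

```python
import pandas as pd

# Stand-in for IamDataFrame.exclude: a boolean Series indexed by
# (model, scenario), matching the meta index that pyam uses.
index = pd.MultiIndex.from_tuples(
    [("model_a", "scen_a"), ("model_a", "scen_b")],
    names=["model", "scenario"],
)
exclude = pd.Series([False, False], index=index, name="exclude")

# _exclude_on_fail flips the flag for the failing scenarios; the
# core operation is equivalent to a .loc assignment on that index.
failed = pd.MultiIndex.from_tuples(
    [("model_a", "scen_b")], names=["model", "scenario"]
)
exclude.loc[failed] = True

print(exclude)
# model    scenario
# model_a  scen_a      False
#          scen_b       True
# Name: exclude, dtype: bool
```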
1 change: 0 additions & 1 deletion tests/test_core.py
@@ -1,6 +1,5 @@
import logging
import pytest
import re
import datetime

import numpy as np
28 changes: 23 additions & 5 deletions tests/test_feature_validation.py
@@ -3,43 +3,61 @@
import pytest

from pyam import IamDataFrame, validate, categorize, require_variable
from pyam.utils import META_IDX
from pyam.utils import IAMC_IDX, META_IDX

from .conftest import TEST_YEARS


# add a row for `Primary Energy|Gas` to test all permutations
DATA_GAS = pd.DataFrame(
[
["model_a", "scen_a", "World", "Primary Energy|Gas", "EJ/yr", 0.5, 3],
["model_a", "scen_b", "World", "Primary Energy|Gas", "EJ/yr", 1, 2],
],
columns=IAMC_IDX + TEST_YEARS,
)


@pytest.mark.parametrize(
"kwargs",
(
dict(),
dict(variable="Primary Energy"),
dict(variable="Primary Energy", year=[2005, 2010]),
dict(variable=["Primary Energy"], year=[2005, 2010]),
dict(variable=["Primary Energy", "Primary Energy|Gas"], year=[2005, 2010]),
),
)
def test_require_data_pass(test_df_year, kwargs):
# check that IamDataFrame with all required data returns None
assert test_df_year.require_data(**kwargs) is None
df = test_df_year.append(IamDataFrame(DATA_GAS))
assert df.require_data(**kwargs) is None


@pytest.mark.parametrize(
"kwargs",
(
dict(variable="Primary Energy|Coal"),
dict(variable="Primary Energy", year=[2005, 2010]),
dict(variable=["Primary Energy"], year=[2005, 2010]),
dict(variable=["Primary Energy", "Primary Energy|Gas"], year=[2005, 2010]),
),
)
@pytest.mark.parametrize("exclude_on_fail", (False, True))
def test_require_data(test_df_year, kwargs, exclude_on_fail):
# check different ways of failing when not all required data is present

test_df_year._data = test_df_year._data[0:5] # remove value for scen_b & 2010
df = test_df_year.append(IamDataFrame(DATA_GAS))

obs = test_df_year.require_data(**kwargs, exclude_on_fail=exclude_on_fail)
obs = df.require_data(**kwargs, exclude_on_fail=exclude_on_fail)
exp = pd.DataFrame([["model_a", "scen_b"]], columns=["model", "scenario"])
pdt.assert_frame_equal(obs, exp)

if exclude_on_fail:
assert list(test_df_year.exclude) == [False, True]
assert list(df.exclude) == [False, True]
else:
assert list(test_df_year.exclude) == [False, False]
assert list(df.exclude) == [False, False]


def test_require_variable_pass(test_df):