From 3c0943d932cd4964872926fa70e7e662b430888c Mon Sep 17 00:00:00 2001
From: Daniel Huppmann
Date: Mon, 3 Mar 2025 15:12:06 +0100
Subject: [PATCH] Add a range-validation feature (#477)

---
 nomenclature/processor/aggregator.py          |   1 +
 nomenclature/processor/data_validator.py      | 151 ++++++++++++------
 ...d_rtol.yaml => error_bounds_and_rtol.yaml} |   0
 ...value.yaml => error_bounds_and_value.yaml} |   0
 ...teria.yaml => error_missing_criteria.yaml} |   0
 ..._region.yaml => error_unknown_region.yaml} |   0
 ...iable.yaml => error_unknown_variable.yaml} |   0
 ..._asc.yaml => error_warning_level_asc.yaml} |   7 +-
 .../validate_data/validate_warning_range.yaml |   4 +
 ...unds.yaml => validation_fails_bounds.yaml} |   0
 ...value.yaml => validation_fails_value.yaml} |   0
 ...e_validation.yaml => validation_pass.yaml} |   0
 tests/test_validate_data.py                   |  68 ++++----
 13 files changed, 143 insertions(+), 88 deletions(-)
 rename tests/data/validation/validate_data/{validate_bounds_and_rtol.yaml => error_bounds_and_rtol.yaml} (100%)
 rename tests/data/validation/validate_data/{validate_bounds_and_value.yaml => error_bounds_and_value.yaml} (100%)
 rename tests/data/validation/validate_data/{validate_missing_criteria.yaml => error_missing_criteria.yaml} (100%)
 rename tests/data/validation/validate_data/{validate_unknown_region.yaml => error_unknown_region.yaml} (100%)
 rename tests/data/validation/validate_data/{validate_unknown_variable.yaml => error_unknown_variable.yaml} (100%)
 rename tests/data/validation/validate_data/{validate_warning_joined_asc.yaml => error_warning_level_asc.yaml} (50%)
 create mode 100644 tests/data/validation/validate_data/validate_warning_range.yaml
 rename tests/data/validation/validate_data/{validate_data_fails_bounds.yaml => validation_fails_bounds.yaml} (100%)
 rename tests/data/validation/validate_data/{validate_data_fails_value.yaml => validation_fails_value.yaml} (100%)
 rename tests/data/validation/validate_data/{simple_validation.yaml => validation_pass.yaml} (100%)

diff --git a/nomenclature/processor/aggregator.py b/nomenclature/processor/aggregator.py
index 8e7946c4..52fd946e 100644
--- a/nomenclature/processor/aggregator.py
+++ b/nomenclature/processor/aggregator.py
@@ -27,6 +27,7 @@ class AggregationItem(BaseModel):
 
 class Aggregator(Processor):
     """Aggregation or renaming of an IamDataFrame on a `dimension`"""
+
     file: FilePath
     dimension: str
     aggregate: list[AggregationItem]
diff --git a/nomenclature/processor/data_validator.py b/nomenclature/processor/data_validator.py
index 60507c9d..08a5e7c6 100644
--- a/nomenclature/processor/data_validator.py
+++ b/nomenclature/processor/data_validator.py
@@ -4,10 +4,17 @@ from pathlib import Path
 
 import yaml
-from pandas import concat
+import pandas as pd
 from pyam import IamDataFrame
 from pyam.logging import adjust_log_level
-from pydantic import computed_field, field_validator, model_validator
+from pydantic import (
+    BaseModel,
+    ConfigDict,
+    computed_field,
+    field_validator,
+    model_validator,
+    Field,
+)
 
 from nomenclature.definition import DataStructureDefinition
 from nomenclature.error import ErrorCollector
 
@@ -25,11 +32,20 @@ class WarningEnum(str, Enum):
     low = "low"
 
 
-class DataValidationCriteria(IamcDataFilter):
+class DataValidationCriteria(BaseModel):
+    model_config = ConfigDict(extra="forbid")
+
     warning_level: WarningEnum = WarningEnum.error
 
+    @property
+    def criteria(self):
+        pass
+
+    def __str__(self):
+        return ", ".join([f"{key}: {value}" for key, value in self.criteria.items()])
+
 
-class DataValidationCriteriaValue(DataValidationCriteria):
+class DataValidationValue(DataValidationCriteria):
     value: float
     rtol: float = 0.0
     atol: float = 0.0
@@ -65,7 +81,10 @@ def criteria(self):
         )
 
 
-class DataValidationCriteriaBounds(DataValidationCriteria):
+class DataValidationBounds(DataValidationCriteria):
+    # allow extra but raise error to guard against multiple criteria
+    model_config = ConfigDict(extra="allow")
+
     upper_bound: float | None = None
     lower_bound: float | None = None
 
@@ -75,6 +94,14 @@ def check_validation_criteria_exist(self):
             raise ValueError("No validation criteria provided: " + str(self.criteria))
         return self
 
+    @model_validator(mode="after")
+    def check_validation_multiple_criteria(self):
+        if self.model_extra:
+            raise ValueError(
+                "Must use either bounds, range or value, found: " + str(self.criteria)
+            )
+        return self
+
     @property
     def validation_args(self):
         return self.criteria
@@ -86,67 +113,96 @@ def criteria(self):
         )
 
 
-class DataValidationCriteriaMultiple(IamcDataFilter):
-    validation: (
-        list[DataValidationCriteriaValue | DataValidationCriteriaBounds] | None
-    ) = None
+class DataValidationRange(DataValidationCriteria):
+    range: list[float] = Field(..., min_length=2, max_length=2)
+
+    @field_validator("range", mode="after")
+    def check_range_is_valid(cls, value: list[float]):
+        if value[0] > value[1]:
+            raise ValueError(
+                "Validation 'range' must be given as `(lower_bound, upper_bound)`, "
+                "found: " + str(value)
+            )
+        return value
+
+    @computed_field
+    def upper_bound(self) -> float:
+        return self.range[1]
+
+    @computed_field
+    def lower_bound(self) -> float:
+        return self.range[0]
+
+    @property
+    def validation_args(self):
+        """Attributes used for validation (as bounds)."""
+        return self.model_dump(
+            exclude_none=True,
+            exclude_unset=True,
+            exclude=["warning_level", "range"],
+        )
+
+    @property
+    def criteria(self):
+        return self.model_dump(
+            exclude_none=True,
+            exclude_unset=True,
+            exclude=["warning_level", "lower_bound", "upper_bound"],
+        )
+
+
+class DataValidationItem(IamcDataFilter):
+    validation: list[DataValidationValue | DataValidationRange | DataValidationBounds]
 
     @model_validator(mode="after")
     def check_warnings_order(self):
         """Check if warnings are set in descending order of severity."""
         if self.validation != sorted(self.validation, key=lambda c: c.warning_level):
             raise ValueError(
-                f"Validation criteria for {self.criteria} not"
+                f"Validation criteria for {self.criteria} not sorted"
                 " in descending order of severity."
             )
         else:
             return self
 
     @property
-    def criteria(self):
+    def filter_args(self):
         """Attributes used for validation (as specified in the file)."""
         return self.model_dump(
             exclude_none=True, exclude_unset=True, exclude=["validation"]
         )
 
+    def __str__(self):
+        return ", ".join([f"{key}: {value}" for key, value in self.filter_args.items()])
+
 
 class DataValidator(Processor):
     """Processor for validating IAMC datapoints"""
 
-    criteria_items: list[DataValidationCriteriaMultiple]
+    criteria_items: list[DataValidationItem]
     file: Path
 
-    @field_validator("criteria_items", mode="before")
-    def check_criteria(cls, v):
-        for item in v:
-            for criterion in item["validation"]:
-                has_bounds = any(c in criterion for c in ["upper_bound", "lower_bound"])
-                has_values = any(c in criterion for c in ["value", "atol", "rtol"])
-                if has_bounds and has_values:
-                    raise ValueError(
-                        f"Cannot use bounds and value-criteria simultaneously: {criterion}"
-                    )
-        return v
-
     @classmethod
     def from_file(cls, file: Path | str) -> "DataValidator":
         with open(file, "r", encoding="utf-8") as f:
             content = yaml.safe_load(f)
         criteria_items = []
         for item in content:
-            filter_args = {k: item[k] for k in item if k in IamcDataFilter.model_fields}
-            criteria_args = {
-                k: item[k]
-                for k in item
-                if k not in IamcDataFilter.model_fields and k != "validation"
-            }
-            if "validation" in item:
-                for criterion in item["validation"]:
-                    criterion.update(filter_args)
-            else:
-                item["validation"] = [{**filter_args, **criteria_args}]
-            criteria_items.append({k: item[k] for k in item if k not in criteria_args})
-        return cls(file=file, criteria_items=criteria_items)
+            # handling of simple case where filter and criteria args are given at the same level
+            if "validation" not in item:
+                filter_args = {
+                    k: item[k] for k in item if k in IamcDataFilter.model_fields
+                }
+                criteria_args = [
+                    {
+                        k: item[k]
+                        for k in item
+                        if k not in IamcDataFilter.model_fields and k != "validation"
+                    }
+                ]
+                item = dict(**filter_args, validation=criteria_args)
+            criteria_items.append(item)
+        return cls(file=file, criteria_items=criteria_items)  # type: ignore
 
     def apply(self, df: IamDataFrame) -> IamDataFrame:
         """Validates data in IAMC format according to specified criteria.
@@ -172,33 +228,30 @@ def apply(self, df: IamDataFrame) -> IamDataFrame:
 
         with adjust_log_level():
             for item in self.criteria_items:
-                per_item_df = df
+                per_item_df = df.filter(**item.filter_args)
                 for criterion in item.validation:
                     failed_validation = per_item_df.validate(
                         **criterion.validation_args
                     )
                     if failed_validation is not None:
                         per_item_df = IamDataFrame(
-                            concat([df.data, failed_validation]).drop_duplicates(
-                                keep=False
-                            )
-                        )
-                        criteria_msg = "  Criteria: " + ", ".join(
-                            [
-                                f"{key}: {value}"
-                                for key, value in criterion.criteria.items()
-                            ]
+                            pd.concat(
+                                [per_item_df.data, failed_validation]
+                            ).drop_duplicates(keep=False)
                         )
                         failed_validation["warning_level"] = (
                             criterion.warning_level.value
                         )
                         if criterion.warning_level == WarningEnum.error:
                             error = True
-                        fail_list.append(criteria_msg)
                         fail_list.append(
-                            textwrap.indent(str(failed_validation), prefix="  ")
+                            "  Criteria: " + str(item) + ", " + str(criterion)
+                        )
+                        fail_list.append(
+                            textwrap.indent(failed_validation.to_string(), prefix="    ")
+                            + "\n"
                         )
+
             fail_msg = "(file %s):\n" % get_relative_path(self.file)
             if error:
                 fail_msg = (
diff --git a/tests/data/validation/validate_data/validate_bounds_and_rtol.yaml b/tests/data/validation/validate_data/error_bounds_and_rtol.yaml
similarity index 100%
rename from tests/data/validation/validate_data/validate_bounds_and_rtol.yaml
rename to tests/data/validation/validate_data/error_bounds_and_rtol.yaml
diff --git a/tests/data/validation/validate_data/validate_bounds_and_value.yaml b/tests/data/validation/validate_data/error_bounds_and_value.yaml
similarity index 100%
rename from tests/data/validation/validate_data/validate_bounds_and_value.yaml
rename to tests/data/validation/validate_data/error_bounds_and_value.yaml
diff --git a/tests/data/validation/validate_data/validate_missing_criteria.yaml b/tests/data/validation/validate_data/error_missing_criteria.yaml
similarity index 100%
rename from tests/data/validation/validate_data/validate_missing_criteria.yaml
rename to tests/data/validation/validate_data/error_missing_criteria.yaml
diff --git a/tests/data/validation/validate_data/validate_unknown_region.yaml b/tests/data/validation/validate_data/error_unknown_region.yaml
similarity index 100%
rename from tests/data/validation/validate_data/validate_unknown_region.yaml
rename to tests/data/validation/validate_data/error_unknown_region.yaml
diff --git a/tests/data/validation/validate_data/validate_unknown_variable.yaml b/tests/data/validation/validate_data/error_unknown_variable.yaml
similarity index 100%
rename from tests/data/validation/validate_data/validate_unknown_variable.yaml
rename to tests/data/validation/validate_data/error_unknown_variable.yaml
diff --git a/tests/data/validation/validate_data/validate_warning_joined_asc.yaml b/tests/data/validation/validate_data/error_warning_level_asc.yaml
similarity index 50%
rename from tests/data/validation/validate_data/validate_warning_joined_asc.yaml
rename to tests/data/validation/validate_data/error_warning_level_asc.yaml
index 21e484ab..bae74cf2 100644
--- a/tests/data/validation/validate_data/validate_warning_joined_asc.yaml
+++ b/tests/data/validation/validate_data/error_warning_level_asc.yaml
@@ -3,10 +3,5 @@
   validation:
     - warning_level: low
       upper_bound: 2.5
-      lower_bound: 1
     - upper_bound: 5
-      lower_bound: 1
-- variable: Primary Energy|Coal
-  year: 2010
-  upper_bound: 5
-  lower_bound: 1
+      # default warning_level: error
diff --git a/tests/data/validation/validate_data/validate_warning_range.yaml b/tests/data/validation/validate_data/validate_warning_range.yaml
new file mode 100644
index 00000000..1f4a12c9
--- /dev/null
+++ b/tests/data/validation/validate_data/validate_warning_range.yaml
@@ -0,0 +1,4 @@
+- variable: Primary Energy
+  year: 2010
+  validation:
+    - range: [ 1, 5 ]
diff --git a/tests/data/validation/validate_data/validate_data_fails_bounds.yaml b/tests/data/validation/validate_data/validation_fails_bounds.yaml
similarity index 100%
rename from tests/data/validation/validate_data/validate_data_fails_bounds.yaml
rename to tests/data/validation/validate_data/validation_fails_bounds.yaml
diff --git a/tests/data/validation/validate_data/validate_data_fails_value.yaml b/tests/data/validation/validate_data/validation_fails_value.yaml
similarity index 100%
rename from tests/data/validation/validate_data/validate_data_fails_value.yaml
rename to tests/data/validation/validate_data/validation_fails_value.yaml
diff --git a/tests/data/validation/validate_data/simple_validation.yaml b/tests/data/validation/validate_data/validation_pass.yaml
similarity index 100%
rename from tests/data/validation/validate_data/simple_validation.yaml
rename to tests/data/validation/validate_data/validation_pass.yaml
diff --git a/tests/test_validate_data.py b/tests/test_validate_data.py
index 01a41682..cdeb12b2 100644
--- a/tests/test_validate_data.py
+++ b/tests/test_validate_data.py
@@ -18,18 +18,16 @@ def test_DataValidator_from_file():
                 "year": [2010],
                 "validation": [
                     {
-                        "variable": "Final Energy",
-                        "year": [2010],
                         "upper_bound": 2.5,
                         "lower_bound": 1.0,  # test that integer in yaml is cast to float
                     }
                 ],
             }
         ],
-        "file": DATA_VALIDATION_TEST_DIR / "simple_validation.yaml",
+        "file": DATA_VALIDATION_TEST_DIR / "validation_pass.yaml",
        }
    )
-    obs = DataValidator.from_file(DATA_VALIDATION_TEST_DIR / "simple_validation.yaml")
+    obs = DataValidator.from_file(DATA_VALIDATION_TEST_DIR / "validation_pass.yaml")
     assert obs == exp
 
     dsd = DataStructureDefinition(TEST_DATA_DIR / "validation" / "definitions")
@@ -40,13 +38,13 @@
     "name, match",
     [
         ("missing_criteria", "No validation criteria provided:"),
-        ("bounds_and_value", "Cannot use bounds and value-criteria simultaneously:"),
-        ("bounds_and_rtol", "Cannot use bounds and value-criteria simultaneously:"),
+        ("bounds_and_value", "Must use either bounds, range or value, found:"),
+        ("bounds_and_rtol", "Must use either bounds, range or value, found:"),
     ],
 )
 def test_DataValidator_illegal_structure(name, match):
     with pytest.raises(ValueError, match=match):
-        DataValidator.from_file(DATA_VALIDATION_TEST_DIR / f"validate_{name}.yaml")
+        DataValidator.from_file(DATA_VALIDATION_TEST_DIR / f"error_{name}.yaml")
 
 
 @pytest.mark.parametrize(
@@ -63,7 +61,7 @@ def test_DataValidator_validate_with_definition_raises(dimension, match):
 
     # TODO Undefined unit
     data_validator = DataValidator.from_file(
-        DATA_VALIDATION_TEST_DIR / f"validate_unknown_{dimension}.yaml"
+        DATA_VALIDATION_TEST_DIR / f"error_unknown_{dimension}.yaml"
     )
 
     # validating against a DataStructure with all dimensions raises
@@ -81,7 +79,7 @@
 def test_DataValidator_apply_no_matching_data(simple_df):
     data_validator = DataValidator.from_file(
-        DATA_VALIDATION_TEST_DIR / "simple_validation.yaml"
+        DATA_VALIDATION_TEST_DIR / "validation_pass.yaml"
     )
     # no data matches validation criteria, `apply()` passes and returns unchanged object
     assert data_validator.apply(simple_df) == simple_df
 
@@ -105,27 +103,25 @@ def test_DataValidator_apply_no_matching_data(simple_df):
     ],
 )
 def test_DataValidator_apply_fails(simple_df, file, item_1, item_2, item_3, caplog):
-    data_file = DATA_VALIDATION_TEST_DIR / f"validate_data_fails_{file}.yaml"
+    data_file = DATA_VALIDATION_TEST_DIR / f"validation_fails_{file}.yaml"
     data_validator = DataValidator.from_file(data_file)
 
     failed_validation_message = (
         "Data validation with error(s)/warning(s) "
         f"""(file {data_file.relative_to(Path.cwd())}):
  Criteria: variable: ['Primary Energy'], {item_1}
-      model scenario region        variable   unit  year  value warning_level
-  0  model_a   scen_a  World  Primary Energy  EJ/yr  2010    6.0         error
-  1  model_a   scen_b  World  Primary Energy  EJ/yr  2010    7.0         error
+        model scenario region        variable   unit  year  value warning_level
+    0  model_a   scen_a  World  Primary Energy  EJ/yr  2010    6.0         error
+    1  model_a   scen_b  World  Primary Energy  EJ/yr  2010    7.0         error
 
  Criteria: variable: ['Primary Energy|Coal'], {item_2}
-      model scenario region  ...  year  value warning_level
-  0  model_a   scen_a  World  ...  2005    0.5         error
-
-  [1 rows x 8 columns]
+        model scenario region             variable   unit  year  value warning_level
+    0  model_a   scen_a  World  Primary Energy|Coal  EJ/yr  2005    0.5         error
 
  Criteria: variable: ['Primary Energy'], year: [2005], {item_3}
-      model scenario region        variable   unit  year  value warning_level
-  0  model_a   scen_a  World  Primary Energy  EJ/yr  2005    1.0         error
-  1  model_a   scen_b  World  Primary Energy  EJ/yr  2005    2.0         error"""
+        model scenario region        variable   unit  year  value warning_level
+    0  model_a   scen_a  World  Primary Energy  EJ/yr  2005    1.0         error
+    1  model_a   scen_b  World  Primary Energy  EJ/yr  2005    2.0         error"""
     )
 
     with pytest.raises(ValueError, match="Data validation failed"):
@@ -137,7 +133,7 @@
 
 @pytest.mark.parametrize(
     "file, value",
-    [("joined", 6.0), ("joined", 3.0), ("legacy", 6.0)],
+    [("joined", 6.0), ("joined", 3.0), ("legacy", 6.0), ("range", 6.0)],
 )
 def test_DataValidator_validate_with_warning(file, value, simple_df, caplog):
     """Checks that failed validation rows are printed in log."""
@@ -150,30 +146,36 @@ def test_DataValidator_validate_with_warning(file, value, simple_df, caplog):
         "Data validation with error(s)/warning(s) "
         f"""(file {(DATA_VALIDATION_TEST_DIR / f"validate_warning_{file}.yaml").relative_to(Path.cwd())}):
  Criteria: variable: ['Primary Energy'], year: [2010], upper_bound: 5.0, lower_bound: 1.0
-      model scenario region        variable   unit  year  value warning_level
-  0  model_a   scen_a  World  Primary Energy  EJ/yr  2010    6.0         error
-  1  model_a   scen_b  World  Primary Energy  EJ/yr  2010    7.0         error"""
+        model scenario region        variable   unit  year  value warning_level
+    0  model_a   scen_a  World  Primary Energy  EJ/yr  2010    6.0         error
+    1  model_a   scen_b  World  Primary Energy  EJ/yr  2010    7.0         error"""
     )
+
     if file == "legacy":
         # prints both error and low warning levels for legacy format
        # because these are treated as independent validation-criteria
         failed_validation_message += """
  Criteria: variable: ['Primary Energy'], year: [2010], upper_bound: 2.5, lower_bound: 1.0
-      model scenario region        variable   unit  year  value warning_level
-  0  model_a   scen_a  World  Primary Energy  EJ/yr  2010    6.0           low
-  1  model_a   scen_b  World  Primary Energy  EJ/yr  2010    7.0           low"""
+        model scenario region        variable   unit  year  value warning_level
+    0  model_a   scen_a  World  Primary Energy  EJ/yr  2010    6.0           low
+    1  model_a   scen_b  World  Primary Energy  EJ/yr  2010    7.0           low"""
+
+    if file == "range":
+        failed_validation_message = failed_validation_message.replace(
+            "upper_bound: 5.0, lower_bound: 1.0", "range: [1.0, 5.0]"
+        )
 
     if value == 3.0:
         # prints each warning level when each is triggered by different rows
         failed_validation_message = """
  Criteria: variable: ['Primary Energy'], year: [2010], upper_bound: 5.0, lower_bound: 1.0
-      model scenario region        variable   unit  year  value warning_level
-  0  model_a   scen_b  World  Primary Energy  EJ/yr  2010    7.0         error
+        model scenario region        variable   unit  year  value warning_level
+    0  model_a   scen_b  World  Primary Energy  EJ/yr  2010    7.0         error
 
  Criteria: variable: ['Primary Energy'], year: [2010], upper_bound: 2.5, lower_bound: 1.0
-      model scenario region        variable   unit  year  value warning_level
-  0  model_a   scen_a  World  Primary Energy  EJ/yr  2010    3.0           low"""
+        model scenario region        variable   unit  year  value warning_level
+    0  model_a   scen_a  World  Primary Energy  EJ/yr  2010    3.0           low"""
 
     with pytest.raises(ValueError, match="Data validation failed"):
         data_validator.apply(simple_df)
@@ -182,8 +184,8 @@
 
 def test_DataValidator_warning_order_fail():
     """Raises validation error if warnings for same criteria not in descending order."""
-    match = "Validation criteria for .* not in descending order of severity."
+    match = "Validation criteria for .* not sorted in descending order of severity."
     with pytest.raises(ValueError, match=match):
         DataValidator.from_file(
-            DATA_VALIDATION_TEST_DIR / "validate_warning_joined_asc.yaml"
+            DATA_VALIDATION_TEST_DIR / "error_warning_level_asc.yaml"
         )