diff --git a/docs/user_guide/config.rst b/docs/user_guide/config.rst index f9fd887b..be5a51d7 100644 --- a/docs/user_guide/config.rst +++ b/docs/user_guide/config.rst @@ -49,19 +49,56 @@ multiple external repositories can be used as the example below illustrates for url: https://github.com/IAMconsortium/common-definitions.git/ definitions: region: - repository: common-definitions + repository: + name: common-definitions variable: repositories: - - common-definitions - - legacy-definitions + - name: common-definitions + - name: legacy-definitions mappings: - repository: common-definitions + repository: + name: common-definitions -The value in *definitions.region.repository* needs to reference the repository in the -*repositories* section. +The value in *definitions.region.repository* can be a single entry or a list; each +entry must contain a ``name`` keyword referencing an entry in the *repositories* section. For model mappings the process is analogous using *mappings.repository*. +Filter code lists imported from external repositories +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Since importing the entirety of, for example, common-definitions is too much for most +projects, the list can be filtered using ``include`` and ``exclude`` keywords. Under +these keywords, lists of filters can be given that will be applied to the code list from +the given repository. + +The filtering can be done by any attribute: + +.. code:: yaml + + repositories: + common-definitions: + url: https://github.com/IAMconsortium/common-definitions.git/ + definitions: + variable: + repository: + name: common-definitions + include: + - name: [Primary Energy*, Final Energy*] + - name: "Population*" + tier: 1 + exclude: + - name: "Final Energy|*|*" + +In the example above we are including: + +1. All variables starting with *Primary Energy* or *Final Energy* +2. 
All variables starting with *Population* **and** with the tier attribute equal to 1 + +From this list we are then **excluding** all variables that match "Final Energy|*|*". +This means that the final resulting list will contain no Final Energy variables with +three or more levels. + Adding countries to the region codelist --------------------------------------- diff --git a/nomenclature/code.py b/nomenclature/code.py index 097b6443..8fe4d3a2 100644 --- a/nomenclature/code.py +++ b/nomenclature/code.py @@ -171,7 +171,7 @@ class VariableCode(Code): ) method: str | None = None check_aggregate: bool | None = Field(default=False, alias="check-aggregate") - components: Union[List[str], List[Dict[str, List[str]]]] | None = None + components: Union[List[str], Dict[str, list[str]]] | None = None drop_negative_weights: bool | None = None model_config = ConfigDict(populate_by_name=True) @@ -187,6 +187,18 @@ def deserialize_json(cls, v): def convert_none_to_empty_string(cls, v): return v if v is not None else "" + @field_validator("components", mode="before") + def cast_variable_components_args(cls, v): + """Cast a "components" list of single-key dicts to one dict; pass other values through""" + + # translate a non-empty list of single-key dictionaries to a simple dictionary + if isinstance(v, list) and v and isinstance(v[0], dict): + comp = {} + for val in v: + comp.update(val) + return comp + return v + @field_serializer("unit") def convert_str_to_none_for_writing(self, v): return v if v != "" else None diff --git a/nomenclature/codelist.py b/nomenclature/codelist.py index 366e489d..36e60387 100644 --- a/nomenclature/codelist.py +++ b/nomenclature/codelist.py @@ -209,13 +209,12 @@ def from_directory( for repo in getattr( config.definitions, name.lower(), CodeListConfig() ).repositories: - code_list.extend( - cls._parse_codelist_dir( - config.repositories[repo].local_path / "definitions" / name, - file_glob_pattern, - repo, - ) + repository_code_list = cls._parse_codelist_dir( + 
config.repositories[repo.name].local_path / "definitions" / name, + file_glob_pattern, + repo.name, ) + code_list.extend(repo.filter_list_of_codes(repository_code_list)) errors = ErrorCollector() mapping: Dict[str, Code] = {} for code in code_list: @@ -567,21 +566,6 @@ def check_weight_in_vars(cls, v): ) return v - @field_validator("mapping") - @classmethod - def cast_variable_components_args(cls, v): - """Cast "components" list of dicts to a codelist""" - - # translate a list of single-key dictionaries to a simple dictionary - for var in v.values(): - if var.components and isinstance(var.components[0], dict): - comp = {} - for val in var.components: - comp.update(val) - v[var.name].components = comp - - return v - def vars_default_args(self, variables: List[str]) -> List[VariableCode]: """return subset of variables which does not feature any special pyam aggregation arguments and where skip_region_aggregation is False""" @@ -706,21 +690,25 @@ def from_directory( # importing from an external repository for repo in config.definitions.region.repositories: - repo_path = config.repositories[repo].local_path / "definitions" / "region" + repo_path = ( + config.repositories[repo.name].local_path / "definitions" / "region" + ) - code_list = cls._parse_region_code_dir( - code_list, + repo_list_of_codes = cls._parse_region_code_dir( repo_path, file_glob_pattern, - repository=repo, + repository=repo.name, ) - code_list = cls._parse_and_replace_tags( - code_list, repo_path, file_glob_pattern + repo_list_of_codes = cls._parse_and_replace_tags( + repo_list_of_codes, repo_path, file_glob_pattern ) + code_list.extend(repo.filter_list_of_codes(repo_list_of_codes)) # parse from current repository - code_list = cls._parse_region_code_dir(code_list, path, file_glob_pattern) - code_list = cls._parse_and_replace_tags(code_list, path, file_glob_pattern) + local_code_list = cls._parse_region_code_dir(path, file_glob_pattern) + code_list.extend( + 
cls._parse_and_replace_tags(local_code_list, path, file_glob_pattern) + ) # translate to mapping mapping: Dict[str, RegionCode] = {} @@ -756,13 +744,12 @@ def hierarchy(self) -> List[str]: @classmethod def _parse_region_code_dir( cls, - code_list: List[Code], path: Path, file_glob_pattern: str = "**/*", repository: str | None = None, ) -> List[RegionCode]: """""" - + code_list: List[RegionCode] = [] for yaml_file in ( f for f in path.glob(file_glob_pattern) diff --git a/nomenclature/config.py b/nomenclature/config.py index 487101bd..8ce42196 100644 --- a/nomenclature/config.py +++ b/nomenclature/config.py @@ -1,6 +1,7 @@ from enum import Enum from pathlib import Path -from typing import Annotated, Optional +from typing import Any +from fnmatch import fnmatch import yaml from git import Repo @@ -11,29 +12,83 @@ field_validator, model_validator, ConfigDict, - BeforeValidator, ) +from nomenclature.code import Code + + +class RepositoryWithFilter(BaseModel): + name: str + include: list[dict[str, Any]] = [{"name": "*"}] + exclude: list[dict[str, Any]] = Field(default_factory=list) + + def filter_function(self, code: Code, filter: dict[str, Any], keep: bool): + # if filter value is list -> recursive, any element may match + # if filter value is str -> fnmatch + # if filter value is int -> match exactly + # if code value is None -> attribute does not exist, therefore does not match + def check_attribute_match(code_value, filter_value): + if code_value is None or filter_value is None: + return False + if isinstance(filter_value, int): + return code_value == filter_value + if isinstance(filter_value, str): + return fnmatch(code_value, filter_value) + if isinstance(filter_value, list): + return any( + check_attribute_match(code_value, value) for value in filter_value + ) + raise ValueError("Something went wrong with the filtering") + + filter_match = all( + check_attribute_match(getattr(code, attribute, None), value) + for attribute, value in filter.items() + ) + if keep: + return filter_match + else: + return not filter_match + + def filter_list_of_codes(self, 
list_of_codes: list[Code]) -> list[Code]: + # include first: keep codes that match at least one include filter + filter_result = [ + code + for code in list_of_codes + if any( + self.filter_function( + code, + filter, + keep=True, + ) + for filter in self.include + ) + ] + + if self.exclude: + filter_result = [ + code + for code in filter_result + if all( + self.filter_function(code, filter, keep=False) + for filter in self.exclude + ) + ] - -def convert_to_set(v: str | list[str] | set[str]) -> set[str]: - match v: - case set(v): - return v - case list(v): - return set(v) - case str(v): - return {v} - case _: - raise TypeError("`repositories` must be of type str, list or set.") + return filter_result class CodeListConfig(BaseModel): dimension: str | None = None - repositories: Annotated[set[str], BeforeValidator(convert_to_set)] = Field( - default_factory=set, alias="repository" + repositories: list[RepositoryWithFilter] = Field( + default_factory=list, alias="repository" ) model_config = ConfigDict(populate_by_name=True) + @field_validator("repositories", mode="before") + def convert_to_set_of_repos(cls, v): + if not isinstance(v, list): + return [v] + return v + @property def repository_dimension_path(self) -> str: return f"definitions/{self.dimension}" @@ -109,8 +164,8 @@ class DataStructureConfig(BaseModel): """ - region: Optional[RegionCodeListConfig] = Field(default_factory=RegionCodeListConfig) - variable: Optional[CodeListConfig] = Field(default_factory=CodeListConfig) + region: RegionCodeListConfig = Field(default_factory=RegionCodeListConfig) + variable: CodeListConfig = Field(default_factory=CodeListConfig) @field_validator("region", "variable", mode="before") @classmethod @@ -126,12 +181,22 @@ def repos(self) -> dict[str, str]: } +class MappingRepository(BaseModel): + name: str + + class RegionMappingConfig(BaseModel): - repositories: Annotated[set[str], BeforeValidator(convert_to_set)] = Field( - default_factory=set, alias="repository" + repositories: list[MappingRepository] = Field( + 
default_factory=list, alias="repository" ) model_config = ConfigDict(populate_by_name=True) + @field_validator("repositories", mode="before") + def convert_to_set_of_repos(cls, v): + if not isinstance(v, list): + return [v] + return v + class DimensionEnum(str, Enum): model = "model" @@ -157,8 +222,9 @@ def check_definitions_repository( mapping_repos = {"mappings": v.mappings.repositories} if v.mappings else {} repos = {**v.definitions.repos, **mapping_repos} for use, repositories in repos.items(): - if repositories - v.repositories.keys(): - raise ValueError((f"Unknown repository {repositories} in '{use}'.")) + repository_names = [repository.name for repository in repositories] + if unknown_repos := repository_names - v.repositories.keys(): + raise ValueError((f"Unknown repository {unknown_repos} in '{use}'.")) return v def fetch_repos(self, target_folder: Path): diff --git a/nomenclature/processor/region.py b/nomenclature/processor/region.py index a8888a22..2b3320b9 100644 --- a/nomenclature/processor/region.py +++ b/nomenclature/processor/region.py @@ -487,7 +487,7 @@ def from_directory(cls, path: DirectoryPath, dsd: DataStructureDefinition): mapping_files.extend( f for f in ( - dsd.config.repositories[repository].local_path / "mappings" + dsd.config.repositories[repository.name].local_path / "mappings" ).glob("**/*") if f.suffix in {".yaml", ".yml"} ) diff --git a/poetry.lock b/poetry.lock index ef9e3660..069e4c85 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand. 
[[package]] name = "alabaster" diff --git a/tests/data/config_filter/definitions/region/.gitkeep b/tests/data/config_filter/definitions/region/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/tests/data/config_filter/definitions/variable/.gitkeep b/tests/data/config_filter/definitions/variable/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/tests/data/config_filter/nomenclature.yaml b/tests/data/config_filter/nomenclature.yaml new file mode 100644 index 00000000..e990ed5a --- /dev/null +++ b/tests/data/config_filter/nomenclature.yaml @@ -0,0 +1,17 @@ +repositories: + common-definitions: + url: https://github.com/IAMconsortium/common-definitions.git/ + legacy-definitions: + url: https://github.com/IAMconsortium/legacy-definitions.git/ +definitions: + variable: + repository: + - name: common-definitions + filters: + - name: [Primary Energy*, Final Energy*] + - name: "Population*" + tier: 1 + - name: legacy-definitions + region: + repository: common-definitions + country: true diff --git a/tests/data/general-config-only/nomenclature.yaml b/tests/data/general-config-only/nomenclature.yaml index eb390c76..48acdb3f 100644 --- a/tests/data/general-config-only/nomenclature.yaml +++ b/tests/data/general-config-only/nomenclature.yaml @@ -3,6 +3,8 @@ repositories: url: https://github.com/IAMconsortium/common-definitions.git/ definitions: region: - repository: common-definitions + repository: + name: common-definitions variable: - repository: common-definitions + repository: + name: common-definitions diff --git a/tests/data/general-config/nomenclature.yaml b/tests/data/general-config/nomenclature.yaml index fe35a156..14b70014 100644 --- a/tests/data/general-config/nomenclature.yaml +++ b/tests/data/general-config/nomenclature.yaml @@ -4,6 +4,8 @@ repositories: url: https://github.com/IAMconsortium/common-definitions.git/ definitions: region: - repository: common-definitions + repository: + name: common-definitions variable: - repository: 
common-definitions + repository: + name: common-definitions diff --git a/tests/data/nomenclature_configs/external_repo_filters.yaml b/tests/data/nomenclature_configs/external_repo_filters.yaml new file mode 100644 index 00000000..bb77dd7b --- /dev/null +++ b/tests/data/nomenclature_configs/external_repo_filters.yaml @@ -0,0 +1,20 @@ +repositories: + common-definitions: + url: https://github.com/IAMconsortium/common-definitions.git/ +definitions: + variable: + repository: + name: common-definitions + include: + - name: [Primary Energy*, Final Energy*] + - name: "Population*" + tier: 1 + exclude: + - name: "Final Energy|*|*" + region: + repository: + name: common-definitions + include: + - hierarchy: R5 + exclude: + - name: Other (R5) diff --git a/tests/data/nomenclature_configs/hash_and_release.yaml b/tests/data/nomenclature_configs/hash_and_release.yaml index 502d18e4..f7e2a2ad 100644 --- a/tests/data/nomenclature_configs/hash_and_release.yaml +++ b/tests/data/nomenclature_configs/hash_and_release.yaml @@ -5,7 +5,9 @@ repositories: release: "1.0" definitions: region: - repository: common-definitions + repository: + name: common-definitions country: true variable: - repository: common-definitions + repository: + name: common-definitions diff --git a/tests/data/nomenclature_configs/multiple_external_repos_filters.yaml b/tests/data/nomenclature_configs/multiple_external_repos_filters.yaml new file mode 100644 index 00000000..343541f3 --- /dev/null +++ b/tests/data/nomenclature_configs/multiple_external_repos_filters.yaml @@ -0,0 +1,16 @@ +repositories: + common-definitions: + url: https://github.com/IAMconsortium/common-definitions.git/ + legacy-definitions: + url: https://github.com/IAMconsortium/legacy-definitions.git/ +definitions: + variable: + repository: + - name: common-definitions + include: + - name: [Primary Energy*, Final Energy*] + - name: "Population*" + tier: 1 + exclude: + - name: "Primary Energy|*|*" # exclude all third tier variables + - name: 
legacy-definitions diff --git a/tests/data/nomenclature_configs/multiple_repos_for_mapping.yaml b/tests/data/nomenclature_configs/multiple_repos_for_mapping.yaml index ff2bd784..c09c81a1 100644 --- a/tests/data/nomenclature_configs/multiple_repos_for_mapping.yaml +++ b/tests/data/nomenclature_configs/multiple_repos_for_mapping.yaml @@ -5,5 +5,5 @@ repositories: url: https://github.com/IAMconsortium/legacy-definitions.git/ mappings: repositories: - - common-definitions - - legacy-definitions + - name: common-definitions + - name: legacy-definitions diff --git a/tests/data/nomenclature_configs/multiple_repos_per_dimension.yaml b/tests/data/nomenclature_configs/multiple_repos_per_dimension.yaml index 2fa400de..0d085dfb 100644 --- a/tests/data/nomenclature_configs/multiple_repos_per_dimension.yaml +++ b/tests/data/nomenclature_configs/multiple_repos_per_dimension.yaml @@ -8,5 +8,5 @@ repositories: definitions: variable: repository: - - common-definitions - - legacy-definitions + - name: common-definitions + - name: legacy-definitions diff --git a/tests/data/nomenclature_configs/regions/.gitkeep b/tests/data/nomenclature_configs/regions/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/tests/data/nomenclature_configs/unknown_repo.yaml b/tests/data/nomenclature_configs/unknown_repo.yaml index 08187c7a..57ad005d 100644 --- a/tests/data/nomenclature_configs/unknown_repo.yaml +++ b/tests/data/nomenclature_configs/unknown_repo.yaml @@ -1,3 +1,4 @@ definitions: region: - repository: common-definitions + repository: + name: common-definitions diff --git a/tests/data/region_processing/external_repo_test/nomenclature.yaml b/tests/data/region_processing/external_repo_test/nomenclature.yaml index 6c6b2c53..0c90e76d 100644 --- a/tests/data/region_processing/external_repo_test/nomenclature.yaml +++ b/tests/data/region_processing/external_repo_test/nomenclature.yaml @@ -3,8 +3,11 @@ repositories: url: https://github.com/IAMconsortium/common-definitions.git/ 
definitions: region: - repository: common-definitions + repository: + name: common-definitions variable: - repository: common-definitions + repository: + name: common-definitions mappings: - repository: common-definitions + repository: + name: common-definitions diff --git a/tests/test_codelist.py b/tests/test_codelist.py index 2d292fd7..e2416b01 100644 --- a/tests/test_codelist.py +++ b/tests/test_codelist.py @@ -442,3 +442,56 @@ def test_variablecodelist_list_missing_variables_to_new_file(simple_df, tmp_path } assert obs.mapping == exp + + +def test_variable_code_list_external_repo_with_filters(): + nomenclature_config = NomenclatureConfig.from_file( + TEST_DATA_DIR / "nomenclature_configs" / "external_repo_filters.yaml" + ) + try: + variable_code_list = VariableCodeList.from_directory( + "variable", + TEST_DATA_DIR / "nomenclature_configs" / "variable", + nomenclature_config, + ) + exp_included_variables = [ + "Final Energy", + "Population", + "Primary Energy|Oil|Hydrogen|w/ CCS", + ] + exp_excluded_variables = [ + "Final Energy|Agriculture|Electricity", # no third level Final Energy + "Population|Clean Cooking Access", # only tier 1 Population + ] + assert all( + variable in variable_code_list for variable in exp_included_variables + ) + assert all( + variable not in variable_code_list for variable in exp_excluded_variables + ) + finally: + clean_up_external_repos(nomenclature_config.repositories) + + +def test_region_code_list_external_repo_with_filters(): + nomenclature_config = NomenclatureConfig.from_file( + TEST_DATA_DIR / "nomenclature_configs" / "external_repo_filters.yaml" + ) + try: + region_code_list = RegionCodeList.from_directory( + "region", + TEST_DATA_DIR / "nomenclature_configs" / "variable", + nomenclature_config, + ) + R5_regions = [ + "OECD & EU (R5)", + "Reforming Economies (R5)", + "Asia (R5)", + "Middle East & Africa (R5)", + "Latin America (R5)", + ] + assert len(region_code_list) == 5 + assert all(r5_region in region_code_list for 
r5_region in R5_regions) + assert "Other (R5)" not in region_code_list + finally: + clean_up_external_repos(nomenclature_config.repositories) diff --git a/tests/test_config.py b/tests/test_config.py index 7670424e..ebc836a9 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -1,7 +1,11 @@ from pathlib import Path +import pytest from pytest import raises -from nomenclature.config import Repository, NomenclatureConfig, CodeListConfig +from nomenclature.config import ( + Repository, + NomenclatureConfig, +) from conftest import TEST_DATA_DIR, clean_up_external_repos @@ -34,24 +38,16 @@ def test_multiple_definition_repos(): try: exp_repos = {"common-definitions", "legacy-definitions"} assert nomenclature_config.repositories.keys() == exp_repos - assert nomenclature_config.definitions.variable.repositories == exp_repos finally: clean_up_external_repos(nomenclature_config.repositories) -def test_codelist_config_set_input(): - exp_repos = {"repo1", "repo2"} - code_list_config = CodeListConfig(dimension="variable", repositories=exp_repos) - assert code_list_config.repositories == exp_repos - - def test_multiple_mapping_repos(): nomenclature_config = NomenclatureConfig.from_file( TEST_DATA_DIR / "nomenclature_configs" / "multiple_repos_for_mapping.yaml" ) try: exp_repos = {"common-definitions", "legacy-definitions"} - assert nomenclature_config.mappings.repositories == exp_repos assert nomenclature_config.repositories.keys() == exp_repos finally: clean_up_external_repos(nomenclature_config.repositories) @@ -89,3 +85,17 @@ def test_invalid_config_dimensions_raises(): ), ): NomenclatureConfig(dimensions=["year"]) + + +@pytest.mark.parametrize( + "config_file", + ["external_repo_filters.yaml", "multiple_external_repos_filters.yaml"], +) +def test_config_with_filter(config_file): + config = NomenclatureConfig.from_file( + TEST_DATA_DIR / "nomenclature_configs" / config_file + ) + try: + assert isinstance(config.definitions.variable.repositories, list) + finally: + 
clean_up_external_repos(config.repositories)