From f33904d8a0fb094ddd0b4f2c39e966d5478944be Mon Sep 17 00:00:00 2001 From: Stefaan Lippens Date: Tue, 24 Oct 2023 18:43:35 +0200 Subject: [PATCH] Issue #129 add collection whitelist config --- CHANGELOG.md | 13 +++++------ src/openeo_aggregator/about.py | 2 +- src/openeo_aggregator/backend.py | 10 ++++++++- src/openeo_aggregator/config.py | 3 +++ tests/conftest.py | 11 ++++++++- tests/test_views.py | 38 ++++++++++++++++++++++++++++++++ 6 files changed, 67 insertions(+), 10 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5deecb80..0a3259df 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,20 +4,19 @@ All notable changes to this project will be documented in this file. The format is roughly based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). -## [Current: 0.11.x] +## [Current: 0.12.x] -### Added +- Add (optional) config for collection id whitelisting. + Keep union of all "upstream" collections as default. + ([#129](https://github.com/Open-EO/openeo-aggregator/issues/129)) -### Changed + +## [0.11.x] - Dockerfile: switch to `python:3.9-slim-bullseye` base image - Parallelize `/jobs` requests to upstream back-ends ([#28](https://github.com/Open-EO/openeo-aggregator/issues/28)) - Increase timeout on getting batch job logs to 120s ([#128](https://github.com/Open-EO/openeo-aggregator/issues/128)) -### Fixed - -### Removed - ## [0.10.x] diff --git a/src/openeo_aggregator/about.py b/src/openeo_aggregator/about.py index f63d43d8..af952b5c 100644 --- a/src/openeo_aggregator/about.py +++ b/src/openeo_aggregator/about.py @@ -1,7 +1,7 @@ import logging import sys -__version__ = "0.11.1a1" +__version__ = "0.12.0a1" def log_version_info(): diff --git a/src/openeo_aggregator/backend.py b/src/openeo_aggregator/backend.py index 7b02a50d..7bde3177 100644 --- a/src/openeo_aggregator/backend.py +++ b/src/openeo_aggregator/backend.py @@ -144,6 +144,7 @@ def __init__(self, backends: MultiBackendConnection, config: AggregatorConfig): self.backends = backends self._memoizer = memoizer_from_config(config=config, namespace="CollectionCatalog") self.backends.on_connections_change.add(self._memoizer.invalidate) + self._collection_whitelist: Optional[List[str]] = config.collection_whitelist def get_all_metadata(self) -> List[dict]: metadata, internal = self._get_all_metadata_cached() @@ -172,7 +173,14 @@ def _get_all_metadata(self) -> Tuple[List[dict], _InternalCollectionMetadata]: continue for collection_metadata in backend_collections: if "id" in collection_metadata: - grouped[collection_metadata["id"]][con.id] = collection_metadata + collection_id = collection_metadata["id"] + if self._collection_whitelist: + if collection_id not in self._collection_whitelist: + _log.debug(f"Skipping non-whitelisted {collection_id=} from {con.id=}") + continue + else: + _log.debug(f"Preserving whitelisted {collection_id=} from {con.id=}") + grouped[collection_id][con.id] = collection_metadata # TODO: support a trigger to create a collection alias under other name? else: # TODO: there must be something seriously wrong with this backend: skip all its results? diff --git a/src/openeo_aggregator/config.py b/src/openeo_aggregator/config.py index 2a5e5ea9..04c83c90 100644 --- a/src/openeo_aggregator/config.py +++ b/src/openeo_aggregator/config.py @@ -54,6 +54,9 @@ class AggregatorConfig(dict): # TTL for connection caching. connections_cache_ttl = dict_item(default=5 * 60.0) + # List of collection ids to cover with the aggregator (when None: support union of all upstream collections) + collection_whitelist = dict_item(default=None) + @staticmethod def from_py_file(path: Union[str, Path]) -> 'AggregatorConfig': """Load config from Python file.""" diff --git a/tests/conftest.py b/tests/conftest.py index ce2dda76..087ec698 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -103,9 +103,15 @@ def connections_cache_ttl() -> float: return 1.0 +@pytest.fixture +def config_override() -> dict: + """Parameterizable fixture to allow per-test config overrides.""" + return {} + + @pytest.fixture def base_config( - configured_oidc_providers, zk_client, memoizer_config, connections_cache_ttl + configured_oidc_providers, zk_client, memoizer_config, connections_cache_ttl, config_override: dict ) -> AggregatorConfig: """Base config for tests (without any configured backends).""" conf = AggregatorConfig() @@ -123,6 +129,9 @@ def base_config( conf.partitioned_job_tracking = { "zk_client": zk_client, } + + conf.update(config_override) + return conf diff --git a/tests/test_views.py b/tests/test_views.py index ec96171b..dab7cce7 100644 --- a/tests/test_views.py +++ b/tests/test_views.py @@ -270,6 +270,44 @@ def test_collections_links(self, api100, requests_mock, backend1, backend2): "links": [], } + @pytest.mark.parametrize( + ["config_override", "expected"], + [ + ({"collection_whitelist": None}, {"S1", "S2", "S3", "S4"}), + ({"collection_whitelist": []}, {"S1", "S2", "S3", "S4"}), + ({"collection_whitelist": ["S2"]}, {"S2"}), + ({"collection_whitelist": ["S4"]}, {"S4"}), + ({"collection_whitelist": ["S2", "S3"]}, {"S2", "S3"}), + ({"collection_whitelist": ["S2", "S999"]}, {"S2"}), + ({"collection_whitelist": ["S999"]}, set()), + ], + ) + def test_collections_whitelist(self, api100, requests_mock, backend1, backend2, expected): + requests_mock.get(backend1 + "/collections", json={"collections": [{"id": "S1"}, {"id": "S2"}, {"id": "S3"}]}) + for cid in ["S1", "S2", "S3"]: + requests_mock.get(backend1 + f"/collections/{cid}", json={"id": cid, "title": f"b1 {cid}"}) + requests_mock.get(backend2 + "/collections", json={"collections": [{"id": "S3"}, {"id": "S4"}]}) + for cid in ["S3", "S4"]: + requests_mock.get(backend2 + f"/collections/{cid}", json={"id": cid, "title": f"b2 {cid}"}) + + res = api100.get("/collections").assert_status_code(200).json + assert set(c["id"] for c in res["collections"]) == expected + + res = api100.get("/collections/S2") + if "S2" in expected: + assert res.assert_status_code(200).json == DictSubSet({"id": "S2", "title": "b1 S2"}) + else: + res.assert_error(404, "CollectionNotFound") + + res = api100.get("/collections/S3") + if "S3" in expected: + assert res.assert_status_code(200).json == DictSubSet({"id": "S3", "title": "b1 S3"}) + else: + res.assert_error(404, "CollectionNotFound") + + res = api100.get("/collections/S999") + res.assert_error(404, "CollectionNotFound") + class TestAuthentication: def test_credentials_oidc_default(self, api100, backend1, backend2):