From f43ebf5f7bc324191faceca2b262ef056d0bd3a1 Mon Sep 17 00:00:00 2001
From: Pontus Larsson
Date: Fri, 20 Oct 2023 09:57:18 +0200
Subject: [PATCH] allow filtering random archives by criteria (#10)

* allow filtering random archives by criteria

* include search criteria in error message
---
 archive_db/handlers/DbHandlers.py | 201 ++++++++++++++++++------------
 pyproject.toml                    |   2 +-
 tests/test_models.py              |  29 ++++-
 3 files changed, 147 insertions(+), 85 deletions(-)

diff --git a/archive_db/handlers/DbHandlers.py b/archive_db/handlers/DbHandlers.py
index 3d7fb58..f590032 100644
--- a/archive_db/handlers/DbHandlers.py
+++ b/archive_db/handlers/DbHandlers.py
@@ -26,7 +26,6 @@ def decode(self, required_members=None):

         return obj

-
 class UploadHandler(BaseHandler):

     @gen.coroutine
@@ -92,62 +91,6 @@ def post(self):
                               "host": verification.archive.host}})


-class RandomUnverifiedArchiveHandler(BaseHandler):
-
-    @gen.coroutine
-    def get(self):
-        """
-        Returns an unverified Archive object that has an associated was Upload object
-        within the interval [today - age - margin, today - margin]. The margin value is
-        used as a safety buffer, to make sure that the archived data has been properly
-        flushed to tape upstreams at PDC.
-
-        :param age: Number of days we should look back when picking an unverified archive
-        :param safety_margin: Number of days we should use as safety buffer
-        :param today: (optional) if specified, use this timestamp for the reference date instead of
-        datetime.datetime.utcnow().isoformat()
-        :return A randomly pickedunverified archive within the specified date interval
-        """
-        body = self.decode(required_members=["age", "safety_margin"])
-        age = int(body["age"])
-        margin = int(body["safety_margin"])
-        today = body.get("today", dt.date.today().isoformat())
-
-        from_timestamp = dt.datetime.fromisoformat(today) - dt.timedelta(days=age+margin)
-        to_timestamp = from_timestamp + dt.timedelta(days=age)
-
-        # "Give me a randomly chosen archive that was uploaded between from_timestamp and
-        # to_timestamp, and has no previous verifications"
-        query = Upload\
-            .select()\
-            .join(Verification, JOIN.LEFT_OUTER, on=(
-                Verification.archive_id == Upload.archive_id))\
-            .where(Upload.timestamp.between(from_timestamp, to_timestamp))\
-            .group_by(Upload.archive_id)\
-            .having(fn.Count(Verification.id) < 1)\
-            .order_by(fn.Random())
-
-        result_len = query.count()
-
-        if result_len > 0:
-            upload = query.first()
-            archive_name = os.path.basename(os.path.normpath(upload.archive.path))
-            self.write_json({
-                "status": "unverified",
-                "archive": {
-                    "timestamp": str(upload.timestamp),
-                    "path": upload.archive.path,
-                    "description": upload.archive.description,
-                    "host": upload.archive.host,
-                    "archive": archive_name
-                }
-            })
-        else:
-            msg = f"No unverified archives uploaded between {from_timestamp} and {to_timestamp} " \
-                  f"was found!"
-            self.set_status(204, reason=msg)
-
-
 # TODO: We might have to add logic in some of the services
 # that adds a file with the description inside the archive,
 # so we can verify that we're operating on the correct
@@ -209,6 +152,15 @@ def get(self):

 class QueryHandlerBase(BaseHandler):

+    @staticmethod
+    def _str_as_bool(bool_str):
+        if type(bool_str) is bool:
+            return bool_str
+        if type(bool_str) is str and bool_str.lower() in ["true", "false"]:
+            return bool_str.lower() == "true"
+        raise TypeError(
+            f"{bool_str} can not be converted to bool")
+
     @staticmethod
     def _db_query():

@@ -232,6 +184,48 @@ def _db_query():
                 Archive.path.asc())
         return query

+    @staticmethod
+    def _filter_query(
+            query,
+            path=None,
+            description=None,
+            host=None,
+            uploaded_before=None,
+            uploaded_after=None,
+            verified=None,
+            removed=None,
+            **kwargs):
+
+        if path:
+            query = query.where(
+                Archive.path.contains(path))
+        if description:
+            query = query.where(
+                Archive.description.contains(description))
+        if host:
+            query = query.where(
+                Archive.host.contains(host))
+        if uploaded_before:
+            query = query.where(
+                Upload.timestamp <= dt.datetime.strptime(
+                    f"{uploaded_before} 23:59:59",
+                    "%Y-%m-%d %H:%M:%S"))
+        if uploaded_after:
+            query = query.where(
+                Upload.timestamp >= dt.datetime.strptime(
+                    uploaded_after,
+                    "%Y-%m-%d"))
+        if verified is not None:
+            query = query.where(
+                Verification.timestamp.is_null(
+                    not QueryHandlerBase._str_as_bool(verified)))
+        if removed is not None:
+            query = query.where(
+                Removal.timestamp.is_null(
+                    not QueryHandlerBase._str_as_bool(removed)))
+
+        return query.dicts()
+
     def _do_query(self, query):
         if query:
             self.write_json({
@@ -304,29 +298,80 @@ def post(self):
             under the key "archives"
         """
         body = self.decode()
-        query = self._db_query()
+        query = self._filter_query(
+            self._db_query(),
+            **body)
+        self._do_query(query)

-        if body.get("path"):
-            query = query.where(Archive.path.contains(body["path"]))
-        if body.get("description"):
-            query = query.where(Archive.description.contains(body["description"]))
-        if body.get("host"):
-            query = query.where(Archive.host.contains(body["host"]))
-        if body.get("uploaded_before"):
-            query = query.where(
-                Upload.timestamp <= dt.datetime.strptime(
-                    f"{body['uploaded_before']} 23:59:59",
-                    "%Y-%m-%d %H:%M:%S"))
-        if body.get("uploaded_after"):
-            query = query.where(
-                Upload.timestamp >= dt.datetime.strptime(body["uploaded_after"], "%Y-%m-%d"))
-        if body.get("verified") is not None and body["verified"] in ["True", "False"]:
-            query = query.where(Verification.timestamp.is_null(body["verified"] == "False"))
-        if body.get("removed") is not None and body["removed"] in ["True", "False"]:
-            query = query.where(Removal.timestamp.is_null(body["removed"] == "False"))
-        query = (query.dicts())

-        self._do_query(query)


+class RandomUnverifiedArchiveHandler(QueryHandlerBase):
+
+    @gen.coroutine
+    def get(self):
+        """
+        For backwards compatibility, forward this GET request to the POST handler
+        """
+        self.post()
+
+    @gen.coroutine
+    def post(self):
+        """
+        Returns an unverified Archive object that has an associated Upload object
+        within the interval [today - age - margin, today - margin]. The margin value is
+        used as a safety buffer, to make sure that the archived data has been properly
+        flushed to tape upstream at PDC.
+
+        :param age: Number of days we should look back when picking an unverified archive
+        :param safety_margin: Number of days we should use as safety buffer
+        :param today: (optional) if specified, use this timestamp for the reference date instead of
+            datetime.datetime.utcnow().isoformat()
+        :return: A randomly picked unverified archive within the specified date interval
+        """
+        body = self.decode(
+            required_members=[
+                "age",
+                "safety_margin"])
+        age = int(body["age"])
+        margin = int(body["safety_margin"])
+        today = body.get("today", dt.date.today().isoformat())
+
+        from_timestamp = dt.datetime.fromisoformat(today) - dt.timedelta(days=age+margin)
+        to_timestamp = from_timestamp + dt.timedelta(days=age)
+
+        body["uploaded_before"] = to_timestamp.date().isoformat()
+        body["uploaded_after"] = from_timestamp.date().isoformat()
+        body["verified"] = False
+
+        query = self._filter_query(
+            self._db_query(),
+            **body)
+
+        result_len = query.count()
+
+        if result_len > 0:
+            upload = query.first()
+            archive_name = os.path.basename(
+                os.path.normpath(
+                    upload["path"]
+                )
+            )
+            self.write_json({
+                "status": "unverified",
+                "archive": {
+                    "timestamp": str(upload["uploaded"]),
+                    "path": upload["path"],
+                    "description": upload["description"],
+                    "host": upload["host"],
+                    "archive": archive_name
+                }
+            })
+        else:
+            criteria = ", ".join([f"{k}={v}" for k, v in body.items()])
+            msg = f"No archives matching criteria {criteria} were found!"
+            self.set_status(
+                204,
+                reason=msg
+            )


 class VersionHandler(BaseHandler):
diff --git a/pyproject.toml b/pyproject.toml
index 7120af3..16b337e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -7,7 +7,7 @@ include = ["archive_db*"]

 [project]
 name = "archive-db"
-version = "1.3.1"
+version = "1.4.0"
 authors = [
     {name = "SNP&SEQ Technology Platform, Uppsala University", email = "seq@medsci.uu.se" },
 ]
diff --git a/tests/test_models.py b/tests/test_models.py
index e620c36..6546c1c 100644
--- a/tests/test_models.py
+++ b/tests/test_models.py
@@ -22,6 +22,7 @@ class TestDb(AsyncHTTPTestCase):
     num_archives = 5
     first_archive = 1
     second_archive = 3
+    third_archive = 4

     API_BASE = "/api/1.0"

@@ -48,8 +49,9 @@ def example_data(self):
                 "path": f"/data/testhost/runfolders/archive-{i}",
                 "host": "testhost",
                 "uploaded": str(self.now - datetime.timedelta(days=i)) if i in [
-                    self.first_archive, self.second_archive] else None,
-                "verified": str(self.now) if i == self.second_archive else None,
+                    self.first_archive, self.second_archive, self.third_archive] else None,
+                "verified": str(self.now) if i in [
+                    self.second_archive] else None,
                 "removed": str(self.now) if i == self.second_archive else None
             }

@@ -173,8 +175,8 @@ def test_failing_fetch_random_unverified_archive(self):
         self.create_data()
         # I.e. our lookback window is [today - age - safety_margin, today - safety_margin] days.
         body = {
-            "age": "5",
-            "safety_margin": "1",
+            "age": "1",
+            "safety_margin": "2",
             "today": self.now.date().isoformat()
         }
         resp = self.go("/randomarchive", method="GET", body=body)
@@ -183,8 +185,8 @@ def test_failing_fetch_random_unverified_archive(self):
     def test_fetch_random_unverified_archive(self):
         archives = self.create_data()
         body = {
-            "age": "5",
-            "safety_margin": "0",
+            "age": "2",
+            "safety_margin": "1",
             "today": self.now.date().isoformat()
         }
         resp = self.go("/randomarchive", method="GET", body=body)
@@ -194,6 +196,21 @@ def test_fetch_random_unverified_archive(self):
         for key in ("description", "host", "path"):
             self.assertEqual(obs_archive[key], exp_archive[key])

+    def test_fetch_random_archive_with_criteria(self):
+        archives = self.create_data()
+        body = {
+            "age": "5",
+            "safety_margin": "2",
+            "description": f"-{self.third_archive}",
+            "today": self.now.date().isoformat()
+        }
+        resp = self.go("/randomarchive", method="POST", body=body)
+        self.assertEqual(resp.code, 200)
+        obs_archive = json_decode(resp.body).get("archive")
+        exp_archive = archives[self.third_archive]
+        for key in ("description", "host", "path"):
+            self.assertEqual(obs_archive[key], exp_archive[key])
+
     def test_version(self):
         resp = self.go("/version", method="GET")
         self.assertEqual(resp.code, 200)
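
A minimal client-side sketch of the filtered /randomarchive endpoint introduced by this patch. It is not part of the diff: the host and port are placeholders, the /api/1.0 prefix is taken from API_BASE in tests/test_models.py, and only the Python standard library is used. The payload mirrors test_fetch_random_archive_with_criteria; path, description, host and removed may be supplied as extra criteria, while uploaded_before, uploaded_after and verified are overwritten server-side from age and safety_margin.

    import json
    from urllib.request import Request, urlopen

    # Placeholder deployment address; "/api/1.0" mirrors API_BASE in tests/test_models.py.
    BASE_URL = "http://localhost:8888/api/1.0"

    # age/safety_margin define the window [today - age - margin, today - margin];
    # with age=30 and safety_margin=3 that is [today - 33, today - 3].
    payload = {
        "age": "30",
        "safety_margin": "3",
        "host": "testhost",        # optional filter criteria added by this patch
        "description": "archive-",
    }

    request = Request(
        f"{BASE_URL}/randomarchive",
        data=json.dumps(payload).encode("utf-8"),
        headers={"Content-Type": "application/json"},
        method="POST",
    )

    with urlopen(request) as response:
        if response.status == 200:
            archive = json.loads(response.read())["archive"]
            print(f"Unverified archive to check: {archive['path']} on {archive['host']}")
        else:
            # 204 means nothing matched; the criteria are echoed in the reason phrase.
            print(f"No match: {response.status} {response.reason}")

Since a 204 response carries no body, the search criteria are only reported through the HTTP reason phrase set by set_status(204, reason=msg).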