allow filtering random archives by criteria (#10)
* allow filtering random archives by criteria

* include search criteria in error message
b97pla authored Oct 20, 2023
1 parent 340ae83 commit f43ebf5
Showing 3 changed files with 147 additions and 85 deletions.
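For orientation, the new criteria filtering can be exercised with a simple POST; a minimal client sketch follows (the host, port and /api/1.0 prefix are assumptions inferred from the test suite, and the body values are made up):

import json
import urllib.request

# Hypothetical request against the extended /randomarchive endpoint.
body = {
    "age": "30",                 # look back 30 days ...
    "safety_margin": "3",        # ... ending 3 days before today
    "description": "runfolder",  # assumed criterion: description must contain this
}
req = urllib.request.Request(
    "http://localhost:8888/api/1.0/randomarchive",
    data=json.dumps(body).encode(),
    headers={"Content-Type": "application/json"},
    method="POST",
)
with urllib.request.urlopen(req) as resp:
    if resp.status == 200:
        print(json.load(resp)["archive"])
    else:  # a 204 response means no archive matched the criteria
        print("no matching archive")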
201 changes: 123 additions & 78 deletions archive_db/handlers/DbHandlers.py
@@ -26,7 +26,6 @@ def decode(self, required_members=None):
return obj



class UploadHandler(BaseHandler):

@gen.coroutine
@@ -92,62 +91,6 @@ def post(self):
"host": verification.archive.host}})


class RandomUnverifiedArchiveHandler(BaseHandler):

@gen.coroutine
def get(self):
"""
Returns an unverified Archive object that has an associated Upload object
within the interval [today - age - margin, today - margin]. The margin value is
used as a safety buffer, to make sure that the archived data has been properly
flushed to tape upstream at PDC.
:param age: Number of days we should look back when picking an unverified archive
:param safety_margin: Number of days we should use as safety buffer
:param today: (optional) if specified, use this ISO date for the reference date instead of
dt.date.today().isoformat()
:return: A randomly picked unverified archive within the specified date interval
"""
body = self.decode(required_members=["age", "safety_margin"])
age = int(body["age"])
margin = int(body["safety_margin"])
today = body.get("today", dt.date.today().isoformat())

from_timestamp = dt.datetime.fromisoformat(today) - dt.timedelta(days=age+margin)
to_timestamp = from_timestamp + dt.timedelta(days=age)

# "Give me a randomly chosen archive that was uploaded between from_timestamp and
# to_timestamp, and has no previous verifications"
query = Upload\
.select()\
.join(Verification, JOIN.LEFT_OUTER, on=(
Verification.archive_id == Upload.archive_id))\
.where(Upload.timestamp.between(from_timestamp, to_timestamp))\
.group_by(Upload.archive_id)\
.having(fn.Count(Verification.id) < 1)\
.order_by(fn.Random())

result_len = query.count()

if result_len > 0:
upload = query.first()
archive_name = os.path.basename(os.path.normpath(upload.archive.path))
self.write_json({
"status": "unverified",
"archive": {
"timestamp": str(upload.timestamp),
"path": upload.archive.path,
"description": upload.archive.description,
"host": upload.archive.host,
"archive": archive_name
}
})
else:
msg = f"No unverified archives uploaded between {from_timestamp} and {to_timestamp} " \
f"was found!"
self.set_status(204, reason=msg)


# TODO: We might have to add logic in some of the services
# that adds a file with the description inside the archive,
# so we can verify that we're operating on the correct
@@ -209,6 +152,15 @@ def get(self):

class QueryHandlerBase(BaseHandler):

@staticmethod
def _str_as_bool(bool_str):
if type(bool_str) is bool:
return bool_str
if type(bool_str) is str and bool_str.lower() in ["true", "false"]:
return bool_str.lower() == "true"
raise TypeError(
    f"{bool_str} cannot be converted to bool")

@staticmethod
def _db_query():

@@ -232,6 +184,48 @@ def _db_query():
Archive.path.asc())
return query

@staticmethod
def _filter_query(
query,
path=None,
description=None,
host=None,
uploaded_before=None,
uploaded_after=None,
verified=None,
removed=None,
**kwargs):

if path:
query = query.where(
Archive.path.contains(path))
if description:
query = query.where(
Archive.description.contains(description))
if host:
query = query.where(
Archive.host.contains(host))
if uploaded_before:
query = query.where(
Upload.timestamp <= dt.datetime.strptime(
f"{uploaded_before} 23:59:59",
"%Y-%m-%d %H:%M:%S"))
if uploaded_after:
query = query.where(
Upload.timestamp >= dt.datetime.strptime(
uploaded_after,
"%Y-%m-%d"))
if verified is not None:
query = query.where(
Verification.timestamp.is_null(
not QueryHandlerBase._str_as_bool(verified)))
if removed is not None:
query = query.where(
Removal.timestamp.is_null(
not QueryHandlerBase._str_as_bool(removed)))

return query.dicts()
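Each branch above ANDs one more .where() clause onto the peewee query, and the trailing **kwargs silently absorbs non-filter keys such as age and safety_margin. A hypothetical call, with names taken from the diff and made-up values:

# Archives on "testhost" that are still unverified, uploaded this year.
body = {"host": "testhost", "verified": "False", "uploaded_after": "2023-01-01"}
query = QueryHandlerBase._filter_query(QueryHandlerBase._db_query(), **body)
for row in query:  # .dicts() yields each result row as a plain dict
    print(row["path"], row["uploaded"])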

def _do_query(self, query):
if query:
self.write_json({
@@ -304,29 +298,80 @@ def post(self):
under the key "archives"
"""
body = self.decode()
query = self._db_query()
query = self._filter_query(
self._db_query(),
**body)
self._do_query(query)

if body.get("path"):
query = query.where(Archive.path.contains(body["path"]))
if body.get("description"):
query = query.where(Archive.description.contains(body["description"]))
if body.get("host"):
query = query.where(Archive.host.contains(body["host"]))
if body.get("uploaded_before"):
query = query.where(
Upload.timestamp <= dt.datetime.strptime(
f"{body['uploaded_before']} 23:59:59",
"%Y-%m-%d %H:%M:%S"))
if body.get("uploaded_after"):
query = query.where(
Upload.timestamp >= dt.datetime.strptime(body["uploaded_after"], "%Y-%m-%d"))
if body.get("verified") is not None and body["verified"] in ["True", "False"]:
query = query.where(Verification.timestamp.is_null(body["verified"] == "False"))
if body.get("removed") is not None and body["removed"] in ["True", "False"]:
query = query.where(Removal.timestamp.is_null(body["removed"] == "False"))

query = (query.dicts())
self._do_query(query)
class RandomUnverifiedArchiveHandler(QueryHandlerBase):

@gen.coroutine
def get(self):
"""
For backwards compatibility, forward this GET request to the POST handler
"""
self.post()

@gen.coroutine
def post(self):
"""
Returns an unverified Archive object that has an associated Upload object
within the interval [today - age - margin, today - margin]. The margin value is
used as a safety buffer, to make sure that the archived data has been properly
flushed to tape upstream at PDC.
:param age: Number of days we should look back when picking an unverified archive
:param safety_margin: Number of days we should use as safety buffer
:param today: (optional) if specified, use this ISO date for the reference date instead of
dt.date.today().isoformat()
:return: A randomly picked unverified archive within the specified date interval
"""
body = self.decode(
required_members=[
"age",
"safety_margin"])
age = int(body["age"])
margin = int(body["safety_margin"])
today = body.get("today", dt.date.today().isoformat())

from_timestamp = dt.datetime.fromisoformat(today) - dt.timedelta(days=age+margin)
to_timestamp = from_timestamp + dt.timedelta(days=age)

body["uploaded_before"] = to_timestamp.date().isoformat()
body["uploaded_after"] = from_timestamp.date().isoformat()
body["verified"] = False

query = self._filter_query(
self._db_query(),
**body)

result_len = query.count()

if result_len > 0:
upload = query.first()
archive_name = os.path.basename(
os.path.normpath(
upload["path"]
)
)
self.write_json({
"status": "unverified",
"archive": {
"timestamp": str(upload["uploaded"]),
"path": upload["path"],
"description": upload["description"],
"host": upload["host"],
"archive": archive_name
}
})
else:
criteria = ", ".join([f"{k}={v}" for k, v in body.items()])
msg = f"No archives matching criteria {criteria} were found!"
self.set_status(
204,
reason=msg
)
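To make the window arithmetic concrete, a quick sanity check (the numbers are illustrative, not from the commit):

import datetime as dt

age, margin = 30, 3
today = dt.datetime.fromisoformat("2023-10-20")
from_timestamp = today - dt.timedelta(days=age + margin)  # 2023-09-17
to_timestamp = from_timestamp + dt.timedelta(days=age)    # 2023-10-17
# uploads from the last `margin` days are excluded as a safety buffer
assert (today - to_timestamp).days == margin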


class VersionHandler(BaseHandler):
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -7,7 +7,7 @@ include = ["archive_db*"]

[project]
name = "archive-db"
version = "1.3.1"
version = "1.4.0"
authors = [
{name = "SNP&SEQ Technology Platform, Uppsala University", email = "[email protected]" },
]
29 changes: 23 additions & 6 deletions tests/test_models.py
@@ -22,6 +22,7 @@ class TestDb(AsyncHTTPTestCase):
num_archives = 5
first_archive = 1
second_archive = 3
third_archive = 4

API_BASE = "/api/1.0"

@@ -48,8 +49,9 @@ def example_data(self):
"path": f"/data/testhost/runfolders/archive-{i}",
"host": "testhost",
"uploaded": str(self.now - datetime.timedelta(days=i)) if i in [
self.first_archive, self.second_archive] else None,
"verified": str(self.now) if i == self.second_archive else None,
self.first_archive, self.second_archive, self.third_archive] else None,
"verified": str(self.now) if i in [
self.second_archive] else None,
"removed": str(self.now) if i == self.second_archive else None
}
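Summarising the fixture state implied by this hunk (archive numbers per the class constants above; the description contents are assumed to end in the archive number, as the new criteria test below relies on):

# 1 (first_archive):  uploaded, never verified  -> eligible for /randomarchive
# 3 (second_archive): uploaded, verified, removed -> excluded once verified=False
# 4 (third_archive):  uploaded, never verified  -> matched by the "-4" description filter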

@@ -173,8 +175,8 @@ def test_failing_fetch_random_unverified_archive(self):
self.create_data()
# I.e. our lookback window is [today - age - safety_margin, today - safety_margin] days.
body = {
"age": "5",
"safety_margin": "1",
"age": "1",
"safety_margin": "2",
"today": self.now.date().isoformat()
}
resp = self.go("/randomarchive", method="GET", body=body)
@@ -183,8 +185,8 @@ def test_fetch_random_unverified_archive(self):
def test_fetch_random_unverified_archive(self):
archives = self.create_data()
body = {
"age": "5",
"safety_margin": "0",
"age": "2",
"safety_margin": "1",
"today": self.now.date().isoformat()
}
resp = self.go("/randomarchive", method="GET", body=body)
@@ -194,6 +196,21 @@ def test_fetch_random_unverified_archive(self):
for key in ("description", "host", "path"):
self.assertEqual(obs_archive[key], exp_archive[key])

def test_fetch_random_archive_with_criteria(self):
archives = self.create_data()
body = {
"age": "5",
"safety_margin": "2",
"description": f"-{self.third_archive}",
"today": self.now.date().isoformat()
}
resp = self.go("/randomarchive", method="POST", body=body)
self.assertEqual(resp.code, 200)
obs_archive = json_decode(resp.body).get("archive")
exp_archive = archives[self.third_archive]
for key in ("description", "host", "path"):
self.assertEqual(obs_archive[key], exp_archive[key])

def test_version(self):
resp = self.go("/version", method="GET")
self.assertEqual(resp.code, 200)
