Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

outputs refactor #238

Merged
merged 2 commits into from
Oct 17, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
115 changes: 115 additions & 0 deletions sacro/models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
import hashlib
import json
from collections import defaultdict
from dataclasses import dataclass, field
from pathlib import Path

from sacro import utils, versioning


def load_from_path(path):
"""Use outputs path from request and load it"""
outputs = ACROOutputs(path)

versioning.check_version(outputs.version)

return outputs


@dataclass
class ACROOutputs(dict):
"""An ACRO json output file"""

class InvalidFile(Exception):
pass

path: Path
version: str = None
config: dict = field(default_factory=dict)

def __post_init__(self):
self.raw_metadata = json.loads(self.path.read_text())
config_path = self.path.parent / "config.json"
if config_path.exists():
self.config = json.loads(config_path.read_text())

# super basic structural validation
try:
assert "version" in self.raw_metadata
assert "results" in self.raw_metadata
assert len(self.raw_metadata["results"]) > 0
for result in self.raw_metadata["results"].values():
assert "files" in result
assert len(result["files"]) > 0
for filedata in result["files"]:
assert "name" in filedata

except AssertionError as exc:
raise self.InvalidFile(f"{self.path} is not a valid ACRO json file: {exc}")

self.version = self.raw_metadata["version"]
self.update(self.raw_metadata["results"])
self.annotate()

def annotate(self):
"""Add various useful annotations to the JSON data"""

# add urls to JSON data
for output, metadata in self.items():
for filedata in metadata["files"]:
filedata["url"] = utils.reverse_with_params(
{
"path": str(self.path),
"output": output,
"filename": filedata["name"],
},
"contents",
)

# add and check checksum data, and transform cell data to more useful format
checksums_dir = self.path.parent / "checksums"
for output, metadata in self.items():
for filedata in metadata["files"]:
# checksums
filedata["checksum_valid"] = False
filedata["checksum"] = None

path = checksums_dir / (filedata["name"] + ".txt")
if not path.exists():
continue

filedata["checksum"] = path.read_text(encoding="utf8")
actual_file = self.get_file_path(output, filedata["name"])

if not actual_file.exists(): # pragma: nocover
continue

checksum = hashlib.sha256(actual_file.read_bytes()).hexdigest()
filedata["checksum_valid"] = checksum == filedata["checksum"]

# cells
cells = filedata.get("sdc", {}).get("cells", {})
cell_index = defaultdict(list)

for flag, indicies in cells.items():
for x, y in indicies:
key = f"{x},{y}"
cell_index[key].append(flag)

filedata["cell_index"] = cell_index

def get_file_path(self, output, filename):
"""Return absolute path to output file"""
if filename not in {
f["name"] for f in self[output]["files"]
}: # pragma: nocover
return None
# note: if filename is absolute, this will just return filename
return self.path.parent / filename

def write(self):
"""Useful testing helper"""
self.path.write_text(json.dumps(self.raw_metadata, indent=2))
self.clear()
self.version = None
self.__post_init__()
9 changes: 9 additions & 0 deletions sacro/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
from urllib.parse import urlencode

from django.urls import reverse


def reverse_with_params(param_dict, *args, **kwargs):
"""Wrapper for django reverse that adds query parameters"""
url = reverse(*args, **kwargs)
return url + "?" + urlencode(param_dict)
114 changes: 9 additions & 105 deletions sacro/views.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,8 @@
import getpass
import hashlib
import html
import json
import logging
from collections import defaultdict
from dataclasses import dataclass, field
from pathlib import Path
from urllib.parse import urlencode

from django.conf import settings
from django.http import FileResponse, Http404, HttpResponse, HttpResponseBadRequest
Expand All @@ -16,104 +12,16 @@
from django.urls import reverse
from django.views.decorators.http import require_GET, require_POST

from sacro import models, utils
from sacro.adapters import local_audit, zipfile
from sacro.versioning import check_version


logger = logging.getLogger(__name__)

REVIEWS = {}


def reverse_with_params(param_dict, *args, **kwargs):
"""Wrapper for django reverse that adds query parameters"""
url = reverse(*args, **kwargs)
return url + "?" + urlencode(param_dict)


@dataclass
class Outputs(dict):
"""An ACRO json output file"""

path: Path
version: str = None
config: dict = field(default_factory=dict)

def __post_init__(self):
self.raw_metadata = json.loads(self.path.read_text())
config_path = self.path.parent / "config.json"
if config_path.exists():
self.config = json.loads(config_path.read_text())

self.version = self.raw_metadata["version"]
self.update(self.raw_metadata["results"])
self.annotate()

def annotate(self):
"""Add various useful annotations to the JSON data"""

# add urls to JSON data
for output, metadata in self.items():
for filedata in metadata["files"]:
filedata["url"] = reverse_with_params(
{
"path": str(self.path),
"output": output,
"filename": filedata["name"],
},
"contents",
)

# add and check checksum data, and transform cell data to more useful format
checksums_dir = self.path.parent / "checksums"
for output, metadata in self.items():
for filedata in metadata["files"]:
# checksums
filedata["checksum_valid"] = False
filedata["checksum"] = None

path = checksums_dir / (filedata["name"] + ".txt")
if not path.exists():
continue

filedata["checksum"] = path.read_text(encoding="utf8")
actual_file = self.get_file_path(output, filedata["name"])

if not actual_file.exists(): # pragma: nocover
continue

checksum = hashlib.sha256(actual_file.read_bytes()).hexdigest()
filedata["checksum_valid"] = checksum == filedata["checksum"]

# cells
cells = filedata.get("sdc", {}).get("cells", {})
cell_index = defaultdict(list)

for flag, indicies in cells.items():
for x, y in indicies:
key = f"{x},{y}"
cell_index[key].append(flag)

filedata["cell_index"] = cell_index

def get_file_path(self, output, filename):
"""Return absolute path to output file"""
if filename not in {
f["name"] for f in self[output]["files"]
}: # pragma: nocover
return None
# note: if filename is absolute, this will just return filename
return self.path.parent / filename

def write(self):
"""Useful testing helper"""
self.path.write_text(json.dumps(self.raw_metadata, indent=2))
self.clear()
self.version = None
self.__post_init__()


def get_outputs(data):
def get_outputs_from_request(data):
"""Use outputs path from request and load it"""
param_path = data.get("path")
if param_path is None:
Expand All @@ -124,11 +32,7 @@ def get_outputs(data):
if not path.exists(): # pragma: no cover
raise Http404

outputs = Outputs(path)

check_version(outputs.version)

return outputs
return models.load_from_path(path)


@require_GET
Expand All @@ -139,8 +43,8 @@ def index(request):
if "path" not in request.GET and settings.DEBUG:
data = {"path": "outputs/results.json"}

outputs = get_outputs(data)
create_url = reverse_with_params({"path": str(outputs.path)}, "review-create")
outputs = get_outputs_from_request(data)
create_url = utils.reverse_with_params({"path": str(outputs.path)}, "review-create")

return TemplateResponse(
request,
Expand All @@ -161,7 +65,7 @@ def contents(request):
We also require the json file and check that the requested file is present
in the json. This prevents loading arbitrary user files over http.
"""
outputs = get_outputs(request.GET)
outputs = get_outputs_from_request(request.GET)
output = request.GET.get("output")
filename = request.GET.get("filename")

Expand Down Expand Up @@ -194,7 +98,7 @@ def approved_outputs(request, pk):
if not (review := REVIEWS.get(pk)):
raise Http404

outputs = Outputs(review["path"])
outputs = models.ACROOutputs(review["path"])

approved_outputs = [k for k, v in review["decisions"].items() if v["state"] is True]
in_memory_zf = zipfile.create(outputs, approved_outputs)
Expand All @@ -220,7 +124,7 @@ def review_create(request):

# we load the path from the querystring, even though this is a post request
# check the reviewed outputs are valid
outputs = get_outputs(request.GET)
outputs = get_outputs_from_request(request.GET)
approved_outputs = [k for k, v in review.items() if v["state"] is True]
unrecognized_outputs = [o for o in approved_outputs if o not in outputs]
if unrecognized_outputs:
Expand Down Expand Up @@ -267,7 +171,7 @@ def summary(request, pk):
raise Http404

# add ACRO status to output decisions
outputs = Outputs(review["path"])
outputs = models.ACROOutputs(review["path"])
for name, data in review["decisions"].items():
review["decisions"][name]["acro_status"] = outputs[name]["status"]

Expand Down
4 changes: 2 additions & 2 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

import pytest

from sacro import views
from sacro import models


@pytest.fixture
Expand All @@ -14,4 +14,4 @@ def TEST_PATH():
@pytest.fixture
def test_outputs(tmp_path, TEST_PATH):
shutil.copytree(TEST_PATH.parent, tmp_path, dirs_exist_ok=True)
return views.Outputs(tmp_path / TEST_PATH.name)
return models.ACROOutputs(tmp_path / TEST_PATH.name)
56 changes: 56 additions & 0 deletions tests/test_models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
import json

import pytest

from sacro import models


def test_outputs_annotation(test_outputs):
assert test_outputs.version == "0.4.0"
for metadata in test_outputs.values():
for filedata in metadata["files"]:
assert filedata["checksum"] is not None
assert filedata["checksum_valid"] is True
assert filedata["url"].startswith("/contents/?path=")

cells = filedata.get("sdc", {}).get("cells", {})
cell_index = filedata["cell_index"]

if cells == {}:
assert cell_index == {}
continue

for flag, indicies in cells.items():
for x, y in indicies:
key = f"{x},{y}"
assert key in cell_index


def test_outputs_annotation_checksum_failed(test_outputs):
first_output = list(test_outputs)[0]
first_file = test_outputs[first_output]["files"][0]["name"]
checksum = test_outputs.path.parent / "checksums" / (first_file + ".txt")
checksum.write_text("bad checksum")

# re-annotate
test_outputs.annotate()

assert test_outputs[first_output]["files"][0]["checksum"] == "bad checksum"
assert test_outputs[first_output]["files"][0]["checksum_valid"] is False


@pytest.mark.parametrize(
"data",
[
{},
{"version": "1"},
{"version": "1", "results": {}},
{"version": "1", "results": {"name": {"files": []}}},
{"version": "1", "results": {"name": {"notfiles": "foo"}}},
],
)
def test_validation(data, tmp_path):
bad_json = tmp_path / "bad.json"
bad_json.write_text(json.dumps(data))
with pytest.raises(models.ACROOutputs.InvalidFile):
models.ACROOutputs(bad_json)
Loading