generated from opensafely-core/repo-template
-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #238 from opensafely-core/outputs-refactor
outputs refactor
- Loading branch information
Showing
6 changed files
with
193 additions
and
143 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,115 @@ | ||
import hashlib | ||
import json | ||
from collections import defaultdict | ||
from dataclasses import dataclass, field | ||
from pathlib import Path | ||
|
||
from sacro import utils, versioning | ||
|
||
|
||
def load_from_path(path): | ||
"""Use outputs path from request and load it""" | ||
outputs = ACROOutputs(path) | ||
|
||
versioning.check_version(outputs.version) | ||
|
||
return outputs | ||
|
||
|
||
@dataclass | ||
class ACROOutputs(dict): | ||
"""An ACRO json output file""" | ||
|
||
class InvalidFile(Exception): | ||
pass | ||
|
||
path: Path | ||
version: str = None | ||
config: dict = field(default_factory=dict) | ||
|
||
def __post_init__(self): | ||
self.raw_metadata = json.loads(self.path.read_text()) | ||
config_path = self.path.parent / "config.json" | ||
if config_path.exists(): | ||
self.config = json.loads(config_path.read_text()) | ||
|
||
# super basic structural validation | ||
try: | ||
assert "version" in self.raw_metadata | ||
assert "results" in self.raw_metadata | ||
assert len(self.raw_metadata["results"]) > 0 | ||
for result in self.raw_metadata["results"].values(): | ||
assert "files" in result | ||
assert len(result["files"]) > 0 | ||
for filedata in result["files"]: | ||
assert "name" in filedata | ||
|
||
except AssertionError as exc: | ||
raise self.InvalidFile(f"{self.path} is not a valid ACRO json file: {exc}") | ||
|
||
self.version = self.raw_metadata["version"] | ||
self.update(self.raw_metadata["results"]) | ||
self.annotate() | ||
|
||
def annotate(self): | ||
"""Add various useful annotations to the JSON data""" | ||
|
||
# add urls to JSON data | ||
for output, metadata in self.items(): | ||
for filedata in metadata["files"]: | ||
filedata["url"] = utils.reverse_with_params( | ||
{ | ||
"path": str(self.path), | ||
"output": output, | ||
"filename": filedata["name"], | ||
}, | ||
"contents", | ||
) | ||
|
||
# add and check checksum data, and transform cell data to more useful format | ||
checksums_dir = self.path.parent / "checksums" | ||
for output, metadata in self.items(): | ||
for filedata in metadata["files"]: | ||
# checksums | ||
filedata["checksum_valid"] = False | ||
filedata["checksum"] = None | ||
|
||
path = checksums_dir / (filedata["name"] + ".txt") | ||
if not path.exists(): | ||
continue | ||
|
||
filedata["checksum"] = path.read_text(encoding="utf8") | ||
actual_file = self.get_file_path(output, filedata["name"]) | ||
|
||
if not actual_file.exists(): # pragma: nocover | ||
continue | ||
|
||
checksum = hashlib.sha256(actual_file.read_bytes()).hexdigest() | ||
filedata["checksum_valid"] = checksum == filedata["checksum"] | ||
|
||
# cells | ||
cells = filedata.get("sdc", {}).get("cells", {}) | ||
cell_index = defaultdict(list) | ||
|
||
for flag, indicies in cells.items(): | ||
for x, y in indicies: | ||
key = f"{x},{y}" | ||
cell_index[key].append(flag) | ||
|
||
filedata["cell_index"] = cell_index | ||
|
||
def get_file_path(self, output, filename): | ||
"""Return absolute path to output file""" | ||
if filename not in { | ||
f["name"] for f in self[output]["files"] | ||
}: # pragma: nocover | ||
return None | ||
# note: if filename is absolute, this will just return filename | ||
return self.path.parent / filename | ||
|
||
def write(self): | ||
"""Useful testing helper""" | ||
self.path.write_text(json.dumps(self.raw_metadata, indent=2)) | ||
self.clear() | ||
self.version = None | ||
self.__post_init__() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
from urllib.parse import urlencode | ||
|
||
from django.urls import reverse | ||
|
||
|
||
def reverse_with_params(param_dict, *args, **kwargs): | ||
"""Wrapper for django reverse that adds query parameters""" | ||
url = reverse(*args, **kwargs) | ||
return url + "?" + urlencode(param_dict) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
import json | ||
|
||
import pytest | ||
|
||
from sacro import models | ||
|
||
|
||
def test_outputs_annotation(test_outputs): | ||
assert test_outputs.version == "0.4.0" | ||
for metadata in test_outputs.values(): | ||
for filedata in metadata["files"]: | ||
assert filedata["checksum"] is not None | ||
assert filedata["checksum_valid"] is True | ||
assert filedata["url"].startswith("/contents/?path=") | ||
|
||
cells = filedata.get("sdc", {}).get("cells", {}) | ||
cell_index = filedata["cell_index"] | ||
|
||
if cells == {}: | ||
assert cell_index == {} | ||
continue | ||
|
||
for flag, indicies in cells.items(): | ||
for x, y in indicies: | ||
key = f"{x},{y}" | ||
assert key in cell_index | ||
|
||
|
||
def test_outputs_annotation_checksum_failed(test_outputs): | ||
first_output = list(test_outputs)[0] | ||
first_file = test_outputs[first_output]["files"][0]["name"] | ||
checksum = test_outputs.path.parent / "checksums" / (first_file + ".txt") | ||
checksum.write_text("bad checksum") | ||
|
||
# re-annotate | ||
test_outputs.annotate() | ||
|
||
assert test_outputs[first_output]["files"][0]["checksum"] == "bad checksum" | ||
assert test_outputs[first_output]["files"][0]["checksum_valid"] is False | ||
|
||
|
||
@pytest.mark.parametrize( | ||
"data", | ||
[ | ||
{}, | ||
{"version": "1"}, | ||
{"version": "1", "results": {}}, | ||
{"version": "1", "results": {"name": {"files": []}}}, | ||
{"version": "1", "results": {"name": {"notfiles": "foo"}}}, | ||
], | ||
) | ||
def test_validation(data, tmp_path): | ||
bad_json = tmp_path / "bad.json" | ||
bad_json.write_text(json.dumps(data)) | ||
with pytest.raises(models.ACROOutputs.InvalidFile): | ||
models.ACROOutputs(bad_json) |
Oops, something went wrong.