Merge pull request #238 from opensafely-core/outputs-refactor

outputs refactor
AI-SDC · Oct 17, 2023 · 2f244cd · 2f244cd
2 parents 3deaa3c + d81b3f6
commit 2f244cd
Show file tree

Hide file tree

Showing 6 changed files with 193 additions and 143 deletions.
diff --git a/sacro/models.py b/sacro/models.py
@@ -0,0 +1,115 @@
+import hashlib
+import json
+from collections import defaultdict
+from dataclasses import dataclass, field
+from pathlib import Path
+
+from sacro import utils, versioning
+
+
+def load_from_path(path):
+    """Use outputs path from request and load it"""
+    outputs = ACROOutputs(path)
+
+    versioning.check_version(outputs.version)
+
+    return outputs
+
+
+@dataclass
+class ACROOutputs(dict):
+    """An ACRO json output file"""
+
+    class InvalidFile(Exception):
+        pass
+
+    path: Path
+    version: str = None
+    config: dict = field(default_factory=dict)
+
+    def __post_init__(self):
+        self.raw_metadata = json.loads(self.path.read_text())
+        config_path = self.path.parent / "config.json"
+        if config_path.exists():
+            self.config = json.loads(config_path.read_text())
+
+        # super basic structural validation
+        try:
+            assert "version" in self.raw_metadata
+            assert "results" in self.raw_metadata
+            assert len(self.raw_metadata["results"]) > 0
+            for result in self.raw_metadata["results"].values():
+                assert "files" in result
+                assert len(result["files"]) > 0
+                for filedata in result["files"]:
+                    assert "name" in filedata
+
+        except AssertionError as exc:
+            raise self.InvalidFile(f"{self.path} is not a valid ACRO json file: {exc}")
+
+        self.version = self.raw_metadata["version"]
+        self.update(self.raw_metadata["results"])
+        self.annotate()
+
+    def annotate(self):
+        """Add various useful annotations to the JSON data"""
+
+        # add urls to JSON data
+        for output, metadata in self.items():
+            for filedata in metadata["files"]:
+                filedata["url"] = utils.reverse_with_params(
+                    {
+                        "path": str(self.path),
+                        "output": output,
+                        "filename": filedata["name"],
+                    },
+                    "contents",
+                )
+
+        # add and check checksum data, and transform cell data to more useful format
+        checksums_dir = self.path.parent / "checksums"
+        for output, metadata in self.items():
+            for filedata in metadata["files"]:
+                # checksums
+                filedata["checksum_valid"] = False
+                filedata["checksum"] = None
+
+                path = checksums_dir / (filedata["name"] + ".txt")
+                if not path.exists():
+                    continue
+
+                filedata["checksum"] = path.read_text(encoding="utf8")
+                actual_file = self.get_file_path(output, filedata["name"])
+
+                if not actual_file.exists():  # pragma: nocover
+                    continue
+
+                checksum = hashlib.sha256(actual_file.read_bytes()).hexdigest()
+                filedata["checksum_valid"] = checksum == filedata["checksum"]
+
+                # cells
+                cells = filedata.get("sdc", {}).get("cells", {})
+                cell_index = defaultdict(list)
+
+                for flag, indicies in cells.items():
+                    for x, y in indicies:
+                        key = f"{x},{y}"
+                        cell_index[key].append(flag)
+
+                filedata["cell_index"] = cell_index
+
+    def get_file_path(self, output, filename):
+        """Return absolute path to output file"""
+        if filename not in {
+            f["name"] for f in self[output]["files"]
+        }:  # pragma: nocover
+            return None
+        # note: if filename is absolute, this will just return filename
+        return self.path.parent / filename
+
+    def write(self):
+        """Useful testing helper"""
+        self.path.write_text(json.dumps(self.raw_metadata, indent=2))
+        self.clear()
+        self.version = None
+        self.__post_init__()
diff --git a/sacro/utils.py b/sacro/utils.py
@@ -0,0 +1,9 @@
+from urllib.parse import urlencode
+
+from django.urls import reverse
+
+
+def reverse_with_params(param_dict, *args, **kwargs):
+    """Wrapper for django reverse that adds query parameters"""
+    url = reverse(*args, **kwargs)
+    return url + "?" + urlencode(param_dict)
diff --git a/sacro/views.py b/sacro/views.py
@@ -1,12 +1,8 @@
 import getpass
-import hashlib
 import html
 import json
 import logging
-from collections import defaultdict
-from dataclasses import dataclass, field
 from pathlib import Path
-from urllib.parse import urlencode
 
 from django.conf import settings
 from django.http import FileResponse, Http404, HttpResponse, HttpResponseBadRequest
@@ -16,104 +12,16 @@
 from django.urls import reverse
 from django.views.decorators.http import require_GET, require_POST
 
+from sacro import models, utils
 from sacro.adapters import local_audit, zipfile
-from sacro.versioning import check_version
 
 
 logger = logging.getLogger(__name__)
 
 REVIEWS = {}
 
 
-def reverse_with_params(param_dict, *args, **kwargs):
-    """Wrapper for django reverse that adds query parameters"""
-    url = reverse(*args, **kwargs)
-    return url + "?" + urlencode(param_dict)
-
-
-@dataclass
-class Outputs(dict):
-    """An ACRO json output file"""
-
-    path: Path
-    version: str = None
-    config: dict = field(default_factory=dict)
-
-    def __post_init__(self):
-        self.raw_metadata = json.loads(self.path.read_text())
-        config_path = self.path.parent / "config.json"
-        if config_path.exists():
-            self.config = json.loads(config_path.read_text())
-
-        self.version = self.raw_metadata["version"]
-        self.update(self.raw_metadata["results"])
-        self.annotate()
-
-    def annotate(self):
-        """Add various useful annotations to the JSON data"""
-
-        # add urls to JSON data
-        for output, metadata in self.items():
-            for filedata in metadata["files"]:
-                filedata["url"] = reverse_with_params(
-                    {
-                        "path": str(self.path),
-                        "output": output,
-                        "filename": filedata["name"],
-                    },
-                    "contents",
-                )
-
-        # add and check checksum data, and transform cell data to more useful format
-        checksums_dir = self.path.parent / "checksums"
-        for output, metadata in self.items():
-            for filedata in metadata["files"]:
-                # checksums
-                filedata["checksum_valid"] = False
-                filedata["checksum"] = None
-
-                path = checksums_dir / (filedata["name"] + ".txt")
-                if not path.exists():
-                    continue
-
-                filedata["checksum"] = path.read_text(encoding="utf8")
-                actual_file = self.get_file_path(output, filedata["name"])
-
-                if not actual_file.exists():  # pragma: nocover
-                    continue
-
-                checksum = hashlib.sha256(actual_file.read_bytes()).hexdigest()
-                filedata["checksum_valid"] = checksum == filedata["checksum"]
-
-                # cells
-                cells = filedata.get("sdc", {}).get("cells", {})
-                cell_index = defaultdict(list)
-
-                for flag, indicies in cells.items():
-                    for x, y in indicies:
-                        key = f"{x},{y}"
-                        cell_index[key].append(flag)
-
-                filedata["cell_index"] = cell_index
-
-    def get_file_path(self, output, filename):
-        """Return absolute path to output file"""
-        if filename not in {
-            f["name"] for f in self[output]["files"]
-        }:  # pragma: nocover
-            return None
-        # note: if filename is absolute, this will just return filename
-        return self.path.parent / filename
-
-    def write(self):
-        """Useful testing helper"""
-        self.path.write_text(json.dumps(self.raw_metadata, indent=2))
-        self.clear()
-        self.version = None
-        self.__post_init__()
-
-
-def get_outputs(data):
+def get_outputs_from_request(data):
     """Use outputs path from request and load it"""
     param_path = data.get("path")
     if param_path is None:
@@ -124,11 +32,7 @@ def get_outputs(data):
     if not path.exists():  # pragma: no cover
         raise Http404
 
-    outputs = Outputs(path)
-
-    check_version(outputs.version)
-
-    return outputs
+    return models.load_from_path(path)
 
 
 @require_GET
@@ -139,8 +43,8 @@ def index(request):
     if "path" not in request.GET and settings.DEBUG:
         data = {"path": "outputs/results.json"}
 
-    outputs = get_outputs(data)
-    create_url = reverse_with_params({"path": str(outputs.path)}, "review-create")
+    outputs = get_outputs_from_request(data)
+    create_url = utils.reverse_with_params({"path": str(outputs.path)}, "review-create")
 
     return TemplateResponse(
         request,
@@ -161,7 +65,7 @@ def contents(request):
     We also require the json file and check that the requested file is present
     in the json.  This prevents loading arbitrary user files over http.
     """
-    outputs = get_outputs(request.GET)
+    outputs = get_outputs_from_request(request.GET)
     output = request.GET.get("output")
     filename = request.GET.get("filename")
 
@@ -194,7 +98,7 @@ def approved_outputs(request, pk):
     if not (review := REVIEWS.get(pk)):
         raise Http404
 
-    outputs = Outputs(review["path"])
+    outputs = models.ACROOutputs(review["path"])
 
     approved_outputs = [k for k, v in review["decisions"].items() if v["state"] is True]
     in_memory_zf = zipfile.create(outputs, approved_outputs)
@@ -220,7 +124,7 @@ def review_create(request):
 
     # we load the path from the querystring, even though this is a post request
     # check the reviewed outputs are valid
-    outputs = get_outputs(request.GET)
+    outputs = get_outputs_from_request(request.GET)
     approved_outputs = [k for k, v in review.items() if v["state"] is True]
     unrecognized_outputs = [o for o in approved_outputs if o not in outputs]
     if unrecognized_outputs:
@@ -267,7 +171,7 @@ def summary(request, pk):
         raise Http404
 
     # add ACRO status to output decisions
-    outputs = Outputs(review["path"])
+    outputs = models.ACROOutputs(review["path"])
     for name, data in review["decisions"].items():
         review["decisions"][name]["acro_status"] = outputs[name]["status"]
 

diff --git a/tests/conftest.py b/tests/conftest.py
@@ -3,7 +3,7 @@
 
 import pytest
 
-from sacro import views
+from sacro import models
 
 
 @pytest.fixture
@@ -14,4 +14,4 @@ def TEST_PATH():
 @pytest.fixture
 def test_outputs(tmp_path, TEST_PATH):
     shutil.copytree(TEST_PATH.parent, tmp_path, dirs_exist_ok=True)
-    return views.Outputs(tmp_path / TEST_PATH.name)
+    return models.ACROOutputs(tmp_path / TEST_PATH.name)
diff --git a/tests/test_models.py b/tests/test_models.py
@@ -0,0 +1,56 @@
+import json
+
+import pytest
+
+from sacro import models
+
+
+def test_outputs_annotation(test_outputs):
+    assert test_outputs.version == "0.4.0"
+    for metadata in test_outputs.values():
+        for filedata in metadata["files"]:
+            assert filedata["checksum"] is not None
+            assert filedata["checksum_valid"] is True
+            assert filedata["url"].startswith("/contents/?path=")
+
+            cells = filedata.get("sdc", {}).get("cells", {})
+            cell_index = filedata["cell_index"]
+
+            if cells == {}:
+                assert cell_index == {}
+                continue
+
+            for flag, indicies in cells.items():
+                for x, y in indicies:
+                    key = f"{x},{y}"
+                    assert key in cell_index
+
+
+def test_outputs_annotation_checksum_failed(test_outputs):
+    first_output = list(test_outputs)[0]
+    first_file = test_outputs[first_output]["files"][0]["name"]
+    checksum = test_outputs.path.parent / "checksums" / (first_file + ".txt")
+    checksum.write_text("bad checksum")
+
+    # re-annotate
+    test_outputs.annotate()
+
+    assert test_outputs[first_output]["files"][0]["checksum"] == "bad checksum"
+    assert test_outputs[first_output]["files"][0]["checksum_valid"] is False
+
+
+@pytest.mark.parametrize(
+    "data",
+    [
+        {},
+        {"version": "1"},
+        {"version": "1", "results": {}},
+        {"version": "1", "results": {"name": {"files": []}}},
+        {"version": "1", "results": {"name": {"notfiles": "foo"}}},
+    ],
+)
+def test_validation(data, tmp_path):
+    bad_json = tmp_path / "bad.json"
+    bad_json.write_text(json.dumps(data))
+    with pytest.raises(models.ACROOutputs.InvalidFile):
+        models.ACROOutputs(bad_json)