From 4aff98cf1655f4f752c09b5b2e8a270c675cb2e0 Mon Sep 17 00:00:00 2001 From: Donald Campbell <125581724+donaldcampbelljr@users.noreply.github.com> Date: Mon, 2 Oct 2023 16:35:02 -0400 Subject: [PATCH 1/6] first pass to allow for output_schema as a true JSON schema https://github.com/pepkit/pipestat/issues/85 --- pipestat/parsed_schema.py | 65 ++++++++++++-------- tests/conftest.py | 4 ++ tests/data/output_schema_as_JSON_schema.yaml | 47 ++++++++++++++ tests/test_parsed_schema.py | 11 +++- 4 files changed, 100 insertions(+), 27 deletions(-) create mode 100644 tests/data/output_schema_as_JSON_schema.yaml diff --git a/pipestat/parsed_schema.py b/pipestat/parsed_schema.py index fc1fcac7..37b11acd 100644 --- a/pipestat/parsed_schema.py +++ b/pipestat/parsed_schema.py @@ -5,6 +5,7 @@ from pathlib import Path from typing import * from pydantic import create_model +from jsonschema import validate # from sqlalchemy.dialects.postgresql import ARRAY from sqlalchemy import Column, null @@ -81,46 +82,62 @@ def __init__(self, data: Union[Dict[str, Any], Path, str]) -> None: # initial validation and parse if not isinstance(data, dict): _, data = read_yaml_data(data, "schema") + data = copy.deepcopy(data) - # pipeline identifier - self._pipeline_name = data.pop(SCHEMA_PIPELINE_NAME_KEY, None) + # Currently supporting backwards compatibility with old output schema while now also supporting a JSON schema: + if "properties" in list(data.keys()): + # Assume top-level properties key implies proper JSON schema. + self._pipeline_name = data["properties"].pop(SCHEMA_PIPELINE_NAME_KEY, None) + if "samples" in list(data["properties"].keys()): + sample_data = _safe_pop_one_mapping( + key="properties", data=data["properties"]["samples"]["items"], info_name="sample-level" + ) + else: + sample_data = {} + + if "project" in list(data["properties"].keys()): + prj_data = _safe_pop_one_mapping( + key="properties",data=data["properties"]["project"]["items"], info_name="project-level" + ) + else: + prj_data = {} + # Parse custom status declaration if present. + if "status" in list(data["properties"].keys()): + self._status_data = _safe_pop_one_mapping( + key="properties", data=data["properties"]["status"], info_name="status" + ) + else: + self._status_data = {} + + else: + self._pipeline_name = data.pop(SCHEMA_PIPELINE_NAME_KEY, None) + sample_data = _safe_pop_one_mapping( + key=self._SAMPLES_KEY, data=data, info_name="sample-level" + ) + prj_data = _safe_pop_one_mapping( + key=self._PROJECT_KEY, data=data, info_name="project-level" + ) + # Parse custom status declaration if present. + self._status_data = _safe_pop_one_mapping( + key=self._STATUS_KEY, data=data, info_name="status" + ) + if not isinstance(self._pipeline_name, str): raise SchemaError( f"Could not find valid pipeline identifier (key '{SCHEMA_PIPELINE_NAME_KEY}') in given schema data" ) - # Parse sample-level data item declarations. - sample_data = _safe_pop_one_mapping( - key=self._SAMPLES_KEY, data=data, info_name="sample-level" - ) - self._sample_level_data = _recursively_replace_custom_types(sample_data) - # Parse project-level data item declarations. - prj_data = _safe_pop_one_mapping( - key=self._PROJECT_KEY, data=data, info_name="project-level" - ) self._project_level_data = _recursively_replace_custom_types(prj_data) # Sample- and/or project-level data must be declared. 
if not self._sample_level_data and not self._project_level_data: raise SchemaError("Neither sample-level nor project-level data items are declared.") - # Parse custom status declaration if present. - self._status_data = _safe_pop_one_mapping( - key=self._STATUS_KEY, data=data, info_name="status" - ) - - if data: - _LOGGER.info( - "Top-Level arguments found in output schema. They will be assigned to project-level." - ) - extra_project_data = _recursively_replace_custom_types(data) - self._project_level_data.update(extra_project_data) - # Check that no reserved keywords were used as data items. - resv_kwds = {"id", SAMPLE_NAME} + resv_kwds = {"id", RECORD_IDENTIFIER} reserved_keywords_used = set() for data in [self.project_level_data, self.sample_level_data, self.status_data]: reserved_keywords_used |= set(data.keys()) & resv_kwds diff --git a/tests/conftest.py b/tests/conftest.py index cf62a478..11896b5a 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -86,3 +86,7 @@ def custom_status_schema2(): @pytest.fixture def output_schema_html_report(): return get_data_file_path("output_schema_html_report.yaml") + +@pytest.fixture +def output_schema_as_JSON_schema(): + return get_data_file_path("output_schema_as_JSON_schema.yaml") diff --git a/tests/data/output_schema_as_JSON_schema.yaml b/tests/data/output_schema_as_JSON_schema.yaml new file mode 100644 index 00000000..d76d41ac --- /dev/null +++ b/tests/data/output_schema_as_JSON_schema.yaml @@ -0,0 +1,47 @@ +title: An example Pipestat output schema +description: A pipeline that uses pipestat to report sample and project level results. +type: object +properties: + pipeline_name: "default_pipeline_name" + samples: + type: array + items: + type: object + properties: + number_of_things: + type: integer + description: "Lorem ipsum dolor sit amet, consectetur adipiscing elit." + smooth_bw: + type: string + description: "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Fusce nec cursus nulla." + path: "aligned_{genome}/{sample_name}_smooth.bw" + collection_of_images: + type: array + description: A collection of images. + items: + type: object + properties: + prop1: + type: file + description: An example file. + output_file_in_object: + type: object + description: An object containing output files. + properties: + example_property_1: + type: file + description: An example file. + example_property_2: + type: image + description: An example image. + project: + type: array + items: + type: object + properties: + project_output_file: + type: file + description: The path to the output file. 
+ protocol: + type: string + description: example protocol description diff --git a/tests/test_parsed_schema.py b/tests/test_parsed_schema.py index c097faa3..52f435f7 100644 --- a/tests/test_parsed_schema.py +++ b/tests/test_parsed_schema.py @@ -5,7 +5,7 @@ from typing import * import pytest import oyaml -from pipestat.const import SAMPLE_NAME, STATUS +from pipestat.const import SAMPLE_NAME, STATUS, RECORD_IDENTIFIER from pipestat.exceptions import SchemaError from pipestat.parsed_schema import ( NULL_MAPPING_VALUE, @@ -227,10 +227,10 @@ def test_insufficient_schema__raises_expected_error_and_message(schema_data, exp ] for extra in [ [("id", {"type": "string", "description": "identifier"})], - [(SAMPLE_NAME, {"type": "string", "description": "identifier"})], + [(RECORD_IDENTIFIER, {"type": "string", "description": "identifier"})], [ ("id", {"type": "string", "description": "identifier"}), - (SAMPLE_NAME, {"type": "string", "description": "identifier"}), + (RECORD_IDENTIFIER, {"type": "string", "description": "identifier"}), ], ] ], @@ -257,3 +257,8 @@ def test_sample_project_data_item_name_overlap__raises_expected_error_and_messag obs_msg = str(err_ctx.value) exp_msg = f"Overlap between project- and sample-level keys: {common_key}" assert obs_msg == exp_msg + +def test_JSON_schema_validation(output_schema_as_JSON_schema): + ParsedSchema(output_schema_as_JSON_schema) + assert True + #assert "reserved keyword(s) used" in observed_message \ No newline at end of file From f50aedefdf1d11b35d4a0e72c038ceec2be262e9 Mon Sep 17 00:00:00 2001 From: Donald Campbell <125581724+donaldcampbelljr@users.noreply.github.com> Date: Mon, 2 Oct 2023 16:36:44 -0400 Subject: [PATCH 2/6] remove unused import --- pipestat/parsed_schema.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pipestat/parsed_schema.py b/pipestat/parsed_schema.py index 37b11acd..13bf3fa9 100644 --- a/pipestat/parsed_schema.py +++ b/pipestat/parsed_schema.py @@ -5,7 +5,6 @@ from pathlib import Path from typing import * from pydantic import create_model -from jsonschema import validate # from sqlalchemy.dialects.postgresql import ARRAY from sqlalchemy import Column, null From 236dac22f0f17f6f0398ea32a0e0e73907623320 Mon Sep 17 00:00:00 2001 From: Donald Campbell <125581724+donaldcampbelljr@users.noreply.github.com> Date: Mon, 2 Oct 2023 17:12:14 -0400 Subject: [PATCH 3/6] simplify logic, extend _safe_pop_one_mapping to handles multiple keys --- pipestat/parsed_schema.py | 64 ++++++++++++++++++++++--------------- tests/conftest.py | 1 + tests/test_parsed_schema.py | 3 +- 3 files changed, 41 insertions(+), 27 deletions(-) diff --git a/pipestat/parsed_schema.py b/pipestat/parsed_schema.py index 13bf3fa9..e2b50c59 100644 --- a/pipestat/parsed_schema.py +++ b/pipestat/parsed_schema.py @@ -50,13 +50,24 @@ class Config: return BaseModel -def _safe_pop_one_mapping(key: str, data: Dict[str, Any], info_name: str) -> Any: - value = data.pop(key, NULL_MAPPING_VALUE) - if isinstance(value, Mapping): - return value - raise SchemaError( - f"{info_name} info in schema definition has invalid type: {type(value).__name__}" - ) +def _safe_pop_one_mapping( + key: str, data: Dict[str, Any], info_name: str, keys: Optional[List[str]] = None +) -> Any: + if keys: + # keys=["samples", "items"] + try: + value = data[keys[0]][keys[1]].pop(key, NULL_MAPPING_VALUE) + except KeyError: + value = {} + if isinstance(value, Mapping): + return value + else: + value = data.pop(key, NULL_MAPPING_VALUE) + if isinstance(value, Mapping): + return value + raise SchemaError( + 
f"{info_name} info in schema definition has invalid type: {type(value).__name__}" + ) class ParsedSchema(object): @@ -88,26 +99,27 @@ def __init__(self, data: Union[Dict[str, Any], Path, str]) -> None: if "properties" in list(data.keys()): # Assume top-level properties key implies proper JSON schema. self._pipeline_name = data["properties"].pop(SCHEMA_PIPELINE_NAME_KEY, None) - if "samples" in list(data["properties"].keys()): - sample_data = _safe_pop_one_mapping( - key="properties", data=data["properties"]["samples"]["items"], info_name="sample-level" - ) - else: - sample_data = {} - if "project" in list(data["properties"].keys()): - prj_data = _safe_pop_one_mapping( - key="properties",data=data["properties"]["project"]["items"], info_name="project-level" - ) - else: - prj_data = {} - # Parse custom status declaration if present. - if "status" in list(data["properties"].keys()): - self._status_data = _safe_pop_one_mapping( - key="properties", data=data["properties"]["status"], info_name="status" - ) - else: - self._status_data = {} + sample_data = _safe_pop_one_mapping( + keys=["samples", "items"], + data=data["properties"], + info_name="sample-level", + key="properties", + ) + + prj_data = _safe_pop_one_mapping( + keys=["project", "items"], + data=data["properties"], + info_name="project-level", + key="properties", + ) + + self._status_data = _safe_pop_one_mapping( + keys=["status", "items"], + data=data["properties"], + info_name="status", + key="properties", + ) else: self._pipeline_name = data.pop(SCHEMA_PIPELINE_NAME_KEY, None) diff --git a/tests/conftest.py b/tests/conftest.py index 11896b5a..16422035 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -87,6 +87,7 @@ def custom_status_schema2(): def output_schema_html_report(): return get_data_file_path("output_schema_html_report.yaml") + @pytest.fixture def output_schema_as_JSON_schema(): return get_data_file_path("output_schema_as_JSON_schema.yaml") diff --git a/tests/test_parsed_schema.py b/tests/test_parsed_schema.py index 52f435f7..c10847de 100644 --- a/tests/test_parsed_schema.py +++ b/tests/test_parsed_schema.py @@ -258,7 +258,8 @@ def test_sample_project_data_item_name_overlap__raises_expected_error_and_messag exp_msg = f"Overlap between project- and sample-level keys: {common_key}" assert obs_msg == exp_msg + def test_JSON_schema_validation(output_schema_as_JSON_schema): ParsedSchema(output_schema_as_JSON_schema) assert True - #assert "reserved keyword(s) used" in observed_message \ No newline at end of file + # assert "reserved keyword(s) used" in observed_message From 0b65844903600695f0e4a0685a7381f69fe56065 Mon Sep 17 00:00:00 2001 From: Donald Campbell <125581724+donaldcampbelljr@users.noreply.github.com> Date: Tue, 3 Oct 2023 15:15:59 -0400 Subject: [PATCH 4/6] disambiguation key vs keys in parsed schema --- pipestat/parsed_schema.py | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/pipestat/parsed_schema.py b/pipestat/parsed_schema.py index e2b50c59..99c9af76 100644 --- a/pipestat/parsed_schema.py +++ b/pipestat/parsed_schema.py @@ -51,18 +51,21 @@ class Config: def _safe_pop_one_mapping( - key: str, data: Dict[str, Any], info_name: str, keys: Optional[List[str]] = None + mappingkey: str, data: Dict[str, Any], info_name: str, subkeys: Optional[List[str]] = None ) -> Any: - if keys: - # keys=["samples", "items"] + """ + subkeys exist if the output schema is in a JSON schema format + + """ + if subkeys: try: - value = data[keys[0]][keys[1]].pop(key, 
NULL_MAPPING_VALUE) + value = data[subkeys[0]][subkeys[1]].pop(mappingkey, NULL_MAPPING_VALUE) except KeyError: value = {} if isinstance(value, Mapping): return value else: - value = data.pop(key, NULL_MAPPING_VALUE) + value = data.pop(mappingkey, NULL_MAPPING_VALUE) if isinstance(value, Mapping): return value raise SchemaError( @@ -101,37 +104,37 @@ def __init__(self, data: Union[Dict[str, Any], Path, str]) -> None: self._pipeline_name = data["properties"].pop(SCHEMA_PIPELINE_NAME_KEY, None) sample_data = _safe_pop_one_mapping( - keys=["samples", "items"], + subkeys=["samples", "items"], data=data["properties"], info_name="sample-level", - key="properties", + mappingkey="properties", ) prj_data = _safe_pop_one_mapping( - keys=["project", "items"], + subkeys=["project", "items"], data=data["properties"], info_name="project-level", - key="properties", + mappingkey="properties", ) self._status_data = _safe_pop_one_mapping( - keys=["status", "items"], + subkeys=["status", "items"], data=data["properties"], info_name="status", - key="properties", + mappingkey="properties", ) else: self._pipeline_name = data.pop(SCHEMA_PIPELINE_NAME_KEY, None) sample_data = _safe_pop_one_mapping( - key=self._SAMPLES_KEY, data=data, info_name="sample-level" + mappingkey=self._SAMPLES_KEY, data=data, info_name="sample-level" ) prj_data = _safe_pop_one_mapping( - key=self._PROJECT_KEY, data=data, info_name="project-level" + mappingkey=self._PROJECT_KEY, data=data, info_name="project-level" ) # Parse custom status declaration if present. self._status_data = _safe_pop_one_mapping( - key=self._STATUS_KEY, data=data, info_name="status" + mappingkey=self._STATUS_KEY, data=data, info_name="status" ) if not isinstance(self._pipeline_name, str): From 7e861bcfeb26e5ddeb2c5f8e7af9928e8757a1c3 Mon Sep 17 00:00:00 2001 From: Donald Campbell <125581724+donaldcampbelljr@users.noreply.github.com> Date: Wed, 4 Oct 2023 08:45:01 -0400 Subject: [PATCH 5/6] make samples and project objects instead of arrays, add more test assertions, clean up docstrings. --- pipestat/parsed_schema.py | 12 +++++----- tests/data/output_schema_as_JSON_schema.yaml | 24 ++++++++------------ tests/test_parsed_schema.py | 6 ++--- 3 files changed, 19 insertions(+), 23 deletions(-) diff --git a/pipestat/parsed_schema.py b/pipestat/parsed_schema.py index 99c9af76..9a5a4d81 100644 --- a/pipestat/parsed_schema.py +++ b/pipestat/parsed_schema.py @@ -54,12 +54,12 @@ def _safe_pop_one_mapping( mappingkey: str, data: Dict[str, Any], info_name: str, subkeys: Optional[List[str]] = None ) -> Any: """ - subkeys exist if the output schema is in a JSON schema format - + mapping key: the dict key where the sample, project or status values are stored, e.g. data["mappingkey"] + subkeys: if using JSON schema, the dict is nested further, e.g. 
data["properties"]["samples"]["mappingkey"] """ if subkeys: try: - value = data[subkeys[0]][subkeys[1]].pop(mappingkey, NULL_MAPPING_VALUE) + value = data[subkeys[0]].pop(mappingkey, NULL_MAPPING_VALUE) except KeyError: value = {} if isinstance(value, Mapping): @@ -104,21 +104,21 @@ def __init__(self, data: Union[Dict[str, Any], Path, str]) -> None: self._pipeline_name = data["properties"].pop(SCHEMA_PIPELINE_NAME_KEY, None) sample_data = _safe_pop_one_mapping( - subkeys=["samples", "items"], + subkeys=["samples"], data=data["properties"], info_name="sample-level", mappingkey="properties", ) prj_data = _safe_pop_one_mapping( - subkeys=["project", "items"], + subkeys=["project"], data=data["properties"], info_name="project-level", mappingkey="properties", ) self._status_data = _safe_pop_one_mapping( - subkeys=["status", "items"], + subkeys=["status"], data=data["properties"], info_name="status", mappingkey="properties", diff --git a/tests/data/output_schema_as_JSON_schema.yaml b/tests/data/output_schema_as_JSON_schema.yaml index d76d41ac..e391fd6d 100644 --- a/tests/data/output_schema_as_JSON_schema.yaml +++ b/tests/data/output_schema_as_JSON_schema.yaml @@ -4,10 +4,8 @@ type: object properties: pipeline_name: "default_pipeline_name" samples: - type: array - items: - type: object - properties: + type: object + properties: number_of_things: type: integer description: "Lorem ipsum dolor sit amet, consectetur adipiscing elit." @@ -35,13 +33,11 @@ properties: type: image description: An example image. project: - type: array - items: - type: object - properties: - project_output_file: - type: file - description: The path to the output file. - protocol: - type: string - description: example protocol description + type: object + properties: + project_output_file: + type: file + description: The path to the output file. + protocol: + type: string + description: example protocol description diff --git a/tests/test_parsed_schema.py b/tests/test_parsed_schema.py index c10847de..001b8a30 100644 --- a/tests/test_parsed_schema.py +++ b/tests/test_parsed_schema.py @@ -260,6 +260,6 @@ def test_sample_project_data_item_name_overlap__raises_expected_error_and_messag def test_JSON_schema_validation(output_schema_as_JSON_schema): - ParsedSchema(output_schema_as_JSON_schema) - assert True - # assert "reserved keyword(s) used" in observed_message + schema = ParsedSchema(output_schema_as_JSON_schema) + assert "number_of_things" in dict(schema.sample_level_data).keys() + assert "protocol" in dict(schema.project_level_data).keys() From f430655b10918462f9102cf14b6307afa3834103 Mon Sep 17 00:00:00 2001 From: Donald Campbell <125581724+donaldcampbelljr@users.noreply.github.com> Date: Wed, 4 Oct 2023 15:03:43 -0400 Subject: [PATCH 6/6] add status data to string representation of ParsedSchema --- pipestat/parsed_schema.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pipestat/parsed_schema.py b/pipestat/parsed_schema.py index 9a5a4d81..0857b2fb 100644 --- a/pipestat/parsed_schema.py +++ b/pipestat/parsed_schema.py @@ -182,7 +182,10 @@ def __str__(self): res += f"\n Sample Level Data:" for k, v in self._sample_level_data.items(): res += f"\n - {k} : {v}" - # TODO: add status schema data + if self._status_data is not None: + res += f"\n Status Data:" + for k, v in self._status_data.items(): + res += f"\n - {k} : {v}" return res @property