From 4aff98cf1655f4f752c09b5b2e8a270c675cb2e0 Mon Sep 17 00:00:00 2001 From: Donald Campbell <125581724+donaldcampbelljr@users.noreply.github.com> Date: Mon, 2 Oct 2023 16:35:02 -0400 Subject: [PATCH 1/6] first pass to allow for output_schema as a true JSON schema https://github.com/pepkit/pipestat/issues/85 --- pipestat/parsed_schema.py | 65 ++++++++++++-------- tests/conftest.py | 4 ++ tests/data/output_schema_as_JSON_schema.yaml | 47 ++++++++++++++ tests/test_parsed_schema.py | 11 +++- 4 files changed, 100 insertions(+), 27 deletions(-) create mode 100644 tests/data/output_schema_as_JSON_schema.yaml diff --git a/pipestat/parsed_schema.py b/pipestat/parsed_schema.py index fc1fcac7..37b11acd 100644 --- a/pipestat/parsed_schema.py +++ b/pipestat/parsed_schema.py @@ -5,6 +5,7 @@ from pathlib import Path from typing import * from pydantic import create_model +from jsonschema import validate # from sqlalchemy.dialects.postgresql import ARRAY from sqlalchemy import Column, null @@ -81,46 +82,62 @@ def __init__(self, data: Union[Dict[str, Any], Path, str]) -> None: # initial validation and parse if not isinstance(data, dict): _, data = read_yaml_data(data, "schema") + data = copy.deepcopy(data) - # pipeline identifier - self._pipeline_name = data.pop(SCHEMA_PIPELINE_NAME_KEY, None) + # Currently supporting backwards compatibility with old output schema while now also supporting a JSON schema: + if "properties" in list(data.keys()): + # Assume top-level properties key implies proper JSON schema. + self._pipeline_name = data["properties"].pop(SCHEMA_PIPELINE_NAME_KEY, None) + if "samples" in list(data["properties"].keys()): + sample_data = _safe_pop_one_mapping( + key="properties", data=data["properties"]["samples"]["items"], info_name="sample-level" + ) + else: + sample_data = {} + + if "project" in list(data["properties"].keys()): + prj_data = _safe_pop_one_mapping( + key="properties",data=data["properties"]["project"]["items"], info_name="project-level" + ) + else: + prj_data = {} + # Parse custom status declaration if present. + if "status" in list(data["properties"].keys()): + self._status_data = _safe_pop_one_mapping( + key="properties", data=data["properties"]["status"], info_name="status" + ) + else: + self._status_data = {} + + else: + self._pipeline_name = data.pop(SCHEMA_PIPELINE_NAME_KEY, None) + sample_data = _safe_pop_one_mapping( + key=self._SAMPLES_KEY, data=data, info_name="sample-level" + ) + prj_data = _safe_pop_one_mapping( + key=self._PROJECT_KEY, data=data, info_name="project-level" + ) + # Parse custom status declaration if present. + self._status_data = _safe_pop_one_mapping( + key=self._STATUS_KEY, data=data, info_name="status" + ) + if not isinstance(self._pipeline_name, str): raise SchemaError( f"Could not find valid pipeline identifier (key '{SCHEMA_PIPELINE_NAME_KEY}') in given schema data" ) - # Parse sample-level data item declarations. - sample_data = _safe_pop_one_mapping( - key=self._SAMPLES_KEY, data=data, info_name="sample-level" - ) - self._sample_level_data = _recursively_replace_custom_types(sample_data) - # Parse project-level data item declarations. - prj_data = _safe_pop_one_mapping( - key=self._PROJECT_KEY, data=data, info_name="project-level" - ) self._project_level_data = _recursively_replace_custom_types(prj_data) # Sample- and/or project-level data must be declared. 
if not self._sample_level_data and not self._project_level_data: raise SchemaError("Neither sample-level nor project-level data items are declared.") - # Parse custom status declaration if present. - self._status_data = _safe_pop_one_mapping( - key=self._STATUS_KEY, data=data, info_name="status" - ) - - if data: - _LOGGER.info( - "Top-Level arguments found in output schema. They will be assigned to project-level." - ) - extra_project_data = _recursively_replace_custom_types(data) - self._project_level_data.update(extra_project_data) - # Check that no reserved keywords were used as data items. - resv_kwds = {"id", SAMPLE_NAME} + resv_kwds = {"id", RECORD_IDENTIFIER} reserved_keywords_used = set() for data in [self.project_level_data, self.sample_level_data, self.status_data]: reserved_keywords_used |= set(data.keys()) & resv_kwds diff --git a/tests/conftest.py b/tests/conftest.py index cf62a478..11896b5a 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -86,3 +86,7 @@ def custom_status_schema2(): @pytest.fixture def output_schema_html_report(): return get_data_file_path("output_schema_html_report.yaml") + +@pytest.fixture +def output_schema_as_JSON_schema(): + return get_data_file_path("output_schema_as_JSON_schema.yaml") diff --git a/tests/data/output_schema_as_JSON_schema.yaml b/tests/data/output_schema_as_JSON_schema.yaml new file mode 100644 index 00000000..d76d41ac --- /dev/null +++ b/tests/data/output_schema_as_JSON_schema.yaml @@ -0,0 +1,47 @@ +title: An example Pipestat output schema +description: A pipeline that uses pipestat to report sample and project level results. +type: object +properties: + pipeline_name: "default_pipeline_name" + samples: + type: array + items: + type: object + properties: + number_of_things: + type: integer + description: "Lorem ipsum dolor sit amet, consectetur adipiscing elit." + smooth_bw: + type: string + description: "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Fusce nec cursus nulla." + path: "aligned_{genome}/{sample_name}_smooth.bw" + collection_of_images: + type: array + description: A collection of images. + items: + type: object + properties: + prop1: + type: file + description: An example file. + output_file_in_object: + type: object + description: An object containing output files. + properties: + example_property_1: + type: file + description: An example file. + example_property_2: + type: image + description: An example image. + project: + type: array + items: + type: object + properties: + project_output_file: + type: file + description: The path to the output file. 
+ protocol: + type: string + description: example protocol description diff --git a/tests/test_parsed_schema.py b/tests/test_parsed_schema.py index c097faa3..52f435f7 100644 --- a/tests/test_parsed_schema.py +++ b/tests/test_parsed_schema.py @@ -5,7 +5,7 @@ from typing import * import pytest import oyaml -from pipestat.const import SAMPLE_NAME, STATUS +from pipestat.const import SAMPLE_NAME, STATUS, RECORD_IDENTIFIER from pipestat.exceptions import SchemaError from pipestat.parsed_schema import ( NULL_MAPPING_VALUE, @@ -227,10 +227,10 @@ def test_insufficient_schema__raises_expected_error_and_message(schema_data, exp ] for extra in [ [("id", {"type": "string", "description": "identifier"})], - [(SAMPLE_NAME, {"type": "string", "description": "identifier"})], + [(RECORD_IDENTIFIER, {"type": "string", "description": "identifier"})], [ ("id", {"type": "string", "description": "identifier"}), - (SAMPLE_NAME, {"type": "string", "description": "identifier"}), + (RECORD_IDENTIFIER, {"type": "string", "description": "identifier"}), ], ] ], @@ -257,3 +257,8 @@ def test_sample_project_data_item_name_overlap__raises_expected_error_and_messag obs_msg = str(err_ctx.value) exp_msg = f"Overlap between project- and sample-level keys: {common_key}" assert obs_msg == exp_msg + +def test_JSON_schema_validation(output_schema_as_JSON_schema): + ParsedSchema(output_schema_as_JSON_schema) + assert True + #assert "reserved keyword(s) used" in observed_message \ No newline at end of file From f50aedefdf1d11b35d4a0e72c038ceec2be262e9 Mon Sep 17 00:00:00 2001 From: Donald Campbell <125581724+donaldcampbelljr@users.noreply.github.com> Date: Mon, 2 Oct 2023 16:36:44 -0400 Subject: [PATCH 2/6] remove unused import --- pipestat/parsed_schema.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pipestat/parsed_schema.py b/pipestat/parsed_schema.py index 37b11acd..13bf3fa9 100644 --- a/pipestat/parsed_schema.py +++ b/pipestat/parsed_schema.py @@ -5,7 +5,6 @@ from pathlib import Path from typing import * from pydantic import create_model -from jsonschema import validate # from sqlalchemy.dialects.postgresql import ARRAY from sqlalchemy import Column, null From 236dac22f0f17f6f0398ea32a0e0e73907623320 Mon Sep 17 00:00:00 2001 From: Donald Campbell <125581724+donaldcampbelljr@users.noreply.github.com> Date: Mon, 2 Oct 2023 17:12:14 -0400 Subject: [PATCH 3/6] simplify logic, extend _safe_pop_one_mapping to handles multiple keys --- pipestat/parsed_schema.py | 64 ++++++++++++++++++++++--------------- tests/conftest.py | 1 + tests/test_parsed_schema.py | 3 +- 3 files changed, 41 insertions(+), 27 deletions(-) diff --git a/pipestat/parsed_schema.py b/pipestat/parsed_schema.py index 13bf3fa9..e2b50c59 100644 --- a/pipestat/parsed_schema.py +++ b/pipestat/parsed_schema.py @@ -50,13 +50,24 @@ class Config: return BaseModel -def _safe_pop_one_mapping(key: str, data: Dict[str, Any], info_name: str) -> Any: - value = data.pop(key, NULL_MAPPING_VALUE) - if isinstance(value, Mapping): - return value - raise SchemaError( - f"{info_name} info in schema definition has invalid type: {type(value).__name__}" - ) +def _safe_pop_one_mapping( + key: str, data: Dict[str, Any], info_name: str, keys: Optional[List[str]] = None +) -> Any: + if keys: + # keys=["samples", "items"] + try: + value = data[keys[0]][keys[1]].pop(key, NULL_MAPPING_VALUE) + except KeyError: + value = {} + if isinstance(value, Mapping): + return value + else: + value = data.pop(key, NULL_MAPPING_VALUE) + if isinstance(value, Mapping): + return value + raise SchemaError( + 
f"{info_name} info in schema definition has invalid type: {type(value).__name__}" + ) class ParsedSchema(object): @@ -88,26 +99,27 @@ def __init__(self, data: Union[Dict[str, Any], Path, str]) -> None: if "properties" in list(data.keys()): # Assume top-level properties key implies proper JSON schema. self._pipeline_name = data["properties"].pop(SCHEMA_PIPELINE_NAME_KEY, None) - if "samples" in list(data["properties"].keys()): - sample_data = _safe_pop_one_mapping( - key="properties", data=data["properties"]["samples"]["items"], info_name="sample-level" - ) - else: - sample_data = {} - if "project" in list(data["properties"].keys()): - prj_data = _safe_pop_one_mapping( - key="properties",data=data["properties"]["project"]["items"], info_name="project-level" - ) - else: - prj_data = {} - # Parse custom status declaration if present. - if "status" in list(data["properties"].keys()): - self._status_data = _safe_pop_one_mapping( - key="properties", data=data["properties"]["status"], info_name="status" - ) - else: - self._status_data = {} + sample_data = _safe_pop_one_mapping( + keys=["samples", "items"], + data=data["properties"], + info_name="sample-level", + key="properties", + ) + + prj_data = _safe_pop_one_mapping( + keys=["project", "items"], + data=data["properties"], + info_name="project-level", + key="properties", + ) + + self._status_data = _safe_pop_one_mapping( + keys=["status", "items"], + data=data["properties"], + info_name="status", + key="properties", + ) else: self._pipeline_name = data.pop(SCHEMA_PIPELINE_NAME_KEY, None) diff --git a/tests/conftest.py b/tests/conftest.py index 11896b5a..16422035 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -87,6 +87,7 @@ def custom_status_schema2(): def output_schema_html_report(): return get_data_file_path("output_schema_html_report.yaml") + @pytest.fixture def output_schema_as_JSON_schema(): return get_data_file_path("output_schema_as_JSON_schema.yaml") diff --git a/tests/test_parsed_schema.py b/tests/test_parsed_schema.py index 52f435f7..c10847de 100644 --- a/tests/test_parsed_schema.py +++ b/tests/test_parsed_schema.py @@ -258,7 +258,8 @@ def test_sample_project_data_item_name_overlap__raises_expected_error_and_messag exp_msg = f"Overlap between project- and sample-level keys: {common_key}" assert obs_msg == exp_msg + def test_JSON_schema_validation(output_schema_as_JSON_schema): ParsedSchema(output_schema_as_JSON_schema) assert True - #assert "reserved keyword(s) used" in observed_message \ No newline at end of file + # assert "reserved keyword(s) used" in observed_message From 0b65844903600695f0e4a0685a7381f69fe56065 Mon Sep 17 00:00:00 2001 From: Donald Campbell <125581724+donaldcampbelljr@users.noreply.github.com> Date: Tue, 3 Oct 2023 15:15:59 -0400 Subject: [PATCH 4/6] disambiguation key vs keys in parsed schema --- pipestat/parsed_schema.py | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/pipestat/parsed_schema.py b/pipestat/parsed_schema.py index e2b50c59..99c9af76 100644 --- a/pipestat/parsed_schema.py +++ b/pipestat/parsed_schema.py @@ -51,18 +51,21 @@ class Config: def _safe_pop_one_mapping( - key: str, data: Dict[str, Any], info_name: str, keys: Optional[List[str]] = None + mappingkey: str, data: Dict[str, Any], info_name: str, subkeys: Optional[List[str]] = None ) -> Any: - if keys: - # keys=["samples", "items"] + """ + subkeys exist if the output schema is in a JSON schema format + + """ + if subkeys: try: - value = data[keys[0]][keys[1]].pop(key, 
NULL_MAPPING_VALUE) + value = data[subkeys[0]][subkeys[1]].pop(mappingkey, NULL_MAPPING_VALUE) except KeyError: value = {} if isinstance(value, Mapping): return value else: - value = data.pop(key, NULL_MAPPING_VALUE) + value = data.pop(mappingkey, NULL_MAPPING_VALUE) if isinstance(value, Mapping): return value raise SchemaError( @@ -101,37 +104,37 @@ def __init__(self, data: Union[Dict[str, Any], Path, str]) -> None: self._pipeline_name = data["properties"].pop(SCHEMA_PIPELINE_NAME_KEY, None) sample_data = _safe_pop_one_mapping( - keys=["samples", "items"], + subkeys=["samples", "items"], data=data["properties"], info_name="sample-level", - key="properties", + mappingkey="properties", ) prj_data = _safe_pop_one_mapping( - keys=["project", "items"], + subkeys=["project", "items"], data=data["properties"], info_name="project-level", - key="properties", + mappingkey="properties", ) self._status_data = _safe_pop_one_mapping( - keys=["status", "items"], + subkeys=["status", "items"], data=data["properties"], info_name="status", - key="properties", + mappingkey="properties", ) else: self._pipeline_name = data.pop(SCHEMA_PIPELINE_NAME_KEY, None) sample_data = _safe_pop_one_mapping( - key=self._SAMPLES_KEY, data=data, info_name="sample-level" + mappingkey=self._SAMPLES_KEY, data=data, info_name="sample-level" ) prj_data = _safe_pop_one_mapping( - key=self._PROJECT_KEY, data=data, info_name="project-level" + mappingkey=self._PROJECT_KEY, data=data, info_name="project-level" ) # Parse custom status declaration if present. self._status_data = _safe_pop_one_mapping( - key=self._STATUS_KEY, data=data, info_name="status" + mappingkey=self._STATUS_KEY, data=data, info_name="status" ) if not isinstance(self._pipeline_name, str): From 7e861bcfeb26e5ddeb2c5f8e7af9928e8757a1c3 Mon Sep 17 00:00:00 2001 From: Donald Campbell <125581724+donaldcampbelljr@users.noreply.github.com> Date: Wed, 4 Oct 2023 08:45:01 -0400 Subject: [PATCH 5/6] make samples and project objects instead of arrays, add more test assertions, clean up docstrings. --- pipestat/parsed_schema.py | 12 +++++----- tests/data/output_schema_as_JSON_schema.yaml | 24 ++++++++------------ tests/test_parsed_schema.py | 6 ++--- 3 files changed, 19 insertions(+), 23 deletions(-) diff --git a/pipestat/parsed_schema.py b/pipestat/parsed_schema.py index 99c9af76..9a5a4d81 100644 --- a/pipestat/parsed_schema.py +++ b/pipestat/parsed_schema.py @@ -54,12 +54,12 @@ def _safe_pop_one_mapping( mappingkey: str, data: Dict[str, Any], info_name: str, subkeys: Optional[List[str]] = None ) -> Any: """ - subkeys exist if the output schema is in a JSON schema format - + mapping key: the dict key where the sample, project or status values are stored, e.g. data["mappingkey"] + subkeys: if using JSON schema, the dict is nested further, e.g. 
data["properties"]["samples"]["mappingkey"] """ if subkeys: try: - value = data[subkeys[0]][subkeys[1]].pop(mappingkey, NULL_MAPPING_VALUE) + value = data[subkeys[0]].pop(mappingkey, NULL_MAPPING_VALUE) except KeyError: value = {} if isinstance(value, Mapping): @@ -104,21 +104,21 @@ def __init__(self, data: Union[Dict[str, Any], Path, str]) -> None: self._pipeline_name = data["properties"].pop(SCHEMA_PIPELINE_NAME_KEY, None) sample_data = _safe_pop_one_mapping( - subkeys=["samples", "items"], + subkeys=["samples"], data=data["properties"], info_name="sample-level", mappingkey="properties", ) prj_data = _safe_pop_one_mapping( - subkeys=["project", "items"], + subkeys=["project"], data=data["properties"], info_name="project-level", mappingkey="properties", ) self._status_data = _safe_pop_one_mapping( - subkeys=["status", "items"], + subkeys=["status"], data=data["properties"], info_name="status", mappingkey="properties", diff --git a/tests/data/output_schema_as_JSON_schema.yaml b/tests/data/output_schema_as_JSON_schema.yaml index d76d41ac..e391fd6d 100644 --- a/tests/data/output_schema_as_JSON_schema.yaml +++ b/tests/data/output_schema_as_JSON_schema.yaml @@ -4,10 +4,8 @@ type: object properties: pipeline_name: "default_pipeline_name" samples: - type: array - items: - type: object - properties: + type: object + properties: number_of_things: type: integer description: "Lorem ipsum dolor sit amet, consectetur adipiscing elit." @@ -35,13 +33,11 @@ properties: type: image description: An example image. project: - type: array - items: - type: object - properties: - project_output_file: - type: file - description: The path to the output file. - protocol: - type: string - description: example protocol description + type: object + properties: + project_output_file: + type: file + description: The path to the output file. + protocol: + type: string + description: example protocol description diff --git a/tests/test_parsed_schema.py b/tests/test_parsed_schema.py index c10847de..001b8a30 100644 --- a/tests/test_parsed_schema.py +++ b/tests/test_parsed_schema.py @@ -260,6 +260,6 @@ def test_sample_project_data_item_name_overlap__raises_expected_error_and_messag def test_JSON_schema_validation(output_schema_as_JSON_schema): - ParsedSchema(output_schema_as_JSON_schema) - assert True - # assert "reserved keyword(s) used" in observed_message + schema = ParsedSchema(output_schema_as_JSON_schema) + assert "number_of_things" in dict(schema.sample_level_data).keys() + assert "protocol" in dict(schema.project_level_data).keys() From f430655b10918462f9102cf14b6307afa3834103 Mon Sep 17 00:00:00 2001 From: Donald Campbell <125581724+donaldcampbelljr@users.noreply.github.com> Date: Wed, 4 Oct 2023 15:03:43 -0400 Subject: [PATCH 6/6] add status data to string representation of ParsedSchema --- pipestat/parsed_schema.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pipestat/parsed_schema.py b/pipestat/parsed_schema.py index 9a5a4d81..0857b2fb 100644 --- a/pipestat/parsed_schema.py +++ b/pipestat/parsed_schema.py @@ -182,7 +182,10 @@ def __str__(self): res += f"\n Sample Level Data:" for k, v in self._sample_level_data.items(): res += f"\n - {k} : {v}" - # TODO: add status schema data + if self._status_data is not None: + res += f"\n Status Data:" + for k, v in self._status_data.items(): + res += f"\n - {k} : {v}" return res @property