-
Notifications
You must be signed in to change notification settings - Fork 25
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Fds 1797 input graph api #1481
base: develop
Are you sure you want to change the base?
Fds 1797 input graph api #1481
Changes from all commits
2527224
70bfc19
5b41aa5
76a5567
670b5c7
a87d2fe
07dbd70
421c59a
2f5abf1
119cf40
c6c5cb8
dd8b2cd
6ec81db
bbe6c98
b99d9eb
1189107
86f7af8
d3c3769
215cc86
242c885
a1cfcce
1c67b4a
a986b39
0390b69
37bbf74
441b865
fe4fffb
8c64981
f20f88b
d56c965
4596560
b5dd8e4
c8da900
601b5d0
c6d01de
1f57cf1
99e860b
d89dc10
c9786dc
38b0690
77979c7
9907b4e
481ebd3
f63231a
f3fdc41
032771f
bf0e9ba
bc59218
5973a75
bf1a449
69bb182
d93aa9d
aef4f62
08b2ae5
a088d63
68672c3
de07c55
77f5ec4
5934ed5
51313bd
e4e1368
9a78a18
05b4fe0
0964501
697cc3e
774dc91
bb7c450
8f5bc1a
35c13dd
f8c14d4
13c9ffc
9205d6d
172e1b1
d4d3a30
730b227
795e263
0ddb552
537af6f
dcb2f65
bb4ee3a
5504586
b88ee86
2bf450b
7adaf44
c2a06f6
1e5bd7d
c849f92
8041b07
0870fa1
6802753
23b7e1d
fa5e308
3dd6659
f28a0d7
afbe448
e4d0b7c
a7d7366
ee9cb9f
cec7aa6
92b66fd
b52bfdc
8a64c39
5bd3fc0
9c2edde
58c1429
487e31d
e13f693
3d5da40
a331725
3ed1177
b6728d3
b7460ea
53e639b
498851b
58aef40
03f7497
bc64305
2a7ea55
275fbae
89cd4ce
0f4091a
3197abf
abd1630
445aad5
0c4fad7
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -32,6 +32,8 @@ | |
DisplayLabelType, | ||
extract_component_validation_rules, | ||
) | ||
from schematic.utils.df_utils import update_df, load_df | ||
from schematic.utils.io_utils import read_pickle | ||
from schematic.utils.validate_utils import rule_in_rule_list | ||
|
||
logger = logging.getLogger(__name__) | ||
|
@@ -1660,11 +1662,15 @@ def create_manifests( | |
title: Optional[str] = None, | ||
strict: Optional[bool] = True, | ||
use_annotations: Optional[bool] = False, | ||
graph_data_model: Optional[nx.MultiDiGraph] = None, | ||
data_model_graph_pickle: Optional[str] = None, | ||
) -> Union[List[str], List[pd.DataFrame]]: | ||
"""Create multiple manifests | ||
|
||
Args: | ||
path_to_data_model (str): str path to data model | ||
data_model_graph_pickle (str, optional): path to pickled networkx MultiDiGraph object. Defaults to None. | ||
graph_data_model (str, optional): A networkx MultiDiGraph object. Defaults to None. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This doesn't match the type parameter above |
||
data_types (list): a list of data types | ||
access_token (str, optional): synapse access token. Required when getting an existing manifest. Defaults to None. | ||
dataset_ids (list, optional): a list of dataset ids when generating an existing manifest. Defaults to None. | ||
|
@@ -1722,16 +1728,25 @@ def create_manifests( | |
"Please check your submission and try again." | ||
) | ||
|
||
data_model_parser = DataModelParser(path_to_data_model=path_to_data_model) | ||
if graph_data_model is None: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. so the logic from 1685 to 1703 can be further improved. Essentially you want to cover the following cases:
so instead of starting with and also, please consider wrapping this part in its own function... that way testing can be easier |
||
if data_model_graph_pickle: | ||
"""What if pickle file does not fit in memory?""" | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. is this comment meant to be addressed in this PR? If not, can a new ticket be created so that we don't lose track? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yep, also this is formatted like a DOC string with triple quotes instead of starting with a hash like a comment. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Generally things like this or |
||
graph_data_model = read_pickle(data_model_graph_pickle) | ||
else: | ||
data_model_parser = DataModelParser( | ||
path_to_data_model=path_to_data_model | ||
) | ||
|
||
# Parse Model | ||
parsed_data_model = data_model_parser.parse_model() | ||
# Parse Model | ||
parsed_data_model = data_model_parser.parse_model() | ||
|
||
# Instantiate DataModelGraph | ||
data_model_grapher = DataModelGraph(parsed_data_model, data_model_labels) | ||
# Instantiate DataModelGraph | ||
data_model_grapher = DataModelGraph( | ||
parsed_data_model, data_model_labels | ||
) | ||
|
||
# Generate graph | ||
graph_data_model = data_model_grapher.graph | ||
# Generate graph | ||
graph_data_model = data_model_grapher.graph | ||
|
||
# Gather all returned result urls | ||
all_results = [] | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,7 +3,7 @@ | |
import logging | ||
import time | ||
import re | ||
from typing import get_args, Optional, Any | ||
from typing import get_args, Optional, Any, Literal | ||
|
||
import click | ||
import click_log # type: ignore | ||
|
@@ -17,7 +17,7 @@ | |
|
||
from schematic.utils.schema_utils import DisplayLabelType | ||
from schematic.utils.cli_utils import query_dict | ||
from schematic.utils.schema_utils import export_schema | ||
from schematic.utils.schema_utils import export_schema, export_graph | ||
from schematic.help import schema_commands | ||
|
||
logger = logging.getLogger("schematic") | ||
|
@@ -59,9 +59,21 @@ def schema() -> None: # use as `schematic model ...` | |
metavar="<OUTPUT_PATH>", | ||
help=query_dict(schema_commands, ("schema", "convert", "output_jsonld")), | ||
) | ||
@click.option("--output_path", help="Alias for --output_jsonld") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think you copied this from |
||
@click.option( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think the CLI part still needs some more work and design thinking. When I am reading this, I think Can we simplify this by combining --output_jsonld and --output_path into a single I am seeing:
what is the purpose of adding output_path parameter if it is just using the value from output_jsonld? |
||
"--output_type", | ||
"-ot", | ||
type=click.Choice(["jsonld", "graph", "all"], case_sensitive=False), | ||
default="jsonld", | ||
help=query_dict(schema_commands, ("schema", "convert", "output_type")), | ||
) | ||
def convert( | ||
schema: Any, data_model_labels: DisplayLabelType, output_jsonld: Optional[str] | ||
) -> None: | ||
schema: Any, | ||
data_model_labels: DisplayLabelType, | ||
output_jsonld: Optional[str], | ||
output_type: Optional[Literal["jsonld", "graph", "all"]], | ||
output_path: Optional[str], | ||
) -> int: | ||
""" | ||
Running CLI to convert data model specification in CSV format to | ||
data model in JSON-LD format. | ||
|
@@ -80,19 +92,19 @@ def convert( | |
data_model_parser = DataModelParser(schema) | ||
|
||
# Parse Model | ||
logger.info("Parsing data model.") | ||
click.echo("Parsing data model.") | ||
parsed_data_model = data_model_parser.parse_model() | ||
|
||
# Convert parsed model to graph | ||
# Instantiate DataModelGraph | ||
data_model_grapher = DataModelGraph(parsed_data_model, data_model_labels) | ||
|
||
# Generate graphschema | ||
logger.info("Generating data model graph.") | ||
click.echo("Generating data model graph.") | ||
graph_data_model = data_model_grapher.graph | ||
|
||
# Validate generated data model. | ||
logger.info("Validating the data model internally.") | ||
click.echo("Validating the data model internally.") | ||
data_model_validator = DataModelValidator(graph=graph_data_model) | ||
data_model_errors, data_model_warnings = data_model_validator.run_checks() | ||
|
||
|
@@ -114,40 +126,49 @@ def convert( | |
for warning in war: | ||
logger.warning(warning) | ||
|
||
logger.info("Converting data model to JSON-LD") | ||
if output_path: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. so many |
||
output_jsonld = output_path | ||
|
||
if output_jsonld is None: | ||
output_file_no_ext = re.sub("[.](jsonld|csv|pickle)$", "", schema) | ||
else: | ||
output_file_no_ext = re.sub("[.](jsonld|csv|pickle)$", "", output_jsonld) | ||
|
||
click.echo( | ||
"By default, the JSON-LD output will be stored alongside the first " | ||
f"input CSV or JSON-LD file. In this case, it will appear here: '{output_jsonld}'. " | ||
"You can use the `--output_jsonld` argument to specify another file path." | ||
) | ||
|
||
if output_type in ["graph", "all"]: | ||
output_graph = output_file_no_ext + ".pickle" | ||
click.echo(f"Saving data model graph to '{output_graph}'.") | ||
export_graph(graph_data_model, output_graph) | ||
if output_type == "graph": | ||
return 0 | ||
afwillia marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
click.echo("Converting data model to JSON-LD") | ||
jsonld_data_model = convert_graph_to_jsonld(graph=graph_data_model) | ||
|
||
# output JSON-LD file alongside CSV file by default, get path. | ||
if output_jsonld is None: | ||
if not ".jsonld" in schema: | ||
csv_no_ext = re.sub("[.]csv$", "", schema) | ||
output_jsonld = csv_no_ext + ".jsonld" | ||
else: | ||
output_jsonld = schema | ||
|
||
logger.info( | ||
"By default, the JSON-LD output will be stored alongside the first " | ||
f"input CSV or JSON-LD file. In this case, it will appear here: '{output_jsonld}'. " | ||
"You can use the `--output_jsonld` argument to specify another file path." | ||
) | ||
output_jsonld = output_file_no_ext + ".jsonld" | ||
|
||
# saving updated schema.org schema | ||
try: | ||
export_schema(jsonld_data_model, output_jsonld) | ||
click.echo( | ||
f"The Data Model was created and saved to '{output_jsonld}' location." | ||
) | ||
except: # pylint: disable=bare-except | ||
click.echo( | ||
( | ||
f"The Data Model could not be created by using '{output_jsonld}' location. " | ||
"Please check your file path again" | ||
) | ||
) | ||
except Exception as exc: | ||
raise ValueError( | ||
f"The Data Model could not be created by using '{output_jsonld}' location. " | ||
"Please check your file path again" | ||
) from exc | ||
|
||
# get the end time | ||
end_time = time.time() | ||
|
||
# get the execution time | ||
elapsed_time = time.strftime("%M:%S", time.gmtime(end_time - start_time)) | ||
click.echo(f"Execution time: {elapsed_time} (M:S)") | ||
return 0 | ||
linglp marked this conversation as resolved.
Show resolved
Hide resolved
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,6 +3,7 @@ | |
from typing import Any | ||
import json | ||
import urllib.request | ||
import pickle | ||
from schematic import LOADER | ||
|
||
|
||
|
@@ -40,3 +41,10 @@ def load_schemaorg() -> Any: | |
data_path = "data_models/schema_org.model.jsonld" | ||
schema_org_path = LOADER.filename(data_path) | ||
return load_json(schema_org_path) | ||
|
||
|
||
def read_pickle(file_path: str) -> Any: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Here, I think the read_pickle function assumes that the pickle file can be correctly loaded. But what if it doesn't? Could you raise a meaningful error message here? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can test for read_pickle also be added? |
||
"""Read pickle file""" | ||
with open(file_path, "rb") as fle: | ||
data = pickle.load(fle) | ||
return data |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,12 +5,13 @@ | |
from typing import Optional, no_type_check | ||
import numpy as np | ||
import pandas as pd | ||
import networkx as nx # type: ignore | ||
|
||
from schematic.schemas.data_model_parser import DataModelParser | ||
from schematic.schemas.data_model_graph import DataModelGraph, DataModelGraphExplorer | ||
from schematic.schemas.data_model_json_schema import DataModelJSONSchema | ||
from schematic.utils.schema_utils import DisplayLabelType | ||
from schematic.utils.io_utils import load_json | ||
from schematic.utils.io_utils import load_json, read_pickle | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
@@ -22,34 +23,44 @@ class AttributesExplorer: | |
def __init__( | ||
self, | ||
path_to_jsonld: str, | ||
data_model_labels: DisplayLabelType, | ||
data_model_labels: DisplayLabelType = "class_label", | ||
data_model_grapher: Optional[DataModelGraph] = None, | ||
data_model_graph_explorer: Optional[DataModelGraphExplorer] = None, | ||
parsed_data_model: Optional[dict] = None, | ||
graph_data_model: Optional[nx.MultiDiGraph] = None, | ||
data_model_graph_pickle: Optional[str] = None, | ||
) -> None: | ||
self.path_to_jsonld = path_to_jsonld | ||
|
||
self.jsonld = load_json(self.path_to_jsonld) | ||
if graph_data_model is not None: | ||
self.graph_data_model = graph_data_model | ||
elif data_model_graph_pickle is not None: | ||
data_model_graph = read_pickle(data_model_graph_pickle) | ||
if not isinstance(data_model_graph, nx.MultiDiGraph): | ||
raise ValueError( | ||
"The data model graph must be a networkx MultiDiGraph object." | ||
) | ||
self.graph_data_model = data_model_graph | ||
|
||
# Parse Model | ||
if not parsed_data_model: | ||
if parsed_data_model is None: | ||
data_model_parser = DataModelParser( | ||
path_to_data_model=self.path_to_jsonld, | ||
) | ||
parsed_data_model = data_model_parser.parse_model() | ||
|
||
# Instantiate DataModelGraph | ||
if not data_model_grapher: | ||
if data_model_grapher is None: | ||
data_model_grapher = DataModelGraph(parsed_data_model, data_model_labels) | ||
|
||
# Generate graph | ||
self.graph_data_model = data_model_grapher.graph | ||
# Generate graph | ||
self.graph_data_model = data_model_grapher.graph | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Will self.graph_data_model always be assigned a value? The logic here is kind of confusing, and it's not clear that it always will be. |
||
|
||
# Instantiate Data Model Graph Explorer | ||
if not data_model_graph_explorer: | ||
self.dmge = DataModelGraphExplorer(self.graph_data_model) | ||
else: | ||
if data_model_graph_explorer is not None: | ||
self.dmge = data_model_graph_explorer | ||
else: | ||
self.dmge = DataModelGraphExplorer(self.graph_data_model) | ||
|
||
# Instantiate Data Model Json Schema | ||
self.data_model_js = DataModelJSONSchema( | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is already loaded on line 25