-
Notifications
You must be signed in to change notification settings - Fork 25
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Fds 1797 input graph api #1481
base: develop
Are you sure you want to change the base?
Fds 1797 input graph api #1481
Changes from all commits
2527224
70bfc19
5b41aa5
76a5567
670b5c7
a87d2fe
07dbd70
421c59a
2f5abf1
119cf40
c6c5cb8
dd8b2cd
6ec81db
bbe6c98
b99d9eb
1189107
86f7af8
d3c3769
215cc86
242c885
a1cfcce
1c67b4a
a986b39
0390b69
37bbf74
441b865
fe4fffb
8c64981
f20f88b
d56c965
4596560
b5dd8e4
c8da900
601b5d0
c6d01de
1f57cf1
99e860b
d89dc10
c9786dc
38b0690
77979c7
9907b4e
481ebd3
f63231a
f3fdc41
032771f
bf0e9ba
bc59218
5973a75
bf1a449
69bb182
d93aa9d
aef4f62
08b2ae5
a088d63
68672c3
de07c55
77f5ec4
5934ed5
51313bd
e4e1368
9a78a18
05b4fe0
0964501
697cc3e
774dc91
bb7c450
8f5bc1a
35c13dd
f8c14d4
13c9ffc
9205d6d
172e1b1
d4d3a30
730b227
795e263
0ddb552
537af6f
dcb2f65
bb4ee3a
5504586
b88ee86
2bf450b
7adaf44
c2a06f6
1e5bd7d
c849f92
8041b07
0870fa1
6802753
23b7e1d
fa5e308
3dd6659
f28a0d7
afbe448
e4d0b7c
a7d7366
ee9cb9f
cec7aa6
92b66fd
b52bfdc
8a64c39
5bd3fc0
9c2edde
58c1429
487e31d
e13f693
3d5da40
a331725
3ed1177
b6728d3
b7460ea
53e639b
498851b
58aef40
03f7497
bc64305
2a7ea55
275fbae
89cd4ce
0f4091a
3197abf
abd1630
445aad5
0c4fad7
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -32,6 +32,8 @@ | |
DisplayLabelType, | ||
extract_component_validation_rules, | ||
) | ||
from schematic.utils.df_utils import update_df, load_df | ||
from schematic.utils.io_utils import read_pickle | ||
from schematic.utils.validate_utils import rule_in_rule_list | ||
|
||
logger = logging.getLogger(__name__) | ||
|
@@ -1660,11 +1662,15 @@ def create_manifests( | |
title: Optional[str] = None, | ||
strict: Optional[bool] = True, | ||
use_annotations: Optional[bool] = False, | ||
graph_data_model: Optional[nx.MultiDiGraph] = None, | ||
data_model_graph_pickle: Optional[str] = None, | ||
) -> Union[List[str], List[pd.DataFrame]]: | ||
"""Create multiple manifests | ||
|
||
Args: | ||
path_to_data_model (str): str path to data model | ||
data_model_graph_pickle (str, optional): path to pickled networkx MultiDiGraph object. Defaults to None. | ||
graph_data_model (str, optional): A networkx MultiDiGraph object. Defaults to None. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This doesn't match the type parameter above |
||
data_types (list): a list of data types | ||
access_token (str, optional): synapse access token. Required when getting an existing manifest. Defaults to None. | ||
dataset_ids (list, optional): a list of dataset ids when generating an existing manifest. Defaults to None. | ||
|
@@ -1722,16 +1728,25 @@ def create_manifests( | |
"Please check your submission and try again." | ||
) | ||
|
||
data_model_parser = DataModelParser(path_to_data_model=path_to_data_model) | ||
if graph_data_model is None: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. so the logic from 1685 to 1703 can be further improved. Essentially you want to cover the following cases:
so instead of starting with and also, please consider wrapping this part in its own function... that way testing can be easier |
||
if data_model_graph_pickle: | ||
"""What if pickle file does not fit in memory?""" | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. is this comment meant to be addressed in this PR? If not, can a new ticket be created so that we don't lose track? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yep, also this is formatted like a DOC string with triple quotes instead of starting with a hash like a comment. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Generally things like this or |
||
graph_data_model = read_pickle(data_model_graph_pickle) | ||
else: | ||
data_model_parser = DataModelParser( | ||
path_to_data_model=path_to_data_model | ||
) | ||
|
||
# Parse Model | ||
parsed_data_model = data_model_parser.parse_model() | ||
# Parse Model | ||
parsed_data_model = data_model_parser.parse_model() | ||
|
||
# Instantiate DataModelGraph | ||
data_model_grapher = DataModelGraph(parsed_data_model, data_model_labels) | ||
# Instantiate DataModelGraph | ||
data_model_grapher = DataModelGraph( | ||
parsed_data_model, data_model_labels | ||
) | ||
|
||
# Generate graph | ||
graph_data_model = data_model_grapher.graph | ||
# Generate graph | ||
graph_data_model = data_model_grapher.graph | ||
|
||
# Gather all returned result urls | ||
all_results = [] | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,7 +3,7 @@ | |
import logging | ||
import time | ||
import re | ||
from typing import get_args, Optional, Any | ||
from typing import get_args, Optional, Any, Literal | ||
|
||
import click | ||
import click_log # type: ignore | ||
|
@@ -17,7 +17,7 @@ | |
|
||
from schematic.utils.schema_utils import DisplayLabelType | ||
from schematic.utils.cli_utils import query_dict | ||
from schematic.utils.schema_utils import export_schema | ||
from schematic.utils.schema_utils import export_schema, export_graph | ||
from schematic.help import schema_commands | ||
|
||
logger = logging.getLogger("schematic") | ||
|
@@ -59,9 +59,21 @@ def schema() -> None: # use as `schematic model ...` | |
metavar="<OUTPUT_PATH>", | ||
help=query_dict(schema_commands, ("schema", "convert", "output_jsonld")), | ||
) | ||
@click.option("--output_path", help="Alias for --output_jsonld") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think you copied this from |
||
@click.option( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think the CLI part still needs some more work and design thinking. When I am reading this, I think Can we simplify this by combining --output_jsonld and --output_path into a single I am seeing:
what is the purpose of adding output_path parameter if it is just using the value from output_jsonld? |
||
"--output_type", | ||
"-ot", | ||
type=click.Choice(["jsonld", "graph", "all"], case_sensitive=False), | ||
default="jsonld", | ||
help=query_dict(schema_commands, ("schema", "convert", "output_type")), | ||
) | ||
def convert( | ||
schema: Any, data_model_labels: DisplayLabelType, output_jsonld: Optional[str] | ||
) -> None: | ||
schema: Any, | ||
data_model_labels: DisplayLabelType, | ||
output_jsonld: Optional[str], | ||
output_type: Optional[Literal["jsonld", "graph", "all"]], | ||
output_path: Optional[str], | ||
) -> int: | ||
""" | ||
Running CLI to convert data model specification in CSV format to | ||
data model in JSON-LD format. | ||
|
@@ -80,19 +92,19 @@ def convert( | |
data_model_parser = DataModelParser(schema) | ||
|
||
# Parse Model | ||
logger.info("Parsing data model.") | ||
click.echo("Parsing data model.") | ||
parsed_data_model = data_model_parser.parse_model() | ||
|
||
# Convert parsed model to graph | ||
# Instantiate DataModelGraph | ||
data_model_grapher = DataModelGraph(parsed_data_model, data_model_labels) | ||
|
||
# Generate graphschema | ||
logger.info("Generating data model graph.") | ||
click.echo("Generating data model graph.") | ||
graph_data_model = data_model_grapher.graph | ||
|
||
# Validate generated data model. | ||
logger.info("Validating the data model internally.") | ||
click.echo("Validating the data model internally.") | ||
data_model_validator = DataModelValidator(graph=graph_data_model) | ||
data_model_errors, data_model_warnings = data_model_validator.run_checks() | ||
|
||
|
@@ -114,40 +126,49 @@ def convert( | |
for warning in war: | ||
logger.warning(warning) | ||
|
||
logger.info("Converting data model to JSON-LD") | ||
if output_path: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. so many |
||
output_jsonld = output_path | ||
|
||
if output_jsonld is None: | ||
output_file_no_ext = re.sub("[.](jsonld|csv|pickle)$", "", schema) | ||
else: | ||
output_file_no_ext = re.sub("[.](jsonld|csv|pickle)$", "", output_jsonld) | ||
|
||
click.echo( | ||
"By default, the JSON-LD output will be stored alongside the first " | ||
f"input CSV or JSON-LD file. In this case, it will appear here: '{output_jsonld}'. " | ||
"You can use the `--output_jsonld` argument to specify another file path." | ||
) | ||
|
||
if output_type in ["graph", "all"]: | ||
output_graph = output_file_no_ext + ".pickle" | ||
click.echo(f"Saving data model graph to '{output_graph}'.") | ||
export_graph(graph_data_model, output_graph) | ||
if output_type == "graph": | ||
return 0 | ||
afwillia marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
click.echo("Converting data model to JSON-LD") | ||
jsonld_data_model = convert_graph_to_jsonld(graph=graph_data_model) | ||
|
||
# output JSON-LD file alongside CSV file by default, get path. | ||
if output_jsonld is None: | ||
if not ".jsonld" in schema: | ||
csv_no_ext = re.sub("[.]csv$", "", schema) | ||
output_jsonld = csv_no_ext + ".jsonld" | ||
else: | ||
output_jsonld = schema | ||
|
||
logger.info( | ||
"By default, the JSON-LD output will be stored alongside the first " | ||
f"input CSV or JSON-LD file. In this case, it will appear here: '{output_jsonld}'. " | ||
"You can use the `--output_jsonld` argument to specify another file path." | ||
) | ||
output_jsonld = output_file_no_ext + ".jsonld" | ||
|
||
# saving updated schema.org schema | ||
try: | ||
export_schema(jsonld_data_model, output_jsonld) | ||
click.echo( | ||
f"The Data Model was created and saved to '{output_jsonld}' location." | ||
) | ||
except: # pylint: disable=bare-except | ||
click.echo( | ||
( | ||
f"The Data Model could not be created by using '{output_jsonld}' location. " | ||
"Please check your file path again" | ||
) | ||
) | ||
except Exception as exc: | ||
raise ValueError( | ||
f"The Data Model could not be created by using '{output_jsonld}' location. " | ||
"Please check your file path again" | ||
) from exc | ||
|
||
# get the end time | ||
end_time = time.time() | ||
|
||
# get the execution time | ||
elapsed_time = time.strftime("%M:%S", time.gmtime(end_time - start_time)) | ||
click.echo(f"Execution time: {elapsed_time} (M:S)") | ||
return 0 | ||
linglp marked this conversation as resolved.
Show resolved
Hide resolved
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,6 +3,7 @@ | |
from typing import Any | ||
import json | ||
import urllib.request | ||
import pickle | ||
from schematic import LOADER | ||
|
||
|
||
|
@@ -40,3 +41,10 @@ def load_schemaorg() -> Any: | |
data_path = "data_models/schema_org.model.jsonld" | ||
schema_org_path = LOADER.filename(data_path) | ||
return load_json(schema_org_path) | ||
|
||
|
||
def read_pickle(file_path: str) -> Any: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Here, I think the read_pickle function assumes that the pickle file can be correctly loaded. But what if it doesn't? Could you raise a meaningful error message here? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can test for read_pickle also be added? |
||
"""Read pickle file""" | ||
with open(file_path, "rb") as fle: | ||
data = pickle.load(fle) | ||
return data |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,12 +5,13 @@ | |
from typing import Optional, no_type_check | ||
import numpy as np | ||
import pandas as pd | ||
import networkx as nx # type: ignore | ||
|
||
from schematic.schemas.data_model_parser import DataModelParser | ||
from schematic.schemas.data_model_graph import DataModelGraph, DataModelGraphExplorer | ||
from schematic.schemas.data_model_json_schema import DataModelJSONSchema | ||
from schematic.utils.schema_utils import DisplayLabelType | ||
from schematic.utils.io_utils import load_json | ||
from schematic.utils.io_utils import load_json, read_pickle | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
@@ -22,34 +23,44 @@ class AttributesExplorer: | |
def __init__( | ||
self, | ||
path_to_jsonld: str, | ||
data_model_labels: DisplayLabelType, | ||
data_model_labels: DisplayLabelType = "class_label", | ||
data_model_grapher: Optional[DataModelGraph] = None, | ||
data_model_graph_explorer: Optional[DataModelGraphExplorer] = None, | ||
parsed_data_model: Optional[dict] = None, | ||
graph_data_model: Optional[nx.MultiDiGraph] = None, | ||
data_model_graph_pickle: Optional[str] = None, | ||
) -> None: | ||
self.path_to_jsonld = path_to_jsonld | ||
|
||
self.jsonld = load_json(self.path_to_jsonld) | ||
if graph_data_model is not None: | ||
self.graph_data_model = graph_data_model | ||
elif data_model_graph_pickle is not None: | ||
data_model_graph = read_pickle(data_model_graph_pickle) | ||
if not isinstance(data_model_graph, nx.MultiDiGraph): | ||
raise ValueError( | ||
"The data model graph must be a networkx MultiDiGraph object." | ||
) | ||
self.graph_data_model = data_model_graph | ||
|
||
# Parse Model | ||
if not parsed_data_model: | ||
if parsed_data_model is None: | ||
data_model_parser = DataModelParser( | ||
path_to_data_model=self.path_to_jsonld, | ||
) | ||
parsed_data_model = data_model_parser.parse_model() | ||
|
||
# Instantiate DataModelGraph | ||
if not data_model_grapher: | ||
if data_model_grapher is None: | ||
data_model_grapher = DataModelGraph(parsed_data_model, data_model_labels) | ||
|
||
# Generate graph | ||
self.graph_data_model = data_model_grapher.graph | ||
# Generate graph | ||
self.graph_data_model = data_model_grapher.graph | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Will self.graph_data_model always be assigned a value? The logic here is kind of confusing, and it's not clear that it always will be. |
||
|
||
# Instantiate Data Model Graph Explorer | ||
if not data_model_graph_explorer: | ||
self.dmge = DataModelGraphExplorer(self.graph_data_model) | ||
else: | ||
if data_model_graph_explorer is not None: | ||
self.dmge = data_model_graph_explorer | ||
else: | ||
self.dmge = DataModelGraphExplorer(self.graph_data_model) | ||
|
||
# Instantiate Data Model Json Schema | ||
self.data_model_js = DataModelJSONSchema( | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is already loaded on line 25