Adding CLI option for import-from-frictionless. (linkml#118)
* Adding CLI option for import-from-frictionless.

Removed old CLI options

* update

* updating to latest schemabuilder
cmungall authored May 27, 2023
1 parent 5347c8e commit 1bb3268
Showing 11 changed files with 2,962 additions and 754 deletions.
Binary file added notebooks/images/FRED.png
3,451 changes: 2,795 additions & 656 deletions poetry.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion pyproject.toml
@@ -12,7 +12,7 @@ packages = [

[tool.poetry.dependencies]
python = "^3.9"
-linkml = "^1.3.5"
+linkml = ">=1.5.4"
mkdocs = "^1.2.3"
pandas = "^1.3.5"
python-dateutil = "^2.8.2"
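For context on the dependency bump: poetry's caret constraint `^1.3.5` pins below the next major version, while `>=1.5.4` sets only a floor. A quick check with the packaging library (an illustration, not part of the commit):

    from packaging.specifiers import SpecifierSet

    old = SpecifierSet(">=1.3.5,<2.0.0")  # what poetry expands "^1.3.5" to
    new = SpecifierSet(">=1.5.4")
    print("1.5.4" in old, "1.5.4" in new)  # True True
    print("1.3.5" in old, "1.3.5" in new)  # True False: the bump enforces the newer linkml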
88 changes: 69 additions & 19 deletions schema_automator/cli.py
@@ -13,7 +13,7 @@
from linkml_runtime.linkml_model import SchemaDefinition
from oaklib.selector import get_resource_from_shorthand, get_implementation_from_shorthand

-from schema_automator import JsonLdAnnotator
+from schema_automator import JsonLdAnnotator, FrictionlessImportEngine
from schema_automator.annotators.schema_annotator import SchemaAnnotator
from schema_automator.generalizers.csv_data_generalizer import CsvDataGeneralizer
from schema_automator.generalizers.generalizer import DEFAULT_CLASS_NAME, DEFAULT_SCHEMA_NAME
@@ -44,6 +44,9 @@
default=DEFAULT_SCHEMA_NAME,
show_default=True,
help='Schema name')
+schema_id_option = click.option(
+    '--schema-id',
+    help='Schema id')
annotator_option = click.option(
'--annotator',
'-A',
@@ -52,6 +55,18 @@
"--use-attributes/--no-use-attributes",
help="If true, use attributes over slots/slot_usage"
)
+column_separator_option = click.option('--column-separator', '-s', default='\t', help='separator')
+
+# generalizer options
+
+downcase_header_option = click.option('--downcase-header/--no-downcase-header', default=False, help='if true make headers lowercase')
+snakecase_header_option = click.option('--snakecase-header/--no-snakecase-header', default=False, help='if true make headers snakecase')
+infer_foreign_keys_option = click.option('--infer-foreign-keys/--no-infer-foreign-keys', default=False, help='infer ranges/foreign keys')
+enum_columns_option = click.option('--enum-columns', '-E', multiple=True, help='column(s) that are forced to be enums')
+enum_mask_columns_option = click.option('--enum-mask-columns', multiple=True, help='column(s) that are excluded from being enums')
+max_enum_size_option = click.option('--max-enum-size', default=50, help='do not create an enum if more than max distinct members')
+enum_threshold_option = click.option('--enum-threshold', default=0.1, help='if the number of distinct values / rows is less than this, do not make an enum')
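Defining each option once at module level and stacking it as a decorator on several commands is standard click composition, and it is what lets the commands below drop their duplicated inline options. A minimal, self-contained sketch of the pattern (command and option names here are illustrative, not from the repo):

    import click

    # Define a shared option once at module level
    separator_option = click.option('--column-separator', '-s', default='\t',
                                    help='column separator')

    @click.group()
    def main():
        """Toy command group demonstrating reusable click options."""

    @main.command()
    @click.argument('tsvfile')
    @separator_option
    def head(tsvfile, column_separator):
        # Echo the header row, split on the configured separator
        with open(tsvfile) as f:
            print(f.readline().rstrip('\n').split(column_separator))

    if __name__ == '__main__':
        main()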


@click.group()
@click.option("-v", "--verbose",
@@ -89,13 +104,12 @@ def main(verbose: int, quiet: bool):
@schema_name_option
@annotator_option
@click.option('--class-name', '-c', default=DEFAULT_CLASS_NAME, help='Core class name in schema')
-@click.option('--column-separator', '-s', default='\t', help='separator')
-@click.option('--downcase-header/--no-downcase-header', default=False, help='if true make headers lowercase')
-@click.option('--enum-columns', '-E', multiple=True, help='column that is forced to be an enum')
-@click.option('--enum-threshold', type=click.FLOAT, help='set high to be more inclusive')
-@click.option('--max-enum-size',
-              type=click.INT,
-              help='set high to be more inclusive')
+@column_separator_option
+@downcase_header_option
+@snakecase_header_option
+@enum_columns_option
+@enum_threshold_option
+@max_enum_size_option
@click.option('--data-dictionary-row-count',
type=click.INT,
help='rows that provide metadata about columns')
@@ -128,13 +142,12 @@ def generalize_tsv(tsvfile, output, class_name, schema_name, pandera: bool, anno
@click.argument('tsvfiles', nargs=-1)  # input TSVs (must have column headers)
@output_option
@schema_name_option
-@click.option('--column-separator', '-s', default='\t', help='separator')
-@click.option('--downcase-header/--no-downcase-header', default=False, help='if true make headers lowercase')
-@click.option('--infer-foreign-keys/--no-infer-foreign-keys', default=False, help='infer ranges/foreign keys')
-@click.option('--enum-columns', '-E', multiple=True, help='column(s) that is forced to be an enum')
-@click.option('--enum-mask-columns', multiple=True, help='column(s) that are excluded from being enums')
-@click.option('--max-enum-size', default=50, help='do not create an enum if more than max distinct members')
-@click.option('--enum-threshold', default=0.1, help='if the number of distinct values / rows is less than this, do not make an enum')
+@column_separator_option
+@downcase_header_option
+@snakecase_header_option
+@enum_columns_option
+@enum_threshold_option
+@max_enum_size_option
@click.option('--robot/--no-robot', default=False, help='set if the TSV is a ROBOT template')
def generalize_tsvs(tsvfiles, output, schema_name, **kwargs):
"""
@@ -157,6 +170,12 @@ def generalize_tsvs(tsvfiles, output, schema_name, **kwargs):
@click.argument('url')  # input URL (page containing one or more HTML tables)
@output_option
@schema_name_option
+@column_separator_option
+@downcase_header_option
+@snakecase_header_option
+@enum_columns_option
+@enum_threshold_option
+@max_enum_size_option
@click.option('--class-name', '-c', default=DEFAULT_CLASS_NAME, help='Core class name in schema')
@click.option('--pandera/--no-pandera', default=False, help='set to use panderas as inference engine')
@click.option('--data-output', help='Path to file of downloaded data')
@@ -179,8 +198,13 @@ def generalize_htmltable(url, output, class_name, schema_name, pandera: bool,
dfs = pd.read_html(url)
logging.info(f"{url} has {len(dfs)} tables")
df = dfs[table_number]
-importer = TableImportEngine(**kwargs)
-schema = importer.import_from_dataframe(df)
+if data_output:
+    df.to_csv(data_output, index=False, sep="\t")
+if pandera:
+    ge = PandasDataGeneralizer(**kwargs)
+else:
+    ge = CsvDataGeneralizer(**kwargs)
+schema = ge.convert_from_dataframe(df, class_name=class_name, schema_name=schema_name)
write_schema(schema, output)
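As background on the corrected body above: pandas.read_html returns one DataFrame per <table> element found on the page, which is why the command needs a --table-number to pick one. A small sketch (the URL is a placeholder):

    import pandas as pd

    # read_html scrapes every <table> on the page into its own DataFrame
    dfs = pd.read_html("https://example.org/page-with-tables")  # placeholder URL
    print(f"found {len(dfs)} tables")
    df = dfs[0]  # equivalent to --table-number 0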


@@ -241,13 +265,15 @@ def import_htmltable(url, output, class_name, schema_name, columns,
table_number: int, data_output,
**kwargs):
"""
-Generalizes from a table parsed from a URL
+Imports from a table parsed from a URL using SchemaSheets
Uses pandas/beautiful soup
"""
dfs = pd.read_html(url)
logging.info(f"{url} has {len(dfs)} tables")
df = dfs[table_number]
+if data_output:
+    df.to_csv(data_output, index=False, sep="\t")
ie = TableImportEngine(columns=columns.split(","), **kwargs)
schema = ie.import_from_dataframe(df)
write_schema(schema, output)
@@ -339,6 +365,26 @@ def import_json_schema(input, output, import_project: bool, schema_name, format,
ie.import_project(input, output, name=schema_name, format=format)


+@main.command()
+@click.argument('input')
+@output_option
+@schema_name_option
+@schema_id_option
+def import_frictionless(input, output, schema_name, schema_id, **kwargs):
+    """
+    Imports from a Frictionless data package to LinkML
+
+    See :ref:`importers` for more on the importer framework
+
+    Example:
+        schemauto import-frictionless cfde.package.json
+    """
+    ie = FrictionlessImportEngine(**kwargs)
+    schema = ie.convert(input, name=schema_name, id=schema_id)
+    write_schema(schema, output)
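One way to smoke-test the new subcommand without installing the console script is click's built-in test runner. A hedged sketch (the package filename comes from the docstring example above; the -o flag is assumed to come from the shared output_option, whose definition is not shown in this diff):

    from click.testing import CliRunner
    from schema_automator.cli import main

    runner = CliRunner()
    # click maps the function name import_frictionless to the subcommand import-frictionless
    result = runner.invoke(main, ["import-frictionless", "cfde.package.json", "-o", "schema.yaml"])
    print(result.exit_code, result.output)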


@main.command()
@click.argument('owlfile')
@output_option
@@ -428,7 +474,7 @@ def generalize_rdf(rdffile, dir, output, **args):
@output_option
def annotate_schema(schema: str, input: str, output: str, **kwargs):
"""
-Annotate all elements of a schema
+Annotate all elements of a schema.
This uses OAK (https://incatools.github.io/ontology-access-kit),
and you can provide any OAK backend that supports text annotation.
@@ -471,6 +517,10 @@ def enrich_schema(schema: str, input: str, output: str, annotate: bool, **args):
"""
Enrich a schema using an ontology.
+Here, "enrich" means copying over metadata from the ontology to the schema.
+For example, if the schema has a class "Gene" that is mapped to a SO class for "gene",
+then calling this command will copy the SO class definition to the schema class.
This will use OAK to add additional metadata using uris and mappings in the schema.
For example, if your schema has a class with a mapping to a SO class,
77 changes: 22 additions & 55 deletions schema_automator/generalizers/csv_data_generalizer.py
@@ -15,6 +15,7 @@
from linkml_runtime import SchemaView
from linkml_runtime.linkml_model import SchemaDefinition, ClassDefinition, TypeDefinition, SlotDefinition
from linkml_runtime.linkml_model.meta import UniqueKey
+from linkml_runtime.utils.formatutils import underscore
from quantulum3 import parser as q_parser
from dataclasses import dataclass, field

@@ -93,6 +94,9 @@ class CsvDataGeneralizer(Generalizer):
downcase_header: bool = False
"""If true, coerce column names to be lower case"""

+snakecase_header: bool = False
+"""If true, coerce column names to be snake case"""

infer_foreign_keys: bool = False
"""For multi-CVS files, infer linkages between rows"""

@@ -127,10 +131,14 @@ def infer_linkages(self, files: List[str], **kwargs) -> List[ForeignKey]:
c = os.path.splitext(os.path.basename(file))[0]
if self.downcase_header:
c = c.lower()
+if self.snakecase_header:
+    c = underscore(c)
logging.info(f'READING {file} ')
df = pd.read_csv(file, sep=self.column_separator, skipinitialspace=True).fillna("")
if self.downcase_header:
df = df.rename(columns=str.lower)
+if self.snakecase_header:
+    df = df.rename(columns=underscore)
exclude = []
for col in df.columns:
vals = set(df[col].tolist())
@@ -242,6 +250,8 @@ def convert_multiple(self, files: List[str], **kwargs) -> SchemaDefinition:
c = os.path.splitext(os.path.basename(file))[0]
if self.downcase_header:
c = c.lower()
+if self.snakecase_header:
+    c = underscore(c)
s = self.convert(file, class_name=c, **kwargs)
if s is not None:
schemas.append(s)
@@ -267,6 +277,16 @@ def convert(self, file: str, **kwargs) -> SchemaDefinition:
rr = csv.DictReader(tsv_file, fieldnames=header, delimiter=self.column_separator, skipinitialspace=False)
return self.convert_dicts([r for r in rr], **kwargs)

+def convert_from_dataframe(self, df: pd.DataFrame, **kwargs) -> SchemaDefinition:
+    """
+    Converts a single dataframe to a single-class schema
+
+    :param df:
+    :param kwargs:
+    :return:
+    """
+    return self.convert_dicts(df.to_dict('records'), **kwargs)
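A hedged usage sketch of the new helper (the DataFrame contents and class/schema names are illustrative; the keyword arguments mirror the generalize-htmltable call shown earlier):

    import pandas as pd
    from schema_automator.generalizers.csv_data_generalizer import CsvDataGeneralizer

    df = pd.DataFrame({"id": ["G1", "G2"], "name": ["gene one", "gene two"]})
    ge = CsvDataGeneralizer()
    # Rows are handed to convert_dicts as a list of record dicts
    schema = ge.convert_from_dataframe(df, class_name="Gene", schema_name="example")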

def read_slot_tsv(self, file: str, **kwargs) -> Dict:
with open(file, newline='') as tsv_file:
rows_list = csv.reader(tsv_file, delimiter=self.column_separator)
@@ -359,6 +379,8 @@ def convert_dicts(self,
for row in rr:
if self.downcase_header:
row = {k.lower(): v for k, v in row.items()}
+if self.snakecase_header:
+    row = {underscore(k): v for k, v in row.items()}
n += 1
if n == 1 and self.robot:
for k, v in row.items():
@@ -784,60 +806,5 @@ def add_missing_to_schema(schema: SchemaDefinition):
description='Holds a measurement serialized as a string')


-@click.group()
-def main():
-    pass


-@main.command()
-@click.argument('tsvfile') # input TSV (must have column headers
-@click.option('--output', '-o', help='Output file')
-@click.option('--class_name', '-c', default='example', help='Core class name in schema')
-@click.option('--schema_name', '-n', default='example', help='Schema name')
-@click.option('--separator', '-s', default='\t', help='separator')
-@click.option('--downcase-header/--no-downcase-header', default=False, help='if true make headers lowercase')
-@click.option('--enum-columns', '-E', multiple=True, help='column that is forced to be an enum')
-@click.option('--robot/--no-robot', default=False, help='set if the TSV is a ROBOT template')
-def tsv2model(tsvfile, output, separator, class_name, schema_name, **kwargs):
-    """ Infer a model from a TSV """
-    ie = CsvDataGeneralizer(**kwargs)
-    schema = ie.convert(tsvfile, class_name=class_name, schema_name=schema_name)
-    write_schema(schema, output)


-@main.command()
-@click.argument('tsvfiles', nargs=-1) # input TSV (must have column headers
-@click.option('--output', '-o', help='Output file')
-@click.option('--schema_name', '-n', default='example', help='Schema name')
-@click.option('--file_separator', '-s', default='\t', help='separator')
-@click.option('--downcase-header/--no-downcase-header', default=False, help='if true make headers lowercase')
-@click.option('--infer-foreign-keys/--no-infer-foreign-keys', default=False, help='infer ranges/foreign keys')
-@click.option('--enum-columns', '-E', multiple=True, help='column(s) that is forced to be an enum')
-@click.option('--enum-mask-columns', multiple=True, help='column(s) that are excluded from being enums')
-@click.option('--max-enum-size', default=50, help='do not create an enum if more than max distinct members')
-@click.option('--enum-threshold', default=0.1, help='if the number of distinct values / rows is less than this, do not make an enum')
-@click.option('--robot/--no-robot', default=False, help='set if the TSV is a ROBOT template')
-def tsvs2model(tsvfiles, output, schema_name, **kwargs):
-    """ Infer a model from multiple TSVs """
-    ie = CsvDataGeneralizer(**kwargs)
-    schema = ie.convert_multiple(tsvfiles, schema_name=schema_name)
-    write_schema(schema, output)


-@main.command()
-@click.argument('yamlfile')
-@click.option('--zooma-confidence', '-Z', help='zooma confidence')
-@click.option('--results', '-r', help='mapping results file')
-def enrich(yamlfile, results, **args):
-    """ Infer a model from a TSV """
-    yamlobj = yaml.load(open(yamlfile))
-    cache = {}
-    infer_enum_meanings(yamlobj, cache=cache)
-    if results is not None:
-        with open(results, "w") as io:
-            io.write(yaml.dump(cache))
-    print(yaml.dump(yamlobj, default_flow_style=False, sort_keys=False))


-if __name__ == '__main__':
-    main()
6 changes: 2 additions & 4 deletions schema_automator/importers/frictionless_import_engine.py
@@ -46,7 +46,7 @@ class FrictionlessImportEngine(ImportEngine):
"""

-def convert(self, file: str, id: str,name: str, **kwargs) -> SchemaDefinition:
+def convert(self, file: str, id: str=None, name: str=None, **kwargs) -> SchemaDefinition:
"""
Converts one or more JSON files into a Schema
@@ -59,8 +59,6 @@ def convert(self, file: str, id: str,name: str, **kwargs) -> SchemaDefinition:
schema = sb.schema
if id:
schema.id = id
-if name:
-    sb.add_prefix(name, f"{id}/")
if not name:
name = package.name
if name:
@@ -128,7 +126,7 @@ def add_enum(self, sb: SchemaBuilder, field: fl.Field) -> EnumDefinition:
if len(toks) == 2:
[prefix, short] = toks
pv = PermissibleValue(short, meaning=code)
-sb.add_prefix(prefix, f"{sb.schema.id}/{prefix}/")
+sb.add_prefix(prefix, f"{sb.schema.id}/{prefix}/", replace_if_present=True)
e.permissible_values[pv.text] = pv
if e.name is sb.schema:
raise NotImplementedError(f"Cannot yet merge enums")
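The replace_if_present flag used here and in the RDFS importer below is why the commit bumps linkml to >=1.5.4 ("updating to latest schemabuilder"). A minimal SchemaBuilder sketch (the second URL is a placeholder, and rejecting a duplicate prefix when the flag is absent is an assumption about SchemaBuilder's behaviour):

    from linkml.utils.schema_builder import SchemaBuilder

    sb = SchemaBuilder(name="example")
    sb.add_prefix("GO", "http://purl.obolibrary.org/obo/GO_")
    # Without replace_if_present=True, re-adding "GO" would be rejected (assumed)
    sb.add_prefix("GO", "https://example.org/GO/", replace_if_present=True)  # placeholder URL
    print(sb.schema.prefixes["GO"].prefix_reference)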
6 changes: 3 additions & 3 deletions schema_automator/importers/rdfs_import_engine.py
@@ -104,13 +104,14 @@ def convert(
if name is None:
name = "example"
sb = SchemaBuilder(name=name)
+sb.add_defaults()
schema = sb.schema
for k, v in g.namespaces():
-sb.add_prefix(k, v)
+sb.add_prefix(k, v, replace_if_present=True)
if default_prefix is not None:
schema.default_prefix = default_prefix
if default_prefix not in schema.prefixes:
-sb.add_prefix(default_prefix, model_uri)
+sb.add_prefix(default_prefix, model_uri, replace_if_present=True)
schema.id = schema.prefixes[default_prefix].prefix_reference
cls_slots = defaultdict(list)
props = []
@@ -155,7 +156,6 @@ def convert(
c.slots = cls_slots.get(cn, [])
c.class_uri = str(s.n3(g.namespace_manager))
sb.add_class(c)
-sb.add_defaults()
if identifier is not None:
id_slot = SlotDefinition(identifier, identifier=True, range="uriorcurie")
schema.slots[identifier] = id_slot
4 changes: 3 additions & 1 deletion schema_automator/importers/tabular_import_engine.py
@@ -40,14 +40,16 @@ def import_from_dataframe(self, df: pd.DataFrame):
:return:
"""
tf = NamedTemporaryFile(delete=False)
+if not self.columns:
+    raise ValueError("Must specify columns")
+logging.info(f"Using columns: {self.columns}")
ix = 1
line = pd.DataFrame(dict(zip(df.head(), self.columns)), index=[ix])
df = pd.concat([df.iloc[:ix-1], line, df.iloc[ix-1:]]).reset_index(drop=True)
if self.parent:
df.insert(0,
column="parent",
value=[f">{self.element_type}"] + [self.parent] * (len(df) - 1))
#print(df)
df.to_csv(tf.name, sep='\t', index=False)
#print(open(tf.name, 'r').read())
#element_map = dict(zip(df.head(), self.columns))
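A hedged sketch of the newly guarded path (column values are illustrative, and it assumes TableImportEngine's remaining constructor arguments are optional):

    import pandas as pd
    from schema_automator.importers.tabular_import_engine import TableImportEngine

    df = pd.DataFrame({"name": ["gene"], "description": ["a region of DNA"]})
    # columns must now be supplied explicitly, or import_from_dataframe raises ValueError
    ie = TableImportEngine(columns=["name", "description"])
    schema = ie.import_from_dataframe(df)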