Adding CLI option for import-from-frictionless. (linkml#118)
* Adding CLI option for import-from-frictionless.

Removed old CLI options

* update

* updating to latest schemabuilder
cmungall authored May 27, 2023
1 parent 5347c8e commit 1bb3268
Showing 11 changed files with 2,962 additions and 754 deletions.
Binary file added notebooks/images/FRED.png
3,451 changes: 2,795 additions & 656 deletions poetry.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion pyproject.toml
@@ -12,7 +12,7 @@ packages = [

[tool.poetry.dependencies]
python = "^3.9"
-linkml = "^1.3.5"
+linkml = ">=1.5.4"
mkdocs = "^1.2.3"
pandas = "^1.3.5"
python-dateutil = "^2.8.2"
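For context on the dependency bump: poetry's caret constraint `^1.3.5` pins below the next major version, while `>=1.5.4` sets only a floor. A quick check with the packaging library (an illustration, not part of the commit):

    from packaging.specifiers import SpecifierSet

    old = SpecifierSet(">=1.3.5,<2.0.0")  # what poetry expands "^1.3.5" to
    new = SpecifierSet(">=1.5.4")
    print("1.5.4" in old, "1.5.4" in new)  # True True
    print("1.3.5" in old, "1.3.5" in new)  # True False: the bump enforces the newer linkml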
88 changes: 69 additions & 19 deletions schema_automator/cli.py
@@ -13,7 +13,7 @@
from linkml_runtime.linkml_model import SchemaDefinition
from oaklib.selector import get_resource_from_shorthand, get_implementation_from_shorthand

-from schema_automator import JsonLdAnnotator
+from schema_automator import JsonLdAnnotator, FrictionlessImportEngine
from schema_automator.annotators.schema_annotator import SchemaAnnotator
from schema_automator.generalizers.csv_data_generalizer import CsvDataGeneralizer
from schema_automator.generalizers.generalizer import DEFAULT_CLASS_NAME, DEFAULT_SCHEMA_NAME
@@ -44,6 +44,9 @@
default=DEFAULT_SCHEMA_NAME,
show_default=True,
help='Schema name')
+schema_id_option = click.option(
+    '--schema-id',
+    help='Schema id')
annotator_option = click.option(
'--annotator',
'-A',
@@ -52,6 +55,18 @@
"--use-attributes/--no-use-attributes",
help="If true, use attributes over slots/slot_usage"
)
+column_separator_option = click.option('--column-separator', '-s', default='\t', help='separator')
+
+# generalizer options
+
+downcase_header_option = click.option('--downcase-header/--no-downcase-header', default=False, help='if true make headers lowercase')
+snakecase_header_option = click.option('--snakecase-header/--no-snakecase-header', default=False, help='if true make headers snakecase')
+infer_foreign_keys_option = click.option('--infer-foreign-keys/--no-infer-foreign-keys', default=False, help='infer ranges/foreign keys')
+enum_columns_option = click.option('--enum-columns', '-E', multiple=True, help='column(s) that are forced to be enums')
+enum_mask_columns_option = click.option('--enum-mask-columns', multiple=True, help='column(s) that are excluded from being enums')
+max_enum_size_option = click.option('--max-enum-size', default=50, help='do not create an enum if more than max distinct members')
+enum_threshold_option = click.option('--enum-threshold', default=0.1, help='if the number of distinct values / rows is less than this, do not make an enum')
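Defining each option once at module level and stacking it as a decorator on several commands is standard click composition, and it is what lets the commands below drop their duplicated inline options. A minimal, self-contained sketch of the pattern (command and option names here are illustrative, not from the repo):

    import click

    # Define a shared option once at module level
    separator_option = click.option('--column-separator', '-s', default='\t',
                                    help='column separator')

    @click.group()
    def main():
        """Toy command group demonstrating reusable click options."""

    @main.command()
    @click.argument('tsvfile')
    @separator_option
    def head(tsvfile, column_separator):
        # Echo the header row, split on the configured separator
        with open(tsvfile) as f:
            print(f.readline().rstrip('\n').split(column_separator))

    if __name__ == '__main__':
        main()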


@click.group()
@click.option("-v", "--verbose",
@@ -89,13 +104,12 @@ def main(verbose: int, quiet: bool):
@schema_name_option
@annotator_option
@click.option('--class-name', '-c', default=DEFAULT_CLASS_NAME, help='Core class name in schema')
-@click.option('--column-separator', '-s', default='\t', help='separator')
-@click.option('--downcase-header/--no-downcase-header', default=False, help='if true make headers lowercase')
-@click.option('--enum-columns', '-E', multiple=True, help='column that is forced to be an enum')
-@click.option('--enum-threshold', type=click.FLOAT, help='set high to be more inclusive')
-@click.option('--max-enum-size',
-              type=click.INT,
-              help='set high to be more inclusive')
+@column_separator_option
+@downcase_header_option
+@snakecase_header_option
+@enum_columns_option
+@enum_threshold_option
+@max_enum_size_option
@click.option('--data-dictionary-row-count',
type=click.INT,
help='rows that provide metadata about columns')
@@ -128,13 +142,12 @@ def generalize_tsv(tsvfile, output, class_name, schema_name, pandera: bool, anno
@click.argument('tsvfiles', nargs=-1)  # input TSVs (must have column headers)
@output_option
@schema_name_option
-@click.option('--column-separator', '-s', default='\t', help='separator')
-@click.option('--downcase-header/--no-downcase-header', default=False, help='if true make headers lowercase')
-@click.option('--infer-foreign-keys/--no-infer-foreign-keys', default=False, help='infer ranges/foreign keys')
-@click.option('--enum-columns', '-E', multiple=True, help='column(s) that is forced to be an enum')
-@click.option('--enum-mask-columns', multiple=True, help='column(s) that are excluded from being enums')
-@click.option('--max-enum-size', default=50, help='do not create an enum if more than max distinct members')
-@click.option('--enum-threshold', default=0.1, help='if the number of distinct values / rows is less than this, do not make an enum')
+@column_separator_option
+@downcase_header_option
+@snakecase_header_option
+@enum_columns_option
+@enum_threshold_option
+@max_enum_size_option
@click.option('--robot/--no-robot', default=False, help='set if the TSV is a ROBOT template')
def generalize_tsvs(tsvfiles, output, schema_name, **kwargs):
"""
@@ -157,6 +170,12 @@ def generalize_tsvs(tsvfiles, output, schema_name, **kwargs):
@click.argument('url')  # input URL (page containing one or more HTML tables)
@output_option
@schema_name_option
+@column_separator_option
+@downcase_header_option
+@snakecase_header_option
+@enum_columns_option
+@enum_threshold_option
+@max_enum_size_option
@click.option('--class-name', '-c', default=DEFAULT_CLASS_NAME, help='Core class name in schema')
@click.option('--pandera/--no-pandera', default=False, help='set to use panderas as inference engine')
@click.option('--data-output', help='Path to file of downloaded data')
@@ -179,8 +198,13 @@ def generalize_htmltable(url, output, class_name, schema_name, pandera: bool,
dfs = pd.read_html(url)
logging.info(f"{url} has {len(dfs)} tables")
df = dfs[table_number]
-importer = TableImportEngine(**kwargs)
-schema = importer.import_from_dataframe(df)
+if data_output:
+    df.to_csv(data_output, index=False, sep="\t")
+if pandera:
+    ge = PandasDataGeneralizer(**kwargs)
+else:
+    ge = CsvDataGeneralizer(**kwargs)
+schema = ge.convert_from_dataframe(df, class_name=class_name, schema_name=schema_name)
write_schema(schema, output)
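As background on the corrected body above: pandas.read_html returns one DataFrame per <table> element found on the page, which is why the command needs a --table-number to pick one. A small sketch (the URL is a placeholder):

    import pandas as pd

    # read_html scrapes every <table> on the page into its own DataFrame
    dfs = pd.read_html("https://example.org/page-with-tables")  # placeholder URL
    print(f"found {len(dfs)} tables")
    df = dfs[0]  # equivalent to --table-number 0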


@@ -241,13 +265,15 @@ def import_htmltable(url, output, class_name, schema_name, columns,
table_number: int, data_output,
**kwargs):
"""
-Generalizes from a table parsed from a URL
+Imports from a table parsed from a URL using SchemaSheets
Uses pandas/beautiful soup
"""
dfs = pd.read_html(url)
logging.info(f"{url} has {len(dfs)} tables")
df = dfs[table_number]
+if data_output:
+    df.to_csv(data_output, index=False, sep="\t")
ie = TableImportEngine(columns=columns.split(","), **kwargs)
schema = ie.import_from_dataframe(df)
write_schema(schema, output)
@@ -339,6 +365,26 @@ def import_json_schema(input, output, import_project: bool, schema_name, format,
ie.import_project(input, output, name=schema_name, format=format)


+@main.command()
+@click.argument('input')
+@output_option
+@schema_name_option
+@schema_id_option
+def import_frictionless(input, output, schema_name, schema_id, **kwargs):
+    """
+    Imports from a Frictionless data package to LinkML
+
+    See :ref:`importers` for more on the importer framework
+
+    Example:
+        schemauto import-frictionless cfde.package.json
+    """
+    ie = FrictionlessImportEngine(**kwargs)
+    schema = ie.convert(input, name=schema_name, id=schema_id)
+    write_schema(schema, output)
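One way to smoke-test the new subcommand without installing the console script is click's built-in test runner. A hedged sketch (the package filename comes from the docstring example above; the -o flag is assumed to come from the shared output_option, whose definition is not shown in this diff):

    from click.testing import CliRunner
    from schema_automator.cli import main

    runner = CliRunner()
    # click maps the function name import_frictionless to the subcommand import-frictionless
    result = runner.invoke(main, ["import-frictionless", "cfde.package.json", "-o", "schema.yaml"])
    print(result.exit_code, result.output)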


@main.command()
@click.argument('owlfile')
@output_option
@@ -428,7 +474,7 @@ def generalize_rdf(rdffile, dir, output, **args):
@output_option
def annotate_schema(schema: str, input: str, output: str, **kwargs):
"""
-Annotate all elements of a schema
+Annotate all elements of a schema.
This uses OAK (https://incatools.github.io/ontology-access-kit),
and you can provide any OAK backend that supports text annotation.
@@ -471,6 +517,10 @@ def enrich_schema(schema: str, input: str, output: str, annotate: bool, **args):
"""
Enrich a schema using an ontology.
+Here, "enrich" means copying over metadata from the ontology to the schema.
+For example, if the schema has a class "Gene" that is mapped to a SO class for "gene",
+then calling this command will copy the SO class definition to the schema class.
This will use OAK to add additional metadata using uris and mappings in the schema.
For example, if your schema has a class with a mapping to a SO class,
77 changes: 22 additions & 55 deletions schema_automator/generalizers/csv_data_generalizer.py
@@ -15,6 +15,7 @@
from linkml_runtime import SchemaView
from linkml_runtime.linkml_model import SchemaDefinition, ClassDefinition, TypeDefinition, SlotDefinition
from linkml_runtime.linkml_model.meta import UniqueKey
+from linkml_runtime.utils.formatutils import underscore
from quantulum3 import parser as q_parser
from dataclasses import dataclass, field

@@ -93,6 +94,9 @@ class CsvDataGeneralizer(Generalizer):
downcase_header: bool = False
"""If true, coerce column names to be lower case"""

+snakecase_header: bool = False
+"""If true, coerce column names to be snake case"""

infer_foreign_keys: bool = False
"""For multi-CVS files, infer linkages between rows"""

@@ -127,10 +131,14 @@ def infer_linkages(self, files: List[str], **kwargs) -> List[ForeignKey]:
c = os.path.splitext(os.path.basename(file))[0]
if self.downcase_header:
c = c.lower()
+if self.snakecase_header:
+    c = underscore(c)
logging.info(f'READING {file} ')
df = pd.read_csv(file, sep=self.column_separator, skipinitialspace=True).fillna("")
if self.downcase_header:
df = df.rename(columns=str.lower)
+if self.snakecase_header:
+    df = df.rename(columns=underscore)
exclude = []
for col in df.columns:
vals = set(df[col].tolist())
@@ -242,6 +250,8 @@ def convert_multiple(self, files: List[str], **kwargs) -> SchemaDefinition:
c = os.path.splitext(os.path.basename(file))[0]
if self.downcase_header:
c = c.lower()
+if self.snakecase_header:
+    c = underscore(c)
s = self.convert(file, class_name=c, **kwargs)
if s is not None:
schemas.append(s)
@@ -267,6 +277,16 @@ def convert(self, file: str, **kwargs) -> SchemaDefinition:
rr = csv.DictReader(tsv_file, fieldnames=header, delimiter=self.column_separator, skipinitialspace=False)
return self.convert_dicts([r for r in rr], **kwargs)

+def convert_from_dataframe(self, df: pd.DataFrame, **kwargs) -> SchemaDefinition:
+    """
+    Converts a single dataframe to a single-class schema
+
+    :param df:
+    :param kwargs:
+    :return:
+    """
+    return self.convert_dicts(df.to_dict('records'), **kwargs)
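A hedged usage sketch of the new helper (the DataFrame contents and class/schema names are illustrative; the keyword arguments mirror the generalize-htmltable call shown earlier):

    import pandas as pd
    from schema_automator.generalizers.csv_data_generalizer import CsvDataGeneralizer

    df = pd.DataFrame({"id": ["G1", "G2"], "name": ["gene one", "gene two"]})
    ge = CsvDataGeneralizer()
    # Rows are handed to convert_dicts as a list of record dicts
    schema = ge.convert_from_dataframe(df, class_name="Gene", schema_name="example")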

def read_slot_tsv(self, file: str, **kwargs) -> Dict:
with open(file, newline='') as tsv_file:
rows_list = csv.reader(tsv_file, delimiter=self.column_separator)
@@ -359,6 +379,8 @@ def convert_dicts(self,
for row in rr:
if self.downcase_header:
row = {k.lower(): v for k, v in row.items()}
+if self.snakecase_header:
+    row = {underscore(k): v for k, v in row.items()}
n += 1
if n == 1 and self.robot:
for k, v in row.items():
@@ -784,60 +806,5 @@ def add_missing_to_schema(schema: SchemaDefinition):
description='Holds a measurement serialized as a string')


-@click.group()
-def main():
-    pass


-@main.command()
-@click.argument('tsvfile') # input TSV (must have column headers
-@click.option('--output', '-o', help='Output file')
-@click.option('--class_name', '-c', default='example', help='Core class name in schema')
-@click.option('--schema_name', '-n', default='example', help='Schema name')
-@click.option('--separator', '-s', default='\t', help='separator')
-@click.option('--downcase-header/--no-downcase-header', default=False, help='if true make headers lowercase')
-@click.option('--enum-columns', '-E', multiple=True, help='column that is forced to be an enum')
-@click.option('--robot/--no-robot', default=False, help='set if the TSV is a ROBOT template')
-def tsv2model(tsvfile, output, separator, class_name, schema_name, **kwargs):
-    """ Infer a model from a TSV """
-    ie = CsvDataGeneralizer(**kwargs)
-    schema = ie.convert(tsvfile, class_name=class_name, schema_name=schema_name)
-    write_schema(schema, output)


-@main.command()
-@click.argument('tsvfiles', nargs=-1) # input TSV (must have column headers
-@click.option('--output', '-o', help='Output file')
-@click.option('--schema_name', '-n', default='example', help='Schema name')
-@click.option('--file_separator', '-s', default='\t', help='separator')
-@click.option('--downcase-header/--no-downcase-header', default=False, help='if true make headers lowercase')
-@click.option('--infer-foreign-keys/--no-infer-foreign-keys', default=False, help='infer ranges/foreign keys')
-@click.option('--enum-columns', '-E', multiple=True, help='column(s) that is forced to be an enum')
-@click.option('--enum-mask-columns', multiple=True, help='column(s) that are excluded from being enums')
-@click.option('--max-enum-size', default=50, help='do not create an enum if more than max distinct members')
-@click.option('--enum-threshold', default=0.1, help='if the number of distinct values / rows is less than this, do not make an enum')
-@click.option('--robot/--no-robot', default=False, help='set if the TSV is a ROBOT template')
-def tsvs2model(tsvfiles, output, schema_name, **kwargs):
-    """ Infer a model from multiple TSVs """
-    ie = CsvDataGeneralizer(**kwargs)
-    schema = ie.convert_multiple(tsvfiles, schema_name=schema_name)
-    write_schema(schema, output)


-@main.command()
-@click.argument('yamlfile')
-@click.option('--zooma-confidence', '-Z', help='zooma confidence')
-@click.option('--results', '-r', help='mapping results file')
-def enrich(yamlfile, results, **args):
-    """ Infer a model from a TSV """
-    yamlobj = yaml.load(open(yamlfile))
-    cache = {}
-    infer_enum_meanings(yamlobj, cache=cache)
-    if results is not None:
-        with open(results, "w") as io:
-            io.write(yaml.dump(cache))
-    print(yaml.dump(yamlobj, default_flow_style=False, sort_keys=False))


-if __name__ == '__main__':
-    main()
6 changes: 2 additions & 4 deletions schema_automator/importers/frictionless_import_engine.py
@@ -46,7 +46,7 @@ class FrictionlessImportEngine(ImportEngine):
"""

-def convert(self, file: str, id: str,name: str, **kwargs) -> SchemaDefinition:
+def convert(self, file: str, id: str=None, name: str=None, **kwargs) -> SchemaDefinition:
"""
Converts one or more JSON files into a Schema
@@ -59,8 +59,6 @@ def convert(self, file: str, id: str,name: str, **kwargs) -> SchemaDefinition:
schema = sb.schema
if id:
schema.id = id
-if name:
-    sb.add_prefix(name, f"{id}/")
if not name:
name = package.name
if name:
@@ -128,7 +126,7 @@ def add_enum(self, sb: SchemaBuilder, field: fl.Field) -> EnumDefinition:
if len(toks) == 2:
[prefix, short] = toks
pv = PermissibleValue(short, meaning=code)
-sb.add_prefix(prefix, f"{sb.schema.id}/{prefix}/")
+sb.add_prefix(prefix, f"{sb.schema.id}/{prefix}/", replace_if_present=True)
e.permissible_values[pv.text] = pv
if e.name is sb.schema:
raise NotImplementedError(f"Cannot yet merge enums")
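The replace_if_present flag used here and in the RDFS importer below is why the commit bumps linkml to >=1.5.4 ("updating to latest schemabuilder"). A minimal SchemaBuilder sketch (the second URL is a placeholder, and rejecting a duplicate prefix when the flag is absent is an assumption about SchemaBuilder's behaviour):

    from linkml.utils.schema_builder import SchemaBuilder

    sb = SchemaBuilder(name="example")
    sb.add_prefix("GO", "http://purl.obolibrary.org/obo/GO_")
    # Without replace_if_present=True, re-adding "GO" would be rejected (assumed)
    sb.add_prefix("GO", "https://example.org/GO/", replace_if_present=True)  # placeholder URL
    print(sb.schema.prefixes["GO"].prefix_reference)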
6 changes: 3 additions & 3 deletions schema_automator/importers/rdfs_import_engine.py
@@ -104,13 +104,14 @@ def convert(
if name is None:
name = "example"
sb = SchemaBuilder(name=name)
+sb.add_defaults()
schema = sb.schema
for k, v in g.namespaces():
-sb.add_prefix(k, v)
+sb.add_prefix(k, v, replace_if_present=True)
if default_prefix is not None:
schema.default_prefix = default_prefix
if default_prefix not in schema.prefixes:
-sb.add_prefix(default_prefix, model_uri)
+sb.add_prefix(default_prefix, model_uri, replace_if_present=True)
schema.id = schema.prefixes[default_prefix].prefix_reference
cls_slots = defaultdict(list)
props = []
@@ -155,7 +156,6 @@ def convert(
c.slots = cls_slots.get(cn, [])
c.class_uri = str(s.n3(g.namespace_manager))
sb.add_class(c)
-sb.add_defaults()
if identifier is not None:
id_slot = SlotDefinition(identifier, identifier=True, range="uriorcurie")
schema.slots[identifier] = id_slot
4 changes: 3 additions & 1 deletion schema_automator/importers/tabular_import_engine.py
@@ -40,14 +40,16 @@ def import_from_dataframe(self, df: pd.DataFrame):
:return:
"""
tf = NamedTemporaryFile(delete=False)
+if not self.columns:
+    raise ValueError("Must specify columns")
+logging.info(f"Using columns: {self.columns}")
ix = 1
line = pd.DataFrame(dict(zip(df.head(), self.columns)), index=[ix])
df = pd.concat([df.iloc[:ix-1], line, df.iloc[ix-1:]]).reset_index(drop=True)
if self.parent:
df.insert(0,
column="parent",
value=[f">{self.element_type}"] + [self.parent] * (len(df) - 1))
#print(df)
df.to_csv(tf.name, sep='\t', index=False)
#print(open(tf.name, 'r').read())
#element_map = dict(zip(df.head(), self.columns))
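A hedged sketch of the newly guarded path (column values are illustrative, and it assumes TableImportEngine's remaining constructor arguments are optional):

    import pandas as pd
    from schema_automator.importers.tabular_import_engine import TableImportEngine

    df = pd.DataFrame({"name": ["gene"], "description": ["a region of DNA"]})
    # columns must now be supplied explicitly, or import_from_dataframe raises ValueError
    ie = TableImportEngine(columns=["name", "description"])
    schema = ie.import_from_dataframe(df)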