diff --git a/Makefile b/Makefile index 024cd745..5832e279 100644 --- a/Makefile +++ b/Makefile @@ -12,9 +12,10 @@ MAKEFLAGS += --warn-undefined-variables build: mkdir build -.PHONY: doc time test sqlite_test pg_test -.PHONY: api_test sqlite_api_test pg_qpi_test -.PHONY: random_test_data random_test sqlite_random_test pg_random_test +.PHONY: doc readme valve_debug valve_release test sqlite_test pg_test api_test sqlite_api_test \ + pg_qpi_test random_test_data random_test sqlite_random_test pg_random_test guess_test_data \ + perf_test_data sqlite_perf_test pg_perf_test perf_test + doc: cargo doc --document-private-items @@ -23,13 +24,19 @@ readme: cargo readme --no-title > README.md valve: src/*.rs src/*.lalrpop + @$(MAKE) valve_debug + +valve_release: rm -f valve cargo build --release ln -s target/release/ontodev_valve valve - # cargo build - # ln -s target/debug/ontodev_valve valve -build/valve.db: test/src/table.tsv valve clean | build +valve_debug: + rm -f valve + cargo build + ln -s target/debug/ontodev_valve valve + +build/valve.db: test/src/table.tsv clean valve | build ./valve $< $@ test/output: @@ -37,7 +44,8 @@ test/output: test: sqlite_test pg_test api_test random_test -tables_to_test = column datatype rule table table1 table2 table3 table4 table5 table6 table7 table8 table9 table10 table11 +tables_to_test = column datatype rule table table1 table2 table3 table4 table5 table6 table7 table8 \ + table9 table10 table11 sqlite_test: build/valve.db test/src/table.tsv | test/output @echo "Testing valve on sqlite ..." @@ -93,10 +101,10 @@ random_test_dir = test/random_test_data random_test: sqlite_random_test pg_random_test $(random_test_dir)/ontology: - mkdir -p $(random_test_dir)/ontology + mkdir -p $@ -random_test_data: test/generate_random_test_data.py | $(random_test_dir)/ontology - ./$< $$(date +"%s") 100 5 $| +random_test_data: test/generate_random_test_data.py valve valve test/random_test_data/table.tsv | $(random_test_dir)/ontology + ./$< $$(date +"%s") 100 5 $(word 3,$^) $| sqlite_random_test: valve clean random_test_data | build test/output @echo "Testing with random data on sqlite ..." @@ -110,40 +118,66 @@ pg_random_test: valve clean random_test_data | build test/output test/round_trip.sh postgresql:///valve_postgres $(random_test_dir)/table.tsv @echo "Test succeeded!" -test/perf_test_data/ontology: test/generate_random_test_data.py - mkdir $@ - ./$< 1 10000 5 $@ +guess_test_dir = test/guess_test_data +guess_test_db = build/valve_guess.db + +$(guess_test_dir)/table1.tsv: test/generate_random_test_data.py valve $(guess_test_dir)/*.tsv + ./$< 0 30000 5 $(guess_test_dir)/table.tsv $(guess_test_dir) + +$(guess_test_dir)/ontology: + mkdir -p $@ + +guess_test_data: test/generate_random_test_data.py $(guess_test_dir)/table1.tsv valve confirm_overwrite.sh $(guess_test_dir)/*.tsv | $(guess_test_dir)/ontology + ./confirm_overwrite.sh $(guess_test_dir)/ontology + rm -f $(guess_test_dir)/table1.tsv + ./$< 0 30000 5 $(guess_test_dir)/table.tsv $(guess_test_dir) + rm -f $(guess_test_dir)/ontology/*.tsv + ./$< 0 30000 5 $(guess_test_dir)/table_expected.tsv $| + rm -f $(guess_test_dir)/ontology/table1.tsv -build/valve_perf.db: valve | test/perf_test_data/ontology build - @if [ -f $@ ]; \ - then \ - echo "'$@' exists but is out of date. 
To rebuild '$@', run \`make cleanperfdb\`" \ - "before running \`make $@\`" ; \ - false; \ - fi - time -p ./$< --verbose test/perf_test_data/table.tsv $@ +$(guess_test_db): valve guess_test_data $(guess_test_dir)/*.tsv | build $(guess_test_dir)/ontology + rm -f $@ + ./$< $(guess_test_dir)/table.tsv $@ + +perf_test_dir = test/perf_test_data +perf_test_db = build/valve_perf.db + +$(perf_test_dir)/ontology: + mkdir -p $@ + +perf_test_data: test/generate_random_test_data.py valve confirm_overwrite.sh $(perf_test_dir)/*.tsv | $(perf_test_dir)/ontology + ./confirm_overwrite.sh $(perf_test_dir)/ontology + rm -f $(perf_test_dir)/ontology/*.tsv + ./$< $$(date +"%s") 10000 5 $(perf_test_dir)/table.tsv $| + +$(perf_test_db): valve perf_test_data $(perf_test_dir)/*.tsv | build $(perf_test_dir)/ontology + rm -f $@ + time -p ./$< --verbose $(perf_test_dir)/table.tsv $@ -.PHONY: sqlite_perf_test sqlite_perf_test: build/valve_perf.db | test/output time -p scripts/export.py messages $< $| $(tables_to_test) -.PHONY: pg_perf_test -pg_perf_test: valve test/perf_test_data/ontology | test/output - time -p ./$< --verbose test/perf_test_data/table.tsv postgresql:///valve_postgres +pg_perf_test: valve $(perf_test_dir)/ontology | test/output + time -p ./$< --verbose $(perf_test_dir)/table.tsv postgresql:///valve_postgres time -p scripts/export.py messages postgresql:///valve_postgres $| $(tables_to_test) -.PHONY: perf_test perf_test: sqlite_perf_test pg_perf_test clean: - rm -Rf build/valve.db build/valve_random.db test/output $(random_test_dir)/ontology + rm -Rf build/valve.db* build/valve_random.db* test/output $(random_test_dir)/ontology valve -cleanperfdb: +clean_guess_db: + rm -Rf build/valve_guess.db + +clean_guess_data: + rm -Rf $(guess_test_dir)/table1.tsv $(guess_test_dir)/ontology + +clean_perf_db: rm -Rf build/valve_perf.db -cleanperfdata: - rm -Rf test/perf_test_data/ontology +clean_perf_data: + rm -Rf $(perf_test_dir)/ontology -cleanall: clean cleanperfdb cleanperfdata +cleanall: clean clean_perf_db clean_perf_data clean_guess_db clean_guess_data cargo clean - rm -Rf valve + rm -f valve diff --git a/confirm_overwrite.sh b/confirm_overwrite.sh new file mode 100755 index 00000000..aa58cd50 --- /dev/null +++ b/confirm_overwrite.sh @@ -0,0 +1,14 @@ +#!/usr/bin/env sh + +if [ -d $1 -a ! -z "$(ls -A $1)" ] +then + printf "$1 already exists and contains the following files: $(ls -A -m -w 0 $1)\nAre you sure (y/n)? " + read enter + if [ $enter = 'y' ] + then + exit 0 + else + echo "Understood. Exiting with error code." 
+ exit 1 + fi +fi diff --git a/scripts/guess.py b/scripts/guess.py new file mode 100755 index 00000000..0f9ab864 --- /dev/null +++ b/scripts/guess.py @@ -0,0 +1,524 @@ +#!/usr/bin/env python3 + +import csv +import json +import random +import re +import sqlite3 +import subprocess +import sys +import time + +from copy import deepcopy +from guess_grammar import grammar, TreeToDict + +from argparse import ArgumentParser +from lark import Lark +from numbers import Number +from pathlib import Path +from pprint import pformat +from textwrap import dedent + + +SPECIAL_TABLES = ["table", "column", "datatype", "rule", "history", "message"] +VERBOSE = False + + +def log(message, force=False, suppress_time=False): + global VERBOSE + + if force or VERBOSE: + if not suppress_time: + print(f"{time.asctime()} {message}", file=sys.stderr) + else: + print(f"{message}", file=sys.stderr) + + +def has_ncolumn(sample, ncolumn): + return bool([label for label in sample if sample[label]["normalized"] == ncolumn]) + + +def get_random_sample(table, sample_size): + # Get the number of rows in the file (we substract 1 for the header row): + with open(table, "rb") as f: + total_rows = sum(1 for _ in f) - 1 + + if total_rows <= sample_size: + sample_size = total_rows + sample_row_numbers = range(0, total_rows) + else: + sample_row_numbers = random.sample(range(0, total_rows), sample_size) + with open(table) as f: + rows = [r for r in csv.DictReader(f, delimiter="\t", quoting=csv.QUOTE_NONE)] + sample = {} + pattern = re.compile(r"[^0-9a-zA-Z_]+") + for i in sample_row_numbers: + for label, value in rows[i].items(): + if label not in sample: + ncolumn = re.sub(pattern, "_", label).casefold().strip("_") + if has_ncolumn(sample, ncolumn): + print( + "The data has more than one column with the normalized name " + f"{ncolumn}" + ) + sys.exit(1) + sample[label] = { + "normalized": ncolumn, + "values": [], + } + sample[label]["values"].append(value) + return sample + + +def get_valve_config(valve_table): + result = subprocess.run(["./valve", "--dump_config", valve_table], capture_output=True) + if result.returncode != 0: + error = result.stderr.decode() + output = result.stdout.decode() + if output: + error = f"{error}\n{output}" + print(f"{error}", file=sys.stderr) + sys.exit(result.returncode) + return json.loads(result.stdout.decode()) + + +def get_hierarchy_for_dt(config, primary_dt_name): + def get_parents(dt_name): + datatypes = [] + if dt_name is not None: + datatype = config["datatype"][dt_name] + if datatype["datatype"] != primary_dt_name: + datatypes.append(datatype) + datatypes += get_parents(datatype.get("parent")) + return datatypes + + return [config["datatype"][primary_dt_name]] + get_parents(primary_dt_name) + + +def get_dt_hierarchies(config): + """ + Given a VALVE configuration, return a datatype hierarchy that looks like this: + {0: {'dt_name_1': [{'datatype': 'dt_name_1', + 'description': 'a description', + ...}, + {'datatype': 'parent datatype', + 'description': 'a description', + ...}, + {'datatype': 'grandparent datatype', + 'description': 'a description', + ...}, + ...], + 'dt_name_2': etc.}, + 1: ... 
etc.} + """ + + def get_higher_datatypes(datatype_hierarchies, universals, depth): + current_datatypes = [dt_name for dt_name in datatype_hierarchies.get(depth, [])] + higher_datatypes = {} + if current_datatypes: + universals = [dt_name for dt_name in universals] + lower_datatypes = [] + for i in range(0, depth): + lower_datatypes += [dt_name for dt_name in datatype_hierarchies.get(i, [])] + for dt_name in dt_hierarchies[depth]: + dt_hierarchy = dt_hierarchies[depth][dt_name] + if len(dt_hierarchy) > 1: + parent_hierarchy = dt_hierarchy[1:] + parent = parent_hierarchy[0]["datatype"] + if parent not in current_datatypes + lower_datatypes + universals: + higher_datatypes[parent] = parent_hierarchy + return higher_datatypes + + dt_config = config["datatype"] + dt_names = [dt_name for dt_name in dt_config] + dt_hierarchies = {0: {}} + universals = {} + for dt_name in dt_names: + # Add all the leaf datatypes to dt_hierarchies at 0 depth: + children = [child for child in dt_names if dt_config[child].get("parent") == dt_name] + if not children: + dt_hierarchies[0][dt_name] = get_hierarchy_for_dt(config, dt_name) + # Ungrounded and unconditioned datatypes go into the universals category, which are added + # to the top of dt_hierarchies later: + elif not dt_config[dt_name].get("parent") or not dt_config[dt_name].get("condition"): + universals[dt_name] = get_hierarchy_for_dt(config, dt_name) + + depth = 0 + higher_dts = get_higher_datatypes(dt_hierarchies, universals, depth) + while higher_dts: + depth += 1 + dt_hierarchies[depth] = deepcopy(higher_dts) + higher_dts = get_higher_datatypes(dt_hierarchies, universals, depth) + dt_hierarchies[depth + 1] = universals + return dt_hierarchies + + +def get_sql_type(config, datatype): + """Given the config map and the name of a datatype, climb the datatype tree (as required), + and return the first 'SQLite type' found.""" + if "datatype" not in config: + print("Missing datatypes in config") + sys.exit(1) + if datatype not in config["datatype"]: + return None + if config["datatype"][datatype].get("SQLite type"): + return config["datatype"][datatype]["SQLite type"] + return get_sql_type(config, config["datatype"][datatype].get("parent")) + + +def get_potential_foreign_columns(config, datatype): + global SPECIAL_TABLES + + def get_coarser_sql_type(datatype): + sql_type = get_sql_type(config, datatype) + if sql_type not in ["integer", "numeric", "real"]: + return "text" + else: + return sql_type.casefold() + + potential_foreign_columns = [] + this_sql_type = get_coarser_sql_type(datatype) + for table, table_config in config["table"].items(): + if table not in SPECIAL_TABLES: + for column, column_config in table_config["column"].items(): + if column_config.get("structure") in ["primary", "unique"]: + foreign_sql_type = get_coarser_sql_type(column_config["datatype"]) + if foreign_sql_type == this_sql_type: + potential_foreign_columns.append( + { + "table": table, + "column": column, + "sql_type": foreign_sql_type, + } + ) + return potential_foreign_columns + + +SAVED_CONDITIONS = {} + + +def get_compiled_condition(condition, parser): + global SAVED_CONDITIONS + + if condition in SAVED_CONDITIONS: + return SAVED_CONDITIONS[condition] + + parsed_condition = parser.parse(condition) + if len(parsed_condition) != 1: + print( + f"'{condition}' is invalid. 
Only one condition per column is allowed.", file=sys.stderr + ) + sys.exit(1) + parsed_condition = parsed_condition[0] + if parsed_condition["type"] == "function" and parsed_condition["name"] == "equals": + expected = re.sub(r"^['\"](.*)['\"]$", r"\1", parsed_condition["args"][0]["value"]) + compiled_condition = lambda x: x == expected + elif parsed_condition["type"] == "function" and parsed_condition["name"] in ( + "exclude", + "match", + "search", + ): + pattern = re.sub(r"^['\"](.*)['\"]$", r"\1", parsed_condition["args"][0]["pattern"]) + flags = parsed_condition["args"][0]["flags"] + flags = "(?" + "".join(flags) + ")" if flags else "" + pattern = re.compile(flags + pattern) + if parsed_condition["name"] == "exclude": + compiled_condition = lambda x: not bool(pattern.search(x)) + elif parsed_condition["name"] == "match": + compiled_condition = lambda x: bool(pattern.fullmatch(x)) + else: + compiled_condition = lambda x: bool(pattern.search(x)) + elif parsed_condition["type"] == "function" and parsed_condition["name"] == "in": + alternatives = [ + re.sub(r"^['\"](.*)['\"]$", r"\1", arg["value"]) for arg in parsed_condition["args"] + ] + compiled_condition = lambda x: x in alternatives + else: + print(f"Unrecognized condition: {condition}", file=sys.stderr) + sys.exit(1) + + SAVED_CONDITIONS[condition] = compiled_condition + return compiled_condition + + +def annotate(label, sample, config, error_rate, is_primary_candidate): + def has_nulltype(target): + num_values = len(target["values"]) + num_empties = target["values"].count("") + return num_empties / num_values > error_rate + + def has_duplicates(target, ignore_empties): + if ignore_empties: + values = [v for v in target["values"] if v != ""] + else: + values = target["values"] + distinct_values = set(values) + return (len(values) - len(distinct_values)) > (error_rate * len(values)) + + def get_datatype(target, dt_hierarchies): + def is_match(datatype): + # If the datatype has no associated condition then it matches anything: + if not datatype.get("condition"): + return True + # If the SQLite type is NULL this datatype is ruled out: + sqlite_type = datatype.get("SQLite type") + if sqlite_type and sqlite_type.casefold() == "null": + return False + + condition = get_compiled_condition(datatype["condition"], config["parser"]) + num_values = len(target["values"]) + num_passed = [condition(v) for v in target["values"]].count(True) + success_rate = num_passed / num_values + if (1 - success_rate) <= error_rate: + return success_rate + + def tiebreak(datatypes): + in_types = [] + other_types = [] + parents = set([dt["datatype"].get("parent") for dt in datatypes]) + parents.discard(None) + for dt in datatypes: + if dt["datatype"]["datatype"] not in parents: + if dt["datatype"].get("condition", "").lstrip().startswith("in("): + in_types.append(dt) + else: + other_types.append(dt) + + if len(in_types) == 1: + return in_types[0]["datatype"] + elif len(in_types) > 1: + in_types = sorted(in_types, key=lambda k: k["success_rate"], reverse=True) + return in_types[0]["datatype"] + elif len(other_types) == 1: + return other_types[0]["datatype"] + elif len(other_types) > 1: + other_types = sorted(other_types, key=lambda k: k["success_rate"], reverse=True) + return other_types[0]["datatype"] + else: + print(f"Error tiebreaking datatypes: {pformat(datatypes)}") + sys.exit(1) + + for depth in range(0, len(dt_hierarchies)): + datatypes_to_check = [dt_hierarchies[depth][dt][0] for dt in dt_hierarchies[depth]] + matching_datatypes = [] + for datatype in 
datatypes_to_check: + success_rate = is_match(datatype) + if success_rate: + matching_datatypes.append({"datatype": datatype, "success_rate": success_rate}) + + if len(matching_datatypes) == 1: + return matching_datatypes[0]["datatype"] + elif len(matching_datatypes) > 1: + return tiebreak(matching_datatypes) + + def get_from(target, potential_foreign_columns): + candidate_froms = [] + for foreign in potential_foreign_columns: + table = foreign["table"] + column = foreign["column"] + sql_type = foreign["sql_type"] + num_matches = 0 + num_values = len(target["values"]) + for value in target["values"]: + if target.get("nulltype") == "empty" and value == "": + # If this value is legitimately empty then it should not be taken into account + # when counting the number of values in the target that are found in the + # candidate foreign column: + num_values -= 1 + continue + if sql_type != "text" and not isinstance(value, Number): + # If this value is of the wrong type then there is no need to explicitly check + # if it exists in the foreign column: + continue + if sql_type == "text": + value = f"'{value}'" + sql = f'SELECT 1 FROM "{table}" WHERE "{column}" = {value} LIMIT 1' + num_matches += len(config["db"].execute(sql).fetchall()) + if ((num_values - num_matches) / num_values) < error_rate: + candidate_froms.append(f"from({foreign['table']}.{foreign['column']})") + return candidate_froms + + target = sample[label] + if has_nulltype(target): + target["nulltype"] = "empty" + + # Use the valve config to retrieve the valve datatype hierarchies: + dt_hierarchies = get_dt_hierarchies(config) + target["datatype"] = get_datatype(target, dt_hierarchies)["datatype"] + + # Use the valve config to get a list of columns already loaded to the database, then compare + # the contents of each column with the contents of the target column and possibly annotate the + # target with a from() structure, if there is one and only one candidate from(). + potential_foreign_columns = get_potential_foreign_columns(config, target["datatype"]) + froms = get_from(target, potential_foreign_columns) + if len(froms) == 1: + target["structure"] = froms[0] + elif len(froms) > 1: + print(f"Column '{label}' has multiple from() candidates: {', '.join(froms)}") + + # Check if the column is a unique/primary column: + if not target.get("structure"): + if target.get("nulltype") is None and not has_duplicates(target, True): + if is_primary_candidate: + target["structure"] = "primary" + else: + target["structure"] = "unique" + + +if __name__ == "__main__": + parser = ArgumentParser(description="VALVE guesser (prototype)") + parser.add_argument("--verbose", action="store_true", help="Print logging output to STDERR.") + parser.add_argument( + "--sample_size", + type=int, + default=10000, + help="Sample size to use when guessing (default: 10,000)", + ) + parser.add_argument( + "--error_rate", + type=float, + default=0.1, + help="""A number between 0 and 1 (inclusive) representing the proportion of errors expected + (default: 0.1)""", + ) + parser.add_argument( + "--enum_size", + type=int, + default=10, + help="The maximum number of values to use for in(...) 
datatype conditions", + ) + parser.add_argument( + "--seed", type=int, help="Seed to use for random sampling (default: current epoch time)" + ) + parser.add_argument( + "--yes", + action="store_true", + help="Do not ask for confirmation before writing suggested modifications to the database", + ) + parser.add_argument( + "VALVE_TABLE", help="The VALVE table table from which to read the VALVE configuration" + ) + parser.add_argument( + "DATABASE", + help="""Can be one of (A) A URL of the form `postgresql://...` or + `sqlite://...` (B) The filename (including path) of a sqlite database.""", + ) + parser.add_argument( + "TABLE", help="A .TSV file containing the data for which we will be guessing" + ) + args = parser.parse_args() + + VERBOSE = args.verbose + + # Use the seed argument, or the epoch time if no seed is given, to set up the random generator: + if args.seed is not None: + seed = args.seed + else: + seed = time.time_ns() + random.seed(seed) + + # Get the valve configuration and database info: + config = get_valve_config(args.VALVE_TABLE) + table_tsv = args.TABLE + table = Path(args.TABLE).stem + if table in config["table"]: + print(f"{table} is already configured.", file=sys.stderr) + sys.exit(0) + with sqlite3.connect(args.DATABASE) as conn: + config["db"] = conn + + # Attach the condition parser to the config as well: + config["parser"] = Lark(grammar, parser="lalr", transformer=TreeToDict()) + + log(f"Getting random sample of {args.sample_size} rows from {table_tsv} ...") + sample = get_random_sample(table_tsv, args.sample_size) + for i, label in enumerate(sample): + log(f"Annotating label '{label}' ...") + annotate(label, sample, config, args.error_rate, i == 0) + log("Done!") + + table_table_headers = ["table", "path", "type", "description"] + column_table_headers = [ + "table", + "column", + "label", + "nulltype", + "datatype", + "structure", + "description", + ] + if not args.yes: + print() + + print('The following row will be inserted to "table":') + data = [table_table_headers, [f"{table}", f"{table_tsv}", "", ""]] + # We add +2 for padding + col_width = max(len(word) for row in data for word in row) + 2 + for row in data: + print("".join(word.ljust(col_width) for word in row)) + + print() + + print('The following row will be inserted to "column":') + data = [column_table_headers] + for label in sample: + row = [ + f"{table}", + f"{sample[label]['normalized']}", + f"{label if label != sample[label]['normalized'] else ''}", + f"{sample[label].get('nulltype', '')}", + f"{sample[label]['datatype']}", + f"{sample[label].get('structure', '')}", + f"{sample[label].get('description', '')}", + ] + data.append(row) + # We add +2 for padding + col_width = max(len(word) for row in data for word in row) + 2 + for row in data: + print("".join(word.ljust(col_width) for word in row)) + + print() + + answer = input("Do you want to write this updated configuration to the database? 
(y/n) ") + if answer.casefold() != "y": + print("Not writing updated configuration to the database.") + sys.exit(0) + + log("Updating table configuration in database ...") + row_number = conn.execute('SELECT MAX(row_number) FROM "table"').fetchall()[0][0] + 1 + sql = dedent( + f""" + INSERT INTO "table" ("row_number", {', '.join([f'"{k}"' for k in table_table_headers])}) + VALUES ({row_number}, '{table}', '{table_tsv}', NULL, NULL)""" + ) + log(sql, suppress_time=True) + log("", suppress_time=True) + conn.execute(sql) + conn.commit() + + log("Updating column configuration in database ...") + row_number = conn.execute('SELECT MAX(row_number) FROM "column"').fetchall()[0][0] + 1 + for label in sample: + values = ", ".join( + [ + f"{row_number}", + f"'{table}'", + f"'{sample[label]['normalized']}'", + f"'{label}'" if label != sample[label]["normalized"] else "NULL", + f"'{sample[label]['nulltype']}'" if sample[label].get("nulltype") else "NULL", + f"'{sample[label]['datatype']}'", + f"'{sample[label]['structure']}'" if sample[label].get("structure") else "NULL", + f"'{sample[label]['description']}'" if sample[label].get("description") else "NULL", + ] + ) + sql = dedent( + f""" + INSERT INTO "column" ("row_number", {', '.join([f'"{k}"' for k in column_table_headers])}) + VALUES ({values})""" + ) + log(sql, suppress_time=True) + conn.execute(sql) + conn.commit() + row_number += 1 + log("", suppress_time=True) + log("Done!") diff --git a/scripts/guess_grammar.py b/scripts/guess_grammar.py new file mode 100644 index 00000000..5e611cfb --- /dev/null +++ b/scripts/guess_grammar.py @@ -0,0 +1,120 @@ +from lark import Transformer + +# Grammar used to parse the the contents of `condition` and `structure` columns. +# See: https://lark-parser.readthedocs.io/en/latest/index.html# +grammar = r""" +%import common.WS +%ignore WS + +start: expression+ +?expression: string | function + +?string: label +label: ALPHANUM | DQSTRING | SQSTRING + +function: function_name "(" arguments ")" +function_name: ALPHANUM +arguments: argument ("," argument)* +?argument: string | field | function | named_arg | regex +field: label "." 
label +named_arg: label "=" label + +?regex: regex_sub | regex_match +regex_match: "/" regex_pattern "/" regex_flags +regex_sub: SUB_BEGIN "/" regex_pattern "/" regex_pattern "/" regex_flags +regex_pattern: REGEX_WITH_FORWARD_SLASH | REGEX_WITHOUT_FORWARD_SLASH +regex_flags: LOWER_ALPHA* + +SUB_BEGIN: "s" +ALPHANUM: /[a-zA-Z0-9-_]/+ +DQSTRING: "\"" /[^"](\\\")?/* "\"" +SQSTRING: "'" /[^'](\\\')?/* "'" +LOWER_ALPHA: /[a-z]/ +NO_SLASH: /[^\/]/ +REGEX_WITH_FORWARD_SLASH: NO_SLASH* "\\/" NO_SLASH* +REGEX_WITHOUT_FORWARD_SLASH: NO_SLASH+ +""" + + +class TreeToDict(Transformer): + """Transformer to convert a Tree, generated by the grammar used by CMI-PB to parse the contents + of `condition` and `structure` columns, into a list of expressions represented as dicts.""" + + def _sanity_check(self, token_list, expected_len): + if len(token_list) != expected_len: + raise Exception(f"Wrong number of tokens in: {token_list} (expecting {expected_len})") + + def label(self, label): + self._sanity_check(label, 1) + label = label[0] + return {"type": "label", "value": label.value} + + def field(self, field): + self._sanity_check(field, 2) + return {"type": "field", "table": field[0]["value"], "column": field[1]["value"]} + + def named_arg(self, named_arg): + self._sanity_check(named_arg, 2) + return {"type": "named_arg", "key": named_arg[0]["value"], "value": named_arg[1]["value"]} + + def regex_match(self, regex_match): + self._sanity_check(regex_match, 2) + return {"type": "regex", "pattern": regex_match[0], "flags": regex_match[1]} + + def regex_sub(self, regex_sub): + self._sanity_check(regex_sub, 4) + return { + "type": "regex", + "pattern": regex_sub[1], + "replace": regex_sub[2], + "flags": regex_sub[3], + } + + def regex_pattern(self, regex_pattern): + self._sanity_check(regex_pattern, 1) + return regex_pattern[0].value + + def regex_flags(self, flags): + return [flag.value for flag in flags] + + def arguments(self, arguments): + return arguments + + def function_name(self, function_name): + self._sanity_check(function_name, 1) + return function_name[0].value + + def function(self, function): + self._sanity_check(function, 2) + return {"type": "function", "name": function[0], "args": function[1]} + + def start(self, start): + return start + + +def reverse_parse(config, parsed_cond): + """Given a config map and a parsed condition, return the text version of the condition.""" + cond_type = parsed_cond["type"] + text_cond = None + if cond_type == "label": + if config["datatype"].get(parsed_cond["value"]): + text_cond = config["datatype"][parsed_cond["value"]]["datatype"] + else: + text_cond = "'{}'".format(parsed_cond["value"]) + elif cond_type == "field": + return "{}.{}".format(parsed_cond["table"], parsed_cond["column"]) + elif cond_type == "named_arg": + text_cond = "{}={}".format(parsed_cond["key"], parsed_cond["value"]) + elif cond_type == "regex": + pattern = parsed_cond["pattern"] + flags = "".join(parsed_cond["flags"]) + replace = parsed_cond.get("replace") + text_cond = f"/{pattern}/{flags}" if not replace else f"s/{pattern}/{replace}/{flags}" + elif cond_type == "function": + text_cond = map(lambda arg: reverse_parse(config, arg), parsed_cond["args"]) + text_cond = ", ".join(text_cond) + text_cond = "{}({})".format(parsed_cond["name"], text_cond) + else: + raise Exception(f"Unknown parsed_cond type: {cond_type} for {parsed_cond}") + + return text_cond diff --git a/scripts/requirements.txt b/scripts/requirements.txt new file mode 100644 index 00000000..9547a85f --- /dev/null +++ 
b/scripts/requirements.txt @@ -0,0 +1 @@ +lark==1.1.8 diff --git a/src/lib.rs b/src/lib.rs index 16de0d94..296fd7cf 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -969,31 +969,32 @@ pub async fn configure_db( // use that information to create the associated database tables, while saving constraint // information to constrains_config. let mut setup_statements = HashMap::new(); - let table_names: Vec = tables_config.keys().cloned().collect(); - for table_name in table_names { + for table_name in tables_config.keys().cloned().collect::>() { let optional_path = tables_config .get(&table_name) .and_then(|r| r.get("path")) .and_then(|p| p.as_str()); - let path; + let mut path = None; match optional_path { - // If an entry of the tables_config has no path then it is an internal table which need - // not be configured explicitly. Currently the only example is the message table. - None => continue, + None => { + // If an entry of the tables_config has no path then it is an internal table which + // need not be configured explicitly. Currently the only examples are the message + // and history tables. + if table_name != "message" && table_name != "history" { + panic!("No path defined for table {}", table_name); + } + continue; + } Some(p) if !Path::new(p).is_file() => { eprintln!("WARN: File does not exist {}", p); - continue; } Some(p) if Path::new(p).canonicalize().is_err() => { eprintln!("WARN: File path could not be made canonical {}", p); - continue; } - - Some(p) => path = p.to_string(), + Some(p) => path = Some(p.to_string()), }; - // Get the columns that have been previously configured: let defined_columns: Vec = tables_config .get(&table_name) .and_then(|r| r.get("column")) @@ -1003,65 +1004,58 @@ pub async fn configure_db( .and_then(|k| Some(k.collect())) .unwrap(); - // Get the actual columns from the data itself. Note that we set has_headers to false - // (even though the files have header rows) in order to explicitly read the header row. - let mut rdr = csv::ReaderBuilder::new() - .has_headers(false) - .delimiter(b'\t') - .from_reader(File::open(path.clone()).unwrap_or_else(|err| { - panic!("Unable to open '{}': {}", path.clone(), err); - })); - let mut iter = rdr.records(); - let actual_columns; - if let Some(result) = iter.next() { - actual_columns = result.unwrap(); - } else { - panic!("'{}' is empty", path); - } - // We use column_order to explicitly indicate the order in which the columns should appear - // in the table, for later reference. + // in the table, for later reference. The default is to preserve the order from the actual + // table file. If that does not exist, we use the ordering in defined_columns. let mut column_order = vec![]; - let mut all_columns: SerdeMap = SerdeMap::new(); - for column_name in &actual_columns { - let column; - if !defined_columns.contains(&column_name.to_string()) { - let mut cmap = SerdeMap::new(); - cmap.insert( - String::from("table"), - SerdeValue::String(table_name.to_string()), - ); - cmap.insert( - String::from("column"), - SerdeValue::String(column_name.to_string()), - ); - cmap.insert( - String::from("nulltype"), - SerdeValue::String(String::from("empty")), - ); - cmap.insert( - String::from("datatype"), - SerdeValue::String(String::from("text")), - ); - column = SerdeValue::Object(cmap); - } else { - column = tables_config - .get(&table_name) - .and_then(|r| r.get("column")) - .and_then(|v| v.as_object()) - .and_then(|o| o.get(column_name)) + if let Some(path) = path { + // Get the actual columns from the data itself. 
Note that we set has_headers to + // false(even though the files have header rows) in order to explicitly read the + // header row. + let mut rdr = csv::ReaderBuilder::new() + .has_headers(false) + .delimiter(b'\t') + .from_reader(File::open(path.clone()).unwrap_or_else(|err| { + panic!("Unable to open '{}': {}", path.clone(), err); + })); + let mut iter = rdr.records(); + if let Some(result) = iter.next() { + let actual_columns = result .unwrap() - .clone(); + .iter() + .map(|c| c.to_string()) + .collect::>(); + // Make sure that the actual columns found in the table file, and the columns + // defined in the column config, exactly match in terms of their content: + for column_name in &actual_columns { + column_order.push(json!(column_name)); + if !defined_columns.contains(&column_name.to_string()) { + panic!( + "Column '{}.{}' not in column config", + table_name, column_name + ); + } + } + for column_name in &defined_columns { + if !actual_columns.contains(&column_name.to_string()) { + panic!( + "Defined column '{}.{}' not found in table", + table_name, column_name + ); + } + } + } else { + panic!("'{}' is empty", path); } - column_order.push(SerdeValue::String(column_name.to_string())); - all_columns.insert(column_name.to_string(), column); } + if column_order.is_empty() { + column_order = defined_columns.iter().map(|c| json!(c)).collect::>(); + } tables_config .get_mut(&table_name) .and_then(|t| t.as_object_mut()) .and_then(|o| { - o.insert(String::from("column"), SerdeValue::Object(all_columns)); o.insert( String::from("column_order"), SerdeValue::Array(column_order), @@ -1097,9 +1091,11 @@ pub async fn configure_db( } // Sort the tables according to their foreign key dependencies so that tables are always loaded - // after the tables they depend on: - let unsorted_tables: Vec = setup_statements.keys().cloned().collect(); - let sorted_tables = verify_table_deps_and_sort(&unsorted_tables, &constraints_config); + // after the tables they depend on. 
Ignore the internal message and history tables: + let sorted_tables = verify_table_deps_and_sort( + &setup_statements.keys().cloned().collect(), + &constraints_config, + ); if *command != ValveCommand::Config || verbose { // Generate DDL for the history table: diff --git a/test/expected/table3.tsv b/test/expected/table3.tsv index 04c78efc..c0f31eda 100644 --- a/test/expected/table3.tsv +++ b/test/expected/table3.tsv @@ -1,12 +1,12 @@ -source id label type parent -MOB MOB:0000013 mobecular entity owl:Class material entity -ZOB ZOB:0000013 bar owl:Class car -JOB JOB:0000013 car owl:Class foo -SOB SOB:0000013 foo owl:Class bar -YOB YOB:0000013 mar owl:Class jafar -COB BFO:0000040 material entity owl:Class owl:Thing -CO B COB:0000013 molecular dentity owl:Class material entity -COB COB:0000013 molecular entity owl:Class material entity -COB VO:0000001 vaccine owl:Class material entity -BOB VO:0000001 vaccine owl:Class material entity -BFOBBER BFO:0000027 bazaar owl:Class barrie +source id label type parent related +MOB MOB:0000013 mobecular entity owl:Class material entity +ZOB ZOB:0000013 bar owl:Class car +JOB JOB:0000013 car owl:Class foo +SOB SOB:0000013 foo owl:Class bar +YOB YOB:0000013 mar owl:Class jafar +COB BFO:0000040 material entity owl:Class owl:Thing +CO B COB:0000013 molecular dentity owl:Class material entity +COB COB:0000013 molecular entity owl:Class material entity +COB VO:0000001 vaccine owl:Class material entity +BOB VO:0000001 vaccine owl:Class material entity +BFOBBER BFO:0000027 bazaar owl:Class barrie diff --git a/test/generate_random_test_data.py b/test/generate_random_test_data.py index 3f8d988d..87008651 100755 --- a/test/generate_random_test_data.py +++ b/test/generate_random_test_data.py @@ -1,209 +1,65 @@ #!/usr/bin/env python3 +import json import math import random import string +import subprocess +import sys from argparse import ArgumentParser TOKEN_LENGTH = 9 +WINDOW_SIZE = 50 -CONFIG = { - "table1": { - "prefix": { - "allow_empty": False, - "datatype": "prefix", - "structure": { - "type": "primary", - }, - }, - "base": { - "allow_empty": False, - "datatype": "IRI", - "structure": { - "type": "unique", - }, - }, - "ontology IRI": { - "allow_empty": True, - "datatype": "IRI", - }, - "version IRI": { - "allow_empty": True, - "datatype": "IRI", - }, - }, - "table2": { - "child": { - "allow_empty": False, - "datatype": "trimmed_line", - "structure": { - "type": "foreign", - "ftable": "table4", - "fcolumn": "other_foreign_column", - }, - }, - "parent": { - "allow_empty": True, - "datatype": "trimmed_line", - "structure": { - "type": "tree", - "tcolumn": "child", - }, - }, - "xyzzy": { - "allow_empty": True, - "datatype": "trimmed_line", - "structure": { - "type": "under", - "ttable": "table2", - "tcolumn": "child", - "uval": "d", - }, - }, - "foo": { - "allow_empty": True, - "datatype": "integer", - "structure": { - "type": "foreign", - "ftable": "table4", - "fcolumn": "numeric_foreign_column", - }, - }, - "bar": { - "allow_empty": True, - "datatype": "text", - }, - }, - "table3": { - "source": { - "allow_empty": False, - "datatype": "prefix", - "structure": { - "type": "foreign", - "ftable": "table1", - "fcolumn": "prefix", - }, - }, - "id": { - "allow_empty": False, - "datatype": "curie", - "structure": { - "type": "unique", - }, - }, - "label": { - "allow_empty": False, - "datatype": "label", - "structure": { - "type": "primary", - }, - }, - "parent": { - "allow_empty": True, - "datatype": "label", - "structure": { - "type": "tree", - "tcolumn": "label", - }, - 
}, - "related": { - "allow_empty": True, - "datatype": "trimmed_line", - }, - }, - "table4": { - "foreign_column": { - "allow_empty": False, - "datatype": "text", - "structure": { - "type": "unique", - }, - }, - "other_foreign_column": { - "allow_empty": False, - "datatype": "text", - "structure": { - "type": "unique", - }, - }, - "numeric_foreign_column": { - "allow_empty": False, - "datatype": "integer", - "structure": { - "type": "primary", - }, - }, - }, - "table5": { - "foo": { - "allow_empty": False, - "datatype": "word", - "structure": { - "type": "primary", - }, - }, - "bar": { - "allow_empty": False, - "datatype": "integer", - }, - }, - "table6": { - "child": { - "allow_empty": False, - "datatype": "integer", - "structure": { - "type": "foreign", - "ftable": "table4", - "fcolumn": "numeric_foreign_column", - }, - }, - "parent": { - "allow_empty": True, - "datatype": "integer", - "structure": { - "type": "tree", - "tcolumn": "child", - }, - }, - "xyzzy": { - "allow_empty": True, - "datatype": "integer", - "structure": { - "type": "under", - "ttable": "table6", - "tcolumn": "child", - "uval": "4", - }, - }, - "foo": { - "allow_empty": True, - "datatype": "text", - }, - "bar": { - "allow_empty": True, - "datatype": "integer", - }, - }, -} - - -def get_value_from_prev_insert(prev_inserts, from_table, from_column, to_table, to_column): - global CONFIG +def get_special_tables(config): + return [k for k, v in config["special"].items() if v is not None] + + +def get_table_columns(config, table): + return [column for column in config["table"][table]["column_order"]] + + +def has_nulltype(config, table, column): + return bool(config["table"][table]["column"][column].get("nulltype")) + + +def get_column_structure(config, table, column): + return config["table"][table]["column"][column].get("structure") + + +def get_column_datatype(config, table, column): + return config["table"][table]["column"][column]["datatype"] + + +def get_foreign_key(config, table, column): + return [f for f in config["constraints"]["foreign"][table] if f["column"] == column][0] + + +def get_tree(config, table, column): + return [f for f in config["constraints"]["tree"][table] if f["parent"] == column][0] + + +def get_under(config, table, column): + return [f for f in config["constraints"]["under"][table] if f["column"] == column][0] + + +def get_value_from_prev_insert(config, prev_inserts, from_table, from_column, to_table, to_column): + global WINDOW_SIZE # Note: because we are loading the tables and columns in the correct order (i.e. such that # all dependencies are loaded before the tables and columns they depend on), the list of # previous inserts for the from_table/from_column will never be empty. 
if len(prev_inserts[from_table][from_column]) == 1: - if CONFIG[to_table][to_column]["allow_empty"]: + if has_nulltype(config, to_table, to_column): return "" else: return prev_inserts[from_table][from_column][0] else: - # Select at random from the last 100 inserted values: - prev_inserts[from_table][from_column] = prev_inserts[from_table][from_column][-100:] + # Select at random from the last N inserted values, with N given by WINDOW_SIZE: + prev_inserts[from_table][from_column] = prev_inserts[from_table][from_column][-WINDOW_SIZE:] from_values = prev_inserts[from_table][from_column] # We'd ideally like to exclude the last inserted value from consideration, but we save it # here in case we cannot: @@ -219,58 +75,58 @@ def get_value_from_prev_insert(prev_inserts, from_table, from_column, to_table, return values_to_choose_from[random.randrange(len(values_to_choose_from))] -def get_constrained_cell_value(table, column, row_num, prev_inserts): +def get_constrained_cell_value(config, table, column, row_num, prev_inserts): global TOKEN_LENGTH - global CONFIG - - structure = CONFIG[table][column].get("structure") - if structure and structure["type"] == "foreign": - ftable = structure["ftable"] - fcolumn = structure["fcolumn"] - cell = get_value_from_prev_insert(prev_inserts, ftable, fcolumn, table, column) - elif structure and structure["type"] == "tree": - tcolumn = structure["tcolumn"] - cell = get_value_from_prev_insert(prev_inserts, table, tcolumn, table, column) - elif structure and structure["type"] == "under": + + structure = get_column_structure(config, table, column) + datatype = get_column_datatype(config, table, column).casefold() + if structure.startswith("from("): + fkey = get_foreign_key(config, table, column) + ftable = fkey["ftable"] + fcolumn = fkey["fcolumn"] + cell = get_value_from_prev_insert(config, prev_inserts, ftable, fcolumn, table, column) + elif structure.startswith("tree("): + tkey = get_tree(config, table, column) + tcolumn = tkey["child"] + cell = get_value_from_prev_insert(config, prev_inserts, table, tcolumn, table, column) + elif structure.startswith("under("): # Note that properly satisfying the under constraint requires, not only that # the cell is in the specified tree column, but also (a) that the tree # actually exists, and (b) that the value is "under" the under value. To do # this properly, though, would require a decent amount of memory. So perhaps # it's not worth it to check for (a) and (b) and allow any offending cells # to generate errors which we can then verify are handled properly by valve. 
- ttable = structure["ttable"] - tcolumn = structure["tcolumn"] - cell = get_value_from_prev_insert(prev_inserts, ttable, tcolumn, table, column) - elif CONFIG[table][column]["datatype"] in [ + ukey = get_under(config, table, column) + ttable = ukey["ttable"] + tcolumn = ukey["tcolumn"] + cell = get_value_from_prev_insert(config, prev_inserts, ttable, tcolumn, table, column) + elif datatype in [ "prefix", - "IRI", + "iri", "trimmed_line", "label", "word", ]: cell = "".join(random.choices(string.ascii_lowercase, k=TOKEN_LENGTH)) - elif CONFIG[table][column]["datatype"] == "curie": + elif datatype == "curie": cell = ( "".join(random.choices(string.ascii_lowercase, k=3)).upper() + ":" + "".join(random.choices(string.ascii_lowercase, k=TOKEN_LENGTH)) ) - elif CONFIG[table][column]["datatype"] == "text": + elif datatype == "text": cell = ( "".join(random.choices(string.ascii_lowercase, k=TOKEN_LENGTH)) + " " + "".join(random.choices(string.ascii_lowercase, k=TOKEN_LENGTH)) ) - elif CONFIG[table][column]["datatype"] == "integer": + elif datatype == "integer": # No leading 0s: cell = "".join(random.choices("123456789", k=1)) + "".join( random.choices(string.digits, k=TOKEN_LENGTH - 1) ) else: - print( - f"Warning: Unknown datatype: {CONFIG[table][column]['datatype']}. " - "Generating a random string." - ) + print(f"Warning: Unknown datatype: {datatype}. Generating a random string.") cell = "".join(random.choices(string.ascii_lowercase, k=TOKEN_LENGTH)) return cell @@ -278,20 +134,20 @@ def get_constrained_cell_value(table, column, row_num, prev_inserts): def main(): global TOKEN_LENGTH - global CONFIG parser = ArgumentParser( description=""" Deterministically generate a specified amount of data, a specified percentage of which are - errors, using a hard-coded VALVE configuration, given the specified seed, to a specified - output directory. - """ + errors, using the given VALVE table configuration and seed, to the output directory.""" ) parser.add_argument("seed", help="The seed to use to generate the random data") parser.add_argument("num_rows", help="The number of rows per table to generate") parser.add_argument( "pct_errors", help="The percentage of rows in each table that should have errors" ) + parser.add_argument( + "input_table", help="The .TSV file representing the VALVE table configuration" + ) parser.add_argument( "output_dir", help="The output directory to write the new table configuration to" ) @@ -299,48 +155,65 @@ def main(): seed = int(args.seed) num_rows = int(args.num_rows) pct_errors = int(args.pct_errors) + input_table = args.input_table outdir = args.output_dir + # Use the seed argument to seed the random data that will be generated: random.seed(seed) + # Get the VALVE configuration: + result = subprocess.run(["./valve", "--dump_config", input_table], capture_output=True) + if result.returncode != 0: + error = result.stderr.decode() + output = result.stdout.decode() + if output: + error = f"{error}\n{output}" + print(f"{error}", file=sys.stderr) + sys.exit(result.returncode) + config = json.loads(result.stdout.decode()) + # This is a record of the last inserted values for each table and column. When one column # takes its values from another column, then we look here and fetch the last inserted value of # the second column. 
prev_inserts = {} + + # The data tables to generate: + data_tables = [t for t in config["sorted_table_list"] if t not in get_special_tables(config)] + + # The TSV files corresponding to each data table: tsv_files = {} - tables_in_order = ["table4", "table1", "table2", "table3", "table5", "table6"] - for table in tables_in_order: + for table in data_tables: tsv_files[table] = open(f"{outdir}/{table}.tsv", "w") - columns = [column for column in CONFIG[table]] + columns = get_table_columns(config, table) print("\t".join(columns), file=tsv_files[table]) num_error_rows = math.ceil((pct_errors / 100) * num_rows) error_proportion = None if not num_error_rows else math.floor(num_rows / num_error_rows) for row_num in range(1, num_rows + 1): - for table in tables_in_order: + for table in data_tables: is_error_row = error_proportion and row_num % error_proportion == 1 - columns = [column for column in CONFIG[table]] + columns = get_table_columns(config, table) error_column = random.randrange(len(columns)) row = {} for column_num, column in enumerate(columns): is_error_column = is_error_row and column_num == error_column if ( not is_error_column - and CONFIG[table][column]["allow_empty"] + and has_nulltype(config, table, column) and row_num % random.randrange(2, num_rows) == 1 ): # If the column allows empty values, assign an empty value "sometimes": cell = "" elif not is_error_column: - cell = get_constrained_cell_value(table, column, row_num, prev_inserts) + cell = get_constrained_cell_value(config, table, column, row_num, prev_inserts) else: - if CONFIG[table][column].get("structure") and CONFIG[table][column][ - "structure" - ]["type"] in ["unique", "primary"]: + structure = get_column_structure(config, table, column) + datatype = get_column_datatype(config, table, column) + if structure in ["unique", "primary"]: cell = "" - elif CONFIG[table][column]["datatype"] in [ + elif datatype in [ "prefix", - "IRI", + "iri", "word", "curie", ]: @@ -350,7 +223,7 @@ def main(): + "".join(random.choices(string.ascii_lowercase, k=TOKEN_LENGTH)) ) else: - if CONFIG[table][column]["datatype"] == "integer": + if datatype == "integer": cell = "".join(random.choices(string.ascii_lowercase, k=TOKEN_LENGTH)) else: # No leading 0s: diff --git a/test/guess_test_data/column.tsv b/test/guess_test_data/column.tsv new file mode 100644 index 00000000..2659b524 --- /dev/null +++ b/test/guess_test_data/column.tsv @@ -0,0 +1,33 @@ +table column label nulltype datatype structure description +table table table_name table_name primary name of this table +table path path path to the TSV file for this table, relative to the table.tsv file +table type empty table_type type of this table, used for tables with special meanings +table description empty description a description of this table +column table table_name from(table.table) the table that this column belongs to +column column column_name the name of this column +column label empty label the human-readable label for this column +column nulltype empty datatype_name from(datatype.datatype) the datatype for NULL values in this column +column datatype datatype_name from(datatype.datatype) the datatype for this column +column structure empty trimmed_line schema information for this column +column description empty description a description of this column +datatype datatype datatype_name primary the name of this datatype +datatype parent empty datatype_name tree(datatype) the parent datatype +datatype transform empty word +datatype condition empty datatype_condition +datatype 
structure empty trimmed_line +datatype description empty trimmed_text +datatype SQLite type empty datatype_name +datatype PostgreSQL type empty datatype_name +datatype RDF type empty datatype_name +datatype HTML type empty datatype_name +rule table table_name +rule when column column_name +rule when condition datatype_condition +rule then column column_name +rule then condition datatype_condition +rule level word +rule description empty description +table1 prefix prefix primary +table1 base IRI unique +table1 ontology IRI empty IRI +table1 version IRI empty IRI diff --git a/test/guess_test_data/column_expected.tsv b/test/guess_test_data/column_expected.tsv new file mode 100644 index 00000000..f7e6a20e --- /dev/null +++ b/test/guess_test_data/column_expected.tsv @@ -0,0 +1,53 @@ +table column label nulltype datatype structure description +table table table_name table_name primary name of this table +table path path path to the TSV file for this table, relative to the table.tsv file +table type empty table_type type of this table, used for tables with special meanings +table description empty description a description of this table +column table table_name from(table.table) the table that this column belongs to +column column column_name the name of this column +column label empty label the human-readable label for this column +column nulltype empty datatype_name from(datatype.datatype) the datatype for NULL values in this column +column datatype datatype_name from(datatype.datatype) the datatype for this column +column structure empty trimmed_line schema information for this column +column description empty description a description of this column +datatype datatype datatype_name primary the name of this datatype +datatype parent empty datatype_name tree(datatype) the parent datatype +datatype transform empty word +datatype condition empty datatype_condition +datatype structure empty trimmed_line +datatype description empty trimmed_text +datatype SQLite type empty datatype_name +datatype PostgreSQL type empty datatype_name +datatype RDF type empty datatype_name +datatype HTML type empty datatype_name +rule table table_name +rule when column column_name +rule when condition datatype_condition +rule then column column_name +rule then condition datatype_condition +rule level word +rule description empty description +table1 prefix prefix primary +table1 base IRI unique +table1 ontology IRI empty IRI +table1 version IRI empty IRI +table2 child trimmed_line from(table4.other_foreign_column) +table2 parent empty trimmed_line tree(child) +table2 xyzzy empty trimmed_line under(table2.child, d) +table2 foo empty integer from(table4.numeric_foreign_column) +table2 bar empty text +table3 source prefix from(table1.prefix) +table3 id CURIE unique +table3 label label primary +table3 parent empty label tree(label) +table3 related empty trimmed_line +table4 foreign_column text unique +table4 other_foreign_column text unique +table4 numeric_foreign_column integer primary +table5 foo word primary +table5 bar integer +table6 child integer from(table4.numeric_foreign_column) +table6 parent empty integer tree(child) +table6 xyzzy empty integer under(table6.child, 4) +table6 foo empty text +table6 bar empty integer diff --git a/test/guess_test_data/datatype.tsv b/test/guess_test_data/datatype.tsv new file mode 100644 index 00000000..c118588d --- /dev/null +++ b/test/guess_test_data/datatype.tsv @@ -0,0 +1,22 @@ +datatype parent transform condition structure description SQLite type PostgreSQL type RDF type HTML 
type +CURIE nonspace match(/\S+:\S+/) concat(prefix, ":", suffix) a Compact URI CURIE +IRI nonspace exclude(/\s/) an Internationalized Resource Identifier IRI +column_name trimmed_line match(/\S([^\n]*\S)*/) a column name +datatype_condition line exclude(/\n/) a datatype condition specification +datatype_name word exclude(/\W/) a datatype name +description trimmed_text match(/\S(.*\S)*/) a brief description +empty text equals('') the empty string NULL NULL null +integer nonspace match(/-?\d+/) a positive or negative integer INTEGER INTEGER +label trimmed_line match(/\S([^\n]*\S)*/) +line text exclude(/\n/) a line of text input +natural_number integer match(/\d+/) a natural number, including zero INTEGER INTEGER +nonspace trimmed_line exclude(/\s/) text without whitespace +path line exclude(/\n/) a path to a file +prefix word exclude(/\W/) a prefix for a CURIE +suffix word exclude(/\W/) a suffix for a CURIE +table_name word exclude(/\W/) a table name +table_type word lowercase in('table', 'column', 'datatype') a table type +text any text TEXT TEXT xsd:string textarea +trimmed_line line match(/\S([^\n]*\S)*/) a line of text that does not begin or end with whitespace +trimmed_text text exclude(/^\s+|\s+$/) text that does not begin or end with whitespace +word nonspace exclude(/\W/) a single word: letters, numbers, underscore diff --git a/test/guess_test_data/rule.tsv b/test/guess_test_data/rule.tsv new file mode 100644 index 00000000..a46b8d52 --- /dev/null +++ b/test/guess_test_data/rule.tsv @@ -0,0 +1,2 @@ +table when column when condition then column then condition level description +table1 ontology IRI null version IRI null error 'version IRI' must be null whenever 'ontology IRI' is null diff --git a/test/guess_test_data/table.tsv b/test/guess_test_data/table.tsv new file mode 100644 index 00000000..ac5800f1 --- /dev/null +++ b/test/guess_test_data/table.tsv @@ -0,0 +1,6 @@ +table path description type +column test/guess_test_data/column.tsv Columns for all of the tables. column +datatype test/guess_test_data/datatype.tsv Datatypes for all of the columns datatype +rule test/guess_test_data/rule.tsv More complex "when" rules rule +table test/guess_test_data/table.tsv All of the user-editable tables in this project. table +table1 test/guess_test_data/table1.tsv The first data table diff --git a/test/guess_test_data/table_expected.tsv b/test/guess_test_data/table_expected.tsv new file mode 100644 index 00000000..dfb683c4 --- /dev/null +++ b/test/guess_test_data/table_expected.tsv @@ -0,0 +1,11 @@ +table path description type +column test/guess_test_data/column_expected.tsv Columns for all of the tables. column +datatype test/guess_test_data/datatype.tsv Datatypes for all of the columns datatype +rule test/guess_test_data/rule.tsv More complex "when" rules rule +table test/guess_test_data/table_expected.tsv All of the user-editable tables in this project. 
table
+table1 test/guess_test_data/table1.tsv The first data table
+table2 test/guess_test_data/ontology/table2.tsv The second data table
+table3 test/guess_test_data/ontology/table3.tsv The third data table
+table4 test/guess_test_data/ontology/table4.tsv The fourth data table
+table5 test/guess_test_data/ontology/table5.tsv The fifth data table
+table6 test/guess_test_data/ontology/table6.tsv The sixth data table (like table2 but all numeric)
diff --git a/test/perf_test_data/column.tsv b/test/perf_test_data/column.tsv
index 80268a30..f7e6a20e 100644
--- a/test/perf_test_data/column.tsv
+++ b/test/perf_test_data/column.tsv
@@ -12,7 +12,14 @@ column structure empty trimmed_line schema information for this column
 column description empty description a description of this column
 datatype datatype datatype_name primary the name of this datatype
 datatype parent empty datatype_name tree(datatype) the parent datatype
+datatype transform empty word
 datatype condition empty datatype_condition
+datatype structure empty trimmed_line
+datatype description empty trimmed_text
+datatype SQLite type empty datatype_name
+datatype PostgreSQL type empty datatype_name
+datatype RDF type empty datatype_name
+datatype HTML type empty datatype_name
 rule table table_name
 rule when column column_name
 rule when condition datatype_condition
diff --git a/test/random_test_data/column.tsv b/test/random_test_data/column.tsv
index 80268a30..f7e6a20e 100644
--- a/test/random_test_data/column.tsv
+++ b/test/random_test_data/column.tsv
@@ -12,7 +12,14 @@ column structure empty trimmed_line schema information for this column
 column description empty description a description of this column
 datatype datatype datatype_name primary the name of this datatype
 datatype parent empty datatype_name tree(datatype) the parent datatype
+datatype transform empty word
 datatype condition empty datatype_condition
+datatype structure empty trimmed_line
+datatype description empty trimmed_text
+datatype SQLite type empty datatype_name
+datatype PostgreSQL type empty datatype_name
+datatype RDF type empty datatype_name
+datatype HTML type empty datatype_name
 rule table table_name
 rule when column column_name
 rule when condition datatype_condition
diff --git a/test/src/column.tsv b/test/src/column.tsv
index 07f38290..9c6c8256 100644
--- a/test/src/column.tsv
+++ b/test/src/column.tsv
@@ -12,7 +12,14 @@ column structure empty trimmed_line schema information for this column
 column description empty description a description of this column
 datatype datatype datatype_name primary the name of this datatype
 datatype parent empty datatype_name tree(datatype) the parent datatype
+datatype transform empty word
 datatype condition empty datatype_condition
+datatype structure empty trimmed_line
+datatype description empty trimmed_text
+datatype SQLite type empty trimmed_line
+datatype PostgreSQL type empty trimmed_line
+datatype RDF type empty trimmed_line
+datatype HTML type empty datatype_name
 rule table table_name
 rule when column column_name
 rule when condition datatype_condition
@@ -32,6 +39,7 @@ table2 bar empty text
 table3 source prefix from(table1.prefix)
 table3 id CURIE unique
 table3 label label primary
+table3 type empty CURIE
 table3 parent empty label tree(label)
 table3 related empty trimmed_line
 table4 foreign_column text unique
diff --git a/test/src/ontology/table3.tsv b/test/src/ontology/table3.tsv
index 710e1e16..e8d75e99 100644
--- a/test/src/ontology/table3.tsv
+++ b/test/src/ontology/table3.tsv
@@ -1,11 +1,11 @@
-source id label type parent
-MOB MOB:0000013 mobecular entity owl:Class material entity
-ZOB ZOB:0000013 bar owl:Class car
-JOB JOB:0000013 car owl:Class foo
-SOB SOB:0000013 foo owl:Class bar
-YOB YOB:0000013 mar owl:Class jafar
-COB BFO:0000040 material entity owl:Class owl:Thing
-CO B COB:0000013 molecular dentity owl:Class material entity
-COB COB:0000013 molecular entity owl:Class material entity
-COB VO:0000001 vaccine owl:Class material entity
-BOB VO:0000001 vaccine owl:Class material entity
+source id label type parent related
+MOB MOB:0000013 mobecular entity owl:Class material entity
+ZOB ZOB:0000013 bar owl:Class car
+JOB JOB:0000013 car owl:Class foo
+SOB SOB:0000013 foo owl:Class bar
+YOB YOB:0000013 mar owl:Class jafar
+COB BFO:0000040 material entity owl:Class owl:Thing
+CO B COB:0000013 molecular dentity owl:Class material entity
+COB COB:0000013 molecular entity owl:Class material entity
+COB VO:0000001 vaccine owl:Class material entity
+BOB VO:0000001 vaccine owl:Class material entity
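A note on the new confirm_overwrite.sh: `$1` is unquoted in the `[ -d $1 -a ! -z "$(ls -A $1)" ]` test, so directory names containing spaces will break it, and POSIX marks the `-a` connective as obsolescent. Pressing Enter at the prompt also leaves `$enter` empty, turning `[ $enter = 'y' ]` into a malformed test (the script still exits nonzero, but with a spurious error message). For comparison, a quoting-safe Python sketch of the same confirm-before-overwrite logic; this helper is hypothetical and not part of the patch:

```python
#!/usr/bin/env python3
"""Hypothetical equivalent of confirm_overwrite.sh: if the given directory
exists and is non-empty, ask before proceeding; exit 0 on 'y', 1 otherwise."""
import os
import sys


def confirm_overwrite(directory):
    # Only prompt when the directory exists and actually contains files:
    if os.path.isdir(directory) and os.listdir(directory):
        files = ", ".join(sorted(os.listdir(directory)))
        answer = input(
            f"{directory} already exists and contains the following files: "
            f"{files}\nAre you sure (y/n)? "
        )
        if answer.strip().casefold() != "y":
            print("Understood. Exiting with error code.")
            return 1
    return 0


if __name__ == "__main__":
    sys.exit(confirm_overwrite(sys.argv[1]))
```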
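get_random_sample in scripts/guess.py loads every row of the TSV into memory (`rows = [r for r in csv.DictReader(f, ...)]`) before picking the sample. Since the sampled row numbers are drawn up front, the file can instead be streamed, which matters for the 30,000-row guess tables. A sketch under the same newline-counting assumption as the original:

```python
import csv
import random


def stream_sample_rows(table, sample_size):
    """Stream the TSV once, keeping only rows whose 0-based index was drawn
    in advance, instead of loading the whole file into memory."""
    with open(table, "rb") as f:
        total_rows = sum(1 for _ in f) - 1  # subtract 1 for the header row
    wanted = set(random.sample(range(total_rows), min(sample_size, total_rows)))
    with open(table) as f:
        reader = csv.DictReader(f, delimiter="\t", quoting=csv.QUOTE_NONE)
        return [row for i, row in enumerate(reader) if i in wanted]
```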
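The sampling step also normalizes column labels before checking for collisions, which guess.py treats as fatal. The transform is easy to state on its own, and note that distinct labels can collide after normalization, as the second assertion shows:

```python
import re

# Same pattern as get_random_sample: runs of characters outside
# [0-9a-zA-Z_] are collapsed to a single underscore.
PATTERN = re.compile(r"[^0-9a-zA-Z_]+")


def normalize(label):
    """Label normalization used when sampling: substitute, casefold, then
    strip leading/trailing underscores."""
    return PATTERN.sub("_", label).casefold().strip("_")


assert normalize("ontology IRI") == "ontology_iri"
assert normalize("Ontology-IRI") == "ontology_iri"  # collides with the above
```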
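get_valve_config in scripts/guess.py and the new config-loading code in test/generate_random_test_data.py shell out to `./valve --dump_config` with identical error handling. If a third consumer appears, the snippet is worth factoring into a shared helper; a sketch, where the module placement and the `valve_bin` parameter are assumptions:

```python
import json
import subprocess
import sys


def dump_valve_config(table_tsv, valve_bin="./valve"):
    """Run `valve --dump_config` on the given VALVE table table and return
    the parsed JSON configuration; on failure, exit with valve's own code."""
    result = subprocess.run([valve_bin, "--dump_config", table_tsv], capture_output=True)
    if result.returncode != 0:
        error = result.stderr.decode()
        output = result.stdout.decode()
        if output:
            error = f"{error}\n{output}"
        print(error, file=sys.stderr)
        sys.exit(result.returncode)
    return json.loads(result.stdout.decode())
```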
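get_sql_type climbs the datatype tree until it finds an ancestor with an "SQLite type". With the datatype table in this patch, a column typed `word` resolves to TEXT via word, nonspace, trimmed_line, line, text. A self-contained version of the climb:

```python
def get_sql_type(config, datatype):
    """Climb the datatype tree until an 'SQLite type' is found, as in
    scripts/guess.py; returns None if the datatype is unknown or untyped."""
    dt = config["datatype"].get(datatype)
    if dt is None:
        return None
    return dt.get("SQLite type") or get_sql_type(config, dt.get("parent"))


# Toy slice of datatype.tsv: only 'text' declares an SQLite type, so the
# lookup for 'word' climbs word -> nonspace -> trimmed_line -> line -> text.
config = {"datatype": {
    "text": {"SQLite type": "TEXT"},
    "line": {"parent": "text"},
    "trimmed_line": {"parent": "line"},
    "nonspace": {"parent": "trimmed_line"},
    "word": {"parent": "nonspace"},
}}
assert get_sql_type(config, "word") == "TEXT"
```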
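get_compiled_condition caches each datatype condition as a plain Python predicate: match(...) compiles to re.fullmatch, search(...) to re.search, exclude(...) to a negated search, equals(...) to string equality, and in(...) to membership. Hand-built equivalents of three conditions from datatype.tsv illustrate the semantics (illustration only; the real predicates are built from the parsed condition tree):

```python
import re

integer = re.compile(r"-?\d+")
whitespace = re.compile(r"\s")

match_integer = lambda x: bool(integer.fullmatch(x))            # match(/-?\d+/)
exclude_space = lambda x: not bool(whitespace.search(x))        # exclude(/\s/)
in_table_type = lambda x: x in ("table", "column", "datatype")  # in('table', ...)

assert match_integer("-42") and not match_integer("4.2")
assert exclude_space("no_spaces") and not exclude_space("has space")
assert in_table_type("datatype")
```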
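In get_from, sampled values are spliced into SQL via `value = f"'{value}'"`, which misquotes any value containing a single quote. Identifiers cannot be bound as parameters, but values can; a sketch of the membership probe using sqlite3 placeholders (the table and column names still come from the trusted VALVE config):

```python
def count_foreign_matches(conn, table, column, values):
    """For each sampled value, probe whether it exists in table.column,
    binding the value as a parameter rather than interpolating it."""
    sql = f'SELECT 1 FROM "{table}" WHERE "{column}" = ? LIMIT 1'
    return sum(1 for v in values if conn.execute(sql, (v,)).fetchone() is not None)
```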
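scripts/guess_grammar.py pairs the Lark grammar with reverse_parse, so a condition can be round-tripped between its text form and the dict representation. A usage sketch, assuming it is run from scripts/ so that guess_grammar is importable; note that parser.parse returns a list because the start rule accepts multiple expressions:

```python
from lark import Lark

from guess_grammar import grammar, TreeToDict, reverse_parse

parser = Lark(grammar, parser="lalr", transformer=TreeToDict())
config = {"datatype": {}}

# 'match(/-?\d+/)' is the condition on the 'integer' datatype above:
parsed = parser.parse("match(/-?\\d+/)")
assert len(parsed) == 1
assert parsed[0]["type"] == "function" and parsed[0]["name"] == "match"
assert reverse_parse(config, parsed[0]) == "match(/-?\\d+/)"
```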
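The configure_db change in src/lib.rs turns column drift into a hard error: every column in a data file's header row must appear in the column config, and every configured column must be present in the file. The same invariant, mirrored in Python for test tooling; a sketch, not part of the patch:

```python
import csv


def check_columns(path, defined_columns):
    """Enforce the configure_db invariant: the TSV header row and the column
    config must contain exactly the same column names."""
    with open(path) as f:
        actual = next(csv.reader(f, delimiter="\t"))
    for name in actual:
        if name not in defined_columns:
            raise ValueError(f"Column '{name}' in {path} not in column config")
    for name in defined_columns:
        if name not in actual:
            raise ValueError(f"Defined column '{name}' not found in {path}")
    return actual  # preserves the file's ordering, like column_order
```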
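Finally, in test/generate_random_test_data.py, foreign-key cells are now drawn from a sliding window (WINDOW_SIZE = 50, replacing the hard-coded 100) over the values most recently inserted into the referenced column. The core selection, isolated below; the real code also handles nulltype columns and prefers to skip the very last inserted value so that tree and under references stay plausible:

```python
import random

WINDOW_SIZE = 50


def choose_from_window(prev_inserts, from_table, from_column):
    """Pick a value from the last WINDOW_SIZE values inserted into the
    referenced column, preferring not to reuse the most recent one."""
    history = prev_inserts[from_table][from_column][-WINDOW_SIZE:]
    prev_inserts[from_table][from_column] = history  # keep memory bounded
    candidates = history[:-1] if len(history) > 1 else history
    return candidates[random.randrange(len(candidates))]
```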