From 21dc32653285263198b6aef81f8e90a7fada0ee9 Mon Sep 17 00:00:00 2001
From: Michael Cuffaro
Date: Mon, 6 Nov 2023 13:12:16 -0500
Subject: [PATCH 01/48] require that actual and defined columns always match
 when the former exist

---
 src/lib.rs                       | 122 +++++++++++++++----------------
 test/expected/table3.tsv         |  24 +++---
 test/random_test_data/column.tsv |   7 ++
 test/src/column.tsv              |   8 ++
 test/src/ontology/table3.tsv     |  22 +++---
 5 files changed, 97 insertions(+), 86 deletions(-)

diff --git a/src/lib.rs b/src/lib.rs
index 16de0d94..296fd7cf 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -969,31 +969,32 @@ pub async fn configure_db(
     // use that information to create the associated database tables, while saving constraint
     // information to constraints_config.
     let mut setup_statements = HashMap::new();
-    let table_names: Vec<String> = tables_config.keys().cloned().collect();
-    for table_name in table_names {
+    for table_name in tables_config.keys().cloned().collect::<Vec<String>>() {
         let optional_path = tables_config
             .get(&table_name)
             .and_then(|r| r.get("path"))
             .and_then(|p| p.as_str());

-        let path;
+        let mut path = None;
         match optional_path {
-            // If an entry of the tables_config has no path then it is an internal table which need
-            // not be configured explicitly. Currently the only example is the message table.
-            None => continue,
+            None => {
+                // If an entry of the tables_config has no path then it is an internal table which
+                // need not be configured explicitly. Currently the only examples are the message
+                // and history tables.
+                if table_name != "message" && table_name != "history" {
+                    panic!("No path defined for table {}", table_name);
+                }
+                continue;
+            }
             Some(p) if !Path::new(p).is_file() => {
                 eprintln!("WARN: File does not exist {}", p);
-                continue;
             }
             Some(p) if Path::new(p).canonicalize().is_err() => {
                 eprintln!("WARN: File path could not be made canonical {}", p);
-                continue;
             }
-
-            Some(p) => path = p.to_string(),
+            Some(p) => path = Some(p.to_string()),
         };

-        // Get the columns that have been previously configured:
         let defined_columns: Vec<String> = tables_config
             .get(&table_name)
             .and_then(|r| r.get("column"))
@@ -1003,65 +1004,58 @@ pub async fn configure_db(
             .and_then(|k| Some(k.collect()))
             .unwrap();

-        // Get the actual columns from the data itself. Note that we set has_headers to false
-        // (even though the files have header rows) in order to explicitly read the header row.
-        let mut rdr = csv::ReaderBuilder::new()
-            .has_headers(false)
-            .delimiter(b'\t')
-            .from_reader(File::open(path.clone()).unwrap_or_else(|err| {
-                panic!("Unable to open '{}': {}", path.clone(), err);
-            }));
-        let mut iter = rdr.records();
-        let actual_columns;
-        if let Some(result) = iter.next() {
-            actual_columns = result.unwrap();
-        } else {
-            panic!("'{}' is empty", path);
-        }
-
         // We use column_order to explicitly indicate the order in which the columns should appear
-        // in the table, for later reference.
+        // in the table, for later reference. The default is to preserve the order from the actual
+        // table file. If that does not exist, we use the ordering in defined_columns.
         let mut column_order = vec![];
-        let mut all_columns: SerdeMap = SerdeMap::new();
-        for column_name in &actual_columns {
-            let column;
-            if !defined_columns.contains(&column_name.to_string()) {
-                let mut cmap = SerdeMap::new();
-                cmap.insert(
-                    String::from("table"),
-                    SerdeValue::String(table_name.to_string()),
-                );
-                cmap.insert(
-                    String::from("column"),
-                    SerdeValue::String(column_name.to_string()),
-                );
-                cmap.insert(
-                    String::from("nulltype"),
-                    SerdeValue::String(String::from("empty")),
-                );
-                cmap.insert(
-                    String::from("datatype"),
-                    SerdeValue::String(String::from("text")),
-                );
-                column = SerdeValue::Object(cmap);
-            } else {
-                column = tables_config
-                    .get(&table_name)
-                    .and_then(|r| r.get("column"))
-                    .and_then(|v| v.as_object())
-                    .and_then(|o| o.get(column_name))
+        if let Some(path) = path {
+            // Get the actual columns from the data itself. Note that we set has_headers to
+            // false (even though the files have header rows) in order to explicitly read the
+            // header row.
+            let mut rdr = csv::ReaderBuilder::new()
+                .has_headers(false)
+                .delimiter(b'\t')
+                .from_reader(File::open(path.clone()).unwrap_or_else(|err| {
+                    panic!("Unable to open '{}': {}", path.clone(), err);
+                }));
+            let mut iter = rdr.records();
+            if let Some(result) = iter.next() {
+                let actual_columns = result
                     .unwrap()
-                    .clone();
+                    .iter()
+                    .map(|c| c.to_string())
+                    .collect::<Vec<_>>();
+                // Make sure that the actual columns found in the table file, and the columns
+                // defined in the column config, exactly match in terms of their content:
+                for column_name in &actual_columns {
+                    column_order.push(json!(column_name));
+                    if !defined_columns.contains(&column_name.to_string()) {
+                        panic!(
+                            "Column '{}.{}' not in column config",
+                            table_name, column_name
+                        );
+                    }
+                }
+                for column_name in &defined_columns {
+                    if !actual_columns.contains(&column_name.to_string()) {
+                        panic!(
+                            "Defined column '{}.{}' not found in table",
+                            table_name, column_name
+                        );
+                    }
+                }
+            } else {
+                panic!("'{}' is empty", path);
             }
-            column_order.push(SerdeValue::String(column_name.to_string()));
-            all_columns.insert(column_name.to_string(), column);
         }
+        if column_order.is_empty() {
+            column_order = defined_columns.iter().map(|c| json!(c)).collect::<Vec<_>>();
+        }
         tables_config
             .get_mut(&table_name)
             .and_then(|t| t.as_object_mut())
             .and_then(|o| {
-                o.insert(String::from("column"), SerdeValue::Object(all_columns));
                 o.insert(
                     String::from("column_order"),
                     SerdeValue::Array(column_order),
@@ -1097,9 +1091,11 @@
     }

     // Sort the tables according to their foreign key dependencies so that tables are always loaded
-    // after the tables they depend on:
-    let unsorted_tables: Vec<String> = setup_statements.keys().cloned().collect();
-    let sorted_tables = verify_table_deps_and_sort(&unsorted_tables, &constraints_config);
+    // after the tables they depend on.
Ignore the internal message and history tables: + let sorted_tables = verify_table_deps_and_sort( + &setup_statements.keys().cloned().collect(), + &constraints_config, + ); if *command != ValveCommand::Config || verbose { // Generate DDL for the history table: diff --git a/test/expected/table3.tsv b/test/expected/table3.tsv index 04c78efc..c0f31eda 100644 --- a/test/expected/table3.tsv +++ b/test/expected/table3.tsv @@ -1,12 +1,12 @@ -source id label type parent -MOB MOB:0000013 mobecular entity owl:Class material entity -ZOB ZOB:0000013 bar owl:Class car -JOB JOB:0000013 car owl:Class foo -SOB SOB:0000013 foo owl:Class bar -YOB YOB:0000013 mar owl:Class jafar -COB BFO:0000040 material entity owl:Class owl:Thing -CO B COB:0000013 molecular dentity owl:Class material entity -COB COB:0000013 molecular entity owl:Class material entity -COB VO:0000001 vaccine owl:Class material entity -BOB VO:0000001 vaccine owl:Class material entity -BFOBBER BFO:0000027 bazaar owl:Class barrie +source id label type parent related +MOB MOB:0000013 mobecular entity owl:Class material entity +ZOB ZOB:0000013 bar owl:Class car +JOB JOB:0000013 car owl:Class foo +SOB SOB:0000013 foo owl:Class bar +YOB YOB:0000013 mar owl:Class jafar +COB BFO:0000040 material entity owl:Class owl:Thing +CO B COB:0000013 molecular dentity owl:Class material entity +COB COB:0000013 molecular entity owl:Class material entity +COB VO:0000001 vaccine owl:Class material entity +BOB VO:0000001 vaccine owl:Class material entity +BFOBBER BFO:0000027 bazaar owl:Class barrie diff --git a/test/random_test_data/column.tsv b/test/random_test_data/column.tsv index 80268a30..f7e6a20e 100644 --- a/test/random_test_data/column.tsv +++ b/test/random_test_data/column.tsv @@ -12,7 +12,14 @@ column structure empty trimmed_line schema information for this column column description empty description a description of this column datatype datatype datatype_name primary the name of this datatype datatype parent empty datatype_name tree(datatype) the parent datatype +datatype transform empty word datatype condition empty datatype_condition +datatype structure empty trimmed_line +datatype description empty trimmed_text +datatype SQLite type empty datatype_name +datatype PostgreSQL type empty datatype_name +datatype RDF type empty datatype_name +datatype HTML type empty datatype_name rule table table_name rule when column column_name rule when condition datatype_condition diff --git a/test/src/column.tsv b/test/src/column.tsv index 07f38290..9c6c8256 100644 --- a/test/src/column.tsv +++ b/test/src/column.tsv @@ -12,7 +12,14 @@ column structure empty trimmed_line schema information for this column column description empty description a description of this column datatype datatype datatype_name primary the name of this datatype datatype parent empty datatype_name tree(datatype) the parent datatype +datatype transform empty word datatype condition empty datatype_condition +datatype structure empty trimmed_line +datatype description empty trimmed_text +datatype SQLite type empty trimmed_line +datatype PostgreSQL type empty trimmed_line +datatype RDF type empty trimmed_line +datatype HTML type empty datatype_name rule table table_name rule when column column_name rule when condition datatype_condition @@ -32,6 +39,7 @@ table2 bar empty text table3 source prefix from(table1.prefix) table3 id CURIE unique table3 label label primary +table3 type empty CURIE table3 parent empty label tree(label) table3 related empty trimmed_line table4 foreign_column text unique diff 
--git a/test/src/ontology/table3.tsv b/test/src/ontology/table3.tsv index 710e1e16..e8d75e99 100644 --- a/test/src/ontology/table3.tsv +++ b/test/src/ontology/table3.tsv @@ -1,11 +1,11 @@ -source id label type parent -MOB MOB:0000013 mobecular entity owl:Class material entity -ZOB ZOB:0000013 bar owl:Class car -JOB JOB:0000013 car owl:Class foo -SOB SOB:0000013 foo owl:Class bar -YOB YOB:0000013 mar owl:Class jafar -COB BFO:0000040 material entity owl:Class owl:Thing -CO B COB:0000013 molecular dentity owl:Class material entity -COB COB:0000013 molecular entity owl:Class material entity -COB VO:0000001 vaccine owl:Class material entity -BOB VO:0000001 vaccine owl:Class material entity +source id label type parent related +MOB MOB:0000013 mobecular entity owl:Class material entity +ZOB ZOB:0000013 bar owl:Class car +JOB JOB:0000013 car owl:Class foo +SOB SOB:0000013 foo owl:Class bar +YOB YOB:0000013 mar owl:Class jafar +COB BFO:0000040 material entity owl:Class owl:Thing +CO B COB:0000013 molecular dentity owl:Class material entity +COB COB:0000013 molecular entity owl:Class material entity +COB VO:0000001 vaccine owl:Class material entity +BOB VO:0000001 vaccine owl:Class material entity From c133071377693f88cb36304eb3dc7662025d6875 Mon Sep 17 00:00:00 2001 From: Michael Cuffaro Date: Tue, 7 Nov 2023 13:59:10 -0500 Subject: [PATCH 02/48] read config from file when generating random data --- Makefile | 8 +- test/generate_random_test_data.py | 325 +++++++++--------------------- test/perf_test_data/column.tsv | 7 + 3 files changed, 110 insertions(+), 230 deletions(-) diff --git a/Makefile b/Makefile index 024cd745..d13430b5 100644 --- a/Makefile +++ b/Makefile @@ -95,8 +95,8 @@ random_test: sqlite_random_test pg_random_test $(random_test_dir)/ontology: mkdir -p $(random_test_dir)/ontology -random_test_data: test/generate_random_test_data.py | $(random_test_dir)/ontology - ./$< $$(date +"%s") 100 5 $| +random_test_data: test/generate_random_test_data.py valve valve test/random_test_data/table.tsv | $(random_test_dir)/ontology + ./$< $$(date +"%s") 100 5 $(word 3,$^) $| sqlite_random_test: valve clean random_test_data | build test/output @echo "Testing with random data on sqlite ..." @@ -110,9 +110,9 @@ pg_random_test: valve clean random_test_data | build test/output test/round_trip.sh postgresql:///valve_postgres $(random_test_dir)/table.tsv @echo "Test succeeded!" 
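
The random_test_data recipe above now passes five positional arguments to the generator: a seed, the number of rows per table, the percentage of error rows, the VALVE table config, and the output directory. As a minimal Python sketch of the equivalent invocation (illustration only, not part of this patch; the paths are the ones the Makefile uses):

    import subprocess
    import time

    subprocess.run(
        [
            "test/generate_random_test_data.py",
            str(int(time.time())),  # seed: the current epoch, as in $$(date +"%s")
            "100",  # number of rows to generate per table
            "5",  # percentage of rows that should contain errors
            "test/random_test_data/table.tsv",  # the VALVE table config
            "test/random_test_data/ontology",  # the output directory
        ],
        check=True,
    )
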
-test/perf_test_data/ontology: test/generate_random_test_data.py +test/perf_test_data/ontology: test/generate_random_test_data.py valve test/random_test_data/table.tsv mkdir $@ - ./$< 1 10000 5 $@ + ./$< 1 10000 5 $(word 3,$^) $@ build/valve_perf.db: valve | test/perf_test_data/ontology build @if [ -f $@ ]; \ diff --git a/test/generate_random_test_data.py b/test/generate_random_test_data.py index 3f8d988d..e851727d 100755 --- a/test/generate_random_test_data.py +++ b/test/generate_random_test_data.py @@ -1,209 +1,65 @@ #!/usr/bin/env python3 +import json import math import random import string +import subprocess +import sys from argparse import ArgumentParser TOKEN_LENGTH = 9 +WINDOW_SIZE = 100 -CONFIG = { - "table1": { - "prefix": { - "allow_empty": False, - "datatype": "prefix", - "structure": { - "type": "primary", - }, - }, - "base": { - "allow_empty": False, - "datatype": "IRI", - "structure": { - "type": "unique", - }, - }, - "ontology IRI": { - "allow_empty": True, - "datatype": "IRI", - }, - "version IRI": { - "allow_empty": True, - "datatype": "IRI", - }, - }, - "table2": { - "child": { - "allow_empty": False, - "datatype": "trimmed_line", - "structure": { - "type": "foreign", - "ftable": "table4", - "fcolumn": "other_foreign_column", - }, - }, - "parent": { - "allow_empty": True, - "datatype": "trimmed_line", - "structure": { - "type": "tree", - "tcolumn": "child", - }, - }, - "xyzzy": { - "allow_empty": True, - "datatype": "trimmed_line", - "structure": { - "type": "under", - "ttable": "table2", - "tcolumn": "child", - "uval": "d", - }, - }, - "foo": { - "allow_empty": True, - "datatype": "integer", - "structure": { - "type": "foreign", - "ftable": "table4", - "fcolumn": "numeric_foreign_column", - }, - }, - "bar": { - "allow_empty": True, - "datatype": "text", - }, - }, - "table3": { - "source": { - "allow_empty": False, - "datatype": "prefix", - "structure": { - "type": "foreign", - "ftable": "table1", - "fcolumn": "prefix", - }, - }, - "id": { - "allow_empty": False, - "datatype": "curie", - "structure": { - "type": "unique", - }, - }, - "label": { - "allow_empty": False, - "datatype": "label", - "structure": { - "type": "primary", - }, - }, - "parent": { - "allow_empty": True, - "datatype": "label", - "structure": { - "type": "tree", - "tcolumn": "label", - }, - }, - "related": { - "allow_empty": True, - "datatype": "trimmed_line", - }, - }, - "table4": { - "foreign_column": { - "allow_empty": False, - "datatype": "text", - "structure": { - "type": "unique", - }, - }, - "other_foreign_column": { - "allow_empty": False, - "datatype": "text", - "structure": { - "type": "unique", - }, - }, - "numeric_foreign_column": { - "allow_empty": False, - "datatype": "integer", - "structure": { - "type": "primary", - }, - }, - }, - "table5": { - "foo": { - "allow_empty": False, - "datatype": "word", - "structure": { - "type": "primary", - }, - }, - "bar": { - "allow_empty": False, - "datatype": "integer", - }, - }, - "table6": { - "child": { - "allow_empty": False, - "datatype": "integer", - "structure": { - "type": "foreign", - "ftable": "table4", - "fcolumn": "numeric_foreign_column", - }, - }, - "parent": { - "allow_empty": True, - "datatype": "integer", - "structure": { - "type": "tree", - "tcolumn": "child", - }, - }, - "xyzzy": { - "allow_empty": True, - "datatype": "integer", - "structure": { - "type": "under", - "ttable": "table6", - "tcolumn": "child", - "uval": "4", - }, - }, - "foo": { - "allow_empty": True, - "datatype": "text", - }, - "bar": { - "allow_empty": True, - 
"datatype": "integer", - }, - }, -} - - -def get_value_from_prev_insert(prev_inserts, from_table, from_column, to_table, to_column): - global CONFIG +def get_special_tables(config): + return [k for k, v in config["special"].items() if v is not None] + + +def get_table_columns(config, table): + return [column for column in config["table"][table]["column_order"]] + + +def has_nulltype(config, table, column): + return bool(config["table"][table]["column"][column].get("nulltype")) + + +def get_column_structure(config, table, column): + return config["table"][table]["column"][column].get("structure") + + +def get_column_datatype(config, table, column): + return config["table"][table]["column"][column]["datatype"] + + +def get_foreign_key(config, table, column): + return [f for f in config["constraints"]["foreign"][table] if f["column"] == column][0] + + +def get_tree(config, table, column): + return [f for f in config["constraints"]["tree"][table] if f["parent"] == column][0] + + +def get_under(config, table, column): + return [f for f in config["constraints"]["under"][table] if f["column"] == column][0] + + +def get_value_from_prev_insert(config, prev_inserts, from_table, from_column, to_table, to_column): + global WINDOW_SIZE # Note: because we are loading the tables and columns in the correct order (i.e. such that # all dependencies are loaded before the tables and columns they depend on), the list of # previous inserts for the from_table/from_column will never be empty. if len(prev_inserts[from_table][from_column]) == 1: - if CONFIG[to_table][to_column]["allow_empty"]: + if has_nulltype(config, to_table, to_column): return "" else: return prev_inserts[from_table][from_column][0] else: - # Select at random from the last 100 inserted values: - prev_inserts[from_table][from_column] = prev_inserts[from_table][from_column][-100:] + # Select at random from the last N inserted values, with N given by WINDOW_SIZE: + prev_inserts[from_table][from_column] = prev_inserts[from_table][from_column][-WINDOW_SIZE:] from_values = prev_inserts[from_table][from_column] # We'd ideally like to exclude the last inserted value from consideration, but we save it # here in case we cannot: @@ -219,58 +75,58 @@ def get_value_from_prev_insert(prev_inserts, from_table, from_column, to_table, return values_to_choose_from[random.randrange(len(values_to_choose_from))] -def get_constrained_cell_value(table, column, row_num, prev_inserts): +def get_constrained_cell_value(config, table, column, row_num, prev_inserts): global TOKEN_LENGTH - global CONFIG - - structure = CONFIG[table][column].get("structure") - if structure and structure["type"] == "foreign": - ftable = structure["ftable"] - fcolumn = structure["fcolumn"] - cell = get_value_from_prev_insert(prev_inserts, ftable, fcolumn, table, column) - elif structure and structure["type"] == "tree": - tcolumn = structure["tcolumn"] - cell = get_value_from_prev_insert(prev_inserts, table, tcolumn, table, column) - elif structure and structure["type"] == "under": + + structure = get_column_structure(config, table, column) + datatype = get_column_datatype(config, table, column).casefold() + if structure.startswith("from("): + fkey = get_foreign_key(config, table, column) + ftable = fkey["ftable"] + fcolumn = fkey["fcolumn"] + cell = get_value_from_prev_insert(config, prev_inserts, ftable, fcolumn, table, column) + elif structure.startswith("tree("): + tkey = get_tree(config, table, column) + tcolumn = tkey["child"] + cell = get_value_from_prev_insert(config, prev_inserts, 
table, tcolumn, table, column) + elif structure.startswith("under("): # Note that properly satisfying the under constraint requires, not only that # the cell is in the specified tree column, but also (a) that the tree # actually exists, and (b) that the value is "under" the under value. To do # this properly, though, would require a decent amount of memory. So perhaps # it's not worth it to check for (a) and (b) and allow any offending cells # to generate errors which we can then verify are handled properly by valve. - ttable = structure["ttable"] - tcolumn = structure["tcolumn"] - cell = get_value_from_prev_insert(prev_inserts, ttable, tcolumn, table, column) - elif CONFIG[table][column]["datatype"] in [ + ukey = get_under(config, table, column) + ttable = ukey["ttable"] + tcolumn = ukey["tcolumn"] + cell = get_value_from_prev_insert(config, prev_inserts, ttable, tcolumn, table, column) + elif datatype in [ "prefix", - "IRI", + "iri", "trimmed_line", "label", "word", ]: cell = "".join(random.choices(string.ascii_lowercase, k=TOKEN_LENGTH)) - elif CONFIG[table][column]["datatype"] == "curie": + elif datatype == "curie": cell = ( "".join(random.choices(string.ascii_lowercase, k=3)).upper() + ":" + "".join(random.choices(string.ascii_lowercase, k=TOKEN_LENGTH)) ) - elif CONFIG[table][column]["datatype"] == "text": + elif datatype == "text": cell = ( "".join(random.choices(string.ascii_lowercase, k=TOKEN_LENGTH)) + " " + "".join(random.choices(string.ascii_lowercase, k=TOKEN_LENGTH)) ) - elif CONFIG[table][column]["datatype"] == "integer": + elif datatype == "integer": # No leading 0s: cell = "".join(random.choices("123456789", k=1)) + "".join( random.choices(string.digits, k=TOKEN_LENGTH - 1) ) else: - print( - f"Warning: Unknown datatype: {CONFIG[table][column]['datatype']}. " - "Generating a random string." - ) + print(f"Warning: Unknown datatype: {datatype}. Generating a random string.") cell = "".join(random.choices(string.ascii_lowercase, k=TOKEN_LENGTH)) return cell @@ -278,20 +134,20 @@ def get_constrained_cell_value(table, column, row_num, prev_inserts): def main(): global TOKEN_LENGTH - global CONFIG parser = ArgumentParser( description=""" Deterministically generate a specified amount of data, a specified percentage of which are - errors, using a hard-coded VALVE configuration, given the specified seed, to a specified - output directory. 
- """ + errors, using the given VALVE table configuration and seed, to the output directory.""" ) parser.add_argument("seed", help="The seed to use to generate the random data") parser.add_argument("num_rows", help="The number of rows per table to generate") parser.add_argument( "pct_errors", help="The percentage of rows in each table that should have errors" ) + parser.add_argument( + "input_table", help="The .TSV file representing the VALVE table configuration" + ) parser.add_argument( "output_dir", help="The output directory to write the new table configuration to" ) @@ -299,48 +155,65 @@ def main(): seed = int(args.seed) num_rows = int(args.num_rows) pct_errors = int(args.pct_errors) + input_table = args.input_table outdir = args.output_dir + # Use the seed argument to seed the random data that will be generated: random.seed(seed) + # Get the VALVE configuration: + result = subprocess.run(["valve", "--dump_config", input_table], capture_output=True) + if result.returncode != 0: + error = result.stderr.decode() + output = result.stdout.decode() + if output: + error = f"{error}\n{output}" + print(f"{error}", file=sys.stderr) + sys.exit(result.returncode) + config = json.loads(result.stdout.decode()) + # This is a record of the last inserted values for each table and column. When one column # takes its values from another column, then we look here and fetch the last inserted value of # the second column. prev_inserts = {} + + # The data tables to generate: + data_tables = [t for t in config["sorted_table_list"] if t not in get_special_tables(config)] + + # The TSV files corresponding to each data table: tsv_files = {} - tables_in_order = ["table4", "table1", "table2", "table3", "table5", "table6"] - for table in tables_in_order: + for table in data_tables: tsv_files[table] = open(f"{outdir}/{table}.tsv", "w") - columns = [column for column in CONFIG[table]] + columns = get_table_columns(config, table) print("\t".join(columns), file=tsv_files[table]) num_error_rows = math.ceil((pct_errors / 100) * num_rows) error_proportion = None if not num_error_rows else math.floor(num_rows / num_error_rows) for row_num in range(1, num_rows + 1): - for table in tables_in_order: + for table in data_tables: is_error_row = error_proportion and row_num % error_proportion == 1 - columns = [column for column in CONFIG[table]] + columns = get_table_columns(config, table) error_column = random.randrange(len(columns)) row = {} for column_num, column in enumerate(columns): is_error_column = is_error_row and column_num == error_column if ( not is_error_column - and CONFIG[table][column]["allow_empty"] + and has_nulltype(config, table, column) and row_num % random.randrange(2, num_rows) == 1 ): # If the column allows empty values, assign an empty value "sometimes": cell = "" elif not is_error_column: - cell = get_constrained_cell_value(table, column, row_num, prev_inserts) + cell = get_constrained_cell_value(config, table, column, row_num, prev_inserts) else: - if CONFIG[table][column].get("structure") and CONFIG[table][column][ - "structure" - ]["type"] in ["unique", "primary"]: + structure = get_column_structure(config, table, column) + datatype = get_column_datatype(config, table, column) + if structure in ["unique", "primary"]: cell = "" - elif CONFIG[table][column]["datatype"] in [ + elif datatype in [ "prefix", - "IRI", + "iri", "word", "curie", ]: @@ -350,7 +223,7 @@ def main(): + "".join(random.choices(string.ascii_lowercase, k=TOKEN_LENGTH)) ) else: - if CONFIG[table][column]["datatype"] == "integer": + if 
datatype == "integer": cell = "".join(random.choices(string.ascii_lowercase, k=TOKEN_LENGTH)) else: # No leading 0s: diff --git a/test/perf_test_data/column.tsv b/test/perf_test_data/column.tsv index 80268a30..f7e6a20e 100644 --- a/test/perf_test_data/column.tsv +++ b/test/perf_test_data/column.tsv @@ -12,7 +12,14 @@ column structure empty trimmed_line schema information for this column column description empty description a description of this column datatype datatype datatype_name primary the name of this datatype datatype parent empty datatype_name tree(datatype) the parent datatype +datatype transform empty word datatype condition empty datatype_condition +datatype structure empty trimmed_line +datatype description empty trimmed_text +datatype SQLite type empty datatype_name +datatype PostgreSQL type empty datatype_name +datatype RDF type empty datatype_name +datatype HTML type empty datatype_name rule table table_name rule when column column_name rule when condition datatype_condition From 33ce7af45b3b6041083676c187aea98887fce0c1 Mon Sep 17 00:00:00 2001 From: Michael Cuffaro Date: Tue, 7 Nov 2023 14:15:12 -0500 Subject: [PATCH 03/48] fix path to valve in random data generation script --- test/generate_random_test_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/generate_random_test_data.py b/test/generate_random_test_data.py index e851727d..63e3f32e 100755 --- a/test/generate_random_test_data.py +++ b/test/generate_random_test_data.py @@ -162,7 +162,7 @@ def main(): random.seed(seed) # Get the VALVE configuration: - result = subprocess.run(["valve", "--dump_config", input_table], capture_output=True) + result = subprocess.run(["./valve", "--dump_config", input_table], capture_output=True) if result.returncode != 0: error = result.stderr.decode() output = result.stdout.decode() From 729a894967e6e0cc1999617ad18c7278f9263a8c Mon Sep 17 00:00:00 2001 From: Michael Cuffaro Date: Tue, 7 Nov 2023 14:44:34 -0500 Subject: [PATCH 04/48] add guess test files --- Makefile | 27 ++++++++++++---- test/guess_test_data/column.tsv | 53 +++++++++++++++++++++++++++++++ test/guess_test_data/datatype.tsv | 22 +++++++++++++ test/guess_test_data/rule.tsv | 9 ++++++ test/guess_test_data/table.tsv | 11 +++++++ 5 files changed, 115 insertions(+), 7 deletions(-) create mode 100644 test/guess_test_data/column.tsv create mode 100644 test/guess_test_data/datatype.tsv create mode 100644 test/guess_test_data/rule.tsv create mode 100644 test/guess_test_data/table.tsv diff --git a/Makefile b/Makefile index d13430b5..d2bd4a2b 100644 --- a/Makefile +++ b/Makefile @@ -14,7 +14,7 @@ build: .PHONY: doc time test sqlite_test pg_test .PHONY: api_test sqlite_api_test pg_qpi_test -.PHONY: random_test_data random_test sqlite_random_test pg_random_test +.PHONY: random_test_data random_test sqlite_random_test pg_random_test valve_debug valve_release doc: cargo doc --document-private-items @@ -22,12 +22,17 @@ doc: readme: cargo readme --no-title > README.md -valve: src/*.rs src/*.lalrpop +valve: valve_debug + +valve_release: src/*.rs src/*.lalrpop rm -f valve cargo build --release ln -s target/release/ontodev_valve valve - # cargo build - # ln -s target/debug/ontodev_valve valve + +valve_debug: src/*.rs src/*.lalrpop + rm -f valve + cargo build + ln -s target/debug/ontodev_valve valve build/valve.db: test/src/table.tsv valve clean | build ./valve $< $@ @@ -93,7 +98,7 @@ random_test_dir = test/random_test_data random_test: sqlite_random_test pg_random_test $(random_test_dir)/ontology: - mkdir -p 
$(random_test_dir)/ontology + mkdir -p $@ random_test_data: test/generate_random_test_data.py valve valve test/random_test_data/table.tsv | $(random_test_dir)/ontology ./$< $$(date +"%s") 100 5 $(word 3,$^) $| @@ -110,7 +115,15 @@ pg_random_test: valve clean random_test_data | build test/output test/round_trip.sh postgresql:///valve_postgres $(random_test_dir)/table.tsv @echo "Test succeeded!" -test/perf_test_data/ontology: test/generate_random_test_data.py valve test/random_test_data/table.tsv +guess_test_dir = test/guess_test_data + +$(guess_test_dir)/ontology: + mkdir -p $@ + +guess_test_data: test/generate_random_test_data.py valve valve $(guess_test_dir)/table.tsv | $(guess_test_dir)/ontology + ./$< $$(date +"%s") 10000 5 $(word 3,$^) $| + +test/perf_test_data/ontology: test/generate_random_test_data.py valve test/perf_test_data/table.tsv mkdir $@ ./$< 1 10000 5 $(word 3,$^) $@ @@ -136,7 +149,7 @@ pg_perf_test: valve test/perf_test_data/ontology | test/output perf_test: sqlite_perf_test pg_perf_test clean: - rm -Rf build/valve.db build/valve_random.db test/output $(random_test_dir)/ontology + rm -Rf build/valve.db build/valve_random.db test/output $(random_test_dir)/ontology $(guess_test_dir)/ontology cleanperfdb: rm -Rf build/valve_perf.db diff --git a/test/guess_test_data/column.tsv b/test/guess_test_data/column.tsv new file mode 100644 index 00000000..552b1036 --- /dev/null +++ b/test/guess_test_data/column.tsv @@ -0,0 +1,53 @@ +table column label nulltype datatype structure description +table table table_name table_name primary name of this table +table path path path to the TSV file for this table, relative to the table.tsv file +table type empty table_type type of this table, used for tables with special meanings +table description empty description a description of this table +column table table_name from(table.table) the table that this column belongs to +column column column_name the name of this column +column label empty label the human-readable label for this column +column nulltype empty datatype_name from(datatype.datatype) the datatype for NULL values in this column +column datatype datatype_name from(datatype.datatype) the datatype for this column +column structure empty trimmed_line schema information for this column +column description empty description a description of this column +datatype datatype datatype_name primary the name of this datatype +datatype parent empty datatype_name tree(datatype) the parent datatype +datatype transform empty word +datatype condition empty datatype_condition +datatype structure empty trimmed_line +datatype description empty trimmed_text +datatype SQLite type empty datatype_name +datatype PostgreSQL type empty datatype_name +datatype RDF type empty datatype_name +datatype HTML type empty datatype_name +rule table table_name +rule when column column_name +rule when condition datatype_condition +rule then column column_name +rule then condition datatype_condition +rule level word +rule description empty description +table1 prefix prefix primary +table1 base IRI unique +table1 ontology IRI empty IRI +table1 version IRI empty IRI +table2 child trimmed_line +table2 parent empty trimmed_line +table2 xyzzy empty trimmed_line +table2 foo empty integer +table2 bar empty text +table3 source prefix +table3 id CURIE unique +table3 label label primary +table3 parent empty label +table3 related empty trimmed_line +table4 foreign_column text unique +table4 other_foreign_column text unique +table4 numeric_foreign_column integer primary +table5 foo 
word primary +table5 bar integer +table6 child integer +table6 parent empty integer +table6 xyzzy empty integer +table6 foo empty text +table6 bar empty integer diff --git a/test/guess_test_data/datatype.tsv b/test/guess_test_data/datatype.tsv new file mode 100644 index 00000000..c118588d --- /dev/null +++ b/test/guess_test_data/datatype.tsv @@ -0,0 +1,22 @@ +datatype parent transform condition structure description SQLite type PostgreSQL type RDF type HTML type +CURIE nonspace match(/\S+:\S+/) concat(prefix, ":", suffix) a Compact URI CURIE +IRI nonspace exclude(/\s/) an Internationalized Resource Identifier IRI +column_name trimmed_line match(/\S([^\n]*\S)*/) a column name +datatype_condition line exclude(/\n/) a datatype condition specification +datatype_name word exclude(/\W/) a datatype name +description trimmed_text match(/\S(.*\S)*/) a brief description +empty text equals('') the empty string NULL NULL null +integer nonspace match(/-?\d+/) a positive or negative integer INTEGER INTEGER +label trimmed_line match(/\S([^\n]*\S)*/) +line text exclude(/\n/) a line of text input +natural_number integer match(/\d+/) a natural number, including zero INTEGER INTEGER +nonspace trimmed_line exclude(/\s/) text without whitespace +path line exclude(/\n/) a path to a file +prefix word exclude(/\W/) a prefix for a CURIE +suffix word exclude(/\W/) a suffix for a CURIE +table_name word exclude(/\W/) a table name +table_type word lowercase in('table', 'column', 'datatype') a table type +text any text TEXT TEXT xsd:string textarea +trimmed_line line match(/\S([^\n]*\S)*/) a line of text that does not begin or end with whitespace +trimmed_text text exclude(/^\s+|\s+$/) text that does not begin or end with whitespace +word nonspace exclude(/\W/) a single word: letters, numbers, underscore diff --git a/test/guess_test_data/rule.tsv b/test/guess_test_data/rule.tsv new file mode 100644 index 00000000..3a9356ff --- /dev/null +++ b/test/guess_test_data/rule.tsv @@ -0,0 +1,9 @@ +table when column when condition then column then condition level description +table2 foo null bar null error bar must be null whenever foo is null +table2 foo not null bar not null error bar cannot be null if foo is not null +table2 foo IRI bar label error bar must be a label if foo is an IRI +table2 foo equals(5) bar in('y', 'z') error bar must be 'y' or 'z' if foo = 5 +table6 foo null bar null error bar must be null whenever foo is null +table6 foo not null bar not null error bar cannot be null if foo is not null +table6 foo IRI bar label error bar must be a label if foo is an IRI +table6 foo equals(e) bar in(25, 26) error bar must be 25 or 26 if foo = 'e' diff --git a/test/guess_test_data/table.tsv b/test/guess_test_data/table.tsv new file mode 100644 index 00000000..1fcc8584 --- /dev/null +++ b/test/guess_test_data/table.tsv @@ -0,0 +1,11 @@ +table path description type +column test/guess_test_data/column.tsv Columns for all of the tables. column +datatype test/guess_test_data/datatype.tsv Datatypes for all of the columns datatype +rule test/guess_test_data/rule.tsv More complex "when" rules rule +table test/guess_test_data/table.tsv All of the user-editable tables in this project. 
table +table1 test/guess_test_data/ontology/table1.tsv The first data table +table2 test/guess_test_data/ontology/table2.tsv The second data table +table3 test/guess_test_data/ontology/table3.tsv The third data table +table4 test/guess_test_data/ontology/table4.tsv The fourth data table +table5 test/guess_test_data/ontology/table5.tsv The fifth data table +table6 test/guess_test_data/ontology/table6.tsv The sixth data table (like table2 but all numeric) From cdb657fa685bcdfc8a4406104582ab5683535a95 Mon Sep 17 00:00:00 2001 From: Michael Cuffaro Date: Thu, 9 Nov 2023 10:23:49 -0500 Subject: [PATCH 05/48] setup Makefile for guess tests --- Makefile | 70 ++++++++++++++++-------- confirm_overwrite.sh | 14 +++++ test/generate_random_test_data.py | 2 +- test/guess_test_data/column.tsv | 20 ------- test/guess_test_data/column_expected.tsv | 53 ++++++++++++++++++ test/guess_test_data/rule.tsv | 9 +-- test/guess_test_data/table.tsv | 7 +-- test/guess_test_data/table_expected.tsv | 11 ++++ 8 files changed, 128 insertions(+), 58 deletions(-) create mode 100755 confirm_overwrite.sh create mode 100644 test/guess_test_data/column_expected.tsv create mode 100644 test/guess_test_data/table_expected.tsv diff --git a/Makefile b/Makefile index d2bd4a2b..01e4f65f 100644 --- a/Makefile +++ b/Makefile @@ -22,7 +22,8 @@ doc: readme: cargo readme --no-title > README.md -valve: valve_debug +valve: src/*.rs src/*.lalrpop + @$(MAKE) valve_debug valve_release: src/*.rs src/*.lalrpop rm -f valve @@ -34,7 +35,7 @@ valve_debug: src/*.rs src/*.lalrpop cargo build ln -s target/debug/ontodev_valve valve -build/valve.db: test/src/table.tsv valve clean | build +build/valve.db: test/src/table.tsv clean valve | build ./valve $< $@ test/output: @@ -116,47 +117,70 @@ pg_random_test: valve clean random_test_data | build test/output @echo "Test succeeded!" 
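
The guess- and perf-data targets below refuse to silently clobber previously generated data: they first run confirm_overwrite.sh, which prompts before anything in the target directory is overwritten. A rough Python rendering of that guard's logic (illustration only; the repository uses the POSIX shell script added further down):

    import os
    import sys

    def confirm_overwrite(directory):
        # Only prompt when the directory exists and is non-empty:
        if os.path.isdir(directory) and os.listdir(directory):
            files = ", ".join(os.listdir(directory))
            answer = input(
                f"{directory} already exists and contains the following files: {files}\n"
                "Are you sure (y/n)? "
            )
            if answer != "y":
                print("Understood. Exiting with error code.")
                sys.exit(1)
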
guess_test_dir = test/guess_test_data +guess_test_db = build/valve_guess.db +.PHONY: guess_test_data + +$(guess_test_dir)/table1.tsv: test/generate_random_test_data.py valve $(guess_test_dir)/*.tsv + ./$< $$(date +"%s") 50000 5 $(guess_test_dir)/table.tsv $(guess_test_dir) $(guess_test_dir)/ontology: mkdir -p $@ -guess_test_data: test/generate_random_test_data.py valve valve $(guess_test_dir)/table.tsv | $(guess_test_dir)/ontology - ./$< $$(date +"%s") 10000 5 $(word 3,$^) $| +guess_test_data: test/generate_random_test_data.py $(guess_test_dir)/table1.tsv valve confirm_overwrite.sh $(guess_test_dir)/*.tsv | $(guess_test_dir)/ontology + ./confirm_overwrite.sh $(guess_test_dir)/ontology + rm -f $(guess_test_dir)/table1.tsv + ./$< $$(date +"%s") 50000 5 $(guess_test_dir)/table.tsv $(guess_test_dir) + rm -f $(guess_test_dir)/ontology/*.tsv + ./$< $$(date +"%s") 50000 5 $(guess_test_dir)/table_expected.tsv $| + rm -f $(guess_test_dir)/ontology/table1.tsv + +$(guess_test_db): valve guess_test_data $(guess_test_dir)/*.tsv | build $(guess_test_dir)/ontology + rm -f $@ + ./$< $(guess_test_dir)/table.tsv $@ + +perf_test_dir = test/perf_test_data +perf_test_db = build/valve_perf.db +.PHONY: perf_test_data + +$(perf_test_dir)/ontology: + mkdir -p $@ -test/perf_test_data/ontology: test/generate_random_test_data.py valve test/perf_test_data/table.tsv - mkdir $@ - ./$< 1 10000 5 $(word 3,$^) $@ +perf_test_data: test/generate_random_test_data.py valve confirm_overwrite.sh $(perf_test_dir)/*.tsv | $(perf_test_dir)/ontology + ./confirm_overwrite.sh $(perf_test_dir)/ontology + rm -f $(perf_test_dir)/ontology/*.tsv + ./$< $$(date +"%s") 10000 5 $(perf_test_dir)/table.tsv $| -build/valve_perf.db: valve | test/perf_test_data/ontology build - @if [ -f $@ ]; \ - then \ - echo "'$@' exists but is out of date. 
To rebuild '$@', run \`make cleanperfdb\`" \ - "before running \`make $@\`" ; \ - false; \ - fi - time -p ./$< --verbose test/perf_test_data/table.tsv $@ +$(perf_test_db): valve perf_test_data $(perf_test_dir)/*.tsv | build $(perf_test_dir)/ontology + rm -f $@ + time -p ./$< --verbose $(perf_test_dir)/table.tsv $@ .PHONY: sqlite_perf_test sqlite_perf_test: build/valve_perf.db | test/output time -p scripts/export.py messages $< $| $(tables_to_test) .PHONY: pg_perf_test -pg_perf_test: valve test/perf_test_data/ontology | test/output - time -p ./$< --verbose test/perf_test_data/table.tsv postgresql:///valve_postgres +pg_perf_test: valve $(perf_test_dir)/ontology | test/output + time -p ./$< --verbose $(perf_test_dir)/table.tsv postgresql:///valve_postgres time -p scripts/export.py messages postgresql:///valve_postgres $| $(tables_to_test) .PHONY: perf_test perf_test: sqlite_perf_test pg_perf_test clean: - rm -Rf build/valve.db build/valve_random.db test/output $(random_test_dir)/ontology $(guess_test_dir)/ontology + rm -Rf build/valve.db* build/valve_random.db* test/output $(random_test_dir)/ontology valve + +clean_guess_db: + rm -Rf build/valve_guess.db -cleanperfdb: +clean_guess_data: + rm -Rf $(guess_test_dir)/table1.tsv $(guess_test_dir)/ontology + +clean_perf_db: rm -Rf build/valve_perf.db -cleanperfdata: - rm -Rf test/perf_test_data/ontology +clean_perf_data: + rm -Rf $(perf_test_dir)/ontology -cleanall: clean cleanperfdb cleanperfdata +cleanall: clean clean_perf_db clean_perf_data clean_guess_db clean_guess_data cargo clean - rm -Rf valve + rm -f valve diff --git a/confirm_overwrite.sh b/confirm_overwrite.sh new file mode 100755 index 00000000..aa58cd50 --- /dev/null +++ b/confirm_overwrite.sh @@ -0,0 +1,14 @@ +#!/usr/bin/env sh + +if [ -d $1 -a ! -z "$(ls -A $1)" ] +then + printf "$1 already exists and contains the following files: $(ls -A -m -w 0 $1)\nAre you sure (y/n)? " + read enter + if [ $enter = 'y' ] + then + exit 0 + else + echo "Understood. Exiting with error code." 
+ exit 1 + fi +fi diff --git a/test/generate_random_test_data.py b/test/generate_random_test_data.py index 63e3f32e..87008651 100755 --- a/test/generate_random_test_data.py +++ b/test/generate_random_test_data.py @@ -11,7 +11,7 @@ TOKEN_LENGTH = 9 -WINDOW_SIZE = 100 +WINDOW_SIZE = 50 def get_special_tables(config): diff --git a/test/guess_test_data/column.tsv b/test/guess_test_data/column.tsv index 552b1036..2659b524 100644 --- a/test/guess_test_data/column.tsv +++ b/test/guess_test_data/column.tsv @@ -31,23 +31,3 @@ table1 prefix prefix primary table1 base IRI unique table1 ontology IRI empty IRI table1 version IRI empty IRI -table2 child trimmed_line -table2 parent empty trimmed_line -table2 xyzzy empty trimmed_line -table2 foo empty integer -table2 bar empty text -table3 source prefix -table3 id CURIE unique -table3 label label primary -table3 parent empty label -table3 related empty trimmed_line -table4 foreign_column text unique -table4 other_foreign_column text unique -table4 numeric_foreign_column integer primary -table5 foo word primary -table5 bar integer -table6 child integer -table6 parent empty integer -table6 xyzzy empty integer -table6 foo empty text -table6 bar empty integer diff --git a/test/guess_test_data/column_expected.tsv b/test/guess_test_data/column_expected.tsv new file mode 100644 index 00000000..f7e6a20e --- /dev/null +++ b/test/guess_test_data/column_expected.tsv @@ -0,0 +1,53 @@ +table column label nulltype datatype structure description +table table table_name table_name primary name of this table +table path path path to the TSV file for this table, relative to the table.tsv file +table type empty table_type type of this table, used for tables with special meanings +table description empty description a description of this table +column table table_name from(table.table) the table that this column belongs to +column column column_name the name of this column +column label empty label the human-readable label for this column +column nulltype empty datatype_name from(datatype.datatype) the datatype for NULL values in this column +column datatype datatype_name from(datatype.datatype) the datatype for this column +column structure empty trimmed_line schema information for this column +column description empty description a description of this column +datatype datatype datatype_name primary the name of this datatype +datatype parent empty datatype_name tree(datatype) the parent datatype +datatype transform empty word +datatype condition empty datatype_condition +datatype structure empty trimmed_line +datatype description empty trimmed_text +datatype SQLite type empty datatype_name +datatype PostgreSQL type empty datatype_name +datatype RDF type empty datatype_name +datatype HTML type empty datatype_name +rule table table_name +rule when column column_name +rule when condition datatype_condition +rule then column column_name +rule then condition datatype_condition +rule level word +rule description empty description +table1 prefix prefix primary +table1 base IRI unique +table1 ontology IRI empty IRI +table1 version IRI empty IRI +table2 child trimmed_line from(table4.other_foreign_column) +table2 parent empty trimmed_line tree(child) +table2 xyzzy empty trimmed_line under(table2.child, d) +table2 foo empty integer from(table4.numeric_foreign_column) +table2 bar empty text +table3 source prefix from(table1.prefix) +table3 id CURIE unique +table3 label label primary +table3 parent empty label tree(label) +table3 related empty trimmed_line +table4 foreign_column text 
unique +table4 other_foreign_column text unique +table4 numeric_foreign_column integer primary +table5 foo word primary +table5 bar integer +table6 child integer from(table4.numeric_foreign_column) +table6 parent empty integer tree(child) +table6 xyzzy empty integer under(table6.child, 4) +table6 foo empty text +table6 bar empty integer diff --git a/test/guess_test_data/rule.tsv b/test/guess_test_data/rule.tsv index 3a9356ff..a46b8d52 100644 --- a/test/guess_test_data/rule.tsv +++ b/test/guess_test_data/rule.tsv @@ -1,9 +1,2 @@ table when column when condition then column then condition level description -table2 foo null bar null error bar must be null whenever foo is null -table2 foo not null bar not null error bar cannot be null if foo is not null -table2 foo IRI bar label error bar must be a label if foo is an IRI -table2 foo equals(5) bar in('y', 'z') error bar must be 'y' or 'z' if foo = 5 -table6 foo null bar null error bar must be null whenever foo is null -table6 foo not null bar not null error bar cannot be null if foo is not null -table6 foo IRI bar label error bar must be a label if foo is an IRI -table6 foo equals(e) bar in(25, 26) error bar must be 25 or 26 if foo = 'e' +table1 ontology IRI null version IRI null error 'version IRI' must be null whenever 'ontology IRI' is null diff --git a/test/guess_test_data/table.tsv b/test/guess_test_data/table.tsv index 1fcc8584..ac5800f1 100644 --- a/test/guess_test_data/table.tsv +++ b/test/guess_test_data/table.tsv @@ -3,9 +3,4 @@ column test/guess_test_data/column.tsv Columns for all of the tables. column datatype test/guess_test_data/datatype.tsv Datatypes for all of the columns datatype rule test/guess_test_data/rule.tsv More complex "when" rules rule table test/guess_test_data/table.tsv All of the user-editable tables in this project. table -table1 test/guess_test_data/ontology/table1.tsv The first data table -table2 test/guess_test_data/ontology/table2.tsv The second data table -table3 test/guess_test_data/ontology/table3.tsv The third data table -table4 test/guess_test_data/ontology/table4.tsv The fourth data table -table5 test/guess_test_data/ontology/table5.tsv The fifth data table -table6 test/guess_test_data/ontology/table6.tsv The sixth data table (like table2 but all numeric) +table1 test/guess_test_data/table1.tsv The first data table diff --git a/test/guess_test_data/table_expected.tsv b/test/guess_test_data/table_expected.tsv new file mode 100644 index 00000000..dfb683c4 --- /dev/null +++ b/test/guess_test_data/table_expected.tsv @@ -0,0 +1,11 @@ +table path description type +column test/guess_test_data/column_expected.tsv Columns for all of the tables. column +datatype test/guess_test_data/datatype.tsv Datatypes for all of the columns datatype +rule test/guess_test_data/rule.tsv More complex "when" rules rule +table test/guess_test_data/table_expected.tsv All of the user-editable tables in this project. 
table +table1 test/guess_test_data/table1.tsv The first data table +table2 test/guess_test_data/ontology/table2.tsv The second data table +table3 test/guess_test_data/ontology/table3.tsv The third data table +table4 test/guess_test_data/ontology/table4.tsv The fourth data table +table5 test/guess_test_data/ontology/table5.tsv The fifth data table +table6 test/guess_test_data/ontology/table6.tsv The sixth data table (like table2 but all numeric) From 701cc84f59073ec97b614d9cdf16a28ff3e588d7 Mon Sep 17 00:00:00 2001 From: Michael Cuffaro Date: Thu, 9 Nov 2023 15:36:26 -0500 Subject: [PATCH 06/48] tweaks to Makefile --- Makefile | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/Makefile b/Makefile index 01e4f65f..8e53c26a 100644 --- a/Makefile +++ b/Makefile @@ -12,9 +12,10 @@ MAKEFLAGS += --warn-undefined-variables build: mkdir build -.PHONY: doc time test sqlite_test pg_test -.PHONY: api_test sqlite_api_test pg_qpi_test -.PHONY: random_test_data random_test sqlite_random_test pg_random_test valve_debug valve_release +.PHONY: doc readme valve_debug valve_release test sqlite_test pg_test api_test sqlite_api_test \ + pg_qpi_test random_test_data random_test sqlite_random_test pg_random_test guess_test_data \ + perf_test_data sqlite_perf_test pg_perf_test perf_test + doc: cargo doc --document-private-items @@ -25,12 +26,12 @@ readme: valve: src/*.rs src/*.lalrpop @$(MAKE) valve_debug -valve_release: src/*.rs src/*.lalrpop +valve_release: rm -f valve cargo build --release ln -s target/release/ontodev_valve valve -valve_debug: src/*.rs src/*.lalrpop +valve_debug: rm -f valve cargo build ln -s target/debug/ontodev_valve valve @@ -43,7 +44,8 @@ test/output: test: sqlite_test pg_test api_test random_test -tables_to_test = column datatype rule table table1 table2 table3 table4 table5 table6 table7 table8 table9 table10 table11 +tables_to_test = column datatype rule table table1 table2 table3 table4 table5 table6 table7 table8 \ + table9 table10 table11 sqlite_test: build/valve.db test/src/table.tsv | test/output @echo "Testing valve on sqlite ..." 
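
The guess prototype introduced in the next commit samples its input in two passes: it first counts the data rows, then streams the file and keeps only the rows whose numbers were drawn up front. Simplified from get_random_sample() in scripts/guess.py below, the idea is roughly:

    import csv
    import random

    def sample_rows(path, sample_size):
        # Pass 1: count the data rows (subtract 1 for the header row):
        with open(path, "rb") as f:
            total_rows = sum(1 for _ in f) - 1
        if total_rows <= sample_size:
            chosen = set(range(1, total_rows + 1))
        else:
            chosen = set(random.sample(range(1, total_rows + 1), sample_size))
        # Pass 2: stream the file, keeping only the pre-selected rows:
        with open(path) as f:
            reader = csv.DictReader(f, delimiter="\t", quoting=csv.QUOTE_NONE)
            return [row for i, row in enumerate(reader, start=1) if i in chosen]
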
@@ -118,7 +120,6 @@ pg_random_test: valve clean random_test_data | build test/output guess_test_dir = test/guess_test_data guess_test_db = build/valve_guess.db -.PHONY: guess_test_data $(guess_test_dir)/table1.tsv: test/generate_random_test_data.py valve $(guess_test_dir)/*.tsv ./$< $$(date +"%s") 50000 5 $(guess_test_dir)/table.tsv $(guess_test_dir) @@ -140,7 +141,6 @@ $(guess_test_db): valve guess_test_data $(guess_test_dir)/*.tsv | build $(guess_ perf_test_dir = test/perf_test_data perf_test_db = build/valve_perf.db -.PHONY: perf_test_data $(perf_test_dir)/ontology: mkdir -p $@ @@ -154,16 +154,13 @@ $(perf_test_db): valve perf_test_data $(perf_test_dir)/*.tsv | build $(perf_test rm -f $@ time -p ./$< --verbose $(perf_test_dir)/table.tsv $@ -.PHONY: sqlite_perf_test sqlite_perf_test: build/valve_perf.db | test/output time -p scripts/export.py messages $< $| $(tables_to_test) -.PHONY: pg_perf_test pg_perf_test: valve $(perf_test_dir)/ontology | test/output time -p ./$< --verbose $(perf_test_dir)/table.tsv postgresql:///valve_postgres time -p scripts/export.py messages postgresql:///valve_postgres $| $(tables_to_test) -.PHONY: perf_test perf_test: sqlite_perf_test pg_perf_test clean: From 474c7a597d6bf4f3a9d1b0d0c29dbe17ad19689d Mon Sep 17 00:00:00 2001 From: Michael Cuffaro Date: Thu, 9 Nov 2023 15:37:25 -0500 Subject: [PATCH 07/48] initial version of guess prototype --- scripts/guess.py | 83 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 83 insertions(+) create mode 100755 scripts/guess.py diff --git a/scripts/guess.py b/scripts/guess.py new file mode 100755 index 00000000..4c93e570 --- /dev/null +++ b/scripts/guess.py @@ -0,0 +1,83 @@ +#!/usr/bin/env python3 + +import csv +import random +import re +import sys +import time + +from argparse import ArgumentParser + + +def has_ncolumn(sample, ncolumn): + return bool([label for label in sample if sample[label]["normalized"] == ncolumn]) + + +def get_random_sample(table, sample_size): + # Get the number of rows in the file (we substract 1 for the header row): + with open(table, "rb") as f: + total_rows = sum(1 for _ in f) - 1 + + if total_rows < sample_size: + sample_size = total_rows + + if sample_size == total_rows: + sample_row_numbers = range(1, total_rows + 1) + else: + sample_row_numbers = sorted(random.sample(range(1, total_rows + 1), sample_size)) + with open(table) as f: + rows = csv.DictReader(f, delimiter="\t", quoting=csv.QUOTE_NONE) + sample = {} + for i, row in enumerate(rows, start=1): + if i in sample_row_numbers: + for label, value in row.items(): + if label not in sample: + ncolumn = re.sub(r"[^0-9a-zA-Z_]+", "", label).casefold() + if has_ncolumn(sample, ncolumn): + print( + "The data has more than one column with the normalized name " + f"{ncolumn}" + ) + sys.exit(1) + sample[label] = { + "normalized": ncolumn, + "values": [], + } + sample[label]["values"].append(value) + return sample + + +if __name__ == "__main__": + parser = ArgumentParser(description="VALVE guesser (prototype)") + parser.add_argument( + "--sample_size", + type=int, + default=10000, + help="Sample size to use when guessing (default: 10,000)", + ) + parser.add_argument( + "--error_rate", type=float, default=0.1, help="Proportion of errors expected (default: 10%)" + ) + parser.add_argument( + "--enum_size", + type=int, + default=10, + help="The maximum number of values to use for in(...) 
datatype conditions", + ) + parser.add_argument( + "--seed", type=int, help="Seed to use for random sampling (default: current epoch time)" + ) + parser.add_argument( + "TABLE", help="The name of the .TSV file containing the data for which we will be guessing" + ) + args = parser.parse_args() + + # Use the seed argument, or the epoch time if no seed is given, to set up the random generator: + if args.seed is not None: + seed = args.seed + else: + seed = time.time_ns() + random.seed(seed) + + sample = get_random_sample(args.TABLE, args.sample_size) + print(sample) From d401835f3eb403d52acda614d1b621610d6e2947 Mon Sep 17 00:00:00 2001 From: Michael Cuffaro Date: Thu, 9 Nov 2023 16:50:58 -0500 Subject: [PATCH 08/48] guess nulltype --- scripts/guess.py | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/scripts/guess.py b/scripts/guess.py index 4c93e570..01a2fe85 100755 --- a/scripts/guess.py +++ b/scripts/guess.py @@ -47,6 +47,17 @@ def get_random_sample(table, sample_size): return sample +def annotate(label, sample, error_rate): + def has_nulltype(target): + num_values = len(target["values"]) + num_empties = target["values"].count("") + return num_empties / num_values > error_rate + + target = sample[label] + if has_nulltype(target): + target["nulltype"] = "empty" + + if __name__ == "__main__": parser = ArgumentParser(description="VALVE guesser (prototype)") parser.add_argument( @@ -80,4 +91,12 @@ def get_random_sample(table, sample_size): random.seed(seed) sample = get_random_sample(args.TABLE, args.sample_size) - print(sample) + for label in sample: + annotate(label, sample, args.error_rate) + + # For debugging + for label in sample: + print(f"{label}: ", end="") + for annotation in sample[label]: + print(f"{annotation} ", end="") + print() From 5a6611844567f001899bbb75439ce2dc28fa049d Mon Sep 17 00:00:00 2001 From: Michael Cuffaro Date: Thu, 9 Nov 2023 17:35:30 -0500 Subject: [PATCH 09/48] guess primary/unique --- scripts/guess.py | 35 +++++++++++++++++++++++++++-------- 1 file changed, 27 insertions(+), 8 deletions(-) diff --git a/scripts/guess.py b/scripts/guess.py index 01a2fe85..57b4143b 100755 --- a/scripts/guess.py +++ b/scripts/guess.py @@ -47,15 +47,30 @@ def get_random_sample(table, sample_size): return sample -def annotate(label, sample, error_rate): +def annotate(label, sample, error_rate, is_primary_candidate): def has_nulltype(target): num_values = len(target["values"]) num_empties = target["values"].count("") return num_empties / num_values > error_rate + def has_duplicates(target, ignore_empties): + if ignore_empties: + values = [v for v in target["values"] if v != ""] + else: + values = target["values"] + distinct_values = set(values) + return (len(values) - len(distinct_values)) > (error_rate * len(values)) + target = sample[label] if has_nulltype(target): target["nulltype"] = "empty" + # Since the target has no nulltype (because the previous branch of the if-statement did not + # apply), all empties are assumed to be errors, so we pass True here: + elif not has_duplicates(target, True): + if is_primary_candidate: + target["structure"] = "primary" + else: + target["structure"] = "unique" if __name__ == "__main__": @@ -91,12 +106,16 @@ def has_nulltype(target): random.seed(seed) sample = get_random_sample(args.TABLE, args.sample_size) - for label in sample: - annotate(label, sample, args.error_rate) + for i, label in enumerate(sample): + annotate(label, sample, args.error_rate, i == 0) # For debugging - for label in sample: - 
print(f"{label}: ", end="") - for annotation in sample[label]: - print(f"{annotation} ", end="") - print() + # for label in sample: + # print(f"{label}: ", end="") + # for annotation in sample[label]: + # print(f"{annotation} ", end="") + # print() + + from pprint import pprint + + pprint(sample) From 64502b68f222ebcc43f1b81c5bb3ad5dcb75ff7b Mon Sep 17 00:00:00 2001 From: Michael Cuffaro Date: Thu, 9 Nov 2023 17:44:27 -0500 Subject: [PATCH 10/48] tweak --- scripts/guess.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/scripts/guess.py b/scripts/guess.py index 57b4143b..d4e9ee6c 100755 --- a/scripts/guess.py +++ b/scripts/guess.py @@ -18,10 +18,8 @@ def get_random_sample(table, sample_size): with open(table, "rb") as f: total_rows = sum(1 for _ in f) - 1 - if total_rows < sample_size: + if total_rows <= sample_size: sample_size = total_rows - - if sample_size == total_rows: sample_row_numbers = range(1, total_rows + 1) else: sample_row_numbers = sorted(random.sample(range(1, total_rows + 1), sample_size)) From f02740e49e720c823ef31e40130fc0d4e10529af Mon Sep 17 00:00:00 2001 From: Michael Cuffaro Date: Sat, 11 Nov 2023 10:51:20 -0500 Subject: [PATCH 11/48] annotate datatype (WIP) --- scripts/guess.py | 127 ++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 120 insertions(+), 7 deletions(-) diff --git a/scripts/guess.py b/scripts/guess.py index d4e9ee6c..fc7cd4f5 100755 --- a/scripts/guess.py +++ b/scripts/guess.py @@ -1,13 +1,18 @@ #!/usr/bin/env python3 import csv +import json import random import re +import subprocess import sys import time from argparse import ArgumentParser +# TODO: Remove this import later (used only for debugging): +from pprint import pprint, pformat + def has_ncolumn(sample, ncolumn): return bool([label for label in sample if sample[label]["normalized"] == ncolumn]) @@ -45,7 +50,19 @@ def get_random_sample(table, sample_size): return sample -def annotate(label, sample, error_rate, is_primary_candidate): +def get_valve_config(valve_table): + result = subprocess.run(["./valve", "--dump_config", valve_table], capture_output=True) + if result.returncode != 0: + error = result.stderr.decode() + output = result.stdout.decode() + if output: + error = f"{error}\n{output}" + print(f"{error}", file=sys.stderr) + sys.exit(result.returncode) + return json.loads(result.stdout.decode()) + + +def annotate(label, sample, dt_hierarchy, error_rate, is_primary_candidate): def has_nulltype(target): num_values = len(target["values"]) num_empties = target["values"].count("") @@ -59,6 +76,52 @@ def has_duplicates(target, ignore_empties): distinct_values = set(values) return (len(values) - len(distinct_values)) > (error_rate * len(values)) + def get_datatype(target): + # For each tree in the hierarchy: + # Look for a match with the 0th element and possibly add it to matching_datatypes. + # If there are matches in matching_datatypes: + # Use the tiebreaker rules to find the best match and annotate the target with it. + # Else: + # Try again with the next highest element of each tree (if one exists) + # + # Note that this is guaranteed to work since the get_datatype_hierarchy() function includes + # the 'text' datatype which matches anything. So if no matches are found raise an error. 
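
As an aside from the patch itself, here is a self-contained sketch of the search that this comment describes, with a placeholder matches() predicate standing in for the real condition check. One subtlety worth making explicit: the level index must advance whenever nothing at the current level matches, otherwise the walk would retry the same level forever instead of ascending toward the universal 'text' datatype:

    def search_hierarchy(dt_hierarchy, matches):
        # Index 0 of every list is a leaf datatype; higher indices are its
        # ancestors, ending in the catch-all 'text' datatype:
        curr_index = 0
        while True:
            to_check = [
                dts[curr_index] for dts in dt_hierarchy.values() if len(dts) > curr_index
            ]
            if not to_check:
                raise ValueError("no matching datatype found")
            matching = [dt for dt in to_check if matches(dt)]
            if len(matching) == 1:
                return matching[0]
            if matching:
                return matching  # leave the choice to the tiebreaker rules
            curr_index += 1  # nothing matched: move one level up every tree and retry
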
+ + def is_match(datatype): + # If the datatype has no associated condition then it matches anything: + if not datatype.get("condition"): + return True + # TODO: Replace this with actual code to check if there is a match: + return bool(random.getrandbits(1)) + + def tiebreak(datatypes): + # TODO: Replace this with actual code to implement the tiebreaker rules: + return random.choice(datatypes) + + curr_index = 0 + while True: + matching_datatypes = [] + datatypes_to_check = [] + for dt_name in dt_hierarchy: + if len(dt_hierarchy[dt_name]) > curr_index: + datatypes_to_check.append(dt_hierarchy[dt_name][curr_index]) + if len(datatypes_to_check) == 0: + print(f"Could not find a datatype match for column '{label}'") + sys.exit(1) + + for datatype in datatypes_to_check: + if is_match(datatype): + matching_datatypes.append(datatype) + + if len(matching_datatypes) == 0: + continue + elif len(matching_datatypes) == 1: + return matching_datatypes[0] + else: + return tiebreak(matching_datatypes) + + curr_index += 1 + target = sample[label] if has_nulltype(target): target["nulltype"] = "empty" @@ -70,6 +133,50 @@ def has_duplicates(target, ignore_empties): else: target["structure"] = "unique" + target["datatype"] = get_datatype(target)["datatype"] + + +def get_datatype_hierarchy(config): + """ + Given a VALVE configuration, return a datatype hierarchy that looks like this: + {'dt_name_1': [{'datatype': 'dt_name_1', + 'description': 'a description', + ...}, + {'datatype': 'parent datatype', + 'description': 'a description', + ...}, + {'datatype': 'grandparent datatype', + 'description': 'a description', + ...}, + ...], + 'dt_name_2': etc. + """ + + def get_hierarchy_for_dt(primary_dt_name): + def get_parents(dt_name): + datatypes = [] + if dt_name is not None: + datatype = config["datatype"][dt_name] + if datatype["datatype"] != primary_dt_name: + datatypes.append(datatype) + datatypes += get_parents(datatype.get("parent")) + return datatypes + + return [config["datatype"][primary_dt_name]] + get_parents(primary_dt_name) + + dt_config = config["datatype"] + dt_names = [dt_name for dt_name in dt_config] + leaf_dts = [] + for dt in dt_names: + children = [child for child in dt_names if dt_config[child].get("parent") == dt] + if not children: + leaf_dts.append(dt) + + dt_hierarchy = {} + for leaf_dt in leaf_dts: + dt_hierarchy[leaf_dt] = get_hierarchy_for_dt(leaf_dt) + return dt_hierarchy + if __name__ == "__main__": parser = ArgumentParser(description="VALVE guesser (prototype)") @@ -92,7 +199,10 @@ def has_duplicates(target, ignore_empties): "--seed", type=int, help="Seed to use for random sampling (default: current epoch time)" ) parser.add_argument( - "TABLE", help="The name of the .TSV file containing the data for which we will be guessing" + "VALVE_TABLE", help="The VALVE table table from which to read the VALVE configuration" + ) + parser.add_argument( + "TABLE", help="A .TSV file containing the data for which we will be guessing" ) args = parser.parse_args() @@ -103,17 +213,20 @@ def has_duplicates(target, ignore_empties): seed = time.time_ns() random.seed(seed) + # Get the valve configuration: + config = get_valve_config(args.VALVE_TABLE) + + # Use the valve config to retrieve the valve datatype hierarchy: + dt_hierarchy = get_datatype_hierarchy(config) + sample = get_random_sample(args.TABLE, args.sample_size) for i, label in enumerate(sample): - annotate(label, sample, args.error_rate, i == 0) + annotate(label, sample, dt_hierarchy, args.error_rate, i == 0) + pprint(sample) # For debugging # 
for label in sample: # print(f"{label}: ", end="") # for annotation in sample[label]: # print(f"{annotation} ", end="") # print() - - from pprint import pprint - - pprint(sample) From 5200b5f63796aa3b7be5ff4c305d4bf3026ec604 Mon Sep 17 00:00:00 2001 From: Michael Cuffaro Date: Mon, 13 Nov 2023 09:40:11 -0500 Subject: [PATCH 12/48] add stubs for functions to retrieve from() structures --- scripts/guess.py | 112 +++++++++++++++++++++++++++-------------------- 1 file changed, 64 insertions(+), 48 deletions(-) diff --git a/scripts/guess.py b/scripts/guess.py index fc7cd4f5..8811569e 100755 --- a/scripts/guess.py +++ b/scripts/guess.py @@ -62,7 +62,54 @@ def get_valve_config(valve_table): return json.loads(result.stdout.decode()) -def annotate(label, sample, dt_hierarchy, error_rate, is_primary_candidate): +def get_datatype_hierarchy(config): + """ + Given a VALVE configuration, return a datatype hierarchy that looks like this: + {'dt_name_1': [{'datatype': 'dt_name_1', + 'description': 'a description', + ...}, + {'datatype': 'parent datatype', + 'description': 'a description', + ...}, + {'datatype': 'grandparent datatype', + 'description': 'a description', + ...}, + ...], + 'dt_name_2': etc. + """ + + def get_hierarchy_for_dt(primary_dt_name): + def get_parents(dt_name): + datatypes = [] + if dt_name is not None: + datatype = config["datatype"][dt_name] + if datatype["datatype"] != primary_dt_name: + datatypes.append(datatype) + datatypes += get_parents(datatype.get("parent")) + return datatypes + + return [config["datatype"][primary_dt_name]] + get_parents(primary_dt_name) + + dt_config = config["datatype"] + dt_names = [dt_name for dt_name in dt_config] + leaf_dts = [] + for dt in dt_names: + children = [child for child in dt_names if dt_config[child].get("parent") == dt] + if not children: + leaf_dts.append(dt) + + dt_hierarchy = {} + for leaf_dt in leaf_dts: + dt_hierarchy[leaf_dt] = get_hierarchy_for_dt(leaf_dt) + return dt_hierarchy + + +def get_foreign_column_data(config): + # TODO. + pass + + +def annotate(label, sample, config, error_rate, is_primary_candidate): def has_nulltype(target): num_values = len(target["values"]) num_empties = target["values"].count("") @@ -76,7 +123,7 @@ def has_duplicates(target, ignore_empties): distinct_values = set(values) return (len(values) - len(distinct_values)) > (error_rate * len(values)) - def get_datatype(target): + def get_datatype(target, dt_hierarchy): # For each tree in the hierarchy: # Look for a match with the 0th element and possibly add it to matching_datatypes. # If there are matches in matching_datatypes: @@ -122,6 +169,10 @@ def tiebreak(datatypes): curr_index += 1 + def get_from(target, foreign_column_data): + # TODO. + pass + target = sample[label] if has_nulltype(target): target["nulltype"] = "empty" @@ -133,49 +184,17 @@ def tiebreak(datatypes): else: target["structure"] = "unique" - target["datatype"] = get_datatype(target)["datatype"] - - -def get_datatype_hierarchy(config): - """ - Given a VALVE configuration, return a datatype hierarchy that looks like this: - {'dt_name_1': [{'datatype': 'dt_name_1', - 'description': 'a description', - ...}, - {'datatype': 'parent datatype', - 'description': 'a description', - ...}, - {'datatype': 'grandparent datatype', - 'description': 'a description', - ...}, - ...], - 'dt_name_2': etc. 
- """ - - def get_hierarchy_for_dt(primary_dt_name): - def get_parents(dt_name): - datatypes = [] - if dt_name is not None: - datatype = config["datatype"][dt_name] - if datatype["datatype"] != primary_dt_name: - datatypes.append(datatype) - datatypes += get_parents(datatype.get("parent")) - return datatypes - - return [config["datatype"][primary_dt_name]] + get_parents(primary_dt_name) - - dt_config = config["datatype"] - dt_names = [dt_name for dt_name in dt_config] - leaf_dts = [] - for dt in dt_names: - children = [child for child in dt_names if dt_config[child].get("parent") == dt] - if not children: - leaf_dts.append(dt) + # Use the valve config to retrieve the valve datatype hierarchy: + dt_hierarchy = get_datatype_hierarchy(config) + target["datatype"] = get_datatype(target, dt_hierarchy)["datatype"] - dt_hierarchy = {} - for leaf_dt in leaf_dts: - dt_hierarchy[leaf_dt] = get_hierarchy_for_dt(leaf_dt) - return dt_hierarchy + # TODO: Use the valve config to get a list of columns already loaded to the database, then + # compare the contents of each column with the contents of the target column and possibly + # annotate the target with a from() structure. + foreign_column_data = get_foreign_column_data(config) + from_structure = get_from(target, foreign_column_data) + if from_structure and not target.get("structure"): + target["structure"] = from_structure if __name__ == "__main__": @@ -216,12 +235,9 @@ def get_parents(dt_name): # Get the valve configuration: config = get_valve_config(args.VALVE_TABLE) - # Use the valve config to retrieve the valve datatype hierarchy: - dt_hierarchy = get_datatype_hierarchy(config) - sample = get_random_sample(args.TABLE, args.sample_size) for i, label in enumerate(sample): - annotate(label, sample, dt_hierarchy, args.error_rate, i == 0) + annotate(label, sample, config, args.error_rate, i == 0) pprint(sample) # For debugging From bc7cb2caec01b7f96c3b39af7cdb3e5eca2741fa Mon Sep 17 00:00:00 2001 From: Michael Cuffaro Date: Mon, 13 Nov 2023 09:47:58 -0500 Subject: [PATCH 13/48] small optimization --- scripts/guess.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/scripts/guess.py b/scripts/guess.py index 8811569e..1e208e77 100755 --- a/scripts/guess.py +++ b/scripts/guess.py @@ -191,10 +191,11 @@ def get_from(target, foreign_column_data): # TODO: Use the valve config to get a list of columns already loaded to the database, then # compare the contents of each column with the contents of the target column and possibly # annotate the target with a from() structure. 
- foreign_column_data = get_foreign_column_data(config) - from_structure = get_from(target, foreign_column_data) - if from_structure and not target.get("structure"): - target["structure"] = from_structure + if not target.get("structure"): + foreign_column_data = get_foreign_column_data(config) + from_structure = get_from(target, foreign_column_data) + if from_structure: + target["structure"] = from_structure if __name__ == "__main__": From 6dfac263a84956e8b5a0a92c81a22a90b156bf94 Mon Sep 17 00:00:00 2001 From: Michael Cuffaro Date: Mon, 13 Nov 2023 09:55:31 -0500 Subject: [PATCH 14/48] add db parameter --- scripts/guess.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/scripts/guess.py b/scripts/guess.py index 1e208e77..542444e5 100755 --- a/scripts/guess.py +++ b/scripts/guess.py @@ -221,6 +221,11 @@ def get_from(target, foreign_column_data): parser.add_argument( "VALVE_TABLE", help="The VALVE table table from which to read the VALVE configuration" ) + parser.add_argument( + "DATABASE", + help="""Can be one of (A) A URL of the form `postgresql://...` or + `sqlite://...` (B) The filename (including path) of a sqlite database.""", + ) parser.add_argument( "TABLE", help="A .TSV file containing the data for which we will be guessing" ) @@ -233,8 +238,9 @@ def get_from(target, foreign_column_data): seed = time.time_ns() random.seed(seed) - # Get the valve configuration: + # Get the valve configuration and database info: config = get_valve_config(args.VALVE_TABLE) + config["db"] = args.DATABASE sample = get_random_sample(args.TABLE, args.sample_size) for i, label in enumerate(sample): From 4a83313fba6df4ded14a6624302b7f69a3a2452a Mon Sep 17 00:00:00 2001 From: Michael Cuffaro Date: Mon, 13 Nov 2023 09:59:41 -0500 Subject: [PATCH 15/48] fix unsupported format error in help --- scripts/guess.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/guess.py b/scripts/guess.py index 542444e5..eef31645 100755 --- a/scripts/guess.py +++ b/scripts/guess.py @@ -207,7 +207,7 @@ def get_from(target, foreign_column_data): help="Sample size to use when guessing (default: 10,000)", ) parser.add_argument( - "--error_rate", type=float, default=0.1, help="Proportion of errors expected (default: 10%)" + "--error_rate", type=float, default=0.1, help="Proportion of errors expected (default: 10%%)" ) parser.add_argument( "--enum_size", From c956c59c55475345978e20d0abf386465ea2942b Mon Sep 17 00:00:00 2001 From: Michael Cuffaro Date: Mon, 13 Nov 2023 11:31:37 -0500 Subject: [PATCH 16/48] rename foreign stub --- scripts/guess.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/scripts/guess.py b/scripts/guess.py index eef31645..13be592e 100755 --- a/scripts/guess.py +++ b/scripts/guess.py @@ -104,8 +104,9 @@ def get_parents(dt_name): return dt_hierarchy -def get_foreign_column_data(config): - # TODO. +def get_potential_foreign_columns(config, datatype): + # TODO. Look for primary and unique columns in other tables that have the same SQL type as the + # one associated with the given datatype. pass @@ -192,8 +193,8 @@ def get_from(target, foreign_column_data): # compare the contents of each column with the contents of the target column and possibly # annotate the target with a from() structure. 
if not target.get("structure"): - foreign_column_data = get_foreign_column_data(config) - from_structure = get_from(target, foreign_column_data) + potential_foreign_columns = get_potential_foreign_columns(config, target["datatype"]) + from_structure = get_from(target, potential_foreign_columns) if from_structure: target["structure"] = from_structure @@ -207,7 +208,10 @@ def get_from(target, foreign_column_data): help="Sample size to use when guessing (default: 10,000)", ) parser.add_argument( - "--error_rate", type=float, default=0.1, help="Proportion of errors expected (default: 10%%)" + "--error_rate", + type=float, + default=0.1, + help="Proportion of errors expected (default: 10%%)", ) parser.add_argument( "--enum_size", From 76be3bd0f26350692f44ea68982daf69e9e94e2b Mon Sep 17 00:00:00 2001 From: Michael Cuffaro Date: Mon, 13 Nov 2023 13:33:40 -0500 Subject: [PATCH 17/48] implement datatype guess --- scripts/guess.py | 97 +++++++++++++++++++++++++++---- scripts/guess_grammar.py | 120 +++++++++++++++++++++++++++++++++++++++ scripts/requirements.txt | 1 + 3 files changed, 208 insertions(+), 10 deletions(-) create mode 100644 scripts/guess_grammar.py create mode 100644 scripts/requirements.txt diff --git a/scripts/guess.py b/scripts/guess.py index 13be592e..871c9749 100755 --- a/scripts/guess.py +++ b/scripts/guess.py @@ -8,7 +8,11 @@ import sys import time +from guess_grammar import grammar, TreeToDict, reverse_parse + from argparse import ArgumentParser +from lark import Lark +from lark.exceptions import VisitError # TODO: Remove this import later (used only for debugging): from pprint import pprint, pformat @@ -105,11 +109,58 @@ def get_parents(dt_name): def get_potential_foreign_columns(config, datatype): - # TODO. Look for primary and unique columns in other tables that have the same SQL type as the + # TODO: Look for primary and unique columns in other tables that have the same SQL type as the # one associated with the given datatype. pass +SAVED_CONDITIONS = {} + + +def get_compiled_condition(condition, parser): + global SAVED_CONDITIONS + + if condition in SAVED_CONDITIONS: + return SAVED_CONDITIONS[condition] + + parsed_condition = parser.parse(condition) + if len(parsed_condition) != 1: + print( + f"'{condition}' is invalid. Only one condition per column is allowed.", file=sys.stderr + ) + sys.exit(1) + parsed_condition = parsed_condition[0] + if parsed_condition["type"] == "function" and parsed_condition["name"] == "equals": + expected = re.sub(r"^['\"](.*)['\"]$", r"\1", parsed_condition["args"][0]["value"]) + compiled_condition = lambda x: x == expected + elif parsed_condition["type"] == "function" and parsed_condition["name"] in ( + "exclude", + "match", + "search", + ): + pattern = re.sub(r"^['\"](.*)['\"]$", r"\1", parsed_condition["args"][0]["pattern"]) + flags = parsed_condition["args"][0]["flags"] + flags = "(?" 
+ "".join(flags) + ")" if flags else "" + pattern = re.compile(flags + pattern) + if parsed_condition["name"] == "exclude": + compiled_condition = lambda x: not bool(pattern.search(x)) + elif parsed_condition["name"] == "match": + compiled_condition = lambda x: bool(pattern.fullmatch(x)) + else: + compiled_condition = lambda x: bool(pattern.search(x)) + elif parsed_condition["type"] == "function" and parsed_condition["name"] == "in": + alternatives = [ + re.sub(r"^['\"](.*)['\"]$", r"\1", arg["value"]) for arg in parsed_condition["args"] + ] + compiled_condition = lambda x: x in alternatives + else: + print(f"Unrecognized condition: {condition}", file=sys.stderr) + sys.exit(1) + + SAVED_CONDITIONS[condition] = compiled_condition + return compiled_condition + + def annotate(label, sample, config, error_rate, is_primary_candidate): def has_nulltype(target): num_values = len(target["values"]) @@ -139,12 +190,26 @@ def is_match(datatype): # If the datatype has no associated condition then it matches anything: if not datatype.get("condition"): return True - # TODO: Replace this with actual code to check if there is a match: - return bool(random.getrandbits(1)) + + condition = get_compiled_condition(datatype["condition"], config["parser"]) + num_values = len(target["values"]) + num_passed = [condition(v) for v in target["values"]].count(True) + success_rate = num_passed / num_values + if (1 - success_rate) <= error_rate: + return success_rate def tiebreak(datatypes): - # TODO: Replace this with actual code to implement the tiebreaker rules: - return random.choice(datatypes) + in_types = [] + other_types = [] + for dt in datatypes: + if dt["datatype"]["condition"].startswith("in("): + in_types.append(dt) + else: + other_types.append(dt) + sorted_types = sorted(in_types, key=lambda k: k["success_rate"], reverse=True) + sorted( + other_types, key=lambda k: k["success_rate"], reverse=True + ) + return sorted_types[0]["datatype"] curr_index = 0 while True: @@ -158,20 +223,28 @@ def tiebreak(datatypes): sys.exit(1) for datatype in datatypes_to_check: - if is_match(datatype): - matching_datatypes.append(datatype) + success_rate = is_match(datatype) + if success_rate: + matching_datatypes.append( + { + "datatype": datatype, + "success_rate": success_rate, + } + ) if len(matching_datatypes) == 0: continue elif len(matching_datatypes) == 1: - return matching_datatypes[0] + return matching_datatypes[0]["datatype"] else: return tiebreak(matching_datatypes) curr_index += 1 - def get_from(target, foreign_column_data): - # TODO. + def get_from(target, potential_foreign_columns): + # TODO: If there is one and only potential foreign column that matches the target, return + # it. If there are none, return None. If there is more than one, then also return None, but + # print the potential matches to STDOUT. 
pass target = sample[label] @@ -246,11 +319,15 @@ def get_from(target, foreign_column_data): config = get_valve_config(args.VALVE_TABLE) config["db"] = args.DATABASE + # Attach the condition parser to the config as well: + config["parser"] = Lark(grammar, parser="lalr", transformer=TreeToDict()) + sample = get_random_sample(args.TABLE, args.sample_size) for i, label in enumerate(sample): annotate(label, sample, config, args.error_rate, i == 0) pprint(sample) + # For debugging # for label in sample: # print(f"{label}: ", end="") diff --git a/scripts/guess_grammar.py b/scripts/guess_grammar.py new file mode 100644 index 00000000..5e611cfb --- /dev/null +++ b/scripts/guess_grammar.py @@ -0,0 +1,120 @@ +from lark import Transformer + +# Grammar used to parse the the contents of `condition` and `structure` columns. +# See: https://lark-parser.readthedocs.io/en/latest/index.html# +grammar = r""" +%import common.WS +%ignore WS + +start: expression+ +?expression: string | function + +?string: label +label: ALPHANUM | DQSTRING | SQSTRING + +function: function_name "(" arguments ")" +function_name: ALPHANUM +arguments: argument ("," argument)* +?argument: string | field | function | named_arg | regex +field: label "." label +named_arg: label "=" label + +?regex: regex_sub | regex_match +regex_match: "/" regex_pattern "/" regex_flags +regex_sub: SUB_BEGIN "/" regex_pattern "/" regex_pattern "/" regex_flags +regex_pattern: REGEX_WITH_FORWARD_SLASH | REGEX_WITHOUT_FORWARD_SLASH +regex_flags: LOWER_ALPHA* + +SUB_BEGIN: "s" +ALPHANUM: /[a-zA-Z0-9-_]/+ +DQSTRING: "\"" /[^"](\\\")?/* "\"" +SQSTRING: "'" /[^'](\\\')?/* "'" +LOWER_ALPHA: /[a-z]/ +NO_SLASH: /[^\/]/ +REGEX_WITH_FORWARD_SLASH: NO_SLASH* "\\/" NO_SLASH* +REGEX_WITHOUT_FORWARD_SLASH: NO_SLASH+ +""" + + +class TreeToDict(Transformer): + """Transformer to convert a Tree, generated by the grammar used by CMI-PB to parse the contents + of `condition` and `structure` columns, into a list of expressions represented as dicts.""" + + def _sanity_check(self, token_list, expected_len): + if len(token_list) != expected_len: + raise Exception(f"Wrong number of tokens in: {token_list} (expecting {expected_len})") + + def label(self, label): + self._sanity_check(label, 1) + label = label[0] + return {"type": "label", "value": label.value} + + def field(self, field): + self._sanity_check(field, 2) + return {"type": "field", "table": field[0]["value"], "column": field[1]["value"]} + + def named_arg(self, named_arg): + self._sanity_check(named_arg, 2) + return {"type": "named_arg", "key": named_arg[0]["value"], "value": named_arg[1]["value"]} + + def regex_match(self, regex_match): + self._sanity_check(regex_match, 2) + return {"type": "regex", "pattern": regex_match[0], "flags": regex_match[1]} + + def regex_sub(self, regex_sub): + self._sanity_check(regex_sub, 4) + return { + "type": "regex", + "pattern": regex_sub[1], + "replace": regex_sub[2], + "flags": regex_sub[3], + } + + def regex_pattern(self, regex_pattern): + self._sanity_check(regex_pattern, 1) + return regex_pattern[0].value + + def regex_flags(self, flags): + return [flag.value for flag in flags] + + def arguments(self, arguments): + return arguments + + def function_name(self, function_name): + self._sanity_check(function_name, 1) + return function_name[0].value + + def function(self, function): + self._sanity_check(function, 2) + return {"type": "function", "name": function[0], "args": function[1]} + + def start(self, start): + return start + + +def reverse_parse(config, parsed_cond): + """Given 
a config map and a parsed condition, return the text version of the condition.""" + cond_type = parsed_cond["type"] + text_cond = None + if cond_type == "label": + if config["datatype"].get(parsed_cond["value"]): + text_cond = config["datatype"][parsed_cond["value"]]["datatype"] + else: + text_cond = "'{}'".format(parsed_cond["value"]) + elif cond_type == "field": + return "{}.{}".format(parsed_cond["table"], parsed_cond["column"]) + elif cond_type == "named_arg": + text_cond = "{}={}".format(parsed_cond["key"], parsed_cond["value"]) + elif cond_type == "regex": + pattern = parsed_cond["pattern"] + flags = "".join(parsed_cond["flags"]) + replace = parsed_cond.get("replace") + text_cond = f"/{pattern}/{flags}" if not replace else f"s/{pattern}/{replace}/{flags}" + elif cond_type == "function": + text_cond = map(lambda arg: reverse_parse(config, arg), parsed_cond["args"]) + text_cond = ", ".join(text_cond) + text_cond = "{}({})".format(parsed_cond["name"], text_cond) + else: + raise Exception(f"Unknown parsed_cond type: {cond_type} for {parsed_cond}") + + return text_cond diff --git a/scripts/requirements.txt b/scripts/requirements.txt new file mode 100644 index 00000000..9547a85f --- /dev/null +++ b/scripts/requirements.txt @@ -0,0 +1 @@ +lark==1.1.8 From a2f64eec77f2d4152ee0f56445957f8d20dc1786 Mon Sep 17 00:00:00 2001 From: Michael Cuffaro Date: Wed, 15 Nov 2023 10:32:51 -0500 Subject: [PATCH 18/48] call lstrip() on in() conditions --- scripts/guess.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/scripts/guess.py b/scripts/guess.py index 871c9749..75123b6e 100755 --- a/scripts/guess.py +++ b/scripts/guess.py @@ -8,11 +8,10 @@ import sys import time -from guess_grammar import grammar, TreeToDict, reverse_parse +from guess_grammar import grammar, TreeToDict from argparse import ArgumentParser from lark import Lark -from lark.exceptions import VisitError # TODO: Remove this import later (used only for debugging): from pprint import pprint, pformat @@ -202,7 +201,7 @@ def tiebreak(datatypes): in_types = [] other_types = [] for dt in datatypes: - if dt["datatype"]["condition"].startswith("in("): + if dt["datatype"]["condition"].lstrip().startswith("in("): in_types.append(dt) else: other_types.append(dt) From 2a4db64a6922daf147b511d85aac3ffecd34c2dd Mon Sep 17 00:00:00 2001 From: Michael Cuffaro Date: Wed, 15 Nov 2023 11:56:31 -0500 Subject: [PATCH 19/48] implement get_potential_foreign_columns() --- scripts/guess.py | 44 ++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 40 insertions(+), 4 deletions(-) diff --git a/scripts/guess.py b/scripts/guess.py index 75123b6e..ba05c240 100755 --- a/scripts/guess.py +++ b/scripts/guess.py @@ -17,6 +17,9 @@ from pprint import pprint, pformat +SPECIAL_TABLES = ["table", "column", "datatype", "rule", "history", "message"] + + def has_ncolumn(sample, ncolumn): return bool([label for label in sample if sample[label]["normalized"] == ncolumn]) @@ -107,10 +110,39 @@ def get_parents(dt_name): return dt_hierarchy +def get_sql_type(config, datatype): + """Given the config map and the name of a datatype, climb the datatype tree (as required), + and return the first 'SQL type' found.""" + if "datatype" not in config: + print("Missing datatypes in config") + sys.exit(1) + if datatype not in config["datatype"]: + return None + if config["datatype"][datatype].get("SQL type"): + return config["datatype"][datatype]["SQL type"] + return get_sql_type(config, config["datatype"][datatype].get("parent")) + + def 
get_potential_foreign_columns(config, datatype): - # TODO: Look for primary and unique columns in other tables that have the same SQL type as the - # one associated with the given datatype. - pass + global SPECIAL_TABLES + + def get_coarser_sql_type(datatype): + sql_type = get_sql_type(config, datatype) + if sql_type not in ["integer", "numeric", "real"]: + return "text" + else: + return sql_type.casefold() + + potential_foreign_columns = [] + this_sql_type = get_coarser_sql_type(datatype) + for table, table_config in config["table"].items(): + if table not in SPECIAL_TABLES: + for column, column_config in table_config["column"].items(): + if column_config.get("structure") in ["primary", "unique"]: + foreign_sql_type = get_coarser_sql_type(column_config["datatype"]) + if foreign_sql_type == this_sql_type: + potential_foreign_columns.append({"table": table, "column": column}) + return potential_foreign_columns SAVED_CONDITIONS = {} @@ -266,6 +298,7 @@ def get_from(target, potential_foreign_columns): # annotate the target with a from() structure. if not target.get("structure"): potential_foreign_columns = get_potential_foreign_columns(config, target["datatype"]) + pprint(potential_foreign_columns) from_structure = get_from(target, potential_foreign_columns) if from_structure: target["structure"] = from_structure @@ -316,6 +349,9 @@ def get_from(target, potential_foreign_columns): # Get the valve configuration and database info: config = get_valve_config(args.VALVE_TABLE) + if args.TABLE.removesuffix(".tsv") in config["table"]: + print(f"{args.TABLE.removesuffix('.tsv')} is already configured.", file=sys.stderr) + sys.exit(0) config["db"] = args.DATABASE # Attach the condition parser to the config as well: @@ -325,7 +361,7 @@ def get_from(target, potential_foreign_columns): for i, label in enumerate(sample): annotate(label, sample, config, args.error_rate, i == 0) - pprint(sample) + # pprint(sample) # For debugging # for label in sample: From e8e163a5e87b1e714b73d283be46ba964f0c03c7 Mon Sep 17 00:00:00 2001 From: Michael Cuffaro Date: Wed, 15 Nov 2023 14:54:08 -0500 Subject: [PATCH 20/48] implement get_froms() --- scripts/guess.py | 59 ++++++++++++++++++++++++++++++++++++------------ 1 file changed, 44 insertions(+), 15 deletions(-) diff --git a/scripts/guess.py b/scripts/guess.py index ba05c240..0fdc622d 100755 --- a/scripts/guess.py +++ b/scripts/guess.py @@ -4,6 +4,7 @@ import json import random import re +import sqlite3 import subprocess import sys import time @@ -12,8 +13,7 @@ from argparse import ArgumentParser from lark import Lark - -# TODO: Remove this import later (used only for debugging): +from numbers import Number from pprint import pprint, pformat @@ -141,7 +141,13 @@ def get_coarser_sql_type(datatype): if column_config.get("structure") in ["primary", "unique"]: foreign_sql_type = get_coarser_sql_type(column_config["datatype"]) if foreign_sql_type == this_sql_type: - potential_foreign_columns.append({"table": table, "column": column}) + potential_foreign_columns.append( + { + "table": table, + "column": column, + "sql_type": foreign_sql_type, + } + ) return potential_foreign_columns @@ -273,10 +279,31 @@ def tiebreak(datatypes): curr_index += 1 def get_from(target, potential_foreign_columns): - # TODO: If there is one and only potential foreign column that matches the target, return - # it. If there are none, return None. If there is more than one, then also return None, but - # print the potential matches to STDOUT. 
- pass + candidate_froms = [] + for foreign in potential_foreign_columns: + table = foreign["table"] + column = foreign["column"] + sql_type = foreign["sql_type"] + num_matches = 0 + num_values = len(target["values"]) + for value in target["values"]: + if target.get("nulltype") == "empty" and value == "": + # If this value is legitimately empty then it should not be taken into account + # when counting the number of values in the target that are found in the + # candidate foreign column: + num_values -= 1 + continue + if sql_type != "text" and not isinstance(value, Number): + # If this value is of the wrong type then there is no need to explicitly check + # if it exists in the foreign column: + continue + if sql_type == "text": + value = f"'{value}'" + sql = f'SELECT 1 FROM "{table}" WHERE "{column}" = {value} LIMIT 1' + num_matches += len(config["db"].execute(sql).fetchall()) + if ((num_values - num_matches) / num_values) < error_rate: + candidate_froms.append(foreign) + return candidate_froms target = sample[label] if has_nulltype(target): @@ -293,15 +320,16 @@ def get_from(target, potential_foreign_columns): dt_hierarchy = get_datatype_hierarchy(config) target["datatype"] = get_datatype(target, dt_hierarchy)["datatype"] - # TODO: Use the valve config to get a list of columns already loaded to the database, then - # compare the contents of each column with the contents of the target column and possibly - # annotate the target with a from() structure. + # Use the valve config to get a list of columns already loaded to the database, then compare + # the contents of each column with the contents of the target column and possibly annotate the + # target with a from() structure, if there is one and only one candidate from(). if not target.get("structure"): potential_foreign_columns = get_potential_foreign_columns(config, target["datatype"]) - pprint(potential_foreign_columns) - from_structure = get_from(target, potential_foreign_columns) - if from_structure: - target["structure"] = from_structure + froms = get_from(target, potential_foreign_columns) + if len(froms) == 1: + target["structure"] = froms[0] + elif len(froms) > 1: + print(f"Column '{label}' has multiple from() candidates: {pformat(froms)}") if __name__ == "__main__": @@ -352,7 +380,8 @@ def get_from(target, potential_foreign_columns): if args.TABLE.removesuffix(".tsv") in config["table"]: print(f"{args.TABLE.removesuffix('.tsv')} is already configured.", file=sys.stderr) sys.exit(0) - config["db"] = args.DATABASE + with sqlite3.connect(args.DATABASE) as conn: + config["db"] = conn # Attach the condition parser to the config as well: config["parser"] = Lark(grammar, parser="lalr", transformer=TreeToDict()) From 43e48ec29b425dc14ca83fdbac1f98d190431acf Mon Sep 17 00:00:00 2001 From: Michael Cuffaro Date: Wed, 15 Nov 2023 16:01:20 -0500 Subject: [PATCH 21/48] do froms before uniques --- Makefile | 6 +++--- scripts/guess.py | 40 +++++++++++++++++++++------------------- 2 files changed, 24 insertions(+), 22 deletions(-) diff --git a/Makefile b/Makefile index 8e53c26a..5832e279 100644 --- a/Makefile +++ b/Makefile @@ -122,7 +122,7 @@ guess_test_dir = test/guess_test_data guess_test_db = build/valve_guess.db $(guess_test_dir)/table1.tsv: test/generate_random_test_data.py valve $(guess_test_dir)/*.tsv - ./$< $$(date +"%s") 50000 5 $(guess_test_dir)/table.tsv $(guess_test_dir) + ./$< 0 30000 5 $(guess_test_dir)/table.tsv $(guess_test_dir) $(guess_test_dir)/ontology: mkdir -p $@ @@ -130,9 +130,9 @@ $(guess_test_dir)/ontology: 
guess_test_data: test/generate_random_test_data.py $(guess_test_dir)/table1.tsv valve confirm_overwrite.sh $(guess_test_dir)/*.tsv | $(guess_test_dir)/ontology ./confirm_overwrite.sh $(guess_test_dir)/ontology rm -f $(guess_test_dir)/table1.tsv - ./$< $$(date +"%s") 50000 5 $(guess_test_dir)/table.tsv $(guess_test_dir) + ./$< 0 30000 5 $(guess_test_dir)/table.tsv $(guess_test_dir) rm -f $(guess_test_dir)/ontology/*.tsv - ./$< $$(date +"%s") 50000 5 $(guess_test_dir)/table_expected.tsv $| + ./$< 0 30000 5 $(guess_test_dir)/table_expected.tsv $| rm -f $(guess_test_dir)/ontology/table1.tsv $(guess_test_db): valve guess_test_data $(guess_test_dir)/*.tsv | build $(guess_test_dir)/ontology diff --git a/scripts/guess.py b/scripts/guess.py index 0fdc622d..cb416a91 100755 --- a/scripts/guess.py +++ b/scripts/guess.py @@ -308,13 +308,6 @@ def get_from(target, potential_foreign_columns): target = sample[label] if has_nulltype(target): target["nulltype"] = "empty" - # Since the target has no nulltype (because the previous branch of the if-statement did not - # apply), all empties are assumed to be errors, so we pass True here: - elif not has_duplicates(target, True): - if is_primary_candidate: - target["structure"] = "primary" - else: - target["structure"] = "unique" # Use the valve config to retrieve the valve datatype hierarchy: dt_hierarchy = get_datatype_hierarchy(config) @@ -323,13 +316,20 @@ def get_from(target, potential_foreign_columns): # Use the valve config to get a list of columns already loaded to the database, then compare # the contents of each column with the contents of the target column and possibly annotate the # target with a from() structure, if there is one and only one candidate from(). + potential_foreign_columns = get_potential_foreign_columns(config, target["datatype"]) + froms = get_from(target, potential_foreign_columns) + if len(froms) == 1: + target["structure"] = froms[0] + elif len(froms) > 1: + print(f"Column '{label}' has multiple from() candidates: {pformat(froms)}") + + # Check if the column is a unique/primary column: if not target.get("structure"): - potential_foreign_columns = get_potential_foreign_columns(config, target["datatype"]) - froms = get_from(target, potential_foreign_columns) - if len(froms) == 1: - target["structure"] = froms[0] - elif len(froms) > 1: - print(f"Column '{label}' has multiple from() candidates: {pformat(froms)}") + if target.get("nulltype") is None and not has_duplicates(target, True): + if is_primary_candidate: + target["structure"] = "primary" + else: + target["structure"] = "unique" if __name__ == "__main__": @@ -390,11 +390,13 @@ def get_from(target, potential_foreign_columns): for i, label in enumerate(sample): annotate(label, sample, config, args.error_rate, i == 0) + # For debugging: # pprint(sample) - # For debugging - # for label in sample: - # print(f"{label}: ", end="") - # for annotation in sample[label]: - # print(f"{annotation} ", end="") - # print() + # For debugging without values: + for label in sample: + print(f"{label}: ", end="") + for annotation, data in sample[label].items(): + if annotation != "values": + print(f"{annotation}: {data}, ", end="") + print() From d6e286238433821b9d2be877af690ef2fdea53ed Mon Sep 17 00:00:00 2001 From: Michael Cuffaro Date: Thu, 16 Nov 2023 10:52:44 -0500 Subject: [PATCH 22/48] textify from structures --- scripts/guess.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/scripts/guess.py b/scripts/guess.py index cb416a91..4fe84331 100755 --- a/scripts/guess.py +++ 
b/scripts/guess.py @@ -14,7 +14,6 @@ from argparse import ArgumentParser from lark import Lark from numbers import Number -from pprint import pprint, pformat SPECIAL_TABLES = ["table", "column", "datatype", "rule", "history", "message"] @@ -302,7 +301,7 @@ def get_from(target, potential_foreign_columns): sql = f'SELECT 1 FROM "{table}" WHERE "{column}" = {value} LIMIT 1' num_matches += len(config["db"].execute(sql).fetchall()) if ((num_values - num_matches) / num_values) < error_rate: - candidate_froms.append(foreign) + candidate_froms.append(f"from({foreign['table']}.{foreign['column']})") return candidate_froms target = sample[label] @@ -321,7 +320,7 @@ def get_from(target, potential_foreign_columns): if len(froms) == 1: target["structure"] = froms[0] elif len(froms) > 1: - print(f"Column '{label}' has multiple from() candidates: {pformat(froms)}") + print(f"Column '{label}' has multiple from() candidates: {', '.join(froms)}") # Check if the column is a unique/primary column: if not target.get("structure"): From 70cc3d7d8e4bb412b7c9e3c3f95a2885a2590828 Mon Sep 17 00:00:00 2001 From: Michael Cuffaro Date: Thu, 16 Nov 2023 12:07:40 -0500 Subject: [PATCH 23/48] fix bugs that causes infinite loop and that attempt to dereference a NoneType --- scripts/guess.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/guess.py b/scripts/guess.py index 4fe84331..e619ca43 100755 --- a/scripts/guess.py +++ b/scripts/guess.py @@ -238,7 +238,7 @@ def tiebreak(datatypes): in_types = [] other_types = [] for dt in datatypes: - if dt["datatype"]["condition"].lstrip().startswith("in("): + if dt["datatype"].get("condition", "").lstrip().startswith("in("): in_types.append(dt) else: other_types.append(dt) @@ -269,6 +269,7 @@ def tiebreak(datatypes): ) if len(matching_datatypes) == 0: + curr_index += 1 continue elif len(matching_datatypes) == 1: return matching_datatypes[0]["datatype"] From 0554aa609181b164997a405c19f6b44fa8ee1d97 Mon Sep 17 00:00:00 2001 From: Michael Cuffaro Date: Thu, 16 Nov 2023 14:50:08 -0500 Subject: [PATCH 24/48] optimize sampling --- scripts/guess.py | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/scripts/guess.py b/scripts/guess.py index e619ca43..df6a066c 100755 --- a/scripts/guess.py +++ b/scripts/guess.py @@ -32,26 +32,27 @@ def get_random_sample(table, sample_size): sample_size = total_rows sample_row_numbers = range(1, total_rows + 1) else: - sample_row_numbers = sorted(random.sample(range(1, total_rows + 1), sample_size)) + sample_row_numbers = random.sample(range(1, total_rows + 1), sample_size) with open(table) as f: rows = csv.DictReader(f, delimiter="\t", quoting=csv.QUOTE_NONE) + rows = [r for r in rows] sample = {} - for i, row in enumerate(rows, start=1): - if i in sample_row_numbers: - for label, value in row.items(): - if label not in sample: - ncolumn = re.sub(r"[^0-9a-zA-Z_]+", "", label).casefold() - if has_ncolumn(sample, ncolumn): - print( - "The data has more than one column with the normalized name " - f"{ncolumn}" - ) - sys.exit(1) - sample[label] = { - "normalized": ncolumn, - "values": [], - } - sample[label]["values"].append(value) + pattern = re.compile(r"[^0-9a-zA-Z_]+") + for i in sample_row_numbers: + for label, value in rows[i].items(): + if label not in sample: + ncolumn = re.sub(pattern, "", label).casefold() + if has_ncolumn(sample, ncolumn): + print( + "The data has more than one column with the normalized name " + f"{ncolumn}" + ) + sys.exit(1) + sample[label] = { + 
"normalized": ncolumn, + "values": [], + } + sample[label]["values"].append(value) return sample From 81d973a66c24de1aae49ff4e8a3425a114ec157b Mon Sep 17 00:00:00 2001 From: Michael Cuffaro Date: Thu, 16 Nov 2023 15:01:51 -0500 Subject: [PATCH 25/48] tweak --- scripts/guess.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/scripts/guess.py b/scripts/guess.py index df6a066c..6f762032 100755 --- a/scripts/guess.py +++ b/scripts/guess.py @@ -30,12 +30,11 @@ def get_random_sample(table, sample_size): if total_rows <= sample_size: sample_size = total_rows - sample_row_numbers = range(1, total_rows + 1) + sample_row_numbers = range(0, total_rows) else: - sample_row_numbers = random.sample(range(1, total_rows + 1), sample_size) + sample_row_numbers = random.sample(range(0, total_rows), sample_size) with open(table) as f: - rows = csv.DictReader(f, delimiter="\t", quoting=csv.QUOTE_NONE) - rows = [r for r in rows] + rows = [r for r in csv.DictReader(f, delimiter="\t", quoting=csv.QUOTE_NONE)] sample = {} pattern = re.compile(r"[^0-9a-zA-Z_]+") for i in sample_row_numbers: From ea0630ae74265d8050f1fee27d8f26329a3ad620 Mon Sep 17 00:00:00 2001 From: Michael Cuffaro Date: Thu, 16 Nov 2023 16:04:59 -0500 Subject: [PATCH 26/48] also sort datatypes by depth --- scripts/guess.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/scripts/guess.py b/scripts/guess.py index 6f762032..488cfa59 100755 --- a/scripts/guess.py +++ b/scripts/guess.py @@ -242,9 +242,9 @@ def tiebreak(datatypes): in_types.append(dt) else: other_types.append(dt) - sorted_types = sorted(in_types, key=lambda k: k["success_rate"], reverse=True) + sorted( - other_types, key=lambda k: k["success_rate"], reverse=True - ) + sorted_types = sorted( + in_types, key=lambda k: (k["depth"], k["success_rate"]), reverse=True + ) + sorted(other_types, key=lambda k: (k["depth"], k["success_rate"]), reverse=True) return sorted_types[0]["datatype"] curr_index = 0 @@ -264,6 +264,7 @@ def tiebreak(datatypes): matching_datatypes.append( { "datatype": datatype, + "depth": curr_index, "success_rate": success_rate, } ) From 14e00a094c5879e10a65a58bfdb28d585efaf8ec Mon Sep 17 00:00:00 2001 From: Michael Cuffaro Date: Thu, 16 Nov 2023 16:17:51 -0500 Subject: [PATCH 27/48] don't duplicate datatype check --- scripts/guess.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/scripts/guess.py b/scripts/guess.py index 488cfa59..3fd484e9 100755 --- a/scripts/guess.py +++ b/scripts/guess.py @@ -252,7 +252,10 @@ def tiebreak(datatypes): matching_datatypes = [] datatypes_to_check = [] for dt_name in dt_hierarchy: - if len(dt_hierarchy[dt_name]) > curr_index: + if ( + len(dt_hierarchy[dt_name]) > curr_index + and dt_hierarchy[dt_name][curr_index] not in datatypes_to_check + ): datatypes_to_check.append(dt_hierarchy[dt_name][curr_index]) if len(datatypes_to_check) == 0: print(f"Could not find a datatype match for column '{label}'") From 2e12ddaafd41d35180674080c53f5f56d0810395 Mon Sep 17 00:00:00 2001 From: Michael Cuffaro Date: Thu, 16 Nov 2023 16:45:36 -0500 Subject: [PATCH 28/48] make get_hierarchy_for_dt() an outer function --- scripts/guess.py | 36 ++++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/scripts/guess.py b/scripts/guess.py index 3fd484e9..b1af55bc 100755 --- a/scripts/guess.py +++ b/scripts/guess.py @@ -67,6 +67,19 @@ def get_valve_config(valve_table): return json.loads(result.stdout.decode()) +def get_hierarchy_for_dt(config, 
primary_dt_name): + def get_parents(dt_name): + datatypes = [] + if dt_name is not None: + datatype = config["datatype"][dt_name] + if datatype["datatype"] != primary_dt_name: + datatypes.append(datatype) + datatypes += get_parents(datatype.get("parent")) + return datatypes + + return [config["datatype"][primary_dt_name]] + get_parents(primary_dt_name) + + def get_datatype_hierarchy(config): """ Given a VALVE configuration, return a datatype hierarchy that looks like this: @@ -82,19 +95,6 @@ def get_datatype_hierarchy(config): ...], 'dt_name_2': etc. """ - - def get_hierarchy_for_dt(primary_dt_name): - def get_parents(dt_name): - datatypes = [] - if dt_name is not None: - datatype = config["datatype"][dt_name] - if datatype["datatype"] != primary_dt_name: - datatypes.append(datatype) - datatypes += get_parents(datatype.get("parent")) - return datatypes - - return [config["datatype"][primary_dt_name]] + get_parents(primary_dt_name) - dt_config = config["datatype"] dt_names = [dt_name for dt_name in dt_config] leaf_dts = [] @@ -105,7 +105,7 @@ def get_parents(dt_name): dt_hierarchy = {} for leaf_dt in leaf_dts: - dt_hierarchy[leaf_dt] = get_hierarchy_for_dt(leaf_dt) + dt_hierarchy[leaf_dt] = get_hierarchy_for_dt(config, leaf_dt) return dt_hierarchy @@ -235,6 +235,14 @@ def is_match(datatype): return success_rate def tiebreak(datatypes): + # TODO: There is a problem with this algorithm, since it implicitly assumes that if two + # datatypes are of the same depth, then neither can be a parent of the other. But this + # is false. We could have, for example, + # leaf_1 -> non_space -> trimmed_line + # leaf_2 -> word -> non_space -> trimmed_line + # Even though non-space is a parent of word, the algorithm classifies both as depth 1. + # We need to have another check in this function to determine whether there are any + # parent-child dependencies between the datatypes in the tiebreaker list. in_types = [] other_types = [] for dt in datatypes: From 318da294400811edf4958fd2b075bf673a0b7b70 Mon Sep 17 00:00:00 2001 From: Michael Cuffaro Date: Sun, 19 Nov 2023 20:10:54 -0500 Subject: [PATCH 29/48] redesign algorithm for get_datatype() --- scripts/guess.py | 160 ++++++++++++++++++++++++----------------------- 1 file changed, 82 insertions(+), 78 deletions(-) diff --git a/scripts/guess.py b/scripts/guess.py index b1af55bc..c0c9f4d8 100755 --- a/scripts/guess.py +++ b/scripts/guess.py @@ -9,11 +9,13 @@ import sys import time +from copy import deepcopy from guess_grammar import grammar, TreeToDict from argparse import ArgumentParser from lark import Lark from numbers import Number +from pprint import pformat SPECIAL_TABLES = ["table", "column", "datatype", "rule", "history", "message"] @@ -80,33 +82,62 @@ def get_parents(dt_name): return [config["datatype"][primary_dt_name]] + get_parents(primary_dt_name) -def get_datatype_hierarchy(config): +def get_dt_hierarchies(config): """ Given a VALVE configuration, return a datatype hierarchy that looks like this: - {'dt_name_1': [{'datatype': 'dt_name_1', - 'description': 'a description', - ...}, - {'datatype': 'parent datatype', - 'description': 'a description', - ...}, - {'datatype': 'grandparent datatype', - 'description': 'a description', - ...}, - ...], - 'dt_name_2': etc. + {0: {'dt_name_1': [{'datatype': 'dt_name_1', + 'description': 'a description', + ...}, + {'datatype': 'parent datatype', + 'description': 'a description', + ...}, + {'datatype': 'grandparent datatype', + 'description': 'a description', + ...}, + ...], + 'dt_name_2': etc.}, + 1: ... 
etc.} """ + + def get_higher_datatypes(datatype_hierarchies, universals, depth): + current_datatypes = [dt_name for dt_name in datatype_hierarchies.get(depth, [])] + higher_datatypes = {} + if current_datatypes: + universals = [dt_name for dt_name in universals] + lower_datatypes = [] + for i in range(0, depth): + lower_datatypes += [dt_name for dt_name in datatype_hierarchies.get(i, [])] + for dt_name in dt_hierarchies[depth]: + dt_hierarchy = dt_hierarchies[depth][dt_name] + if len(dt_hierarchy) > 1: + parent_hierarchy = dt_hierarchy[1:] + parent = parent_hierarchy[0]["datatype"] + if parent not in current_datatypes + lower_datatypes + universals: + higher_datatypes[parent] = parent_hierarchy + return higher_datatypes + dt_config = config["datatype"] dt_names = [dt_name for dt_name in dt_config] - leaf_dts = [] - for dt in dt_names: - children = [child for child in dt_names if dt_config[child].get("parent") == dt] + dt_hierarchies = {0: {}} + universals = {} + for dt_name in dt_names: + # Add all the leaf datatypes to dt_hierarchies at 0 depth: + children = [child for child in dt_names if dt_config[child].get("parent") == dt_name] if not children: - leaf_dts.append(dt) - - dt_hierarchy = {} - for leaf_dt in leaf_dts: - dt_hierarchy[leaf_dt] = get_hierarchy_for_dt(config, leaf_dt) - return dt_hierarchy + dt_hierarchies[0][dt_name] = get_hierarchy_for_dt(config, dt_name) + # Ungrounded and unconditioned datatypes go into the universals category, which are added + # to the top of dt_hierarchies later: + elif not dt_config[dt_name].get("parent") or not dt_config[dt_name].get("condition"): + universals[dt_name] = get_hierarchy_for_dt(config, dt_name) + + depth = 0 + higher_dts = get_higher_datatypes(dt_hierarchies, universals, depth) + while higher_dts: + depth += 1 + dt_hierarchies[depth] = deepcopy(higher_dts) + higher_dts = get_higher_datatypes(dt_hierarchies, universals, depth) + dt_hierarchies[depth + 1] = universals + return dt_hierarchies def get_sql_type(config, datatype): @@ -211,17 +242,7 @@ def has_duplicates(target, ignore_empties): distinct_values = set(values) return (len(values) - len(distinct_values)) > (error_rate * len(values)) - def get_datatype(target, dt_hierarchy): - # For each tree in the hierarchy: - # Look for a match with the 0th element and possibly add it to matching_datatypes. - # If there are matches in matching_datatypes: - # Use the tiebreaker rules to find the best match and annotate the target with it. - # Else: - # Try again with the next highest element of each tree (if one exists) - # - # Note that this is guaranteed to work since the get_datatype_hierarchy() function includes - # the 'text' datatype which matches anything. So if no matches are found raise an error. - + def get_datatype(target, dt_hierarchies): def is_match(datatype): # If the datatype has no associated condition then it matches anything: if not datatype.get("condition"): @@ -235,61 +256,44 @@ def is_match(datatype): return success_rate def tiebreak(datatypes): - # TODO: There is a problem with this algorithm, since it implicitly assumes that if two - # datatypes are of the same depth, then neither can be a parent of the other. But this - # is false. We could have, for example, - # leaf_1 -> non_space -> trimmed_line - # leaf_2 -> word -> non_space -> trimmed_line - # Even though non-space is a parent of word, the algorithm classifies both as depth 1. 
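A check for such parent-child collisions could look like the following sketch, written over the match dicts used in this function, where each entry carries a 'datatype' dict and a 'success_rate' (the helper name is illustrative):

    def drop_parents(matching_datatypes):
        # Discard any match whose datatype another match names as its parent:
        parents = {m["datatype"].get("parent") for m in matching_datatypes}
        parents.discard(None)
        return [m for m in matching_datatypes if m["datatype"]["datatype"] not in parents]

Filtering direct parents this way keeps only the most specific of the tied datatypes.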
- # We need to have another check in this function to determine whether there are any - # parent-child dependencies between the datatypes in the tiebreaker list. in_types = [] other_types = [] + parents = set([dt["datatype"].get("parent") for dt in datatypes]) + parents.discard(None) for dt in datatypes: - if dt["datatype"].get("condition", "").lstrip().startswith("in("): - in_types.append(dt) - else: - other_types.append(dt) - sorted_types = sorted( - in_types, key=lambda k: (k["depth"], k["success_rate"]), reverse=True - ) + sorted(other_types, key=lambda k: (k["depth"], k["success_rate"]), reverse=True) - return sorted_types[0]["datatype"] - - curr_index = 0 - while True: - matching_datatypes = [] - datatypes_to_check = [] - for dt_name in dt_hierarchy: - if ( - len(dt_hierarchy[dt_name]) > curr_index - and dt_hierarchy[dt_name][curr_index] not in datatypes_to_check - ): - datatypes_to_check.append(dt_hierarchy[dt_name][curr_index]) - if len(datatypes_to_check) == 0: - print(f"Could not find a datatype match for column '{label}'") + if dt["datatype"]["datatype"] not in parents: + if dt["datatype"].get("condition", "").lstrip().startswith("in("): + in_types.append(dt) + else: + other_types.append(dt) + + if len(in_types) == 1: + return in_types[0]["datatype"] + elif len(in_types) > 1: + in_types = sorted(in_types, key=lambda k: k["success_rate"], reverse=True) + return in_types[0]["datatype"] + elif len(other_types) == 1: + return other_types[0]["datatype"] + elif len(other_types) > 1: + other_types = sorted(other_types, key=lambda k: k["success_rate"], reverse=True) + return other_types[0]["datatype"] + else: + print(f"Error tiebreaking datatypes: {pformat(datatypes)}") sys.exit(1) + for depth in range(0, len(dt_hierarchies)): + datatypes_to_check = [dt_hierarchies[depth][dt][0] for dt in dt_hierarchies[depth]] + matching_datatypes = [] for datatype in datatypes_to_check: success_rate = is_match(datatype) if success_rate: - matching_datatypes.append( - { - "datatype": datatype, - "depth": curr_index, - "success_rate": success_rate, - } - ) - - if len(matching_datatypes) == 0: - curr_index += 1 - continue - elif len(matching_datatypes) == 1: + matching_datatypes.append({"datatype": datatype, "success_rate": success_rate}) + + if len(matching_datatypes) == 1: return matching_datatypes[0]["datatype"] - else: + elif len(matching_datatypes) > 1: return tiebreak(matching_datatypes) - curr_index += 1 - def get_from(target, potential_foreign_columns): candidate_froms = [] for foreign in potential_foreign_columns: @@ -322,8 +326,8 @@ def get_from(target, potential_foreign_columns): target["nulltype"] = "empty" # Use the valve config to retrieve the valve datatype hierarchy: - dt_hierarchy = get_datatype_hierarchy(config) - target["datatype"] = get_datatype(target, dt_hierarchy)["datatype"] + dt_hierarchies = get_dt_hierarchies(config) + target["datatype"] = get_datatype(target, dt_hierarchies)["datatype"] # Use the valve config to get a list of columns already loaded to the database, then compare # the contents of each column with the contents of the target column and possibly annotate the From 0ea815fa1112f63a2cfe4f455688c28c80e4ad73 Mon Sep 17 00:00:00 2001 From: Michael Cuffaro Date: Wed, 22 Nov 2023 11:05:48 -0500 Subject: [PATCH 30/48] add verbose flag --- scripts/guess.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/scripts/guess.py b/scripts/guess.py index c0c9f4d8..bbb3bfc7 100755 --- a/scripts/guess.py +++ b/scripts/guess.py @@ -19,6 +19,17 @@ 
SPECIAL_TABLES = ["table", "column", "datatype", "rule", "history", "message"] +VERBOSE = False + + +def log(message, force=False, suppress_time=False): + global VERBOSE + + if force or VERBOSE: + if not suppress_time: + print(f"{time.asctime()} {message}", file=sys.stderr) + else: + print(f"{message}", file=sys.stderr) def has_ncolumn(sample, ncolumn): @@ -325,7 +336,7 @@ def get_from(target, potential_foreign_columns): if has_nulltype(target): target["nulltype"] = "empty" - # Use the valve config to retrieve the valve datatype hierarchy: + # Use the valve config to retrieve the valve datatype hierarchies: dt_hierarchies = get_dt_hierarchies(config) target["datatype"] = get_datatype(target, dt_hierarchies)["datatype"] @@ -350,6 +361,7 @@ def get_from(target, potential_foreign_columns): if __name__ == "__main__": parser = ArgumentParser(description="VALVE guesser (prototype)") + parser.add_argument("--verbose", action="store_true", help="Print logging output to STDERR.") parser.add_argument( "--sample_size", type=int, @@ -384,6 +396,8 @@ def get_from(target, potential_foreign_columns): ) args = parser.parse_args() + VERBOSE = args.verbose + # Use the seed argument, or the epoch time if no seed is given, to set up the random generator: if args.seed is not None: seed = args.seed @@ -402,9 +416,12 @@ def get_from(target, potential_foreign_columns): # Attach the condition parser to the config as well: config["parser"] = Lark(grammar, parser="lalr", transformer=TreeToDict()) + log(f"Getting random sample of {args.sample_size} rows from {args.TABLE} ...") sample = get_random_sample(args.TABLE, args.sample_size) for i, label in enumerate(sample): + log(f"Annotating label '{label}' ...") annotate(label, sample, config, args.error_rate, i == 0) + log("Done!") # For debugging: # pprint(sample) From 4a4ea326874057696ec49bdc5de13052ce85a37e Mon Sep 17 00:00:00 2001 From: Michael Cuffaro Date: Sun, 26 Nov 2023 13:36:06 -0500 Subject: [PATCH 31/48] write table and column config to db --- scripts/guess.py | 121 ++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 105 insertions(+), 16 deletions(-) diff --git a/scripts/guess.py b/scripts/guess.py index bbb3bfc7..61e4ea14 100755 --- a/scripts/guess.py +++ b/scripts/guess.py @@ -15,7 +15,9 @@ from argparse import ArgumentParser from lark import Lark from numbers import Number +from pathlib import Path from pprint import pformat +from textwrap import dedent SPECIAL_TABLES = ["table", "column", "datatype", "rule", "history", "message"] @@ -153,14 +155,14 @@ def get_higher_datatypes(datatype_hierarchies, universals, depth): def get_sql_type(config, datatype): """Given the config map and the name of a datatype, climb the datatype tree (as required), - and return the first 'SQL type' found.""" + and return the first 'SQLite type' found.""" if "datatype" not in config: print("Missing datatypes in config") sys.exit(1) if datatype not in config["datatype"]: return None - if config["datatype"][datatype].get("SQL type"): - return config["datatype"][datatype]["SQL type"] + if config["datatype"][datatype].get("SQLite type"): + return config["datatype"][datatype]["SQLite type"] return get_sql_type(config, config["datatype"][datatype].get("parent")) @@ -258,6 +260,10 @@ def is_match(datatype): # If the datatype has no associated condition then it matches anything: if not datatype.get("condition"): return True + # If the SQLite type is NULL this datatype is ruled out: + sqlite_type = datatype.get("SQLite type") + if sqlite_type and sqlite_type.casefold() == 
"null": + return False condition = get_compiled_condition(datatype["condition"], config["parser"]) num_values = len(target["values"]) @@ -372,7 +378,8 @@ def get_from(target, potential_foreign_columns): "--error_rate", type=float, default=0.1, - help="Proportion of errors expected (default: 10%%)", + help="""A number between 0 and 1 (inclusive) representing the proportion of errors expected + (default: 0.1)""", ) parser.add_argument( "--enum_size", @@ -383,6 +390,11 @@ def get_from(target, potential_foreign_columns): parser.add_argument( "--seed", type=int, help="Seed to use for random sampling (default: current epoch time)" ) + parser.add_argument( + "--yes", + action="store_true", + help="Do not ask for confirmation before writing suggested modifications to the database", + ) parser.add_argument( "VALVE_TABLE", help="The VALVE table table from which to read the VALVE configuration" ) @@ -407,8 +419,10 @@ def get_from(target, potential_foreign_columns): # Get the valve configuration and database info: config = get_valve_config(args.VALVE_TABLE) - if args.TABLE.removesuffix(".tsv") in config["table"]: - print(f"{args.TABLE.removesuffix('.tsv')} is already configured.", file=sys.stderr) + table_tsv = args.TABLE + table = Path(args.TABLE).stem + if table in config["table"]: + print(f"{table} is already configured.", file=sys.stderr) sys.exit(0) with sqlite3.connect(args.DATABASE) as conn: config["db"] = conn @@ -416,20 +430,95 @@ def get_from(target, potential_foreign_columns): # Attach the condition parser to the config as well: config["parser"] = Lark(grammar, parser="lalr", transformer=TreeToDict()) - log(f"Getting random sample of {args.sample_size} rows from {args.TABLE} ...") - sample = get_random_sample(args.TABLE, args.sample_size) + log(f"Getting random sample of {args.sample_size} rows from {table_tsv} ...") + sample = get_random_sample(table_tsv, args.sample_size) for i, label in enumerate(sample): log(f"Annotating label '{label}' ...") annotate(label, sample, config, args.error_rate, i == 0) log("Done!") - # For debugging: - # pprint(sample) + table_table_headers = ["table", "path", "type", "description"] + column_table_headers = [ + "table", + "column", + "label", + "nulltype", + "datatype", + "structure", + "description", + ] + if not args.yes: + print() + + print('The following row will be inserted to "table":') + data = [table_table_headers, [f"{table}", f"{table_tsv}", "", ""]] + # We add +2 for padding + col_width = max(len(word) for row in data for word in row) + 2 + for row in data: + print("".join(word.ljust(col_width) for word in row)) + + print() + + print('The following row will be inserted to "column":') + data = [column_table_headers] + for label in sample: + row = [ + f"{table}", + f"{sample[label]['normalized']}", + f"{label}", + f"{sample[label].get('nulltype', '')}", + f"{sample[label]['datatype']}", + f"{sample[label].get('structure', '')}", + f"{sample[label].get('description', '')}", + ] + data.append(row) + # We add +2 for padding + col_width = max(len(word) for row in data for word in row) + 2 + for row in data: + print("".join(word.ljust(col_width) for word in row)) - # For debugging without values: - for label in sample: - print(f"{label}: ", end="") - for annotation, data in sample[label].items(): - if annotation != "values": - print(f"{annotation}: {data}, ", end="") print() + + answer = input("Do you want to write this updated configuration to the database? 
(y/n) ") + if answer.casefold() != "y": + print("Not writing updated configuration to the database.") + sys.exit(0) + + log("Updating table configuration in database ...") + row_number = conn.execute('SELECT MAX(row_number) FROM "table"').fetchall()[0][0] + 1 + sql = dedent( + f""" + INSERT INTO "table" ("row_number", {', '.join([f'"{k}"' for k in table_table_headers])}) + VALUES ({row_number}, '{table}', '{table_tsv}', NULL, NULL)""" + ) + log(sql, suppress_time=True) + log("", suppress_time=True) + conn.execute(sql) + conn.commit() + + log("Updating column configuration in database ...") + row_number = conn.execute('SELECT MAX(row_number) FROM "column"').fetchall()[0][0] + 1 + for label in sample: + values = ", ".join( + [ + f"{row_number}", + f"'{table}'", + f"'{sample[label]['normalized']}'", + f"'{label}'", + f"'{sample[label]['nulltype']}'" if sample[label].get("nulltype") else "NULL", + f"'{sample[label]['datatype']}'", + f"'{sample[label]['structure']}'" if sample[label].get("structure") else "NULL", + f"'{sample[label]['description']}'" if sample[label].get("description") else "NULL", + ] + ) + sql = dedent( + f""" + INSERT INTO "column" ("row_number", {', '.join([f'"{k}"' for k in column_table_headers])}) + VALUES ({values})""" + ) + log(sql, suppress_time=True) + conn.execute(sql) + conn.commit() + row_number += 1 + log("", suppress_time=True) + log("Done!") From 70d035937d2a8effbfc1d6803542d2522036e9d3 Mon Sep 17 00:00:00 2001 From: Michael Cuffaro Date: Sun, 26 Nov 2023 14:30:17 -0500 Subject: [PATCH 32/48] warn but do not panic, during load, if table file doesn't exist --- src/lib.rs | 36 +++++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 296fd7cf..638770a0 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -4403,6 +4403,27 @@ async fn load_db( let mut total_infos = 0; let mut table_num = 1; for table_name in table_list { + let path = String::from( + config + .get("table") + .and_then(|t| t.as_object()) + .and_then(|o| o.get(&table_name)) + .and_then(|n| n.get("path")) + .and_then(|p| p.as_str()) + .unwrap(), + ); + let mut rdr = { + match File::open(path.clone()) { + Err(e) => { + eprintln!("WARN: Unable to open '{}': {}", path.clone(), e); + continue; + } + Ok(table_file) => csv::ReaderBuilder::new() + .has_headers(false) + .delimiter(b'\t') + .from_reader(table_file), + } + }; if verbose { eprintln!( "{} - Loading table {}/{}: {}", @@ -4413,21 +4434,6 @@ async fn load_db( ); } table_num += 1; - let path = String::from( - config - .get("table") - .and_then(|t| t.as_object()) - .and_then(|o| o.get(&table_name)) - .and_then(|n| n.get("path")) - .and_then(|p| p.as_str()) - .unwrap(), - ); - let mut rdr = csv::ReaderBuilder::new() - .has_headers(false) - .delimiter(b'\t') - .from_reader(File::open(path.clone()).unwrap_or_else(|err| { - panic!("Unable to open '{}': {}", path.clone(), err); - })); // Extract the headers, which we will need later: let mut records = rdr.records(); From 0bd3ad3a8de27cf49895298e87414682602404b4 Mon Sep 17 00:00:00 2001 From: Michael Cuffaro Date: Sun, 26 Nov 2023 14:48:15 -0500 Subject: [PATCH 33/48] add ValveRow alias --- src/lib.rs | 43 ++++++++++++++++++++++--------------------- src/validate.rs | 22 +++++++++++----------- 2 files changed, 33 insertions(+), 32 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 638770a0..ba4b2167 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -77,10 +77,11 @@ lazy_static! 
{ static ref SL_SQL_TYPES: Vec<&'static str> = vec!["text", "numeric", "integer", "real"]; } -/// An alias for [serde_json::Map](..//serde_json/struct.Map.html). +/// Aliases for [serde_json::Map](..//serde_json/struct.Map.html). // Note: serde_json::Map is // [backed by a BTreeMap by default](https://docs.serde.rs/serde_json/map/index.html) pub type SerdeMap = serde_json::Map; +pub type ValveRow = serde_json::Map; /// Represents a structure such as those found in the `structure` column of the `column` table in /// both its parsed format (i.e., as an [Expression](ast/enum.Expression.html)) as well as in its @@ -1444,7 +1445,7 @@ pub async fn get_affected_rows( global_config: &SerdeMap, pool: &AnyPool, tx: &mut Transaction<'_, sqlx::Any>, -) -> Result, String> { +) -> Result, String> { // Since the consequence of an update could involve currently invalid rows // (in the conflict table) becoming valid or vice versa, we need to check rows for // which the value of the column is the same as `value` @@ -1470,7 +1471,7 @@ pub async fn get_affected_rows( .await .map_err(|e| e.to_string())? { - let mut table_row = SerdeMap::new(); + let mut table_row = ValveRow::new(); let mut row_number: Option = None; for column in row.columns() { let cname = column.name(); @@ -1508,7 +1509,7 @@ pub async fn get_row_from_db( tx: &mut Transaction<'_, sqlx::Any>, table: &str, row_number: &u32, -) -> Result { +) -> Result { let sql = format!( "{} WHERE row_number = {}", query_with_message_values(table, global_config, pool), @@ -1541,7 +1542,7 @@ pub async fn get_row_from_db( } }; - let mut row = SerdeMap::new(); + let mut row = ValveRow::new(); for column in sql_row.columns() { let cname = column.name(); if !vec!["row_number", "message"].contains(&cname) { @@ -1649,7 +1650,7 @@ pub async fn get_rows_to_update( ), String, > { - fn get_cell_value(row: &SerdeMap, column: &str) -> Result { + fn get_cell_value(row: &ValveRow, column: &str) -> Result { match row.get(column).and_then(|cell| cell.get("value")) { Some(SerdeValue::String(s)) => Ok(format!("{}", s)), Some(SerdeValue::Number(n)) => Ok(format!("{}", n)), @@ -1900,8 +1901,8 @@ pub async fn record_row_change( tx: &mut Transaction<'_, sqlx::Any>, table: &str, row_number: &u32, - from: Option<&SerdeMap>, - to: Option<&SerdeMap>, + from: Option<&ValveRow>, + to: Option<&ValveRow>, user: &str, ) -> Result<(), sqlx::Error> { if let (None, None) = (from, to) { @@ -1910,8 +1911,8 @@ pub async fn record_row_change( )); } - fn to_text(smap: Option<&SerdeMap>, quoted: bool) -> String { - match smap { + fn to_text(row: Option<&ValveRow>, quoted: bool) -> String { + match row { None => "NULL".to_string(), Some(r) => { let inner = format!("{}", json!(r)).replace("'", "''"); @@ -1932,7 +1933,7 @@ pub async fn record_row_change( } } - fn summarize(from: Option<&SerdeMap>, to: Option<&SerdeMap>) -> Result { + fn summarize(from: Option<&ValveRow>, to: Option<&ValveRow>) -> Result { // Constructs a summary of the form: // { // "column":"bar", @@ -2420,7 +2421,7 @@ pub async fn insert_new_row( compiled_rule_conditions: &HashMap>>, pool: &AnyPool, table: &str, - row: &SerdeMap, + row: &ValveRow, new_row_number: Option, user: &str, ) -> Result { @@ -2469,7 +2470,7 @@ pub async fn insert_new_row_tx( pool: &AnyPool, tx: &mut Transaction, table: &str, - row: &SerdeMap, + row: &ValveRow, new_row_number: Option, skip_validation: bool, ) -> Result { @@ -2789,7 +2790,7 @@ pub async fn update_row( compiled_rule_conditions: &HashMap>>, pool: &AnyPool, table_name: &str, - row: &SerdeMap, + 
row: &ValveRow, row_number: &u32, user: &str, ) -> Result<(), sqlx::Error> { @@ -2854,7 +2855,7 @@ pub async fn update_row_tx( pool: &AnyPool, tx: &mut Transaction, table: &str, - row: &SerdeMap, + row: &ValveRow, row_number: &u32, skip_validation: bool, do_not_recurse: bool, @@ -2967,10 +2968,10 @@ pub async fn update_row_tx( Ok(()) } -/// Given a path, read a TSV file and return a vector of rows represented as SerdeMaps. +/// Given a path, read a TSV file and return a vector of rows represented as ValveRows. /// Note: Use this function to read "small" TSVs only. In particular, use this for the special /// configuration tables. -fn read_tsv_into_vector(path: &str) -> Vec { +fn read_tsv_into_vector(path: &str) -> Vec { let mut rdr = csv::ReaderBuilder::new() .delimiter(b'\t') @@ -2981,7 +2982,7 @@ fn read_tsv_into_vector(path: &str) -> Vec { let rows: Vec<_> = rdr .deserialize() .map(|result| { - let row: SerdeMap = result.expect(format!("Error reading: {}", path).as_str()); + let row: ValveRow = result.expect(format!("Error reading: {}", path).as_str()); row }) .collect(); @@ -3010,8 +3011,8 @@ fn read_tsv_into_vector(path: &str) -> Vec { } /// Given a database at the specified location, query the "table" table and return a vector of rows -/// represented as SerdeMaps. -fn read_db_table_into_vector(database: &str, config_table: &str) -> Vec { +/// represented as ValveRows. +fn read_db_table_into_vector(database: &str, config_table: &str) -> Vec { let connection_options; if database.starts_with("postgresql://") { connection_options = AnyConnectOptions::from_str(database).unwrap(); @@ -3036,7 +3037,7 @@ fn read_db_table_into_vector(database: &str, config_table: &str) -> Vec, + pub row: Option, } /// Given a config map, maps of compiled datatype and rule conditions, a database connection @@ -62,10 +62,10 @@ pub async fn validate_row( pool: &AnyPool, tx: Option<&mut Transaction<'_, sqlx::Any>>, table_name: &str, - row: &SerdeMap, + row: &ValveRow, row_number: Option, query_as_if: Option<&QueryAsIf>, -) -> Result { +) -> Result { // Fallback to a default transaction if it is not given. Since we do not commit before it falls // out of scope the transaction will be rolled back at the end of this function. And since this // function is read-only the rollback is trivial and therefore inconsequential. @@ -944,10 +944,10 @@ pub fn validate_rows_intra( result_rows } -/// Given a row represented as a SerdeMap, remove any duplicate messages from the row's cells, so +/// Given a row represented as a ValveRow, remove any duplicate messages from the row's cells, so /// that no cell has messages with the same level, rule, and message text. -fn remove_duplicate_messages(row: &SerdeMap) -> Result { - let mut deduped_row = SerdeMap::new(); +fn remove_duplicate_messages(row: &ValveRow) -> Result { + let mut deduped_row = ValveRow::new(); for (column_name, cell) in row.iter() { let mut messages = cell .get("messages") @@ -981,12 +981,12 @@ fn remove_duplicate_messages(row: &SerdeMap) -> Result { Ok(deduped_row) } -/// Given a result row, convert it to a SerdeMap and return it. +/// Given a result row, convert it to a ValveRow and return it. /// Note that if the incoming result row has an associated row_number, this is ignored. 
-fn result_row_to_config_map(incoming: &ResultRow) -> SerdeMap {
-    let mut outgoing = SerdeMap::new();
+fn result_row_to_config_map(incoming: &ResultRow) -> ValveRow {
+    let mut outgoing = ValveRow::new();
     for (column, cell) in incoming.contents.iter() {
-        let mut cell_map = SerdeMap::new();
+        let mut cell_map = ValveRow::new();
         if let Some(nulltype) = &cell.nulltype {
             cell_map.insert(
                 "nulltype".to_string(),

From c20bca7e7c094ce70b4e448dd83d25adbde7cb9b Mon Sep 17 00:00:00 2001
From: Michael Cuffaro
Date: Sun, 26 Nov 2023 15:15:25 -0500
Subject: [PATCH 34/48] add stubs for new API

---
 src/lib.rs | 192 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 192 insertions(+)

diff --git a/src/lib.rs b/src/lib.rs
index ba4b2167..1d056e99 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -83,6 +83,198 @@ lazy_static! {
 pub type SerdeMap = serde_json::Map<String, SerdeValue>;
 pub type ValveRow = serde_json::Map<String, SerdeValue>;

+pub struct Valve {
+    global_config: SerdeMap,
+    compiled_datatype_conditions: HashMap<String, CompiledCondition>,
+    compiled_rule_conditions: HashMap<String, HashMap<String, Vec<ColumnRule>>>,
+    pool: AnyPool,
+    user: String,
+}
+
+impl Valve {
+    /// Given a path to a table table,
+    /// read it, configure VALVE, and return a new Valve struct.
+    /// Return an error if reading or configuration fails.
+    pub fn build(mut self, table_path: &str) -> Result<Self, sqlx::Error> {
+        // Should be ConfigError
+        todo!();
+        Ok(self)
+    }
+
+    /// Set the user name for this instance.
+    /// The username must be a short string without newlines.
+    /// Return an error on invalid username.
+    pub fn set_user(mut self, user: &str) -> Result<Self, sqlx::Error> {
+        // ConfigError
+        todo!();
+        Ok(self)
+    }
+
+    /// Given a database connection string,
+    /// create a database connection for VALVE to use.
+    /// Drop and replace any current database connection.
+    /// Return an error if the connection cannot be created.
+    pub fn connect(mut self, connection: &str) -> Result<Self, sqlx::Error> {
+        // DatabaseError
+        todo!();
+        Ok(self)
+    }
+
+    /// Create all configured database tables and views
+    /// if they do not already exist as configured.
+    /// Return an error on database problems.
+    pub fn create_all_tables(mut self) -> Result<Self, sqlx::Error> {
+        // DatabaseError
+        todo!();
+        Ok(self)
+    }
+
+    /// Drop all configured tables, in reverse dependency order.
+    /// Return an error on database problem.
+    pub fn drop_all_tables(self) -> Result<Self, sqlx::Error> {
+        // DatabaseError
+        todo!();
+        Ok(self)
+    }
+
+    /// Given a vector of table names,
+    /// drop those tables, in the given order.
+    /// Return an error on invalid table name or database problem.
+    pub fn drop_tables(self, tables: Vec<&str>) -> Result<Self, sqlx::Error> {
+        // DatabaseError
+        todo!();
+        Ok(self)
+    }
+
+    /// Truncate all configured tables, in reverse dependency order.
+    /// Return an error on database problem.
+    pub fn truncate_all_tables(self) -> Result<Self, sqlx::Error> {
+        // DatabaseError
+        todo!();
+        Ok(self)
+    }
+
+    /// Given a vector of table names,
+    /// truncate those tables, in the given order.
+    /// Return an error on invalid table name or database problem.
+    pub fn truncate_tables(self, tables: Vec<&str>) -> Result<Self, sqlx::Error> {
+        // ConfigOrDatabaseError
+        //self.create_all_tables();
+        todo!();
+        Ok(self)
+    }
+
+    /// Load all configured tables in dependency order.
+    /// If `validate` is false, just try to insert all rows.
+    /// Return an error on database problem,
+    /// including database conflicts that prevent rows being inserted.
+    pub fn load_all_tables(self, validate: bool) -> Result<Self, sqlx::Error> {
+        // DatabaseError
+        //self.create_all_tables();
+        //self.truncate_all_tables();
+        todo!();
+        Ok(self)
+    }
+
+    /// Given a vector of table names,
+    /// load those tables in the given order.
+    /// If `validate` is false, just try to insert all rows.
+    /// Return an error on invalid table name or database problem.
+    pub fn load_tables(self, tables: Vec<&str>, validate: bool) -> Result<Self, sqlx::Error> {
+        // ConfigOrDatabaseError
+        //self.create_all_tables();
+        //self.truncate_tables(tables);
+        todo!();
+        Ok(self)
+    }
+
+    /// Save all configured tables to their 'path's.
+    /// Return an error on writing or database problem.
+    pub fn save_all_tables(self) -> Result<Self, sqlx::Error> {
+        // WriteOrDatabaseError
+        todo!();
+        Ok(self)
+    }
+
+    /// Given a vector of table names,
+    /// save those tables to their 'path's, in the given order.
+    /// Return an error on writing or database problem.
+    pub fn save_tables(self, tables: Vec<&str>) -> Result<Self, sqlx::Error> {
+        // WriteOrDatabaseError
+        todo!();
+        Ok(self)
+    }
+
+    /// Given a table name and a row as JSON,
+    /// return the validated row.
+    /// Return an error on database problem.
+    pub fn validate_row(self, table_name: &str, row: &ValveRow) -> Result<ValveRow, sqlx::Error> {
+        // DatabaseError
+        todo!();
+    }
+
+    /// Given a table name and a row as JSON,
+    /// add the row to the table in the database,
+    /// and return the validated row, including its new row_number.
+    /// Return an error on invalid table name or database problem.
+    pub fn insert_row(self, table_name: &str, row: &ValveRow) -> Result<ValveRow, sqlx::Error> {
+        // ConfigOrDatabaseError
+        todo!();
+    }
+
+    /// Given a table name, a row number, and a row as JSON,
+    /// update the row in the database,
+    /// and return the validated row.
+    /// Return an error on invalid table name or row number or database problem.
+    pub fn update_row(
+        self,
+        table_name: &str,
+        row_number: usize,
+        row: &ValveRow,
+    ) -> Result<ValveRow, sqlx::Error> {
+        // ConfigOrDatabaseError
+        todo!();
+    }
+
+    /// Given a table name and a row number,
+    /// delete that row from the table.
+    /// Return an error on invalid table name or row number or database problem.
+    pub fn delete_row(self, table_name: &str, row_number: usize) -> Result<(), sqlx::Error> {
+        // ConfigOrDatabaseError
+        todo!();
+    }
+
+    /// Return the next change to undo, or None.
+    /// Return an error on database problem.
+    pub fn get_record_to_undo(self) -> Result<Option<ValveRow>, sqlx::Error> {
+        // DatabaseError
+        todo!();
+    }
+
+    /// Return the next change to redo, or None.
+    /// Return an error on database problem.
+    pub fn get_record_to_redo(self) -> Result<Option<ValveRow>, sqlx::Error> {
+        // DatabaseError
+        todo!();
+    }
+
+    /// Undo one change and return the change record
+    /// or None if there was no change to undo.
+    /// Return an error on database problem.
+    pub fn undo(self) -> Result<Option<ValveRow>, sqlx::Error> {
+        // DatabaseError
+        todo!();
+    }
+
+    /// Redo one change and return the change record
+    /// or None if there was no change to redo.
+    /// Return an error on database problem.
+    pub fn redo(self) -> Result<Option<ValveRow>, sqlx::Error> {
+        // DatabaseError
+        todo!();
+    }
+}
+
 /// Represents a structure such as those found in the `structure` column of the `column` table in
 /// both its parsed format (i.e., as an [Expression](ast/enum.Expression.html)) as well as in its
 /// original format (i.e., as a plain String).
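A quick usage sketch (not part of the patches) of how the stubbed interface above is meant to be driven once implemented. The table name ("organism"), column name ("species"), and user name below are invented for illustration, and the {"value": ...} cell shape follows the convention used by get_cell_value() earlier in this file:

    // Sketch only; the methods above are all still todo!() at this point in the series.
    use serde_json::json;

    fn demo(valve: Valve) -> Result<(), sqlx::Error> {
        // A ValveRow maps each column name to a cell object whose "value" key
        // holds the cell's content:
        let mut row = ValveRow::new();
        row.insert("species".to_string(), json!({"value": "Homo sapiens"}));

        // Builder-style: each call consumes the struct and returns it on success:
        let valve = valve.set_user("demo_user")?;

        // validate_row() is expected to return the row annotated with validation
        // results; insert_row() would additionally assign a new row_number:
        let validated = valve.validate_row("organism", &row)?;
        println!("{}", json!(validated));
        Ok(())
    }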
From 2a6eabcf928ffbdde709c4c5d3b80a99cd6ce2e0 Mon Sep 17 00:00:00 2001
From: Michael Cuffaro
Date: Sun, 26 Nov 2023 19:27:17 -0500
Subject: [PATCH 35/48] implement Valve::build()

---
 src/lib.rs | 116 ++++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 105 insertions(+), 11 deletions(-)

diff --git a/src/lib.rs b/src/lib.rs
index 1d056e99..8407b621 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -83,22 +83,116 @@ lazy_static! {
 pub type SerdeMap = serde_json::Map<String, SerdeValue>;
 pub type ValveRow = serde_json::Map<String, SerdeValue>;

+#[derive(Debug)]
 pub struct Valve {
-    global_config: SerdeMap,
-    compiled_datatype_conditions: HashMap<String, CompiledCondition>,
-    compiled_rule_conditions: HashMap<String, HashMap<String, Vec<ColumnRule>>>,
-    pool: AnyPool,
-    user: String,
+    pub global_config: SerdeMap,
+    pub compiled_datatype_conditions: HashMap<String, CompiledCondition>,
+    pub compiled_rule_conditions: HashMap<String, HashMap<String, Vec<ColumnRule>>>,
+    pub pool: Option<AnyPool>,
+    pub user: String,
 }

+// TODO NEXT: Move the existing public functions into this interface:
 impl Valve {
-    /// Given a path to a table table,
-    /// read it, configure VALVE, and return a new Valve struct.
+    /// Given a path to a table table and its name, read the table table, configure VALVE
+    /// partially ... TODO: finish this.
+    /// , and return a new Valve struct.
     /// Return an error if reading or configuration fails.
-    pub fn build(mut self, table_path: &str) -> Result<Self, sqlx::Error> {
+    pub async fn build(
+        table_path: &str,
+        config_table: &str,
+        // TODO: We need to refactor configure_db() so that it no longer collects the constraints
+        // configuration. We will do that in read_config_files() instead.
+        // Once this is implemented, the code below to construct the AnyPool which is used to
+        // call configure_db() should be removed.
+        // We will also remove the `database`, `initial_load` and `verbose` parameters.
+        database: &str,
+        initial_load: bool,
+        verbose: bool,
+    ) -> Result<Self, sqlx::Error> {
         // Should be ConfigError
-        todo!();
-        Ok(self)
+
+        let parser = StartParser::new();
+
+        let (specials_config, mut tables_config, mut datatypes_config, rules_config) =
+            read_config_files(table_path, config_table);
+
+        ////////////////////////////////////////////////////////////////////////////////////////
+        // TODO: Remove this block of code later (see comment above)
+        let connection_options;
+        if database.starts_with("postgresql://") {
+            connection_options = AnyConnectOptions::from_str(database)?;
+        } else {
+            let connection_string;
+            if !database.starts_with("sqlite://") {
+                connection_string = format!("sqlite://{}?mode=rwc", database);
+            } else {
+                connection_string = database.to_string();
+            }
+            connection_options = AnyConnectOptions::from_str(connection_string.as_str()).unwrap();
+        }
+
+        let pool = AnyPoolOptions::new()
+            .max_connections(5)
+            .connect_with(connection_options)
+            .await?;
+
+        let (sorted_table_list, constraints_config) = configure_db(
+            &mut tables_config,
+            &mut datatypes_config,
+            &pool,
+            &parser,
+            verbose,
+            &ValveCommand::Config,
+        )
+        .await?;
+        ////////////////////////////////////////////////////////////////////////////////////////
+
+        let mut global_config = SerdeMap::new();
+        global_config.insert(
+            String::from("special"),
+            SerdeValue::Object(specials_config.clone()),
+        );
+        global_config.insert(
+            String::from("table"),
+            SerdeValue::Object(tables_config.clone()),
+        );
+        global_config.insert(
+            String::from("datatype"),
+            SerdeValue::Object(datatypes_config.clone()),
+        );
+        global_config.insert(
+            String::from("rule"),
+            SerdeValue::Object(rules_config.clone()),
+        );
+        global_config.insert(
+            String::from("constraints"),
+            SerdeValue::Object(constraints_config.clone()),
+        );
+        let mut sorted_table_serdevalue_list: Vec<SerdeValue> = vec![];
+        for table in &sorted_table_list {
+            sorted_table_serdevalue_list.push(SerdeValue::String(table.to_string()));
+        }
+        global_config.insert(
+            String::from("sorted_table_list"),
+            SerdeValue::Array(sorted_table_serdevalue_list),
+        );
+
+        let compiled_datatype_conditions =
+            get_compiled_datatype_conditions(&global_config, &parser);
+        let compiled_rule_conditions = get_compiled_rule_conditions(
+            &global_config,
+            compiled_datatype_conditions.clone(),
+            &parser,
+        );
+
+        Ok(Self {
+            global_config: global_config,
+            compiled_datatype_conditions: compiled_datatype_conditions,
+            compiled_rule_conditions: compiled_rule_conditions,
+            pool: None,
+            user: String::from("Valve"),
+        })
     }

     /// Set the user name for this instance.
     /// The username must be a short string without newlines.
     /// Return an error on invalid username.
     pub fn set_user(mut self, user: &str) -> Result<Self, sqlx::Error> {
         // ConfigError
-        todo!();
+        self.user = user.to_string();
         Ok(self)
     }

From 29a1be894a811c417841a3e2a2a39becaa5aaddb Mon Sep 17 00:00:00 2001
From: Michael Cuffaro
Date: Mon, 27 Nov 2023 08:23:25 -0500
Subject: [PATCH 36/48] refactor, fix api sigs, implement Valve::connect() and
 Valve::create_tables()

---
 src/lib.rs | 112 +++++++++++++++++++++++++++++++++--------------------
 1 file changed, 71 insertions(+), 41 deletions(-)

diff --git a/src/lib.rs b/src/lib.rs
index 8407b621..fd14adf4 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -95,7 +95,7 @@ pub struct Valve {
 // TODO NEXT: Move the existing public functions into this interface:
 impl Valve {
     /// Given a path to a table table and its name, read the table table, configure VALVE
-    /// partially ... TODO: finish this.
+    /// partially ... TODO: finish rewriting this doc string.
     /// , and return a new Valve struct.
     /// Return an error if reading or configuration fails.
@@ -110,7 +110,7 @@ impl Valve {
         initial_load: bool,
         verbose: bool,
     ) -> Result<Self, sqlx::Error> {
-        // Should be ConfigError
+        // TODO: Error type should be ConfigError

         let parser = StartParser::new();

@@ -119,24 +119,7 @@ impl Valve {
         let (specials_config, mut tables_config, mut datatypes_config, rules_config) =
             read_config_files(table_path, config_table);

         ////////////////////////////////////////////////////////////////////////////////////////
         // TODO: Remove this block of code later (see comment above)
-        let connection_options;
-        if database.starts_with("postgresql://") {
-            connection_options = AnyConnectOptions::from_str(database)?;
-        } else {
-            let connection_string;
-            if !database.starts_with("sqlite://") {
-                connection_string = format!("sqlite://{}?mode=rwc", database);
-            } else {
-                connection_string = database.to_string();
-            }
-            connection_options = AnyConnectOptions::from_str(connection_string.as_str()).unwrap();
-        }
-
-        let pool = AnyPoolOptions::new()
-            .max_connections(5)
-            .connect_with(connection_options)
-            .await?;
-
+        let pool = get_pool_from_connection_string(database).await?;
         let (sorted_table_list, constraints_config) = configure_db(
             &mut tables_config,
             &mut datatypes_config,
@@ -198,7 +181,7 @@ impl Valve {
     /// Set the user name for this instance.
     /// The username must be a short string without newlines.
     /// Return an error on invalid username.
-    pub fn set_user(mut self, user: &str) -> Result<Self, sqlx::Error> {
+    pub fn set_user(&mut self, user: &str) -> Result<&mut Self, sqlx::Error> {
         // ConfigError
         self.user = user.to_string();
         Ok(self)
@@ -208,24 +191,48 @@ impl Valve {
     /// create a database connection for VALVE to use.
     /// Drop and replace any current database connection.
     /// Return an error if the connection cannot be created.
-    pub fn connect(mut self, connection: &str) -> Result<Self, sqlx::Error> {
+    pub async fn connect(&mut self, connection: &str) -> Result<&mut Self, sqlx::Error> {
         // DatabaseError
-        todo!();
+        self.pool = Some(get_pool_from_connection_string(connection).await?);
         Ok(self)
     }

     /// Create all configured database tables and views
     /// if they do not already exist as configured.
     /// Return an error on database problems.
-    pub fn create_all_tables(mut self) -> Result<Self, sqlx::Error> {
+    pub async fn create_all_tables(&mut self, verbose: bool) -> Result<&mut Self, sqlx::Error> {
         // DatabaseError
-        todo!();
+        let mut tables_config = self
+            .global_config
+            .get_mut("table")
+            .and_then(|t| t.as_object_mut())
+            .unwrap();
+        let mut tables_config = tables_config.clone();
+        let mut datatypes_config = self
+            .global_config
+            .get_mut("datatype")
+            .and_then(|d| d.as_object_mut())
+            .unwrap();
+        let mut datatypes_config = datatypes_config.clone();
+        let pool = self.pool.as_ref().unwrap();
+        let parser = StartParser::new();
+
+        // TODO: Revisit this once the configure_db() function has been refactored:
+        let (_, _) = configure_db(
+            &mut tables_config,
+            &mut datatypes_config,
+            &pool,
+            &parser,
+            verbose,
+            &ValveCommand::Create,
+        )
+        .await?;
         Ok(self)
     }

     /// Drop all configured tables, in reverse dependency order.
     /// Return an error on database problem.
-    pub fn drop_all_tables(self) -> Result<Self, sqlx::Error> {
+    pub fn drop_all_tables(&self) -> Result<&Self, sqlx::Error> {
         // DatabaseError
         todo!();
         Ok(self)
     }

     /// Given a vector of table names,
     /// drop those tables, in the given order.
     /// Return an error on invalid table name or database problem.
-    pub fn drop_tables(self, tables: Vec<&str>) -> Result<Self, sqlx::Error> {
+    pub fn drop_tables(&self, tables: Vec<&str>) -> Result<&Self, sqlx::Error> {
         // DatabaseError
         todo!();
         Ok(self)
     }

     /// Truncate all configured tables, in reverse dependency order.
     /// Return an error on database problem.
-    pub fn truncate_all_tables(self) -> Result<Self, sqlx::Error> {
+    pub fn truncate_all_tables(&self) -> Result<&Self, sqlx::Error> {
         // DatabaseError
         todo!();
         Ok(self)
     }

     /// Given a vector of table names,
     /// truncate those tables, in the given order.
     /// Return an error on invalid table name or database problem.
-    pub fn truncate_tables(self, tables: Vec<&str>) -> Result<Self, sqlx::Error> {
+    pub fn truncate_tables(&self, tables: Vec<&str>) -> Result<&Self, sqlx::Error> {
         // ConfigOrDatabaseError
         //self.create_all_tables();
         todo!();
         Ok(self)
     }

     /// Load all configured tables in dependency order.
     /// If `validate` is false, just try to insert all rows.
     /// Return an error on database problem,
     /// including database conflicts that prevent rows being inserted.
-    pub fn load_all_tables(self, validate: bool) -> Result<Self, sqlx::Error> {
+    pub fn load_all_tables(&self, validate: bool) -> Result<&Self, sqlx::Error> {
         // DatabaseError
         //self.create_all_tables();
         //self.truncate_all_tables();
         todo!();
         Ok(self)
     }

     /// Given a vector of table names,
     /// load those tables in the given order.
     /// If `validate` is false, just try to insert all rows.
     /// Return an error on invalid table name or database problem.
-    pub fn load_tables(self, tables: Vec<&str>, validate: bool) -> Result<Self, sqlx::Error> {
+    pub fn load_tables(&self, tables: Vec<&str>, validate: bool) -> Result<&Self, sqlx::Error> {
         // ConfigOrDatabaseError
         //self.create_all_tables();
         //self.truncate_tables(tables);
         todo!();
         Ok(self)
     }

     /// Save all configured tables to their 'path's.
     /// Return an error on writing or database problem.
-    pub fn save_all_tables(self) -> Result<Self, sqlx::Error> {
+    pub fn save_all_tables(&self) -> Result<&Self, sqlx::Error> {
         // WriteOrDatabaseError
         todo!();
         Ok(self)
     }

     /// Given a vector of table names,
     /// save those tables to their 'path's, in the given order.
     /// Return an error on writing or database problem.
-    pub fn save_tables(self, tables: Vec<&str>) -> Result<Self, sqlx::Error> {
+    pub fn save_tables(&self, tables: Vec<&str>) -> Result<&Self, sqlx::Error> {
         // WriteOrDatabaseError
         todo!();
         Ok(self)
     }

     /// Given a table name and a row as JSON,
     /// return the validated row.
     /// Return an error on database problem.
-    pub fn validate_row(self, table_name: &str, row: &ValveRow) -> Result<ValveRow, sqlx::Error> {
+    pub fn validate_row(&self, table_name: &str, row: &ValveRow) -> Result<ValveRow, sqlx::Error> {
         // DatabaseError
         todo!();
     }

     /// Given a table name and a row as JSON,
     /// add the row to the table in the database,
     /// and return the validated row, including its new row_number.
     /// Return an error on invalid table name or database problem.
-    pub fn insert_row(self, table_name: &str, row: &ValveRow) -> Result<ValveRow, sqlx::Error> {
+    pub fn insert_row(&self, table_name: &str, row: &ValveRow) -> Result<ValveRow, sqlx::Error> {
         // ConfigOrDatabaseError
         todo!();
     }

     /// Given a table name, a row number, and a row as JSON,
     /// update the row in the database,
     /// and return the validated row.
     /// Return an error on invalid table name or row number or database problem.
     pub fn update_row(
-        self,
+        &self,
         table_name: &str,
         row_number: usize,
         row: &ValveRow,
     ) -> Result<ValveRow, sqlx::Error> {
         // ConfigOrDatabaseError
         todo!();
     }

     /// Given a table name and a row number,
     /// delete that row from the table.
     /// Return an error on invalid table name or row number or database problem.
-    pub fn delete_row(self, table_name: &str, row_number: usize) -> Result<(), sqlx::Error> {
+    pub fn delete_row(&self, table_name: &str, row_number: usize) -> Result<(), sqlx::Error> {
         // ConfigOrDatabaseError
         todo!();
     }

     /// Return the next change to undo, or None.
     /// Return an error on database problem.
-    pub fn get_record_to_undo(self) -> Result<Option<ValveRow>, sqlx::Error> {
+    pub fn get_record_to_undo(&self) -> Result<Option<ValveRow>, sqlx::Error> {
         // DatabaseError
         todo!();
     }

     /// Return the next change to redo, or None.
     /// Return an error on database problem.
-    pub fn get_record_to_redo(self) -> Result<Option<ValveRow>, sqlx::Error> {
+    pub fn get_record_to_redo(&self) -> Result<Option<ValveRow>, sqlx::Error> {
         // DatabaseError
         todo!();
     }

     /// Undo one change and return the change record
     /// or None if there was no change to undo.
     /// Return an error on database problem.
-    pub fn undo(self) -> Result<Option<ValveRow>, sqlx::Error> {
+    pub fn undo(&self) -> Result<Option<ValveRow>, sqlx::Error> {
         // DatabaseError
         todo!();
     }

     /// Redo one change and return the change record
     /// or None if there was no change to redo.
     /// Return an error on database problem.
-    pub fn redo(self) -> Result<Option<ValveRow>, sqlx::Error> {
+    pub fn redo(&self) -> Result<Option<ValveRow>, sqlx::Error> {
         // DatabaseError
         todo!();
     }
@@ -432,6 +439,29 @@ impl std::fmt::Debug for ColumnRule {
     }
 }

+/// TODO: Add docstring here. Note that once we have refactored configure_db() (see above) it may
+/// make more sense for this function to be an inner function of Valve.
+pub async fn get_pool_from_connection_string(database: &str) -> Result<AnyPool, sqlx::Error> {
+    let connection_options;
+    if database.starts_with("postgresql://") {
+        connection_options = AnyConnectOptions::from_str(database)?;
+    } else {
+        let connection_string;
+        if !database.starts_with("sqlite://") {
+            connection_string = format!("sqlite://{}?mode=rwc", database);
+        } else {
+            connection_string = database.to_string();
+        }
+        connection_options = AnyConnectOptions::from_str(connection_string.as_str()).unwrap();
+    }
+
+    let pool = AnyPoolOptions::new()
+        .max_connections(5)
+        .connect_with(connection_options)
+        .await?;
+    Ok(pool)
+}
+
 /// Given the path to a configuration table (either a table.tsv file or a database containing a
 /// table named "table"), load and check the 'table', 'column', and 'datatype' tables, and return
 /// SerdeMaps corresponding to specials, tables, datatypes, and rules.

From c6f96fc4598e54ae86ebeeae45b89cac65f6b380 Mon Sep 17 00:00:00 2001
From: Michael Cuffaro
Date: Mon, 27 Nov 2023 08:25:36 -0500
Subject: [PATCH 37/48] rename create_all_tables to create_missing_tables

---
 src/lib.rs | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/src/lib.rs b/src/lib.rs
index fd14adf4..73721b07 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -200,7 +200,7 @@ impl Valve {
     /// Create all configured database tables and views
     /// if they do not already exist as configured.
     /// Return an error on database problems.
-    pub async fn create_all_tables(&mut self, verbose: bool) -> Result<&mut Self, sqlx::Error> {
+    pub async fn create_missing_tables(&mut self, verbose: bool) -> Result<&mut Self, sqlx::Error> {
         // DatabaseError
         let mut tables_config = self
@@ -260,7 +260,7 @@ impl Valve {
     /// Return an error on invalid table name or database problem.
     pub fn truncate_tables(&self, tables: Vec<&str>) -> Result<&Self, sqlx::Error> {
         // ConfigOrDatabaseError
-        //self.create_all_tables();
+        //self.create_missing_tables();
         todo!();
         Ok(self)
     }
@@ -270,8 +270,10 @@ impl Valve {
     /// Return an error on database problem,
     /// including database conflicts that prevent rows being inserted.
     pub fn load_all_tables(&self, validate: bool) -> Result<&Self, sqlx::Error> {
+        // YOU ARE HERE.
+
         // DatabaseError
-        //self.create_all_tables();
+        //self.create_missing_tables();
         //self.truncate_all_tables();
         todo!();
         Ok(self)
@@ -283,7 +285,7 @@ impl Valve {
     /// Return an error on invalid table name or database problem.
     pub fn load_tables(&self, tables: Vec<&str>, validate: bool) -> Result<&Self, sqlx::Error> {
         // ConfigOrDatabaseError
-        //self.create_all_tables();
+        //self.create_missing_tables();
         //self.truncate_tables(tables);
         todo!();
         Ok(self)

From 83e741ed51030529f98d3885874749e260b25099 Mon Sep 17 00:00:00 2001
From: Michael Cuffaro
Date: Mon, 27 Nov 2023 09:07:17 -0500
Subject: [PATCH 38/48] implement (rough) load_all_tables()

---
 src/lib.rs  | 63 ++++++++++++++++++++++++++++++++++++++++++++++------
 src/main.rs | 23 +++++++++++--------
 2 files changed, 69 insertions(+), 17 deletions(-)

diff --git a/src/lib.rs b/src/lib.rs
index 73721b07..4eefe7e9 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -105,9 +105,8 @@ impl Valve {
         // configuration. We will do that in read_config_files() instead.
         // Once this is implemented, the code below to construct the AnyPool which is used to
         // call configure_db() should be removed.
-        // We will also remove the `database`, `initial_load` and `verbose` parameters.
+        // We will also remove the `database` and `verbose` parameters.
         database: &str,
-        initial_load: bool,
         verbose: bool,
     ) -> Result<Self, sqlx::Error> {
         // TODO: Error type should be ConfigError
@@ -200,7 +201,10 @@ impl Valve {
     pub async fn create_missing_tables(&mut self, verbose: bool) -> Result<&mut Self, sqlx::Error> {
         // DatabaseError
+
+        // TODO: Revisit the implementation of this once the configure_db() function has been
+        // refactored. Currently it implicitly drops and recreates _all_ tables but eventually this
+        // function needs to do this only for _missing_ tables.
         let mut tables_config = self
@@ -217,7 +221,6 @@ impl Valve {
         let pool = self.pool.as_ref().unwrap();
         let parser = StartParser::new();

-        // TODO: Revisit this once the configure_db() function has been refactored:
         let (_, _) = configure_db(
             &mut tables_config,
             &mut datatypes_config,
@@ -269,13 +271,58 @@ impl Valve {
     /// If `validate` is false, just try to insert all rows.
     /// Return an error on database problem,
     /// including database conflicts that prevent rows being inserted.
-    pub fn load_all_tables(&self, validate: bool) -> Result<&Self, sqlx::Error> {
-        // YOU ARE HERE.
-
+    pub async fn load_all_tables(
+        &mut self,
+        validate: bool,
+        verbose: bool,
+        initial_load: bool,
+    ) -> Result<&mut Self, sqlx::Error> {
         // DatabaseError
-        //self.create_missing_tables();
+
+        self.create_missing_tables(verbose);
         //self.truncate_all_tables();
-        todo!();
+        if let Some(pool) = &self.pool {
+            if pool.any_kind() == AnyKind::Sqlite {
+                sqlx_query("PRAGMA foreign_keys = ON").execute(pool).await?;
+                if initial_load {
+                    // These pragmas are unsafe but they are used during initial loading since data
+                    // integrity is not a priority in this case.
+                    sqlx_query("PRAGMA journal_mode = OFF")
+                        .execute(pool)
+                        .await?;
+                    sqlx_query("PRAGMA synchronous = 0").execute(pool).await?;
+                    sqlx_query("PRAGMA cache_size = 1000000")
+                        .execute(pool)
+                        .await?;
+                    sqlx_query("PRAGMA temp_store = MEMORY")
+                        .execute(pool)
+                        .await?;
+                }
+            }
+
+            if verbose {
+                eprintln!(
+                    "{} - Processing {} tables.",
+                    Utc::now(),
+                    self.global_config
+                        .get("sorted_table_list")
+                        .and_then(|l| l.as_array())
+                        .unwrap()
+                        .len()
+                );
+            }
+            load_db(
+                &self.global_config,
+                &pool,
+                &self.compiled_datatype_conditions,
+                &self.compiled_rule_conditions,
+                verbose,
+            )
+            .await?;
+        } else {
+            eprintln!("WARN: Attempt to load tables but Valve is not connected to a database.");
+        }
+
         Ok(self)
     }

diff --git a/src/main.rs b/src/main.rs
index 7e61aba4..4c919167 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -7,6 +7,7 @@ use argparse::{ArgumentParser, Store, StoreTrue};
 use ontodev_valve::{
     get_compiled_datatype_conditions, get_compiled_rule_conditions,
     get_parsed_structure_conditions, valve, valve_grammar::StartParser, ValveCommand,
+    Valve
 };
 use serde_json::{from_str, Value as SerdeValue};
 use std::{env, process};
@@ -156,15 +157,19 @@ async fn main() -> Result<(), sqlx::Error> {
         )
         .await?;
     } else {
-        valve(
-            &source,
-            &destination,
-            &ValveCommand::Load,
-            verbose,
-            initial_load,
-            &config_table,
-        )
-        .await?;
+        let mut valve = Valve::build(&source, &config_table, &destination, verbose).await?;
+        valve.connect(&destination).await?;
+        valve.load_all_tables(true, verbose, initial_load).await?;
+        // valve(
+        //     &source,
+        //     &destination,
+        //     &ValveCommand::Load,
+        //     verbose,
+        //     initial_load,
+        //     &config_table,
+        // )
+        // .await?;
     }

     Ok(())

From 8dceabf780e8f1943f20baa00b13dde97eb1ba54 Mon Sep 17 00:00:00 2001
From: Michael Cuffaro
Date: Mon, 27 Nov 2023 09:14:36 -0500
Subject: [PATCH 39/48] fix small bug in call to create_missing_tables()

---
 src/lib.rs  | 2 +-
 src/main.rs | 1 -
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/lib.rs b/src/lib.rs
index 4eefe7e9..5bc104bf 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -279,7 +279,7 @@ impl Valve {
     ) -> Result<&mut Self, sqlx::Error> {
         // DatabaseError

-        self.create_missing_tables(verbose);
+        self.create_missing_tables(verbose).await?;
         //self.truncate_all_tables();
         if let Some(pool) = &self.pool {
             if pool.any_kind() == AnyKind::Sqlite {
diff --git a/src/main.rs b/src/main.rs
index 4c919167..486cb522 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -159,7 +159,6 @@ async fn main() -> Result<(), sqlx::Error> {
     } else {
         let mut valve = Valve::build(&source, &config_table, &destination, verbose).await?;
         valve.connect(&destination).await?;
-        valve.create_missing_tables(verbose).await?;
         valve.load_all_tables(true, verbose, initial_load).await?;
         // valve(
         //     &source,

From f4495be667609d30bee4039a07e9f9f201f7cb56 Mon Sep 17 00:00:00 2001
From: Michael Cuffaro
Date: Mon, 27 Nov 2023 11:49:34 -0500
Subject: [PATCH 40/48] do not store label if it is the same as the normalized
 column name, and replace consecutive illegal characters with a single
 underscore (and remove leading and trailing underscores).

---
 scripts/guess.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/scripts/guess.py b/scripts/guess.py
index 61e4ea14..0f9ab864 100755
--- a/scripts/guess.py
+++ b/scripts/guess.py
@@ -55,7 +55,7 @@ def get_random_sample(table, sample_size):
     for i in sample_row_numbers:
         for label, value in rows[i].items():
             if label not in sample:
-                ncolumn = re.sub(pattern, "", label).casefold()
+                ncolumn = re.sub(pattern, "_", label).casefold().strip("_")
                 if has_ncolumn(sample, ncolumn):
                     print(
                         "The data has more than one column with the normalized name "
@@ -465,7 +465,7 @@ def get_from(target, potential_foreign_columns):
             row = [
                 f"{table}",
                 f"{sample[label]['normalized']}",
-                f"{label}",
+                f"{label if label != sample[label]['normalized'] else ''}",
                 f"{sample[label].get('nulltype', '')}",
                 f"{sample[label]['datatype']}",
                 f"{sample[label].get('structure', '')}",
@@ -504,7 +504,7 @@ def get_from(target, potential_foreign_columns):
                 f"{row_number}",
                 f"'{table}'",
                 f"'{sample[label]['normalized']}'",
-                f"'{label}'",
+                f"'{label}'" if label != sample[label]["normalized"] else "NULL",
                 f"'{sample[label]['nulltype']}'" if sample[label].get("nulltype") else "NULL",
                 f"'{sample[label]['datatype']}'",
                 f"'{sample[label]['structure']}'" if sample[label].get("structure") else "NULL",
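For orientation, here is a minimal end-to-end sketch of the struct-based API as it stands at this point in the series, mirroring the call sequence that src/main.rs uses after patch 38. The file paths and flag values below are placeholders, and the async runtime wrapping this function is assumed to be whatever the binary already uses:

    // Sketch only: configure, connect, and load, following src/main.rs above.
    use ontodev_valve::Valve;

    async fn run() -> Result<(), sqlx::Error> {
        // Read the table table ("src/table.tsv" is a placeholder path) and
        // assemble the global config; the second argument names the table table:
        let mut valve = Valve::build("src/table.tsv", "table", "build/valve.db", false).await?;
        // Open the database connection that subsequent calls will use:
        valve.connect("build/valve.db").await?;
        // Create any missing tables, then validate and load every configured
        // table (validate = true, verbose = false, initial_load = false):
        valve.load_all_tables(true, false, false).await?;
        Ok(())
    }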
csv::ReaderBuilder::new() + .has_headers(false) + .delimiter(b'\t') + .from_reader(table_file), + } + }; if verbose { eprintln!( "{} - Loading table {}/{}: {}", @@ -4413,21 +4434,6 @@ async fn load_db( ); } table_num += 1; - let path = String::from( - config - .get("table") - .and_then(|t| t.as_object()) - .and_then(|o| o.get(&table_name)) - .and_then(|n| n.get("path")) - .and_then(|p| p.as_str()) - .unwrap(), - ); - let mut rdr = csv::ReaderBuilder::new() - .has_headers(false) - .delimiter(b'\t') - .from_reader(File::open(path.clone()).unwrap_or_else(|err| { - panic!("Unable to open '{}': {}", path.clone(), err); - })); // Extract the headers, which we will need later: let mut records = rdr.records(); From 81800669a70d2b416e09904163d547dc00f0e70c Mon Sep 17 00:00:00 2001 From: Michael Cuffaro Date: Sun, 26 Nov 2023 14:48:15 -0500 Subject: [PATCH 42/48] add ValveRow alias --- src/lib.rs | 43 ++++++++++++++++++++++--------------------- src/validate.rs | 22 +++++++++++----------- 2 files changed, 33 insertions(+), 32 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 638770a0..ba4b2167 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -77,10 +77,11 @@ lazy_static! { static ref SL_SQL_TYPES: Vec<&'static str> = vec!["text", "numeric", "integer", "real"]; } -/// An alias for [serde_json::Map](..//serde_json/struct.Map.html). +/// Aliases for [serde_json::Map](..//serde_json/struct.Map.html). // Note: serde_json::Map is // [backed by a BTreeMap by default](https://docs.serde.rs/serde_json/map/index.html) pub type SerdeMap = serde_json::Map; +pub type ValveRow = serde_json::Map; /// Represents a structure such as those found in the `structure` column of the `column` table in /// both its parsed format (i.e., as an [Expression](ast/enum.Expression.html)) as well as in its @@ -1444,7 +1445,7 @@ pub async fn get_affected_rows( global_config: &SerdeMap, pool: &AnyPool, tx: &mut Transaction<'_, sqlx::Any>, -) -> Result, String> { +) -> Result, String> { // Since the consequence of an update could involve currently invalid rows // (in the conflict table) becoming valid or vice versa, we need to check rows for // which the value of the column is the same as `value` @@ -1470,7 +1471,7 @@ pub async fn get_affected_rows( .await .map_err(|e| e.to_string())? 
{ - let mut table_row = SerdeMap::new(); + let mut table_row = ValveRow::new(); let mut row_number: Option = None; for column in row.columns() { let cname = column.name(); @@ -1508,7 +1509,7 @@ pub async fn get_row_from_db( tx: &mut Transaction<'_, sqlx::Any>, table: &str, row_number: &u32, -) -> Result { +) -> Result { let sql = format!( "{} WHERE row_number = {}", query_with_message_values(table, global_config, pool), @@ -1541,7 +1542,7 @@ pub async fn get_row_from_db( } }; - let mut row = SerdeMap::new(); + let mut row = ValveRow::new(); for column in sql_row.columns() { let cname = column.name(); if !vec!["row_number", "message"].contains(&cname) { @@ -1649,7 +1650,7 @@ pub async fn get_rows_to_update( ), String, > { - fn get_cell_value(row: &SerdeMap, column: &str) -> Result { + fn get_cell_value(row: &ValveRow, column: &str) -> Result { match row.get(column).and_then(|cell| cell.get("value")) { Some(SerdeValue::String(s)) => Ok(format!("{}", s)), Some(SerdeValue::Number(n)) => Ok(format!("{}", n)), @@ -1900,8 +1901,8 @@ pub async fn record_row_change( tx: &mut Transaction<'_, sqlx::Any>, table: &str, row_number: &u32, - from: Option<&SerdeMap>, - to: Option<&SerdeMap>, + from: Option<&ValveRow>, + to: Option<&ValveRow>, user: &str, ) -> Result<(), sqlx::Error> { if let (None, None) = (from, to) { @@ -1910,8 +1911,8 @@ pub async fn record_row_change( )); } - fn to_text(smap: Option<&SerdeMap>, quoted: bool) -> String { - match smap { + fn to_text(row: Option<&ValveRow>, quoted: bool) -> String { + match row { None => "NULL".to_string(), Some(r) => { let inner = format!("{}", json!(r)).replace("'", "''"); @@ -1932,7 +1933,7 @@ pub async fn record_row_change( } } - fn summarize(from: Option<&SerdeMap>, to: Option<&SerdeMap>) -> Result { + fn summarize(from: Option<&ValveRow>, to: Option<&ValveRow>) -> Result { // Constructs a summary of the form: // { // "column":"bar", @@ -2420,7 +2421,7 @@ pub async fn insert_new_row( compiled_rule_conditions: &HashMap>>, pool: &AnyPool, table: &str, - row: &SerdeMap, + row: &ValveRow, new_row_number: Option, user: &str, ) -> Result { @@ -2469,7 +2470,7 @@ pub async fn insert_new_row_tx( pool: &AnyPool, tx: &mut Transaction, table: &str, - row: &SerdeMap, + row: &ValveRow, new_row_number: Option, skip_validation: bool, ) -> Result { @@ -2789,7 +2790,7 @@ pub async fn update_row( compiled_rule_conditions: &HashMap>>, pool: &AnyPool, table_name: &str, - row: &SerdeMap, + row: &ValveRow, row_number: &u32, user: &str, ) -> Result<(), sqlx::Error> { @@ -2854,7 +2855,7 @@ pub async fn update_row_tx( pool: &AnyPool, tx: &mut Transaction, table: &str, - row: &SerdeMap, + row: &ValveRow, row_number: &u32, skip_validation: bool, do_not_recurse: bool, @@ -2967,10 +2968,10 @@ pub async fn update_row_tx( Ok(()) } -/// Given a path, read a TSV file and return a vector of rows represented as SerdeMaps. +/// Given a path, read a TSV file and return a vector of rows represented as ValveRows. /// Note: Use this function to read "small" TSVs only. In particular, use this for the special /// configuration tables. 
-fn read_tsv_into_vector(path: &str) -> Vec { +fn read_tsv_into_vector(path: &str) -> Vec { let mut rdr = csv::ReaderBuilder::new() .delimiter(b'\t') @@ -2981,7 +2982,7 @@ fn read_tsv_into_vector(path: &str) -> Vec { let rows: Vec<_> = rdr .deserialize() .map(|result| { - let row: SerdeMap = result.expect(format!("Error reading: {}", path).as_str()); + let row: ValveRow = result.expect(format!("Error reading: {}", path).as_str()); row }) .collect(); @@ -3010,8 +3011,8 @@ fn read_tsv_into_vector(path: &str) -> Vec { } /// Given a database at the specified location, query the "table" table and return a vector of rows -/// represented as SerdeMaps. -fn read_db_table_into_vector(database: &str, config_table: &str) -> Vec { +/// represented as ValveRows. +fn read_db_table_into_vector(database: &str, config_table: &str) -> Vec { let connection_options; if database.starts_with("postgresql://") { connection_options = AnyConnectOptions::from_str(database).unwrap(); @@ -3036,7 +3037,7 @@ fn read_db_table_into_vector(database: &str, config_table: &str) -> Vec, + pub row: Option, } /// Given a config map, maps of compiled datatype and rule conditions, a database connection @@ -62,10 +62,10 @@ pub async fn validate_row( pool: &AnyPool, tx: Option<&mut Transaction<'_, sqlx::Any>>, table_name: &str, - row: &SerdeMap, + row: &ValveRow, row_number: Option, query_as_if: Option<&QueryAsIf>, -) -> Result { +) -> Result { // Fallback to a default transaction if it is not given. Since we do not commit before it falls // out of scope the transaction will be rolled back at the end of this function. And since this // function is read-only the rollback is trivial and therefore inconsequential. @@ -944,10 +944,10 @@ pub fn validate_rows_intra( result_rows } -/// Given a row represented as a SerdeMap, remove any duplicate messages from the row's cells, so +/// Given a row represented as a ValveRow, remove any duplicate messages from the row's cells, so /// that no cell has messages with the same level, rule, and message text. -fn remove_duplicate_messages(row: &SerdeMap) -> Result { - let mut deduped_row = SerdeMap::new(); +fn remove_duplicate_messages(row: &ValveRow) -> Result { + let mut deduped_row = ValveRow::new(); for (column_name, cell) in row.iter() { let mut messages = cell .get("messages") @@ -981,12 +981,12 @@ fn remove_duplicate_messages(row: &SerdeMap) -> Result { Ok(deduped_row) } -/// Given a result row, convert it to a SerdeMap and return it. +/// Given a result row, convert it to a ValveRow and return it. /// Note that if the incoming result row has an associated row_number, this is ignored. -fn result_row_to_config_map(incoming: &ResultRow) -> SerdeMap { - let mut outgoing = SerdeMap::new(); +fn result_row_to_config_map(incoming: &ResultRow) -> ValveRow { + let mut outgoing = ValveRow::new(); for (column, cell) in incoming.contents.iter() { - let mut cell_map = SerdeMap::new(); + let mut cell_map = ValveRow::new(); if let Some(nulltype) = &cell.nulltype { cell_map.insert( "nulltype".to_string(), From 529dd28ebc03c3958dbb40291600052c0742c535 Mon Sep 17 00:00:00 2001 From: Michael Cuffaro Date: Sun, 26 Nov 2023 15:15:25 -0500 Subject: [PATCH 43/48] add stubs for new API --- src/lib.rs | 192 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 192 insertions(+) diff --git a/src/lib.rs b/src/lib.rs index ba4b2167..1d056e99 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -83,6 +83,198 @@ lazy_static! 
{ pub type SerdeMap = serde_json::Map; pub type ValveRow = serde_json::Map; +pub struct Valve { + global_config: SerdeMap, + compiled_datatype_conditions: HashMap, + compiled_rule_conditions: HashMap>>, + pool: AnyPool, + user: String, +} + +impl Valve { + /// Given a path to a table table, + /// read it, configure VALVE, and return a new Valve struct. + /// Return an error if reading or configuration fails. + pub fn build(mut self, table_path: &str) -> Result { + // Should be ConfigError + todo!(); + Ok(self) + } + + /// Set the user name for this instance. + /// The username must be a short string without newlines. + /// Return an error on invalid username. + pub fn set_user(mut self, user: &str) -> Result { + // ConfigError + todo!(); + Ok(self) + } + + /// Given a database connection string, + /// create a database connection for VALVE to use. + /// Drop and replace any current database connection. + /// Return an error if the connection cannot be created. + pub fn connect(mut self, connection: &str) -> Result { + // DatabaseError + todo!(); + Ok(self) + } + + /// Create all configured database tables and views + /// if they do not already exist as configured. + /// Return an error on database problems. + pub fn create_all_tables(mut self) -> Result { + // DatabaseError + todo!(); + Ok(self) + } + + /// Drop all configured tables, in reverse dependency order. + /// Return an error on database problem. + pub fn drop_all_tables(self) -> Result { + // DatabaseError + todo!(); + Ok(self) + } + + /// Given a vector of table names, + /// drop those tables, in the given order. + /// Return an error on invalid table name or database problem. + pub fn drop_tables(self, tables: Vec<&str>) -> Result { + // DatabaseError + todo!(); + Ok(self) + } + + /// Truncate all configured tables, in reverse dependency order. + /// Return an error on database problem. + pub fn truncate_all_tables(self) -> Result { + // DatabaseError + todo!(); + Ok(self) + } + + /// Given a vector of table names, + /// truncate those tables, in the given order. + /// Return an error on invalid table name or database problem. + pub fn truncate_tables(self, tables: Vec<&str>) -> Result { + // ConfigOrDatabaseError + //self.create_all_tables(); + todo!(); + Ok(self) + } + + /// Load all configured tables in dependency order. + /// If `validate` is false, just try to insert all rows. + /// Return an error on database problem, + /// including database conflicts that prevent rows being inserted. + pub fn load_all_tables(self, validate: bool) -> Result { + // DatabaseError + //self.create_all_tables(); + //self.truncate_all_tables(); + todo!(); + Ok(self) + } + + /// Given a vector of table names, + /// load those tables in the given order. + /// If `validate` is false, just try to insert all rows. + /// Return an error on invalid table name or database problem. + pub fn load_tables(self, tables: Vec<&str>, validate: bool) -> Result { + // ConfigOrDatabaseError + //self.create_all_tables(); + //self.truncate_tables(tables); + todo!(); + Ok(self) + } + + /// Save all configured tables to their 'path's. + /// Return an error on writing or database problem. + pub fn save_all_tables(self) -> Result { + // WriteOrDatabaseError + todo!(); + Ok(self) + } + + /// Given a vector of table names, + /// Save thosee tables to their 'path's, in the given order. + /// Return an error on writing or database problem. 
+    pub fn save_tables(self, tables: Vec<&str>) -> Result<Self, sqlx::Error> {
+        // WriteOrDatabaseError
+        todo!();
+        Ok(self)
+    }
+
+    /// Given a table name and a row as JSON,
+    /// return the validated row.
+    /// Return an error on database problem.
+    pub fn validate_row(self, table_name: &str, row: &ValveRow) -> Result<ValveRow, sqlx::Error> {
+        // DatabaseError
+        todo!();
+    }
+
+    /// Given a table name and a row as JSON,
+    /// add the row to the table in the database,
+    /// and return the validated row, including its new row_number.
+    /// Return an error on invalid table name or database problem.
+    pub fn insert_row(self, table_name: &str, row: &ValveRow) -> Result<ValveRow, sqlx::Error> {
+        // ConfigOrDatabaseError
+        todo!();
+    }
+
+    /// Given a table name, a row number, and a row as JSON,
+    /// update the row in the database,
+    /// and return the validated row.
+    /// Return an error on invalid table name or row number or database problem.
+    pub fn update_row(
+        self,
+        table_name: &str,
+        row_number: usize,
+        row: &ValveRow,
+    ) -> Result<ValveRow, sqlx::Error> {
+        // ConfigOrDatabaseError
+        todo!();
+    }
+
+    /// Given a table name and a row number,
+    /// delete that row from the table.
+    /// Return an error on invalid table name or row number or database problem.
+    pub fn delete_row(self, table_name: &str, row_number: usize) -> Result<(), sqlx::Error> {
+        // ConfigOrDatabaseError
+        todo!();
+    }
+
+    /// Return the next change to undo, or None.
+    /// Return an error on database problem.
+    pub fn get_record_to_undo(self) -> Result<Option<SerdeValue>, sqlx::Error> {
+        // DatabaseError
+        todo!();
+    }
+
+    /// Return the next change to redo, or None.
+    /// Return an error on database problem.
+    pub fn get_record_to_redo(self) -> Result<Option<SerdeValue>, sqlx::Error> {
+        // DatabaseError
+        todo!();
+    }
+
+    /// Undo one change and return the change record
+    /// or None if there was no change to undo.
+    /// Return an error on database problem.
+    pub fn undo(self) -> Result<Option<SerdeValue>, sqlx::Error> {
+        // DatabaseError
+        todo!();
+    }
+
+    /// Redo one change and return the change record
+    /// or None if there was no change to redo.
+    /// Return an error on database problem.
+    pub fn redo(self) -> Result<Option<SerdeValue>, sqlx::Error> {
+        // DatabaseError
+        todo!();
+    }
+}
+
 /// Represents a structure such as those found in the `structure` column of the `column` table in
 /// both its parsed format (i.e., as an [Expression](ast/enum.Expression.html)) as well as in its
 /// original format (i.e., as a plain String).
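The stubs above adopt a consuming-builder style: each method takes self by value and hands it back wrapped in a Result, so a configuration sequence chains with the ? operator. A toy illustration of the pattern only; Toy stands in for Valve, and none of the real Valve logic is involved:

    struct Toy {
        user: String,
    }

    impl Toy {
        // Mirrors the stubbed set_user(): consume self, return it on success.
        fn set_user(mut self, user: &str) -> Result<Self, String> {
            if user.contains('\n') {
                return Err("username must not contain newlines".to_string());
            }
            self.user = user.to_string();
            Ok(self)
        }

        // Mirrors the stubbed connect(): the real API would build a pool here.
        fn connect(self, _connection: &str) -> Result<Self, String> {
            Ok(self)
        }
    }

    fn main() -> Result<(), String> {
        let toy = Toy { user: "Valve".to_string() }
            .set_user("editor")?
            .connect("sqlite://valve.db?mode=rwc")?;
        println!("user = {}", toy.user);
        Ok(())
    }

Patch 45 below switches most of these signatures to &self / &mut self, which gives up this kind of one-expression chaining but avoids moving the struct on every call.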
From efeb611eac7fc4e0a93fb20c56981713ae1a2595 Mon Sep 17 00:00:00 2001
From: Michael Cuffaro
Date: Sun, 26 Nov 2023 19:27:17 -0500
Subject: [PATCH 44/48] implement Valve::build()

---
 src/lib.rs | 116 ++++++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 105 insertions(+), 11 deletions(-)

diff --git a/src/lib.rs b/src/lib.rs
index 1d056e99..8407b621 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -83,22 +83,116 @@ lazy_static! {
 pub type SerdeMap = serde_json::Map<String, SerdeValue>;
 pub type ValveRow = serde_json::Map<String, SerdeValue>;
 
+#[derive(Debug)]
 pub struct Valve {
-    global_config: SerdeMap,
-    compiled_datatype_conditions: HashMap<String, CompiledCondition>,
-    compiled_rule_conditions: HashMap<String, HashMap<String, Vec<ColumnRule>>>,
-    pool: AnyPool,
-    user: String,
+    pub global_config: SerdeMap,
+    pub compiled_datatype_conditions: HashMap<String, CompiledCondition>,
+    pub compiled_rule_conditions: HashMap<String, HashMap<String, Vec<ColumnRule>>>,
+    pub pool: Option<AnyPool>,
+    pub user: String,
 }
 
+// TODO NEXT: Move the existing public functions into this interface:
 impl Valve {
-    /// Given a path to a table table,
-    /// read it, configure VALVE, and return a new Valve struct.
+    /// Given a path to a table table and its name, read the table table, configure VALVE
+    /// partially ... TODO: finish this.
+    /// , and return a new Valve struct.
     /// Return an error if reading or configuration fails.
-    pub fn build(mut self, table_path: &str) -> Result<Self, sqlx::Error> {
+    pub async fn build(
+        table_path: &str,
+        config_table: &str,
+        // TODO: We need to refactor configure_db() so that it no longer collects the constraints
+        // configuration. We will do that in read_config_files() instead.
+        // Once this is implemented, the code below to construct the AnyPool which is used to
+        // call configure_db() should be removed.
+        // We will also remove the `database`, `initial_load` and `verbose` parameters.
+        database: &str,
+        initial_load: bool,
+        verbose: bool,
+    ) -> Result<Self, sqlx::Error> {
         // Should be ConfigError
-        todo!();
-        Ok(self)
+
+        let parser = StartParser::new();
+
+        let (specials_config, mut tables_config, mut datatypes_config, rules_config) =
+            read_config_files(table_path, config_table);
+
+        ////////////////////////////////////////////////////////////////////////////////////////
+        // TODO: Remove this block of code later (see comment above)
+        let connection_options;
+        if database.starts_with("postgresql://") {
+            connection_options = AnyConnectOptions::from_str(database)?;
+        } else {
+            let connection_string;
+            if !database.starts_with("sqlite://") {
+                connection_string = format!("sqlite://{}?mode=rwc", database);
+            } else {
+                connection_string = database.to_string();
+            }
+            connection_options = AnyConnectOptions::from_str(connection_string.as_str()).unwrap();
+        }
+
+        let pool = AnyPoolOptions::new()
+            .max_connections(5)
+            .connect_with(connection_options)
+            .await?;
+
+        let (sorted_table_list, constraints_config) = configure_db(
+            &mut tables_config,
+            &mut datatypes_config,
+            &pool,
+            &parser,
+            verbose,
+            &ValveCommand::Config,
+        )
+        .await?;
+        ////////////////////////////////////////////////////////////////////////////////////////
+
+        let mut global_config = SerdeMap::new();
+        global_config.insert(
+            String::from("special"),
+            SerdeValue::Object(specials_config.clone()),
+        );
+        global_config.insert(
+            String::from("table"),
+            SerdeValue::Object(tables_config.clone()),
+        );
+        global_config.insert(
+            String::from("datatype"),
+            SerdeValue::Object(datatypes_config.clone()),
+        );
+        global_config.insert(
+            String::from("rule"),
+            SerdeValue::Object(rules_config.clone()),
+        );
+        global_config.insert(
+            String::from("constraints"),
+            SerdeValue::Object(constraints_config.clone()),
+        );
+        let mut sorted_table_serdevalue_list: Vec<SerdeValue> = vec![];
+        for table in &sorted_table_list {
+            sorted_table_serdevalue_list.push(SerdeValue::String(table.to_string()));
+        }
+        global_config.insert(
+            String::from("sorted_table_list"),
+            SerdeValue::Array(sorted_table_serdevalue_list),
+        );
+
+        let compiled_datatype_conditions =
+            get_compiled_datatype_conditions(&global_config, &parser);
+        let compiled_rule_conditions = get_compiled_rule_conditions(
+            &global_config,
+            compiled_datatype_conditions.clone(),
+            &parser,
+        );
+
+        Ok(Self {
+            global_config: global_config,
+            compiled_datatype_conditions: compiled_datatype_conditions,
+            compiled_rule_conditions: compiled_rule_conditions,
+            pool: None,
+            user: String::from("Valve"),
+        })
     }
 
     /// Set the user name for this instance.
@@ -106,7 +200,7 @@ impl Valve {
     /// Return an error on invalid username.
     pub fn set_user(mut self, user: &str) -> Result<Self, sqlx::Error> {
         // ConfigError
-        todo!();
+        self.user = user.to_string();
         Ok(self)
     }
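The global_config assembled by build() is an ordinary SerdeMap with five object-valued keys plus a sorted table list. For orientation, the same shape can be written declaratively with serde_json's json! macro; the leaf values below are placeholders, not the real configuration:

    use serde_json::json;

    fn main() {
        let global_config = json!({
            "special": {},
            "table": {},
            "datatype": {},
            "rule": {},
            "constraints": {},
            "sorted_table_list": ["table", "column", "datatype"],
        });
        // .as_object() yields the same serde_json::Map that the insert()
        // calls in build() construct by hand.
        assert!(global_config.as_object().unwrap().contains_key("sorted_table_list"));
    }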
From 1c4980821e7a93234b49ac7bb26843969a1d5e89 Mon Sep 17 00:00:00 2001
From: Michael Cuffaro
Date: Mon, 27 Nov 2023 08:23:25 -0500
Subject: [PATCH 45/48] refactor, fix api sigs, implement Valve::connect() and
 Valve::create_tables()

---
 src/lib.rs | 112 +++++++++++++++++++++++++++++++++--------------------
 1 file changed, 71 insertions(+), 41 deletions(-)

diff --git a/src/lib.rs b/src/lib.rs
index 8407b621..fd14adf4 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -95,7 +95,7 @@ pub struct Valve {
 // TODO NEXT: Move the existing public functions into this interface:
 impl Valve {
     /// Given a path to a table table and its name, read the table table, configure VALVE
-    /// partially ... TODO: finish this.
+    /// partially ... TODO: finish rewriting this doc string.
     /// , and return a new Valve struct.
     /// Return an error if reading or configuration fails.
     pub async fn build(
@@ -110,7 +110,7 @@ impl Valve {
         initial_load: bool,
         verbose: bool,
     ) -> Result<Self, sqlx::Error> {
-        // Should be ConfigError
+        // TODO: Error type should be ConfigError
 
         let parser = StartParser::new();
 
@@ -119,24 +119,7 @@ impl Valve {
 
         ////////////////////////////////////////////////////////////////////////////////////////
         // TODO: Remove this block of code later (see comment above)
-        let connection_options;
-        if database.starts_with("postgresql://") {
-            connection_options = AnyConnectOptions::from_str(database)?;
-        } else {
-            let connection_string;
-            if !database.starts_with("sqlite://") {
-                connection_string = format!("sqlite://{}?mode=rwc", database);
-            } else {
-                connection_string = database.to_string();
-            }
-            connection_options = AnyConnectOptions::from_str(connection_string.as_str()).unwrap();
-        }
-
-        let pool = AnyPoolOptions::new()
-            .max_connections(5)
-            .connect_with(connection_options)
-            .await?;
-
+        let pool = get_pool_from_connection_string(database).await?;
         let (sorted_table_list, constraints_config) = configure_db(
             &mut tables_config,
             &mut datatypes_config,
@@ -198,7 +181,7 @@ impl Valve {
     /// Set the user name for this instance.
     /// The username must be a short string without newlines.
     /// Return an error on invalid username.
-    pub fn set_user(mut self, user: &str) -> Result<Self, sqlx::Error> {
+    pub fn set_user(&mut self, user: &str) -> Result<&mut Self, sqlx::Error> {
         // ConfigError
         self.user = user.to_string();
         Ok(self)
@@ -208,24 +191,48 @@ impl Valve {
     /// create a database connection for VALVE to use.
     /// Drop and replace any current database connection.
     /// Return an error if the connection cannot be created.
-    pub fn connect(mut self, connection: &str) -> Result<Self, sqlx::Error> {
+    pub async fn connect(&mut self, connection: &str) -> Result<&mut Self, sqlx::Error> {
         // DatabaseError
-        todo!();
+        self.pool = Some(get_pool_from_connection_string(connection).await?);
         Ok(self)
     }
 
     /// Create all configured database tables and views
     /// if they do not already exist as configured.
     /// Return an error on database problems.
-    pub fn create_all_tables(mut self) -> Result<Self, sqlx::Error> {
+    pub async fn create_all_tables(&mut self, verbose: bool) -> Result<&mut Self, sqlx::Error> {
         // DatabaseError
-        todo!();
+        let mut tables_config = self
+            .global_config
+            .get_mut("table")
+            .and_then(|t| t.as_object_mut())
+            .unwrap();
+        let mut tables_config = tables_config.clone();
+        let mut datatypes_config = self
+            .global_config
+            .get_mut("datatype")
+            .and_then(|d| d.as_object_mut())
+            .unwrap();
+        let mut datatypes_config = datatypes_config.clone();
+        let pool = self.pool.as_ref().unwrap();
+        let parser = StartParser::new();
+
+        // TODO: Revisit this once the configure_db() function has been refactored:
+        let (_, _) = configure_db(
+            &mut tables_config,
+            &mut datatypes_config,
+            &pool,
+            &parser,
+            verbose,
+            &ValveCommand::Create,
+        )
+        .await?;
         Ok(self)
     }
 
     /// Drop all configured tables, in reverse dependency order.
     /// Return an error on database problem.
-    pub fn drop_all_tables(self) -> Result<Self, sqlx::Error> {
+    pub fn drop_all_tables(&self) -> Result<&Self, sqlx::Error> {
         // DatabaseError
         todo!();
         Ok(self)
     }
@@ -234,7 +241,7 @@ impl Valve {
     /// Given a vector of table names,
     /// drop those tables, in the given order.
     /// Return an error on invalid table name or database problem.
-    pub fn drop_tables(self, tables: Vec<&str>) -> Result<Self, sqlx::Error> {
+    pub fn drop_tables(&self, tables: Vec<&str>) -> Result<&Self, sqlx::Error> {
         // DatabaseError
         todo!();
         Ok(self)
     }
 
     /// Truncate all configured tables, in reverse dependency order.
     /// Return an error on database problem.
-    pub fn truncate_all_tables(self) -> Result<Self, sqlx::Error> {
+    pub fn truncate_all_tables(&self) -> Result<&Self, sqlx::Error> {
         // DatabaseError
         todo!();
         Ok(self)
     }
@@ -251,7 +258,7 @@ impl Valve {
     /// Given a vector of table names,
     /// truncate those tables, in the given order.
     /// Return an error on invalid table name or database problem.
-    pub fn truncate_tables(self, tables: Vec<&str>) -> Result<Self, sqlx::Error> {
+    pub fn truncate_tables(&self, tables: Vec<&str>) -> Result<&Self, sqlx::Error> {
         // ConfigOrDatabaseError
         //self.create_all_tables();
         todo!();
@@ -262,7 +269,7 @@ impl Valve {
     /// If `validate` is false, just try to insert all rows.
     /// Return an error on database problem,
     /// including database conflicts that prevent rows being inserted.
-    pub fn load_all_tables(self, validate: bool) -> Result<Self, sqlx::Error> {
+    pub fn load_all_tables(&self, validate: bool) -> Result<&Self, sqlx::Error> {
         // DatabaseError
         //self.create_all_tables();
         //self.truncate_all_tables();
@@ -274,7 +281,7 @@ impl Valve {
     /// load those tables in the given order.
     /// If `validate` is false, just try to insert all rows.
     /// Return an error on invalid table name or database problem.
-    pub fn load_tables(self, tables: Vec<&str>, validate: bool) -> Result<Self, sqlx::Error> {
+    pub fn load_tables(&self, tables: Vec<&str>, validate: bool) -> Result<&Self, sqlx::Error> {
         // ConfigOrDatabaseError
         //self.create_all_tables();
         //self.truncate_tables(tables);
@@ -284,7 +291,7 @@ impl Valve {
 
     /// Save all configured tables to their 'path's.
     /// Return an error on writing or database problem.
-    pub fn save_all_tables(self) -> Result<Self, sqlx::Error> {
+    pub fn save_all_tables(&self) -> Result<&Self, sqlx::Error> {
         // WriteOrDatabaseError
         todo!();
         Ok(self)
     }
 
     /// Given a vector of table names,
     /// save those tables to their 'path's, in the given order.
     /// Return an error on writing or database problem.
-    pub fn save_tables(self, tables: Vec<&str>) -> Result<Self, sqlx::Error> {
+    pub fn save_tables(&self, tables: Vec<&str>) -> Result<&Self, sqlx::Error> {
         // WriteOrDatabaseError
         todo!();
         Ok(self)
@@ -302,7 +309,7 @@ impl Valve {
     /// Given a table name and a row as JSON,
     /// return the validated row.
     /// Return an error on database problem.
-    pub fn validate_row(self, table_name: &str, row: &ValveRow) -> Result<ValveRow, sqlx::Error> {
+    pub fn validate_row(&self, table_name: &str, row: &ValveRow) -> Result<ValveRow, sqlx::Error> {
         // DatabaseError
         todo!();
     }
@@ -311,7 +318,7 @@ impl Valve {
     /// add the row to the table in the database,
     /// and return the validated row, including its new row_number.
     /// Return an error on invalid table name or database problem.
-    pub fn insert_row(self, table_name: &str, row: &ValveRow) -> Result<ValveRow, sqlx::Error> {
+    pub fn insert_row(&self, table_name: &str, row: &ValveRow) -> Result<ValveRow, sqlx::Error> {
         // ConfigOrDatabaseError
         todo!();
     }
@@ -321,7 +328,7 @@ impl Valve {
     /// and return the validated row.
     /// Return an error on invalid table name or row number or database problem.
     pub fn update_row(
-        self,
+        &self,
         table_name: &str,
         row_number: usize,
         row: &ValveRow,
@@ -333,21 +340,21 @@ impl Valve {
     /// Given a table name and a row number,
     /// delete that row from the table.
     /// Return an error on invalid table name or row number or database problem.
-    pub fn delete_row(self, table_name: &str, row_number: usize) -> Result<(), sqlx::Error> {
+    pub fn delete_row(&self, table_name: &str, row_number: usize) -> Result<(), sqlx::Error> {
         // ConfigOrDatabaseError
         todo!();
     }
 
     /// Return the next change to undo, or None.
     /// Return an error on database problem.
-    pub fn get_record_to_undo(self) -> Result<Option<SerdeValue>, sqlx::Error> {
+    pub fn get_record_to_undo(&self) -> Result<Option<SerdeValue>, sqlx::Error> {
         // DatabaseError
         todo!();
     }
 
     /// Return the next change to redo, or None.
     /// Return an error on database problem.
-    pub fn get_record_to_redo(self) -> Result<Option<SerdeValue>, sqlx::Error> {
+    pub fn get_record_to_redo(&self) -> Result<Option<SerdeValue>, sqlx::Error> {
         // DatabaseError
         todo!();
     }
@@ -355,7 +362,7 @@ impl Valve {
     /// Undo one change and return the change record
     /// or None if there was no change to undo.
     /// Return an error on database problem.
-    pub fn undo(self) -> Result<Option<SerdeValue>, sqlx::Error> {
+    pub fn undo(&self) -> Result<Option<SerdeValue>, sqlx::Error> {
         // DatabaseError
         todo!();
     }
@@ -363,7 +370,7 @@ impl Valve {
     /// Redo one change and return the change record
     /// or None if there was no change to redo.
     /// Return an error on database problem.
-    pub fn redo(self) -> Result<Option<SerdeValue>, sqlx::Error> {
+    pub fn redo(&self) -> Result<Option<SerdeValue>, sqlx::Error> {
         // DatabaseError
         todo!();
     }
@@ -432,6 +439,29 @@ impl std::fmt::Debug for ColumnRule {
     }
 }
 
+/// TODO: Add docstring here. Note that once we have refactored configure_db() (see above) it may
+/// make more sense for this function to be an inner function of Valve.
+pub async fn get_pool_from_connection_string(database: &str) -> Result<AnyPool, sqlx::Error> {
+    let connection_options;
+    if database.starts_with("postgresql://") {
+        connection_options = AnyConnectOptions::from_str(database)?;
+    } else {
+        let connection_string;
+        if !database.starts_with("sqlite://") {
+            connection_string = format!("sqlite://{}?mode=rwc", database);
+        } else {
+            connection_string = database.to_string();
+        }
+        connection_options = AnyConnectOptions::from_str(connection_string.as_str()).unwrap();
+    }
+
+    let pool = AnyPoolOptions::new()
+        .max_connections(5)
+        .connect_with(connection_options)
+        .await?;
+    Ok(pool)
+}
+
 /// Given the path to a configuration table (either a table.tsv file or a database containing a
 /// table named "table"), load and check the 'table', 'column', and 'datatype' tables, and return
 /// SerdeMaps corresponding to specials, tables, datatypes, and rules.

From b5ea3a811ebe38999e63da46e21f5c209a008f94 Mon Sep 17 00:00:00 2001
From: Michael Cuffaro
Date: Mon, 27 Nov 2023 08:25:36 -0500
Subject: [PATCH 46/48] rename create_all_tables to create_missing_tables

---
 src/lib.rs | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/src/lib.rs b/src/lib.rs
index fd14adf4..73721b07 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -200,7 +200,7 @@ impl Valve {
     /// Create all configured database tables and views
     /// if they do not already exist as configured.
     /// Return an error on database problems.
-    pub async fn create_all_tables(&mut self, verbose: bool) -> Result<&mut Self, sqlx::Error> {
+    pub async fn create_missing_tables(&mut self, verbose: bool) -> Result<&mut Self, sqlx::Error> {
         // DatabaseError
         let mut tables_config = self
             .global_config
@@ -260,7 +260,7 @@ impl Valve {
     /// Return an error on invalid table name or database problem.
     pub fn truncate_tables(&self, tables: Vec<&str>) -> Result<&Self, sqlx::Error> {
         // ConfigOrDatabaseError
-        //self.create_all_tables();
+        //self.create_missing_tables();
         todo!();
         Ok(self)
     }
@@ -270,8 +270,10 @@ impl Valve {
     /// Return an error on database problem,
     /// including database conflicts that prevent rows being inserted.
     pub fn load_all_tables(&self, validate: bool) -> Result<&Self, sqlx::Error> {
+        // YOU ARE HERE.
+
         // DatabaseError
-        //self.create_all_tables();
+        //self.create_missing_tables();
         //self.truncate_all_tables();
         todo!();
         Ok(self)
@@ -283,7 +285,7 @@ impl Valve {
     /// Return an error on invalid table name or database problem.
     pub fn load_tables(&self, tables: Vec<&str>, validate: bool) -> Result<&Self, sqlx::Error> {
         // ConfigOrDatabaseError
-        //self.create_all_tables();
+        //self.create_missing_tables();
         //self.truncate_tables(tables);
         todo!();
         Ok(self)
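A note on the get_pool_from_connection_string() helper added in patch 45: bare file paths are normalized to sqlite://...?mode=rwc URLs, while postgresql:// and sqlite:// strings are parsed as given. A sketch of the intended usage, assuming the helper is in scope, sqlx's "any" drivers are enabled, and a tokio runtime is available; "valve.db" is an arbitrary example path:

    #[tokio::main]
    async fn main() -> Result<(), sqlx::Error> {
        // A bare path becomes "sqlite://valve.db?mode=rwc" internally, so the
        // database file is created if it does not yet exist.
        let pool = get_pool_from_connection_string("valve.db").await?;
        // A full URL such as "postgresql://user@localhost/valve" would be
        // parsed unchanged (and would require a running server).
        drop(pool);
        Ok(())
    }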
From 268bd2aa171bef1171fd11989fd08a8a4a00103f Mon Sep 17 00:00:00 2001
From: Michael Cuffaro
Date: Mon, 27 Nov 2023 09:07:17 -0500
Subject: [PATCH 47/48] implement (rough) load_all_tables()

---
 src/lib.rs  | 63 ++++++++++++++++++++++++++++++++++++++++++++++-------
 src/main.rs | 23 +++++++++++--------
 2 files changed, 69 insertions(+), 17 deletions(-)

diff --git a/src/lib.rs b/src/lib.rs
index 73721b07..4eefe7e9 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -105,9 +105,8 @@ impl Valve {
         // configuration. We will do that in read_config_files() instead.
         // Once this is implemented, the code below to construct the AnyPool which is used to
         // call configure_db() should be removed.
-        // We will also remove the `database`, `initial_load` and `verbose` parameters.
+        // We will also remove the `database` and `verbose` parameters.
         database: &str,
-        initial_load: bool,
         verbose: bool,
     ) -> Result<Self, sqlx::Error> {
         // TODO: Error type should be ConfigError
@@ -201,6 +200,10 @@ impl Valve {
     /// Return an error on database problems.
     pub async fn create_missing_tables(&mut self, verbose: bool) -> Result<&mut Self, sqlx::Error> {
         // DatabaseError
+
+        // TODO: Revisit the implementation of this once the configure_db() function has been
+        // refactored. Currently it implicitly drops and recreates _all_ tables but eventually this
+        // function needs to do this only for _missing_ tables.
         let mut tables_config = self
             .global_config
             .get_mut("table")
@@ -217,7 +220,6 @@ impl Valve {
         let pool = self.pool.as_ref().unwrap();
         let parser = StartParser::new();
 
-        // TODO: Revisit this once the configure_db() function has been refactored:
         let (_, _) = configure_db(
             &mut tables_config,
             &mut datatypes_config,
@@ -269,13 +271,58 @@ impl Valve {
     /// If `validate` is false, just try to insert all rows.
     /// Return an error on database problem,
     /// including database conflicts that prevent rows being inserted.
-    pub fn load_all_tables(&self, validate: bool) -> Result<&Self, sqlx::Error> {
-        // YOU ARE HERE.
-
+    pub async fn load_all_tables(
+        &mut self,
+        validate: bool,
+        verbose: bool,
+        initial_load: bool,
+    ) -> Result<&mut Self, sqlx::Error> {
         // DatabaseError
-        //self.create_missing_tables();
+
+        self.create_missing_tables(verbose);
         //self.truncate_all_tables();
-        todo!();
+        if let Some(pool) = &self.pool {
+            if pool.any_kind() == AnyKind::Sqlite {
+                sqlx_query("PRAGMA foreign_keys = ON").execute(pool).await?;
+                if initial_load {
+                    // These pragmas are unsafe but they are used during initial loading since data
+                    // integrity is not a priority in this case.
+                    sqlx_query("PRAGMA journal_mode = OFF")
+                        .execute(pool)
+                        .await?;
+                    sqlx_query("PRAGMA synchronous = 0").execute(pool).await?;
+                    sqlx_query("PRAGMA cache_size = 1000000")
+                        .execute(pool)
+                        .await?;
+                    sqlx_query("PRAGMA temp_store = MEMORY")
+                        .execute(pool)
+                        .await?;
+                }
+            }
+
+            if verbose {
+                eprintln!(
+                    "{} - Processing {} tables.",
+                    Utc::now(),
+                    self.global_config
+                        .get("sorted_table_list")
+                        .and_then(|l| l.as_array())
+                        .unwrap()
+                        .len()
+                );
+            }
+            load_db(
+                &self.global_config,
+                &pool,
+                &self.compiled_datatype_conditions,
+                &self.compiled_rule_conditions,
+                verbose,
+            )
+            .await?;
+        } else {
+            eprintln!("WARN: Attempt to load tables but Valve is not connected to a database.");
+        }
+
         Ok(self)
     }
 
diff --git a/src/main.rs b/src/main.rs
index 7e61aba4..4c919167 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -7,6 +7,7 @@ use argparse::{ArgumentParser, Store, StoreTrue};
 use ontodev_valve::{
     get_compiled_datatype_conditions, get_compiled_rule_conditions,
     get_parsed_structure_conditions, valve, valve_grammar::StartParser, ValveCommand,
+    Valve,
 };
 use serde_json::{from_str, Value as SerdeValue};
 use std::{env, process};
@@ -156,15 +157,19 @@ async fn main() -> Result<(), sqlx::Error> {
         )
         .await?;
     } else {
-        valve(
-            &source,
-            &destination,
-            &ValveCommand::Load,
-            verbose,
-            initial_load,
-            &config_table,
-        )
-        .await?;
+        let mut valve = Valve::build(&source, &config_table, &destination, verbose).await?;
+        valve.connect(&destination).await?;
+        valve.create_missing_tables(verbose).await?;
+        valve.load_all_tables(true, verbose, initial_load).await?;
+        // valve(
+        //     &source,
+        //     &destination,
+        //     &ValveCommand::Load,
+        //     verbose,
+        //     initial_load,
+        //     &config_table,
+        // )
+        // .await?;
     }
 
     Ok(())
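On the initial_load pragmas in load_all_tables(): disabling journaling and synchronous writes makes a crash unrecoverable, which is only defensible when the database can simply be rebuilt from its TSV sources, as it can here. A standalone sketch of the same batch against a throwaway in-memory SQLite database, assuming sqlx with the "sqlite" feature and a tokio runtime:

    use sqlx::sqlite::SqlitePoolOptions;

    #[tokio::main]
    async fn main() -> Result<(), sqlx::Error> {
        let pool = SqlitePoolOptions::new().connect("sqlite::memory:").await?;
        // The same speed-over-safety settings used above during initial loading.
        for pragma in [
            "PRAGMA journal_mode = OFF",
            "PRAGMA synchronous = 0",
            "PRAGMA cache_size = 1000000",
            "PRAGMA temp_store = MEMORY",
        ] {
            sqlx::query(pragma).execute(&pool).await?;
        }
        Ok(())
    }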
From 2b4073070959c839b6514df4b1bb4182da834235 Mon Sep 17 00:00:00 2001
From: Michael Cuffaro
Date: Mon, 27 Nov 2023 09:14:36 -0500
Subject: [PATCH 48/48] fix small bug in call to create_missing_tables()

---
 src/lib.rs  | 2 +-
 src/main.rs | 1 -
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/lib.rs b/src/lib.rs
index 4eefe7e9..5bc104bf 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -279,7 +279,7 @@ impl Valve {
     ) -> Result<&mut Self, sqlx::Error> {
         // DatabaseError
 
-        self.create_missing_tables(verbose);
+        self.create_missing_tables(verbose).await?;
         //self.truncate_all_tables();
         if let Some(pool) = &self.pool {
diff --git a/src/main.rs b/src/main.rs
index 4c919167..486cb522 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -159,7 +159,6 @@ async fn main() -> Result<(), sqlx::Error> {
     } else {
         let mut valve = Valve::build(&source, &config_table, &destination, verbose).await?;
         valve.connect(&destination).await?;
-        valve.create_missing_tables(verbose).await?;
         valve.load_all_tables(true, verbose, initial_load).await?;
         // valve(
         //     &source,
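The bug fixed by this last patch is easy to miss in review: Rust futures are lazy, so the earlier self.create_missing_tables(verbose); merely created a future and dropped it without ever running the body; only the added .await? actually executes the call, and the compiler flags the original form with an unused-future warning. The same mistake in miniature, assuming a tokio runtime:

    async fn create_missing_tables() {
        println!("creating tables");
    }

    #[tokio::main]
    async fn main() {
        create_missing_tables();       // warning: unused future; the body never runs
        create_missing_tables().await; // prints "creating tables"
    }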