From 21dc32653285263198b6aef81f8e90a7fada0ee9 Mon Sep 17 00:00:00 2001
From: Michael Cuffaro
Date: Mon, 6 Nov 2023 13:12:16 -0500
Subject: [PATCH 01/48] require that actual and defined columns always match
 when the former exist

---
 src/lib.rs                       | 122 +++++++++++++++----------------
 test/expected/table3.tsv         |  24 +++---
 test/random_test_data/column.tsv |   7 ++
 test/src/column.tsv              |   8 ++
 test/src/ontology/table3.tsv     |  22 +++---
 5 files changed, 97 insertions(+), 86 deletions(-)

diff --git a/src/lib.rs b/src/lib.rs
index 16de0d94..296fd7cf 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -969,31 +969,32 @@ pub async fn configure_db(
     // use that information to create the associated database tables, while saving constraint
     // information to constraints_config.
     let mut setup_statements = HashMap::new();
-    let table_names: Vec<String> = tables_config.keys().cloned().collect();
-    for table_name in table_names {
+    for table_name in tables_config.keys().cloned().collect::<Vec<String>>() {
         let optional_path = tables_config
             .get(&table_name)
             .and_then(|r| r.get("path"))
             .and_then(|p| p.as_str());

-        let path;
+        let mut path = None;
         match optional_path {
-            // If an entry of the tables_config has no path then it is an internal table which need
-            // not be configured explicitly. Currently the only example is the message table.
-            None => continue,
+            None => {
+                // If an entry of the tables_config has no path then it is an internal table which
+                // need not be configured explicitly. Currently the only examples are the message
+                // and history tables.
+                if table_name != "message" && table_name != "history" {
+                    panic!("No path defined for table {}", table_name);
+                }
+                continue;
+            }
             Some(p) if !Path::new(p).is_file() => {
                 eprintln!("WARN: File does not exist {}", p);
-                continue;
             }
             Some(p) if Path::new(p).canonicalize().is_err() => {
                 eprintln!("WARN: File path could not be made canonical {}", p);
-                continue;
             }
-
-            Some(p) => path = p.to_string(),
+            Some(p) => path = Some(p.to_string()),
         };

-        // Get the columns that have been previously configured:
         let defined_columns: Vec<String> = tables_config
             .get(&table_name)
             .and_then(|r| r.get("column"))
@@ -1003,65 +1004,58 @@ pub async fn configure_db(
             .and_then(|k| Some(k.collect()))
             .unwrap();

-        // Get the actual columns from the data itself. Note that we set has_headers to false
-        // (even though the files have header rows) in order to explicitly read the header row.
-        let mut rdr = csv::ReaderBuilder::new()
-            .has_headers(false)
-            .delimiter(b'\t')
-            .from_reader(File::open(path.clone()).unwrap_or_else(|err| {
-                panic!("Unable to open '{}': {}", path.clone(), err);
-            }));
-        let mut iter = rdr.records();
-        let actual_columns;
-        if let Some(result) = iter.next() {
-            actual_columns = result.unwrap();
-        } else {
-            panic!("'{}' is empty", path);
-        }
-
         // We use column_order to explicitly indicate the order in which the columns should appear
-        // in the table, for later reference.
+        // in the table, for later reference. The default is to preserve the order from the actual
+        // table file. If that does not exist, we use the ordering in defined_columns.
         let mut column_order = vec![];
-        let mut all_columns: SerdeMap = SerdeMap::new();
-        for column_name in &actual_columns {
-            let column;
-            if !defined_columns.contains(&column_name.to_string()) {
-                let mut cmap = SerdeMap::new();
-                cmap.insert(
-                    String::from("table"),
-                    SerdeValue::String(table_name.to_string()),
-                );
-                cmap.insert(
-                    String::from("column"),
-                    SerdeValue::String(column_name.to_string()),
-                );
-                cmap.insert(
-                    String::from("nulltype"),
-                    SerdeValue::String(String::from("empty")),
-                );
-                cmap.insert(
-                    String::from("datatype"),
-                    SerdeValue::String(String::from("text")),
-                );
-                column = SerdeValue::Object(cmap);
-            } else {
-                column = tables_config
-                    .get(&table_name)
-                    .and_then(|r| r.get("column"))
-                    .and_then(|v| v.as_object())
-                    .and_then(|o| o.get(column_name))
+        if let Some(path) = path {
+            // Get the actual columns from the data itself. Note that we set has_headers to
+            // false (even though the files have header rows) in order to explicitly read the
+            // header row.
+            let mut rdr = csv::ReaderBuilder::new()
+                .has_headers(false)
+                .delimiter(b'\t')
+                .from_reader(File::open(path.clone()).unwrap_or_else(|err| {
+                    panic!("Unable to open '{}': {}", path.clone(), err);
+                }));
+            let mut iter = rdr.records();
+            if let Some(result) = iter.next() {
+                let actual_columns = result
                     .unwrap()
-                    .clone();
+                    .iter()
+                    .map(|c| c.to_string())
+                    .collect::<Vec<_>>();
+                // Make sure that the actual columns found in the table file, and the columns
+                // defined in the column config, exactly match in terms of their content:
+                for column_name in &actual_columns {
+                    column_order.push(json!(column_name));
+                    if !defined_columns.contains(&column_name.to_string()) {
+                        panic!(
+                            "Column '{}.{}' not in column config",
+                            table_name, column_name
+                        );
+                    }
+                }
+                for column_name in &defined_columns {
+                    if !actual_columns.contains(&column_name.to_string()) {
+                        panic!(
+                            "Defined column '{}.{}' not found in table",
+                            table_name, column_name
+                        );
+                    }
+                }
+            } else {
+                panic!("'{}' is empty", path);
             }
-            column_order.push(SerdeValue::String(column_name.to_string()));
-            all_columns.insert(column_name.to_string(), column);
         }
+        if column_order.is_empty() {
+            column_order = defined_columns.iter().map(|c| json!(c)).collect::<Vec<_>>();
+        }
         tables_config
             .get_mut(&table_name)
             .and_then(|t| t.as_object_mut())
             .and_then(|o| {
-                o.insert(String::from("column"), SerdeValue::Object(all_columns));
                 o.insert(
                     String::from("column_order"),
                     SerdeValue::Array(column_order),
@@ -1097,9 +1091,11 @@
     }

     // Sort the tables according to their foreign key dependencies so that tables are always loaded
-    // after the tables they depend on:
-    let unsorted_tables: Vec<String> = setup_statements.keys().cloned().collect();
-    let sorted_tables = verify_table_deps_and_sort(&unsorted_tables, &constraints_config);
+    // after the tables they depend on.
Ignore the internal message and history tables: + let sorted_tables = verify_table_deps_and_sort( + &setup_statements.keys().cloned().collect(), + &constraints_config, + ); if *command != ValveCommand::Config || verbose { // Generate DDL for the history table: diff --git a/test/expected/table3.tsv b/test/expected/table3.tsv index 04c78efc..c0f31eda 100644 --- a/test/expected/table3.tsv +++ b/test/expected/table3.tsv @@ -1,12 +1,12 @@ -source id label type parent -MOB MOB:0000013 mobecular entity owl:Class material entity -ZOB ZOB:0000013 bar owl:Class car -JOB JOB:0000013 car owl:Class foo -SOB SOB:0000013 foo owl:Class bar -YOB YOB:0000013 mar owl:Class jafar -COB BFO:0000040 material entity owl:Class owl:Thing -CO B COB:0000013 molecular dentity owl:Class material entity -COB COB:0000013 molecular entity owl:Class material entity -COB VO:0000001 vaccine owl:Class material entity -BOB VO:0000001 vaccine owl:Class material entity -BFOBBER BFO:0000027 bazaar owl:Class barrie +source id label type parent related +MOB MOB:0000013 mobecular entity owl:Class material entity +ZOB ZOB:0000013 bar owl:Class car +JOB JOB:0000013 car owl:Class foo +SOB SOB:0000013 foo owl:Class bar +YOB YOB:0000013 mar owl:Class jafar +COB BFO:0000040 material entity owl:Class owl:Thing +CO B COB:0000013 molecular dentity owl:Class material entity +COB COB:0000013 molecular entity owl:Class material entity +COB VO:0000001 vaccine owl:Class material entity +BOB VO:0000001 vaccine owl:Class material entity +BFOBBER BFO:0000027 bazaar owl:Class barrie diff --git a/test/random_test_data/column.tsv b/test/random_test_data/column.tsv index 80268a30..f7e6a20e 100644 --- a/test/random_test_data/column.tsv +++ b/test/random_test_data/column.tsv @@ -12,7 +12,14 @@ column structure empty trimmed_line schema information for this column column description empty description a description of this column datatype datatype datatype_name primary the name of this datatype datatype parent empty datatype_name tree(datatype) the parent datatype +datatype transform empty word datatype condition empty datatype_condition +datatype structure empty trimmed_line +datatype description empty trimmed_text +datatype SQLite type empty datatype_name +datatype PostgreSQL type empty datatype_name +datatype RDF type empty datatype_name +datatype HTML type empty datatype_name rule table table_name rule when column column_name rule when condition datatype_condition diff --git a/test/src/column.tsv b/test/src/column.tsv index 07f38290..9c6c8256 100644 --- a/test/src/column.tsv +++ b/test/src/column.tsv @@ -12,7 +12,14 @@ column structure empty trimmed_line schema information for this column column description empty description a description of this column datatype datatype datatype_name primary the name of this datatype datatype parent empty datatype_name tree(datatype) the parent datatype +datatype transform empty word datatype condition empty datatype_condition +datatype structure empty trimmed_line +datatype description empty trimmed_text +datatype SQLite type empty trimmed_line +datatype PostgreSQL type empty trimmed_line +datatype RDF type empty trimmed_line +datatype HTML type empty datatype_name rule table table_name rule when column column_name rule when condition datatype_condition @@ -32,6 +39,7 @@ table2 bar empty text table3 source prefix from(table1.prefix) table3 id CURIE unique table3 label label primary +table3 type empty CURIE table3 parent empty label tree(label) table3 related empty trimmed_line table4 foreign_column text unique diff 
--git a/test/src/ontology/table3.tsv b/test/src/ontology/table3.tsv index 710e1e16..e8d75e99 100644 --- a/test/src/ontology/table3.tsv +++ b/test/src/ontology/table3.tsv @@ -1,11 +1,11 @@ -source id label type parent -MOB MOB:0000013 mobecular entity owl:Class material entity -ZOB ZOB:0000013 bar owl:Class car -JOB JOB:0000013 car owl:Class foo -SOB SOB:0000013 foo owl:Class bar -YOB YOB:0000013 mar owl:Class jafar -COB BFO:0000040 material entity owl:Class owl:Thing -CO B COB:0000013 molecular dentity owl:Class material entity -COB COB:0000013 molecular entity owl:Class material entity -COB VO:0000001 vaccine owl:Class material entity -BOB VO:0000001 vaccine owl:Class material entity +source id label type parent related +MOB MOB:0000013 mobecular entity owl:Class material entity +ZOB ZOB:0000013 bar owl:Class car +JOB JOB:0000013 car owl:Class foo +SOB SOB:0000013 foo owl:Class bar +YOB YOB:0000013 mar owl:Class jafar +COB BFO:0000040 material entity owl:Class owl:Thing +CO B COB:0000013 molecular dentity owl:Class material entity +COB COB:0000013 molecular entity owl:Class material entity +COB VO:0000001 vaccine owl:Class material entity +BOB VO:0000001 vaccine owl:Class material entity From c133071377693f88cb36304eb3dc7662025d6875 Mon Sep 17 00:00:00 2001 From: Michael Cuffaro Date: Tue, 7 Nov 2023 13:59:10 -0500 Subject: [PATCH 02/48] read config from file when generating random data --- Makefile | 8 +- test/generate_random_test_data.py | 325 +++++++++--------------------- test/perf_test_data/column.tsv | 7 + 3 files changed, 110 insertions(+), 230 deletions(-) diff --git a/Makefile b/Makefile index 024cd745..d13430b5 100644 --- a/Makefile +++ b/Makefile @@ -95,8 +95,8 @@ random_test: sqlite_random_test pg_random_test $(random_test_dir)/ontology: mkdir -p $(random_test_dir)/ontology -random_test_data: test/generate_random_test_data.py | $(random_test_dir)/ontology - ./$< $$(date +"%s") 100 5 $| +random_test_data: test/generate_random_test_data.py valve valve test/random_test_data/table.tsv | $(random_test_dir)/ontology + ./$< $$(date +"%s") 100 5 $(word 3,$^) $| sqlite_random_test: valve clean random_test_data | build test/output @echo "Testing with random data on sqlite ..." @@ -110,9 +110,9 @@ pg_random_test: valve clean random_test_data | build test/output test/round_trip.sh postgresql:///valve_postgres $(random_test_dir)/table.tsv @echo "Test succeeded!" 
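
The random_test_data recipe above now passes five positional arguments to the generator: a seed, the number of rows per table, the percentage of error rows, the VALVE table config, and the output directory. As a minimal Python sketch of the equivalent invocation (illustration only, not part of this patch; the paths are the ones the Makefile uses):

    import subprocess
    import time

    subprocess.run(
        [
            "test/generate_random_test_data.py",
            str(int(time.time())),  # seed: the current epoch, as in $$(date +"%s")
            "100",  # number of rows to generate per table
            "5",  # percentage of rows that should contain errors
            "test/random_test_data/table.tsv",  # the VALVE table config
            "test/random_test_data/ontology",  # the output directory
        ],
        check=True,
    )
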
-test/perf_test_data/ontology: test/generate_random_test_data.py +test/perf_test_data/ontology: test/generate_random_test_data.py valve test/random_test_data/table.tsv mkdir $@ - ./$< 1 10000 5 $@ + ./$< 1 10000 5 $(word 3,$^) $@ build/valve_perf.db: valve | test/perf_test_data/ontology build @if [ -f $@ ]; \ diff --git a/test/generate_random_test_data.py b/test/generate_random_test_data.py index 3f8d988d..e851727d 100755 --- a/test/generate_random_test_data.py +++ b/test/generate_random_test_data.py @@ -1,209 +1,65 @@ #!/usr/bin/env python3 +import json import math import random import string +import subprocess +import sys from argparse import ArgumentParser TOKEN_LENGTH = 9 +WINDOW_SIZE = 100 -CONFIG = { - "table1": { - "prefix": { - "allow_empty": False, - "datatype": "prefix", - "structure": { - "type": "primary", - }, - }, - "base": { - "allow_empty": False, - "datatype": "IRI", - "structure": { - "type": "unique", - }, - }, - "ontology IRI": { - "allow_empty": True, - "datatype": "IRI", - }, - "version IRI": { - "allow_empty": True, - "datatype": "IRI", - }, - }, - "table2": { - "child": { - "allow_empty": False, - "datatype": "trimmed_line", - "structure": { - "type": "foreign", - "ftable": "table4", - "fcolumn": "other_foreign_column", - }, - }, - "parent": { - "allow_empty": True, - "datatype": "trimmed_line", - "structure": { - "type": "tree", - "tcolumn": "child", - }, - }, - "xyzzy": { - "allow_empty": True, - "datatype": "trimmed_line", - "structure": { - "type": "under", - "ttable": "table2", - "tcolumn": "child", - "uval": "d", - }, - }, - "foo": { - "allow_empty": True, - "datatype": "integer", - "structure": { - "type": "foreign", - "ftable": "table4", - "fcolumn": "numeric_foreign_column", - }, - }, - "bar": { - "allow_empty": True, - "datatype": "text", - }, - }, - "table3": { - "source": { - "allow_empty": False, - "datatype": "prefix", - "structure": { - "type": "foreign", - "ftable": "table1", - "fcolumn": "prefix", - }, - }, - "id": { - "allow_empty": False, - "datatype": "curie", - "structure": { - "type": "unique", - }, - }, - "label": { - "allow_empty": False, - "datatype": "label", - "structure": { - "type": "primary", - }, - }, - "parent": { - "allow_empty": True, - "datatype": "label", - "structure": { - "type": "tree", - "tcolumn": "label", - }, - }, - "related": { - "allow_empty": True, - "datatype": "trimmed_line", - }, - }, - "table4": { - "foreign_column": { - "allow_empty": False, - "datatype": "text", - "structure": { - "type": "unique", - }, - }, - "other_foreign_column": { - "allow_empty": False, - "datatype": "text", - "structure": { - "type": "unique", - }, - }, - "numeric_foreign_column": { - "allow_empty": False, - "datatype": "integer", - "structure": { - "type": "primary", - }, - }, - }, - "table5": { - "foo": { - "allow_empty": False, - "datatype": "word", - "structure": { - "type": "primary", - }, - }, - "bar": { - "allow_empty": False, - "datatype": "integer", - }, - }, - "table6": { - "child": { - "allow_empty": False, - "datatype": "integer", - "structure": { - "type": "foreign", - "ftable": "table4", - "fcolumn": "numeric_foreign_column", - }, - }, - "parent": { - "allow_empty": True, - "datatype": "integer", - "structure": { - "type": "tree", - "tcolumn": "child", - }, - }, - "xyzzy": { - "allow_empty": True, - "datatype": "integer", - "structure": { - "type": "under", - "ttable": "table6", - "tcolumn": "child", - "uval": "4", - }, - }, - "foo": { - "allow_empty": True, - "datatype": "text", - }, - "bar": { - "allow_empty": True, - 
"datatype": "integer", - }, - }, -} - - -def get_value_from_prev_insert(prev_inserts, from_table, from_column, to_table, to_column): - global CONFIG +def get_special_tables(config): + return [k for k, v in config["special"].items() if v is not None] + + +def get_table_columns(config, table): + return [column for column in config["table"][table]["column_order"]] + + +def has_nulltype(config, table, column): + return bool(config["table"][table]["column"][column].get("nulltype")) + + +def get_column_structure(config, table, column): + return config["table"][table]["column"][column].get("structure") + + +def get_column_datatype(config, table, column): + return config["table"][table]["column"][column]["datatype"] + + +def get_foreign_key(config, table, column): + return [f for f in config["constraints"]["foreign"][table] if f["column"] == column][0] + + +def get_tree(config, table, column): + return [f for f in config["constraints"]["tree"][table] if f["parent"] == column][0] + + +def get_under(config, table, column): + return [f for f in config["constraints"]["under"][table] if f["column"] == column][0] + + +def get_value_from_prev_insert(config, prev_inserts, from_table, from_column, to_table, to_column): + global WINDOW_SIZE # Note: because we are loading the tables and columns in the correct order (i.e. such that # all dependencies are loaded before the tables and columns they depend on), the list of # previous inserts for the from_table/from_column will never be empty. if len(prev_inserts[from_table][from_column]) == 1: - if CONFIG[to_table][to_column]["allow_empty"]: + if has_nulltype(config, to_table, to_column): return "" else: return prev_inserts[from_table][from_column][0] else: - # Select at random from the last 100 inserted values: - prev_inserts[from_table][from_column] = prev_inserts[from_table][from_column][-100:] + # Select at random from the last N inserted values, with N given by WINDOW_SIZE: + prev_inserts[from_table][from_column] = prev_inserts[from_table][from_column][-WINDOW_SIZE:] from_values = prev_inserts[from_table][from_column] # We'd ideally like to exclude the last inserted value from consideration, but we save it # here in case we cannot: @@ -219,58 +75,58 @@ def get_value_from_prev_insert(prev_inserts, from_table, from_column, to_table, return values_to_choose_from[random.randrange(len(values_to_choose_from))] -def get_constrained_cell_value(table, column, row_num, prev_inserts): +def get_constrained_cell_value(config, table, column, row_num, prev_inserts): global TOKEN_LENGTH - global CONFIG - - structure = CONFIG[table][column].get("structure") - if structure and structure["type"] == "foreign": - ftable = structure["ftable"] - fcolumn = structure["fcolumn"] - cell = get_value_from_prev_insert(prev_inserts, ftable, fcolumn, table, column) - elif structure and structure["type"] == "tree": - tcolumn = structure["tcolumn"] - cell = get_value_from_prev_insert(prev_inserts, table, tcolumn, table, column) - elif structure and structure["type"] == "under": + + structure = get_column_structure(config, table, column) + datatype = get_column_datatype(config, table, column).casefold() + if structure.startswith("from("): + fkey = get_foreign_key(config, table, column) + ftable = fkey["ftable"] + fcolumn = fkey["fcolumn"] + cell = get_value_from_prev_insert(config, prev_inserts, ftable, fcolumn, table, column) + elif structure.startswith("tree("): + tkey = get_tree(config, table, column) + tcolumn = tkey["child"] + cell = get_value_from_prev_insert(config, prev_inserts, 
table, tcolumn, table, column) + elif structure.startswith("under("): # Note that properly satisfying the under constraint requires, not only that # the cell is in the specified tree column, but also (a) that the tree # actually exists, and (b) that the value is "under" the under value. To do # this properly, though, would require a decent amount of memory. So perhaps # it's not worth it to check for (a) and (b) and allow any offending cells # to generate errors which we can then verify are handled properly by valve. - ttable = structure["ttable"] - tcolumn = structure["tcolumn"] - cell = get_value_from_prev_insert(prev_inserts, ttable, tcolumn, table, column) - elif CONFIG[table][column]["datatype"] in [ + ukey = get_under(config, table, column) + ttable = ukey["ttable"] + tcolumn = ukey["tcolumn"] + cell = get_value_from_prev_insert(config, prev_inserts, ttable, tcolumn, table, column) + elif datatype in [ "prefix", - "IRI", + "iri", "trimmed_line", "label", "word", ]: cell = "".join(random.choices(string.ascii_lowercase, k=TOKEN_LENGTH)) - elif CONFIG[table][column]["datatype"] == "curie": + elif datatype == "curie": cell = ( "".join(random.choices(string.ascii_lowercase, k=3)).upper() + ":" + "".join(random.choices(string.ascii_lowercase, k=TOKEN_LENGTH)) ) - elif CONFIG[table][column]["datatype"] == "text": + elif datatype == "text": cell = ( "".join(random.choices(string.ascii_lowercase, k=TOKEN_LENGTH)) + " " + "".join(random.choices(string.ascii_lowercase, k=TOKEN_LENGTH)) ) - elif CONFIG[table][column]["datatype"] == "integer": + elif datatype == "integer": # No leading 0s: cell = "".join(random.choices("123456789", k=1)) + "".join( random.choices(string.digits, k=TOKEN_LENGTH - 1) ) else: - print( - f"Warning: Unknown datatype: {CONFIG[table][column]['datatype']}. " - "Generating a random string." - ) + print(f"Warning: Unknown datatype: {datatype}. Generating a random string.") cell = "".join(random.choices(string.ascii_lowercase, k=TOKEN_LENGTH)) return cell @@ -278,20 +134,20 @@ def get_constrained_cell_value(table, column, row_num, prev_inserts): def main(): global TOKEN_LENGTH - global CONFIG parser = ArgumentParser( description=""" Deterministically generate a specified amount of data, a specified percentage of which are - errors, using a hard-coded VALVE configuration, given the specified seed, to a specified - output directory. 
- """ + errors, using the given VALVE table configuration and seed, to the output directory.""" ) parser.add_argument("seed", help="The seed to use to generate the random data") parser.add_argument("num_rows", help="The number of rows per table to generate") parser.add_argument( "pct_errors", help="The percentage of rows in each table that should have errors" ) + parser.add_argument( + "input_table", help="The .TSV file representing the VALVE table configuration" + ) parser.add_argument( "output_dir", help="The output directory to write the new table configuration to" ) @@ -299,48 +155,65 @@ def main(): seed = int(args.seed) num_rows = int(args.num_rows) pct_errors = int(args.pct_errors) + input_table = args.input_table outdir = args.output_dir + # Use the seed argument to seed the random data that will be generated: random.seed(seed) + # Get the VALVE configuration: + result = subprocess.run(["valve", "--dump_config", input_table], capture_output=True) + if result.returncode != 0: + error = result.stderr.decode() + output = result.stdout.decode() + if output: + error = f"{error}\n{output}" + print(f"{error}", file=sys.stderr) + sys.exit(result.returncode) + config = json.loads(result.stdout.decode()) + # This is a record of the last inserted values for each table and column. When one column # takes its values from another column, then we look here and fetch the last inserted value of # the second column. prev_inserts = {} + + # The data tables to generate: + data_tables = [t for t in config["sorted_table_list"] if t not in get_special_tables(config)] + + # The TSV files corresponding to each data table: tsv_files = {} - tables_in_order = ["table4", "table1", "table2", "table3", "table5", "table6"] - for table in tables_in_order: + for table in data_tables: tsv_files[table] = open(f"{outdir}/{table}.tsv", "w") - columns = [column for column in CONFIG[table]] + columns = get_table_columns(config, table) print("\t".join(columns), file=tsv_files[table]) num_error_rows = math.ceil((pct_errors / 100) * num_rows) error_proportion = None if not num_error_rows else math.floor(num_rows / num_error_rows) for row_num in range(1, num_rows + 1): - for table in tables_in_order: + for table in data_tables: is_error_row = error_proportion and row_num % error_proportion == 1 - columns = [column for column in CONFIG[table]] + columns = get_table_columns(config, table) error_column = random.randrange(len(columns)) row = {} for column_num, column in enumerate(columns): is_error_column = is_error_row and column_num == error_column if ( not is_error_column - and CONFIG[table][column]["allow_empty"] + and has_nulltype(config, table, column) and row_num % random.randrange(2, num_rows) == 1 ): # If the column allows empty values, assign an empty value "sometimes": cell = "" elif not is_error_column: - cell = get_constrained_cell_value(table, column, row_num, prev_inserts) + cell = get_constrained_cell_value(config, table, column, row_num, prev_inserts) else: - if CONFIG[table][column].get("structure") and CONFIG[table][column][ - "structure" - ]["type"] in ["unique", "primary"]: + structure = get_column_structure(config, table, column) + datatype = get_column_datatype(config, table, column) + if structure in ["unique", "primary"]: cell = "" - elif CONFIG[table][column]["datatype"] in [ + elif datatype in [ "prefix", - "IRI", + "iri", "word", "curie", ]: @@ -350,7 +223,7 @@ def main(): + "".join(random.choices(string.ascii_lowercase, k=TOKEN_LENGTH)) ) else: - if CONFIG[table][column]["datatype"] == "integer": + if 
datatype == "integer": cell = "".join(random.choices(string.ascii_lowercase, k=TOKEN_LENGTH)) else: # No leading 0s: diff --git a/test/perf_test_data/column.tsv b/test/perf_test_data/column.tsv index 80268a30..f7e6a20e 100644 --- a/test/perf_test_data/column.tsv +++ b/test/perf_test_data/column.tsv @@ -12,7 +12,14 @@ column structure empty trimmed_line schema information for this column column description empty description a description of this column datatype datatype datatype_name primary the name of this datatype datatype parent empty datatype_name tree(datatype) the parent datatype +datatype transform empty word datatype condition empty datatype_condition +datatype structure empty trimmed_line +datatype description empty trimmed_text +datatype SQLite type empty datatype_name +datatype PostgreSQL type empty datatype_name +datatype RDF type empty datatype_name +datatype HTML type empty datatype_name rule table table_name rule when column column_name rule when condition datatype_condition From 33ce7af45b3b6041083676c187aea98887fce0c1 Mon Sep 17 00:00:00 2001 From: Michael Cuffaro Date: Tue, 7 Nov 2023 14:15:12 -0500 Subject: [PATCH 03/48] fix path to valve in random data generation script --- test/generate_random_test_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/generate_random_test_data.py b/test/generate_random_test_data.py index e851727d..63e3f32e 100755 --- a/test/generate_random_test_data.py +++ b/test/generate_random_test_data.py @@ -162,7 +162,7 @@ def main(): random.seed(seed) # Get the VALVE configuration: - result = subprocess.run(["valve", "--dump_config", input_table], capture_output=True) + result = subprocess.run(["./valve", "--dump_config", input_table], capture_output=True) if result.returncode != 0: error = result.stderr.decode() output = result.stdout.decode() From 729a894967e6e0cc1999617ad18c7278f9263a8c Mon Sep 17 00:00:00 2001 From: Michael Cuffaro Date: Tue, 7 Nov 2023 14:44:34 -0500 Subject: [PATCH 04/48] add guess test files --- Makefile | 27 ++++++++++++---- test/guess_test_data/column.tsv | 53 +++++++++++++++++++++++++++++++ test/guess_test_data/datatype.tsv | 22 +++++++++++++ test/guess_test_data/rule.tsv | 9 ++++++ test/guess_test_data/table.tsv | 11 +++++++ 5 files changed, 115 insertions(+), 7 deletions(-) create mode 100644 test/guess_test_data/column.tsv create mode 100644 test/guess_test_data/datatype.tsv create mode 100644 test/guess_test_data/rule.tsv create mode 100644 test/guess_test_data/table.tsv diff --git a/Makefile b/Makefile index d13430b5..d2bd4a2b 100644 --- a/Makefile +++ b/Makefile @@ -14,7 +14,7 @@ build: .PHONY: doc time test sqlite_test pg_test .PHONY: api_test sqlite_api_test pg_qpi_test -.PHONY: random_test_data random_test sqlite_random_test pg_random_test +.PHONY: random_test_data random_test sqlite_random_test pg_random_test valve_debug valve_release doc: cargo doc --document-private-items @@ -22,12 +22,17 @@ doc: readme: cargo readme --no-title > README.md -valve: src/*.rs src/*.lalrpop +valve: valve_debug + +valve_release: src/*.rs src/*.lalrpop rm -f valve cargo build --release ln -s target/release/ontodev_valve valve - # cargo build - # ln -s target/debug/ontodev_valve valve + +valve_debug: src/*.rs src/*.lalrpop + rm -f valve + cargo build + ln -s target/debug/ontodev_valve valve build/valve.db: test/src/table.tsv valve clean | build ./valve $< $@ @@ -93,7 +98,7 @@ random_test_dir = test/random_test_data random_test: sqlite_random_test pg_random_test $(random_test_dir)/ontology: - mkdir -p 
$(random_test_dir)/ontology + mkdir -p $@ random_test_data: test/generate_random_test_data.py valve valve test/random_test_data/table.tsv | $(random_test_dir)/ontology ./$< $$(date +"%s") 100 5 $(word 3,$^) $| @@ -110,7 +115,15 @@ pg_random_test: valve clean random_test_data | build test/output test/round_trip.sh postgresql:///valve_postgres $(random_test_dir)/table.tsv @echo "Test succeeded!" -test/perf_test_data/ontology: test/generate_random_test_data.py valve test/random_test_data/table.tsv +guess_test_dir = test/guess_test_data + +$(guess_test_dir)/ontology: + mkdir -p $@ + +guess_test_data: test/generate_random_test_data.py valve valve $(guess_test_dir)/table.tsv | $(guess_test_dir)/ontology + ./$< $$(date +"%s") 10000 5 $(word 3,$^) $| + +test/perf_test_data/ontology: test/generate_random_test_data.py valve test/perf_test_data/table.tsv mkdir $@ ./$< 1 10000 5 $(word 3,$^) $@ @@ -136,7 +149,7 @@ pg_perf_test: valve test/perf_test_data/ontology | test/output perf_test: sqlite_perf_test pg_perf_test clean: - rm -Rf build/valve.db build/valve_random.db test/output $(random_test_dir)/ontology + rm -Rf build/valve.db build/valve_random.db test/output $(random_test_dir)/ontology $(guess_test_dir)/ontology cleanperfdb: rm -Rf build/valve_perf.db diff --git a/test/guess_test_data/column.tsv b/test/guess_test_data/column.tsv new file mode 100644 index 00000000..552b1036 --- /dev/null +++ b/test/guess_test_data/column.tsv @@ -0,0 +1,53 @@ +table column label nulltype datatype structure description +table table table_name table_name primary name of this table +table path path path to the TSV file for this table, relative to the table.tsv file +table type empty table_type type of this table, used for tables with special meanings +table description empty description a description of this table +column table table_name from(table.table) the table that this column belongs to +column column column_name the name of this column +column label empty label the human-readable label for this column +column nulltype empty datatype_name from(datatype.datatype) the datatype for NULL values in this column +column datatype datatype_name from(datatype.datatype) the datatype for this column +column structure empty trimmed_line schema information for this column +column description empty description a description of this column +datatype datatype datatype_name primary the name of this datatype +datatype parent empty datatype_name tree(datatype) the parent datatype +datatype transform empty word +datatype condition empty datatype_condition +datatype structure empty trimmed_line +datatype description empty trimmed_text +datatype SQLite type empty datatype_name +datatype PostgreSQL type empty datatype_name +datatype RDF type empty datatype_name +datatype HTML type empty datatype_name +rule table table_name +rule when column column_name +rule when condition datatype_condition +rule then column column_name +rule then condition datatype_condition +rule level word +rule description empty description +table1 prefix prefix primary +table1 base IRI unique +table1 ontology IRI empty IRI +table1 version IRI empty IRI +table2 child trimmed_line +table2 parent empty trimmed_line +table2 xyzzy empty trimmed_line +table2 foo empty integer +table2 bar empty text +table3 source prefix +table3 id CURIE unique +table3 label label primary +table3 parent empty label +table3 related empty trimmed_line +table4 foreign_column text unique +table4 other_foreign_column text unique +table4 numeric_foreign_column integer primary +table5 foo 
word primary +table5 bar integer +table6 child integer +table6 parent empty integer +table6 xyzzy empty integer +table6 foo empty text +table6 bar empty integer diff --git a/test/guess_test_data/datatype.tsv b/test/guess_test_data/datatype.tsv new file mode 100644 index 00000000..c118588d --- /dev/null +++ b/test/guess_test_data/datatype.tsv @@ -0,0 +1,22 @@ +datatype parent transform condition structure description SQLite type PostgreSQL type RDF type HTML type +CURIE nonspace match(/\S+:\S+/) concat(prefix, ":", suffix) a Compact URI CURIE +IRI nonspace exclude(/\s/) an Internationalized Resource Identifier IRI +column_name trimmed_line match(/\S([^\n]*\S)*/) a column name +datatype_condition line exclude(/\n/) a datatype condition specification +datatype_name word exclude(/\W/) a datatype name +description trimmed_text match(/\S(.*\S)*/) a brief description +empty text equals('') the empty string NULL NULL null +integer nonspace match(/-?\d+/) a positive or negative integer INTEGER INTEGER +label trimmed_line match(/\S([^\n]*\S)*/) +line text exclude(/\n/) a line of text input +natural_number integer match(/\d+/) a natural number, including zero INTEGER INTEGER +nonspace trimmed_line exclude(/\s/) text without whitespace +path line exclude(/\n/) a path to a file +prefix word exclude(/\W/) a prefix for a CURIE +suffix word exclude(/\W/) a suffix for a CURIE +table_name word exclude(/\W/) a table name +table_type word lowercase in('table', 'column', 'datatype') a table type +text any text TEXT TEXT xsd:string textarea +trimmed_line line match(/\S([^\n]*\S)*/) a line of text that does not begin or end with whitespace +trimmed_text text exclude(/^\s+|\s+$/) text that does not begin or end with whitespace +word nonspace exclude(/\W/) a single word: letters, numbers, underscore diff --git a/test/guess_test_data/rule.tsv b/test/guess_test_data/rule.tsv new file mode 100644 index 00000000..3a9356ff --- /dev/null +++ b/test/guess_test_data/rule.tsv @@ -0,0 +1,9 @@ +table when column when condition then column then condition level description +table2 foo null bar null error bar must be null whenever foo is null +table2 foo not null bar not null error bar cannot be null if foo is not null +table2 foo IRI bar label error bar must be a label if foo is an IRI +table2 foo equals(5) bar in('y', 'z') error bar must be 'y' or 'z' if foo = 5 +table6 foo null bar null error bar must be null whenever foo is null +table6 foo not null bar not null error bar cannot be null if foo is not null +table6 foo IRI bar label error bar must be a label if foo is an IRI +table6 foo equals(e) bar in(25, 26) error bar must be 25 or 26 if foo = 'e' diff --git a/test/guess_test_data/table.tsv b/test/guess_test_data/table.tsv new file mode 100644 index 00000000..1fcc8584 --- /dev/null +++ b/test/guess_test_data/table.tsv @@ -0,0 +1,11 @@ +table path description type +column test/guess_test_data/column.tsv Columns for all of the tables. column +datatype test/guess_test_data/datatype.tsv Datatypes for all of the columns datatype +rule test/guess_test_data/rule.tsv More complex "when" rules rule +table test/guess_test_data/table.tsv All of the user-editable tables in this project. 
table +table1 test/guess_test_data/ontology/table1.tsv The first data table +table2 test/guess_test_data/ontology/table2.tsv The second data table +table3 test/guess_test_data/ontology/table3.tsv The third data table +table4 test/guess_test_data/ontology/table4.tsv The fourth data table +table5 test/guess_test_data/ontology/table5.tsv The fifth data table +table6 test/guess_test_data/ontology/table6.tsv The sixth data table (like table2 but all numeric) From cdb657fa685bcdfc8a4406104582ab5683535a95 Mon Sep 17 00:00:00 2001 From: Michael Cuffaro Date: Thu, 9 Nov 2023 10:23:49 -0500 Subject: [PATCH 05/48] setup Makefile for guess tests --- Makefile | 70 ++++++++++++++++-------- confirm_overwrite.sh | 14 +++++ test/generate_random_test_data.py | 2 +- test/guess_test_data/column.tsv | 20 ------- test/guess_test_data/column_expected.tsv | 53 ++++++++++++++++++ test/guess_test_data/rule.tsv | 9 +-- test/guess_test_data/table.tsv | 7 +-- test/guess_test_data/table_expected.tsv | 11 ++++ 8 files changed, 128 insertions(+), 58 deletions(-) create mode 100755 confirm_overwrite.sh create mode 100644 test/guess_test_data/column_expected.tsv create mode 100644 test/guess_test_data/table_expected.tsv diff --git a/Makefile b/Makefile index d2bd4a2b..01e4f65f 100644 --- a/Makefile +++ b/Makefile @@ -22,7 +22,8 @@ doc: readme: cargo readme --no-title > README.md -valve: valve_debug +valve: src/*.rs src/*.lalrpop + @$(MAKE) valve_debug valve_release: src/*.rs src/*.lalrpop rm -f valve @@ -34,7 +35,7 @@ valve_debug: src/*.rs src/*.lalrpop cargo build ln -s target/debug/ontodev_valve valve -build/valve.db: test/src/table.tsv valve clean | build +build/valve.db: test/src/table.tsv clean valve | build ./valve $< $@ test/output: @@ -116,47 +117,70 @@ pg_random_test: valve clean random_test_data | build test/output @echo "Test succeeded!" 
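
The guess- and perf-data targets below refuse to silently clobber previously generated data: they first run confirm_overwrite.sh, which prompts before anything in the target directory is overwritten. A rough Python rendering of that guard's logic (illustration only; the repository uses the POSIX shell script added further down):

    import os
    import sys

    def confirm_overwrite(directory):
        # Only prompt when the directory exists and is non-empty:
        if os.path.isdir(directory) and os.listdir(directory):
            files = ", ".join(os.listdir(directory))
            answer = input(
                f"{directory} already exists and contains the following files: {files}\n"
                "Are you sure (y/n)? "
            )
            if answer != "y":
                print("Understood. Exiting with error code.")
                sys.exit(1)
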
guess_test_dir = test/guess_test_data +guess_test_db = build/valve_guess.db +.PHONY: guess_test_data + +$(guess_test_dir)/table1.tsv: test/generate_random_test_data.py valve $(guess_test_dir)/*.tsv + ./$< $$(date +"%s") 50000 5 $(guess_test_dir)/table.tsv $(guess_test_dir) $(guess_test_dir)/ontology: mkdir -p $@ -guess_test_data: test/generate_random_test_data.py valve valve $(guess_test_dir)/table.tsv | $(guess_test_dir)/ontology - ./$< $$(date +"%s") 10000 5 $(word 3,$^) $| +guess_test_data: test/generate_random_test_data.py $(guess_test_dir)/table1.tsv valve confirm_overwrite.sh $(guess_test_dir)/*.tsv | $(guess_test_dir)/ontology + ./confirm_overwrite.sh $(guess_test_dir)/ontology + rm -f $(guess_test_dir)/table1.tsv + ./$< $$(date +"%s") 50000 5 $(guess_test_dir)/table.tsv $(guess_test_dir) + rm -f $(guess_test_dir)/ontology/*.tsv + ./$< $$(date +"%s") 50000 5 $(guess_test_dir)/table_expected.tsv $| + rm -f $(guess_test_dir)/ontology/table1.tsv + +$(guess_test_db): valve guess_test_data $(guess_test_dir)/*.tsv | build $(guess_test_dir)/ontology + rm -f $@ + ./$< $(guess_test_dir)/table.tsv $@ + +perf_test_dir = test/perf_test_data +perf_test_db = build/valve_perf.db +.PHONY: perf_test_data + +$(perf_test_dir)/ontology: + mkdir -p $@ -test/perf_test_data/ontology: test/generate_random_test_data.py valve test/perf_test_data/table.tsv - mkdir $@ - ./$< 1 10000 5 $(word 3,$^) $@ +perf_test_data: test/generate_random_test_data.py valve confirm_overwrite.sh $(perf_test_dir)/*.tsv | $(perf_test_dir)/ontology + ./confirm_overwrite.sh $(perf_test_dir)/ontology + rm -f $(perf_test_dir)/ontology/*.tsv + ./$< $$(date +"%s") 10000 5 $(perf_test_dir)/table.tsv $| -build/valve_perf.db: valve | test/perf_test_data/ontology build - @if [ -f $@ ]; \ - then \ - echo "'$@' exists but is out of date. 
To rebuild '$@', run \`make cleanperfdb\`" \ - "before running \`make $@\`" ; \ - false; \ - fi - time -p ./$< --verbose test/perf_test_data/table.tsv $@ +$(perf_test_db): valve perf_test_data $(perf_test_dir)/*.tsv | build $(perf_test_dir)/ontology + rm -f $@ + time -p ./$< --verbose $(perf_test_dir)/table.tsv $@ .PHONY: sqlite_perf_test sqlite_perf_test: build/valve_perf.db | test/output time -p scripts/export.py messages $< $| $(tables_to_test) .PHONY: pg_perf_test -pg_perf_test: valve test/perf_test_data/ontology | test/output - time -p ./$< --verbose test/perf_test_data/table.tsv postgresql:///valve_postgres +pg_perf_test: valve $(perf_test_dir)/ontology | test/output + time -p ./$< --verbose $(perf_test_dir)/table.tsv postgresql:///valve_postgres time -p scripts/export.py messages postgresql:///valve_postgres $| $(tables_to_test) .PHONY: perf_test perf_test: sqlite_perf_test pg_perf_test clean: - rm -Rf build/valve.db build/valve_random.db test/output $(random_test_dir)/ontology $(guess_test_dir)/ontology + rm -Rf build/valve.db* build/valve_random.db* test/output $(random_test_dir)/ontology valve + +clean_guess_db: + rm -Rf build/valve_guess.db -cleanperfdb: +clean_guess_data: + rm -Rf $(guess_test_dir)/table1.tsv $(guess_test_dir)/ontology + +clean_perf_db: rm -Rf build/valve_perf.db -cleanperfdata: - rm -Rf test/perf_test_data/ontology +clean_perf_data: + rm -Rf $(perf_test_dir)/ontology -cleanall: clean cleanperfdb cleanperfdata +cleanall: clean clean_perf_db clean_perf_data clean_guess_db clean_guess_data cargo clean - rm -Rf valve + rm -f valve diff --git a/confirm_overwrite.sh b/confirm_overwrite.sh new file mode 100755 index 00000000..aa58cd50 --- /dev/null +++ b/confirm_overwrite.sh @@ -0,0 +1,14 @@ +#!/usr/bin/env sh + +if [ -d $1 -a ! -z "$(ls -A $1)" ] +then + printf "$1 already exists and contains the following files: $(ls -A -m -w 0 $1)\nAre you sure (y/n)? " + read enter + if [ $enter = 'y' ] + then + exit 0 + else + echo "Understood. Exiting with error code." 
+ exit 1 + fi +fi diff --git a/test/generate_random_test_data.py b/test/generate_random_test_data.py index 63e3f32e..87008651 100755 --- a/test/generate_random_test_data.py +++ b/test/generate_random_test_data.py @@ -11,7 +11,7 @@ TOKEN_LENGTH = 9 -WINDOW_SIZE = 100 +WINDOW_SIZE = 50 def get_special_tables(config): diff --git a/test/guess_test_data/column.tsv b/test/guess_test_data/column.tsv index 552b1036..2659b524 100644 --- a/test/guess_test_data/column.tsv +++ b/test/guess_test_data/column.tsv @@ -31,23 +31,3 @@ table1 prefix prefix primary table1 base IRI unique table1 ontology IRI empty IRI table1 version IRI empty IRI -table2 child trimmed_line -table2 parent empty trimmed_line -table2 xyzzy empty trimmed_line -table2 foo empty integer -table2 bar empty text -table3 source prefix -table3 id CURIE unique -table3 label label primary -table3 parent empty label -table3 related empty trimmed_line -table4 foreign_column text unique -table4 other_foreign_column text unique -table4 numeric_foreign_column integer primary -table5 foo word primary -table5 bar integer -table6 child integer -table6 parent empty integer -table6 xyzzy empty integer -table6 foo empty text -table6 bar empty integer diff --git a/test/guess_test_data/column_expected.tsv b/test/guess_test_data/column_expected.tsv new file mode 100644 index 00000000..f7e6a20e --- /dev/null +++ b/test/guess_test_data/column_expected.tsv @@ -0,0 +1,53 @@ +table column label nulltype datatype structure description +table table table_name table_name primary name of this table +table path path path to the TSV file for this table, relative to the table.tsv file +table type empty table_type type of this table, used for tables with special meanings +table description empty description a description of this table +column table table_name from(table.table) the table that this column belongs to +column column column_name the name of this column +column label empty label the human-readable label for this column +column nulltype empty datatype_name from(datatype.datatype) the datatype for NULL values in this column +column datatype datatype_name from(datatype.datatype) the datatype for this column +column structure empty trimmed_line schema information for this column +column description empty description a description of this column +datatype datatype datatype_name primary the name of this datatype +datatype parent empty datatype_name tree(datatype) the parent datatype +datatype transform empty word +datatype condition empty datatype_condition +datatype structure empty trimmed_line +datatype description empty trimmed_text +datatype SQLite type empty datatype_name +datatype PostgreSQL type empty datatype_name +datatype RDF type empty datatype_name +datatype HTML type empty datatype_name +rule table table_name +rule when column column_name +rule when condition datatype_condition +rule then column column_name +rule then condition datatype_condition +rule level word +rule description empty description +table1 prefix prefix primary +table1 base IRI unique +table1 ontology IRI empty IRI +table1 version IRI empty IRI +table2 child trimmed_line from(table4.other_foreign_column) +table2 parent empty trimmed_line tree(child) +table2 xyzzy empty trimmed_line under(table2.child, d) +table2 foo empty integer from(table4.numeric_foreign_column) +table2 bar empty text +table3 source prefix from(table1.prefix) +table3 id CURIE unique +table3 label label primary +table3 parent empty label tree(label) +table3 related empty trimmed_line +table4 foreign_column text 
unique +table4 other_foreign_column text unique +table4 numeric_foreign_column integer primary +table5 foo word primary +table5 bar integer +table6 child integer from(table4.numeric_foreign_column) +table6 parent empty integer tree(child) +table6 xyzzy empty integer under(table6.child, 4) +table6 foo empty text +table6 bar empty integer diff --git a/test/guess_test_data/rule.tsv b/test/guess_test_data/rule.tsv index 3a9356ff..a46b8d52 100644 --- a/test/guess_test_data/rule.tsv +++ b/test/guess_test_data/rule.tsv @@ -1,9 +1,2 @@ table when column when condition then column then condition level description -table2 foo null bar null error bar must be null whenever foo is null -table2 foo not null bar not null error bar cannot be null if foo is not null -table2 foo IRI bar label error bar must be a label if foo is an IRI -table2 foo equals(5) bar in('y', 'z') error bar must be 'y' or 'z' if foo = 5 -table6 foo null bar null error bar must be null whenever foo is null -table6 foo not null bar not null error bar cannot be null if foo is not null -table6 foo IRI bar label error bar must be a label if foo is an IRI -table6 foo equals(e) bar in(25, 26) error bar must be 25 or 26 if foo = 'e' +table1 ontology IRI null version IRI null error 'version IRI' must be null whenever 'ontology IRI' is null diff --git a/test/guess_test_data/table.tsv b/test/guess_test_data/table.tsv index 1fcc8584..ac5800f1 100644 --- a/test/guess_test_data/table.tsv +++ b/test/guess_test_data/table.tsv @@ -3,9 +3,4 @@ column test/guess_test_data/column.tsv Columns for all of the tables. column datatype test/guess_test_data/datatype.tsv Datatypes for all of the columns datatype rule test/guess_test_data/rule.tsv More complex "when" rules rule table test/guess_test_data/table.tsv All of the user-editable tables in this project. table -table1 test/guess_test_data/ontology/table1.tsv The first data table -table2 test/guess_test_data/ontology/table2.tsv The second data table -table3 test/guess_test_data/ontology/table3.tsv The third data table -table4 test/guess_test_data/ontology/table4.tsv The fourth data table -table5 test/guess_test_data/ontology/table5.tsv The fifth data table -table6 test/guess_test_data/ontology/table6.tsv The sixth data table (like table2 but all numeric) +table1 test/guess_test_data/table1.tsv The first data table diff --git a/test/guess_test_data/table_expected.tsv b/test/guess_test_data/table_expected.tsv new file mode 100644 index 00000000..dfb683c4 --- /dev/null +++ b/test/guess_test_data/table_expected.tsv @@ -0,0 +1,11 @@ +table path description type +column test/guess_test_data/column_expected.tsv Columns for all of the tables. column +datatype test/guess_test_data/datatype.tsv Datatypes for all of the columns datatype +rule test/guess_test_data/rule.tsv More complex "when" rules rule +table test/guess_test_data/table_expected.tsv All of the user-editable tables in this project. 
table +table1 test/guess_test_data/table1.tsv The first data table +table2 test/guess_test_data/ontology/table2.tsv The second data table +table3 test/guess_test_data/ontology/table3.tsv The third data table +table4 test/guess_test_data/ontology/table4.tsv The fourth data table +table5 test/guess_test_data/ontology/table5.tsv The fifth data table +table6 test/guess_test_data/ontology/table6.tsv The sixth data table (like table2 but all numeric) From 701cc84f59073ec97b614d9cdf16a28ff3e588d7 Mon Sep 17 00:00:00 2001 From: Michael Cuffaro Date: Thu, 9 Nov 2023 15:36:26 -0500 Subject: [PATCH 06/48] tweaks to Makefile --- Makefile | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/Makefile b/Makefile index 01e4f65f..8e53c26a 100644 --- a/Makefile +++ b/Makefile @@ -12,9 +12,10 @@ MAKEFLAGS += --warn-undefined-variables build: mkdir build -.PHONY: doc time test sqlite_test pg_test -.PHONY: api_test sqlite_api_test pg_qpi_test -.PHONY: random_test_data random_test sqlite_random_test pg_random_test valve_debug valve_release +.PHONY: doc readme valve_debug valve_release test sqlite_test pg_test api_test sqlite_api_test \ + pg_qpi_test random_test_data random_test sqlite_random_test pg_random_test guess_test_data \ + perf_test_data sqlite_perf_test pg_perf_test perf_test + doc: cargo doc --document-private-items @@ -25,12 +26,12 @@ readme: valve: src/*.rs src/*.lalrpop @$(MAKE) valve_debug -valve_release: src/*.rs src/*.lalrpop +valve_release: rm -f valve cargo build --release ln -s target/release/ontodev_valve valve -valve_debug: src/*.rs src/*.lalrpop +valve_debug: rm -f valve cargo build ln -s target/debug/ontodev_valve valve @@ -43,7 +44,8 @@ test/output: test: sqlite_test pg_test api_test random_test -tables_to_test = column datatype rule table table1 table2 table3 table4 table5 table6 table7 table8 table9 table10 table11 +tables_to_test = column datatype rule table table1 table2 table3 table4 table5 table6 table7 table8 \ + table9 table10 table11 sqlite_test: build/valve.db test/src/table.tsv | test/output @echo "Testing valve on sqlite ..." 
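
The guess prototype introduced in the next commit samples its input in two passes: it first counts the data rows, then streams the file and keeps only the rows whose numbers were drawn up front. Simplified from get_random_sample() in scripts/guess.py below, the idea is roughly:

    import csv
    import random

    def sample_rows(path, sample_size):
        # Pass 1: count the data rows (subtract 1 for the header row):
        with open(path, "rb") as f:
            total_rows = sum(1 for _ in f) - 1
        if total_rows <= sample_size:
            chosen = set(range(1, total_rows + 1))
        else:
            chosen = set(random.sample(range(1, total_rows + 1), sample_size))
        # Pass 2: stream the file, keeping only the pre-selected rows:
        with open(path) as f:
            reader = csv.DictReader(f, delimiter="\t", quoting=csv.QUOTE_NONE)
            return [row for i, row in enumerate(reader, start=1) if i in chosen]
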
@@ -118,7 +120,6 @@ pg_random_test: valve clean random_test_data | build test/output guess_test_dir = test/guess_test_data guess_test_db = build/valve_guess.db -.PHONY: guess_test_data $(guess_test_dir)/table1.tsv: test/generate_random_test_data.py valve $(guess_test_dir)/*.tsv ./$< $$(date +"%s") 50000 5 $(guess_test_dir)/table.tsv $(guess_test_dir) @@ -140,7 +141,6 @@ $(guess_test_db): valve guess_test_data $(guess_test_dir)/*.tsv | build $(guess_ perf_test_dir = test/perf_test_data perf_test_db = build/valve_perf.db -.PHONY: perf_test_data $(perf_test_dir)/ontology: mkdir -p $@ @@ -154,16 +154,13 @@ $(perf_test_db): valve perf_test_data $(perf_test_dir)/*.tsv | build $(perf_test rm -f $@ time -p ./$< --verbose $(perf_test_dir)/table.tsv $@ -.PHONY: sqlite_perf_test sqlite_perf_test: build/valve_perf.db | test/output time -p scripts/export.py messages $< $| $(tables_to_test) -.PHONY: pg_perf_test pg_perf_test: valve $(perf_test_dir)/ontology | test/output time -p ./$< --verbose $(perf_test_dir)/table.tsv postgresql:///valve_postgres time -p scripts/export.py messages postgresql:///valve_postgres $| $(tables_to_test) -.PHONY: perf_test perf_test: sqlite_perf_test pg_perf_test clean: From 474c7a597d6bf4f3a9d1b0d0c29dbe17ad19689d Mon Sep 17 00:00:00 2001 From: Michael Cuffaro Date: Thu, 9 Nov 2023 15:37:25 -0500 Subject: [PATCH 07/48] initial version of guess prototype --- scripts/guess.py | 83 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 83 insertions(+) create mode 100755 scripts/guess.py diff --git a/scripts/guess.py b/scripts/guess.py new file mode 100755 index 00000000..4c93e570 --- /dev/null +++ b/scripts/guess.py @@ -0,0 +1,83 @@ +#!/usr/bin/env python3 + +import csv +import random +import re +import sys +import time + +from argparse import ArgumentParser + + +def has_ncolumn(sample, ncolumn): + return bool([label for label in sample if sample[label]["normalized"] == ncolumn]) + + +def get_random_sample(table, sample_size): + # Get the number of rows in the file (we substract 1 for the header row): + with open(table, "rb") as f: + total_rows = sum(1 for _ in f) - 1 + + if total_rows < sample_size: + sample_size = total_rows + + if sample_size == total_rows: + sample_row_numbers = range(1, total_rows + 1) + else: + sample_row_numbers = sorted(random.sample(range(1, total_rows + 1), sample_size)) + with open(table) as f: + rows = csv.DictReader(f, delimiter="\t", quoting=csv.QUOTE_NONE) + sample = {} + for i, row in enumerate(rows, start=1): + if i in sample_row_numbers: + for label, value in row.items(): + if label not in sample: + ncolumn = re.sub(r"[^0-9a-zA-Z_]+", "", label).casefold() + if has_ncolumn(sample, ncolumn): + print( + "The data has more than one column with the normalized name " + f"{ncolumn}" + ) + sys.exit(1) + sample[label] = { + "normalized": ncolumn, + "values": [], + } + sample[label]["values"].append(value) + return sample + + +if __name__ == "__main__": + parser = ArgumentParser(description="VALVE guesser (prototype)") + parser.add_argument( + "--sample_size", + type=int, + default=10000, + help="Sample size to use when guessing (default: 10,000)", + ) + parser.add_argument( + "--error_rate", type=float, default=0.1, help="Proportion of errors expected (default: 10%)" + ) + parser.add_argument( + "--enum_size", + type=int, + default=10, + help="The maximum number of values to use for in(...) 
datatype conditions", + ) + parser.add_argument( + "--seed", type=int, help="Seed to use for random sampling (default: current epoch time)" + ) + parser.add_argument( + "TABLE", help="The name of the .TSV file containing the data for which we will be guessing" + ) + args = parser.parse_args() + + # Use the seed argument, or the epoch time if no seed is given, to set up the random generator: + if args.seed is not None: + seed = args.seed + else: + seed = time.time_ns() + random.seed(seed) + + sample = get_random_sample(args.TABLE, args.sample_size) + print(sample) From d401835f3eb403d52acda614d1b621610d6e2947 Mon Sep 17 00:00:00 2001 From: Michael Cuffaro Date: Thu, 9 Nov 2023 16:50:58 -0500 Subject: [PATCH 08/48] guess nulltype --- scripts/guess.py | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/scripts/guess.py b/scripts/guess.py index 4c93e570..01a2fe85 100755 --- a/scripts/guess.py +++ b/scripts/guess.py @@ -47,6 +47,17 @@ def get_random_sample(table, sample_size): return sample +def annotate(label, sample, error_rate): + def has_nulltype(target): + num_values = len(target["values"]) + num_empties = target["values"].count("") + return num_empties / num_values > error_rate + + target = sample[label] + if has_nulltype(target): + target["nulltype"] = "empty" + + if __name__ == "__main__": parser = ArgumentParser(description="VALVE guesser (prototype)") parser.add_argument( @@ -80,4 +91,12 @@ def get_random_sample(table, sample_size): random.seed(seed) sample = get_random_sample(args.TABLE, args.sample_size) - print(sample) + for label in sample: + annotate(label, sample, args.error_rate) + + # For debugging + for label in sample: + print(f"{label}: ", end="") + for annotation in sample[label]: + print(f"{annotation} ", end="") + print() From 5a6611844567f001899bbb75439ce2dc28fa049d Mon Sep 17 00:00:00 2001 From: Michael Cuffaro Date: Thu, 9 Nov 2023 17:35:30 -0500 Subject: [PATCH 09/48] guess primary/unique --- scripts/guess.py | 35 +++++++++++++++++++++++++++-------- 1 file changed, 27 insertions(+), 8 deletions(-) diff --git a/scripts/guess.py b/scripts/guess.py index 01a2fe85..57b4143b 100755 --- a/scripts/guess.py +++ b/scripts/guess.py @@ -47,15 +47,30 @@ def get_random_sample(table, sample_size): return sample -def annotate(label, sample, error_rate): +def annotate(label, sample, error_rate, is_primary_candidate): def has_nulltype(target): num_values = len(target["values"]) num_empties = target["values"].count("") return num_empties / num_values > error_rate + def has_duplicates(target, ignore_empties): + if ignore_empties: + values = [v for v in target["values"] if v != ""] + else: + values = target["values"] + distinct_values = set(values) + return (len(values) - len(distinct_values)) > (error_rate * len(values)) + target = sample[label] if has_nulltype(target): target["nulltype"] = "empty" + # Since the target has no nulltype (because the previous branch of the if-statement did not + # apply), all empties are assumed to be errors, so we pass True here: + elif not has_duplicates(target, True): + if is_primary_candidate: + target["structure"] = "primary" + else: + target["structure"] = "unique" if __name__ == "__main__": @@ -91,12 +106,16 @@ def has_nulltype(target): random.seed(seed) sample = get_random_sample(args.TABLE, args.sample_size) - for label in sample: - annotate(label, sample, args.error_rate) + for i, label in enumerate(sample): + annotate(label, sample, args.error_rate, i == 0) # For debugging - for label in sample: - 
print(f"{label}: ", end="") - for annotation in sample[label]: - print(f"{annotation} ", end="") - print() + # for label in sample: + # print(f"{label}: ", end="") + # for annotation in sample[label]: + # print(f"{annotation} ", end="") + # print() + + from pprint import pprint + + pprint(sample) From 64502b68f222ebcc43f1b81c5bb3ad5dcb75ff7b Mon Sep 17 00:00:00 2001 From: Michael Cuffaro Date: Thu, 9 Nov 2023 17:44:27 -0500 Subject: [PATCH 10/48] tweak --- scripts/guess.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/scripts/guess.py b/scripts/guess.py index 57b4143b..d4e9ee6c 100755 --- a/scripts/guess.py +++ b/scripts/guess.py @@ -18,10 +18,8 @@ def get_random_sample(table, sample_size): with open(table, "rb") as f: total_rows = sum(1 for _ in f) - 1 - if total_rows < sample_size: + if total_rows <= sample_size: sample_size = total_rows - - if sample_size == total_rows: sample_row_numbers = range(1, total_rows + 1) else: sample_row_numbers = sorted(random.sample(range(1, total_rows + 1), sample_size)) From f02740e49e720c823ef31e40130fc0d4e10529af Mon Sep 17 00:00:00 2001 From: Michael Cuffaro Date: Sat, 11 Nov 2023 10:51:20 -0500 Subject: [PATCH 11/48] annotate datatype (WIP) --- scripts/guess.py | 127 ++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 120 insertions(+), 7 deletions(-) diff --git a/scripts/guess.py b/scripts/guess.py index d4e9ee6c..fc7cd4f5 100755 --- a/scripts/guess.py +++ b/scripts/guess.py @@ -1,13 +1,18 @@ #!/usr/bin/env python3 import csv +import json import random import re +import subprocess import sys import time from argparse import ArgumentParser +# TODO: Remove this import later (used only for debugging): +from pprint import pprint, pformat + def has_ncolumn(sample, ncolumn): return bool([label for label in sample if sample[label]["normalized"] == ncolumn]) @@ -45,7 +50,19 @@ def get_random_sample(table, sample_size): return sample -def annotate(label, sample, error_rate, is_primary_candidate): +def get_valve_config(valve_table): + result = subprocess.run(["./valve", "--dump_config", valve_table], capture_output=True) + if result.returncode != 0: + error = result.stderr.decode() + output = result.stdout.decode() + if output: + error = f"{error}\n{output}" + print(f"{error}", file=sys.stderr) + sys.exit(result.returncode) + return json.loads(result.stdout.decode()) + + +def annotate(label, sample, dt_hierarchy, error_rate, is_primary_candidate): def has_nulltype(target): num_values = len(target["values"]) num_empties = target["values"].count("") @@ -59,6 +76,52 @@ def has_duplicates(target, ignore_empties): distinct_values = set(values) return (len(values) - len(distinct_values)) > (error_rate * len(values)) + def get_datatype(target): + # For each tree in the hierarchy: + # Look for a match with the 0th element and possibly add it to matching_datatypes. + # If there are matches in matching_datatypes: + # Use the tiebreaker rules to find the best match and annotate the target with it. + # Else: + # Try again with the next highest element of each tree (if one exists) + # + # Note that this is guaranteed to work since the get_datatype_hierarchy() function includes + # the 'text' datatype which matches anything. So if no matches are found raise an error. 
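
As an aside from the patch itself, here is a self-contained sketch of the search that this comment describes, with a placeholder matches() predicate standing in for the real condition check. One subtlety worth making explicit: the level index must advance whenever nothing at the current level matches, otherwise the walk would retry the same level forever instead of ascending toward the universal 'text' datatype:

    def search_hierarchy(dt_hierarchy, matches):
        # Index 0 of every list is a leaf datatype; higher indices are its
        # ancestors, ending in the catch-all 'text' datatype:
        curr_index = 0
        while True:
            to_check = [
                dts[curr_index] for dts in dt_hierarchy.values() if len(dts) > curr_index
            ]
            if not to_check:
                raise ValueError("no matching datatype found")
            matching = [dt for dt in to_check if matches(dt)]
            if len(matching) == 1:
                return matching[0]
            if matching:
                return matching  # leave the choice to the tiebreaker rules
            curr_index += 1  # nothing matched: move one level up every tree and retry
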
+ + def is_match(datatype): + # If the datatype has no associated condition then it matches anything: + if not datatype.get("condition"): + return True + # TODO: Replace this with actual code to check if there is a match: + return bool(random.getrandbits(1)) + + def tiebreak(datatypes): + # TODO: Replace this with actual code to implement the tiebreaker rules: + return random.choice(datatypes) + + curr_index = 0 + while True: + matching_datatypes = [] + datatypes_to_check = [] + for dt_name in dt_hierarchy: + if len(dt_hierarchy[dt_name]) > curr_index: + datatypes_to_check.append(dt_hierarchy[dt_name][curr_index]) + if len(datatypes_to_check) == 0: + print(f"Could not find a datatype match for column '{label}'") + sys.exit(1) + + for datatype in datatypes_to_check: + if is_match(datatype): + matching_datatypes.append(datatype) + + if len(matching_datatypes) == 0: + continue + elif len(matching_datatypes) == 1: + return matching_datatypes[0] + else: + return tiebreak(matching_datatypes) + + curr_index += 1 + target = sample[label] if has_nulltype(target): target["nulltype"] = "empty" @@ -70,6 +133,50 @@ def has_duplicates(target, ignore_empties): else: target["structure"] = "unique" + target["datatype"] = get_datatype(target)["datatype"] + + +def get_datatype_hierarchy(config): + """ + Given a VALVE configuration, return a datatype hierarchy that looks like this: + {'dt_name_1': [{'datatype': 'dt_name_1', + 'description': 'a description', + ...}, + {'datatype': 'parent datatype', + 'description': 'a description', + ...}, + {'datatype': 'grandparent datatype', + 'description': 'a description', + ...}, + ...], + 'dt_name_2': etc. + """ + + def get_hierarchy_for_dt(primary_dt_name): + def get_parents(dt_name): + datatypes = [] + if dt_name is not None: + datatype = config["datatype"][dt_name] + if datatype["datatype"] != primary_dt_name: + datatypes.append(datatype) + datatypes += get_parents(datatype.get("parent")) + return datatypes + + return [config["datatype"][primary_dt_name]] + get_parents(primary_dt_name) + + dt_config = config["datatype"] + dt_names = [dt_name for dt_name in dt_config] + leaf_dts = [] + for dt in dt_names: + children = [child for child in dt_names if dt_config[child].get("parent") == dt] + if not children: + leaf_dts.append(dt) + + dt_hierarchy = {} + for leaf_dt in leaf_dts: + dt_hierarchy[leaf_dt] = get_hierarchy_for_dt(leaf_dt) + return dt_hierarchy + if __name__ == "__main__": parser = ArgumentParser(description="VALVE guesser (prototype)") @@ -92,7 +199,10 @@ def has_duplicates(target, ignore_empties): "--seed", type=int, help="Seed to use for random sampling (default: current epoch time)" ) parser.add_argument( - "TABLE", help="The name of the .TSV file containing the data for which we will be guessing" + "VALVE_TABLE", help="The VALVE table table from which to read the VALVE configuration" + ) + parser.add_argument( + "TABLE", help="A .TSV file containing the data for which we will be guessing" ) args = parser.parse_args() @@ -103,17 +213,20 @@ def has_duplicates(target, ignore_empties): seed = time.time_ns() random.seed(seed) + # Get the valve configuration: + config = get_valve_config(args.VALVE_TABLE) + + # Use the valve config to retrieve the valve datatype hierarchy: + dt_hierarchy = get_datatype_hierarchy(config) + sample = get_random_sample(args.TABLE, args.sample_size) for i, label in enumerate(sample): - annotate(label, sample, args.error_rate, i == 0) + annotate(label, sample, dt_hierarchy, args.error_rate, i == 0) + pprint(sample) # For debugging # 
for label in sample: # print(f"{label}: ", end="") # for annotation in sample[label]: # print(f"{annotation} ", end="") # print() - - from pprint import pprint - - pprint(sample) From 5200b5f63796aa3b7be5ff4c305d4bf3026ec604 Mon Sep 17 00:00:00 2001 From: Michael Cuffaro Date: Mon, 13 Nov 2023 09:40:11 -0500 Subject: [PATCH 12/48] add stubs for functions to retrieve from() structures --- scripts/guess.py | 112 +++++++++++++++++++++++++++-------------------- 1 file changed, 64 insertions(+), 48 deletions(-) diff --git a/scripts/guess.py b/scripts/guess.py index fc7cd4f5..8811569e 100755 --- a/scripts/guess.py +++ b/scripts/guess.py @@ -62,7 +62,54 @@ def get_valve_config(valve_table): return json.loads(result.stdout.decode()) -def annotate(label, sample, dt_hierarchy, error_rate, is_primary_candidate): +def get_datatype_hierarchy(config): + """ + Given a VALVE configuration, return a datatype hierarchy that looks like this: + {'dt_name_1': [{'datatype': 'dt_name_1', + 'description': 'a description', + ...}, + {'datatype': 'parent datatype', + 'description': 'a description', + ...}, + {'datatype': 'grandparent datatype', + 'description': 'a description', + ...}, + ...], + 'dt_name_2': etc. + """ + + def get_hierarchy_for_dt(primary_dt_name): + def get_parents(dt_name): + datatypes = [] + if dt_name is not None: + datatype = config["datatype"][dt_name] + if datatype["datatype"] != primary_dt_name: + datatypes.append(datatype) + datatypes += get_parents(datatype.get("parent")) + return datatypes + + return [config["datatype"][primary_dt_name]] + get_parents(primary_dt_name) + + dt_config = config["datatype"] + dt_names = [dt_name for dt_name in dt_config] + leaf_dts = [] + for dt in dt_names: + children = [child for child in dt_names if dt_config[child].get("parent") == dt] + if not children: + leaf_dts.append(dt) + + dt_hierarchy = {} + for leaf_dt in leaf_dts: + dt_hierarchy[leaf_dt] = get_hierarchy_for_dt(leaf_dt) + return dt_hierarchy + + +def get_foreign_column_data(config): + # TODO. + pass + + +def annotate(label, sample, config, error_rate, is_primary_candidate): def has_nulltype(target): num_values = len(target["values"]) num_empties = target["values"].count("") @@ -76,7 +123,7 @@ def has_duplicates(target, ignore_empties): distinct_values = set(values) return (len(values) - len(distinct_values)) > (error_rate * len(values)) - def get_datatype(target): + def get_datatype(target, dt_hierarchy): # For each tree in the hierarchy: # Look for a match with the 0th element and possibly add it to matching_datatypes. # If there are matches in matching_datatypes: @@ -122,6 +169,10 @@ def tiebreak(datatypes): curr_index += 1 + def get_from(target, foreign_column_data): + # TODO. + pass + target = sample[label] if has_nulltype(target): target["nulltype"] = "empty" @@ -133,49 +184,17 @@ def tiebreak(datatypes): else: target["structure"] = "unique" - target["datatype"] = get_datatype(target)["datatype"] - - -def get_datatype_hierarchy(config): - """ - Given a VALVE configuration, return a datatype hierarchy that looks like this: - {'dt_name_1': [{'datatype': 'dt_name_1', - 'description': 'a description', - ...}, - {'datatype': 'parent datatype', - 'description': 'a description', - ...}, - {'datatype': 'grandparent datatype', - 'description': 'a description', - ...}, - ...], - 'dt_name_2': etc. 
- """ - - def get_hierarchy_for_dt(primary_dt_name): - def get_parents(dt_name): - datatypes = [] - if dt_name is not None: - datatype = config["datatype"][dt_name] - if datatype["datatype"] != primary_dt_name: - datatypes.append(datatype) - datatypes += get_parents(datatype.get("parent")) - return datatypes - - return [config["datatype"][primary_dt_name]] + get_parents(primary_dt_name) - - dt_config = config["datatype"] - dt_names = [dt_name for dt_name in dt_config] - leaf_dts = [] - for dt in dt_names: - children = [child for child in dt_names if dt_config[child].get("parent") == dt] - if not children: - leaf_dts.append(dt) + # Use the valve config to retrieve the valve datatype hierarchy: + dt_hierarchy = get_datatype_hierarchy(config) + target["datatype"] = get_datatype(target, dt_hierarchy)["datatype"] - dt_hierarchy = {} - for leaf_dt in leaf_dts: - dt_hierarchy[leaf_dt] = get_hierarchy_for_dt(leaf_dt) - return dt_hierarchy + # TODO: Use the valve config to get a list of columns already loaded to the database, then + # compare the contents of each column with the contents of the target column and possibly + # annotate the target with a from() structure. + foreign_column_data = get_foreign_column_data(config) + from_structure = get_from(target, foreign_column_data) + if from_structure and not target.get("structure"): + target["structure"] = from_structure if __name__ == "__main__": @@ -216,12 +235,9 @@ def get_parents(dt_name): # Get the valve configuration: config = get_valve_config(args.VALVE_TABLE) - # Use the valve config to retrieve the valve datatype hierarchy: - dt_hierarchy = get_datatype_hierarchy(config) - sample = get_random_sample(args.TABLE, args.sample_size) for i, label in enumerate(sample): - annotate(label, sample, dt_hierarchy, args.error_rate, i == 0) + annotate(label, sample, config, args.error_rate, i == 0) pprint(sample) # For debugging From bc7cb2caec01b7f96c3b39af7cdb3e5eca2741fa Mon Sep 17 00:00:00 2001 From: Michael Cuffaro Date: Mon, 13 Nov 2023 09:47:58 -0500 Subject: [PATCH 13/48] small optimization --- scripts/guess.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/scripts/guess.py b/scripts/guess.py index 8811569e..1e208e77 100755 --- a/scripts/guess.py +++ b/scripts/guess.py @@ -191,10 +191,11 @@ def get_from(target, foreign_column_data): # TODO: Use the valve config to get a list of columns already loaded to the database, then # compare the contents of each column with the contents of the target column and possibly # annotate the target with a from() structure. 
- foreign_column_data = get_foreign_column_data(config) - from_structure = get_from(target, foreign_column_data) - if from_structure and not target.get("structure"): - target["structure"] = from_structure + if not target.get("structure"): + foreign_column_data = get_foreign_column_data(config) + from_structure = get_from(target, foreign_column_data) + if from_structure: + target["structure"] = from_structure if __name__ == "__main__": From 6dfac263a84956e8b5a0a92c81a22a90b156bf94 Mon Sep 17 00:00:00 2001 From: Michael Cuffaro Date: Mon, 13 Nov 2023 09:55:31 -0500 Subject: [PATCH 14/48] add db parameter --- scripts/guess.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/scripts/guess.py b/scripts/guess.py index 1e208e77..542444e5 100755 --- a/scripts/guess.py +++ b/scripts/guess.py @@ -221,6 +221,11 @@ def get_from(target, foreign_column_data): parser.add_argument( "VALVE_TABLE", help="The VALVE table table from which to read the VALVE configuration" ) + parser.add_argument( + "DATABASE", + help="""Can be one of (A) A URL of the form `postgresql://...` or + `sqlite://...` (B) The filename (including path) of a sqlite database.""", + ) parser.add_argument( "TABLE", help="A .TSV file containing the data for which we will be guessing" ) @@ -233,8 +238,9 @@ def get_from(target, foreign_column_data): seed = time.time_ns() random.seed(seed) - # Get the valve configuration: + # Get the valve configuration and database info: config = get_valve_config(args.VALVE_TABLE) + config["db"] = args.DATABASE sample = get_random_sample(args.TABLE, args.sample_size) for i, label in enumerate(sample): From 4a83313fba6df4ded14a6624302b7f69a3a2452a Mon Sep 17 00:00:00 2001 From: Michael Cuffaro Date: Mon, 13 Nov 2023 09:59:41 -0500 Subject: [PATCH 15/48] fix unsupported format error in help --- scripts/guess.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/guess.py b/scripts/guess.py index 542444e5..eef31645 100755 --- a/scripts/guess.py +++ b/scripts/guess.py @@ -207,7 +207,7 @@ def get_from(target, foreign_column_data): help="Sample size to use when guessing (default: 10,000)", ) parser.add_argument( - "--error_rate", type=float, default=0.1, help="Proportion of errors expected (default: 10%)" + "--error_rate", type=float, default=0.1, help="Proportion of errors expected (default: 10%%)" ) parser.add_argument( "--enum_size", From c956c59c55475345978e20d0abf386465ea2942b Mon Sep 17 00:00:00 2001 From: Michael Cuffaro Date: Mon, 13 Nov 2023 11:31:37 -0500 Subject: [PATCH 16/48] rename foreign stub --- scripts/guess.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/scripts/guess.py b/scripts/guess.py index eef31645..13be592e 100755 --- a/scripts/guess.py +++ b/scripts/guess.py @@ -104,8 +104,9 @@ def get_parents(dt_name): return dt_hierarchy -def get_foreign_column_data(config): - # TODO. +def get_potential_foreign_columns(config, datatype): + # TODO. Look for primary and unique columns in other tables that have the same SQL type as the + # one associated with the given datatype. pass @@ -192,8 +193,8 @@ def get_from(target, foreign_column_data): # compare the contents of each column with the contents of the target column and possibly # annotate the target with a from() structure. 
if not target.get("structure"): - foreign_column_data = get_foreign_column_data(config) - from_structure = get_from(target, foreign_column_data) + potential_foreign_columns = get_potential_foreign_columns(config, target["datatype"]) + from_structure = get_from(target, potential_foreign_columns) if from_structure: target["structure"] = from_structure @@ -207,7 +208,10 @@ def get_from(target, foreign_column_data): help="Sample size to use when guessing (default: 10,000)", ) parser.add_argument( - "--error_rate", type=float, default=0.1, help="Proportion of errors expected (default: 10%%)" + "--error_rate", + type=float, + default=0.1, + help="Proportion of errors expected (default: 10%%)", ) parser.add_argument( "--enum_size", From 76be3bd0f26350692f44ea68982daf69e9e94e2b Mon Sep 17 00:00:00 2001 From: Michael Cuffaro Date: Mon, 13 Nov 2023 13:33:40 -0500 Subject: [PATCH 17/48] implement datatype guess --- scripts/guess.py | 97 +++++++++++++++++++++++++++---- scripts/guess_grammar.py | 120 +++++++++++++++++++++++++++++++++++++++ scripts/requirements.txt | 1 + 3 files changed, 208 insertions(+), 10 deletions(-) create mode 100644 scripts/guess_grammar.py create mode 100644 scripts/requirements.txt diff --git a/scripts/guess.py b/scripts/guess.py index 13be592e..871c9749 100755 --- a/scripts/guess.py +++ b/scripts/guess.py @@ -8,7 +8,11 @@ import sys import time +from guess_grammar import grammar, TreeToDict, reverse_parse + from argparse import ArgumentParser +from lark import Lark +from lark.exceptions import VisitError # TODO: Remove this import later (used only for debugging): from pprint import pprint, pformat @@ -105,11 +109,58 @@ def get_parents(dt_name): def get_potential_foreign_columns(config, datatype): - # TODO. Look for primary and unique columns in other tables that have the same SQL type as the + # TODO: Look for primary and unique columns in other tables that have the same SQL type as the # one associated with the given datatype. pass +SAVED_CONDITIONS = {} + + +def get_compiled_condition(condition, parser): + global SAVED_CONDITIONS + + if condition in SAVED_CONDITIONS: + return SAVED_CONDITIONS[condition] + + parsed_condition = parser.parse(condition) + if len(parsed_condition) != 1: + print( + f"'{condition}' is invalid. Only one condition per column is allowed.", file=sys.stderr + ) + sys.exit(1) + parsed_condition = parsed_condition[0] + if parsed_condition["type"] == "function" and parsed_condition["name"] == "equals": + expected = re.sub(r"^['\"](.*)['\"]$", r"\1", parsed_condition["args"][0]["value"]) + compiled_condition = lambda x: x == expected + elif parsed_condition["type"] == "function" and parsed_condition["name"] in ( + "exclude", + "match", + "search", + ): + pattern = re.sub(r"^['\"](.*)['\"]$", r"\1", parsed_condition["args"][0]["pattern"]) + flags = parsed_condition["args"][0]["flags"] + flags = "(?" 
+ "".join(flags) + ")" if flags else "" + pattern = re.compile(flags + pattern) + if parsed_condition["name"] == "exclude": + compiled_condition = lambda x: not bool(pattern.search(x)) + elif parsed_condition["name"] == "match": + compiled_condition = lambda x: bool(pattern.fullmatch(x)) + else: + compiled_condition = lambda x: bool(pattern.search(x)) + elif parsed_condition["type"] == "function" and parsed_condition["name"] == "in": + alternatives = [ + re.sub(r"^['\"](.*)['\"]$", r"\1", arg["value"]) for arg in parsed_condition["args"] + ] + compiled_condition = lambda x: x in alternatives + else: + print(f"Unrecognized condition: {condition}", file=sys.stderr) + sys.exit(1) + + SAVED_CONDITIONS[condition] = compiled_condition + return compiled_condition + + def annotate(label, sample, config, error_rate, is_primary_candidate): def has_nulltype(target): num_values = len(target["values"]) @@ -139,12 +190,26 @@ def is_match(datatype): # If the datatype has no associated condition then it matches anything: if not datatype.get("condition"): return True - # TODO: Replace this with actual code to check if there is a match: - return bool(random.getrandbits(1)) + + condition = get_compiled_condition(datatype["condition"], config["parser"]) + num_values = len(target["values"]) + num_passed = [condition(v) for v in target["values"]].count(True) + success_rate = num_passed / num_values + if (1 - success_rate) <= error_rate: + return success_rate def tiebreak(datatypes): - # TODO: Replace this with actual code to implement the tiebreaker rules: - return random.choice(datatypes) + in_types = [] + other_types = [] + for dt in datatypes: + if dt["datatype"]["condition"].startswith("in("): + in_types.append(dt) + else: + other_types.append(dt) + sorted_types = sorted(in_types, key=lambda k: k["success_rate"], reverse=True) + sorted( + other_types, key=lambda k: k["success_rate"], reverse=True + ) + return sorted_types[0]["datatype"] curr_index = 0 while True: @@ -158,20 +223,28 @@ def tiebreak(datatypes): sys.exit(1) for datatype in datatypes_to_check: - if is_match(datatype): - matching_datatypes.append(datatype) + success_rate = is_match(datatype) + if success_rate: + matching_datatypes.append( + { + "datatype": datatype, + "success_rate": success_rate, + } + ) if len(matching_datatypes) == 0: continue elif len(matching_datatypes) == 1: - return matching_datatypes[0] + return matching_datatypes[0]["datatype"] else: return tiebreak(matching_datatypes) curr_index += 1 - def get_from(target, foreign_column_data): - # TODO. + def get_from(target, potential_foreign_columns): + # TODO: If there is one and only potential foreign column that matches the target, return + # it. If there are none, return None. If there is more than one, then also return None, but + # print the potential matches to STDOUT. 
pass target = sample[label] @@ -246,11 +319,15 @@ def get_from(target, foreign_column_data): config = get_valve_config(args.VALVE_TABLE) config["db"] = args.DATABASE + # Attach the condition parser to the config as well: + config["parser"] = Lark(grammar, parser="lalr", transformer=TreeToDict()) + sample = get_random_sample(args.TABLE, args.sample_size) for i, label in enumerate(sample): annotate(label, sample, config, args.error_rate, i == 0) pprint(sample) + # For debugging # for label in sample: # print(f"{label}: ", end="") diff --git a/scripts/guess_grammar.py b/scripts/guess_grammar.py new file mode 100644 index 00000000..5e611cfb --- /dev/null +++ b/scripts/guess_grammar.py @@ -0,0 +1,120 @@ +from lark import Transformer + +# Grammar used to parse the the contents of `condition` and `structure` columns. +# See: https://lark-parser.readthedocs.io/en/latest/index.html# +grammar = r""" +%import common.WS +%ignore WS + +start: expression+ +?expression: string | function + +?string: label +label: ALPHANUM | DQSTRING | SQSTRING + +function: function_name "(" arguments ")" +function_name: ALPHANUM +arguments: argument ("," argument)* +?argument: string | field | function | named_arg | regex +field: label "." label +named_arg: label "=" label + +?regex: regex_sub | regex_match +regex_match: "/" regex_pattern "/" regex_flags +regex_sub: SUB_BEGIN "/" regex_pattern "/" regex_pattern "/" regex_flags +regex_pattern: REGEX_WITH_FORWARD_SLASH | REGEX_WITHOUT_FORWARD_SLASH +regex_flags: LOWER_ALPHA* + +SUB_BEGIN: "s" +ALPHANUM: /[a-zA-Z0-9-_]/+ +DQSTRING: "\"" /[^"](\\\")?/* "\"" +SQSTRING: "'" /[^'](\\\')?/* "'" +LOWER_ALPHA: /[a-z]/ +NO_SLASH: /[^\/]/ +REGEX_WITH_FORWARD_SLASH: NO_SLASH* "\\/" NO_SLASH* +REGEX_WITHOUT_FORWARD_SLASH: NO_SLASH+ +""" + + +class TreeToDict(Transformer): + """Transformer to convert a Tree, generated by the grammar used by CMI-PB to parse the contents + of `condition` and `structure` columns, into a list of expressions represented as dicts.""" + + def _sanity_check(self, token_list, expected_len): + if len(token_list) != expected_len: + raise Exception(f"Wrong number of tokens in: {token_list} (expecting {expected_len})") + + def label(self, label): + self._sanity_check(label, 1) + label = label[0] + return {"type": "label", "value": label.value} + + def field(self, field): + self._sanity_check(field, 2) + return {"type": "field", "table": field[0]["value"], "column": field[1]["value"]} + + def named_arg(self, named_arg): + self._sanity_check(named_arg, 2) + return {"type": "named_arg", "key": named_arg[0]["value"], "value": named_arg[1]["value"]} + + def regex_match(self, regex_match): + self._sanity_check(regex_match, 2) + return {"type": "regex", "pattern": regex_match[0], "flags": regex_match[1]} + + def regex_sub(self, regex_sub): + self._sanity_check(regex_sub, 4) + return { + "type": "regex", + "pattern": regex_sub[1], + "replace": regex_sub[2], + "flags": regex_sub[3], + } + + def regex_pattern(self, regex_pattern): + self._sanity_check(regex_pattern, 1) + return regex_pattern[0].value + + def regex_flags(self, flags): + return [flag.value for flag in flags] + + def arguments(self, arguments): + return arguments + + def function_name(self, function_name): + self._sanity_check(function_name, 1) + return function_name[0].value + + def function(self, function): + self._sanity_check(function, 2) + return {"type": "function", "name": function[0], "args": function[1]} + + def start(self, start): + return start + + +def reverse_parse(config, parsed_cond): + """Given 
a config map and a parsed condition, return the text version of the condition.""" + cond_type = parsed_cond["type"] + text_cond = None + if cond_type == "label": + if config["datatype"].get(parsed_cond["value"]): + text_cond = config["datatype"][parsed_cond["value"]]["datatype"] + else: + text_cond = "'{}'".format(parsed_cond["value"]) + elif cond_type == "field": + return "{}.{}".format(parsed_cond["table"], parsed_cond["column"]) + elif cond_type == "named_arg": + text_cond = "{}={}".format(parsed_cond["key"], parsed_cond["value"]) + elif cond_type == "regex": + pattern = parsed_cond["pattern"] + flags = "".join(parsed_cond["flags"]) + replace = parsed_cond.get("replace") + text_cond = f"/{pattern}/{flags}" if not replace else f"s/{pattern}/{replace}/{flags}" + elif cond_type == "function": + text_cond = map(lambda arg: reverse_parse(config, arg), parsed_cond["args"]) + text_cond = ", ".join(text_cond) + text_cond = "{}({})".format(parsed_cond["name"], text_cond) + else: + raise Exception(f"Unknown parsed_cond type: {cond_type} for {parsed_cond}") + + return text_cond diff --git a/scripts/requirements.txt b/scripts/requirements.txt new file mode 100644 index 00000000..9547a85f --- /dev/null +++ b/scripts/requirements.txt @@ -0,0 +1 @@ +lark==1.1.8 From a2f64eec77f2d4152ee0f56445957f8d20dc1786 Mon Sep 17 00:00:00 2001 From: Michael Cuffaro Date: Wed, 15 Nov 2023 10:32:51 -0500 Subject: [PATCH 18/48] call lstrip() on in() conditions --- scripts/guess.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/scripts/guess.py b/scripts/guess.py index 871c9749..75123b6e 100755 --- a/scripts/guess.py +++ b/scripts/guess.py @@ -8,11 +8,10 @@ import sys import time -from guess_grammar import grammar, TreeToDict, reverse_parse +from guess_grammar import grammar, TreeToDict from argparse import ArgumentParser from lark import Lark -from lark.exceptions import VisitError # TODO: Remove this import later (used only for debugging): from pprint import pprint, pformat @@ -202,7 +201,7 @@ def tiebreak(datatypes): in_types = [] other_types = [] for dt in datatypes: - if dt["datatype"]["condition"].startswith("in("): + if dt["datatype"]["condition"].lstrip().startswith("in("): in_types.append(dt) else: other_types.append(dt) From 2a4db64a6922daf147b511d85aac3ffecd34c2dd Mon Sep 17 00:00:00 2001 From: Michael Cuffaro Date: Wed, 15 Nov 2023 11:56:31 -0500 Subject: [PATCH 19/48] implement get_potential_foreign_columns() --- scripts/guess.py | 44 ++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 40 insertions(+), 4 deletions(-) diff --git a/scripts/guess.py b/scripts/guess.py index 75123b6e..ba05c240 100755 --- a/scripts/guess.py +++ b/scripts/guess.py @@ -17,6 +17,9 @@ from pprint import pprint, pformat +SPECIAL_TABLES = ["table", "column", "datatype", "rule", "history", "message"] + + def has_ncolumn(sample, ncolumn): return bool([label for label in sample if sample[label]["normalized"] == ncolumn]) @@ -107,10 +110,39 @@ def get_parents(dt_name): return dt_hierarchy +def get_sql_type(config, datatype): + """Given the config map and the name of a datatype, climb the datatype tree (as required), + and return the first 'SQL type' found.""" + if "datatype" not in config: + print("Missing datatypes in config") + sys.exit(1) + if datatype not in config["datatype"]: + return None + if config["datatype"][datatype].get("SQL type"): + return config["datatype"][datatype]["SQL type"] + return get_sql_type(config, config["datatype"][datatype].get("parent")) + + def 
get_potential_foreign_columns(config, datatype): - # TODO: Look for primary and unique columns in other tables that have the same SQL type as the - # one associated with the given datatype. - pass + global SPECIAL_TABLES + + def get_coarser_sql_type(datatype): + sql_type = get_sql_type(config, datatype) + if sql_type not in ["integer", "numeric", "real"]: + return "text" + else: + return sql_type.casefold() + + potential_foreign_columns = [] + this_sql_type = get_coarser_sql_type(datatype) + for table, table_config in config["table"].items(): + if table not in SPECIAL_TABLES: + for column, column_config in table_config["column"].items(): + if column_config.get("structure") in ["primary", "unique"]: + foreign_sql_type = get_coarser_sql_type(column_config["datatype"]) + if foreign_sql_type == this_sql_type: + potential_foreign_columns.append({"table": table, "column": column}) + return potential_foreign_columns SAVED_CONDITIONS = {} @@ -266,6 +298,7 @@ def get_from(target, potential_foreign_columns): # annotate the target with a from() structure. if not target.get("structure"): potential_foreign_columns = get_potential_foreign_columns(config, target["datatype"]) + pprint(potential_foreign_columns) from_structure = get_from(target, potential_foreign_columns) if from_structure: target["structure"] = from_structure @@ -316,6 +349,9 @@ def get_from(target, potential_foreign_columns): # Get the valve configuration and database info: config = get_valve_config(args.VALVE_TABLE) + if args.TABLE.removesuffix(".tsv") in config["table"]: + print(f"{args.TABLE.removesuffix('.tsv')} is already configured.", file=sys.stderr) + sys.exit(0) config["db"] = args.DATABASE # Attach the condition parser to the config as well: @@ -325,7 +361,7 @@ def get_from(target, potential_foreign_columns): for i, label in enumerate(sample): annotate(label, sample, config, args.error_rate, i == 0) - pprint(sample) + # pprint(sample) # For debugging # for label in sample: From e8e163a5e87b1e714b73d283be46ba964f0c03c7 Mon Sep 17 00:00:00 2001 From: Michael Cuffaro Date: Wed, 15 Nov 2023 14:54:08 -0500 Subject: [PATCH 20/48] implement get_froms() --- scripts/guess.py | 59 ++++++++++++++++++++++++++++++++++++------------ 1 file changed, 44 insertions(+), 15 deletions(-) diff --git a/scripts/guess.py b/scripts/guess.py index ba05c240..0fdc622d 100755 --- a/scripts/guess.py +++ b/scripts/guess.py @@ -4,6 +4,7 @@ import json import random import re +import sqlite3 import subprocess import sys import time @@ -12,8 +13,7 @@ from argparse import ArgumentParser from lark import Lark - -# TODO: Remove this import later (used only for debugging): +from numbers import Number from pprint import pprint, pformat @@ -141,7 +141,13 @@ def get_coarser_sql_type(datatype): if column_config.get("structure") in ["primary", "unique"]: foreign_sql_type = get_coarser_sql_type(column_config["datatype"]) if foreign_sql_type == this_sql_type: - potential_foreign_columns.append({"table": table, "column": column}) + potential_foreign_columns.append( + { + "table": table, + "column": column, + "sql_type": foreign_sql_type, + } + ) return potential_foreign_columns @@ -273,10 +279,31 @@ def tiebreak(datatypes): curr_index += 1 def get_from(target, potential_foreign_columns): - # TODO: If there is one and only potential foreign column that matches the target, return - # it. If there are none, return None. If there is more than one, then also return None, but - # print the potential matches to STDOUT. 
- pass + candidate_froms = [] + for foreign in potential_foreign_columns: + table = foreign["table"] + column = foreign["column"] + sql_type = foreign["sql_type"] + num_matches = 0 + num_values = len(target["values"]) + for value in target["values"]: + if target.get("nulltype") == "empty" and value == "": + # If this value is legitimately empty then it should not be taken into account + # when counting the number of values in the target that are found in the + # candidate foreign column: + num_values -= 1 + continue + if sql_type != "text" and not isinstance(value, Number): + # If this value is of the wrong type then there is no need to explicitly check + # if it exists in the foreign column: + continue + if sql_type == "text": + value = f"'{value}'" + sql = f'SELECT 1 FROM "{table}" WHERE "{column}" = {value} LIMIT 1' + num_matches += len(config["db"].execute(sql).fetchall()) + if ((num_values - num_matches) / num_values) < error_rate: + candidate_froms.append(foreign) + return candidate_froms target = sample[label] if has_nulltype(target): @@ -293,15 +320,16 @@ def get_from(target, potential_foreign_columns): dt_hierarchy = get_datatype_hierarchy(config) target["datatype"] = get_datatype(target, dt_hierarchy)["datatype"] - # TODO: Use the valve config to get a list of columns already loaded to the database, then - # compare the contents of each column with the contents of the target column and possibly - # annotate the target with a from() structure. + # Use the valve config to get a list of columns already loaded to the database, then compare + # the contents of each column with the contents of the target column and possibly annotate the + # target with a from() structure, if there is one and only one candidate from(). if not target.get("structure"): potential_foreign_columns = get_potential_foreign_columns(config, target["datatype"]) - pprint(potential_foreign_columns) - from_structure = get_from(target, potential_foreign_columns) - if from_structure: - target["structure"] = from_structure + froms = get_from(target, potential_foreign_columns) + if len(froms) == 1: + target["structure"] = froms[0] + elif len(froms) > 1: + print(f"Column '{label}' has multiple from() candidates: {pformat(froms)}") if __name__ == "__main__": @@ -352,7 +380,8 @@ def get_from(target, potential_foreign_columns): if args.TABLE.removesuffix(".tsv") in config["table"]: print(f"{args.TABLE.removesuffix('.tsv')} is already configured.", file=sys.stderr) sys.exit(0) - config["db"] = args.DATABASE + with sqlite3.connect(args.DATABASE) as conn: + config["db"] = conn # Attach the condition parser to the config as well: config["parser"] = Lark(grammar, parser="lalr", transformer=TreeToDict()) From 43e48ec29b425dc14ca83fdbac1f98d190431acf Mon Sep 17 00:00:00 2001 From: Michael Cuffaro Date: Wed, 15 Nov 2023 16:01:20 -0500 Subject: [PATCH 21/48] do froms before uniques --- Makefile | 6 +++--- scripts/guess.py | 40 +++++++++++++++++++++------------------- 2 files changed, 24 insertions(+), 22 deletions(-) diff --git a/Makefile b/Makefile index 8e53c26a..5832e279 100644 --- a/Makefile +++ b/Makefile @@ -122,7 +122,7 @@ guess_test_dir = test/guess_test_data guess_test_db = build/valve_guess.db $(guess_test_dir)/table1.tsv: test/generate_random_test_data.py valve $(guess_test_dir)/*.tsv - ./$< $$(date +"%s") 50000 5 $(guess_test_dir)/table.tsv $(guess_test_dir) + ./$< 0 30000 5 $(guess_test_dir)/table.tsv $(guess_test_dir) $(guess_test_dir)/ontology: mkdir -p $@ @@ -130,9 +130,9 @@ $(guess_test_dir)/ontology: 
guess_test_data: test/generate_random_test_data.py $(guess_test_dir)/table1.tsv valve confirm_overwrite.sh $(guess_test_dir)/*.tsv | $(guess_test_dir)/ontology ./confirm_overwrite.sh $(guess_test_dir)/ontology rm -f $(guess_test_dir)/table1.tsv - ./$< $$(date +"%s") 50000 5 $(guess_test_dir)/table.tsv $(guess_test_dir) + ./$< 0 30000 5 $(guess_test_dir)/table.tsv $(guess_test_dir) rm -f $(guess_test_dir)/ontology/*.tsv - ./$< $$(date +"%s") 50000 5 $(guess_test_dir)/table_expected.tsv $| + ./$< 0 30000 5 $(guess_test_dir)/table_expected.tsv $| rm -f $(guess_test_dir)/ontology/table1.tsv $(guess_test_db): valve guess_test_data $(guess_test_dir)/*.tsv | build $(guess_test_dir)/ontology diff --git a/scripts/guess.py b/scripts/guess.py index 0fdc622d..cb416a91 100755 --- a/scripts/guess.py +++ b/scripts/guess.py @@ -308,13 +308,6 @@ def get_from(target, potential_foreign_columns): target = sample[label] if has_nulltype(target): target["nulltype"] = "empty" - # Since the target has no nulltype (because the previous branch of the if-statement did not - # apply), all empties are assumed to be errors, so we pass True here: - elif not has_duplicates(target, True): - if is_primary_candidate: - target["structure"] = "primary" - else: - target["structure"] = "unique" # Use the valve config to retrieve the valve datatype hierarchy: dt_hierarchy = get_datatype_hierarchy(config) @@ -323,13 +316,20 @@ def get_from(target, potential_foreign_columns): # Use the valve config to get a list of columns already loaded to the database, then compare # the contents of each column with the contents of the target column and possibly annotate the # target with a from() structure, if there is one and only one candidate from(). + potential_foreign_columns = get_potential_foreign_columns(config, target["datatype"]) + froms = get_from(target, potential_foreign_columns) + if len(froms) == 1: + target["structure"] = froms[0] + elif len(froms) > 1: + print(f"Column '{label}' has multiple from() candidates: {pformat(froms)}") + + # Check if the column is a unique/primary column: if not target.get("structure"): - potential_foreign_columns = get_potential_foreign_columns(config, target["datatype"]) - froms = get_from(target, potential_foreign_columns) - if len(froms) == 1: - target["structure"] = froms[0] - elif len(froms) > 1: - print(f"Column '{label}' has multiple from() candidates: {pformat(froms)}") + if target.get("nulltype") is None and not has_duplicates(target, True): + if is_primary_candidate: + target["structure"] = "primary" + else: + target["structure"] = "unique" if __name__ == "__main__": @@ -390,11 +390,13 @@ def get_from(target, potential_foreign_columns): for i, label in enumerate(sample): annotate(label, sample, config, args.error_rate, i == 0) + # For debugging: # pprint(sample) - # For debugging - # for label in sample: - # print(f"{label}: ", end="") - # for annotation in sample[label]: - # print(f"{annotation} ", end="") - # print() + # For debugging without values: + for label in sample: + print(f"{label}: ", end="") + for annotation, data in sample[label].items(): + if annotation != "values": + print(f"{annotation}: {data}, ", end="") + print() From d6e286238433821b9d2be877af690ef2fdea53ed Mon Sep 17 00:00:00 2001 From: Michael Cuffaro Date: Thu, 16 Nov 2023 10:52:44 -0500 Subject: [PATCH 22/48] textify from structures --- scripts/guess.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/scripts/guess.py b/scripts/guess.py index cb416a91..4fe84331 100755 --- a/scripts/guess.py +++ 
b/scripts/guess.py @@ -14,7 +14,6 @@ from argparse import ArgumentParser from lark import Lark from numbers import Number -from pprint import pprint, pformat SPECIAL_TABLES = ["table", "column", "datatype", "rule", "history", "message"] @@ -302,7 +301,7 @@ def get_from(target, potential_foreign_columns): sql = f'SELECT 1 FROM "{table}" WHERE "{column}" = {value} LIMIT 1' num_matches += len(config["db"].execute(sql).fetchall()) if ((num_values - num_matches) / num_values) < error_rate: - candidate_froms.append(foreign) + candidate_froms.append(f"from({foreign['table']}.{foreign['column']})") return candidate_froms target = sample[label] @@ -321,7 +320,7 @@ def get_from(target, potential_foreign_columns): if len(froms) == 1: target["structure"] = froms[0] elif len(froms) > 1: - print(f"Column '{label}' has multiple from() candidates: {pformat(froms)}") + print(f"Column '{label}' has multiple from() candidates: {', '.join(froms)}") # Check if the column is a unique/primary column: if not target.get("structure"): From 70cc3d7d8e4bb412b7c9e3c3f95a2885a2590828 Mon Sep 17 00:00:00 2001 From: Michael Cuffaro Date: Thu, 16 Nov 2023 12:07:40 -0500 Subject: [PATCH 23/48] fix bugs that causes infinite loop and that attempt to dereference a NoneType --- scripts/guess.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/guess.py b/scripts/guess.py index 4fe84331..e619ca43 100755 --- a/scripts/guess.py +++ b/scripts/guess.py @@ -238,7 +238,7 @@ def tiebreak(datatypes): in_types = [] other_types = [] for dt in datatypes: - if dt["datatype"]["condition"].lstrip().startswith("in("): + if dt["datatype"].get("condition", "").lstrip().startswith("in("): in_types.append(dt) else: other_types.append(dt) @@ -269,6 +269,7 @@ def tiebreak(datatypes): ) if len(matching_datatypes) == 0: + curr_index += 1 continue elif len(matching_datatypes) == 1: return matching_datatypes[0]["datatype"] From 0554aa609181b164997a405c19f6b44fa8ee1d97 Mon Sep 17 00:00:00 2001 From: Michael Cuffaro Date: Thu, 16 Nov 2023 14:50:08 -0500 Subject: [PATCH 24/48] optimize sampling --- scripts/guess.py | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/scripts/guess.py b/scripts/guess.py index e619ca43..df6a066c 100755 --- a/scripts/guess.py +++ b/scripts/guess.py @@ -32,26 +32,27 @@ def get_random_sample(table, sample_size): sample_size = total_rows sample_row_numbers = range(1, total_rows + 1) else: - sample_row_numbers = sorted(random.sample(range(1, total_rows + 1), sample_size)) + sample_row_numbers = random.sample(range(1, total_rows + 1), sample_size) with open(table) as f: rows = csv.DictReader(f, delimiter="\t", quoting=csv.QUOTE_NONE) + rows = [r for r in rows] sample = {} - for i, row in enumerate(rows, start=1): - if i in sample_row_numbers: - for label, value in row.items(): - if label not in sample: - ncolumn = re.sub(r"[^0-9a-zA-Z_]+", "", label).casefold() - if has_ncolumn(sample, ncolumn): - print( - "The data has more than one column with the normalized name " - f"{ncolumn}" - ) - sys.exit(1) - sample[label] = { - "normalized": ncolumn, - "values": [], - } - sample[label]["values"].append(value) + pattern = re.compile(r"[^0-9a-zA-Z_]+") + for i in sample_row_numbers: + for label, value in rows[i].items(): + if label not in sample: + ncolumn = re.sub(pattern, "", label).casefold() + if has_ncolumn(sample, ncolumn): + print( + "The data has more than one column with the normalized name " + f"{ncolumn}" + ) + sys.exit(1) + sample[label] = { + 
"normalized": ncolumn, + "values": [], + } + sample[label]["values"].append(value) return sample From 81d973a66c24de1aae49ff4e8a3425a114ec157b Mon Sep 17 00:00:00 2001 From: Michael Cuffaro Date: Thu, 16 Nov 2023 15:01:51 -0500 Subject: [PATCH 25/48] tweak --- scripts/guess.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/scripts/guess.py b/scripts/guess.py index df6a066c..6f762032 100755 --- a/scripts/guess.py +++ b/scripts/guess.py @@ -30,12 +30,11 @@ def get_random_sample(table, sample_size): if total_rows <= sample_size: sample_size = total_rows - sample_row_numbers = range(1, total_rows + 1) + sample_row_numbers = range(0, total_rows) else: - sample_row_numbers = random.sample(range(1, total_rows + 1), sample_size) + sample_row_numbers = random.sample(range(0, total_rows), sample_size) with open(table) as f: - rows = csv.DictReader(f, delimiter="\t", quoting=csv.QUOTE_NONE) - rows = [r for r in rows] + rows = [r for r in csv.DictReader(f, delimiter="\t", quoting=csv.QUOTE_NONE)] sample = {} pattern = re.compile(r"[^0-9a-zA-Z_]+") for i in sample_row_numbers: From ea0630ae74265d8050f1fee27d8f26329a3ad620 Mon Sep 17 00:00:00 2001 From: Michael Cuffaro Date: Thu, 16 Nov 2023 16:04:59 -0500 Subject: [PATCH 26/48] also sort datatypes by depth --- scripts/guess.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/scripts/guess.py b/scripts/guess.py index 6f762032..488cfa59 100755 --- a/scripts/guess.py +++ b/scripts/guess.py @@ -242,9 +242,9 @@ def tiebreak(datatypes): in_types.append(dt) else: other_types.append(dt) - sorted_types = sorted(in_types, key=lambda k: k["success_rate"], reverse=True) + sorted( - other_types, key=lambda k: k["success_rate"], reverse=True - ) + sorted_types = sorted( + in_types, key=lambda k: (k["depth"], k["success_rate"]), reverse=True + ) + sorted(other_types, key=lambda k: (k["depth"], k["success_rate"]), reverse=True) return sorted_types[0]["datatype"] curr_index = 0 @@ -264,6 +264,7 @@ def tiebreak(datatypes): matching_datatypes.append( { "datatype": datatype, + "depth": curr_index, "success_rate": success_rate, } ) From 14e00a094c5879e10a65a58bfdb28d585efaf8ec Mon Sep 17 00:00:00 2001 From: Michael Cuffaro Date: Thu, 16 Nov 2023 16:17:51 -0500 Subject: [PATCH 27/48] don't duplicate datatype check --- scripts/guess.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/scripts/guess.py b/scripts/guess.py index 488cfa59..3fd484e9 100755 --- a/scripts/guess.py +++ b/scripts/guess.py @@ -252,7 +252,10 @@ def tiebreak(datatypes): matching_datatypes = [] datatypes_to_check = [] for dt_name in dt_hierarchy: - if len(dt_hierarchy[dt_name]) > curr_index: + if ( + len(dt_hierarchy[dt_name]) > curr_index + and dt_hierarchy[dt_name][curr_index] not in datatypes_to_check + ): datatypes_to_check.append(dt_hierarchy[dt_name][curr_index]) if len(datatypes_to_check) == 0: print(f"Could not find a datatype match for column '{label}'") From 2e12ddaafd41d35180674080c53f5f56d0810395 Mon Sep 17 00:00:00 2001 From: Michael Cuffaro Date: Thu, 16 Nov 2023 16:45:36 -0500 Subject: [PATCH 28/48] make get_hierarchy_for_dt() an outer function --- scripts/guess.py | 36 ++++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/scripts/guess.py b/scripts/guess.py index 3fd484e9..b1af55bc 100755 --- a/scripts/guess.py +++ b/scripts/guess.py @@ -67,6 +67,19 @@ def get_valve_config(valve_table): return json.loads(result.stdout.decode()) +def get_hierarchy_for_dt(config, 
primary_dt_name): + def get_parents(dt_name): + datatypes = [] + if dt_name is not None: + datatype = config["datatype"][dt_name] + if datatype["datatype"] != primary_dt_name: + datatypes.append(datatype) + datatypes += get_parents(datatype.get("parent")) + return datatypes + + return [config["datatype"][primary_dt_name]] + get_parents(primary_dt_name) + + def get_datatype_hierarchy(config): """ Given a VALVE configuration, return a datatype hierarchy that looks like this: @@ -82,19 +95,6 @@ def get_datatype_hierarchy(config): ...], 'dt_name_2': etc. """ - - def get_hierarchy_for_dt(primary_dt_name): - def get_parents(dt_name): - datatypes = [] - if dt_name is not None: - datatype = config["datatype"][dt_name] - if datatype["datatype"] != primary_dt_name: - datatypes.append(datatype) - datatypes += get_parents(datatype.get("parent")) - return datatypes - - return [config["datatype"][primary_dt_name]] + get_parents(primary_dt_name) - dt_config = config["datatype"] dt_names = [dt_name for dt_name in dt_config] leaf_dts = [] @@ -105,7 +105,7 @@ def get_parents(dt_name): dt_hierarchy = {} for leaf_dt in leaf_dts: - dt_hierarchy[leaf_dt] = get_hierarchy_for_dt(leaf_dt) + dt_hierarchy[leaf_dt] = get_hierarchy_for_dt(config, leaf_dt) return dt_hierarchy @@ -235,6 +235,14 @@ def is_match(datatype): return success_rate def tiebreak(datatypes): + # TODO: There is a problem with this algorithm, since it implicitly assumes that if two + # datatypes are of the same depth, then neither can be a parent of the other. But this + # is false. We could have, for example, + # leaf_1 -> non_space -> trimmed_line + # leaf_2 -> word -> non_space -> trimmed_line + # Even though non-space is a parent of word, the algorithm classifies both as depth 1. + # We need to have another check in this function to determine whether there are any + # parent-child dependencies between the datatypes in the tiebreaker list. in_types = [] other_types = [] for dt in datatypes: From 318da294400811edf4958fd2b075bf673a0b7b70 Mon Sep 17 00:00:00 2001 From: Michael Cuffaro Date: Sun, 19 Nov 2023 20:10:54 -0500 Subject: [PATCH 29/48] redesign algorithm for get_datatype() --- scripts/guess.py | 160 ++++++++++++++++++++++++----------------------- 1 file changed, 82 insertions(+), 78 deletions(-) diff --git a/scripts/guess.py b/scripts/guess.py index b1af55bc..c0c9f4d8 100755 --- a/scripts/guess.py +++ b/scripts/guess.py @@ -9,11 +9,13 @@ import sys import time +from copy import deepcopy from guess_grammar import grammar, TreeToDict from argparse import ArgumentParser from lark import Lark from numbers import Number +from pprint import pformat SPECIAL_TABLES = ["table", "column", "datatype", "rule", "history", "message"] @@ -80,33 +82,62 @@ def get_parents(dt_name): return [config["datatype"][primary_dt_name]] + get_parents(primary_dt_name) -def get_datatype_hierarchy(config): +def get_dt_hierarchies(config): """ Given a VALVE configuration, return a datatype hierarchy that looks like this: - {'dt_name_1': [{'datatype': 'dt_name_1', - 'description': 'a description', - ...}, - {'datatype': 'parent datatype', - 'description': 'a description', - ...}, - {'datatype': 'grandparent datatype', - 'description': 'a description', - ...}, - ...], - 'dt_name_2': etc. + {0: {'dt_name_1': [{'datatype': 'dt_name_1', + 'description': 'a description', + ...}, + {'datatype': 'parent datatype', + 'description': 'a description', + ...}, + {'datatype': 'grandparent datatype', + 'description': 'a description', + ...}, + ...], + 'dt_name_2': etc.}, + 1: ... 
etc.} """ + + def get_higher_datatypes(datatype_hierarchies, universals, depth): + current_datatypes = [dt_name for dt_name in datatype_hierarchies.get(depth, [])] + higher_datatypes = {} + if current_datatypes: + universals = [dt_name for dt_name in universals] + lower_datatypes = [] + for i in range(0, depth): + lower_datatypes += [dt_name for dt_name in datatype_hierarchies.get(i, [])] + for dt_name in dt_hierarchies[depth]: + dt_hierarchy = dt_hierarchies[depth][dt_name] + if len(dt_hierarchy) > 1: + parent_hierarchy = dt_hierarchy[1:] + parent = parent_hierarchy[0]["datatype"] + if parent not in current_datatypes + lower_datatypes + universals: + higher_datatypes[parent] = parent_hierarchy + return higher_datatypes + dt_config = config["datatype"] dt_names = [dt_name for dt_name in dt_config] - leaf_dts = [] - for dt in dt_names: - children = [child for child in dt_names if dt_config[child].get("parent") == dt] + dt_hierarchies = {0: {}} + universals = {} + for dt_name in dt_names: + # Add all the leaf datatypes to dt_hierarchies at 0 depth: + children = [child for child in dt_names if dt_config[child].get("parent") == dt_name] if not children: - leaf_dts.append(dt) - - dt_hierarchy = {} - for leaf_dt in leaf_dts: - dt_hierarchy[leaf_dt] = get_hierarchy_for_dt(config, leaf_dt) - return dt_hierarchy + dt_hierarchies[0][dt_name] = get_hierarchy_for_dt(config, dt_name) + # Ungrounded and unconditioned datatypes go into the universals category, which are added + # to the top of dt_hierarchies later: + elif not dt_config[dt_name].get("parent") or not dt_config[dt_name].get("condition"): + universals[dt_name] = get_hierarchy_for_dt(config, dt_name) + + depth = 0 + higher_dts = get_higher_datatypes(dt_hierarchies, universals, depth) + while higher_dts: + depth += 1 + dt_hierarchies[depth] = deepcopy(higher_dts) + higher_dts = get_higher_datatypes(dt_hierarchies, universals, depth) + dt_hierarchies[depth + 1] = universals + return dt_hierarchies def get_sql_type(config, datatype): @@ -211,17 +242,7 @@ def has_duplicates(target, ignore_empties): distinct_values = set(values) return (len(values) - len(distinct_values)) > (error_rate * len(values)) - def get_datatype(target, dt_hierarchy): - # For each tree in the hierarchy: - # Look for a match with the 0th element and possibly add it to matching_datatypes. - # If there are matches in matching_datatypes: - # Use the tiebreaker rules to find the best match and annotate the target with it. - # Else: - # Try again with the next highest element of each tree (if one exists) - # - # Note that this is guaranteed to work since the get_datatype_hierarchy() function includes - # the 'text' datatype which matches anything. So if no matches are found raise an error. - + def get_datatype(target, dt_hierarchies): def is_match(datatype): # If the datatype has no associated condition then it matches anything: if not datatype.get("condition"): @@ -235,61 +256,44 @@ def is_match(datatype): return success_rate def tiebreak(datatypes): - # TODO: There is a problem with this algorithm, since it implicitly assumes that if two - # datatypes are of the same depth, then neither can be a parent of the other. But this - # is false. We could have, for example, - # leaf_1 -> non_space -> trimmed_line - # leaf_2 -> word -> non_space -> trimmed_line - # Even though non-space is a parent of word, the algorithm classifies both as depth 1. 
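A check for such parent-child collisions could look like the following sketch, written over the match dicts used in this function, where each entry carries a 'datatype' dict and a 'success_rate' (the helper name is illustrative):

    def drop_parents(matching_datatypes):
        # Discard any match whose datatype another match names as its parent:
        parents = {m["datatype"].get("parent") for m in matching_datatypes}
        parents.discard(None)
        return [m for m in matching_datatypes if m["datatype"]["datatype"] not in parents]

Filtering direct parents this way keeps only the most specific of the tied datatypes.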
- # We need to have another check in this function to determine whether there are any - # parent-child dependencies between the datatypes in the tiebreaker list. in_types = [] other_types = [] + parents = set([dt["datatype"].get("parent") for dt in datatypes]) + parents.discard(None) for dt in datatypes: - if dt["datatype"].get("condition", "").lstrip().startswith("in("): - in_types.append(dt) - else: - other_types.append(dt) - sorted_types = sorted( - in_types, key=lambda k: (k["depth"], k["success_rate"]), reverse=True - ) + sorted(other_types, key=lambda k: (k["depth"], k["success_rate"]), reverse=True) - return sorted_types[0]["datatype"] - - curr_index = 0 - while True: - matching_datatypes = [] - datatypes_to_check = [] - for dt_name in dt_hierarchy: - if ( - len(dt_hierarchy[dt_name]) > curr_index - and dt_hierarchy[dt_name][curr_index] not in datatypes_to_check - ): - datatypes_to_check.append(dt_hierarchy[dt_name][curr_index]) - if len(datatypes_to_check) == 0: - print(f"Could not find a datatype match for column '{label}'") + if dt["datatype"]["datatype"] not in parents: + if dt["datatype"].get("condition", "").lstrip().startswith("in("): + in_types.append(dt) + else: + other_types.append(dt) + + if len(in_types) == 1: + return in_types[0]["datatype"] + elif len(in_types) > 1: + in_types = sorted(in_types, key=lambda k: k["success_rate"], reverse=True) + return in_types[0]["datatype"] + elif len(other_types) == 1: + return other_types[0]["datatype"] + elif len(other_types) > 1: + other_types = sorted(other_types, key=lambda k: k["success_rate"], reverse=True) + return other_types[0]["datatype"] + else: + print(f"Error tiebreaking datatypes: {pformat(datatypes)}") sys.exit(1) + for depth in range(0, len(dt_hierarchies)): + datatypes_to_check = [dt_hierarchies[depth][dt][0] for dt in dt_hierarchies[depth]] + matching_datatypes = [] for datatype in datatypes_to_check: success_rate = is_match(datatype) if success_rate: - matching_datatypes.append( - { - "datatype": datatype, - "depth": curr_index, - "success_rate": success_rate, - } - ) - - if len(matching_datatypes) == 0: - curr_index += 1 - continue - elif len(matching_datatypes) == 1: + matching_datatypes.append({"datatype": datatype, "success_rate": success_rate}) + + if len(matching_datatypes) == 1: return matching_datatypes[0]["datatype"] - else: + elif len(matching_datatypes) > 1: return tiebreak(matching_datatypes) - curr_index += 1 - def get_from(target, potential_foreign_columns): candidate_froms = [] for foreign in potential_foreign_columns: @@ -322,8 +326,8 @@ def get_from(target, potential_foreign_columns): target["nulltype"] = "empty" # Use the valve config to retrieve the valve datatype hierarchy: - dt_hierarchy = get_datatype_hierarchy(config) - target["datatype"] = get_datatype(target, dt_hierarchy)["datatype"] + dt_hierarchies = get_dt_hierarchies(config) + target["datatype"] = get_datatype(target, dt_hierarchies)["datatype"] # Use the valve config to get a list of columns already loaded to the database, then compare # the contents of each column with the contents of the target column and possibly annotate the From 0ea815fa1112f63a2cfe4f455688c28c80e4ad73 Mon Sep 17 00:00:00 2001 From: Michael Cuffaro Date: Wed, 22 Nov 2023 11:05:48 -0500 Subject: [PATCH 30/48] add verbose flag --- scripts/guess.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/scripts/guess.py b/scripts/guess.py index c0c9f4d8..bbb3bfc7 100755 --- a/scripts/guess.py +++ b/scripts/guess.py @@ -19,6 +19,17 @@ 
SPECIAL_TABLES = ["table", "column", "datatype", "rule", "history", "message"] +VERBOSE = False + + +def log(message, force=False, suppress_time=False): + global VERBOSE + + if force or VERBOSE: + if not suppress_time: + print(f"{time.asctime()} {message}", file=sys.stderr) + else: + print(f"{message}", file=sys.stderr) def has_ncolumn(sample, ncolumn): @@ -325,7 +336,7 @@ def get_from(target, potential_foreign_columns): if has_nulltype(target): target["nulltype"] = "empty" - # Use the valve config to retrieve the valve datatype hierarchy: + # Use the valve config to retrieve the valve datatype hierarchies: dt_hierarchies = get_dt_hierarchies(config) target["datatype"] = get_datatype(target, dt_hierarchies)["datatype"] @@ -350,6 +361,7 @@ def get_from(target, potential_foreign_columns): if __name__ == "__main__": parser = ArgumentParser(description="VALVE guesser (prototype)") + parser.add_argument("--verbose", action="store_true", help="Print logging output to STDERR.") parser.add_argument( "--sample_size", type=int, @@ -384,6 +396,8 @@ def get_from(target, potential_foreign_columns): ) args = parser.parse_args() + VERBOSE = args.verbose + # Use the seed argument, or the epoch time if no seed is given, to set up the random generator: if args.seed is not None: seed = args.seed @@ -402,9 +416,12 @@ def get_from(target, potential_foreign_columns): # Attach the condition parser to the config as well: config["parser"] = Lark(grammar, parser="lalr", transformer=TreeToDict()) + log(f"Getting random sample of {args.sample_size} rows from {args.TABLE} ...") sample = get_random_sample(args.TABLE, args.sample_size) for i, label in enumerate(sample): + log(f"Annotating label '{label}' ...") annotate(label, sample, config, args.error_rate, i == 0) + log("Done!") # For debugging: # pprint(sample) From 4a4ea326874057696ec49bdc5de13052ce85a37e Mon Sep 17 00:00:00 2001 From: Michael Cuffaro Date: Sun, 26 Nov 2023 13:36:06 -0500 Subject: [PATCH 31/48] write table and column config to db --- scripts/guess.py | 121 ++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 105 insertions(+), 16 deletions(-) diff --git a/scripts/guess.py b/scripts/guess.py index bbb3bfc7..61e4ea14 100755 --- a/scripts/guess.py +++ b/scripts/guess.py @@ -15,7 +15,9 @@ from argparse import ArgumentParser from lark import Lark from numbers import Number +from pathlib import Path from pprint import pformat +from textwrap import dedent SPECIAL_TABLES = ["table", "column", "datatype", "rule", "history", "message"] @@ -153,14 +155,14 @@ def get_higher_datatypes(datatype_hierarchies, universals, depth): def get_sql_type(config, datatype): """Given the config map and the name of a datatype, climb the datatype tree (as required), - and return the first 'SQL type' found.""" + and return the first 'SQLite type' found.""" if "datatype" not in config: print("Missing datatypes in config") sys.exit(1) if datatype not in config["datatype"]: return None - if config["datatype"][datatype].get("SQL type"): - return config["datatype"][datatype]["SQL type"] + if config["datatype"][datatype].get("SQLite type"): + return config["datatype"][datatype]["SQLite type"] return get_sql_type(config, config["datatype"][datatype].get("parent")) @@ -258,6 +260,10 @@ def is_match(datatype): # If the datatype has no associated condition then it matches anything: if not datatype.get("condition"): return True + # If the SQLite type is NULL this datatype is ruled out: + sqlite_type = datatype.get("SQLite type") + if sqlite_type and sqlite_type.casefold() == 
"null": + return False condition = get_compiled_condition(datatype["condition"], config["parser"]) num_values = len(target["values"]) @@ -372,7 +378,8 @@ def get_from(target, potential_foreign_columns): "--error_rate", type=float, default=0.1, - help="Proportion of errors expected (default: 10%%)", + help="""A number between 0 and 1 (inclusive) representing the proportion of errors expected + (default: 0.1)""", ) parser.add_argument( "--enum_size", @@ -383,6 +390,11 @@ def get_from(target, potential_foreign_columns): parser.add_argument( "--seed", type=int, help="Seed to use for random sampling (default: current epoch time)" ) + parser.add_argument( + "--yes", + action="store_true", + help="Do not ask for confirmation before writing suggested modifications to the database", + ) parser.add_argument( "VALVE_TABLE", help="The VALVE table table from which to read the VALVE configuration" ) @@ -407,8 +419,10 @@ def get_from(target, potential_foreign_columns): # Get the valve configuration and database info: config = get_valve_config(args.VALVE_TABLE) - if args.TABLE.removesuffix(".tsv") in config["table"]: - print(f"{args.TABLE.removesuffix('.tsv')} is already configured.", file=sys.stderr) + table_tsv = args.TABLE + table = Path(args.TABLE).stem + if table in config["table"]: + print(f"{table} is already configured.", file=sys.stderr) sys.exit(0) with sqlite3.connect(args.DATABASE) as conn: config["db"] = conn @@ -416,20 +430,95 @@ def get_from(target, potential_foreign_columns): # Attach the condition parser to the config as well: config["parser"] = Lark(grammar, parser="lalr", transformer=TreeToDict()) - log(f"Getting random sample of {args.sample_size} rows from {args.TABLE} ...") - sample = get_random_sample(args.TABLE, args.sample_size) + log(f"Getting random sample of {args.sample_size} rows from {table_tsv} ...") + sample = get_random_sample(table_tsv, args.sample_size) for i, label in enumerate(sample): log(f"Annotating label '{label}' ...") annotate(label, sample, config, args.error_rate, i == 0) log("Done!") - # For debugging: - # pprint(sample) + table_table_headers = ["table", "path", "type", "description"] + column_table_headers = [ + "table", + "column", + "label", + "nulltype", + "datatype", + "structure", + "description", + ] + if not args.yes: + print() + + print('The following row will be inserted to "table":') + data = [table_table_headers, [f"{table}", f"{table_tsv}", "", ""]] + # We add +2 for padding + col_width = max(len(word) for row in data for word in row) + 2 + for row in data: + print("".join(word.ljust(col_width) for word in row)) + + print() + + print('The following row will be inserted to "column":') + data = [column_table_headers] + for label in sample: + row = [ + f"{table}", + f"{sample[label]['normalized']}", + f"{label}", + f"{sample[label].get('nulltype', '')}", + f"{sample[label]['datatype']}", + f"{sample[label].get('structure', '')}", + f"{sample[label].get('description', '')}", + ] + data.append(row) + # We add +2 for padding + col_width = max(len(word) for row in data for word in row) + 2 + for row in data: + print("".join(word.ljust(col_width) for word in row)) - # For debugging without values: - for label in sample: - print(f"{label}: ", end="") - for annotation, data in sample[label].items(): - if annotation != "values": - print(f"{annotation}: {data}, ", end="") print() + + answer = input("Do you want to write this updated configuration to the database? 
(y/n) ") + if answer.casefold() != "y": + print("Not writing updated configuration to the database.") + sys.exit(0) + + log("Updating table configuration in database ...") + row_number = conn.execute('SELECT MAX(row_number) FROM "table"').fetchall()[0][0] + 1 + sql = dedent( + f""" + INSERT INTO "table" ("row_number", {', '.join([f'"{k}"' for k in table_table_headers])}) + VALUES ({row_number}, '{table}', '{table_tsv}', NULL, NULL)""" + ) + log(sql, suppress_time=True) + log("", suppress_time=True) + conn.execute(sql) + conn.commit() + + log("Updating column configuration in database ...") + row_number = conn.execute('SELECT MAX(row_number) FROM "column"').fetchall()[0][0] + 1 + for label in sample: + values = ", ".join( + [ + f"{row_number}", + f"'{table}'", + f"'{sample[label]['normalized']}'", + f"'{label}'", + f"'{sample[label]['nulltype']}'" if sample[label].get("nulltype") else "NULL", + f"'{sample[label]['datatype']}'", + f"'{sample[label]['structure']}'" if sample[label].get("structure") else "NULL", + f"'{sample[label]['description']}'" if sample[label].get("description") else "NULL", + ] + ) + sql = dedent( + f""" + INSERT INTO "column" ("row_number", {', '.join([f'"{k}"' for k in column_table_headers])}) + VALUES ({values})""" + ) + log(sql, suppress_time=True) + conn.execute(sql) + conn.commit() + row_number += 1 + log("", suppress_time=True) + log("Done!") From 70d035937d2a8effbfc1d6803542d2522036e9d3 Mon Sep 17 00:00:00 2001 From: Michael Cuffaro Date: Sun, 26 Nov 2023 14:30:17 -0500 Subject: [PATCH 32/48] warn but do not panic, during load, if table file doesn't exist --- src/lib.rs | 36 +++++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 296fd7cf..638770a0 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -4403,6 +4403,27 @@ async fn load_db( let mut total_infos = 0; let mut table_num = 1; for table_name in table_list { + let path = String::from( + config + .get("table") + .and_then(|t| t.as_object()) + .and_then(|o| o.get(&table_name)) + .and_then(|n| n.get("path")) + .and_then(|p| p.as_str()) + .unwrap(), + ); + let mut rdr = { + match File::open(path.clone()) { + Err(e) => { + eprintln!("WARN: Unable to open '{}': {}", path.clone(), e); + continue; + } + Ok(table_file) => csv::ReaderBuilder::new() + .has_headers(false) + .delimiter(b'\t') + .from_reader(table_file), + } + }; if verbose { eprintln!( "{} - Loading table {}/{}: {}", @@ -4413,21 +4434,6 @@ async fn load_db( ); } table_num += 1; - let path = String::from( - config - .get("table") - .and_then(|t| t.as_object()) - .and_then(|o| o.get(&table_name)) - .and_then(|n| n.get("path")) - .and_then(|p| p.as_str()) - .unwrap(), - ); - let mut rdr = csv::ReaderBuilder::new() - .has_headers(false) - .delimiter(b'\t') - .from_reader(File::open(path.clone()).unwrap_or_else(|err| { - panic!("Unable to open '{}': {}", path.clone(), err); - })); // Extract the headers, which we will need later: let mut records = rdr.records(); From 0bd3ad3a8de27cf49895298e87414682602404b4 Mon Sep 17 00:00:00 2001 From: Michael Cuffaro Date: Sun, 26 Nov 2023 14:48:15 -0500 Subject: [PATCH 33/48] add ValveRow alias --- src/lib.rs | 43 ++++++++++++++++++++++--------------------- src/validate.rs | 22 +++++++++++----------- 2 files changed, 33 insertions(+), 32 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 638770a0..ba4b2167 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -77,10 +77,11 @@ lazy_static! 
{ static ref SL_SQL_TYPES: Vec<&'static str> = vec!["text", "numeric", "integer", "real"]; } -/// An alias for [serde_json::Map](..//serde_json/struct.Map.html). +/// Aliases for [serde_json::Map](..//serde_json/struct.Map.html). // Note: serde_json::Map is // [backed by a BTreeMap by default](https://docs.serde.rs/serde_json/map/index.html) pub type SerdeMap = serde_json::Map; +pub type ValveRow = serde_json::Map; /// Represents a structure such as those found in the `structure` column of the `column` table in /// both its parsed format (i.e., as an [Expression](ast/enum.Expression.html)) as well as in its @@ -1444,7 +1445,7 @@ pub async fn get_affected_rows( global_config: &SerdeMap, pool: &AnyPool, tx: &mut Transaction<'_, sqlx::Any>, -) -> Result, String> { +) -> Result, String> { // Since the consequence of an update could involve currently invalid rows // (in the conflict table) becoming valid or vice versa, we need to check rows for // which the value of the column is the same as `value` @@ -1470,7 +1471,7 @@ pub async fn get_affected_rows( .await .map_err(|e| e.to_string())? { - let mut table_row = SerdeMap::new(); + let mut table_row = ValveRow::new(); let mut row_number: Option = None; for column in row.columns() { let cname = column.name(); @@ -1508,7 +1509,7 @@ pub async fn get_row_from_db( tx: &mut Transaction<'_, sqlx::Any>, table: &str, row_number: &u32, -) -> Result { +) -> Result { let sql = format!( "{} WHERE row_number = {}", query_with_message_values(table, global_config, pool), @@ -1541,7 +1542,7 @@ pub async fn get_row_from_db( } }; - let mut row = SerdeMap::new(); + let mut row = ValveRow::new(); for column in sql_row.columns() { let cname = column.name(); if !vec!["row_number", "message"].contains(&cname) { @@ -1649,7 +1650,7 @@ pub async fn get_rows_to_update( ), String, > { - fn get_cell_value(row: &SerdeMap, column: &str) -> Result { + fn get_cell_value(row: &ValveRow, column: &str) -> Result { match row.get(column).and_then(|cell| cell.get("value")) { Some(SerdeValue::String(s)) => Ok(format!("{}", s)), Some(SerdeValue::Number(n)) => Ok(format!("{}", n)), @@ -1900,8 +1901,8 @@ pub async fn record_row_change( tx: &mut Transaction<'_, sqlx::Any>, table: &str, row_number: &u32, - from: Option<&SerdeMap>, - to: Option<&SerdeMap>, + from: Option<&ValveRow>, + to: Option<&ValveRow>, user: &str, ) -> Result<(), sqlx::Error> { if let (None, None) = (from, to) { @@ -1910,8 +1911,8 @@ pub async fn record_row_change( )); } - fn to_text(smap: Option<&SerdeMap>, quoted: bool) -> String { - match smap { + fn to_text(row: Option<&ValveRow>, quoted: bool) -> String { + match row { None => "NULL".to_string(), Some(r) => { let inner = format!("{}", json!(r)).replace("'", "''"); @@ -1932,7 +1933,7 @@ pub async fn record_row_change( } } - fn summarize(from: Option<&SerdeMap>, to: Option<&SerdeMap>) -> Result { + fn summarize(from: Option<&ValveRow>, to: Option<&ValveRow>) -> Result { // Constructs a summary of the form: // { // "column":"bar", @@ -2420,7 +2421,7 @@ pub async fn insert_new_row( compiled_rule_conditions: &HashMap>>, pool: &AnyPool, table: &str, - row: &SerdeMap, + row: &ValveRow, new_row_number: Option, user: &str, ) -> Result { @@ -2469,7 +2470,7 @@ pub async fn insert_new_row_tx( pool: &AnyPool, tx: &mut Transaction, table: &str, - row: &SerdeMap, + row: &ValveRow, new_row_number: Option, skip_validation: bool, ) -> Result { @@ -2789,7 +2790,7 @@ pub async fn update_row( compiled_rule_conditions: &HashMap>>, pool: &AnyPool, table_name: &str, - row: &SerdeMap, + 
row: &ValveRow, row_number: &u32, user: &str, ) -> Result<(), sqlx::Error> { @@ -2854,7 +2855,7 @@ pub async fn update_row_tx( pool: &AnyPool, tx: &mut Transaction, table: &str, - row: &SerdeMap, + row: &ValveRow, row_number: &u32, skip_validation: bool, do_not_recurse: bool, @@ -2967,10 +2968,10 @@ pub async fn update_row_tx( Ok(()) } -/// Given a path, read a TSV file and return a vector of rows represented as SerdeMaps. +/// Given a path, read a TSV file and return a vector of rows represented as ValveRows. /// Note: Use this function to read "small" TSVs only. In particular, use this for the special /// configuration tables. -fn read_tsv_into_vector(path: &str) -> Vec { +fn read_tsv_into_vector(path: &str) -> Vec { let mut rdr = csv::ReaderBuilder::new() .delimiter(b'\t') @@ -2981,7 +2982,7 @@ fn read_tsv_into_vector(path: &str) -> Vec { let rows: Vec<_> = rdr .deserialize() .map(|result| { - let row: SerdeMap = result.expect(format!("Error reading: {}", path).as_str()); + let row: ValveRow = result.expect(format!("Error reading: {}", path).as_str()); row }) .collect(); @@ -3010,8 +3011,8 @@ fn read_tsv_into_vector(path: &str) -> Vec { } /// Given a database at the specified location, query the "table" table and return a vector of rows -/// represented as SerdeMaps. -fn read_db_table_into_vector(database: &str, config_table: &str) -> Vec { +/// represented as ValveRows. +fn read_db_table_into_vector(database: &str, config_table: &str) -> Vec { let connection_options; if database.starts_with("postgresql://") { connection_options = AnyConnectOptions::from_str(database).unwrap(); @@ -3036,7 +3037,7 @@ fn read_db_table_into_vector(database: &str, config_table: &str) -> Vec, + pub row: Option, } /// Given a config map, maps of compiled datatype and rule conditions, a database connection @@ -62,10 +62,10 @@ pub async fn validate_row( pool: &AnyPool, tx: Option<&mut Transaction<'_, sqlx::Any>>, table_name: &str, - row: &SerdeMap, + row: &ValveRow, row_number: Option, query_as_if: Option<&QueryAsIf>, -) -> Result { +) -> Result { // Fallback to a default transaction if it is not given. Since we do not commit before it falls // out of scope the transaction will be rolled back at the end of this function. And since this // function is read-only the rollback is trivial and therefore inconsequential. @@ -944,10 +944,10 @@ pub fn validate_rows_intra( result_rows } -/// Given a row represented as a SerdeMap, remove any duplicate messages from the row's cells, so +/// Given a row represented as a ValveRow, remove any duplicate messages from the row's cells, so /// that no cell has messages with the same level, rule, and message text. -fn remove_duplicate_messages(row: &SerdeMap) -> Result { - let mut deduped_row = SerdeMap::new(); +fn remove_duplicate_messages(row: &ValveRow) -> Result { + let mut deduped_row = ValveRow::new(); for (column_name, cell) in row.iter() { let mut messages = cell .get("messages") @@ -981,12 +981,12 @@ fn remove_duplicate_messages(row: &SerdeMap) -> Result { Ok(deduped_row) } -/// Given a result row, convert it to a SerdeMap and return it. +/// Given a result row, convert it to a ValveRow and return it. /// Note that if the incoming result row has an associated row_number, this is ignored. 
-fn result_row_to_config_map(incoming: &ResultRow) -> SerdeMap {
-    let mut outgoing = SerdeMap::new();
+fn result_row_to_config_map(incoming: &ResultRow) -> ValveRow {
+    let mut outgoing = ValveRow::new();
     for (column, cell) in incoming.contents.iter() {
-        let mut cell_map = SerdeMap::new();
+        let mut cell_map = ValveRow::new();
         if let Some(nulltype) = &cell.nulltype {
             cell_map.insert(
                 "nulltype".to_string(),

From c20bca7e7c094ce70b4e448dd83d25adbde7cb9b Mon Sep 17 00:00:00 2001
From: Michael Cuffaro
Date: Sun, 26 Nov 2023 15:15:25 -0500
Subject: [PATCH 34/48] add stubs for new API

---
 src/lib.rs | 192 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 192 insertions(+)

diff --git a/src/lib.rs b/src/lib.rs
index ba4b2167..1d056e99 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -83,6 +83,198 @@ lazy_static! {
 pub type SerdeMap = serde_json::Map<String, SerdeValue>;
 pub type ValveRow = serde_json::Map<String, SerdeValue>;

+pub struct Valve {
+    global_config: SerdeMap,
+    compiled_datatype_conditions: HashMap<String, CompiledCondition>,
+    compiled_rule_conditions: HashMap<String, HashMap<String, Vec<ColumnRule>>>,
+    pool: AnyPool,
+    user: String,
+}
+
+impl Valve {
+    /// Given a path to a table table,
+    /// read it, configure VALVE, and return a new Valve struct.
+    /// Return an error if reading or configuration fails.
+    pub fn build(mut self, table_path: &str) -> Result<Self, sqlx::Error> {
+        // Should be ConfigError
+        todo!();
+        Ok(self)
+    }
+
+    /// Set the user name for this instance.
+    /// The username must be a short string without newlines.
+    /// Return an error on invalid username.
+    pub fn set_user(mut self, user: &str) -> Result<Self, sqlx::Error> {
+        // ConfigError
+        todo!();
+        Ok(self)
+    }
+
+    /// Given a database connection string,
+    /// create a database connection for VALVE to use.
+    /// Drop and replace any current database connection.
+    /// Return an error if the connection cannot be created.
+    pub fn connect(mut self, connection: &str) -> Result<Self, sqlx::Error> {
+        // DatabaseError
+        todo!();
+        Ok(self)
+    }
+
+    /// Create all configured database tables and views
+    /// if they do not already exist as configured.
+    /// Return an error on database problems.
+    pub fn create_all_tables(mut self) -> Result<Self, sqlx::Error> {
+        // DatabaseError
+        todo!();
+        Ok(self)
+    }
+
+    /// Drop all configured tables, in reverse dependency order.
+    /// Return an error on database problem.
+    pub fn drop_all_tables(self) -> Result<Self, sqlx::Error> {
+        // DatabaseError
+        todo!();
+        Ok(self)
+    }
+
+    /// Given a vector of table names,
+    /// drop those tables, in the given order.
+    /// Return an error on invalid table name or database problem.
+    pub fn drop_tables(self, tables: Vec<&str>) -> Result<Self, sqlx::Error> {
+        // DatabaseError
+        todo!();
+        Ok(self)
+    }
+
+    /// Truncate all configured tables, in reverse dependency order.
+    /// Return an error on database problem.
+    pub fn truncate_all_tables(self) -> Result<Self, sqlx::Error> {
+        // DatabaseError
+        todo!();
+        Ok(self)
+    }
+
+    /// Given a vector of table names,
+    /// truncate those tables, in the given order.
+    /// Return an error on invalid table name or database problem.
+    pub fn truncate_tables(self, tables: Vec<&str>) -> Result<Self, sqlx::Error> {
+        // ConfigOrDatabaseError
+        //self.create_all_tables();
+        todo!();
+        Ok(self)
+    }
+
+    /// Load all configured tables in dependency order.
+    /// If `validate` is false, just try to insert all rows.
+    /// Return an error on database problem,
+    /// including database conflicts that prevent rows being inserted.
+    pub fn load_all_tables(self, validate: bool) -> Result<Self, sqlx::Error> {
+        // DatabaseError
+        //self.create_all_tables();
+        //self.truncate_all_tables();
+        todo!();
+        Ok(self)
+    }
+
+    /// Given a vector of table names,
+    /// load those tables in the given order.
+    /// If `validate` is false, just try to insert all rows.
+    /// Return an error on invalid table name or database problem.
+    pub fn load_tables(self, tables: Vec<&str>, validate: bool) -> Result<Self, sqlx::Error> {
+        // ConfigOrDatabaseError
+        //self.create_all_tables();
+        //self.truncate_tables(tables);
+        todo!();
+        Ok(self)
+    }
+
+    /// Save all configured tables to their 'path's.
+    /// Return an error on writing or database problem.
+    pub fn save_all_tables(self) -> Result<Self, sqlx::Error> {
+        // WriteOrDatabaseError
+        todo!();
+        Ok(self)
+    }
+
+    /// Given a vector of table names,
+    /// save those tables to their 'path's, in the given order.
+    /// Return an error on writing or database problem.
+    pub fn save_tables(self, tables: Vec<&str>) -> Result<Self, sqlx::Error> {
+        // WriteOrDatabaseError
+        todo!();
+        Ok(self)
+    }
+
+    /// Given a table name and a row as JSON,
+    /// return the validated row.
+    /// Return an error on database problem.
+    pub fn validate_row(self, table_name: &str, row: &ValveRow) -> Result<ValveRow, sqlx::Error> {
+        // DatabaseError
+        todo!();
+    }
+
+    /// Given a table name and a row as JSON,
+    /// add the row to the table in the database,
+    /// and return the validated row, including its new row_number.
+    /// Return an error on invalid table name or database problem.
+    pub fn insert_row(self, table_name: &str, row: &ValveRow) -> Result<ValveRow, sqlx::Error> {
+        // ConfigOrDatabaseError
+        todo!();
+    }
+
+    /// Given a table name, a row number, and a row as JSON,
+    /// update the row in the database,
+    /// and return the validated row.
+    /// Return an error on invalid table name or row number or database problem.
+    pub fn update_row(
+        self,
+        table_name: &str,
+        row_number: usize,
+        row: &ValveRow,
+    ) -> Result<ValveRow, sqlx::Error> {
+        // ConfigOrDatabaseError
+        todo!();
+    }
+
+    /// Given a table name and a row number,
+    /// delete that row from the table.
+    /// Return an error on invalid table name or row number or database problem.
+    pub fn delete_row(self, table_name: &str, row_number: usize) -> Result<(), sqlx::Error> {
+        // ConfigOrDatabaseError
+        todo!();
+    }
+
+    /// Return the next change to undo, or None.
+    /// Return an error on database problem.
+    pub fn get_record_to_undo(self) -> Result<Option<ValveRow>, sqlx::Error> {
+        // DatabaseError
+        todo!();
+    }
+
+    /// Return the next change to redo, or None.
+    /// Return an error on database problem.
+    pub fn get_record_to_redo(self) -> Result<Option<ValveRow>, sqlx::Error> {
+        // DatabaseError
+        todo!();
+    }
+
+    /// Undo one change and return the change record
+    /// or None if there was no change to undo.
+    /// Return an error on database problem.
+    pub fn undo(self) -> Result<Option<ValveRow>, sqlx::Error> {
+        // DatabaseError
+        todo!();
+    }
+
+    /// Redo one change and return the change record
+    /// or None if there was no change to redo.
+    /// Return an error on database problem.
+    pub fn redo(self) -> Result<Option<ValveRow>, sqlx::Error> {
+        // DatabaseError
+        todo!();
+    }
+}
+
 /// Represents a structure such as those found in the `structure` column of the `column` table in
 /// both its parsed format (i.e., as an [Expression](ast/enum.Expression.html)) as well as in its
 /// original format (i.e., as a plain String).
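A quick usage sketch (not part of the patches) of how the stubbed interface above is meant to be driven once implemented. The table name ("organism"), column name ("species"), and user name below are invented for illustration, and the {"value": ...} cell shape follows the convention used by get_cell_value() earlier in this file:

    // Sketch only; the methods above are all still todo!() at this point in the series.
    use serde_json::json;

    fn demo(valve: Valve) -> Result<(), sqlx::Error> {
        // A ValveRow maps each column name to a cell object whose "value" key
        // holds the cell's content:
        let mut row = ValveRow::new();
        row.insert("species".to_string(), json!({"value": "Homo sapiens"}));

        // Builder-style: each call consumes the struct and returns it on success:
        let valve = valve.set_user("demo_user")?;

        // validate_row() is expected to return the row annotated with validation
        // results; insert_row() would additionally assign a new row_number:
        let validated = valve.validate_row("organism", &row)?;
        println!("{}", json!(validated));
        Ok(())
    }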
From 2a6eabcf928ffbdde709c4c5d3b80a99cd6ce2e0 Mon Sep 17 00:00:00 2001
From: Michael Cuffaro
Date: Sun, 26 Nov 2023 19:27:17 -0500
Subject: [PATCH 35/48] implement Valve::build()

---
 src/lib.rs | 116 ++++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 105 insertions(+), 11 deletions(-)

diff --git a/src/lib.rs b/src/lib.rs
index 1d056e99..8407b621 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -83,22 +83,116 @@ lazy_static! {
 pub type SerdeMap = serde_json::Map<String, SerdeValue>;
 pub type ValveRow = serde_json::Map<String, SerdeValue>;

+#[derive(Debug)]
 pub struct Valve {
-    global_config: SerdeMap,
-    compiled_datatype_conditions: HashMap<String, CompiledCondition>,
-    compiled_rule_conditions: HashMap<String, HashMap<String, Vec<ColumnRule>>>,
-    pool: AnyPool,
-    user: String,
+    pub global_config: SerdeMap,
+    pub compiled_datatype_conditions: HashMap<String, CompiledCondition>,
+    pub compiled_rule_conditions: HashMap<String, HashMap<String, Vec<ColumnRule>>>,
+    pub pool: Option<AnyPool>,
+    pub user: String,
 }

+// TODO NEXT: Move the existing public functions into this interface:
 impl Valve {
-    /// Given a path to a table table,
-    /// read it, configure VALVE, and return a new Valve struct.
+    /// Given a path to a table table and its name, read the table table, configure VALVE
+    /// partially ... TODO: finish this.
+    /// , and return a new Valve struct.
     /// Return an error if reading or configuration fails.
-    pub fn build(mut self, table_path: &str) -> Result<Self, sqlx::Error> {
+    pub async fn build(
+        table_path: &str,
+        config_table: &str,
+        // TODO: We need to refactor configure_db() so that it no longer collects the constraints
+        // configuration. We will do that in read_config_files() instead.
+        // Once this is implemented, the code below to construct the AnyPool which is used to
+        // call configure_db() should be removed.
+        // We will also remove the `database`, `initial_load` and `verbose` parameters.
+        database: &str,
+        initial_load: bool,
+        verbose: bool,
+    ) -> Result<Self, sqlx::Error> {
         // Should be ConfigError
-        todo!();
-        Ok(self)
+
+        let parser = StartParser::new();
+
+        let (specials_config, mut tables_config, mut datatypes_config, rules_config) =
+            read_config_files(table_path, config_table);
+
+        ////////////////////////////////////////////////////////////////////////////////////////
+        // TODO: Remove this block of code later (see comment above)
+        let connection_options;
+        if database.starts_with("postgresql://") {
+            connection_options = AnyConnectOptions::from_str(database)?;
+        } else {
+            let connection_string;
+            if !database.starts_with("sqlite://") {
+                connection_string = format!("sqlite://{}?mode=rwc", database);
+            } else {
+                connection_string = database.to_string();
+            }
+            connection_options = AnyConnectOptions::from_str(connection_string.as_str()).unwrap();
+        }
+
+        let pool = AnyPoolOptions::new()
+            .max_connections(5)
+            .connect_with(connection_options)
+            .await?;
+
+        let (sorted_table_list, constraints_config) = configure_db(
+            &mut tables_config,
+            &mut datatypes_config,
+            &pool,
+            &parser,
+            verbose,
+            &ValveCommand::Config,
+        )
+        .await?;
+        ////////////////////////////////////////////////////////////////////////////////////////
+
+        let mut global_config = SerdeMap::new();
+        global_config.insert(
+            String::from("special"),
+            SerdeValue::Object(specials_config.clone()),
+        );
+        global_config.insert(
+            String::from("table"),
+            SerdeValue::Object(tables_config.clone()),
+        );
+        global_config.insert(
+            String::from("datatype"),
+            SerdeValue::Object(datatypes_config.clone()),
+        );
+        global_config.insert(
+            String::from("rule"),
+            SerdeValue::Object(rules_config.clone()),
+        );
+        global_config.insert(
+            String::from("constraints"),
+            SerdeValue::Object(constraints_config.clone()),
+        );
+        let mut sorted_table_serdevalue_list: Vec<SerdeValue> = vec![];
+        for table in &sorted_table_list {
+            sorted_table_serdevalue_list.push(SerdeValue::String(table.to_string()));
+        }
+        global_config.insert(
+            String::from("sorted_table_list"),
+            SerdeValue::Array(sorted_table_serdevalue_list),
+        );
+
+        let compiled_datatype_conditions =
+            get_compiled_datatype_conditions(&global_config, &parser);
+        let compiled_rule_conditions = get_compiled_rule_conditions(
+            &global_config,
+            compiled_datatype_conditions.clone(),
+            &parser,
+        );
+
+        Ok(Self {
+            global_config: global_config,
+            compiled_datatype_conditions: compiled_datatype_conditions,
+            compiled_rule_conditions: compiled_rule_conditions,
+            pool: None,
+            user: String::from("Valve"),
+        })
     }

     /// Set the user name for this instance.
     /// The username must be a short string without newlines.
     /// Return an error on invalid username.
     pub fn set_user(mut self, user: &str) -> Result<Self, sqlx::Error> {
         // ConfigError
-        todo!();
+        self.user = user.to_string();
         Ok(self)
     }

From 29a1be894a811c417841a3e2a2a39becaa5aaddb Mon Sep 17 00:00:00 2001
From: Michael Cuffaro
Date: Mon, 27 Nov 2023 08:23:25 -0500
Subject: [PATCH 36/48] refactor, fix api sigs, implement Valve::connect() and
 Valve::create_tables()

---
 src/lib.rs | 112 +++++++++++++++++++++++++++++++++--------------------
 1 file changed, 71 insertions(+), 41 deletions(-)

diff --git a/src/lib.rs b/src/lib.rs
index 8407b621..fd14adf4 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -95,7 +95,7 @@ pub struct Valve {
 // TODO NEXT: Move the existing public functions into this interface:
 impl Valve {
     /// Given a path to a table table and its name, read the table table, configure VALVE
-    /// partially ... TODO: finish this.
+    /// partially ... TODO: finish rewriting this doc string.
     /// , and return a new Valve struct.
     /// Return an error if reading or configuration fails.
@@ -110,7 +110,7 @@ impl Valve {
         initial_load: bool,
         verbose: bool,
     ) -> Result<Self, sqlx::Error> {
-        // Should be ConfigError
+        // TODO: Error type should be ConfigError

         let parser = StartParser::new();

@@ -119,24 +119,7 @@ impl Valve {
         let (specials_config, mut tables_config, mut datatypes_config, rules_config) =
             read_config_files(table_path, config_table);

         ////////////////////////////////////////////////////////////////////////////////////////
         // TODO: Remove this block of code later (see comment above)
-        let connection_options;
-        if database.starts_with("postgresql://") {
-            connection_options = AnyConnectOptions::from_str(database)?;
-        } else {
-            let connection_string;
-            if !database.starts_with("sqlite://") {
-                connection_string = format!("sqlite://{}?mode=rwc", database);
-            } else {
-                connection_string = database.to_string();
-            }
-            connection_options = AnyConnectOptions::from_str(connection_string.as_str()).unwrap();
-        }
-
-        let pool = AnyPoolOptions::new()
-            .max_connections(5)
-            .connect_with(connection_options)
-            .await?;
-
+        let pool = get_pool_from_connection_string(database).await?;
         let (sorted_table_list, constraints_config) = configure_db(
             &mut tables_config,
             &mut datatypes_config,
@@ -198,7 +181,7 @@ impl Valve {
     /// Set the user name for this instance.
     /// The username must be a short string without newlines.
     /// Return an error on invalid username.
-    pub fn set_user(mut self, user: &str) -> Result<Self, sqlx::Error> {
+    pub fn set_user(&mut self, user: &str) -> Result<&mut Self, sqlx::Error> {
         // ConfigError
         self.user = user.to_string();
         Ok(self)
@@ -208,24 +191,48 @@ impl Valve {
     /// create a database connection for VALVE to use.
     /// Drop and replace any current database connection.
     /// Return an error if the connection cannot be created.
-    pub fn connect(mut self, connection: &str) -> Result<Self, sqlx::Error> {
+    pub async fn connect(&mut self, connection: &str) -> Result<&mut Self, sqlx::Error> {
         // DatabaseError
-        todo!();
+        self.pool = Some(get_pool_from_connection_string(connection).await?);
         Ok(self)
     }

     /// Create all configured database tables and views
     /// if they do not already exist as configured.
     /// Return an error on database problems.
-    pub fn create_all_tables(mut self) -> Result<Self, sqlx::Error> {
+    pub async fn create_all_tables(&mut self, verbose: bool) -> Result<&mut Self, sqlx::Error> {
         // DatabaseError
-        todo!();
+        let mut tables_config = self
+            .global_config
+            .get_mut("table")
+            .and_then(|t| t.as_object_mut())
+            .unwrap();
+        let mut tables_config = tables_config.clone();
+        let mut datatypes_config = self
+            .global_config
+            .get_mut("datatype")
+            .and_then(|d| d.as_object_mut())
+            .unwrap();
+        let mut datatypes_config = datatypes_config.clone();
+        let pool = self.pool.as_ref().unwrap();
+        let parser = StartParser::new();
+
+        // TODO: Revisit this once the configure_db() function has been refactored:
+        let (_, _) = configure_db(
+            &mut tables_config,
+            &mut datatypes_config,
+            &pool,
+            &parser,
+            verbose,
+            &ValveCommand::Create,
+        )
+        .await?;
         Ok(self)
     }

     /// Drop all configured tables, in reverse dependency order.
     /// Return an error on database problem.
-    pub fn drop_all_tables(self) -> Result<Self, sqlx::Error> {
+    pub fn drop_all_tables(&self) -> Result<&Self, sqlx::Error> {
         // DatabaseError
         todo!();
         Ok(self)
     }

     /// Given a vector of table names,
     /// drop those tables, in the given order.
     /// Return an error on invalid table name or database problem.
-    pub fn drop_tables(self, tables: Vec<&str>) -> Result<Self, sqlx::Error> {
+    pub fn drop_tables(&self, tables: Vec<&str>) -> Result<&Self, sqlx::Error> {
         // DatabaseError
         todo!();
         Ok(self)
     }

     /// Truncate all configured tables, in reverse dependency order.
     /// Return an error on database problem.
-    pub fn truncate_all_tables(self) -> Result<Self, sqlx::Error> {
+    pub fn truncate_all_tables(&self) -> Result<&Self, sqlx::Error> {
         // DatabaseError
         todo!();
         Ok(self)
     }

     /// Given a vector of table names,
     /// truncate those tables, in the given order.
     /// Return an error on invalid table name or database problem.
-    pub fn truncate_tables(self, tables: Vec<&str>) -> Result<Self, sqlx::Error> {
+    pub fn truncate_tables(&self, tables: Vec<&str>) -> Result<&Self, sqlx::Error> {
         // ConfigOrDatabaseError
         //self.create_all_tables();
         todo!();
         Ok(self)
     }

     /// Load all configured tables in dependency order.
     /// If `validate` is false, just try to insert all rows.
     /// Return an error on database problem,
     /// including database conflicts that prevent rows being inserted.
-    pub fn load_all_tables(self, validate: bool) -> Result<Self, sqlx::Error> {
+    pub fn load_all_tables(&self, validate: bool) -> Result<&Self, sqlx::Error> {
         // DatabaseError
         //self.create_all_tables();
         //self.truncate_all_tables();
         todo!();
         Ok(self)
     }

     /// Given a vector of table names,
     /// load those tables in the given order.
     /// If `validate` is false, just try to insert all rows.
     /// Return an error on invalid table name or database problem.
-    pub fn load_tables(self, tables: Vec<&str>, validate: bool) -> Result<Self, sqlx::Error> {
+    pub fn load_tables(&self, tables: Vec<&str>, validate: bool) -> Result<&Self, sqlx::Error> {
         // ConfigOrDatabaseError
         //self.create_all_tables();
         //self.truncate_tables(tables);
         todo!();
         Ok(self)
     }

     /// Save all configured tables to their 'path's.
     /// Return an error on writing or database problem.
-    pub fn save_all_tables(self) -> Result<Self, sqlx::Error> {
+    pub fn save_all_tables(&self) -> Result<&Self, sqlx::Error> {
         // WriteOrDatabaseError
         todo!();
         Ok(self)
     }

     /// Given a vector of table names,
     /// save those tables to their 'path's, in the given order.
     /// Return an error on writing or database problem.
-    pub fn save_tables(self, tables: Vec<&str>) -> Result<Self, sqlx::Error> {
+    pub fn save_tables(&self, tables: Vec<&str>) -> Result<&Self, sqlx::Error> {
         // WriteOrDatabaseError
         todo!();
         Ok(self)
     }

     /// Given a table name and a row as JSON,
     /// return the validated row.
     /// Return an error on database problem.
-    pub fn validate_row(self, table_name: &str, row: &ValveRow) -> Result<ValveRow, sqlx::Error> {
+    pub fn validate_row(&self, table_name: &str, row: &ValveRow) -> Result<ValveRow, sqlx::Error> {
         // DatabaseError
         todo!();
     }

     /// Given a table name and a row as JSON,
     /// add the row to the table in the database,
     /// and return the validated row, including its new row_number.
     /// Return an error on invalid table name or database problem.
-    pub fn insert_row(self, table_name: &str, row: &ValveRow) -> Result<ValveRow, sqlx::Error> {
+    pub fn insert_row(&self, table_name: &str, row: &ValveRow) -> Result<ValveRow, sqlx::Error> {
         // ConfigOrDatabaseError
         todo!();
     }

     /// Given a table name, a row number, and a row as JSON,
     /// update the row in the database,
     /// and return the validated row.
     /// Return an error on invalid table name or row number or database problem.
     pub fn update_row(
-        self,
+        &self,
         table_name: &str,
         row_number: usize,
         row: &ValveRow,
     ) -> Result<ValveRow, sqlx::Error> {
         // ConfigOrDatabaseError
         todo!();
     }

     /// Given a table name and a row number,
     /// delete that row from the table.
     /// Return an error on invalid table name or row number or database problem.
-    pub fn delete_row(self, table_name: &str, row_number: usize) -> Result<(), sqlx::Error> {
+    pub fn delete_row(&self, table_name: &str, row_number: usize) -> Result<(), sqlx::Error> {
         // ConfigOrDatabaseError
         todo!();
     }

     /// Return the next change to undo, or None.
     /// Return an error on database problem.
-    pub fn get_record_to_undo(self) -> Result<Option<ValveRow>, sqlx::Error> {
+    pub fn get_record_to_undo(&self) -> Result<Option<ValveRow>, sqlx::Error> {
         // DatabaseError
         todo!();
     }

     /// Return the next change to redo, or None.
     /// Return an error on database problem.
-    pub fn get_record_to_redo(self) -> Result<Option<ValveRow>, sqlx::Error> {
+    pub fn get_record_to_redo(&self) -> Result<Option<ValveRow>, sqlx::Error> {
         // DatabaseError
         todo!();
     }

     /// Undo one change and return the change record
     /// or None if there was no change to undo.
     /// Return an error on database problem.
-    pub fn undo(self) -> Result<Option<ValveRow>, sqlx::Error> {
+    pub fn undo(&self) -> Result<Option<ValveRow>, sqlx::Error> {
         // DatabaseError
         todo!();
     }

     /// Redo one change and return the change record
     /// or None if there was no change to redo.
     /// Return an error on database problem.
-    pub fn redo(self) -> Result<Option<ValveRow>, sqlx::Error> {
+    pub fn redo(&self) -> Result<Option<ValveRow>, sqlx::Error> {
         // DatabaseError
         todo!();
     }
@@ -432,6 +439,29 @@ impl std::fmt::Debug for ColumnRule {
     }
 }

+/// TODO: Add docstring here. Note that once we have refactored configure_db() (see above) it may
+/// make more sense for this function to be an inner function of Valve.
+pub async fn get_pool_from_connection_string(database: &str) -> Result<AnyPool, sqlx::Error> {
+    let connection_options;
+    if database.starts_with("postgresql://") {
+        connection_options = AnyConnectOptions::from_str(database)?;
+    } else {
+        let connection_string;
+        if !database.starts_with("sqlite://") {
+            connection_string = format!("sqlite://{}?mode=rwc", database);
+        } else {
+            connection_string = database.to_string();
+        }
+        connection_options = AnyConnectOptions::from_str(connection_string.as_str()).unwrap();
+    }
+
+    let pool = AnyPoolOptions::new()
+        .max_connections(5)
+        .connect_with(connection_options)
+        .await?;
+    Ok(pool)
+}
+
 /// Given the path to a configuration table (either a table.tsv file or a database containing a
 /// table named "table"), load and check the 'table', 'column', and 'datatype' tables, and return
 /// SerdeMaps corresponding to specials, tables, datatypes, and rules.

From c6f96fc4598e54ae86ebeeae45b89cac65f6b380 Mon Sep 17 00:00:00 2001
From: Michael Cuffaro
Date: Mon, 27 Nov 2023 08:25:36 -0500
Subject: [PATCH 37/48] rename create_all_tables to create_missing_tables

---
 src/lib.rs | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/src/lib.rs b/src/lib.rs
index fd14adf4..73721b07 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -200,7 +200,7 @@ impl Valve {
     /// Create all configured database tables and views
     /// if they do not already exist as configured.
     /// Return an error on database problems.
-    pub async fn create_all_tables(&mut self, verbose: bool) -> Result<&mut Self, sqlx::Error> {
+    pub async fn create_missing_tables(&mut self, verbose: bool) -> Result<&mut Self, sqlx::Error> {
         // DatabaseError
         let mut tables_config = self
@@ -260,7 +260,7 @@ impl Valve {
     /// Return an error on invalid table name or database problem.
     pub fn truncate_tables(&self, tables: Vec<&str>) -> Result<&Self, sqlx::Error> {
         // ConfigOrDatabaseError
-        //self.create_all_tables();
+        //self.create_missing_tables();
         todo!();
         Ok(self)
     }
@@ -270,8 +270,10 @@ impl Valve {
     /// Return an error on database problem,
     /// including database conflicts that prevent rows being inserted.
     pub fn load_all_tables(&self, validate: bool) -> Result<&Self, sqlx::Error> {
+        // YOU ARE HERE.
+
         // DatabaseError
-        //self.create_all_tables();
+        //self.create_missing_tables();
         //self.truncate_all_tables();
         todo!();
         Ok(self)
@@ -283,7 +285,7 @@ impl Valve {
     /// Return an error on invalid table name or database problem.
     pub fn load_tables(&self, tables: Vec<&str>, validate: bool) -> Result<&Self, sqlx::Error> {
         // ConfigOrDatabaseError
-        //self.create_all_tables();
+        //self.create_missing_tables();
         //self.truncate_tables(tables);
         todo!();
         Ok(self)

From 83e741ed51030529f98d3885874749e260b25099 Mon Sep 17 00:00:00 2001
From: Michael Cuffaro
Date: Mon, 27 Nov 2023 09:07:17 -0500
Subject: [PATCH 38/48] implement (rough) load_all_tables()

---
 src/lib.rs  | 63 ++++++++++++++++++++++++++++++++++++++++++++++------
 src/main.rs | 23 +++++++++++--------
 2 files changed, 69 insertions(+), 17 deletions(-)

diff --git a/src/lib.rs b/src/lib.rs
index 73721b07..4eefe7e9 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -105,9 +105,8 @@ impl Valve {
         // configuration. We will do that in read_config_files() instead.
         // Once this is implemented, the code below to construct the AnyPool which is used to
         // call configure_db() should be removed.
-        // We will also remove the `database`, `initial_load` and `verbose` parameters.
+        // We will also remove the `database` and `verbose` parameters.
         database: &str,
-        initial_load: bool,
         verbose: bool,
     ) -> Result<Self, sqlx::Error> {
         // TODO: Error type should be ConfigError
@@ -200,7 +201,10 @@ impl Valve {
     pub async fn create_missing_tables(&mut self, verbose: bool) -> Result<&mut Self, sqlx::Error> {
         // DatabaseError
+
+        // TODO: Revisit the implementation of this once the configure_db() function has been
+        // refactored. Currently it implicitly drops and recreates _all_ tables but eventually this
+        // function needs to do this only for _missing_ tables.
         let mut tables_config = self
@@ -217,7 +221,6 @@ impl Valve {
         let pool = self.pool.as_ref().unwrap();
         let parser = StartParser::new();

-        // TODO: Revisit this once the configure_db() function has been refactored:
         let (_, _) = configure_db(
             &mut tables_config,
             &mut datatypes_config,
@@ -269,13 +271,58 @@ impl Valve {
     /// If `validate` is false, just try to insert all rows.
     /// Return an error on database problem,
     /// including database conflicts that prevent rows being inserted.
-    pub fn load_all_tables(&self, validate: bool) -> Result<&Self, sqlx::Error> {
-        // YOU ARE HERE.
-
+    pub async fn load_all_tables(
+        &mut self,
+        validate: bool,
+        verbose: bool,
+        initial_load: bool,
+    ) -> Result<&mut Self, sqlx::Error> {
         // DatabaseError
-        //self.create_missing_tables();
+
+        self.create_missing_tables(verbose);
         //self.truncate_all_tables();
-        todo!();
+        if let Some(pool) = &self.pool {
+            if pool.any_kind() == AnyKind::Sqlite {
+                sqlx_query("PRAGMA foreign_keys = ON").execute(pool).await?;
+                if initial_load {
+                    // These pragmas are unsafe but they are used during initial loading since data
+                    // integrity is not a priority in this case.
+                    sqlx_query("PRAGMA journal_mode = OFF")
+                        .execute(pool)
+                        .await?;
+                    sqlx_query("PRAGMA synchronous = 0").execute(pool).await?;
+                    sqlx_query("PRAGMA cache_size = 1000000")
+                        .execute(pool)
+                        .await?;
+                    sqlx_query("PRAGMA temp_store = MEMORY")
+                        .execute(pool)
+                        .await?;
+                }
+            }
+
+            if verbose {
+                eprintln!(
+                    "{} - Processing {} tables.",
+                    Utc::now(),
+                    self.global_config
+                        .get("sorted_table_list")
+                        .and_then(|l| l.as_array())
+                        .unwrap()
+                        .len()
+                );
+            }
+            load_db(
+                &self.global_config,
+                &pool,
+                &self.compiled_datatype_conditions,
+                &self.compiled_rule_conditions,
+                verbose,
+            )
+            .await?;
+        } else {
+            eprintln!("WARN: Attempt to load tables but Valve is not connected to a database.");
+        }
+
         Ok(self)
     }

diff --git a/src/main.rs b/src/main.rs
index 7e61aba4..4c919167 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -7,6 +7,7 @@ use argparse::{ArgumentParser, Store, StoreTrue};
 use ontodev_valve::{
     get_compiled_datatype_conditions, get_compiled_rule_conditions,
     get_parsed_structure_conditions, valve, valve_grammar::StartParser, ValveCommand,
+    Valve
 };
 use serde_json::{from_str, Value as SerdeValue};
 use std::{env, process};
@@ -156,15 +157,19 @@ async fn main() -> Result<(), sqlx::Error> {
         )
         .await?;
     } else {
-        valve(
-            &source,
-            &destination,
-            &ValveCommand::Load,
-            verbose,
-            initial_load,
-            &config_table,
-        )
-        .await?;
+        let mut valve = Valve::build(&source, &config_table, &destination, verbose).await?;
+        valve.connect(&destination).await?;
+        valve.load_all_tables(true, verbose, initial_load).await?;
+        // valve(
+        //     &source,
+        //     &destination,
+        //     &ValveCommand::Load,
+        //     verbose,
+        //     initial_load,
+        //     &config_table,
+        // )
+        // .await?;
     }

     Ok(())

From 8dceabf780e8f1943f20baa00b13dde97eb1ba54 Mon Sep 17 00:00:00 2001
From: Michael Cuffaro
Date: Mon, 27 Nov 2023 09:14:36 -0500
Subject: [PATCH 39/48] fix small bug in call to create_missing_tables()

---
 src/lib.rs  | 2 +-
 src/main.rs | 1 -
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/lib.rs b/src/lib.rs
index 4eefe7e9..5bc104bf 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -279,7 +279,7 @@ impl Valve {
     ) -> Result<&mut Self, sqlx::Error> {
         // DatabaseError

-        self.create_missing_tables(verbose);
+        self.create_missing_tables(verbose).await?;
         //self.truncate_all_tables();
         if let Some(pool) = &self.pool {
             if pool.any_kind() == AnyKind::Sqlite {
diff --git a/src/main.rs b/src/main.rs
index 4c919167..486cb522 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -159,7 +159,6 @@ async fn main() -> Result<(), sqlx::Error> {
     } else {
         let mut valve = Valve::build(&source, &config_table, &destination, verbose).await?;
         valve.connect(&destination).await?;
-        valve.create_missing_tables(verbose).await?;
         valve.load_all_tables(true, verbose, initial_load).await?;
         // valve(
         //     &source,

From f4495be667609d30bee4039a07e9f9f201f7cb56 Mon Sep 17 00:00:00 2001
From: Michael Cuffaro
Date: Mon, 27 Nov 2023 11:49:34 -0500
Subject: [PATCH 40/48] do not store label if it is the same as the normalized
 column name, and replace consecutive illegal characters with a single
 underscore (and remove leading and trailing underscores).

---
 scripts/guess.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/scripts/guess.py b/scripts/guess.py
index 61e4ea14..0f9ab864 100755
--- a/scripts/guess.py
+++ b/scripts/guess.py
@@ -55,7 +55,7 @@ def get_random_sample(table, sample_size):
     for i in sample_row_numbers:
         for label, value in rows[i].items():
             if label not in sample:
-                ncolumn = re.sub(pattern, "", label).casefold()
+                ncolumn = re.sub(pattern, "_", label).casefold().strip("_")
                 if has_ncolumn(sample, ncolumn):
                     print(
                         "The data has more than one column with the normalized name "
@@ -465,7 +465,7 @@ def get_from(target, potential_foreign_columns):
             row = [
                 f"{table}",
                 f"{sample[label]['normalized']}",
-                f"{label}",
+                f"{label if label != sample[label]['normalized'] else ''}",
                 f"{sample[label].get('nulltype', '')}",
                 f"{sample[label]['datatype']}",
                 f"{sample[label].get('structure', '')}",
@@ -504,7 +504,7 @@ def get_from(target, potential_foreign_columns):
                 f"{row_number}",
                 f"'{table}'",
                 f"'{sample[label]['normalized']}'",
-                f"'{label}'",
+                f"'{label}'" if label != sample[label]["normalized"] else "NULL",
                 f"'{sample[label]['nulltype']}'" if sample[label].get("nulltype") else "NULL",
                 f"'{sample[label]['datatype']}'",
                 f"'{sample[label]['structure']}'" if sample[label].get("structure") else "NULL",
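For orientation, here is a minimal end-to-end sketch of the struct-based API as it stands at this point in the series, mirroring the call sequence that src/main.rs uses after patch 38. The file paths and flag values below are placeholders, and the async runtime wrapping this function is assumed to be whatever the binary already uses:

    // Sketch only: configure, connect, and load, following src/main.rs above.
    use ontodev_valve::Valve;

    async fn run() -> Result<(), sqlx::Error> {
        // Read the table table ("src/table.tsv" is a placeholder path) and
        // assemble the global config; the second argument names the table table:
        let mut valve = Valve::build("src/table.tsv", "table", "build/valve.db", false).await?;
        // Open the database connection that subsequent calls will use:
        valve.connect("build/valve.db").await?;
        // Create any missing tables, then validate and load every configured
        // table (validate = true, verbose = false, initial_load = false):
        valve.load_all_tables(true, false, false).await?;
        Ok(())
    }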
csv::ReaderBuilder::new() + .has_headers(false) + .delimiter(b'\t') + .from_reader(table_file), + } + }; if verbose { eprintln!( "{} - Loading table {}/{}: {}", @@ -4413,21 +4434,6 @@ async fn load_db( ); } table_num += 1; - let path = String::from( - config - .get("table") - .and_then(|t| t.as_object()) - .and_then(|o| o.get(&table_name)) - .and_then(|n| n.get("path")) - .and_then(|p| p.as_str()) - .unwrap(), - ); - let mut rdr = csv::ReaderBuilder::new() - .has_headers(false) - .delimiter(b'\t') - .from_reader(File::open(path.clone()).unwrap_or_else(|err| { - panic!("Unable to open '{}': {}", path.clone(), err); - })); // Extract the headers, which we will need later: let mut records = rdr.records(); From 81800669a70d2b416e09904163d547dc00f0e70c Mon Sep 17 00:00:00 2001 From: Michael Cuffaro Date: Sun, 26 Nov 2023 14:48:15 -0500 Subject: [PATCH 42/48] add ValveRow alias --- src/lib.rs | 43 ++++++++++++++++++++++--------------------- src/validate.rs | 22 +++++++++++----------- 2 files changed, 33 insertions(+), 32 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 638770a0..ba4b2167 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -77,10 +77,11 @@ lazy_static! { static ref SL_SQL_TYPES: Vec<&'static str> = vec!["text", "numeric", "integer", "real"]; } -/// An alias for [serde_json::Map](..//serde_json/struct.Map.html). +/// Aliases for [serde_json::Map](..//serde_json/struct.Map.html). // Note: serde_json::Map is // [backed by a BTreeMap by default](https://docs.serde.rs/serde_json/map/index.html) pub type SerdeMap = serde_json::Map; +pub type ValveRow = serde_json::Map; /// Represents a structure such as those found in the `structure` column of the `column` table in /// both its parsed format (i.e., as an [Expression](ast/enum.Expression.html)) as well as in its @@ -1444,7 +1445,7 @@ pub async fn get_affected_rows( global_config: &SerdeMap, pool: &AnyPool, tx: &mut Transaction<'_, sqlx::Any>, -) -> Result, String> { +) -> Result, String> { // Since the consequence of an update could involve currently invalid rows // (in the conflict table) becoming valid or vice versa, we need to check rows for // which the value of the column is the same as `value` @@ -1470,7 +1471,7 @@ pub async fn get_affected_rows( .await .map_err(|e| e.to_string())? 
{ - let mut table_row = SerdeMap::new(); + let mut table_row = ValveRow::new(); let mut row_number: Option = None; for column in row.columns() { let cname = column.name(); @@ -1508,7 +1509,7 @@ pub async fn get_row_from_db( tx: &mut Transaction<'_, sqlx::Any>, table: &str, row_number: &u32, -) -> Result { +) -> Result { let sql = format!( "{} WHERE row_number = {}", query_with_message_values(table, global_config, pool), @@ -1541,7 +1542,7 @@ pub async fn get_row_from_db( } }; - let mut row = SerdeMap::new(); + let mut row = ValveRow::new(); for column in sql_row.columns() { let cname = column.name(); if !vec!["row_number", "message"].contains(&cname) { @@ -1649,7 +1650,7 @@ pub async fn get_rows_to_update( ), String, > { - fn get_cell_value(row: &SerdeMap, column: &str) -> Result { + fn get_cell_value(row: &ValveRow, column: &str) -> Result { match row.get(column).and_then(|cell| cell.get("value")) { Some(SerdeValue::String(s)) => Ok(format!("{}", s)), Some(SerdeValue::Number(n)) => Ok(format!("{}", n)), @@ -1900,8 +1901,8 @@ pub async fn record_row_change( tx: &mut Transaction<'_, sqlx::Any>, table: &str, row_number: &u32, - from: Option<&SerdeMap>, - to: Option<&SerdeMap>, + from: Option<&ValveRow>, + to: Option<&ValveRow>, user: &str, ) -> Result<(), sqlx::Error> { if let (None, None) = (from, to) { @@ -1910,8 +1911,8 @@ pub async fn record_row_change( )); } - fn to_text(smap: Option<&SerdeMap>, quoted: bool) -> String { - match smap { + fn to_text(row: Option<&ValveRow>, quoted: bool) -> String { + match row { None => "NULL".to_string(), Some(r) => { let inner = format!("{}", json!(r)).replace("'", "''"); @@ -1932,7 +1933,7 @@ pub async fn record_row_change( } } - fn summarize(from: Option<&SerdeMap>, to: Option<&SerdeMap>) -> Result { + fn summarize(from: Option<&ValveRow>, to: Option<&ValveRow>) -> Result { // Constructs a summary of the form: // { // "column":"bar", @@ -2420,7 +2421,7 @@ pub async fn insert_new_row( compiled_rule_conditions: &HashMap>>, pool: &AnyPool, table: &str, - row: &SerdeMap, + row: &ValveRow, new_row_number: Option, user: &str, ) -> Result { @@ -2469,7 +2470,7 @@ pub async fn insert_new_row_tx( pool: &AnyPool, tx: &mut Transaction, table: &str, - row: &SerdeMap, + row: &ValveRow, new_row_number: Option, skip_validation: bool, ) -> Result { @@ -2789,7 +2790,7 @@ pub async fn update_row( compiled_rule_conditions: &HashMap>>, pool: &AnyPool, table_name: &str, - row: &SerdeMap, + row: &ValveRow, row_number: &u32, user: &str, ) -> Result<(), sqlx::Error> { @@ -2854,7 +2855,7 @@ pub async fn update_row_tx( pool: &AnyPool, tx: &mut Transaction, table: &str, - row: &SerdeMap, + row: &ValveRow, row_number: &u32, skip_validation: bool, do_not_recurse: bool, @@ -2967,10 +2968,10 @@ pub async fn update_row_tx( Ok(()) } -/// Given a path, read a TSV file and return a vector of rows represented as SerdeMaps. +/// Given a path, read a TSV file and return a vector of rows represented as ValveRows. /// Note: Use this function to read "small" TSVs only. In particular, use this for the special /// configuration tables. 
-fn read_tsv_into_vector(path: &str) -> Vec { +fn read_tsv_into_vector(path: &str) -> Vec { let mut rdr = csv::ReaderBuilder::new() .delimiter(b'\t') @@ -2981,7 +2982,7 @@ fn read_tsv_into_vector(path: &str) -> Vec { let rows: Vec<_> = rdr .deserialize() .map(|result| { - let row: SerdeMap = result.expect(format!("Error reading: {}", path).as_str()); + let row: ValveRow = result.expect(format!("Error reading: {}", path).as_str()); row }) .collect(); @@ -3010,8 +3011,8 @@ fn read_tsv_into_vector(path: &str) -> Vec { } /// Given a database at the specified location, query the "table" table and return a vector of rows -/// represented as SerdeMaps. -fn read_db_table_into_vector(database: &str, config_table: &str) -> Vec { +/// represented as ValveRows. +fn read_db_table_into_vector(database: &str, config_table: &str) -> Vec { let connection_options; if database.starts_with("postgresql://") { connection_options = AnyConnectOptions::from_str(database).unwrap(); @@ -3036,7 +3037,7 @@ fn read_db_table_into_vector(database: &str, config_table: &str) -> Vec, + pub row: Option, } /// Given a config map, maps of compiled datatype and rule conditions, a database connection @@ -62,10 +62,10 @@ pub async fn validate_row( pool: &AnyPool, tx: Option<&mut Transaction<'_, sqlx::Any>>, table_name: &str, - row: &SerdeMap, + row: &ValveRow, row_number: Option, query_as_if: Option<&QueryAsIf>, -) -> Result { +) -> Result { // Fallback to a default transaction if it is not given. Since we do not commit before it falls // out of scope the transaction will be rolled back at the end of this function. And since this // function is read-only the rollback is trivial and therefore inconsequential. @@ -944,10 +944,10 @@ pub fn validate_rows_intra( result_rows } -/// Given a row represented as a SerdeMap, remove any duplicate messages from the row's cells, so +/// Given a row represented as a ValveRow, remove any duplicate messages from the row's cells, so /// that no cell has messages with the same level, rule, and message text. -fn remove_duplicate_messages(row: &SerdeMap) -> Result { - let mut deduped_row = SerdeMap::new(); +fn remove_duplicate_messages(row: &ValveRow) -> Result { + let mut deduped_row = ValveRow::new(); for (column_name, cell) in row.iter() { let mut messages = cell .get("messages") @@ -981,12 +981,12 @@ fn remove_duplicate_messages(row: &SerdeMap) -> Result { Ok(deduped_row) } -/// Given a result row, convert it to a SerdeMap and return it. +/// Given a result row, convert it to a ValveRow and return it. /// Note that if the incoming result row has an associated row_number, this is ignored. -fn result_row_to_config_map(incoming: &ResultRow) -> SerdeMap { - let mut outgoing = SerdeMap::new(); +fn result_row_to_config_map(incoming: &ResultRow) -> ValveRow { + let mut outgoing = ValveRow::new(); for (column, cell) in incoming.contents.iter() { - let mut cell_map = SerdeMap::new(); + let mut cell_map = ValveRow::new(); if let Some(nulltype) = &cell.nulltype { cell_map.insert( "nulltype".to_string(), From 529dd28ebc03c3958dbb40291600052c0742c535 Mon Sep 17 00:00:00 2001 From: Michael Cuffaro Date: Sun, 26 Nov 2023 15:15:25 -0500 Subject: [PATCH 43/48] add stubs for new API --- src/lib.rs | 192 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 192 insertions(+) diff --git a/src/lib.rs b/src/lib.rs index ba4b2167..1d056e99 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -83,6 +83,198 @@ lazy_static! 
{ pub type SerdeMap = serde_json::Map; pub type ValveRow = serde_json::Map; +pub struct Valve { + global_config: SerdeMap, + compiled_datatype_conditions: HashMap, + compiled_rule_conditions: HashMap>>, + pool: AnyPool, + user: String, +} + +impl Valve { + /// Given a path to a table table, + /// read it, configure VALVE, and return a new Valve struct. + /// Return an error if reading or configuration fails. + pub fn build(mut self, table_path: &str) -> Result { + // Should be ConfigError + todo!(); + Ok(self) + } + + /// Set the user name for this instance. + /// The username must be a short string without newlines. + /// Return an error on invalid username. + pub fn set_user(mut self, user: &str) -> Result { + // ConfigError + todo!(); + Ok(self) + } + + /// Given a database connection string, + /// create a database connection for VALVE to use. + /// Drop and replace any current database connection. + /// Return an error if the connection cannot be created. + pub fn connect(mut self, connection: &str) -> Result { + // DatabaseError + todo!(); + Ok(self) + } + + /// Create all configured database tables and views + /// if they do not already exist as configured. + /// Return an error on database problems. + pub fn create_all_tables(mut self) -> Result { + // DatabaseError + todo!(); + Ok(self) + } + + /// Drop all configured tables, in reverse dependency order. + /// Return an error on database problem. + pub fn drop_all_tables(self) -> Result { + // DatabaseError + todo!(); + Ok(self) + } + + /// Given a vector of table names, + /// drop those tables, in the given order. + /// Return an error on invalid table name or database problem. + pub fn drop_tables(self, tables: Vec<&str>) -> Result { + // DatabaseError + todo!(); + Ok(self) + } + + /// Truncate all configured tables, in reverse dependency order. + /// Return an error on database problem. + pub fn truncate_all_tables(self) -> Result { + // DatabaseError + todo!(); + Ok(self) + } + + /// Given a vector of table names, + /// truncate those tables, in the given order. + /// Return an error on invalid table name or database problem. + pub fn truncate_tables(self, tables: Vec<&str>) -> Result { + // ConfigOrDatabaseError + //self.create_all_tables(); + todo!(); + Ok(self) + } + + /// Load all configured tables in dependency order. + /// If `validate` is false, just try to insert all rows. + /// Return an error on database problem, + /// including database conflicts that prevent rows being inserted. + pub fn load_all_tables(self, validate: bool) -> Result { + // DatabaseError + //self.create_all_tables(); + //self.truncate_all_tables(); + todo!(); + Ok(self) + } + + /// Given a vector of table names, + /// load those tables in the given order. + /// If `validate` is false, just try to insert all rows. + /// Return an error on invalid table name or database problem. + pub fn load_tables(self, tables: Vec<&str>, validate: bool) -> Result { + // ConfigOrDatabaseError + //self.create_all_tables(); + //self.truncate_tables(tables); + todo!(); + Ok(self) + } + + /// Save all configured tables to their 'path's. + /// Return an error on writing or database problem. + pub fn save_all_tables(self) -> Result { + // WriteOrDatabaseError + todo!(); + Ok(self) + } + + /// Given a vector of table names, + /// Save thosee tables to their 'path's, in the given order. + /// Return an error on writing or database problem. 
+    pub fn save_tables(self, tables: Vec<&str>) -> Result<Self, sqlx::Error> {
+        // WriteOrDatabaseError
+        todo!();
+        Ok(self)
+    }
+
+    /// Given a table name and a row as JSON,
+    /// return the validated row.
+    /// Return an error on database problem.
+    pub fn validate_row(self, table_name: &str, row: &ValveRow) -> Result<ValveRow, sqlx::Error> {
+        // DatabaseError
+        todo!();
+    }
+
+    /// Given a table name and a row as JSON,
+    /// add the row to the table in the database,
+    /// and return the validated row, including its new row_number.
+    /// Return an error on invalid table name or database problem.
+    pub fn insert_row(self, table_name: &str, row: &ValveRow) -> Result<ValveRow, sqlx::Error> {
+        // ConfigOrDatabaseError
+        todo!();
+    }
+
+    /// Given a table name, a row number, and a row as JSON,
+    /// update the row in the database,
+    /// and return the validated row.
+    /// Return an error on invalid table name or row number or database problem.
+    pub fn update_row(
+        self,
+        table_name: &str,
+        row_number: usize,
+        row: &ValveRow,
+    ) -> Result<ValveRow, sqlx::Error> {
+        // ConfigOrDatabaseError
+        todo!();
+    }
+
+    /// Given a table name and a row number,
+    /// delete that row from the table.
+    /// Return an error on invalid table name or row number or database problem.
+    pub fn delete_row(self, table_name: &str, row_number: usize) -> Result<(), sqlx::Error> {
+        // ConfigOrDatabaseError
+        todo!();
+    }
+
+    /// Return the next change to undo, or None.
+    /// Return an error on database problem.
+    pub fn get_record_to_undo(self) -> Result<Option<SerdeValue>, sqlx::Error> {
+        // DatabaseError
+        todo!();
+    }
+
+    /// Return the next change to redo, or None.
+    /// Return an error on database problem.
+    pub fn get_record_to_redo(self) -> Result<Option<SerdeValue>, sqlx::Error> {
+        // DatabaseError
+        todo!();
+    }
+
+    /// Undo one change and return the change record
+    /// or None if there was no change to undo.
+    /// Return an error on database problem.
+    pub fn undo(self) -> Result<Option<SerdeValue>, sqlx::Error> {
+        // DatabaseError
+        todo!();
+    }
+
+    /// Redo one change and return the change record
+    /// or None if there was no change to redo.
+    /// Return an error on database problem.
+    pub fn redo(self) -> Result<Option<SerdeValue>, sqlx::Error> {
+        // DatabaseError
+        todo!();
+    }
+}
+
 /// Represents a structure such as those found in the `structure` column of the `column` table in
 /// both its parsed format (i.e., as an [Expression](ast/enum.Expression.html)) as well as in its
 /// original format (i.e., as a plain String).
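The stubs above adopt a consuming-builder style: each method takes self by value and hands it back wrapped in a Result, so a configuration sequence chains with the ? operator. A toy illustration of the pattern only; Toy stands in for Valve, and none of the real Valve logic is involved:

    struct Toy {
        user: String,
    }

    impl Toy {
        // Mirrors the stubbed set_user(): consume self, return it on success.
        fn set_user(mut self, user: &str) -> Result<Self, String> {
            if user.contains('\n') {
                return Err("username must not contain newlines".to_string());
            }
            self.user = user.to_string();
            Ok(self)
        }

        // Mirrors the stubbed connect(): the real API would build a pool here.
        fn connect(self, _connection: &str) -> Result<Self, String> {
            Ok(self)
        }
    }

    fn main() -> Result<(), String> {
        let toy = Toy { user: "Valve".to_string() }
            .set_user("editor")?
            .connect("sqlite://valve.db?mode=rwc")?;
        println!("user = {}", toy.user);
        Ok(())
    }

Patch 45 below switches most of these signatures to &self / &mut self, which gives up this kind of one-expression chaining but avoids moving the struct on every call.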
From efeb611eac7fc4e0a93fb20c56981713ae1a2595 Mon Sep 17 00:00:00 2001
From: Michael Cuffaro
Date: Sun, 26 Nov 2023 19:27:17 -0500
Subject: [PATCH 44/48] implement Valve::build()

---
 src/lib.rs | 116 ++++++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 105 insertions(+), 11 deletions(-)

diff --git a/src/lib.rs b/src/lib.rs
index 1d056e99..8407b621 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -83,22 +83,116 @@ lazy_static! {
 pub type SerdeMap = serde_json::Map<String, SerdeValue>;
 pub type ValveRow = serde_json::Map<String, SerdeValue>;
 
+#[derive(Debug)]
 pub struct Valve {
-    global_config: SerdeMap,
-    compiled_datatype_conditions: HashMap<String, CompiledCondition>,
-    compiled_rule_conditions: HashMap<String, HashMap<String, Vec<ColumnRule>>>,
-    pool: AnyPool,
-    user: String,
+    pub global_config: SerdeMap,
+    pub compiled_datatype_conditions: HashMap<String, CompiledCondition>,
+    pub compiled_rule_conditions: HashMap<String, HashMap<String, Vec<ColumnRule>>>,
+    pub pool: Option<AnyPool>,
+    pub user: String,
 }
 
+// TODO NEXT: Move the existing public functions into this interface:
 impl Valve {
-    /// Given a path to a table table,
-    /// read it, configure VALVE, and return a new Valve struct.
+    /// Given a path to a table table and its name, read the table table, configure VALVE
+    /// partially ... TODO: finish this.
+    /// , and return a new Valve struct.
     /// Return an error if reading or configuration fails.
-    pub fn build(mut self, table_path: &str) -> Result<Self, sqlx::Error> {
+    pub async fn build(
+        table_path: &str,
+        config_table: &str,
+        // TODO: We need to refactor configure_db() so that it no longer collects the constraints
+        // configuration. We will do that in read_config_files() instead.
+        // Once this is implemented, the code below to construct the AnyPool which is used to
+        // call configure_db() should be removed.
+        // We will also remove the `database`, `initial_load` and `verbose` parameters.
+        database: &str,
+        initial_load: bool,
+        verbose: bool,
+    ) -> Result<Self, sqlx::Error> {
         // Should be ConfigError
-        todo!();
-        Ok(self)
+
+        let parser = StartParser::new();
+
+        let (specials_config, mut tables_config, mut datatypes_config, rules_config) =
+            read_config_files(table_path, config_table);
+
+        ////////////////////////////////////////////////////////////////////////////////////////
+        // TODO: Remove this block of code later (see comment above)
+        let connection_options;
+        if database.starts_with("postgresql://") {
+            connection_options = AnyConnectOptions::from_str(database)?;
+        } else {
+            let connection_string;
+            if !database.starts_with("sqlite://") {
+                connection_string = format!("sqlite://{}?mode=rwc", database);
+            } else {
+                connection_string = database.to_string();
+            }
+            connection_options = AnyConnectOptions::from_str(connection_string.as_str()).unwrap();
+        }
+
+        let pool = AnyPoolOptions::new()
+            .max_connections(5)
+            .connect_with(connection_options)
+            .await?;
+
+        let (sorted_table_list, constraints_config) = configure_db(
+            &mut tables_config,
+            &mut datatypes_config,
+            &pool,
+            &parser,
+            verbose,
+            &ValveCommand::Config,
+        )
+        .await?;
+        ////////////////////////////////////////////////////////////////////////////////////////
+
+        let mut global_config = SerdeMap::new();
+        global_config.insert(
+            String::from("special"),
+            SerdeValue::Object(specials_config.clone()),
+        );
+        global_config.insert(
+            String::from("table"),
+            SerdeValue::Object(tables_config.clone()),
+        );
+        global_config.insert(
+            String::from("datatype"),
+            SerdeValue::Object(datatypes_config.clone()),
+        );
+        global_config.insert(
+            String::from("rule"),
+            SerdeValue::Object(rules_config.clone()),
+        );
+        global_config.insert(
+            String::from("constraints"),
+            SerdeValue::Object(constraints_config.clone()),
+        );
+        let mut sorted_table_serdevalue_list: Vec<SerdeValue> = vec![];
+        for table in &sorted_table_list {
+            sorted_table_serdevalue_list.push(SerdeValue::String(table.to_string()));
+        }
+        global_config.insert(
+            String::from("sorted_table_list"),
+            SerdeValue::Array(sorted_table_serdevalue_list),
+        );
+
+        let compiled_datatype_conditions =
+            get_compiled_datatype_conditions(&global_config, &parser);
+        let compiled_rule_conditions = get_compiled_rule_conditions(
+            &global_config,
+            compiled_datatype_conditions.clone(),
+            &parser,
+        );
+
+        Ok(Self {
+            global_config: global_config,
+            compiled_datatype_conditions: compiled_datatype_conditions,
+            compiled_rule_conditions: compiled_rule_conditions,
+            pool: None,
+            user: String::from("Valve"),
+        })
     }
 
     /// Set the user name for this instance.
@@ -106,7 +200,7 @@ impl Valve {
     /// Return an error on invalid username.
     pub fn set_user(mut self, user: &str) -> Result<Self, sqlx::Error> {
         // ConfigError
-        todo!();
+        self.user = user.to_string();
         Ok(self)
     }
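The global_config assembled by build() is an ordinary SerdeMap with five object-valued keys plus a sorted table list. For orientation, the same shape can be written declaratively with serde_json's json! macro; the leaf values below are placeholders, not the real configuration:

    use serde_json::json;

    fn main() {
        let global_config = json!({
            "special": {},
            "table": {},
            "datatype": {},
            "rule": {},
            "constraints": {},
            "sorted_table_list": ["table", "column", "datatype"],
        });
        // .as_object() yields the same serde_json::Map that the insert()
        // calls in build() construct by hand.
        assert!(global_config.as_object().unwrap().contains_key("sorted_table_list"));
    }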
From 1c4980821e7a93234b49ac7bb26843969a1d5e89 Mon Sep 17 00:00:00 2001
From: Michael Cuffaro
Date: Mon, 27 Nov 2023 08:23:25 -0500
Subject: [PATCH 45/48] refactor, fix api sigs, implement Valve::connect() and
 Valve::create_tables()

---
 src/lib.rs | 112 +++++++++++++++++++++++++++++++++--------------------
 1 file changed, 71 insertions(+), 41 deletions(-)

diff --git a/src/lib.rs b/src/lib.rs
index 8407b621..fd14adf4 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -95,7 +95,7 @@ pub struct Valve {
 // TODO NEXT: Move the existing public functions into this interface:
 impl Valve {
     /// Given a path to a table table and its name, read the table table, configure VALVE
-    /// partially ... TODO: finish this.
+    /// partially ... TODO: finish rewriting this doc string.
     /// , and return a new Valve struct.
     /// Return an error if reading or configuration fails.
     pub async fn build(
@@ -110,7 +110,7 @@ impl Valve {
         initial_load: bool,
         verbose: bool,
     ) -> Result<Self, sqlx::Error> {
-        // Should be ConfigError
+        // TODO: Error type should be ConfigError
 
         let parser = StartParser::new();
 
@@ -119,24 +119,7 @@ impl Valve {
 
         ////////////////////////////////////////////////////////////////////////////////////////
         // TODO: Remove this block of code later (see comment above)
-        let connection_options;
-        if database.starts_with("postgresql://") {
-            connection_options = AnyConnectOptions::from_str(database)?;
-        } else {
-            let connection_string;
-            if !database.starts_with("sqlite://") {
-                connection_string = format!("sqlite://{}?mode=rwc", database);
-            } else {
-                connection_string = database.to_string();
-            }
-            connection_options = AnyConnectOptions::from_str(connection_string.as_str()).unwrap();
-        }
-
-        let pool = AnyPoolOptions::new()
-            .max_connections(5)
-            .connect_with(connection_options)
-            .await?;
-
+        let pool = get_pool_from_connection_string(database).await?;
         let (sorted_table_list, constraints_config) = configure_db(
             &mut tables_config,
             &mut datatypes_config,
@@ -198,7 +181,7 @@ impl Valve {
     /// Set the user name for this instance.
     /// The username must be a short string without newlines.
     /// Return an error on invalid username.
-    pub fn set_user(mut self, user: &str) -> Result<Self, sqlx::Error> {
+    pub fn set_user(&mut self, user: &str) -> Result<&mut Self, sqlx::Error> {
         // ConfigError
         self.user = user.to_string();
         Ok(self)
@@ -208,24 +191,48 @@ impl Valve {
     /// create a database connection for VALVE to use.
     /// Drop and replace any current database connection.
     /// Return an error if the connection cannot be created.
-    pub fn connect(mut self, connection: &str) -> Result<Self, sqlx::Error> {
+    pub async fn connect(&mut self, connection: &str) -> Result<&mut Self, sqlx::Error> {
         // DatabaseError
-        todo!();
+        self.pool = Some(get_pool_from_connection_string(connection).await?);
         Ok(self)
     }
 
     /// Create all configured database tables and views
     /// if they do not already exist as configured.
     /// Return an error on database problems.
-    pub fn create_all_tables(mut self) -> Result<Self, sqlx::Error> {
+    pub async fn create_all_tables(&mut self, verbose: bool) -> Result<&mut Self, sqlx::Error> {
         // DatabaseError
-        todo!();
+        let mut tables_config = self
+            .global_config
+            .get_mut("table")
+            .and_then(|t| t.as_object_mut())
+            .unwrap();
+        let mut tables_config = tables_config.clone();
+        let mut datatypes_config = self
+            .global_config
+            .get_mut("datatype")
+            .and_then(|d| d.as_object_mut())
+            .unwrap();
+        let mut datatypes_config = datatypes_config.clone();
+        let pool = self.pool.as_ref().unwrap();
+        let parser = StartParser::new();
+
+        // TODO: Revisit this once the configure_db() function has been refactored:
+        let (_, _) = configure_db(
+            &mut tables_config,
+            &mut datatypes_config,
+            &pool,
+            &parser,
+            verbose,
+            &ValveCommand::Create,
+        )
+        .await?;
         Ok(self)
     }
 
     /// Drop all configured tables, in reverse dependency order.
     /// Return an error on database problem.
-    pub fn drop_all_tables(self) -> Result<Self, sqlx::Error> {
+    pub fn drop_all_tables(&self) -> Result<&Self, sqlx::Error> {
         // DatabaseError
         todo!();
         Ok(self)
     }
@@ -234,7 +241,7 @@ impl Valve {
     /// Given a vector of table names,
     /// drop those tables, in the given order.
     /// Return an error on invalid table name or database problem.
-    pub fn drop_tables(self, tables: Vec<&str>) -> Result<Self, sqlx::Error> {
+    pub fn drop_tables(&self, tables: Vec<&str>) -> Result<&Self, sqlx::Error> {
         // DatabaseError
         todo!();
         Ok(self)
     }
 
     /// Truncate all configured tables, in reverse dependency order.
     /// Return an error on database problem.
-    pub fn truncate_all_tables(self) -> Result<Self, sqlx::Error> {
+    pub fn truncate_all_tables(&self) -> Result<&Self, sqlx::Error> {
         // DatabaseError
         todo!();
         Ok(self)
     }
@@ -251,7 +258,7 @@ impl Valve {
     /// Given a vector of table names,
     /// truncate those tables, in the given order.
     /// Return an error on invalid table name or database problem.
-    pub fn truncate_tables(self, tables: Vec<&str>) -> Result<Self, sqlx::Error> {
+    pub fn truncate_tables(&self, tables: Vec<&str>) -> Result<&Self, sqlx::Error> {
         // ConfigOrDatabaseError
         //self.create_all_tables();
         todo!();
@@ -262,7 +269,7 @@ impl Valve {
     /// If `validate` is false, just try to insert all rows.
     /// Return an error on database problem,
     /// including database conflicts that prevent rows being inserted.
-    pub fn load_all_tables(self, validate: bool) -> Result<Self, sqlx::Error> {
+    pub fn load_all_tables(&self, validate: bool) -> Result<&Self, sqlx::Error> {
         // DatabaseError
         //self.create_all_tables();
         //self.truncate_all_tables();
@@ -274,7 +281,7 @@ impl Valve {
     /// load those tables in the given order.
     /// If `validate` is false, just try to insert all rows.
     /// Return an error on invalid table name or database problem.
-    pub fn load_tables(self, tables: Vec<&str>, validate: bool) -> Result<Self, sqlx::Error> {
+    pub fn load_tables(&self, tables: Vec<&str>, validate: bool) -> Result<&Self, sqlx::Error> {
         // ConfigOrDatabaseError
         //self.create_all_tables();
         //self.truncate_tables(tables);
@@ -284,7 +291,7 @@ impl Valve {
 
     /// Save all configured tables to their 'path's.
     /// Return an error on writing or database problem.
-    pub fn save_all_tables(self) -> Result<Self, sqlx::Error> {
+    pub fn save_all_tables(&self) -> Result<&Self, sqlx::Error> {
         // WriteOrDatabaseError
         todo!();
         Ok(self)
     }
 
     /// Given a vector of table names,
     /// save those tables to their 'path's, in the given order.
     /// Return an error on writing or database problem.
-    pub fn save_tables(self, tables: Vec<&str>) -> Result<Self, sqlx::Error> {
+    pub fn save_tables(&self, tables: Vec<&str>) -> Result<&Self, sqlx::Error> {
         // WriteOrDatabaseError
         todo!();
         Ok(self)
@@ -302,7 +309,7 @@ impl Valve {
     /// Given a table name and a row as JSON,
     /// return the validated row.
     /// Return an error on database problem.
-    pub fn validate_row(self, table_name: &str, row: &ValveRow) -> Result<ValveRow, sqlx::Error> {
+    pub fn validate_row(&self, table_name: &str, row: &ValveRow) -> Result<ValveRow, sqlx::Error> {
         // DatabaseError
         todo!();
     }
@@ -311,7 +318,7 @@ impl Valve {
     /// add the row to the table in the database,
     /// and return the validated row, including its new row_number.
     /// Return an error on invalid table name or database problem.
-    pub fn insert_row(self, table_name: &str, row: &ValveRow) -> Result<ValveRow, sqlx::Error> {
+    pub fn insert_row(&self, table_name: &str, row: &ValveRow) -> Result<ValveRow, sqlx::Error> {
         // ConfigOrDatabaseError
         todo!();
     }
@@ -321,7 +328,7 @@ impl Valve {
     /// and return the validated row.
     /// Return an error on invalid table name or row number or database problem.
     pub fn update_row(
-        self,
+        &self,
         table_name: &str,
         row_number: usize,
         row: &ValveRow,
@@ -333,21 +340,21 @@ impl Valve {
     /// Given a table name and a row number,
     /// delete that row from the table.
     /// Return an error on invalid table name or row number or database problem.
-    pub fn delete_row(self, table_name: &str, row_number: usize) -> Result<(), sqlx::Error> {
+    pub fn delete_row(&self, table_name: &str, row_number: usize) -> Result<(), sqlx::Error> {
         // ConfigOrDatabaseError
         todo!();
     }
 
     /// Return the next change to undo, or None.
     /// Return an error on database problem.
-    pub fn get_record_to_undo(self) -> Result<Option<SerdeValue>, sqlx::Error> {
+    pub fn get_record_to_undo(&self) -> Result<Option<SerdeValue>, sqlx::Error> {
         // DatabaseError
         todo!();
     }
 
     /// Return the next change to redo, or None.
     /// Return an error on database problem.
-    pub fn get_record_to_redo(self) -> Result<Option<SerdeValue>, sqlx::Error> {
+    pub fn get_record_to_redo(&self) -> Result<Option<SerdeValue>, sqlx::Error> {
         // DatabaseError
         todo!();
     }
@@ -355,7 +362,7 @@ impl Valve {
     /// Undo one change and return the change record
     /// or None if there was no change to undo.
     /// Return an error on database problem.
-    pub fn undo(self) -> Result<Option<SerdeValue>, sqlx::Error> {
+    pub fn undo(&self) -> Result<Option<SerdeValue>, sqlx::Error> {
         // DatabaseError
         todo!();
     }
@@ -363,7 +370,7 @@ impl Valve {
     /// Redo one change and return the change record
     /// or None if there was no change to redo.
     /// Return an error on database problem.
-    pub fn redo(self) -> Result<Option<SerdeValue>, sqlx::Error> {
+    pub fn redo(&self) -> Result<Option<SerdeValue>, sqlx::Error> {
         // DatabaseError
         todo!();
     }
@@ -432,6 +439,29 @@ impl std::fmt::Debug for ColumnRule {
     }
 }
 
+/// TODO: Add docstring here. Note that once we have refactored configure_db() (see above) it may
+/// make more sense for this function to be an inner function of Valve.
+pub async fn get_pool_from_connection_string(database: &str) -> Result<AnyPool, sqlx::Error> {
+    let connection_options;
+    if database.starts_with("postgresql://") {
+        connection_options = AnyConnectOptions::from_str(database)?;
+    } else {
+        let connection_string;
+        if !database.starts_with("sqlite://") {
+            connection_string = format!("sqlite://{}?mode=rwc", database);
+        } else {
+            connection_string = database.to_string();
+        }
+        connection_options = AnyConnectOptions::from_str(connection_string.as_str()).unwrap();
+    }
+
+    let pool = AnyPoolOptions::new()
+        .max_connections(5)
+        .connect_with(connection_options)
+        .await?;
+    Ok(pool)
+}
+
 /// Given the path to a configuration table (either a table.tsv file or a database containing a
 /// table named "table"), load and check the 'table', 'column', and 'datatype' tables, and return
 /// SerdeMaps corresponding to specials, tables, datatypes, and rules.

From b5ea3a811ebe38999e63da46e21f5c209a008f94 Mon Sep 17 00:00:00 2001
From: Michael Cuffaro
Date: Mon, 27 Nov 2023 08:25:36 -0500
Subject: [PATCH 46/48] rename create_all_tables to create_missing_tables

---
 src/lib.rs | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/src/lib.rs b/src/lib.rs
index fd14adf4..73721b07 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -200,7 +200,7 @@ impl Valve {
     /// Create all configured database tables and views
     /// if they do not already exist as configured.
     /// Return an error on database problems.
-    pub async fn create_all_tables(&mut self, verbose: bool) -> Result<&mut Self, sqlx::Error> {
+    pub async fn create_missing_tables(&mut self, verbose: bool) -> Result<&mut Self, sqlx::Error> {
         // DatabaseError
         let mut tables_config = self
             .global_config
@@ -260,7 +260,7 @@ impl Valve {
     /// Return an error on invalid table name or database problem.
     pub fn truncate_tables(&self, tables: Vec<&str>) -> Result<&Self, sqlx::Error> {
         // ConfigOrDatabaseError
-        //self.create_all_tables();
+        //self.create_missing_tables();
         todo!();
         Ok(self)
     }
@@ -270,8 +270,10 @@ impl Valve {
     /// Return an error on database problem,
     /// including database conflicts that prevent rows being inserted.
     pub fn load_all_tables(&self, validate: bool) -> Result<&Self, sqlx::Error> {
+        // YOU ARE HERE.
+
         // DatabaseError
-        //self.create_all_tables();
+        //self.create_missing_tables();
         //self.truncate_all_tables();
         todo!();
         Ok(self)
@@ -283,7 +285,7 @@ impl Valve {
     /// Return an error on invalid table name or database problem.
     pub fn load_tables(&self, tables: Vec<&str>, validate: bool) -> Result<&Self, sqlx::Error> {
         // ConfigOrDatabaseError
-        //self.create_all_tables();
+        //self.create_missing_tables();
         //self.truncate_tables(tables);
         todo!();
         Ok(self)
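A note on the get_pool_from_connection_string() helper added in patch 45: bare file paths are normalized to sqlite://...?mode=rwc URLs, while postgresql:// and sqlite:// strings are parsed as given. A sketch of the intended usage, assuming the helper is in scope, sqlx's "any" drivers are enabled, and a tokio runtime is available; "valve.db" is an arbitrary example path:

    #[tokio::main]
    async fn main() -> Result<(), sqlx::Error> {
        // A bare path becomes "sqlite://valve.db?mode=rwc" internally, so the
        // database file is created if it does not yet exist.
        let pool = get_pool_from_connection_string("valve.db").await?;
        // A full URL such as "postgresql://user@localhost/valve" would be
        // parsed unchanged (and would require a running server).
        drop(pool);
        Ok(())
    }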
From 268bd2aa171bef1171fd11989fd08a8a4a00103f Mon Sep 17 00:00:00 2001
From: Michael Cuffaro
Date: Mon, 27 Nov 2023 09:07:17 -0500
Subject: [PATCH 47/48] implement (rough) load_all_tables()

---
 src/lib.rs  | 63 ++++++++++++++++++++++++++++++++++++++++++++++-------
 src/main.rs | 23 +++++++++++--------
 2 files changed, 69 insertions(+), 17 deletions(-)

diff --git a/src/lib.rs b/src/lib.rs
index 73721b07..4eefe7e9 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -105,9 +105,8 @@ impl Valve {
         // configuration. We will do that in read_config_files() instead.
         // Once this is implemented, the code below to construct the AnyPool which is used to
         // call configure_db() should be removed.
-        // We will also remove the `database`, `initial_load` and `verbose` parameters.
+        // We will also remove the `database` and `verbose` parameters.
         database: &str,
-        initial_load: bool,
         verbose: bool,
     ) -> Result<Self, sqlx::Error> {
         // TODO: Error type should be ConfigError
@@ -201,6 +200,10 @@ impl Valve {
     /// Return an error on database problems.
     pub async fn create_missing_tables(&mut self, verbose: bool) -> Result<&mut Self, sqlx::Error> {
         // DatabaseError
+
+        // TODO: Revisit the implementation of this once the configure_db() function has been
+        // refactored. Currently it implicitly drops and recreates _all_ tables but eventually this
+        // function needs to do this only for _missing_ tables.
         let mut tables_config = self
             .global_config
             .get_mut("table")
@@ -217,7 +220,6 @@ impl Valve {
         let pool = self.pool.as_ref().unwrap();
         let parser = StartParser::new();
 
-        // TODO: Revisit this once the configure_db() function has been refactored:
         let (_, _) = configure_db(
             &mut tables_config,
             &mut datatypes_config,
@@ -269,13 +271,58 @@ impl Valve {
     /// If `validate` is false, just try to insert all rows.
     /// Return an error on database problem,
     /// including database conflicts that prevent rows being inserted.
-    pub fn load_all_tables(&self, validate: bool) -> Result<&Self, sqlx::Error> {
-        // YOU ARE HERE.
-
+    pub async fn load_all_tables(
+        &mut self,
+        validate: bool,
+        verbose: bool,
+        initial_load: bool,
+    ) -> Result<&mut Self, sqlx::Error> {
         // DatabaseError
-        //self.create_missing_tables();
+
+        self.create_missing_tables(verbose);
         //self.truncate_all_tables();
-        todo!();
+        if let Some(pool) = &self.pool {
+            if pool.any_kind() == AnyKind::Sqlite {
+                sqlx_query("PRAGMA foreign_keys = ON").execute(pool).await?;
+                if initial_load {
+                    // These pragmas are unsafe but they are used during initial loading since data
+                    // integrity is not a priority in this case.
+                    sqlx_query("PRAGMA journal_mode = OFF")
+                        .execute(pool)
+                        .await?;
+                    sqlx_query("PRAGMA synchronous = 0").execute(pool).await?;
+                    sqlx_query("PRAGMA cache_size = 1000000")
+                        .execute(pool)
+                        .await?;
+                    sqlx_query("PRAGMA temp_store = MEMORY")
+                        .execute(pool)
+                        .await?;
+                }
+            }
+
+            if verbose {
+                eprintln!(
+                    "{} - Processing {} tables.",
+                    Utc::now(),
+                    self.global_config
+                        .get("sorted_table_list")
+                        .and_then(|l| l.as_array())
+                        .unwrap()
+                        .len()
+                );
+            }
+            load_db(
+                &self.global_config,
+                &pool,
+                &self.compiled_datatype_conditions,
+                &self.compiled_rule_conditions,
+                verbose,
+            )
+            .await?;
+        } else {
+            eprintln!("WARN: Attempt to load tables but Valve is not connected to a database.");
+        }
+
         Ok(self)
     }
 
diff --git a/src/main.rs b/src/main.rs
index 7e61aba4..4c919167 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -7,6 +7,7 @@ use argparse::{ArgumentParser, Store, StoreTrue};
 use ontodev_valve::{
     get_compiled_datatype_conditions, get_compiled_rule_conditions,
     get_parsed_structure_conditions, valve, valve_grammar::StartParser, ValveCommand,
+    Valve,
 };
 use serde_json::{from_str, Value as SerdeValue};
 use std::{env, process};
@@ -156,15 +157,19 @@ async fn main() -> Result<(), sqlx::Error> {
         )
         .await?;
     } else {
-        valve(
-            &source,
-            &destination,
-            &ValveCommand::Load,
-            verbose,
-            initial_load,
-            &config_table,
-        )
-        .await?;
+        let mut valve = Valve::build(&source, &config_table, &destination, verbose).await?;
+        valve.connect(&destination).await?;
+        valve.create_missing_tables(verbose).await?;
+        valve.load_all_tables(true, verbose, initial_load).await?;
+        // valve(
+        //     &source,
+        //     &destination,
+        //     &ValveCommand::Load,
+        //     verbose,
+        //     initial_load,
+        //     &config_table,
+        // )
+        // .await?;
     }
 
     Ok(())
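On the initial_load pragmas in load_all_tables(): disabling journaling and synchronous writes makes a crash unrecoverable, which is only defensible when the database can simply be rebuilt from its TSV sources, as it can here. A standalone sketch of the same batch against a throwaway in-memory SQLite database, assuming sqlx with the "sqlite" feature and a tokio runtime:

    use sqlx::sqlite::SqlitePoolOptions;

    #[tokio::main]
    async fn main() -> Result<(), sqlx::Error> {
        let pool = SqlitePoolOptions::new().connect("sqlite::memory:").await?;
        // The same speed-over-safety settings used above during initial loading.
        for pragma in [
            "PRAGMA journal_mode = OFF",
            "PRAGMA synchronous = 0",
            "PRAGMA cache_size = 1000000",
            "PRAGMA temp_store = MEMORY",
        ] {
            sqlx::query(pragma).execute(&pool).await?;
        }
        Ok(())
    }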
From 2b4073070959c839b6514df4b1bb4182da834235 Mon Sep 17 00:00:00 2001
From: Michael Cuffaro
Date: Mon, 27 Nov 2023 09:14:36 -0500
Subject: [PATCH 48/48] fix small bug in call to create_missing_tables()

---
 src/lib.rs  | 2 +-
 src/main.rs | 1 -
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/lib.rs b/src/lib.rs
index 4eefe7e9..5bc104bf 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -279,7 +279,7 @@ impl Valve {
     ) -> Result<&mut Self, sqlx::Error> {
         // DatabaseError
 
-        self.create_missing_tables(verbose);
+        self.create_missing_tables(verbose).await?;
         //self.truncate_all_tables();
         if let Some(pool) = &self.pool {
diff --git a/src/main.rs b/src/main.rs
index 4c919167..486cb522 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -159,7 +159,6 @@ async fn main() -> Result<(), sqlx::Error> {
     } else {
         let mut valve = Valve::build(&source, &config_table, &destination, verbose).await?;
         valve.connect(&destination).await?;
-        valve.create_missing_tables(verbose).await?;
         valve.load_all_tables(true, verbose, initial_load).await?;
         // valve(
         //     &source,
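The bug fixed by this last patch is easy to miss in review: Rust futures are lazy, so the earlier self.create_missing_tables(verbose); merely created a future and dropped it without ever running the body; only the added .await? actually executes the call, and the compiler flags the original form with an unused-future warning. The same mistake in miniature, assuming a tokio runtime:

    async fn create_missing_tables() {
        println!("creating tables");
    }

    #[tokio::main]
    async fn main() {
        create_missing_tables();       // warning: unused future; the body never runs
        create_missing_tables().await; // prints "creating tables"
    }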