diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index dc03e15..565a5fa 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -18,7 +18,15 @@ ], // Configure tool-specific properties. - // "customizations": {}, + "customizations": { + "vscode":{ + "extensions": [ + "timonwong.shellcheck", + "GitHub.vscode-pull-request-github", + "charliermarsh.ruff" + ] + } + }, // Uncomment to connect as root instead. More info: https://aka.ms/dev-containers-non-root. // "remoteUser": "root" diff --git a/.env.template b/.env.template new file mode 100644 index 0000000..4592aea --- /dev/null +++ b/.env.template @@ -0,0 +1,12 @@ +LOUIS_DSN= +PGBASE= +PGUSER= +USER= +PGHOST= +POSTGRES_PASSWORD= +PGPASSWORD= +OPENAI_API_KEY= +AZURE_OPENAI_SERVICE= +LOUIS_SCHEMA= +DB_SERVER_CONTAINER_NAME= +PGDATA= diff --git a/.gitignore b/.gitignore index 390df2b..c0f7379 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,4 @@ -.env +.env** .pgpassfile dumps/** reports/** diff --git a/DEVELOPER.md b/DEVELOPER.md new file mode 100644 index 0000000..2c67321 --- /dev/null +++ b/DEVELOPER.md @@ -0,0 +1,53 @@ +# Development guidelines for louis-db + +## Making changes to the database schema + +### Run the latest schema locally + +* Set up the .env environment variables + * LOUIS_DSN: Data Source Name (DSN) used for configuring a database connection in Louis's system. + + * PGBASE: the name of the target PostgreSQL database the scripts connect to (psql/pg_dump -d). + + * PGUSER: the username or role required to authenticate and access a PostgreSQL database. + + * USER: the username for the database role created locally (see bin/setup-db-docker.sh). + + * PGHOST: the hostname or IP address of the server where the PostgreSQL database is hosted. + + * PGPASSWORD: the password used for user authentication when connecting to the PostgreSQL database. + + * POSTGRES_PASSWORD: the password of the postgres superuser, passed to the database container when it is created. + + * PGDATA: path to the directory where PostgreSQL data files are stored. + + * OPENAI_API_KEY: the API key required for authentication when making requests to the OpenAI API. + + * AZURE_OPENAI_SERVICE: the name of the Azure OpenAI service to use. + + * LOUIS_SCHEMA: the Louis schema within the database. + + * DB_SERVER_CONTAINER_NAME: the name of your database server container. + +* Run the database locally (see bin/postgres.sh) +* Restore the latest schema dump + +### Before every change + +* pg_dump the schema using ```bin/backup-db-docker.sh``` + +### Create a change + +* make sure to first create a GitHub issue #X describing the work to be done +* create a branch ```issueX-descriptive-name``` +* add a new SQL file YYYY-mm-dd-issueX-descriptive-name + * explain the changes to be made in a top header comment + * provide the original DDL of the objects to be modified +* create a test case under tests/ (e.g. tests/test_db_data.py); see the sketch after this list + * load your new SQL file within a transaction (that will be rolled back) + * ensure you have an assert that tests the change +* once your test passes, commit the change to the database by running your script with bin/psql.sh + * you should now be able to remove the loading of the SQL file and still run the test successfully +* re-run the test suite and fix any failing database functions affected by your changes +* dump the new schema as louis_v00X, with X incremented by 1 +* test the new schema with your client apps.
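To make the test-case step in DEVELOPER.md concrete, here is a minimal sketch of what such a transactional test might look like. It assumes LOUIS_DSN and LOUIS_SCHEMA are set through .env as described above; the SQL file name, table name and test class are hypothetical placeholders following the YYYY-mm-dd-issueX naming convention.

```python
"""Sketch of a schema-change test: the new SQL script is applied inside a
transaction that is rolled back in tearDown, so the database is left
untouched whether or not the assert passes."""
import os
import unittest

import dotenv
import psycopg
from psycopg.rows import dict_row

dotenv.load_dotenv()


class TestIssueXChange(unittest.TestCase):
    def setUp(self):
        self.connection = psycopg.connect(os.environ["LOUIS_DSN"])
        self.cursor = self.connection.cursor(row_factory=dict_row)
        self.cursor.execute(
            f"SET search_path TO {os.environ['LOUIS_SCHEMA']}, public")

    def tearDown(self):
        # roll back so the ad hoc SQL never persists while you iterate
        self.connection.rollback()
        self.cursor.close()
        self.connection.close()

    def test_apply_change(self):
        # hypothetical file name; replace with your actual script
        with open("sql/2023-01-01-issueX-descriptive-name.sql") as f:
            self.cursor.execute(f.read())
        # hypothetical assert: check the effect of the change
        self.cursor.execute("SELECT count(*) AS count FROM some_new_table")
        self.assertEqual(self.cursor.fetchone()["count"], 0)
```

Once the change has been committed with bin/psql.sh, the open/execute of the SQL file can be removed from the test and the assert should still pass.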
diff --git a/Dockerfile b/Dockerfile index f3df76f..ac24f3d 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,7 +1,7 @@ # syntax=docker/dockerfile:1 FROM alpine RUN apk add && apk add postgresql-client -COPY docker-entrypoint.sh /entrypoint.sh +COPY bin/docker-entrypoint.sh /entrypoint.sh ENV LOUIS_DSN= ENV LOUIS_SCHEMA= ENV LOAD_DATA_ONLY= diff --git a/README.md b/README.md index 24a0794..0ca513a 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,20 @@ ## Installing python package +If you need to interface with the database, install the package with: + +``` +pip install git+https://github.com/ai-cfia/louis-db@v0.0.5-alpha3 ``` -pip install git+https://github.com/ai-cfia/louis-db@v0.0.5-alpha2 -``` \ No newline at end of file + +You'll often want to add, move or modify existing database-layer functions found in louis-db while working from a client repository. + +To edit, you can install an editable version of the package: + +``` +pip install -e git+https://github.com/ai-cfia/louis-db#egg=louis_db +``` + +This will check out the latest source into a local git checkout under src/louis-db, allowing edits in that directory to be immediately available for use by client repositories such as louis-crawler. + +Don't forget to create a PR with your changes once you're done! \ No newline at end of file diff --git a/backup-db-docker.sh b/backup-db-docker.sh deleted file mode 100755 index 0a441d2..0000000 --- a/backup-db-docker.sh +++ /dev/null @@ -1,35 +0,0 @@ -#!/bin/bash -DIRNAME=`dirname $0` -. $DIRNAME/lib.sh -TODAY=`date +%Y-%m-%d` - -if [ -z "$PGBASE" -o -z "$LOUIS_SCHEMA" ]; then - echo "Environment variables PGBASE and LOUIS_SCHEMA must be specified" - exit 1 -fi - -NAME=dumps/$LOUIS_SCHEMA.$TODAY.pg_dump -PGDUMP="docker exec -it louis-db-server pg_dump -U postgres -d $PGBASE" - -if [ ! -f "$NAME" ]; then - echo "Backing up $LOUIS_SCHEMA to $NAME" - $PGDUMP --schema=$LOUIS_SCHEMA --no-owner --no-privileges > $NAME - if [ "$?" -eq 0 ]; then - echo "Dumped to $NAME" - else - echo "Error dumping to $NAME" - cat $NAME - rm $NAME - exit 1 - fi -else - echo "File $NAME already exists" -fi - -ARCHIVE_FILENAME="$NAME.zip" -if [ ! -f "$ARCHIVE_FILENAME" ]; then - zip $ARCHIVE_FILENAME $NAME -else - echo "File $ARCHIVE_FILENAME.zip already exists" -fi - diff --git a/backup-db.sh b/backup-db.sh deleted file mode 100755 index ba1a836..0000000 --- a/backup-db.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin/bash -TODAY=`date +%Y-%m-%d` -NAME=dumps/inspection.canada.ca.$TODAY.pg_dump -if [ ! -f "$NAME" ]; then - pg_dump --no-owner --no-privileges -d inspection.canada.ca > $NAME -else - echo "File $NAME already exists" -fi - -if [ ! 
-f "$NAME.zip" ]; then - zip $NAME.zip $NAME -else - echo "File $NAME.zip already exists" -fi - diff --git a/bin/README.md b/bin/README.md new file mode 100644 index 0000000..724d279 --- /dev/null +++ b/bin/README.md @@ -0,0 +1,64 @@ +# creating a new schema + +## environment + +This assumes: + +* you are running WSL +* you are running a dockerized version of PostgreSQL 15 under WSL +* you are running louis-db in a DevContainer under Visual Studio Code +* your source is on WSL under ~/src + +## configuration + +Database connection parameters are set in the .env file. + +You can create multiple .env.NAME files and symlink them as needed: + +working on the local source: + +``` +ln -sf .env.louis_v004_local .env +``` + +switching to the target: + +``` +ln -sf .env.louis_v005_azure .env +``` + +## Running the database server locally + +* use the Dockerfile in the postgres directory +* use the ```bin/postgres.sh``` script as your startup script (symlink) + +## Editing + +* Create ad hoc modifications as scripts in sql/ with the proper YYYY-mm-dd prefix +* Create tests that apply these SQL scripts in a transaction and test them +* Once satisfied, commit the changes to the database + + + +## backing up schema and data + +In this example, the modified louis_v004 becomes the louis_v005 schema: + +``` +./bin/dump-versioned-schema.sh louis_v004 louis_v005 +./bin/dump-versioned-data.sh louis_v004 louis_v005 +``` + +## loading schema + +Change your .env to point to your target database first: + +``` +./bin/load-versioned-schema.sh louis_v005 +``` + +Manually validate that the schema is as expected (e.g. with a DBeaver ERD diagram) before loading the data: + +``` +./bin/load-versioned-data.sh louis_v005 +``` \ No newline at end of file diff --git a/bin/backup-db-docker.sh b/bin/backup-db-docker.sh new file mode 100755 index 0000000..a97d09c --- /dev/null +++ b/bin/backup-db-docker.sh @@ -0,0 +1,7 @@ +#!/bin/bash +DIRNAME=`dirname $0` +. $DIRNAME/lib.sh + +docker cp $DIRNAME/backup-db.sh louis-db-server:backup-db.sh +docker cp $DIRNAME/lib.sh louis-db-server:lib.sh +docker exec -it -e PGDUMP_FILENAME=/dev/stdout --env-file $ENV_FILE louis-db-server ./backup-db.sh > $PGDUMP_FILENAME \ No newline at end of file diff --git a/bin/backup-db.sh b/bin/backup-db.sh new file mode 100755 index 0000000..5f706c8 --- /dev/null +++ b/bin/backup-db.sh @@ -0,0 +1,19 @@ +#!/bin/bash +DIRNAME=`dirname $0` +. $DIRNAME/lib.sh + +if [ ! -f "$NAME" ]; then + echo "preparing to dump $PGBASE.$LOUIS_SCHEMA to $PGDUMP_FILENAME" + # apparently pg_dump doesn't use the environment variables PG* + pg_dump -d $PGBASE --schema=$LOUIS_SCHEMA --no-owner --no-privileges --file $PGDUMP_FILENAME +else + echo "File $PGDUMP_FILENAME already exists" +fi + +if [ -f "$PGDUMP_FILENAME" ]; then + if [ ! 
-f "$PGDUMP_FILENAME.zip" ]; then + zip $PGDUMP_FILENAME.zip $PGDUMP_FILENAME + else + echo "File $PGDUMP_FILENAME.zip already exists" + fi +fi \ No newline at end of file diff --git a/build-data-volume.sh b/bin/build-data-volume.sh similarity index 100% rename from build-data-volume.sh rename to bin/build-data-volume.sh diff --git a/build-dataloader.sh b/bin/build-dataloader.sh similarity index 100% rename from build-dataloader.sh rename to bin/build-dataloader.sh diff --git a/combine-iis-logs.sh b/bin/combine-iis-logs.sh similarity index 100% rename from combine-iis-logs.sh rename to bin/combine-iis-logs.sh diff --git a/deprecated/init-schema-louis.sh b/bin/deprecated/init-schema-louis.sh similarity index 100% rename from deprecated/init-schema-louis.sh rename to bin/deprecated/init-schema-louis.sh diff --git a/deprecated/load-db.sh b/bin/deprecated/load-db.sh similarity index 100% rename from deprecated/load-db.sh rename to bin/deprecated/load-db.sh diff --git a/deprecated/load-schema.sh b/bin/deprecated/load-schema.sh similarity index 100% rename from deprecated/load-schema.sh rename to bin/deprecated/load-schema.sh diff --git a/deprecated/migrate.sh b/bin/deprecated/migrate.sh similarity index 100% rename from deprecated/migrate.sh rename to bin/deprecated/migrate.sh diff --git a/docker-entrypoint.sh b/bin/docker-entrypoint.sh similarity index 100% rename from docker-entrypoint.sh rename to bin/docker-entrypoint.sh diff --git a/dump-versioned-data.sh b/bin/dump-versioned-data.sh similarity index 65% rename from dump-versioned-data.sh rename to bin/dump-versioned-data.sh index 5be37d8..d9fed0f 100755 --- a/dump-versioned-data.sh +++ b/bin/dump-versioned-data.sh @@ -8,7 +8,7 @@ INPUT_SCHEMA=$1 OUTPUT_SCHEMA=$2 if [ -z "$PGHOST" -o "$PGHOST" == "localhost" ]; then - RELPATH=dumps/$OUTPUT_SCHEMA + RELPATH=$PROJECT_DIR/dumps/$OUTPUT_SCHEMA OUTPUT_DIR=`realpath $RELPATH` if [ -d "$OUTPUT_DIR" ]; then echo "Warning: Directory exist: $OUTPUT_DIR" @@ -18,7 +18,11 @@ else OUTPUT_DIR=/var/lib/postgresql/data fi -$PSQL_ADMIN < $DIRNAME/sql/schema_to_csv.sql +$PSQL_ADMIN -f $PROJECT_DIR/sql/schema_to_csv.sql +if [ $? -ne 0 ]; then + echo "Failed to install schema_to_csv function" + exit 3 +fi echo "Outputting all tables from schema $INPUT_SCHEMA as csv to $OUTPUT_DIR on the database server" -$PSQL_ADMIN -c "select * from schema_to_csv('$INPUT_SCHEMA', '$OUTPUT_DIR')" \ No newline at end of file +$PSQL_ADMIN -c "select * from public.schema_to_csv('$INPUT_SCHEMA'::text, '$OUTPUT_DIR'::text)" \ No newline at end of file diff --git a/dump-versioned-schema.sh b/bin/dump-versioned-schema.sh similarity index 72% rename from dump-versioned-schema.sh rename to bin/dump-versioned-schema.sh index ae7f71c..8981bf7 100755 --- a/dump-versioned-schema.sh +++ b/bin/dump-versioned-schema.sh @@ -1,23 +1,24 @@ #!/bin/bash DIRNAME=`dirname $0` . 
$DIRNAME/lib.sh -TODAY=`date +%Y-%m-%d` if [ -z $2 ]; then echo "usage: $0 source_schema output_schema" + echo "example: $0 louis_v005 to louis_v006" exit 1 fi SOURCE_SCHEMA=$1 TARGET_SCHEMA=$2 -SCHEMA_OUTPUT_DIR=$DIRNAME/dumps/$TARGET_SCHEMA +SCHEMA_OUTPUT_DIR=$PROJECT_DIR/dumps/$TARGET_SCHEMA mkdir -p $SCHEMA_OUTPUT_DIR SCHEMA_OUTPUT_FILENAME=$SCHEMA_OUTPUT_DIR/schema.sql if [ -f "$SCHEMA_OUTPUT_FILENAME" ]; then - echo "File $SCHEMA_OUTPUT_FILENAME already exists" - #exit 2 + echo "File $SCHEMA_OUTPUT_FILENAME already exists, exiting" + exit 2 fi +echo "dumping schema to $SCHEMA_OUTPUT_FILENAME" pg_dump -n $SOURCE_SCHEMA -d $PGBASE \ --no-owner --no-privileges --no-security-labels \ --no-table-access-method --no-tablespaces --schema-only \ diff --git a/bin/install-postgresl-client-15.sh b/bin/install-postgresl-client-15.sh new file mode 100755 index 0000000..e1efd9c --- /dev/null +++ b/bin/install-postgresl-client-15.sh @@ -0,0 +1,4 @@ +sudo sh -c 'echo "deb http://apt.postgresql.org/pub/repos/apt/ $(lsb_release -cs)-pgdg main" > /etc/apt/sources.list.d/pgdg.list' +wget --quiet -O - https://www.postgresql.org/media/keys/ACCC4CF8.asc | sudo apt-key add - +sudo apt update +sudo apt install postgresql-client-15 \ No newline at end of file diff --git a/bin/lib.sh b/bin/lib.sh new file mode 100644 index 0000000..6a32e5c --- /dev/null +++ b/bin/lib.sh @@ -0,0 +1,52 @@ +#!/bin/bash +DIRNAME=$(dirname $(realpath $0)) +PARENT_DIR=$DIRNAME/.. +PROJECT_DIR=$(realpath $PARENT_DIR) +ENV_FILE=$PROJECT_DIR/.env + +if [ -f "$ENV_FILE" ]; then + # shellcheck source=lib.sh + . "$ENV_FILE" +else +echo "WARNING: File $ENV_FILE does not exist, relying on environment variables" +fi + +check_environment_variables_defined () { + variable_not_set=0 + for VARIABLE in "$@"; do + if [ -z "${!VARIABLE}" ]; then + echo "Environment variable $VARIABLE is not set" + variable_not_set=1 + fi + done + + if [ $variable_not_set -eq 1 ]; then + echo "One or more variables are not defined, the program cannot continue" + exit 1 + fi +} + +export PGOPTIONS="--search_path=$LOUIS_SCHEMA" +export PGBASE +export PGDATABASE +export PGHOST +export PGUSER +export PGPORT +export PGHOST +export PGPASSFILE +export PGPASSWORD + +VERSION15=$(psql --version | grep 15.) + +if [ -z "$VERSION15" ]; then + echo "postgresql-client-15 required" + exit 1 +fi + +TODAY=$(date +%Y-%m-%d) + +if [ -z "$PGDUMP_FILENAME" ]; then + PGDUMP_FILENAME=$PROJECT_DIR/dumps/$TODAY.$PGBASE.pg_dump +fi + +export PSQL_ADMIN="psql -v ON_ERROR_STOP=1 --single-transaction -d $PGBASE" diff --git a/load-data-container.sh b/bin/load-data-container.sh similarity index 100% rename from load-data-container.sh rename to bin/load-data-container.sh diff --git a/bin/load-versioned-data.sh b/bin/load-versioned-data.sh new file mode 100755 index 0000000..9051192 --- /dev/null +++ b/bin/load-versioned-data.sh @@ -0,0 +1,72 @@ +#!/bin/bash +DIRNAME=$(dirname "$0") +. $DIRNAME/lib.sh + +## To debug +# set -x -e + +if [ -z "$1" ]; then + echo "usage: $0 target_schema" + exit 1 +fi +TARGET_SCHEMA=$1 +SOURCE_DIR=$PROJECT_DIR/dumps/$TARGET_SCHEMA +CSV_TO_SCHEMA=$PROJECT_DIR/sql/csv_to_schema.sql + +if [ ! -d "$SOURCE_DIR" ]; then + echo "Directory does not exist: $SOURCE_DIR" + exit 1 +fi + +if [ ! -f "$CSV_TO_SCHEMA" ]; then + echo "Source file does not exist: $CSV_TO_SCHEMA" + exit 1 +fi + +# $PSQL_ADMIN -d $PGBASE < $CSV_TO_SCHEMA +# if [ $? 
-ne 0 ]; then +# echo "Failed to load csv_to_schema.sql" +# exit 1 +# fi +# $PSQL_ADMIN -d $PGBASE -c "select * from csv_to_schema('$TARGET_SCHEMA', '$SOURCE_DIR', array['crawl', 'chunk', 'token', 'ada_002', 'link', 'score', 'query'])" + +TABLE_LIST=$SOURCE_DIR/tables.txt +echo $TABLE_LIST +if [ ! -f "$TABLE_LIST" ]; then + echo "File defining list of table and their load order does not exist: $TABLE_LIST" + exit 1 +fi +TABLES=$(cat "$TABLE_LIST") + +# we check that there's a csv file for each table +for table in $TABLES; do + FILENAME=$SOURCE_DIR/$table.csv + if [ ! -f "$FILENAME" ]; then + echo "File does not exist: $FILENAME" + exit 1 + fi +done + +# we check that there's a table for each csv file +for file in $SOURCE_DIR/*.csv; do + echo "Checking $file is expected in table list" + TABLE=`basename $file .csv` + if ! grep -q $TABLE $TABLE_LIST; then + echo "File $file is not expected in table list" + exit 1 + fi +done + +CSV_TO_SCHEMA_PSQL=$(mktemp) +echo "" > $CSV_TO_SCHEMA_PSQL +# echo "set session_replica_role = 'replica';" >> $CSV_TO_SCHEMA_PSQL +for table in $TABLES; do + echo "Loading $table" + FILENAME=$SOURCE_DIR/$table.csv + echo "\COPY $TARGET_SCHEMA.$table FROM $FILENAME WITH DELIMITER as ';' CSV HEADER" >> $CSV_TO_SCHEMA_PSQL +done +$PSQL_ADMIN -f $CSV_TO_SCHEMA_PSQL +if [ $? -ne 0 ]; then + echo "Failed to load $table" + exit 1 +fi \ No newline at end of file diff --git a/bin/load-versioned-schema.sh b/bin/load-versioned-schema.sh new file mode 100755 index 0000000..93a9b1c --- /dev/null +++ b/bin/load-versioned-schema.sh @@ -0,0 +1,34 @@ +#!/bin/bash +DIRNAME=$(dirname "$0") +. $DIRNAME/lib.sh + +## To debug +# set -x -e + +## Explain usage if missing argument +if [ -z "$1" ]; then + echo "usage: $0 source_schema" + exit 1 +fi + +SOURCE_SCHEMA=$1 + +## Create a path using the project directory and the schema name +SOURCE_DIR=$PROJECT_DIR/dumps/$1 + +## If the directory at the path doesn't exist, print an error and exit +if [ ! -d "$SOURCE_DIR" ]; then + echo "Directory does not exist: $SOURCE_DIR" + exit 1 +fi + +## Create a path to a file named schema.sql +SCHEMA_FILE=$SOURCE_DIR/schema.sql + +## If the file does not exist, print an error message and exit with status code 2 +if [ ! -f "$SCHEMA_FILE" ]; then + echo "File does not exist: $SCHEMA_FILE" + exit 2 +fi +## If the file does exist, pass it as input to a PostgreSQL admin command +$PSQL_ADMIN < $SCHEMA_FILE diff --git a/bin/postgres.sh b/bin/postgres.sh new file mode 100755 index 0000000..45d6089 --- /dev/null +++ b/bin/postgres.sh @@ -0,0 +1,36 @@ +#!/bin/bash +DIRNAME=$(dirname "$(realpath "$0")") +. "$DIRNAME"/lib.sh + +if [ -z "$PGDATA" ]; then + echo "PGDATA is not set. Setting to default directory..." + PGDATA=$HOME/pgdata +fi + +if [ ! -d "$PGDATA" ]; then + echo "PGDATA directory $PGDATA does not exist, creating it..." 
+ mkdir -p "$PGDATA" +fi + +check_environment_variables_defined DB_SERVER_CONTAINER_NAME POSTGRES_PASSWORD + +STATUS=$(docker inspect "$DB_SERVER_CONTAINER_NAME" -f '{{.State.Status}}') + +if [ "$STATUS" = "exited" ]; then + echo "container $DB_SERVER_CONTAINER_NAME exist but has exited, restarting" + docker start "$DB_SERVER_CONTAINER_NAME" + +elif [ "$STATUS" != "running" ]; then + + echo "container $DB_SERVER_CONTAINER_NAME does not exist, creating" + + docker run --name "$DB_SERVER_CONTAINER_NAME" \ + -e POSTGRES_PASSWORD="$POSTGRES_PASSWORD" \ + --network louis_network \ + --mount type=bind,src="$PGDATA",target=/var/lib/postgresql/data \ + --publish 5432:5432 \ + --user "$(id -u):$(id -g)" -v /etc/passwd:/etc/passwd:ro \ + -d "louis-postgres" +else + echo "Postgres is already running" +fi diff --git a/bin/psql.sh b/bin/psql.sh new file mode 100755 index 0000000..5486a0a --- /dev/null +++ b/bin/psql.sh @@ -0,0 +1,8 @@ +DIRNAME=`dirname $0` +. $DIRNAME/lib.sh +if [ -z "$1" ]; then + psql +else + SQL_SCRIPT=$1 + $PSQL_ADMIN -f "$SQL_SCRIPT" +fi \ No newline at end of file diff --git a/restore-db-docker.sh b/bin/restore-db-docker.sh similarity index 100% rename from restore-db-docker.sh rename to bin/restore-db-docker.sh diff --git a/bin/setup-db-docker.sh b/bin/setup-db-docker.sh new file mode 100755 index 0000000..00f2bff --- /dev/null +++ b/bin/setup-db-docker.sh @@ -0,0 +1,57 @@ +#!/bin/bash +DIRNAME=$(dirname "$0") +. $DIRNAME/lib.sh + +check_environment_variables_defined PGBASE DB_SERVER_CONTAINER_NAME PSQL_ADMIN PGPASSWORD USER + +# set -x -e +DOCKER_EXEC="docker exec -it $DB_SERVER_CONTAINER_NAME" + +## Check if user already exist +$DOCKER_EXEC psql -U postgres -c " +DO +\$do\$ +BEGIN + IF NOT EXISTS ( + SELECT FROM pg_roles WHERE rolname = '$USER' + ) THEN + CREATE USER $USER WITH PASSWORD '$PGPASSWORD'; + ALTER USER $USER WITH SUPERUSER; + END IF; +END +\$do\$;" + +## Print all existing users +$DOCKER_EXEC psql -U postgres -c '\du' + +## Check if database already exist +DB_EXISTS=$($DOCKER_EXEC psql -U "$USER" -d "$PGBASE" -tAc "SELECT 1 FROM pg_database WHERE datname='$PGBASE'" | tr -d '\r') + +if [ "$DB_EXISTS" = '1' ] +then + echo "Database $PGBASE already exists." +else + echo "Database $PGBASE does not exist, creating..." + $DOCKER_EXEC createdb -E utf-8 -U postgres "$PGBASE" +fi + +## Print all existing databases +$DOCKER_EXEC psql -U postgres -c '\l' + +$DOCKER_EXEC pip install pgxnclient + +VECTOR_INSTALLED=$(docker exec -u 0 -i "$DB_SERVER_CONTAINER_NAME" psql -U postgres -tAc "SELECT EXISTS(SELECT 1 FROM pg_extension WHERE extname = 'vector');") + +if [ "$VECTOR_INSTALLED" = "t" ] +then + echo "Extension vector is already installed." +else + echo "Extension vector is not installed. Installing..." 
+ docker exec -u 0 -it "$DB_SERVER_CONTAINER_NAME" pgxn install vector +fi + +$DOCKER_EXEC psql -U postgres -v ON_ERROR_STOP=1 --single-transaction -d $PGBASE -c 'SET search_path TO public; CREATE EXTENSION IF NOT EXISTS "uuid-ossp"; CREATE EXTENSION IF NOT EXISTS vector;' + +# $DOCKER_EXEC $PSQL_ADMIN -c "SET search_path TO public; CREATE EXTENSION IF NOT EXISTS \"uuid-ossp\"; CREATE EXTENSION IF NOT EXISTS vector;" +# User creation +# $DOCKER_EXEC $PSQL_ADMIN -c "CREATE USER $USER; ALTER USER $USER WITH SUPERUSER;" \ No newline at end of file diff --git a/setup-db.sh b/bin/setup-db.sh similarity index 90% rename from setup-db.sh rename to bin/setup-db.sh index 8a7e6e1..aa32bbd 100755 --- a/setup-db.sh +++ b/bin/setup-db.sh @@ -1,4 +1,5 @@ -DIRNAME=`dirname $0` +#!/bin/bash +DIRNAME=$(dirname "$0") . $DIRNAME/lib.sh $PSQL_ADMIN -f $DIRNAME/sql/fix-utf8-template.sql diff --git a/update-schema-louis.sh b/bin/update-schema-louis.sh similarity index 100% rename from update-schema-louis.sh rename to bin/update-schema-louis.sh diff --git a/lib.sh b/lib.sh deleted file mode 100644 index 2dbcb4c..0000000 --- a/lib.sh +++ /dev/null @@ -1,12 +0,0 @@ -DIRNAME=`dirname $0` -. $DIRNAME/.env -export PGDATABASE -export PGHOST -export PGUSER -export PGPORT -export PGHOST -export PGPASSFILE -export PGPASSWORD - -PSQL_ADMIN="psql -v ON_ERROR_STOP=1 -U postgres --single-transaction -d $PGBASE" -TODAY=`date +%Y-%m-%d` diff --git a/load-versioned-data.sh b/load-versioned-data.sh deleted file mode 100755 index faf5fcd..0000000 --- a/load-versioned-data.sh +++ /dev/null @@ -1,11 +0,0 @@ -DIRNAME=`dirname $0` -. $DIRNAME/lib.sh -RELPATH=dumps/$TARGET_SCHEMA -SOURCE_DIR=`realpath $RELPATH` -if [ ! -d "$SOURCE_DIR" ]; then - echo "Directory does not exist: $SOURCE_DIR" - exit 1 -fi - -$PSQL_ADMIN -d $PGBASE < $DIRNAME/sql/csv_to_schema.sql -$PSQL_ADMIN -d $PGBASE -c "select * from csv_to_schema('$TARGET_SCHEMA', '$SOURCE_DIR', array['crawl', 'chunk', 'token', 'ada_002', 'link', 'score', 'query'])" \ No newline at end of file diff --git a/load-versioned-schema.sh b/load-versioned-schema.sh deleted file mode 100755 index 5a12433..0000000 --- a/load-versioned-schema.sh +++ /dev/null @@ -1,20 +0,0 @@ -DIRNAME=`dirname $0` -. $DIRNAME/lib.sh - -if [ -z "$1" ]; then - echo "usage: $0 source_schema" - exit 1 -fi -SOURCE_SCHEMA=$1 -SOURCE_DIR=$DIRNAME/dumps/$1 -if [ ! -d "$SOURCE_DIR" ]; then - echo "Directory does not exist: $SOURCE_DIR" - exit 1 -fi - -SCHEMA_FILE=$SOURCE_DIR/schema.sql -if [ ! 
-f "$SCHEMA_FILE" ]; then - echo "File does not exist: $SCHEMA_FILE" - exit 2 -fi -$PSQL_ADMIN < $SCHEMA_FILE diff --git a/louis/db/__init__.py b/louis/db/__init__.py index d1dacc0..b241b5b 100644 --- a/louis/db/__init__.py +++ b/louis/db/__init__.py @@ -1,21 +1,25 @@ """Database functions for the Louis project.""" +import hashlib import logging import os import urllib -import numpy as np import psycopg -import psycopg.sql as sql from pgvector.psycopg import register_vector from psycopg.rows import dict_row -from louis.models import openai LOGGER = logging.getLogger(__name__) logging.basicConfig(level=logging.DEBUG) +class DBError(Exception): + pass + +class DBMissingEnvironmentVariable(DBError): + pass + def raise_error(message): - raise Exception(message) + raise DBMissingEnvironmentVariable(message) LOUIS_DSN = os.environ.get("LOUIS_DSN") or raise_error("LOUIS_DSN is not set") LOUIS_SCHEMA = os.environ.get("LOUIS_SCHEMA") or raise_error("LOUIS_SCHEMA is not set") @@ -28,6 +32,8 @@ def connect_db(): row_factory=dict_row, autocommit=False, options=f"-c search_path={LOUIS_SCHEMA},public") + assert connection.info.encoding == 'utf-8', ( + 'Encoding is not UTF8: ' + connection.info.encoding) # psycopg.extras.register_uuid() register_vector(connection) return connection @@ -36,223 +42,27 @@ def cursor(connection): """Return a cursor for the given connection.""" return connection.cursor() -def store_chunk_item(cursor, item): - """Process a ChunkItem and insert it into the database.""" - try: - data = { - 'url': item["url"], - 'title': item["title"], - 'text_content': item["text_content"], - 'tokens': item["tokens"], - 'encoding': 'cl100k_base' - } - cursor.execute( - """SELECT id FROM crawl WHERE url = %(url)s - ORDER BY last_updated DESC LIMIT 1""", - data - ) - data['crawl_id'] = cursor.fetchone()['id'] - cursor.execute( - "INSERT INTO chunk (crawl_id, title, text_content)" - " VALUES(%(crawl_id)s::UUID, %(title)s, %(text_content)s)" - " RETURNING id", - data - ) - data['chunk_id'] = cursor.fetchone()['id'] - cursor.execute( - "INSERT INTO token (chunk_id, tokens, encoding)" - " VALUES (%(chunk_id)s::UUID, %(tokens)s, %(encoding)s)" - " RETURNING id", - data - ) - data['token_id'] = cursor.fetchone()['id'] - - return item - except psycopg.IntegrityError: - # ignore duplicates and keep processing - return item - -def store_crawl_item(cursor, item): - """Process a CrawlItem and insert it into the database.""" - try: - cursor.execute( - """INSERT INTO crawl - (url, title, lang, html_content, last_crawled, last_updated) - VALUES (%s, %s, %s, %s, %s, %s)""", - ( - item["url"], - item["title"], - item["lang"], - item["html_content"], - item["last_crawled"], - item["last_updated"], - ) - ) - return item - except psycopg.IntegrityError: - # ignore duplicates and keep processing - return item - -def store_embedding_item(cursor, item): - """Process an EmbeddingItem and insert it into the database.""" - try: - data = { - 'token_id': item["token_id"], - # TODO: shouldn't python-pgvector support casting from smallint[] to vector? 
- 'embedding': np.array(item["embedding"]), - 'embedding_model': item["embedding_model"], - } - query = sql.SQL( - 'INSERT INTO {embedding_model} (token_id, embedding)' - ' VALUES (%(token_id)s, %(embedding)s::vector)' - ).format(embedding_model=sql.Identifier(data['embedding_model'])).as_string(cursor) - cursor.execute( - query, - data - ) - return item - except psycopg.IntegrityError: - # ignore duplicates and keep processing - return item - - -def link_pages(cursor, source_url, destination_url): - """Link two pages together in the database.""" - data = { - 'source_url': source_url, - 'destination_url': destination_url, - } - cursor.execute( - """SELECT id FROM crawl - WHERE url = %(source_url)s ORDER BY last_updated DESC LIMIT 1""", - data - ) - data['source_crawl_id'] = cursor.fetchone()['id'] - cursor.execute( - """SELECT id FROM crawl - WHERE url = %(destination_url)s ORDER BY last_updated DESC LIMIT 1""", - data - ) - data['destination_crawl_id'] = cursor.fetchone()['id'] - cursor.execute( - "INSERT INTO link (source_crawl_id, destination_crawl_id)" - " VALUES (%(source_crawl_id)s, %(destination_crawl_id)s)" - " ON CONFLICT DO NOTHING", - data - ) - - -def fetch_links(cursor, url): - """Fetch all links from a given url.""" - data = { - 'source_url': url - } - cursor.execute( - "SELECT url FROM link" - " JOIN crawl ON link.destination_crawl_id = crawl.id" - " WHERE source_crawl_id = (" - " SELECT id FROM crawl WHERE url = %(source_url)s" - " ORDER BY last_updated DESC LIMIT 1)", - data - ) - data['destination_urls'] = [r['url'] for r in cursor.fetchall()] - return data['destination_urls'] - -def fetch_chunk_id_without_embedding(cursor, embedding_model='ada_002'): - """Fetch all chunk ids without an embedding.""" - query = sql.SQL( - "SELECT chunk_id FROM chunk" - " JOIN token ON chunk.id = token.chunk_id" - " LEFT JOIN {embedding_model} ON token.id = {embedding_model}.token_id" - " WHERE {embedding_model}.embedding IS NULL" - ).format(embedding_model=sql.Identifier(embedding_model)).as_string(cursor) - cursor.execute(query) - return [chunk_id['chunk_id'] for chunk_id in cursor.fetchall()] - -def fetch_crawl_row(cursor, url): - """Fetch the most recent crawl row for a given url.""" - data = { - 'url': url - } - cursor.execute( - "SELECT * FROM crawl WHERE url = %(url)s ORDER BY last_updated DESC LIMIT 1", - data - ) - return cursor.fetchone() - -def fetch_chunk_token_row(cursor, url): - """Fetch the most recent chunk token for a given chunk id.""" - - # TODO: eventually we could generalize the use of these postgresql - # url to data but for now keep it simple - data = parse_postgresql_url(url) - cursor.execute( - "SELECT chunk.id as chunk_id, token.id as token_id, tokens FROM chunk" - " JOIN token ON chunk.id = token.chunk_id" - " JOIN crawl ON chunk.crawl_id = crawl.id" - " WHERE chunk.id = %(entity_uuid)s LIMIT 1", - data - ) - # psycopg.extras.DictRow is not a real dict and will convert - # to string as a list so we force convert to dict - return dict(cursor.fetchone()) - -def create_postgresql_url(dbname, tablename, entity_uuid, parameters=None): +def create_postgresql_url(dbname, tablename, entity_id, parameters=None): if parameters is None: - return f'postgresql://{dbname}/public/{tablename}/{entity_uuid}' - return f'postgresql://{dbname}/public/{tablename}/{entity_uuid}?{urllib.parse.urlencode(parameters)}' + return f'postgresql://{dbname}/{LOUIS_SCHEMA}/{tablename}/{entity_id}' + return 
f'postgresql://{dbname}/{LOUIS_SCHEMA}/{tablename}/{entity_id}?{urllib.parse.urlencode(parameters)}' def parse_postgresql_url(url): """Parse a postgresql url and return a dictionary with the parameters.""" parsed = urllib.parse.urlparse(url) + path_split = parsed.path.split('/') return { 'dbname': parsed.hostname, - 'tablename': parsed.path.split('/')[2], - 'entity_uuid': parsed.path.split('/')[3], + 'schema': path_split[1], + 'tablename': path_split[2], + 'id': path_split[3], 'parameters': urllib.parse.parse_qs(parsed.query) } -def match_documents(cursor, query_embedding): - """Match documents with a given query.""" - data = { - # TODO: use of np.array to get it to recognize the vector type - # is there a simpler way to do this? only reason we use this - # dependency - # 'query_embedding': np.array(query_embedding), - 'query_embedding': query_embedding, - 'match_threshold': 0.5, - 'match_count': 10 - } - - # cursor.callproc('match_documents', data) - cursor.execute( - "SELECT * FROM match_documents" - "(%(query_embedding)s::vector, %(match_threshold)s, %(match_count)s)", - data) - - # turn into list of dict now to preserve dictionaries - return [dict(r) for r in cursor.fetchall()] - -def match_documents_from_text_query(cursor, query): - data = { - 'query': query, - 'tokens': openai.get_tokens_from_text(query) - } - results = cursor.execute(""" - SELECT * - FROM query - WHERE tokens = %(tokens)s::integer[] - """, data) - db_data = results.fetchone() - if not db_data: - data['embedding'] = openai.fetch_embedding(data['tokens']) - results = cursor.execute( - "INSERT INTO query(query, tokens, embedding)" - " VALUES(%(query)s, %(tokens)s, %(embedding)s) RETURNING id", data) - data['query_id'] = results.fetchone()['id'] - else: - data.update(db_data) - docs = match_documents(cursor, data['embedding']) +def hash(text): + """Return the hash of the given text. - return docs + We hash using the Python library to remove a roundtrip to the database + """ + return hashlib.md5(text.encode()).hexdigest() \ No newline at end of file diff --git a/louis/db/api/__init__.py b/louis/db/api/__init__.py new file mode 100644 index 0000000..c7613f6 --- /dev/null +++ b/louis/db/api/__init__.py @@ -0,0 +1,46 @@ +from louis.models import openai + + +def match_documents(cursor, query_embedding): + """Match documents with a given query.""" + data = { + # TODO: use of np.array to get it to recognize the vector type + # is there a simpler way to do this? 
only reason we use this + # dependency + # 'query_embedding': np.array(query_embedding), + 'query_embedding': query_embedding, + 'match_threshold': 0.5, + 'match_count': 10 + } + + # cursor.callproc('match_documents', data) + cursor.execute( + "SELECT * FROM match_documents" + "(%(query_embedding)s::vector, %(match_threshold)s, %(match_count)s)", + data) + + # turn into list of dict now to preserve dictionaries + return [dict(r) for r in cursor.fetchall()] + +def match_documents_from_text_query(cursor, query): + data = { + 'query': query, + 'tokens': openai.get_tokens_from_text(query) + } + results = cursor.execute(""" + SELECT * + FROM query + WHERE tokens = %(tokens)s::integer[] + """, data) + db_data = results.fetchone() + if not db_data: + data['embedding'] = openai.fetch_embedding(data['tokens']) + results = cursor.execute( + "INSERT INTO query(query, tokens, embedding)" + " VALUES(%(query)s, %(tokens)s, %(embedding)s) RETURNING id", data) + data['query_id'] = results.fetchone()['id'] + else: + data.update(db_data) + docs = match_documents(cursor, data['embedding']) + + return docs diff --git a/louis/db/crawler/__init__.py b/louis/db/crawler/__init__.py new file mode 100644 index 0000000..18d15eb --- /dev/null +++ b/louis/db/crawler/__init__.py @@ -0,0 +1,220 @@ +import psycopg +import numpy as np + +import louis.db as db + +def link_pages(cursor, source_url, destination_url): + """Link two pages together in the database.""" + data = { + 'source_url': source_url, + 'destination_url': destination_url, + } + cursor.execute( + """SELECT id FROM crawl + WHERE url = %(source_url)s ORDER BY last_updated DESC LIMIT 1""", + data + ) + data['source_crawl_id'] = cursor.fetchone()['id'] + cursor.execute( + """SELECT id FROM crawl + WHERE url = %(destination_url)s ORDER BY last_updated DESC LIMIT 1""", + data + ) + data['destination_crawl_id'] = cursor.fetchone()['id'] + cursor.execute( + "INSERT INTO link (source_crawl_id, destination_crawl_id)" + " VALUES (%(source_crawl_id)s, %(destination_crawl_id)s)" + " ON CONFLICT DO NOTHING", + data + ) + + +def fetch_links(cursor, url): + """Fetch all links from a given url.""" + data = { + 'source_url': url + } + cursor.execute( + "SELECT url FROM link" + " JOIN crawl ON link.destination_crawl_id = crawl.id" + " WHERE source_crawl_id = (" + " SELECT id FROM crawl WHERE url = %(source_url)s" + " ORDER BY last_updated DESC LIMIT 1)", + data + ) + data['destination_urls'] = [r['url'] for r in cursor.fetchall()] + return data['destination_urls'] + +def store_chunk_item(cursor, item): + """Process a ChunkItem and insert it into the database.""" + try: + data = { + 'url': item["url"], + 'title': item["title"], + 'text_content': item["text_content"], + 'tokens': item["tokens"], + 'encoding': 'cl100k_base' + } + cursor.execute( + """SELECT md5hash FROM crawl WHERE url = %(url)s + ORDER BY last_updated DESC LIMIT 1""", + data + ) + data['md5hash'] = cursor.fetchone()['md5hash'] + + # TODO: should probably update the title even if the text_content + # is already present as we may have changed how we create the title + cursor.execute( + """ + WITH e as( + INSERT INTO chunk (title, text_content) + VALUES(%(title)s, %(text_content)s) + ON CONFLICT DO NOTHING + RETURNING id + ) + SELECT id FROM e + UNION ALL + SELECT id FROM chunk WHERE text_content = %(text_content)s + """, + data + ) + data['chunk_id'] = cursor.fetchone()['id'] + cursor.execute( + """ + INSERT INTO html_content_to_chunk (html_content_md5hash, chunk_id) + VALUES(%(md5hash)s, %(chunk_id)s::UUID) + ON 
CONFLICT DO NOTHING + """, + data) + cursor.execute( + """ + WITH e as( + INSERT INTO token (chunk_id, tokens, encoding) + VALUES (%(chunk_id)s::UUID, %(tokens)s, %(encoding)s) + ON CONFLICT DO NOTHING + RETURNING * + ) + SELECT id FROM e + UNION ALL + SELECT id FROM token + WHERE chunk_id = %(chunk_id)s::UUID + and tokens = %(tokens)s::INTEGER[] + and encoding = %(encoding)s + """, + data + ) + data['token_id'] = cursor.fetchone()['id'] + return data + except psycopg.IntegrityError as e: + raise db.DBError("Error storing chunk item for %s" % item['url']) from e + + +def store_crawl_item(cursor, item): + """Process a CrawlItem and insert it into the database.""" + try: + item['html_content_md5hash'] = db.hash(item["html_content"]) + cursor.execute( + """INSERT INTO html_content (content, md5hash) + VALUES(%(html_content)s, %(html_content_md5hash)s) + ON CONFLICT DO NOTHING""", + item) + cursor.execute( + """INSERT INTO crawl + (url, title, lang, md5hash, last_crawled, last_updated) + VALUES ( + %(url)s, %(title)s, %(lang)s, %(html_content_md5hash)s, + %(last_crawled)s, %(last_updated)s) + """, + item + ) + return item + except psycopg.IntegrityError as e: + raise db.DBError("Error storing crawl item for %s" % item['url']) from e + + +def store_embedding_item(cursor, item): + """Process an EmbeddingItem and insert it into the database.""" + try: + data = { + 'token_id': item["token_id"], + # TODO: shouldn't python-pgvector support casting from smallint[] to vector? + 'embedding': np.array(item["embedding"]), + 'embedding_model': item["embedding_model"], + } + query = psycopg.sql.SQL( + 'INSERT INTO {embedding_model} (token_id, embedding)' + ' VALUES (%(token_id)s, %(embedding)s::vector)' + ).format( + embedding_model=psycopg.sql.Identifier( + data['embedding_model']) + ).as_string(cursor) + cursor.execute( + query, + data + ) + return item + except psycopg.IntegrityError as e: + raise db.DBError( + "Error storing embedding item for token %s" % item['token_id']) from e + +def fetch_crawl_ids_without_chunk(cursor): + """Fetch all crawl ids without an embedding.""" + query = psycopg.sql.SQL( + """ + SELECT crawl.id FROM crawl + LEFT JOIN html_content_to_chunk + ON crawl.md5hash = html_content_to_chunk.html_content_md5hash + WHERE chunk_id IS NULL + """ + ).as_string(cursor) + cursor.execute(query) + return [crawl_id['id'] for crawl_id in cursor.fetchall()] + +def fetch_chunk_id_without_embedding(cursor, embedding_model='ada_002'): + """Fetch all chunk ids without an embedding.""" + query = psycopg.sql.SQL( + "SELECT chunk_id FROM chunk" + " JOIN token ON chunk.id = token.chunk_id" + " LEFT JOIN {embedding_model} ON token.id = {embedding_model}.token_id" + " WHERE {embedding_model}.embedding IS NULL" + ).format(embedding_model=psycopg.sql.Identifier(embedding_model)).as_string(cursor) + cursor.execute(query) + return [chunk_id['chunk_id'] for chunk_id in cursor.fetchall()] + +def fetch_crawl_row(cursor, url): + """Fetch the most recent crawl row for a given url.""" + if url.startswith('postgresql://'): + data = db.parse_postgresql_url(url) + + cursor.execute( + """SELECT *, content as html_content FROM crawl + INNER JOIN html_content on crawl.md5hash = html_content.md5hash + WHERE id = %(id)s ORDER BY last_updated DESC LIMIT 1""", + data + ) + else: + data = {'url': url} + cursor.execute( + """SELECT *, content as html_content FROM crawl + INNER JOIN html_content on crawl.md5hash = html_content.md5hash + WHERE url = %(url)s ORDER BY last_updated DESC LIMIT 1""", + data + ) + if cursor.rowcount 
== 0: + raise db.DBError("No crawl found for id: {}".format(data)) + row = cursor.fetchone() + assert 'html_content' in row.keys() + return row + +def fetch_chunk_token_row(cursor, url): + """Fetch the most recent chunk token for a given chunk id.""" + data = db.parse_postgresql_url(url) + cursor.execute( + "SELECT chunk.id as chunk_id, token.id as token_id, tokens FROM chunk" + " JOIN token ON chunk.id = token.chunk_id" + " WHERE chunk.id = %(id)s LIMIT 1", + data + ) + # psycopg.extras.DictRow is not a real dict and will convert + # to string as a list so we force convert to dict + return cursor.fetchone() diff --git a/louis/models/openai.py b/louis/models/openai.py index e057356..4a4eafb 100644 --- a/louis/models/openai.py +++ b/louis/models/openai.py @@ -3,6 +3,9 @@ import openai import tiktoken +import dotenv +dotenv.load_dotenv() + # https://learn.microsoft.com/en-us/azure/cognitive-services/openai/how-to/embeddings?tabs=python def safe_get(key): diff --git a/psql.sh b/psql.sh deleted file mode 100755 index ed7b965..0000000 --- a/psql.sh +++ /dev/null @@ -1,3 +0,0 @@ -DIRNAME=`dirname $0` -. $DIRNAME/lib.sh -psql \ No newline at end of file diff --git a/setup-db-docker.sh b/setup-db-docker.sh deleted file mode 100755 index 15b4d23..0000000 --- a/setup-db-docker.sh +++ /dev/null @@ -1,13 +0,0 @@ -DIRNAME=`dirname $0` -. $DIRNAME/lib.sh - -if [ -z "$PGBASE" ]; then - echo "PGBASE is not set" - exit 1 -fi -DOCKER_EXEC="docker exec -it louis-db-server" -$DOCKER_EXEC createdb -E utf-8 $PGBASE -$DOCKER_EXEC $PSQL_ADMIN -c "CREATE USER $USER; ALTER USER $USER WITH SUPERUSER;" -$DOCKER_EXEC pip install pgxnclient -$DOCKER_EXEC pgxn install vector -$DOCKER_EXEC $PSQL_ADMIN -c "SET search_path TO public; CREATE EXTENSION IF NOT EXISTS \"uuid-ossp\"; CREATE EXTENSION IF NOT EXISTS vector;" \ No newline at end of file diff --git a/sql/2023-08-08-count-crawl-with-missing-chunk.sql b/sql/2023-08-08-count-crawl-with-missing-chunk.sql new file mode 100644 index 0000000..6bb2bd9 --- /dev/null +++ b/sql/2023-08-08-count-crawl-with-missing-chunk.sql @@ -0,0 +1,4 @@ +select count(*) +from crawl left join chunk +on crawl.id = chunk.crawl_id +where chunk.crawl_id is null \ No newline at end of file diff --git a/sql/2023-08-09-find-duplicated_html_content.sql b/sql/2023-08-09-find-duplicated_html_content.sql new file mode 100644 index 0000000..373f537 --- /dev/null +++ b/sql/2023-08-09-find-duplicated_html_content.sql @@ -0,0 +1,9 @@ +with hashes as ( +select id, md5(html_content) as md5sum, url from crawl +), +aggregated as( + select array_agg(id), array_agg(url), count(*) as dups + from hashes + group by md5sum +) +select * from aggregated where dups > 1 \ No newline at end of file diff --git a/sql/2023-08-09-issue8-html_content-table.sql b/sql/2023-08-09-issue8-html_content-table.sql new file mode 100644 index 0000000..64de142 --- /dev/null +++ b/sql/2023-08-09-issue8-html_content-table.sql @@ -0,0 +1,103 @@ +-- because the same html_content can be used by multiple crawl entries +-- we modify the crawl table by moving the column html_content to a new table html_content +-- and add a foreign key to the crawl table to html_content +-- we also modify chunk, originally linked to the crawl table as follows: +-- * the same chunk (example: this page has been archived can be extracted from different crawl entries (and html_content) +-- even when these pages are not the same (do not have the same md5sum hash) +-- * we add an N:N relation between chunk and html_content + +-- original tables: +--- +-- CREATE TABLE 
crawl ( +-- id uuid NOT NULL DEFAULT uuid_generate_v4(), +-- url text NULL, +-- title text NULL, +-- lang bpchar(2) NULL, +-- html_content text NULL, +-- last_crawled text NULL, +-- last_updated text NULL, +-- last_updated_date date NULL, +-- CONSTRAINT crawl_pkey PRIMARY KEY (id), +-- CONSTRAINT crawl_url_last_updated_key UNIQUE (url, last_updated) +-- ); +-- +-- CREATE TABLE chunk ( +-- id uuid NOT NULL DEFAULT uuid_generate_v4(), +-- crawl_id uuid NULL, +-- title text NULL, +-- text_content text NULL, +-- CONSTRAINT chunk_pkey PRIMARY KEY (id), +-- CONSTRAINT chunk_text_content_key UNIQUE (text_content), +-- CONSTRAINT chunk_crawl_uuid_fkey FOREIGN KEY (crawl_id) REFERENCES crawl(id) ON DELETE CASCADE +-- ); + +CREATE table if not EXISTS html_content ( + content text NOT NULL, + md5hash CHAR(32) NOT NULL, + CONSTRAINT html_content_md5hash_key UNIQUE (md5hash) +); + +ALTER TABLE crawl + ADD COLUMN IF NOT EXISTS md5hash CHAR(32); + +-- the tables crawl and chunk already contain data so we need to move the data to the new table first before applying constraints +-- on duplicate key value + +UPDATE crawl + SET md5hash = md5(html_content); + +INSERT INTO html_content (content, md5hash) + SELECT html_content, md5hash FROM crawl +ON CONFLICT (md5hash) DO NOTHING; + +-- we create N:N mapping between html_content and chunk +-- as from different html_content it is possible to extract +-- the same chunk even if not all chunks are the same +CREATE table html_content_to_chunk ( + html_content_md5hash CHAR(32) NOT NULL, + chunk_id uuid NOT NULL +); + +insert into html_content_to_chunk (html_content_md5hash, chunk_id) + select crawl.md5hash, chunk.id + from chunk + join crawl on chunk.crawl_id = crawl.id; + +-- we add constraint AFTER insertion for better performance +ALTER TABLE html_content_to_chunk + ADD CONSTRAINT html_content_to_chunk_pkey PRIMARY KEY (html_content_md5hash, chunk_id), + ADD CONSTRAINT html_content_to_chunk_html_content_md5hash_fkey FOREIGN KEY (html_content_md5hash) REFERENCES html_content(md5hash) ON DELETE CASCADE, + ADD CONSTRAINT html_content_to_chunk_chunk_id_fkey FOREIGN KEY (chunk_id) REFERENCES chunk(id) ON DELETE CASCADE; + +CREATE OR REPLACE VIEW documents +AS SELECT crawl.id, + chunk.id AS chunk_id, + crawl.url, + html_content.content as html_content, + crawl.title, + chunk.title AS subtitle, + chunk.text_content AS content, + embedding.embedding, + cardinality(token.tokens) AS tokens_count, + crawl.last_updated, + scoring.score + FROM crawl, + html_content, + html_content_to_chunk, + chunk, + token, + ada_002 embedding, + scoring + WHERE chunk.id = token.chunk_id + AND token.id = embedding.token_id + AND crawl.id = scoring.entity_id + AND crawl.md5hash = html_content.md5hash + AND html_content_to_chunk.html_content_md5hash = html_content.md5hash + AND html_content_to_chunk.chunk_id = chunk.id; + +ALTER TABLE chunk + DROP CONSTRAINT chunk_crawl_uuid_fkey, + DROP COLUMN crawl_id; + +alter table crawl + drop column html_content; \ No newline at end of file diff --git a/sql/2023-09-04-add-missing-constraints-rename-column.sql b/sql/2023-09-04-add-missing-constraints-rename-column.sql new file mode 100644 index 0000000..fe740fe --- /dev/null +++ b/sql/2023-09-04-add-missing-constraints-rename-column.sql @@ -0,0 +1,3 @@ +ALTER TABLE crawl ADD CONSTRAINT crawl_to_html_content_md5hash_fkey FOREIGN KEY (md5hash) REFERENCES html_content(md5hash) ON DELETE cascade; +ALTER TABLE default_chunks ADD CONSTRAINT default_chunks_to_chunk_fkey FOREIGN KEY (chunk_id) REFERENCES chunk(id) 
ON DELETE cascade; +ALTER TABLE html_content_to_chunk RENAME COLUMN html_content_md5hash TO md5hash; diff --git a/sql/2023-09-20-seed-identification-API-for-Nachet-frontend.sql b/sql/2023-09-20-seed-identification-API-for-Nachet-frontend.sql new file mode 100644 index 0000000..e69de29 diff --git a/sql/schema_to_csv.sql b/sql/schema_to_csv.sql index c3584e8..cb6523d 100644 --- a/sql/schema_to_csv.sql +++ b/sql/schema_to_csv.sql @@ -1,4 +1,4 @@ -CREATE OR REPLACE FUNCTION schema_to_csv(schema_source TEXT, path TEXT) RETURNS void AS $$ +CREATE OR REPLACE FUNCTION public.schema_to_csv(schema_source TEXT, path TEXT) RETURNS void AS $$ declare tables RECORD; statement TEXT; @@ -17,4 +17,3 @@ END LOOP; return; end; $$ LANGUAGE plpgsql; - diff --git a/tests/test_db.py b/tests/test_db.py deleted file mode 100644 index ebef987..0000000 --- a/tests/test_db.py +++ /dev/null @@ -1,105 +0,0 @@ -import os -import unittest -import pytest -import psycopg -import json -from psycopg.rows import dict_row - -import dotenv -dotenv.load_dotenv() - -def raise_error(message): - raise Exception(message) - -LOUIS_DSN = os.getenv("LOUIS_DSN") or raise_error("LOUIS_DSN is not set") -LOUIS_SCHEMA = os.getenv("LOUIS_SCHEMA") or raise_error("LOUIS_SCHEMA is not set") -MATCH_THRESHOLD = 0.5 -MATCH_COUNT = 10 - -class DBTest(unittest.TestCase): - - def execute(self, filename): - query = open(filename).read() - self.cursor.execute(query) - - def setUp(self): - self.connection = psycopg.connect(LOUIS_DSN) - self.cursor = self.connection.cursor(row_factory=dict_row) - self.cursor.execute("SET search_path TO louis_v004, public") - - def tearDown(self): - self.cursor.close() - self.connection.close() - - def upgrade_schema(self): - return - if LOUIS_SCHEMA == 'louis_v004': - self.execute('sql/2023-07-11-hotfix-xml-not-well-formed.sql') - self.execute('sql/2023-07-11-populate-link.sql') - self.execute('sql/2023-07-12-score-current.sql') - self.execute('sql/2023-07-19-modify-score_type-add-similarity.sql') - self.execute('sql/2023-07-19-modified-documents.sql') - self.execute('sql/2023-07-19-weighted_search.sql') - self.execute('sql/2023-07-21-default_chunk.sql') - - def test_well_formed_xml(self): - self.upgrade_schema() - # SELECT count(*) FROM crawl WHERE NOT xml_is_well_formed(html_content); - self.cursor.execute(""" - SELECT count(*) - FROM crawl - WHERE NOT xml_is_well_formed(html_content);""") - result = self.cursor.fetchall() - self.assertEqual(result[0]['count'], 0, "All xml should be well formed") - - def test_weighted_search(self): - self.upgrade_schema() - - with open('tests/embeddings/president.json') as f: - embeddings = json.load(f) - query = 'who is the president of the CFIA?' - weights = json.dumps( - {'similarity': 0.6, 'recency': 0.2, 'traffic': 0.0, 'current': 0.1}) - self.cursor.execute( - "SELECT * FROM search(%s, %s::vector, %s::float, %s::integer, %s::jsonb)", ( - query, embeddings, MATCH_THRESHOLD, MATCH_COUNT, weights)) - results = self.cursor.fetchall() - result = results[0]['search'] - self.assertEqual( - result[0]['title'], - "Dr. Harpreet S. 
Kochhar - Canadian Food Inspection Agency") - - query_id = result[0]['query_id'] - self.cursor.execute("SELECT * FROM query where id = %s::uuid", (query_id,)) - result = self.cursor.fetchall() - self.assertEqual(len(result), 1) - self.assertEqual(result[0]['query'], query) - result_embedding = json.loads(result[0]['embedding']) - self.assertAlmostEqual(result_embedding[0], embeddings[0]) - self.assertEqual(len(result[0]['result']), MATCH_COUNT) - - def test_weighted_search_with_empty_query(self): - self.upgrade_schema() - - weights = json.dumps({ 'recency': 0.4, 'traffic': 0.4, 'current': 0.2}) - self.cursor.execute( - "SELECT * FROM search(%s, %s::vector, %s::float, %s::integer, %s::jsonb)", ( - None, None, MATCH_THRESHOLD, MATCH_COUNT, weights)) - result = self.cursor.fetchall()[0]['search'] - self.assertEqual(len(result), MATCH_COUNT, "Should return 10 results") - urls = dict([(r['url'], True) for r in result]) - self.assertEqual(len(urls.keys()), MATCH_COUNT, "All urls should be unique") - - - @unittest.skip("we have to re-chunk the documents using louis-crawler first") - @pytest.mark.skip( - reason="we have to re-chunk the documents using louis-crawler first") - def test_every_crawl_doc_should_have_at_least_one_chunk(self): - self.cursor.execute(""" - SELECT count(*) - FROM crawl LEFT JOIN chunk ON crawl.id = chunk.crawl_id - WHERE chunk.id IS NULL""") - result = self.cursor.fetchall() - self.assertEqual( - result[0]['count'], 0, - "Every crawl doc should have at least one chunk") diff --git a/tests/test_db_api.py b/tests/test_db_api.py new file mode 100644 index 0000000..804cb0d --- /dev/null +++ b/tests/test_db_api.py @@ -0,0 +1,74 @@ +"""test database functions""" +import unittest +import json + +import louis.db as db +import louis.db.api as api + +import testing_utils as test + + +class TestDBAPI(unittest.TestCase): + """Test the database functions""" + def setUp(self): + self.connection = db.connect_db() + self.cursor = db.cursor(self.connection) + + def tearDown(self): + self.connection.rollback() + self.connection.close() + + def test_match_documents_text_query(self): + with db.cursor(self.connection) as cursor: + docs = api.match_documents_from_text_query( + cursor, + 'what are the cooking temperatures for e.coli?') + self.connection.rollback() + self.assertEqual(len(docs), 10) + + # obsoleted by weighted search + # def test_president_of_cfia(self): + # with db.cursor(self.connection) as cursor: + # docs = api.match_documents_from_text_query( + # cursor, 'who is the president of the CFIA?') + # self.connection.rollback() + # self.assertEqual( + # docs[0]['title'], + # 'Dr. Harpreet S. Kochhar - Canadian Food Inspection Agency') + + def test_weighted_search(self): + with open('tests/embeddings/president.json') as f: + embeddings = json.load(f) + query = 'who is the president of the CFIA?' + weights = json.dumps( + {'similarity': 0.6, 'recency': 0.2, 'traffic': 0.0, 'current': 0.1}) + self.cursor.execute( + "SELECT * FROM search(%s, %s::vector, %s::float, %s::integer, %s::jsonb)", ( + query, embeddings, test.MATCH_THRESHOLD, test.MATCH_COUNT, weights)) + results = self.cursor.fetchall() + result = results[0]['search'] + self.assertEqual( + result[0]['title'], + "Dr. Harpreet S. 
Kochhar - Canadian Food Inspection Agency") + + query_id = result[0]['query_id'] + self.cursor.execute("SELECT * FROM query where id = %s::uuid", (query_id,)) + result = self.cursor.fetchall() + self.assertEqual(len(result), 1) + self.assertEqual(result[0]['query'], query) + result_embedding = result[0]['embedding'] + self.assertAlmostEqual(result_embedding[0], embeddings[0]) + self.assertEqual(len(result[0]['result']), test.MATCH_COUNT) + + def test_weighted_search_with_empty_query(self): + weights = json.dumps({ 'recency': 0.4, 'traffic': 0.4, 'current': 0.2}) + self.cursor.execute( + "SELECT * FROM search(%s, %s::vector, %s::float, %s::integer, %s::jsonb)", ( + None, None, test.MATCH_THRESHOLD, test.MATCH_COUNT, weights)) + result = self.cursor.fetchall()[0]['search'] + self.assertEqual(len(result), test.MATCH_COUNT, "Should return 10 results") + urls = dict([(r['url'], True) for r in result]) + self.assertEqual( + len(urls.keys()), + test.MATCH_COUNT, + "All urls should be unique") \ No newline at end of file diff --git a/tests/test_db_crawler.py b/tests/test_db_crawler.py new file mode 100644 index 0000000..63db26c --- /dev/null +++ b/tests/test_db_crawler.py @@ -0,0 +1,77 @@ +"""test database functions""" +import unittest + +import louis.db as db +import louis.db.crawler as crawler + +import testing_utils as test + +class TestDBCrawler(unittest.TestCase): + """Test the database functions""" + def setUp(self): + self.connection = db.connect_db() + + def tearDown(self): + self.connection.close() + + def test_link_pages_and_fetch_links(self): + """sample test to check if link_pages works""" + with db.cursor(self.connection) as cursor: + source_url = "https://inspection.canada.ca/preventive-controls/sampling-procedures/eng/1518033335104/1528203403149" + destination_url = "https://inspection.canada.ca/animal-health/terrestrial-animals/exports/pets/australia/eng/1321292836314/1321292933011" + crawler.link_pages(cursor, source_url, destination_url) + links = crawler.fetch_links(cursor, source_url) + self.connection.rollback() + self.assertTrue(destination_url in links) + + def test_fetch_crawl_row_by_http_url(self): + """sample test to check if fetch_crawl_row works""" + with db.cursor(self.connection) as cursor: + row = crawler.fetch_crawl_row( + cursor, + "https://inspection.canada.ca/a-propos-de-l-acia/structure-organisationnelle/mandat/fra/1299780188624/1319164463699" + ) + self.connection.rollback() + self.assertEqual(row['url'], "https://inspection.canada.ca/a-propos-de-l-acia/structure-organisationnelle/mandat/fra/1299780188624/1319164463699") + self.assertEqual( + row['title'], + "Mandat - Agence canadienne d'inspection des aliments") + + def test_fetch_crawl_row_by_postgresql_url(self): + """sample test to check if fetch_crawl_row works""" + url = db.create_postgresql_url( + "DBNAME", + "crawl", + "8b25a4d3-bd83-412d-8cd8-0fd969f28efc") + with db.cursor(self.connection) as cursor: + row = crawler.fetch_crawl_row( + cursor, + url + ) + self.connection.rollback() + self.assertEqual(row['url'], "https://inspection.canada.ca/preventive-controls/sampling-procedures/eng/1518033335104/1528203403149") + self.assertEqual( + row['title'], + "Sampling procedures - Canadian Food Inspection Agency") + + def test_fetch_chunk_row(self): + """sample test to check if fetch_chunk_row works""" + url = db.create_postgresql_url( + "DBNAME", + "chunk", + "469812c5-190c-4e56-9f88-c8621592bcb5") + with db.cursor(self.connection) as cursor: + row = crawler.fetch_chunk_token_row(cursor, url) + 
self.connection.rollback() + self.assertTrue(isinstance(row, dict)) + self.assertEqual(len(row['tokens']), 76) + self.assertEqual(str(row['chunk_id']), "469812c5-190c-4e56-9f88-c8621592bcb5") + self.assertEqual(str(row['token_id']), 'dbb7b498-2cbf-4ae9-aa10-3169cc72f285') + + def test_fetch_chunk_id_without_embedding(self): + """sample test to check if fetch_chunk_id_without_embedding works""" + with db.cursor(self.connection) as cursor: + cursor.execute(test.embedding_table.format(embedding_model='test-model')) + rows = crawler.fetch_chunk_id_without_embedding(cursor, 'test-model') + _entity_id = rows[0] + self.connection.rollback() \ No newline at end of file diff --git a/tests/test_db_data.py b/tests/test_db_data.py new file mode 100644 index 0000000..7bc9f0e --- /dev/null +++ b/tests/test_db_data.py @@ -0,0 +1,54 @@ +import unittest + +import psycopg +from psycopg.rows import dict_row + +import testing_utils as test + +class TestDBData(unittest.TestCase): + + def execute(self, filename): + query = open(filename).read() + self.cursor.execute(query) + + def setUp(self): + self.connection = psycopg.connect(test.LOUIS_DSN) + self.cursor = self.connection.cursor(row_factory=dict_row) + self.cursor.execute("SET search_path TO louis_v004, public") + + def tearDown(self): + self.connection.rollback() + self.cursor.close() + self.connection.close() + + def upgrade_schema(self): + return + if test.LOUIS_SCHEMA == 'louis_v004': + self.execute('sql/2023-07-11-hotfix-xml-not-well-formed.sql') + self.execute('sql/2023-07-11-populate-link.sql') + self.execute('sql/2023-07-12-score-current.sql') + self.execute('sql/2023-07-19-modify-score_type-add-similarity.sql') + self.execute('sql/2023-07-19-modified-documents.sql') + self.execute('sql/2023-07-19-weighted_search.sql') + self.execute('sql/2023-07-21-default_chunk.sql') + + def test_well_formed_xml(self): + self.upgrade_schema() + # SELECT count(*) FROM crawl WHERE NOT xml_is_well_formed(html_content); + self.cursor.execute(""" + SELECT count(*) + FROM html_content + WHERE NOT xml_is_well_formed(content);""") + result = self.cursor.fetchall() + self.assertEqual(result[0]['count'], 0, "All xml should be well formed") + + def test_every_crawl_doc_should_have_at_least_one_chunk(self): + # self.execute('sql/2023-08-09-issue8-html_content-table.sql') + self.cursor.execute(""" + select count(*) + from crawl left join documents on crawl.id = documents.id + where documents.id is null""") + result = self.cursor.fetchall() + self.assertEqual( + result[0]['count'], 0, + "Every crawl doc should have at least one chunk") diff --git a/tests/test_db_layer.py b/tests/test_db_layer.py deleted file mode 100644 index a3664df..0000000 --- a/tests/test_db_layer.py +++ /dev/null @@ -1,107 +0,0 @@ -"""test database functions""" -import os -import unittest - -import louis.db as db - -embedding_table = """ -create table if not exists "{embedding_model}" ( - id uuid default uuid_generate_v4 (), - token_id uuid references token(id), - embedding vector(1536), - primary key(id), - unique(token_id) -); -""" - -class TestDBLayer(unittest.TestCase): - """Test the database functions""" - def setUp(self): - self.connection = db.connect_db() - - def tearDown(self): - self.connection.close() - - def test_schema(self): - """sample test to check if the schema is correct and idempotent""" - LOUIS_SCHEMA = os.environ.get('LOUIS_SCHEMA') - with open(f"dumps/{LOUIS_SCHEMA}/schema.sql", encoding='utf-8') as schema_file: - schema = schema_file.read() - schema = schema.replace(LOUIS_SCHEMA, 
-        with db.cursor(self.connection) as cursor:
-            cursor.execute(schema)
-        self.connection.rollback()
-
-    def test_link_pages_and_fetch_links(self):
-        """sample test to check if link_pages works"""
-        with db.cursor(self.connection) as cursor:
-            source_url = "https://inspection.canada.ca/splash"
-            destination_url = "https://inspection.canada.ca/animal-health/terrestrial-animals/exports/pets/australia/eng/1321292836314/1321292933011"
-            db.link_pages(cursor, source_url, destination_url)
-            links = db.fetch_links(cursor, "https://inspection.canada.ca/splash")
-        self.connection.rollback()
-        self.assertEqual(links, [destination_url])
-
-    def test_fetch_crawl_row(self):
-        """sample test to check if fetch_crawl_row works"""
-        with db.cursor(self.connection) as cursor:
-            row = db.fetch_crawl_row(
-                cursor,
-                "https://inspection.canada.ca/a-propos-de-l-acia/structure-organisationnelle/mandat/fra/1299780188624/1319164463699"
-            )
-        self.connection.rollback()
-        self.assertEqual(row['url'], "https://inspection.canada.ca/a-propos-de-l-acia/structure-organisationnelle/mandat/fra/1299780188624/1319164463699")
-        self.assertEqual(
-            row['title'],
-            "Mandat - Agence canadienne d'inspection des aliments")
-
-    def test_fetch_chunk_row(self):
-        """sample test to check if fetch_chunk_row works"""
-        with db.cursor(self.connection) as cursor:
-            row = db.fetch_chunk_token_row(
-                cursor,
-                "postgresql://inspection.canada.ca/public/chunk/469812c5-190c-4e56-9f88-c8621592bcb5")
-        self.connection.rollback()
-        self.assertEqual(len(row['tokens']), 76)
-        self.assertEqual(str(row['chunk_id']), "469812c5-190c-4e56-9f88-c8621592bcb5")
-        self.assertEqual(str(row['token_id']), 'dbb7b498-2cbf-4ae9-aa10-3169cc72f285')
-
-
-    def test_fetch_chunk_id_without_embedding(self):
-        """sample test to check if fetch_chunk_id_without_embedding works"""
-        with db.cursor(self.connection) as cursor:
-            cursor.execute(embedding_table.format(embedding_model='test-model'))
-            rows = db.fetch_chunk_id_without_embedding(cursor, 'test-model')
-            _entity_id = rows[0]
-        self.connection.rollback()
-
-    def test_create_postgresql_url(self):
-        """sample test to check if create_parse_postgresql_url works"""
-        entity_uuid = '5cef886d-8408-4868-9a69-0f0ca2167941'
-        url = db.create_postgresql_url(
-            "inspection.canada.ca",
-            "chunk", entity_uuid,
-            {'encoding': 'cl100k_base'})
-        self.assertEqual(url, f"postgresql://inspection.canada.ca/public/chunk/{entity_uuid}?encoding=cl100k_base")
-        parsed = db.parse_postgresql_url(url)
-        self.assertEqual(parsed['dbname'], "inspection.canada.ca")
-        self.assertEqual(parsed['tablename'], "chunk")
-        self.assertEqual(parsed['entity_uuid'], entity_uuid)
-        self.assertEqual(parsed['parameters']['encoding'][0], "cl100k_base")
-
-    def test_match_documents_text_query(self):
-        with db.cursor(self.connection) as cursor:
-            docs = db.match_documents_from_text_query(
-                cursor,
-                'what are the cooking temperatures for e.coli?')
-        self.connection.rollback()
-        self.assertEqual(len(docs), 10)
-
-    def test_president_of_cfia(self):
-        with db.cursor(self.connection) as cursor:
-            docs = db.match_documents_from_text_query(
-                cursor, 'who is the president of the CFIA?')
-        self.connection.rollback()
-        self.assertEqual(
-            docs[0]['title'],
-            'Dr. Harpreet S. Kochhar - Canadian Food Inspection Agency')
diff --git a/tests/test_db_schema.py b/tests/test_db_schema.py
new file mode 100644
index 0000000..e8ee7eb
--- /dev/null
+++ b/tests/test_db_schema.py
@@ -0,0 +1,24 @@
+"""test database functions"""
+import unittest
+
+import testing_utils as test
+import louis.db as db
+
+
+class TestDBSchema(unittest.TestCase):
+    """Test the database functions"""
+    def setUp(self):
+        self.connection = db.connect_db()
+
+    def tearDown(self):
+        self.connection.close()
+
+    def test_schema(self):
+        """sample test to check if the schema is correct and idempotent"""
+        schema_filename = f"dumps/{test.LOUIS_SCHEMA}/schema.sql"
+        with open(schema_filename, encoding='utf-8') as schema_file:
+            schema = schema_file.read()
+        schema = schema.replace(test.LOUIS_SCHEMA, 'test')
+        with db.cursor(self.connection) as cursor:
+            cursor.execute(schema)
+        self.connection.rollback()
\ No newline at end of file
diff --git a/tests/test_db_utils.py b/tests/test_db_utils.py
new file mode 100644
index 0000000..7194cea
--- /dev/null
+++ b/tests/test_db_utils.py
@@ -0,0 +1,28 @@
+"""test database functions"""
+import unittest
+
+import testing_utils as test
+import louis.db as db
+
+
+class TestDBUtils(unittest.TestCase):
+    """Test the database functions"""
+    def setUp(self):
+        self.connection = db.connect_db()
+
+    def tearDown(self):
+        self.connection.close()
+
+    def test_create_postgresql_url(self):
+        """sample test to check if create_parse_postgresql_url works"""
+        entity_uuid = '5cef886d-8408-4868-9a69-0f0ca2167941'
+        url = db.create_postgresql_url(
+            "inspection.canada.ca",
+            "chunk", entity_uuid,
+            {'encoding': 'cl100k_base'})
+        self.assertEqual(url, f"postgresql://inspection.canada.ca/{test.LOUIS_SCHEMA}/chunk/{entity_uuid}?encoding=cl100k_base")
+        parsed = db.parse_postgresql_url(url)
+        self.assertEqual(parsed['dbname'], "inspection.canada.ca")
+        self.assertEqual(parsed['tablename'], "chunk")
+        self.assertEqual(parsed['id'], entity_uuid)
+        self.assertEqual(parsed['parameters']['encoding'][0], "cl100k_base")
\ No newline at end of file
diff --git a/tests/testing_utils.py b/tests/testing_utils.py
new file mode 100644
index 0000000..9e560cb
--- /dev/null
+++ b/tests/testing_utils.py
@@ -0,0 +1,21 @@
+import os
+import dotenv
+dotenv.load_dotenv()
+
+def raise_error(message):
+    raise Exception(message)
+
+LOUIS_SCHEMA = os.getenv("LOUIS_SCHEMA") or raise_error("LOUIS_SCHEMA is not set")
+LOUIS_DSN = os.getenv("LOUIS_DSN") or raise_error("LOUIS_DSN is not set")
+MATCH_THRESHOLD = 0.5
+MATCH_COUNT = 10
+
+embedding_table = """
+create table if not exists "{embedding_model}" (
+    id uuid default uuid_generate_v4 (),
+    token_id uuid references token(id),
+    embedding vector(1536),
+    primary key(id),
+    unique(token_id)
+);
+"""