diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index dc03e15..565a5fa 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -18,7 +18,15 @@ ], // Configure tool-specific properties. - // "customizations": {}, + "customizations": { + "vscode":{ + "extensions": [ + "timonwong.shellcheck", + "GitHub.vscode-pull-request-github", + "charliermarsh.ruff" + ] + } + }, // Uncomment to connect as root instead. More info: https://aka.ms/dev-containers-non-root. // "remoteUser": "root" diff --git a/.env.template b/.env.template new file mode 100644 index 0000000..4592aea --- /dev/null +++ b/.env.template @@ -0,0 +1,12 @@ +LOUIS_DSN= +PGBASE= +PGUSER= +USER= +PGHOST= +POSTGRES_PASSWORD= +PGPASSWORD= +OPENAI_API_KEY= +AZURE_OPENAI_SERVICE= +LOUIS_SCHEMA= +DB_SERVER_CONTAINER_NAME= +PGDATA= diff --git a/.gitignore b/.gitignore index 390df2b..c0f7379 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,4 @@ -.env +.env** .pgpassfile dumps/** reports/** diff --git a/DEVELOPER.md b/DEVELOPER.md new file mode 100644 index 0000000..2c67321 --- /dev/null +++ b/DEVELOPER.md @@ -0,0 +1,53 @@ +# Development guidelines for louis-db + +## Making changes to the database schema + +### Run the latest schema locally + +* Set up the .env environment variables + * LOUIS_DSN: Data Source Name (DSN) used for configuring a database connection in Louis's system. + + * PGBASE: the name of the target PostgreSQL database the scripts connect to (psql/pg_dump -d). + + * PGUSER: the username or role required to authenticate and access a PostgreSQL database. + + * USER: the username for the database role created locally (see bin/setup-db-docker.sh). + + * PGHOST: the hostname or IP address of the server where the PostgreSQL database is hosted. + + * PGPASSWORD: the password used for user authentication when connecting to the PostgreSQL database. + + * POSTGRES_PASSWORD: the password of the postgres superuser, passed to the database container when it is created. + + * PGDATA: path to the directory where PostgreSQL data files are stored. + + * OPENAI_API_KEY: the API key required for authentication when making requests to the OpenAI API. + + * AZURE_OPENAI_SERVICE: the name of the Azure OpenAI service to use. + + * LOUIS_SCHEMA: the Louis schema within the database. + + * DB_SERVER_CONTAINER_NAME: the name of your database server container. + +* Run the database locally (see bin/postgres.sh) +* Restore the latest schema dump + +### Before every change + +* pg_dump the schema using ```bin/backup-db-docker.sh``` + +### Create a change + +* make sure to first create a GitHub issue #X describing the work to be done +* create a branch ```issueX-descriptive-name``` +* add a new SQL file YYYY-mm-dd-issueX-descriptive-name + * explain the changes to be made in a top header comment + * provide the original DDL of the objects to be modified +* create a test case under tests/ (e.g. tests/test_db_data.py); see the sketch after this list + * load your new SQL file within a transaction (that will be rolled back) + * ensure you have an assert that tests the change +* once your test passes, commit the change to the database by running your script with bin/psql.sh + * you should now be able to remove the loading of the SQL file and still run the test successfully +* re-run the test suite and fix any failing database functions affected by your changes +* dump the new schema as louis_v00X, with X incremented by 1 +* test the new schema with your client apps.
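To make the test-case step in DEVELOPER.md concrete, here is a minimal sketch of what such a transactional test might look like. It assumes LOUIS_DSN and LOUIS_SCHEMA are set through .env as described above; the SQL file name, table name and test class are hypothetical placeholders following the YYYY-mm-dd-issueX naming convention.

```python
"""Sketch of a schema-change test: the new SQL script is applied inside a
transaction that is rolled back in tearDown, so the database is left
untouched whether or not the assert passes."""
import os
import unittest

import dotenv
import psycopg
from psycopg.rows import dict_row

dotenv.load_dotenv()


class TestIssueXChange(unittest.TestCase):
    def setUp(self):
        self.connection = psycopg.connect(os.environ["LOUIS_DSN"])
        self.cursor = self.connection.cursor(row_factory=dict_row)
        self.cursor.execute(
            f"SET search_path TO {os.environ['LOUIS_SCHEMA']}, public")

    def tearDown(self):
        # roll back so the ad hoc SQL never persists while you iterate
        self.connection.rollback()
        self.cursor.close()
        self.connection.close()

    def test_apply_change(self):
        # hypothetical file name; replace with your actual script
        with open("sql/2023-01-01-issueX-descriptive-name.sql") as f:
            self.cursor.execute(f.read())
        # hypothetical assert: check the effect of the change
        self.cursor.execute("SELECT count(*) AS count FROM some_new_table")
        self.assertEqual(self.cursor.fetchone()["count"], 0)
```

Once the change has been committed with bin/psql.sh, the open/execute of the SQL file can be removed from the test and the assert should still pass.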
diff --git a/Dockerfile b/Dockerfile index f3df76f..ac24f3d 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,7 +1,7 @@ # syntax=docker/dockerfile:1 FROM alpine RUN apk add && apk add postgresql-client -COPY docker-entrypoint.sh /entrypoint.sh +COPY bin/docker-entrypoint.sh /entrypoint.sh ENV LOUIS_DSN= ENV LOUIS_SCHEMA= ENV LOAD_DATA_ONLY= diff --git a/README.md b/README.md index 24a0794..0ca513a 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,20 @@ ## Installing python package +If you need to interface with the database, install the package with: + +``` +pip install git+https://github.com/ai-cfia/louis-db@v0.0.5-alpha3 ``` -pip install git+https://github.com/ai-cfia/louis-db@v0.0.5-alpha2 -``` \ No newline at end of file + +You'll often want to add, move or modify existing database-layer functions found in louis-db while working from a client repository. + +To edit, you can install an editable version of the package: + +``` +pip install -e git+https://github.com/ai-cfia/louis-db#egg=louis_db +``` + +This will check out the latest source into a local git checkout under src/louis-db, allowing edits in that directory to be immediately available for use by client repositories such as louis-crawler. + +Don't forget to create a PR with your changes once you're done! \ No newline at end of file diff --git a/backup-db-docker.sh b/backup-db-docker.sh deleted file mode 100755 index 0a441d2..0000000 --- a/backup-db-docker.sh +++ /dev/null @@ -1,35 +0,0 @@ -#!/bin/bash -DIRNAME=`dirname $0` -. $DIRNAME/lib.sh -TODAY=`date +%Y-%m-%d` - -if [ -z "$PGBASE" -o -z "$LOUIS_SCHEMA" ]; then - echo "Environment variables PGBASE and LOUIS_SCHEMA must be specified" - exit 1 -fi - -NAME=dumps/$LOUIS_SCHEMA.$TODAY.pg_dump -PGDUMP="docker exec -it louis-db-server pg_dump -U postgres -d $PGBASE" - -if [ ! -f "$NAME" ]; then - echo "Backing up $LOUIS_SCHEMA to $NAME" - $PGDUMP --schema=$LOUIS_SCHEMA --no-owner --no-privileges > $NAME - if [ "$?" -eq 0 ]; then - echo "Dumped to $NAME" - else - echo "Error dumping to $NAME" - cat $NAME - rm $NAME - exit 1 - fi -else - echo "File $NAME already exists" -fi - -ARCHIVE_FILENAME="$NAME.zip" -if [ ! -f "$ARCHIVE_FILENAME" ]; then - zip $ARCHIVE_FILENAME $NAME -else - echo "File $ARCHIVE_FILENAME.zip already exists" -fi - diff --git a/backup-db.sh b/backup-db.sh deleted file mode 100755 index ba1a836..0000000 --- a/backup-db.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin/bash -TODAY=`date +%Y-%m-%d` -NAME=dumps/inspection.canada.ca.$TODAY.pg_dump -if [ ! -f "$NAME" ]; then - pg_dump --no-owner --no-privileges -d inspection.canada.ca > $NAME -else - echo "File $NAME already exists" -fi - -if [ ! 
-f "$NAME.zip" ]; then - zip $NAME.zip $NAME -else - echo "File $NAME.zip already exists" -fi - diff --git a/bin/README.md b/bin/README.md new file mode 100644 index 0000000..724d279 --- /dev/null +++ b/bin/README.md @@ -0,0 +1,64 @@ +# creating a new schema + +## environment + +This assumes: + +* you are running WSL +* you are running a dockerized version of PostgreSQL 15 under WSL +* you are running louis-db in a DevContainer under Visual Studio Code +* your source is on WSL under ~/src + +## configuration + +Database connection parameters are set in the .env file. + +You can create multiple .env.NAME files and symlink them as needed: + +working on the local source: + +``` +ln -sf .env.louis_v004_local .env +``` + +switching to the target: + +``` +ln -sf .env.louis_v005_azure .env +``` + +## Running the database server locally + +* use the Dockerfile in the postgres directory +* use the ```bin/postgres.sh``` script as your startup script (symlink) + +## Editing + +* Create ad hoc modifications as scripts in sql/ with the proper YYYY-mm-dd prefix +* Create tests that apply these SQL scripts in a transaction and test them +* Once satisfied, commit the changes to the database + + + +## backing up schema and data + +In this example, the modified louis_v004 becomes the louis_v005 schema: + +``` +./bin/dump-versioned-schema.sh louis_v004 louis_v005 +./bin/dump-versioned-data.sh louis_v004 louis_v005 +``` + +## loading schema + +Change your .env to point to your target database first: + +``` +./bin/load-versioned-schema.sh louis_v005 +``` + +Manually validate that the schema is as expected (e.g. with a DBeaver ERD diagram) before loading the data: + +``` +./bin/load-versioned-data.sh louis_v005 +``` \ No newline at end of file diff --git a/bin/backup-db-docker.sh b/bin/backup-db-docker.sh new file mode 100755 index 0000000..a97d09c --- /dev/null +++ b/bin/backup-db-docker.sh @@ -0,0 +1,7 @@ +#!/bin/bash +DIRNAME=`dirname $0` +. $DIRNAME/lib.sh + +docker cp $DIRNAME/backup-db.sh louis-db-server:backup-db.sh +docker cp $DIRNAME/lib.sh louis-db-server:lib.sh +docker exec -it -e PGDUMP_FILENAME=/dev/stdout --env-file $ENV_FILE louis-db-server ./backup-db.sh > $PGDUMP_FILENAME \ No newline at end of file diff --git a/bin/backup-db.sh b/bin/backup-db.sh new file mode 100755 index 0000000..5f706c8 --- /dev/null +++ b/bin/backup-db.sh @@ -0,0 +1,19 @@ +#!/bin/bash +DIRNAME=`dirname $0` +. $DIRNAME/lib.sh + +if [ ! -f "$NAME" ]; then + echo "preparing to dump $PGBASE.$LOUIS_SCHEMA to $PGDUMP_FILENAME" + # apparently pg_dump doesn't use the environment variables PG* + pg_dump -d $PGBASE --schema=$LOUIS_SCHEMA --no-owner --no-privileges --file $PGDUMP_FILENAME +else + echo "File $PGDUMP_FILENAME already exists" +fi + +if [ -f "$PGDUMP_FILENAME" ]; then + if [ ! 
-f "$PGDUMP_FILENAME.zip" ]; then + zip $PGDUMP_FILENAME.zip $PGDUMP_FILENAME + else + echo "File $PGDUMP_FILENAME.zip already exists" + fi +fi \ No newline at end of file diff --git a/build-data-volume.sh b/bin/build-data-volume.sh similarity index 100% rename from build-data-volume.sh rename to bin/build-data-volume.sh diff --git a/build-dataloader.sh b/bin/build-dataloader.sh similarity index 100% rename from build-dataloader.sh rename to bin/build-dataloader.sh diff --git a/combine-iis-logs.sh b/bin/combine-iis-logs.sh similarity index 100% rename from combine-iis-logs.sh rename to bin/combine-iis-logs.sh diff --git a/deprecated/init-schema-louis.sh b/bin/deprecated/init-schema-louis.sh similarity index 100% rename from deprecated/init-schema-louis.sh rename to bin/deprecated/init-schema-louis.sh diff --git a/deprecated/load-db.sh b/bin/deprecated/load-db.sh similarity index 100% rename from deprecated/load-db.sh rename to bin/deprecated/load-db.sh diff --git a/deprecated/load-schema.sh b/bin/deprecated/load-schema.sh similarity index 100% rename from deprecated/load-schema.sh rename to bin/deprecated/load-schema.sh diff --git a/deprecated/migrate.sh b/bin/deprecated/migrate.sh similarity index 100% rename from deprecated/migrate.sh rename to bin/deprecated/migrate.sh diff --git a/docker-entrypoint.sh b/bin/docker-entrypoint.sh similarity index 100% rename from docker-entrypoint.sh rename to bin/docker-entrypoint.sh diff --git a/dump-versioned-data.sh b/bin/dump-versioned-data.sh similarity index 65% rename from dump-versioned-data.sh rename to bin/dump-versioned-data.sh index 5be37d8..d9fed0f 100755 --- a/dump-versioned-data.sh +++ b/bin/dump-versioned-data.sh @@ -8,7 +8,7 @@ INPUT_SCHEMA=$1 OUTPUT_SCHEMA=$2 if [ -z "$PGHOST" -o "$PGHOST" == "localhost" ]; then - RELPATH=dumps/$OUTPUT_SCHEMA + RELPATH=$PROJECT_DIR/dumps/$OUTPUT_SCHEMA OUTPUT_DIR=`realpath $RELPATH` if [ -d "$OUTPUT_DIR" ]; then echo "Warning: Directory exist: $OUTPUT_DIR" @@ -18,7 +18,11 @@ else OUTPUT_DIR=/var/lib/postgresql/data fi -$PSQL_ADMIN < $DIRNAME/sql/schema_to_csv.sql +$PSQL_ADMIN -f $PROJECT_DIR/sql/schema_to_csv.sql +if [ $? -ne 0 ]; then + echo "Failed to install schema_to_csv function" + exit 3 +fi echo "Outputting all tables from schema $INPUT_SCHEMA as csv to $OUTPUT_DIR on the database server" -$PSQL_ADMIN -c "select * from schema_to_csv('$INPUT_SCHEMA', '$OUTPUT_DIR')" \ No newline at end of file +$PSQL_ADMIN -c "select * from public.schema_to_csv('$INPUT_SCHEMA'::text, '$OUTPUT_DIR'::text)" \ No newline at end of file diff --git a/dump-versioned-schema.sh b/bin/dump-versioned-schema.sh similarity index 72% rename from dump-versioned-schema.sh rename to bin/dump-versioned-schema.sh index ae7f71c..8981bf7 100755 --- a/dump-versioned-schema.sh +++ b/bin/dump-versioned-schema.sh @@ -1,23 +1,24 @@ #!/bin/bash DIRNAME=`dirname $0` . 
$DIRNAME/lib.sh -TODAY=`date +%Y-%m-%d` if [ -z $2 ]; then echo "usage: $0 source_schema output_schema" + echo "example: $0 louis_v005 to louis_v006" exit 1 fi SOURCE_SCHEMA=$1 TARGET_SCHEMA=$2 -SCHEMA_OUTPUT_DIR=$DIRNAME/dumps/$TARGET_SCHEMA +SCHEMA_OUTPUT_DIR=$PROJECT_DIR/dumps/$TARGET_SCHEMA mkdir -p $SCHEMA_OUTPUT_DIR SCHEMA_OUTPUT_FILENAME=$SCHEMA_OUTPUT_DIR/schema.sql if [ -f "$SCHEMA_OUTPUT_FILENAME" ]; then - echo "File $SCHEMA_OUTPUT_FILENAME already exists" - #exit 2 + echo "File $SCHEMA_OUTPUT_FILENAME already exists, exiting" + exit 2 fi +echo "dumping schema to $SCHEMA_OUTPUT_FILENAME" pg_dump -n $SOURCE_SCHEMA -d $PGBASE \ --no-owner --no-privileges --no-security-labels \ --no-table-access-method --no-tablespaces --schema-only \ diff --git a/bin/install-postgresl-client-15.sh b/bin/install-postgresl-client-15.sh new file mode 100755 index 0000000..e1efd9c --- /dev/null +++ b/bin/install-postgresl-client-15.sh @@ -0,0 +1,4 @@ +sudo sh -c 'echo "deb http://apt.postgresql.org/pub/repos/apt/ $(lsb_release -cs)-pgdg main" > /etc/apt/sources.list.d/pgdg.list' +wget --quiet -O - https://www.postgresql.org/media/keys/ACCC4CF8.asc | sudo apt-key add - +sudo apt update +sudo apt install postgresql-client-15 \ No newline at end of file diff --git a/bin/lib.sh b/bin/lib.sh new file mode 100644 index 0000000..6a32e5c --- /dev/null +++ b/bin/lib.sh @@ -0,0 +1,52 @@ +#!/bin/bash +DIRNAME=$(dirname $(realpath $0)) +PARENT_DIR=$DIRNAME/.. +PROJECT_DIR=$(realpath $PARENT_DIR) +ENV_FILE=$PROJECT_DIR/.env + +if [ -f "$ENV_FILE" ]; then + # shellcheck source=lib.sh + . "$ENV_FILE" +else +echo "WARNING: File $ENV_FILE does not exist, relying on environment variables" +fi + +check_environment_variables_defined () { + variable_not_set=0 + for VARIABLE in "$@"; do + if [ -z "${!VARIABLE}" ]; then + echo "Environment variable $VARIABLE is not set" + variable_not_set=1 + fi + done + + if [ $variable_not_set -eq 1 ]; then + echo "One or more variables are not defined, the program cannot continue" + exit 1 + fi +} + +export PGOPTIONS="--search_path=$LOUIS_SCHEMA" +export PGBASE +export PGDATABASE +export PGHOST +export PGUSER +export PGPORT +export PGHOST +export PGPASSFILE +export PGPASSWORD + +VERSION15=$(psql --version | grep 15.) + +if [ -z "$VERSION15" ]; then + echo "postgresql-client-15 required" + exit 1 +fi + +TODAY=$(date +%Y-%m-%d) + +if [ -z "$PGDUMP_FILENAME" ]; then + PGDUMP_FILENAME=$PROJECT_DIR/dumps/$TODAY.$PGBASE.pg_dump +fi + +export PSQL_ADMIN="psql -v ON_ERROR_STOP=1 --single-transaction -d $PGBASE" diff --git a/load-data-container.sh b/bin/load-data-container.sh similarity index 100% rename from load-data-container.sh rename to bin/load-data-container.sh diff --git a/bin/load-versioned-data.sh b/bin/load-versioned-data.sh new file mode 100755 index 0000000..9051192 --- /dev/null +++ b/bin/load-versioned-data.sh @@ -0,0 +1,72 @@ +#!/bin/bash +DIRNAME=$(dirname "$0") +. $DIRNAME/lib.sh + +## To debug +# set -x -e + +if [ -z "$1" ]; then + echo "usage: $0 target_schema" + exit 1 +fi +TARGET_SCHEMA=$1 +SOURCE_DIR=$PROJECT_DIR/dumps/$TARGET_SCHEMA +CSV_TO_SCHEMA=$PROJECT_DIR/sql/csv_to_schema.sql + +if [ ! -d "$SOURCE_DIR" ]; then + echo "Directory does not exist: $SOURCE_DIR" + exit 1 +fi + +if [ ! -f "$CSV_TO_SCHEMA" ]; then + echo "Source file does not exist: $CSV_TO_SCHEMA" + exit 1 +fi + +# $PSQL_ADMIN -d $PGBASE < $CSV_TO_SCHEMA +# if [ $? 
-ne 0 ]; then +# echo "Failed to load csv_to_schema.sql" +# exit 1 +# fi +# $PSQL_ADMIN -d $PGBASE -c "select * from csv_to_schema('$TARGET_SCHEMA', '$SOURCE_DIR', array['crawl', 'chunk', 'token', 'ada_002', 'link', 'score', 'query'])" + +TABLE_LIST=$SOURCE_DIR/tables.txt +echo $TABLE_LIST +if [ ! -f "$TABLE_LIST" ]; then + echo "File defining list of table and their load order does not exist: $TABLE_LIST" + exit 1 +fi +TABLES=$(cat "$TABLE_LIST") + +# we check that there's a csv file for each table +for table in $TABLES; do + FILENAME=$SOURCE_DIR/$table.csv + if [ ! -f "$FILENAME" ]; then + echo "File does not exist: $FILENAME" + exit 1 + fi +done + +# we check that there's a table for each csv file +for file in $SOURCE_DIR/*.csv; do + echo "Checking $file is expected in table list" + TABLE=`basename $file .csv` + if ! grep -q $TABLE $TABLE_LIST; then + echo "File $file is not expected in table list" + exit 1 + fi +done + +CSV_TO_SCHEMA_PSQL=$(mktemp) +echo "" > $CSV_TO_SCHEMA_PSQL +# echo "set session_replica_role = 'replica';" >> $CSV_TO_SCHEMA_PSQL +for table in $TABLES; do + echo "Loading $table" + FILENAME=$SOURCE_DIR/$table.csv + echo "\COPY $TARGET_SCHEMA.$table FROM $FILENAME WITH DELIMITER as ';' CSV HEADER" >> $CSV_TO_SCHEMA_PSQL +done +$PSQL_ADMIN -f $CSV_TO_SCHEMA_PSQL +if [ $? -ne 0 ]; then + echo "Failed to load $table" + exit 1 +fi \ No newline at end of file diff --git a/bin/load-versioned-schema.sh b/bin/load-versioned-schema.sh new file mode 100755 index 0000000..93a9b1c --- /dev/null +++ b/bin/load-versioned-schema.sh @@ -0,0 +1,34 @@ +#!/bin/bash +DIRNAME=$(dirname "$0") +. $DIRNAME/lib.sh + +## To debug +# set -x -e + +## Explain usage if missing argument +if [ -z "$1" ]; then + echo "usage: $0 source_schema" + exit 1 +fi + +SOURCE_SCHEMA=$1 + +## Create a path using the project directory and the schema name +SOURCE_DIR=$PROJECT_DIR/dumps/$1 + +## If the directory at the path doesn't exist, print an error and exit +if [ ! -d "$SOURCE_DIR" ]; then + echo "Directory does not exist: $SOURCE_DIR" + exit 1 +fi + +## Create a path to a file named schema.sql +SCHEMA_FILE=$SOURCE_DIR/schema.sql + +## If the file does not exist, print an error message and exit with status code 2 +if [ ! -f "$SCHEMA_FILE" ]; then + echo "File does not exist: $SCHEMA_FILE" + exit 2 +fi +## If the file does exist, pass it as input to a PostgreSQL admin command +$PSQL_ADMIN < $SCHEMA_FILE diff --git a/bin/postgres.sh b/bin/postgres.sh new file mode 100755 index 0000000..45d6089 --- /dev/null +++ b/bin/postgres.sh @@ -0,0 +1,36 @@ +#!/bin/bash +DIRNAME=$(dirname "$(realpath "$0")") +. "$DIRNAME"/lib.sh + +if [ -z "$PGDATA" ]; then + echo "PGDATA is not set. Setting to default directory..." + PGDATA=$HOME/pgdata +fi + +if [ ! -d "$PGDATA" ]; then + echo "PGDATA directory $PGDATA does not exist, creating it..." 
+ mkdir -p "$PGDATA" +fi + +check_environment_variables_defined DB_SERVER_CONTAINER_NAME POSTGRES_PASSWORD + +STATUS=$(docker inspect "$DB_SERVER_CONTAINER_NAME" -f '{{.State.Status}}') + +if [ "$STATUS" = "exited" ]; then + echo "container $DB_SERVER_CONTAINER_NAME exist but has exited, restarting" + docker start "$DB_SERVER_CONTAINER_NAME" + +elif [ "$STATUS" != "running" ]; then + + echo "container $DB_SERVER_CONTAINER_NAME does not exist, creating" + + docker run --name "$DB_SERVER_CONTAINER_NAME" \ + -e POSTGRES_PASSWORD="$POSTGRES_PASSWORD" \ + --network louis_network \ + --mount type=bind,src="$PGDATA",target=/var/lib/postgresql/data \ + --publish 5432:5432 \ + --user "$(id -u):$(id -g)" -v /etc/passwd:/etc/passwd:ro \ + -d "louis-postgres" +else + echo "Postgres is already running" +fi diff --git a/bin/psql.sh b/bin/psql.sh new file mode 100755 index 0000000..5486a0a --- /dev/null +++ b/bin/psql.sh @@ -0,0 +1,8 @@ +DIRNAME=`dirname $0` +. $DIRNAME/lib.sh +if [ -z "$1" ]; then + psql +else + SQL_SCRIPT=$1 + $PSQL_ADMIN -f "$SQL_SCRIPT" +fi \ No newline at end of file diff --git a/restore-db-docker.sh b/bin/restore-db-docker.sh similarity index 100% rename from restore-db-docker.sh rename to bin/restore-db-docker.sh diff --git a/bin/setup-db-docker.sh b/bin/setup-db-docker.sh new file mode 100755 index 0000000..00f2bff --- /dev/null +++ b/bin/setup-db-docker.sh @@ -0,0 +1,57 @@ +#!/bin/bash +DIRNAME=$(dirname "$0") +. $DIRNAME/lib.sh + +check_environment_variables_defined PGBASE DB_SERVER_CONTAINER_NAME PSQL_ADMIN PGPASSWORD USER + +# set -x -e +DOCKER_EXEC="docker exec -it $DB_SERVER_CONTAINER_NAME" + +## Check if user already exist +$DOCKER_EXEC psql -U postgres -c " +DO +\$do\$ +BEGIN + IF NOT EXISTS ( + SELECT FROM pg_roles WHERE rolname = '$USER' + ) THEN + CREATE USER $USER WITH PASSWORD '$PGPASSWORD'; + ALTER USER $USER WITH SUPERUSER; + END IF; +END +\$do\$;" + +## Print all existing users +$DOCKER_EXEC psql -U postgres -c '\du' + +## Check if database already exist +DB_EXISTS=$($DOCKER_EXEC psql -U "$USER" -d "$PGBASE" -tAc "SELECT 1 FROM pg_database WHERE datname='$PGBASE'" | tr -d '\r') + +if [ "$DB_EXISTS" = '1' ] +then + echo "Database $PGBASE already exists." +else + echo "Database $PGBASE does not exist, creating..." + $DOCKER_EXEC createdb -E utf-8 -U postgres "$PGBASE" +fi + +## Print all existing databases +$DOCKER_EXEC psql -U postgres -c '\l' + +$DOCKER_EXEC pip install pgxnclient + +VECTOR_INSTALLED=$(docker exec -u 0 -i "$DB_SERVER_CONTAINER_NAME" psql -U postgres -tAc "SELECT EXISTS(SELECT 1 FROM pg_extension WHERE extname = 'vector');") + +if [ "$VECTOR_INSTALLED" = "t" ] +then + echo "Extension vector is already installed." +else + echo "Extension vector is not installed. Installing..." 
+ docker exec -u 0 -it "$DB_SERVER_CONTAINER_NAME" pgxn install vector +fi + +$DOCKER_EXEC psql -U postgres -v ON_ERROR_STOP=1 --single-transaction -d $PGBASE -c 'SET search_path TO public; CREATE EXTENSION IF NOT EXISTS "uuid-ossp"; CREATE EXTENSION IF NOT EXISTS vector;' + +# $DOCKER_EXEC $PSQL_ADMIN -c "SET search_path TO public; CREATE EXTENSION IF NOT EXISTS \"uuid-ossp\"; CREATE EXTENSION IF NOT EXISTS vector;" +# User creation +# $DOCKER_EXEC $PSQL_ADMIN -c "CREATE USER $USER; ALTER USER $USER WITH SUPERUSER;" \ No newline at end of file diff --git a/setup-db.sh b/bin/setup-db.sh similarity index 90% rename from setup-db.sh rename to bin/setup-db.sh index 8a7e6e1..aa32bbd 100755 --- a/setup-db.sh +++ b/bin/setup-db.sh @@ -1,4 +1,5 @@ -DIRNAME=`dirname $0` +#!/bin/bash +DIRNAME=$(dirname "$0") . $DIRNAME/lib.sh $PSQL_ADMIN -f $DIRNAME/sql/fix-utf8-template.sql diff --git a/update-schema-louis.sh b/bin/update-schema-louis.sh similarity index 100% rename from update-schema-louis.sh rename to bin/update-schema-louis.sh diff --git a/lib.sh b/lib.sh deleted file mode 100644 index 2dbcb4c..0000000 --- a/lib.sh +++ /dev/null @@ -1,12 +0,0 @@ -DIRNAME=`dirname $0` -. $DIRNAME/.env -export PGDATABASE -export PGHOST -export PGUSER -export PGPORT -export PGHOST -export PGPASSFILE -export PGPASSWORD - -PSQL_ADMIN="psql -v ON_ERROR_STOP=1 -U postgres --single-transaction -d $PGBASE" -TODAY=`date +%Y-%m-%d` diff --git a/load-versioned-data.sh b/load-versioned-data.sh deleted file mode 100755 index faf5fcd..0000000 --- a/load-versioned-data.sh +++ /dev/null @@ -1,11 +0,0 @@ -DIRNAME=`dirname $0` -. $DIRNAME/lib.sh -RELPATH=dumps/$TARGET_SCHEMA -SOURCE_DIR=`realpath $RELPATH` -if [ ! -d "$SOURCE_DIR" ]; then - echo "Directory does not exist: $SOURCE_DIR" - exit 1 -fi - -$PSQL_ADMIN -d $PGBASE < $DIRNAME/sql/csv_to_schema.sql -$PSQL_ADMIN -d $PGBASE -c "select * from csv_to_schema('$TARGET_SCHEMA', '$SOURCE_DIR', array['crawl', 'chunk', 'token', 'ada_002', 'link', 'score', 'query'])" \ No newline at end of file diff --git a/load-versioned-schema.sh b/load-versioned-schema.sh deleted file mode 100755 index 5a12433..0000000 --- a/load-versioned-schema.sh +++ /dev/null @@ -1,20 +0,0 @@ -DIRNAME=`dirname $0` -. $DIRNAME/lib.sh - -if [ -z "$1" ]; then - echo "usage: $0 source_schema" - exit 1 -fi -SOURCE_SCHEMA=$1 -SOURCE_DIR=$DIRNAME/dumps/$1 -if [ ! -d "$SOURCE_DIR" ]; then - echo "Directory does not exist: $SOURCE_DIR" - exit 1 -fi - -SCHEMA_FILE=$SOURCE_DIR/schema.sql -if [ ! 
-f "$SCHEMA_FILE" ]; then - echo "File does not exist: $SCHEMA_FILE" - exit 2 -fi -$PSQL_ADMIN < $SCHEMA_FILE diff --git a/louis/db/__init__.py b/louis/db/__init__.py index d1dacc0..b241b5b 100644 --- a/louis/db/__init__.py +++ b/louis/db/__init__.py @@ -1,21 +1,25 @@ """Database functions for the Louis project.""" +import hashlib import logging import os import urllib -import numpy as np import psycopg -import psycopg.sql as sql from pgvector.psycopg import register_vector from psycopg.rows import dict_row -from louis.models import openai LOGGER = logging.getLogger(__name__) logging.basicConfig(level=logging.DEBUG) +class DBError(Exception): + pass + +class DBMissingEnvironmentVariable(DBError): + pass + def raise_error(message): - raise Exception(message) + raise DBMissingEnvironmentVariable(message) LOUIS_DSN = os.environ.get("LOUIS_DSN") or raise_error("LOUIS_DSN is not set") LOUIS_SCHEMA = os.environ.get("LOUIS_SCHEMA") or raise_error("LOUIS_SCHEMA is not set") @@ -28,6 +32,8 @@ def connect_db(): row_factory=dict_row, autocommit=False, options=f"-c search_path={LOUIS_SCHEMA},public") + assert connection.info.encoding == 'utf-8', ( + 'Encoding is not UTF8: ' + connection.info.encoding) # psycopg.extras.register_uuid() register_vector(connection) return connection @@ -36,223 +42,27 @@ def cursor(connection): """Return a cursor for the given connection.""" return connection.cursor() -def store_chunk_item(cursor, item): - """Process a ChunkItem and insert it into the database.""" - try: - data = { - 'url': item["url"], - 'title': item["title"], - 'text_content': item["text_content"], - 'tokens': item["tokens"], - 'encoding': 'cl100k_base' - } - cursor.execute( - """SELECT id FROM crawl WHERE url = %(url)s - ORDER BY last_updated DESC LIMIT 1""", - data - ) - data['crawl_id'] = cursor.fetchone()['id'] - cursor.execute( - "INSERT INTO chunk (crawl_id, title, text_content)" - " VALUES(%(crawl_id)s::UUID, %(title)s, %(text_content)s)" - " RETURNING id", - data - ) - data['chunk_id'] = cursor.fetchone()['id'] - cursor.execute( - "INSERT INTO token (chunk_id, tokens, encoding)" - " VALUES (%(chunk_id)s::UUID, %(tokens)s, %(encoding)s)" - " RETURNING id", - data - ) - data['token_id'] = cursor.fetchone()['id'] - - return item - except psycopg.IntegrityError: - # ignore duplicates and keep processing - return item - -def store_crawl_item(cursor, item): - """Process a CrawlItem and insert it into the database.""" - try: - cursor.execute( - """INSERT INTO crawl - (url, title, lang, html_content, last_crawled, last_updated) - VALUES (%s, %s, %s, %s, %s, %s)""", - ( - item["url"], - item["title"], - item["lang"], - item["html_content"], - item["last_crawled"], - item["last_updated"], - ) - ) - return item - except psycopg.IntegrityError: - # ignore duplicates and keep processing - return item - -def store_embedding_item(cursor, item): - """Process an EmbeddingItem and insert it into the database.""" - try: - data = { - 'token_id': item["token_id"], - # TODO: shouldn't python-pgvector support casting from smallint[] to vector? 
- 'embedding': np.array(item["embedding"]), - 'embedding_model': item["embedding_model"], - } - query = sql.SQL( - 'INSERT INTO {embedding_model} (token_id, embedding)' - ' VALUES (%(token_id)s, %(embedding)s::vector)' - ).format(embedding_model=sql.Identifier(data['embedding_model'])).as_string(cursor) - cursor.execute( - query, - data - ) - return item - except psycopg.IntegrityError: - # ignore duplicates and keep processing - return item - - -def link_pages(cursor, source_url, destination_url): - """Link two pages together in the database.""" - data = { - 'source_url': source_url, - 'destination_url': destination_url, - } - cursor.execute( - """SELECT id FROM crawl - WHERE url = %(source_url)s ORDER BY last_updated DESC LIMIT 1""", - data - ) - data['source_crawl_id'] = cursor.fetchone()['id'] - cursor.execute( - """SELECT id FROM crawl - WHERE url = %(destination_url)s ORDER BY last_updated DESC LIMIT 1""", - data - ) - data['destination_crawl_id'] = cursor.fetchone()['id'] - cursor.execute( - "INSERT INTO link (source_crawl_id, destination_crawl_id)" - " VALUES (%(source_crawl_id)s, %(destination_crawl_id)s)" - " ON CONFLICT DO NOTHING", - data - ) - - -def fetch_links(cursor, url): - """Fetch all links from a given url.""" - data = { - 'source_url': url - } - cursor.execute( - "SELECT url FROM link" - " JOIN crawl ON link.destination_crawl_id = crawl.id" - " WHERE source_crawl_id = (" - " SELECT id FROM crawl WHERE url = %(source_url)s" - " ORDER BY last_updated DESC LIMIT 1)", - data - ) - data['destination_urls'] = [r['url'] for r in cursor.fetchall()] - return data['destination_urls'] - -def fetch_chunk_id_without_embedding(cursor, embedding_model='ada_002'): - """Fetch all chunk ids without an embedding.""" - query = sql.SQL( - "SELECT chunk_id FROM chunk" - " JOIN token ON chunk.id = token.chunk_id" - " LEFT JOIN {embedding_model} ON token.id = {embedding_model}.token_id" - " WHERE {embedding_model}.embedding IS NULL" - ).format(embedding_model=sql.Identifier(embedding_model)).as_string(cursor) - cursor.execute(query) - return [chunk_id['chunk_id'] for chunk_id in cursor.fetchall()] - -def fetch_crawl_row(cursor, url): - """Fetch the most recent crawl row for a given url.""" - data = { - 'url': url - } - cursor.execute( - "SELECT * FROM crawl WHERE url = %(url)s ORDER BY last_updated DESC LIMIT 1", - data - ) - return cursor.fetchone() - -def fetch_chunk_token_row(cursor, url): - """Fetch the most recent chunk token for a given chunk id.""" - - # TODO: eventually we could generalize the use of these postgresql - # url to data but for now keep it simple - data = parse_postgresql_url(url) - cursor.execute( - "SELECT chunk.id as chunk_id, token.id as token_id, tokens FROM chunk" - " JOIN token ON chunk.id = token.chunk_id" - " JOIN crawl ON chunk.crawl_id = crawl.id" - " WHERE chunk.id = %(entity_uuid)s LIMIT 1", - data - ) - # psycopg.extras.DictRow is not a real dict and will convert - # to string as a list so we force convert to dict - return dict(cursor.fetchone()) - -def create_postgresql_url(dbname, tablename, entity_uuid, parameters=None): +def create_postgresql_url(dbname, tablename, entity_id, parameters=None): if parameters is None: - return f'postgresql://{dbname}/public/{tablename}/{entity_uuid}' - return f'postgresql://{dbname}/public/{tablename}/{entity_uuid}?{urllib.parse.urlencode(parameters)}' + return f'postgresql://{dbname}/{LOUIS_SCHEMA}/{tablename}/{entity_id}' + return 
f'postgresql://{dbname}/{LOUIS_SCHEMA}/{tablename}/{entity_id}?{urllib.parse.urlencode(parameters)}' def parse_postgresql_url(url): """Parse a postgresql url and return a dictionary with the parameters.""" parsed = urllib.parse.urlparse(url) + path_split = parsed.path.split('/') return { 'dbname': parsed.hostname, - 'tablename': parsed.path.split('/')[2], - 'entity_uuid': parsed.path.split('/')[3], + 'schema': path_split[1], + 'tablename': path_split[2], + 'id': path_split[3], 'parameters': urllib.parse.parse_qs(parsed.query) } -def match_documents(cursor, query_embedding): - """Match documents with a given query.""" - data = { - # TODO: use of np.array to get it to recognize the vector type - # is there a simpler way to do this? only reason we use this - # dependency - # 'query_embedding': np.array(query_embedding), - 'query_embedding': query_embedding, - 'match_threshold': 0.5, - 'match_count': 10 - } - - # cursor.callproc('match_documents', data) - cursor.execute( - "SELECT * FROM match_documents" - "(%(query_embedding)s::vector, %(match_threshold)s, %(match_count)s)", - data) - - # turn into list of dict now to preserve dictionaries - return [dict(r) for r in cursor.fetchall()] - -def match_documents_from_text_query(cursor, query): - data = { - 'query': query, - 'tokens': openai.get_tokens_from_text(query) - } - results = cursor.execute(""" - SELECT * - FROM query - WHERE tokens = %(tokens)s::integer[] - """, data) - db_data = results.fetchone() - if not db_data: - data['embedding'] = openai.fetch_embedding(data['tokens']) - results = cursor.execute( - "INSERT INTO query(query, tokens, embedding)" - " VALUES(%(query)s, %(tokens)s, %(embedding)s) RETURNING id", data) - data['query_id'] = results.fetchone()['id'] - else: - data.update(db_data) - docs = match_documents(cursor, data['embedding']) +def hash(text): + """Return the hash of the given text. - return docs + We hash using the Python library to remove a roundtrip to the database + """ + return hashlib.md5(text.encode()).hexdigest() \ No newline at end of file diff --git a/louis/db/api/__init__.py b/louis/db/api/__init__.py new file mode 100644 index 0000000..c7613f6 --- /dev/null +++ b/louis/db/api/__init__.py @@ -0,0 +1,46 @@ +from louis.models import openai + + +def match_documents(cursor, query_embedding): + """Match documents with a given query.""" + data = { + # TODO: use of np.array to get it to recognize the vector type + # is there a simpler way to do this? 
only reason we use this + # dependency + # 'query_embedding': np.array(query_embedding), + 'query_embedding': query_embedding, + 'match_threshold': 0.5, + 'match_count': 10 + } + + # cursor.callproc('match_documents', data) + cursor.execute( + "SELECT * FROM match_documents" + "(%(query_embedding)s::vector, %(match_threshold)s, %(match_count)s)", + data) + + # turn into list of dict now to preserve dictionaries + return [dict(r) for r in cursor.fetchall()] + +def match_documents_from_text_query(cursor, query): + data = { + 'query': query, + 'tokens': openai.get_tokens_from_text(query) + } + results = cursor.execute(""" + SELECT * + FROM query + WHERE tokens = %(tokens)s::integer[] + """, data) + db_data = results.fetchone() + if not db_data: + data['embedding'] = openai.fetch_embedding(data['tokens']) + results = cursor.execute( + "INSERT INTO query(query, tokens, embedding)" + " VALUES(%(query)s, %(tokens)s, %(embedding)s) RETURNING id", data) + data['query_id'] = results.fetchone()['id'] + else: + data.update(db_data) + docs = match_documents(cursor, data['embedding']) + + return docs diff --git a/louis/db/crawler/__init__.py b/louis/db/crawler/__init__.py new file mode 100644 index 0000000..18d15eb --- /dev/null +++ b/louis/db/crawler/__init__.py @@ -0,0 +1,220 @@ +import psycopg +import numpy as np + +import louis.db as db + +def link_pages(cursor, source_url, destination_url): + """Link two pages together in the database.""" + data = { + 'source_url': source_url, + 'destination_url': destination_url, + } + cursor.execute( + """SELECT id FROM crawl + WHERE url = %(source_url)s ORDER BY last_updated DESC LIMIT 1""", + data + ) + data['source_crawl_id'] = cursor.fetchone()['id'] + cursor.execute( + """SELECT id FROM crawl + WHERE url = %(destination_url)s ORDER BY last_updated DESC LIMIT 1""", + data + ) + data['destination_crawl_id'] = cursor.fetchone()['id'] + cursor.execute( + "INSERT INTO link (source_crawl_id, destination_crawl_id)" + " VALUES (%(source_crawl_id)s, %(destination_crawl_id)s)" + " ON CONFLICT DO NOTHING", + data + ) + + +def fetch_links(cursor, url): + """Fetch all links from a given url.""" + data = { + 'source_url': url + } + cursor.execute( + "SELECT url FROM link" + " JOIN crawl ON link.destination_crawl_id = crawl.id" + " WHERE source_crawl_id = (" + " SELECT id FROM crawl WHERE url = %(source_url)s" + " ORDER BY last_updated DESC LIMIT 1)", + data + ) + data['destination_urls'] = [r['url'] for r in cursor.fetchall()] + return data['destination_urls'] + +def store_chunk_item(cursor, item): + """Process a ChunkItem and insert it into the database.""" + try: + data = { + 'url': item["url"], + 'title': item["title"], + 'text_content': item["text_content"], + 'tokens': item["tokens"], + 'encoding': 'cl100k_base' + } + cursor.execute( + """SELECT md5hash FROM crawl WHERE url = %(url)s + ORDER BY last_updated DESC LIMIT 1""", + data + ) + data['md5hash'] = cursor.fetchone()['md5hash'] + + # TODO: should probably update the title even if the text_content + # is already present as we may have changed how we create the title + cursor.execute( + """ + WITH e as( + INSERT INTO chunk (title, text_content) + VALUES(%(title)s, %(text_content)s) + ON CONFLICT DO NOTHING + RETURNING id + ) + SELECT id FROM e + UNION ALL + SELECT id FROM chunk WHERE text_content = %(text_content)s + """, + data + ) + data['chunk_id'] = cursor.fetchone()['id'] + cursor.execute( + """ + INSERT INTO html_content_to_chunk (html_content_md5hash, chunk_id) + VALUES(%(md5hash)s, %(chunk_id)s::UUID) + ON 
CONFLICT DO NOTHING + """, + data) + cursor.execute( + """ + WITH e as( + INSERT INTO token (chunk_id, tokens, encoding) + VALUES (%(chunk_id)s::UUID, %(tokens)s, %(encoding)s) + ON CONFLICT DO NOTHING + RETURNING * + ) + SELECT id FROM e + UNION ALL + SELECT id FROM token + WHERE chunk_id = %(chunk_id)s::UUID + and tokens = %(tokens)s::INTEGER[] + and encoding = %(encoding)s + """, + data + ) + data['token_id'] = cursor.fetchone()['id'] + return data + except psycopg.IntegrityError as e: + raise db.DBError("Error storing chunk item for %s" % item['url']) from e + + +def store_crawl_item(cursor, item): + """Process a CrawlItem and insert it into the database.""" + try: + item['html_content_md5hash'] = db.hash(item["html_content"]) + cursor.execute( + """INSERT INTO html_content (content, md5hash) + VALUES(%(html_content)s, %(html_content_md5hash)s) + ON CONFLICT DO NOTHING""", + item) + cursor.execute( + """INSERT INTO crawl + (url, title, lang, md5hash, last_crawled, last_updated) + VALUES ( + %(url)s, %(title)s, %(lang)s, %(html_content_md5hash)s, + %(last_crawled)s, %(last_updated)s) + """, + item + ) + return item + except psycopg.IntegrityError as e: + raise db.DBError("Error storing crawl item for %s" % item['url']) from e + + +def store_embedding_item(cursor, item): + """Process an EmbeddingItem and insert it into the database.""" + try: + data = { + 'token_id': item["token_id"], + # TODO: shouldn't python-pgvector support casting from smallint[] to vector? + 'embedding': np.array(item["embedding"]), + 'embedding_model': item["embedding_model"], + } + query = psycopg.sql.SQL( + 'INSERT INTO {embedding_model} (token_id, embedding)' + ' VALUES (%(token_id)s, %(embedding)s::vector)' + ).format( + embedding_model=psycopg.sql.Identifier( + data['embedding_model']) + ).as_string(cursor) + cursor.execute( + query, + data + ) + return item + except psycopg.IntegrityError as e: + raise db.DBError( + "Error storing embedding item for token %s" % item['token_id']) from e + +def fetch_crawl_ids_without_chunk(cursor): + """Fetch all crawl ids without an embedding.""" + query = psycopg.sql.SQL( + """ + SELECT crawl.id FROM crawl + LEFT JOIN html_content_to_chunk + ON crawl.md5hash = html_content_to_chunk.html_content_md5hash + WHERE chunk_id IS NULL + """ + ).as_string(cursor) + cursor.execute(query) + return [crawl_id['id'] for crawl_id in cursor.fetchall()] + +def fetch_chunk_id_without_embedding(cursor, embedding_model='ada_002'): + """Fetch all chunk ids without an embedding.""" + query = psycopg.sql.SQL( + "SELECT chunk_id FROM chunk" + " JOIN token ON chunk.id = token.chunk_id" + " LEFT JOIN {embedding_model} ON token.id = {embedding_model}.token_id" + " WHERE {embedding_model}.embedding IS NULL" + ).format(embedding_model=psycopg.sql.Identifier(embedding_model)).as_string(cursor) + cursor.execute(query) + return [chunk_id['chunk_id'] for chunk_id in cursor.fetchall()] + +def fetch_crawl_row(cursor, url): + """Fetch the most recent crawl row for a given url.""" + if url.startswith('postgresql://'): + data = db.parse_postgresql_url(url) + + cursor.execute( + """SELECT *, content as html_content FROM crawl + INNER JOIN html_content on crawl.md5hash = html_content.md5hash + WHERE id = %(id)s ORDER BY last_updated DESC LIMIT 1""", + data + ) + else: + data = {'url': url} + cursor.execute( + """SELECT *, content as html_content FROM crawl + INNER JOIN html_content on crawl.md5hash = html_content.md5hash + WHERE url = %(url)s ORDER BY last_updated DESC LIMIT 1""", + data + ) + if cursor.rowcount 
== 0: + raise db.DBError("No crawl found for id: {}".format(data)) + row = cursor.fetchone() + assert 'html_content' in row.keys() + return row + +def fetch_chunk_token_row(cursor, url): + """Fetch the most recent chunk token for a given chunk id.""" + data = db.parse_postgresql_url(url) + cursor.execute( + "SELECT chunk.id as chunk_id, token.id as token_id, tokens FROM chunk" + " JOIN token ON chunk.id = token.chunk_id" + " WHERE chunk.id = %(id)s LIMIT 1", + data + ) + # psycopg.extras.DictRow is not a real dict and will convert + # to string as a list so we force convert to dict + return cursor.fetchone() diff --git a/louis/models/openai.py b/louis/models/openai.py index e057356..4a4eafb 100644 --- a/louis/models/openai.py +++ b/louis/models/openai.py @@ -3,6 +3,9 @@ import openai import tiktoken +import dotenv +dotenv.load_dotenv() + # https://learn.microsoft.com/en-us/azure/cognitive-services/openai/how-to/embeddings?tabs=python def safe_get(key): diff --git a/psql.sh b/psql.sh deleted file mode 100755 index ed7b965..0000000 --- a/psql.sh +++ /dev/null @@ -1,3 +0,0 @@ -DIRNAME=`dirname $0` -. $DIRNAME/lib.sh -psql \ No newline at end of file diff --git a/setup-db-docker.sh b/setup-db-docker.sh deleted file mode 100755 index 15b4d23..0000000 --- a/setup-db-docker.sh +++ /dev/null @@ -1,13 +0,0 @@ -DIRNAME=`dirname $0` -. $DIRNAME/lib.sh - -if [ -z "$PGBASE" ]; then - echo "PGBASE is not set" - exit 1 -fi -DOCKER_EXEC="docker exec -it louis-db-server" -$DOCKER_EXEC createdb -E utf-8 $PGBASE -$DOCKER_EXEC $PSQL_ADMIN -c "CREATE USER $USER; ALTER USER $USER WITH SUPERUSER;" -$DOCKER_EXEC pip install pgxnclient -$DOCKER_EXEC pgxn install vector -$DOCKER_EXEC $PSQL_ADMIN -c "SET search_path TO public; CREATE EXTENSION IF NOT EXISTS \"uuid-ossp\"; CREATE EXTENSION IF NOT EXISTS vector;" \ No newline at end of file diff --git a/sql/2023-08-08-count-crawl-with-missing-chunk.sql b/sql/2023-08-08-count-crawl-with-missing-chunk.sql new file mode 100644 index 0000000..6bb2bd9 --- /dev/null +++ b/sql/2023-08-08-count-crawl-with-missing-chunk.sql @@ -0,0 +1,4 @@ +select count(*) +from crawl left join chunk +on crawl.id = chunk.crawl_id +where chunk.crawl_id is null \ No newline at end of file diff --git a/sql/2023-08-09-find-duplicated_html_content.sql b/sql/2023-08-09-find-duplicated_html_content.sql new file mode 100644 index 0000000..373f537 --- /dev/null +++ b/sql/2023-08-09-find-duplicated_html_content.sql @@ -0,0 +1,9 @@ +with hashes as ( +select id, md5(html_content) as md5sum, url from crawl +), +aggregated as( + select array_agg(id), array_agg(url), count(*) as dups + from hashes + group by md5sum +) +select * from aggregated where dups > 1 \ No newline at end of file diff --git a/sql/2023-08-09-issue8-html_content-table.sql b/sql/2023-08-09-issue8-html_content-table.sql new file mode 100644 index 0000000..64de142 --- /dev/null +++ b/sql/2023-08-09-issue8-html_content-table.sql @@ -0,0 +1,103 @@ +-- because the same html_content can be used by multiple crawl entries +-- we modify the crawl table by moving the column html_content to a new table html_content +-- and add a foreign key to the crawl table to html_content +-- we also modify chunk, originally linked to the crawl table as follows: +-- * the same chunk (example: this page has been archived can be extracted from different crawl entries (and html_content) +-- even when these pages are not the same (do not have the same md5sum hash) +-- * we add an N:N relation between chunk and html_content + +-- original tables: +--- +-- CREATE TABLE 
crawl ( +-- id uuid NOT NULL DEFAULT uuid_generate_v4(), +-- url text NULL, +-- title text NULL, +-- lang bpchar(2) NULL, +-- html_content text NULL, +-- last_crawled text NULL, +-- last_updated text NULL, +-- last_updated_date date NULL, +-- CONSTRAINT crawl_pkey PRIMARY KEY (id), +-- CONSTRAINT crawl_url_last_updated_key UNIQUE (url, last_updated) +-- ); +-- +-- CREATE TABLE chunk ( +-- id uuid NOT NULL DEFAULT uuid_generate_v4(), +-- crawl_id uuid NULL, +-- title text NULL, +-- text_content text NULL, +-- CONSTRAINT chunk_pkey PRIMARY KEY (id), +-- CONSTRAINT chunk_text_content_key UNIQUE (text_content), +-- CONSTRAINT chunk_crawl_uuid_fkey FOREIGN KEY (crawl_id) REFERENCES crawl(id) ON DELETE CASCADE +-- ); + +CREATE table if not EXISTS html_content ( + content text NOT NULL, + md5hash CHAR(32) NOT NULL, + CONSTRAINT html_content_md5hash_key UNIQUE (md5hash) +); + +ALTER TABLE crawl + ADD COLUMN IF NOT EXISTS md5hash CHAR(32); + +-- the tables crawl and chunk already contain data so we need to move the data to the new table first before applying constraints +-- on duplicate key value + +UPDATE crawl + SET md5hash = md5(html_content); + +INSERT INTO html_content (content, md5hash) + SELECT html_content, md5hash FROM crawl +ON CONFLICT (md5hash) DO NOTHING; + +-- we create N:N mapping between html_content and chunk +-- as from different html_content it is possible to extract +-- the same chunk even if not all chunks are the same +CREATE table html_content_to_chunk ( + html_content_md5hash CHAR(32) NOT NULL, + chunk_id uuid NOT NULL +); + +insert into html_content_to_chunk (html_content_md5hash, chunk_id) + select crawl.md5hash, chunk.id + from chunk + join crawl on chunk.crawl_id = crawl.id; + +-- we add constraint AFTER insertion for better performance +ALTER TABLE html_content_to_chunk + ADD CONSTRAINT html_content_to_chunk_pkey PRIMARY KEY (html_content_md5hash, chunk_id), + ADD CONSTRAINT html_content_to_chunk_html_content_md5hash_fkey FOREIGN KEY (html_content_md5hash) REFERENCES html_content(md5hash) ON DELETE CASCADE, + ADD CONSTRAINT html_content_to_chunk_chunk_id_fkey FOREIGN KEY (chunk_id) REFERENCES chunk(id) ON DELETE CASCADE; + +CREATE OR REPLACE VIEW documents +AS SELECT crawl.id, + chunk.id AS chunk_id, + crawl.url, + html_content.content as html_content, + crawl.title, + chunk.title AS subtitle, + chunk.text_content AS content, + embedding.embedding, + cardinality(token.tokens) AS tokens_count, + crawl.last_updated, + scoring.score + FROM crawl, + html_content, + html_content_to_chunk, + chunk, + token, + ada_002 embedding, + scoring + WHERE chunk.id = token.chunk_id + AND token.id = embedding.token_id + AND crawl.id = scoring.entity_id + AND crawl.md5hash = html_content.md5hash + AND html_content_to_chunk.html_content_md5hash = html_content.md5hash + AND html_content_to_chunk.chunk_id = chunk.id; + +ALTER TABLE chunk + DROP CONSTRAINT chunk_crawl_uuid_fkey, + DROP COLUMN crawl_id; + +alter table crawl + drop column html_content; \ No newline at end of file diff --git a/sql/2023-09-04-add-missing-constraints-rename-column.sql b/sql/2023-09-04-add-missing-constraints-rename-column.sql new file mode 100644 index 0000000..fe740fe --- /dev/null +++ b/sql/2023-09-04-add-missing-constraints-rename-column.sql @@ -0,0 +1,3 @@ +ALTER TABLE crawl ADD CONSTRAINT crawl_to_html_content_md5hash_fkey FOREIGN KEY (md5hash) REFERENCES html_content(md5hash) ON DELETE cascade; +ALTER TABLE default_chunks ADD CONSTRAINT default_chunks_to_chunk_fkey FOREIGN KEY (chunk_id) REFERENCES chunk(id) 
ON DELETE cascade; +ALTER TABLE html_content_to_chunk RENAME COLUMN html_content_md5hash TO md5hash; diff --git a/sql/2023-09-20-seed-identification-API-for-Nachet-frontend.sql b/sql/2023-09-20-seed-identification-API-for-Nachet-frontend.sql new file mode 100644 index 0000000..e69de29 diff --git a/sql/schema_to_csv.sql b/sql/schema_to_csv.sql index c3584e8..cb6523d 100644 --- a/sql/schema_to_csv.sql +++ b/sql/schema_to_csv.sql @@ -1,4 +1,4 @@ -CREATE OR REPLACE FUNCTION schema_to_csv(schema_source TEXT, path TEXT) RETURNS void AS $$ +CREATE OR REPLACE FUNCTION public.schema_to_csv(schema_source TEXT, path TEXT) RETURNS void AS $$ declare tables RECORD; statement TEXT; @@ -17,4 +17,3 @@ END LOOP; return; end; $$ LANGUAGE plpgsql; - diff --git a/tests/test_db.py b/tests/test_db.py deleted file mode 100644 index ebef987..0000000 --- a/tests/test_db.py +++ /dev/null @@ -1,105 +0,0 @@ -import os -import unittest -import pytest -import psycopg -import json -from psycopg.rows import dict_row - -import dotenv -dotenv.load_dotenv() - -def raise_error(message): - raise Exception(message) - -LOUIS_DSN = os.getenv("LOUIS_DSN") or raise_error("LOUIS_DSN is not set") -LOUIS_SCHEMA = os.getenv("LOUIS_SCHEMA") or raise_error("LOUIS_SCHEMA is not set") -MATCH_THRESHOLD = 0.5 -MATCH_COUNT = 10 - -class DBTest(unittest.TestCase): - - def execute(self, filename): - query = open(filename).read() - self.cursor.execute(query) - - def setUp(self): - self.connection = psycopg.connect(LOUIS_DSN) - self.cursor = self.connection.cursor(row_factory=dict_row) - self.cursor.execute("SET search_path TO louis_v004, public") - - def tearDown(self): - self.cursor.close() - self.connection.close() - - def upgrade_schema(self): - return - if LOUIS_SCHEMA == 'louis_v004': - self.execute('sql/2023-07-11-hotfix-xml-not-well-formed.sql') - self.execute('sql/2023-07-11-populate-link.sql') - self.execute('sql/2023-07-12-score-current.sql') - self.execute('sql/2023-07-19-modify-score_type-add-similarity.sql') - self.execute('sql/2023-07-19-modified-documents.sql') - self.execute('sql/2023-07-19-weighted_search.sql') - self.execute('sql/2023-07-21-default_chunk.sql') - - def test_well_formed_xml(self): - self.upgrade_schema() - # SELECT count(*) FROM crawl WHERE NOT xml_is_well_formed(html_content); - self.cursor.execute(""" - SELECT count(*) - FROM crawl - WHERE NOT xml_is_well_formed(html_content);""") - result = self.cursor.fetchall() - self.assertEqual(result[0]['count'], 0, "All xml should be well formed") - - def test_weighted_search(self): - self.upgrade_schema() - - with open('tests/embeddings/president.json') as f: - embeddings = json.load(f) - query = 'who is the president of the CFIA?' - weights = json.dumps( - {'similarity': 0.6, 'recency': 0.2, 'traffic': 0.0, 'current': 0.1}) - self.cursor.execute( - "SELECT * FROM search(%s, %s::vector, %s::float, %s::integer, %s::jsonb)", ( - query, embeddings, MATCH_THRESHOLD, MATCH_COUNT, weights)) - results = self.cursor.fetchall() - result = results[0]['search'] - self.assertEqual( - result[0]['title'], - "Dr. Harpreet S. 
Kochhar - Canadian Food Inspection Agency") - - query_id = result[0]['query_id'] - self.cursor.execute("SELECT * FROM query where id = %s::uuid", (query_id,)) - result = self.cursor.fetchall() - self.assertEqual(len(result), 1) - self.assertEqual(result[0]['query'], query) - result_embedding = json.loads(result[0]['embedding']) - self.assertAlmostEqual(result_embedding[0], embeddings[0]) - self.assertEqual(len(result[0]['result']), MATCH_COUNT) - - def test_weighted_search_with_empty_query(self): - self.upgrade_schema() - - weights = json.dumps({ 'recency': 0.4, 'traffic': 0.4, 'current': 0.2}) - self.cursor.execute( - "SELECT * FROM search(%s, %s::vector, %s::float, %s::integer, %s::jsonb)", ( - None, None, MATCH_THRESHOLD, MATCH_COUNT, weights)) - result = self.cursor.fetchall()[0]['search'] - self.assertEqual(len(result), MATCH_COUNT, "Should return 10 results") - urls = dict([(r['url'], True) for r in result]) - self.assertEqual(len(urls.keys()), MATCH_COUNT, "All urls should be unique") - - - @unittest.skip("we have to re-chunk the documents using louis-crawler first") - @pytest.mark.skip( - reason="we have to re-chunk the documents using louis-crawler first") - def test_every_crawl_doc_should_have_at_least_one_chunk(self): - self.cursor.execute(""" - SELECT count(*) - FROM crawl LEFT JOIN chunk ON crawl.id = chunk.crawl_id - WHERE chunk.id IS NULL""") - result = self.cursor.fetchall() - self.assertEqual( - result[0]['count'], 0, - "Every crawl doc should have at least one chunk") diff --git a/tests/test_db_api.py b/tests/test_db_api.py new file mode 100644 index 0000000..804cb0d --- /dev/null +++ b/tests/test_db_api.py @@ -0,0 +1,74 @@ +"""test database functions""" +import unittest +import json + +import louis.db as db +import louis.db.api as api + +import testing_utils as test + + +class TestDBAPI(unittest.TestCase): + """Test the database functions""" + def setUp(self): + self.connection = db.connect_db() + self.cursor = db.cursor(self.connection) + + def tearDown(self): + self.connection.rollback() + self.connection.close() + + def test_match_documents_text_query(self): + with db.cursor(self.connection) as cursor: + docs = api.match_documents_from_text_query( + cursor, + 'what are the cooking temperatures for e.coli?') + self.connection.rollback() + self.assertEqual(len(docs), 10) + + # obsoleted by weighted search + # def test_president_of_cfia(self): + # with db.cursor(self.connection) as cursor: + # docs = api.match_documents_from_text_query( + # cursor, 'who is the president of the CFIA?') + # self.connection.rollback() + # self.assertEqual( + # docs[0]['title'], + # 'Dr. Harpreet S. Kochhar - Canadian Food Inspection Agency') + + def test_weighted_search(self): + with open('tests/embeddings/president.json') as f: + embeddings = json.load(f) + query = 'who is the president of the CFIA?' + weights = json.dumps( + {'similarity': 0.6, 'recency': 0.2, 'traffic': 0.0, 'current': 0.1}) + self.cursor.execute( + "SELECT * FROM search(%s, %s::vector, %s::float, %s::integer, %s::jsonb)", ( + query, embeddings, test.MATCH_THRESHOLD, test.MATCH_COUNT, weights)) + results = self.cursor.fetchall() + result = results[0]['search'] + self.assertEqual( + result[0]['title'], + "Dr. Harpreet S. 
Kochhar - Canadian Food Inspection Agency") + + query_id = result[0]['query_id'] + self.cursor.execute("SELECT * FROM query where id = %s::uuid", (query_id,)) + result = self.cursor.fetchall() + self.assertEqual(len(result), 1) + self.assertEqual(result[0]['query'], query) + result_embedding = result[0]['embedding'] + self.assertAlmostEqual(result_embedding[0], embeddings[0]) + self.assertEqual(len(result[0]['result']), test.MATCH_COUNT) + + def test_weighted_search_with_empty_query(self): + weights = json.dumps({ 'recency': 0.4, 'traffic': 0.4, 'current': 0.2}) + self.cursor.execute( + "SELECT * FROM search(%s, %s::vector, %s::float, %s::integer, %s::jsonb)", ( + None, None, test.MATCH_THRESHOLD, test.MATCH_COUNT, weights)) + result = self.cursor.fetchall()[0]['search'] + self.assertEqual(len(result), test.MATCH_COUNT, "Should return 10 results") + urls = dict([(r['url'], True) for r in result]) + self.assertEqual( + len(urls.keys()), + test.MATCH_COUNT, + "All urls should be unique") \ No newline at end of file diff --git a/tests/test_db_crawler.py b/tests/test_db_crawler.py new file mode 100644 index 0000000..63db26c --- /dev/null +++ b/tests/test_db_crawler.py @@ -0,0 +1,77 @@ +"""test database functions""" +import unittest + +import louis.db as db +import louis.db.crawler as crawler + +import testing_utils as test + +class TestDBCrawler(unittest.TestCase): + """Test the database functions""" + def setUp(self): + self.connection = db.connect_db() + + def tearDown(self): + self.connection.close() + + def test_link_pages_and_fetch_links(self): + """sample test to check if link_pages works""" + with db.cursor(self.connection) as cursor: + source_url = "https://inspection.canada.ca/preventive-controls/sampling-procedures/eng/1518033335104/1528203403149" + destination_url = "https://inspection.canada.ca/animal-health/terrestrial-animals/exports/pets/australia/eng/1321292836314/1321292933011" + crawler.link_pages(cursor, source_url, destination_url) + links = crawler.fetch_links(cursor, source_url) + self.connection.rollback() + self.assertTrue(destination_url in links) + + def test_fetch_crawl_row_by_http_url(self): + """sample test to check if fetch_crawl_row works""" + with db.cursor(self.connection) as cursor: + row = crawler.fetch_crawl_row( + cursor, + "https://inspection.canada.ca/a-propos-de-l-acia/structure-organisationnelle/mandat/fra/1299780188624/1319164463699" + ) + self.connection.rollback() + self.assertEqual(row['url'], "https://inspection.canada.ca/a-propos-de-l-acia/structure-organisationnelle/mandat/fra/1299780188624/1319164463699") + self.assertEqual( + row['title'], + "Mandat - Agence canadienne d'inspection des aliments") + + def test_fetch_crawl_row_by_postgresql_url(self): + """sample test to check if fetch_crawl_row works""" + url = db.create_postgresql_url( + "DBNAME", + "crawl", + "8b25a4d3-bd83-412d-8cd8-0fd969f28efc") + with db.cursor(self.connection) as cursor: + row = crawler.fetch_crawl_row( + cursor, + url + ) + self.connection.rollback() + self.assertEqual(row['url'], "https://inspection.canada.ca/preventive-controls/sampling-procedures/eng/1518033335104/1528203403149") + self.assertEqual( + row['title'], + "Sampling procedures - Canadian Food Inspection Agency") + + def test_fetch_chunk_row(self): + """sample test to check if fetch_chunk_row works""" + url = db.create_postgresql_url( + "DBNAME", + "chunk", + "469812c5-190c-4e56-9f88-c8621592bcb5") + with db.cursor(self.connection) as cursor: + row = crawler.fetch_chunk_token_row(cursor, url) + 
self.connection.rollback() + self.assertTrue(isinstance(row, dict)) + self.assertEqual(len(row['tokens']), 76) + self.assertEqual(str(row['chunk_id']), "469812c5-190c-4e56-9f88-c8621592bcb5") + self.assertEqual(str(row['token_id']), 'dbb7b498-2cbf-4ae9-aa10-3169cc72f285') + + def test_fetch_chunk_id_without_embedding(self): + """sample test to check if fetch_chunk_id_without_embedding works""" + with db.cursor(self.connection) as cursor: + cursor.execute(test.embedding_table.format(embedding_model='test-model')) + rows = crawler.fetch_chunk_id_without_embedding(cursor, 'test-model') + _entity_id = rows[0] + self.connection.rollback() \ No newline at end of file diff --git a/tests/test_db_data.py b/tests/test_db_data.py new file mode 100644 index 0000000..7bc9f0e --- /dev/null +++ b/tests/test_db_data.py @@ -0,0 +1,54 @@ +import unittest + +import psycopg +from psycopg.rows import dict_row + +import testing_utils as test + +class TestDBData(unittest.TestCase): + + def execute(self, filename): + query = open(filename).read() + self.cursor.execute(query) + + def setUp(self): + self.connection = psycopg.connect(test.LOUIS_DSN) + self.cursor = self.connection.cursor(row_factory=dict_row) + self.cursor.execute("SET search_path TO louis_v004, public") + + def tearDown(self): + self.connection.rollback() + self.cursor.close() + self.connection.close() + + def upgrade_schema(self): + return + if test.LOUIS_SCHEMA == 'louis_v004': + self.execute('sql/2023-07-11-hotfix-xml-not-well-formed.sql') + self.execute('sql/2023-07-11-populate-link.sql') + self.execute('sql/2023-07-12-score-current.sql') + self.execute('sql/2023-07-19-modify-score_type-add-similarity.sql') + self.execute('sql/2023-07-19-modified-documents.sql') + self.execute('sql/2023-07-19-weighted_search.sql') + self.execute('sql/2023-07-21-default_chunk.sql') + + def test_well_formed_xml(self): + self.upgrade_schema() + # SELECT count(*) FROM crawl WHERE NOT xml_is_well_formed(html_content); + self.cursor.execute(""" + SELECT count(*) + FROM html_content + WHERE NOT xml_is_well_formed(content);""") + result = self.cursor.fetchall() + self.assertEqual(result[0]['count'], 0, "All xml should be well formed") + + def test_every_crawl_doc_should_have_at_least_one_chunk(self): + # self.execute('sql/2023-08-09-issue8-html_content-table.sql') + self.cursor.execute(""" + select count(*) + from crawl left join documents on crawl.id = documents.id + where documents.id is null""") + result = self.cursor.fetchall() + self.assertEqual( + result[0]['count'], 0, + "Every crawl doc should have at least one chunk") diff --git a/tests/test_db_layer.py b/tests/test_db_layer.py deleted file mode 100644 index a3664df..0000000 --- a/tests/test_db_layer.py +++ /dev/null @@ -1,107 +0,0 @@ -"""test database functions""" -import os -import unittest - -import louis.db as db - -embedding_table = """ -create table if not exists "{embedding_model}" ( - id uuid default uuid_generate_v4 (), - token_id uuid references token(id), - embedding vector(1536), - primary key(id), - unique(token_id) -); -""" - -class TestDBLayer(unittest.TestCase): - """Test the database functions""" - def setUp(self): - self.connection = db.connect_db() - - def tearDown(self): - self.connection.close() - - def test_schema(self): - """sample test to check if the schema is correct and idempotent""" - LOUIS_SCHEMA = os.environ.get('LOUIS_SCHEMA') - with open(f"dumps/{LOUIS_SCHEMA}/schema.sql", encoding='utf-8') as schema_file: - schema = schema_file.read() - schema = schema.replace(LOUIS_SCHEMA, 
-        with db.cursor(self.connection) as cursor:
-            cursor.execute(schema)
-        self.connection.rollback()
-
-    def test_link_pages_and_fetch_links(self):
-        """sample test to check if link_pages works"""
-        with db.cursor(self.connection) as cursor:
-            source_url = "https://inspection.canada.ca/splash"
-            destination_url = "https://inspection.canada.ca/animal-health/terrestrial-animals/exports/pets/australia/eng/1321292836314/1321292933011"
-            db.link_pages(cursor, source_url, destination_url)
-            links = db.fetch_links(cursor, "https://inspection.canada.ca/splash")
-        self.connection.rollback()
-        self.assertEqual(links, [destination_url])
-
-    def test_fetch_crawl_row(self):
-        """sample test to check if fetch_crawl_row works"""
-        with db.cursor(self.connection) as cursor:
-            row = db.fetch_crawl_row(
-                cursor,
-                "https://inspection.canada.ca/a-propos-de-l-acia/structure-organisationnelle/mandat/fra/1299780188624/1319164463699"
-            )
-        self.connection.rollback()
-        self.assertEqual(row['url'], "https://inspection.canada.ca/a-propos-de-l-acia/structure-organisationnelle/mandat/fra/1299780188624/1319164463699")
-        self.assertEqual(
-            row['title'],
-            "Mandat - Agence canadienne d'inspection des aliments")
-
-    def test_fetch_chunk_row(self):
-        """sample test to check if fetch_chunk_row works"""
-        with db.cursor(self.connection) as cursor:
-            row = db.fetch_chunk_token_row(
-                cursor,
-                "postgresql://inspection.canada.ca/public/chunk/469812c5-190c-4e56-9f88-c8621592bcb5")
-        self.connection.rollback()
-        self.assertEqual(len(row['tokens']), 76)
-        self.assertEqual(str(row['chunk_id']), "469812c5-190c-4e56-9f88-c8621592bcb5")
-        self.assertEqual(str(row['token_id']), 'dbb7b498-2cbf-4ae9-aa10-3169cc72f285')
-
-
-    def test_fetch_chunk_id_without_embedding(self):
-        """sample test to check if fetch_chunk_id_without_embedding works"""
-        with db.cursor(self.connection) as cursor:
-            cursor.execute(embedding_table.format(embedding_model='test-model'))
-            rows = db.fetch_chunk_id_without_embedding(cursor, 'test-model')
-            _entity_id = rows[0]
-        self.connection.rollback()
-
-    def test_create_postgresql_url(self):
-        """sample test to check if create_parse_postgresql_url works"""
-        entity_uuid = '5cef886d-8408-4868-9a69-0f0ca2167941'
-        url = db.create_postgresql_url(
-            "inspection.canada.ca",
-            "chunk", entity_uuid,
-            {'encoding': 'cl100k_base'})
-        self.assertEqual(url, f"postgresql://inspection.canada.ca/public/chunk/{entity_uuid}?encoding=cl100k_base")
-        parsed = db.parse_postgresql_url(url)
-        self.assertEqual(parsed['dbname'], "inspection.canada.ca")
-        self.assertEqual(parsed['tablename'], "chunk")
-        self.assertEqual(parsed['entity_uuid'], entity_uuid)
-        self.assertEqual(parsed['parameters']['encoding'][0], "cl100k_base")
-
-    def test_match_documents_text_query(self):
-        with db.cursor(self.connection) as cursor:
-            docs = db.match_documents_from_text_query(
-                cursor,
-                'what are the cooking temperatures for e.coli?')
-        self.connection.rollback()
-        self.assertEqual(len(docs), 10)
-
-    def test_president_of_cfia(self):
-        with db.cursor(self.connection) as cursor:
-            docs = db.match_documents_from_text_query(
-                cursor, 'who is the president of the CFIA?')
-        self.connection.rollback()
-        self.assertEqual(
-            docs[0]['title'],
-            'Dr. Harpreet S. Kochhar - Canadian Food Inspection Agency')
diff --git a/tests/test_db_schema.py b/tests/test_db_schema.py
new file mode 100644
index 0000000..e8ee7eb
--- /dev/null
+++ b/tests/test_db_schema.py
@@ -0,0 +1,24 @@
+"""test database functions"""
+import unittest
+
+import testing_utils as test
+import louis.db as db
+
+
+class TestDBSchema(unittest.TestCase):
+    """Test the database functions"""
+    def setUp(self):
+        self.connection = db.connect_db()
+
+    def tearDown(self):
+        self.connection.close()
+
+    def test_schema(self):
+        """sample test to check if the schema is correct and idempotent"""
+        schema_filename = f"dumps/{test.LOUIS_SCHEMA}/schema.sql"
+        with open(schema_filename, encoding='utf-8') as schema_file:
+            schema = schema_file.read()
+        schema = schema.replace(test.LOUIS_SCHEMA, 'test')
+        with db.cursor(self.connection) as cursor:
+            cursor.execute(schema)
+        self.connection.rollback()
\ No newline at end of file
diff --git a/tests/test_db_utils.py b/tests/test_db_utils.py
new file mode 100644
index 0000000..7194cea
--- /dev/null
+++ b/tests/test_db_utils.py
@@ -0,0 +1,28 @@
+"""test database functions"""
+import unittest
+
+import testing_utils as test
+import louis.db as db
+
+
+class TestDBUtils(unittest.TestCase):
+    """Test the database functions"""
+    def setUp(self):
+        self.connection = db.connect_db()
+
+    def tearDown(self):
+        self.connection.close()
+
+    def test_create_postgresql_url(self):
+        """sample test to check if create_parse_postgresql_url works"""
+        entity_uuid = '5cef886d-8408-4868-9a69-0f0ca2167941'
+        url = db.create_postgresql_url(
+            "inspection.canada.ca",
+            "chunk", entity_uuid,
+            {'encoding': 'cl100k_base'})
+        self.assertEqual(url, f"postgresql://inspection.canada.ca/{test.LOUIS_SCHEMA}/chunk/{entity_uuid}?encoding=cl100k_base")
+        parsed = db.parse_postgresql_url(url)
+        self.assertEqual(parsed['dbname'], "inspection.canada.ca")
+        self.assertEqual(parsed['tablename'], "chunk")
+        self.assertEqual(parsed['id'], entity_uuid)
+        self.assertEqual(parsed['parameters']['encoding'][0], "cl100k_base")
\ No newline at end of file
diff --git a/tests/testing_utils.py b/tests/testing_utils.py
new file mode 100644
index 0000000..9e560cb
--- /dev/null
+++ b/tests/testing_utils.py
@@ -0,0 +1,21 @@
+import os
+import dotenv
+dotenv.load_dotenv()
+
+def raise_error(message):
+    raise Exception(message)
+
+LOUIS_SCHEMA = os.getenv("LOUIS_SCHEMA") or raise_error("LOUIS_SCHEMA is not set")
+LOUIS_DSN = os.getenv("LOUIS_DSN") or raise_error("LOUIS_DSN is not set")
+MATCH_THRESHOLD = 0.5
+MATCH_COUNT = 10
+
+embedding_table = """
+create table if not exists "{embedding_model}" (
+    id uuid default uuid_generate_v4 (),
+    token_id uuid references token(id),
+    embedding vector(1536),
+    primary key(id),
+    unique(token_id)
+);
+"""