From 6f8203dd229cc76bbc022df213b7d7a3be2d6d65 Mon Sep 17 00:00:00 2001 From: Varik Matevosyan Date: Tue, 20 Feb 2024 16:41:40 +0400 Subject: [PATCH] PQ API improvements #288 - Add drop_quantization API that deletes all PQ-quantization resources associated with a table - Make sure all quantization resources and table hooks are deleted when the lantern extension is deleted - The current API pairs quantize_vector with decompress_vector; switch to consistent quantize/dequantize naming (decompress_vector becomes dequantize_vector) - Make quantize_table work with all kinds of table names (schema-qualified, quoted, mixed-case) - Allow quantizing a subset of a table (new dataset_size_limit argument) --- sql/lantern.sql | 58 +++++++++++---- src/hnsw.h | 1 - src/hnsw/pqtable.c | 12 +++ test/expected/ext_relocation.out | 5 +- test/expected/hnsw_pq.out | 123 +++++++++++++++++++++++++++---- test/sql/hnsw_pq.sql | 56 ++++++++++---- 6 files changed, 211 insertions(+), 44 deletions(-) diff --git a/sql/lantern.sql b/sql/lantern.sql index a6633003f..6755e11ef 100644 --- a/sql/lantern.sql +++ b/sql/lantern.sql @@ -59,7 +59,7 @@ CREATE FUNCTION _lantern_internal.failure_point_enable(func TEXT, name TEXT, don CREATE FUNCTION _lantern_internal.continue_blockmap_group_initialization(index regclass) RETURNS VOID AS 'MODULE_PATHNAME', 'lantern_internal_continue_blockmap_group_initialization' LANGUAGE C STABLE STRICT PARALLEL UNSAFE; -CREATE FUNCTION _lantern_internal.create_pq_codebook(REGCLASS, NAME, INT, INT, TEXT) RETURNS REAL[][][] +CREATE FUNCTION _lantern_internal.create_pq_codebook(REGCLASS, NAME, INT, INT, TEXT, INT) RETURNS REAL[][][] AS 'MODULE_PATHNAME', 'create_pq_codebook' LANGUAGE C STABLE STRICT PARALLEL UNSAFE; -- operator classes CREATE OR REPLACE FUNCTION _lantern_internal._create_ldb_operator_classes(access_method_name TEXT) RETURNS BOOLEAN AS $$ @@ -235,9 +235,11 @@ END; $$ LANGUAGE plpgsql; -CREATE OR REPLACE FUNCTION create_pq_codebook(tbl REGCLASS, col NAME, cluster_cnt INT, subvector_count INT, distance_metric TEXT) +CREATE OR REPLACE FUNCTION create_pq_codebook(p_tbl REGCLASS, p_col 
NAME, cluster_cnt INT, subvector_count INT, distance_metric TEXT, dataset_size_limit INT DEFAULT 0) RETURNS NAME AS $$ DECLARE + tbl NAME; + col NAME; stmt TEXT; res REAL[]; codebooks REAL[][][]; @@ -246,12 +248,14 @@ DECLARE codebook_table NAME; dim INT; BEGIN - + tbl := regexp_replace(trim(both '"' FROM p_tbl::TEXT), '^.*\.', ''); + col := trim(both '"' FROM p_col); + stmt := format('SELECT array_length(%I, 1) FROM %I WHERE %1$I IS NOT NULL LIMIT 1', col, tbl); EXECUTE stmt INTO dim; -- Get codebooks - codebooks := _lantern_internal.create_pq_codebook(tbl, col, cluster_cnt, subvector_count, distance_metric); + codebooks := _lantern_internal.create_pq_codebook(p_tbl, col, cluster_cnt, subvector_count, distance_metric, dataset_size_limit); -- Create codebook table codebook_table := format('_lantern_internal."_codebook_%s_%s"', tbl, col); @@ -338,8 +342,8 @@ BEGIN END; $$ LANGUAGE plpgsql; --- Decompress vector using codebook -CREATE OR REPLACE FUNCTION decompress_vector(v pqvec, codebook regclass) +-- Dequantize vector using codebook +CREATE OR REPLACE FUNCTION dequantize_vector(v pqvec, codebook regclass) RETURNS REAL[] AS $$ DECLARE res REAL[]; @@ -371,12 +375,14 @@ END; $$ LANGUAGE plpgsql; -- Quantize table -CREATE OR REPLACE FUNCTION quantize_table(tbl regclass, col NAME, cluster_cnt INT,subvector_count INT, distance_metric TEXT) +CREATE OR REPLACE FUNCTION quantize_table(p_tbl regclass, p_col NAME, cluster_cnt INT,subvector_count INT, distance_metric TEXT, dataset_size_limit INT DEFAULT 0) RETURNS VOID AS $$ DECLARE subvector REAL[]; id INT; stmt TEXT; + tbl NAME; + col NAME; pq_col_name NAME; codebook_table NAME; trigger_func_name NAME; @@ -385,16 +391,19 @@ DECLARE pg_version INT; column_exists BOOLEAN; BEGIN + tbl := regexp_replace(trim(both '"' FROM p_tbl::TEXT), '^.*\.', ''); + col := trim(both '"' FROM p_col); + pg_version := (SELECT setting FROM pg_settings WHERE name = 'server_version_num'); pq_col_name := format('%I_pq', col); - column_exists := 
(SELECT true FROM pg_attribute WHERE attrelid = tbl AND attname = pq_col_name AND NOT attisdropped); + column_exists := (SELECT true FROM pg_attribute WHERE attrelid = p_tbl AND attname = pq_col_name AND NOT attisdropped); IF column_exists THEN RAISE EXCEPTION 'Column % already exists in table', pq_col_name; END IF; -- Create codebook - codebook_table := create_pq_codebook(tbl, col, cluster_cnt, subvector_count, distance_metric); + codebook_table := create_pq_codebook(p_tbl, col, cluster_cnt, subvector_count, distance_metric, dataset_size_limit); -- Compress vectors RAISE INFO 'Compressing vectors...'; @@ -410,9 +419,9 @@ BEGIN EXECUTE stmt; -- Create trigger to update pq values based on vector value - trigger_func_name := format('_set_pq_col_%s', md5(tbl || col)); + trigger_func_name := format('"_lantern_internal"._set_pq_col_%s', md5(tbl || col)); stmt := format(' - CREATE OR REPLACE FUNCTION %I() + CREATE OR REPLACE FUNCTION %s() RETURNS trigger LANGUAGE plpgsql AS $body$ @@ -435,7 +444,7 @@ BEGIN stmt := format('DROP TRIGGER IF EXISTS %I ON %I', update_trigger_name, tbl); EXECUTE stmt; - stmt := format('CREATE TRIGGER %I BEFORE INSERT ON %I FOR EACH ROW WHEN (NEW.%I IS NOT NULL) EXECUTE FUNCTION %I()', + stmt := format('CREATE TRIGGER %I BEFORE INSERT ON %I FOR EACH ROW WHEN (NEW.%I IS NOT NULL) EXECUTE FUNCTION %s()', insert_trigger_name, tbl, col, @@ -444,7 +453,7 @@ BEGIN EXECUTE stmt; - stmt := format('CREATE TRIGGER %1$I BEFORE UPDATE OF %2$I ON %3$I FOR EACH ROW WHEN (NEW.%2$I IS NOT NULL) EXECUTE FUNCTION %4$I()', + stmt := format('CREATE TRIGGER %1$I BEFORE UPDATE OF %2$I ON %3$I FOR EACH ROW WHEN (NEW.%2$I IS NOT NULL) EXECUTE FUNCTION %4$s()', update_trigger_name, col, tbl, @@ -454,3 +463,26 @@ BEGIN END IF; END; $$ LANGUAGE plpgsql; +/* drop_quantization(p_tbl, p_col): removes the PQ resources that belong to column p_col of table p_tbl, namely the codebook table _lantern_internal."_codebook_<tbl>_<col>", the <col>_pq column of p_tbl and the generated trigger function in _lantern_internal. Every statement uses IF EXISTS, so objects that are already gone do not raise. Returns nothing. */ +CREATE FUNCTION drop_quantization(p_tbl regclass, p_col NAME) +RETURNS VOID AS $$ +DECLARE + tbl NAME; + col NAME; + pq_col_name NAME; + codebook_table NAME; + trigger_func_name NAME; +BEGIN + /* These names must be built exactly as create_pq_codebook and quantize_table build them, otherwise the IF EXISTS drops below silently match nothing. NOTE(review): the trigger function name is 64 characters long, one more than a NAME value holds, so it is presumably stored truncated both here and in quantize_table, which also declares it as NAME; keep the two declarations in sync (confirm). */ + tbl := 
regexp_replace(trim(both '"' FROM p_tbl::TEXT), '^.*\.', ''); + col := trim(both '"' FROM p_col); + codebook_table := format('_lantern_internal."_codebook_%s_%s"', tbl, col); + pq_col_name := format('%s_pq', col); + trigger_func_name := format('"_lantern_internal"._set_pq_col_%s', md5(tbl || col)); + EXECUTE format('DROP TABLE IF EXISTS %s CASCADE', codebook_table); + /* Address the table through the regclass argument: it prints schema-qualified and quoted as needed, so the column is dropped from exactly the table that was passed in rather than from whichever table the bare name resolves to on the search_path. NOTE(review): quantize_table derives this column name with format('%I_pq', col) while '%s_pq' is used here; they agree for plain lower-case names, verify for names that %I would quote. */ + EXECUTE format('ALTER TABLE %s DROP COLUMN IF EXISTS %I', p_tbl, pq_col_name); + /* CASCADE also removes the insert and update triggers that quantize_table attached to this function. */ + EXECUTE format('DROP FUNCTION IF EXISTS %s CASCADE', trigger_func_name); +END; +$$ LANGUAGE plpgsql; diff --git a/src/hnsw.h b/src/hnsw.h index 893267d6b..d1db4edc8 100644 --- a/src/hnsw.h +++ b/src/hnsw.h @@ -35,7 +35,6 @@ PGDLLEXPORT Datum cos_dist(PG_FUNCTION_ARGS); PGDLLEXPORT Datum cos_dist_with_guard(PG_FUNCTION_ARGS); PGDLLEXPORT Datum vector_cos_dist(PG_FUNCTION_ARGS); PGDLLEXPORT Datum lantern_reindex_external_index(PG_FUNCTION_ARGS); -PGDLLEXPORT Datum create_pq_codebook(PG_FUNCTION_ARGS); HnswColumnType GetColumnTypeFromOid(Oid oid); HnswColumnType GetIndexColumnType(Relation index); diff --git a/src/hnsw/pqtable.c b/src/hnsw/pqtable.c index 31101c38d..50456e779 100644 --- a/src/hnsw/pqtable.c +++ b/src/hnsw/pqtable.c @@ -41,6 +41,7 @@ Datum create_pq_codebook(PG_FUNCTION_ARGS) uint32 cluster_cnt = PG_GETARG_UINT32(2); uint32 subvector_cnt = PG_GETARG_UINT32(3); text *distance_metric_text = PG_GETARG_TEXT_P(4); + uint32 dataset_size_limit = PG_GETARG_UINT32(5); usearch_metric_kind_t distance_metric; // ----------------- // Dataset variables @@ -80,6 +81,10 @@ Datum create_pq_codebook(PG_FUNCTION_ARGS) elog(ERROR, "Cluster count can not be greater than %d", 1 << 8); } + if(dataset_size_limit > 0 && dataset_size_limit < cluster_cnt) { + elog(ERROR, "Dataset size limit should be greater or equal to cluster_cnt count"); + } + distance_metric = GetMetricKindFromStr(text_to_cstring(distance_metric_text)); table = relation_open(tablerelid, AccessShareLock); @@ -106,15 +111,22 @@ Datum 
create_pq_codebook(PG_FUNCTION_ARGS) dataset_dim = current_tuple_dim; // TODO:: this check can be removed as soon as we resolve return type issue from this function if(dataset_dim % subvector_cnt != 0) { + heap_endscan(scan); + relation_close(table, AccessShareLock); elog(ERROR, "Dataset dimensions should be divisible by subvector count"); } } else if(current_tuple_dim != dataset_dim) { heap_endscan(scan); + relation_close(table, AccessShareLock); elog(ERROR, "Table should have equally sized array: expected %d got %d", dataset_dim, current_tuple_dim); } dataset[ dataset_size++ ] = (float4 *)ARR_DATA_PTR(array); + if(dataset_size_limit > 0 && dataset_size == dataset_size_limit) { + break; + } + if(estimated_row_count == dataset_size - 1) { dataset = repalloc(dataset, estimated_row_count * 2 * sizeof(size_t)); estimated_row_count *= 2; diff --git a/test/expected/ext_relocation.out b/test/expected/ext_relocation.out index 72e89705d..1e9ba6e4d 100644 --- a/test/expected/ext_relocation.out +++ b/test/expected/ext_relocation.out @@ -44,7 +44,8 @@ ORDER BY 1, 3, 2; schema1 | validate_index | _lantern_internal schema1 | cos_dist | schema1 schema1 | create_pq_codebook | schema1 - schema1 | decompress_vector | schema1 + schema1 | dequantize_vector | schema1 + schema1 | drop_quantization | schema1 schema1 | hamming_dist | schema1 schema1 | hnsw_handler | schema1 schema1 | l2sq_dist | schema1 @@ -59,7 +60,7 @@ ORDER BY 1, 3, 2; schema1 | ldb_pqvec_send | schema1 schema1 | quantize_table | schema1 schema1 | quantize_vector | schema1 -(25 rows) +(26 rows) -- show all the extension operators SELECT ne.nspname AS extschema, op.oprname, np.nspname AS proschema diff --git a/test/expected/hnsw_pq.out b/test/expected/hnsw_pq.out index b3d549aea..ef11d5636 100644 --- a/test/expected/hnsw_pq.out +++ b/test/expected/hnsw_pq.out @@ -96,21 +96,23 @@ SELECT '{84,1,4,128,255}'::INT[]::pqvec; SELECT '{84,1,4,128,256}'::pqvec; ERROR: Compressed vector element can not be bigger than 255 at 
character 8 -- Verify wrong argument assertions -SELECT _lantern_internal.create_pq_codebook('sift_base1k'::regclass, 'nonexistant', 10, 32, 'l2sq'); +SELECT _lantern_internal.create_pq_codebook('sift_base1k'::regclass, 'nonexistant', 10, 32, 'l2sq', 0); ERROR: Column nonexistant not found in table -SELECT _lantern_internal.create_pq_codebook('sift_base1k'::regclass, 'v', 1001, 32, 'l2sq'); +SELECT _lantern_internal.create_pq_codebook('sift_base1k'::regclass, 'v', 1001, 32, 'l2sq', 0); ERROR: Cluster count can not be greater than 256 -SELECT _lantern_internal.create_pq_codebook('sift_base1k'::regclass, 'v', 10, 33, 'l2sq'); +SELECT _lantern_internal.create_pq_codebook('sift_base1k'::regclass, 'v', 10, 33, 'l2sq', 0); ERROR: Dataset dimensions should be divisible by subvector count -SELECT _lantern_internal.create_pq_codebook('sift_base1k'::regclass, 'v', 10, 32, 'l2sqz'); +SELECT _lantern_internal.create_pq_codebook('sift_base1k'::regclass, 'v', 10, 32, 'l2sqz', 0); ERROR: Unsupported metric kind: l2sqz . Should be one of (l2sq, cos, hamming) -SELECT _lantern_internal.create_pq_codebook('sift_base1k'::regclass, 'v', 257, 32, 'l2sq'); +SELECT _lantern_internal.create_pq_codebook('sift_base1k'::regclass, 'v', 257, 32, 'l2sq', 0); ERROR: Cluster count can not be greater than 256 -SELECT _lantern_internal.create_pq_codebook('sift_base1k'::regclass, 'v', 257, 0, 'l2sq'); +SELECT _lantern_internal.create_pq_codebook('sift_base1k'::regclass, 'v', 257, 0, 'l2sq', 0); +ERROR: Subvector count can not be zero +SELECT _lantern_internal.create_pq_codebook('sift_base1k'::regclass, 'v', 256, 0, 'l2sq', 10); ERROR: Subvector count can not be zero \set ON_ERROR_STOP on -- This should create codebook[1][1][128] -SELECT _lantern_internal.create_pq_codebook('sift_base1k'::regclass, 'v', 1, 1, 'l2sq') as codebook \gset +SELECT _lantern_internal.create_pq_codebook('sift_base1k'::regclass, 'v', 1, 1, 'l2sq', 0) as codebook \gset INFO: Table scanned. 
Dataset size 1000 INFO: Starting k-means over dataset with (subvectors=1, clusters=1) INFO: Codebooks created @@ -133,7 +135,7 @@ SELECT array_length(:'codebook'::REAL[][][], 3); (1 row) -- This should create codebook[1][10][128] -SELECT _lantern_internal.create_pq_codebook('sift_base1k'::regclass, 'v', 10, 1, 'l2sq') as codebook \gset +SELECT _lantern_internal.create_pq_codebook('sift_base1k'::regclass, 'v', 10, 1, 'l2sq', 0) as codebook \gset INFO: Table scanned. Dataset size 1000 INFO: Starting k-means over dataset with (subvectors=1, clusters=10) INFO: Codebooks created @@ -156,7 +158,7 @@ SELECT array_length(:'codebook'::REAL[][][], 3); (1 row) -- This should create codebook[32][10][4] -SELECT _lantern_internal.create_pq_codebook('sift_base1k'::regclass, 'v', 10, 32, 'l2sq') as codebook \gset +SELECT _lantern_internal.create_pq_codebook('sift_base1k'::regclass, 'v', 10, 32, 'l2sq', 0) as codebook \gset INFO: Table scanned. Dataset size 1000 INFO: Starting k-means over dataset with (subvectors=32, clusters=10) INFO: Codebooks created @@ -223,9 +225,9 @@ ERROR: Cannot modify readonly table. INSERT INTO _lantern_internal._codebook_sift_base1k_v (subvector_id, centroid_id, c) VALUES (1, 1, '{1,2,3,4}'); ERROR: Cannot modify readonly table. 
-- Validate that compressing invalid vector raises an error -SELECT decompress_vector('{}'::pqvec, '_lantern_internal._codebook_sift_base1k_v'::regclass); +SELECT dequantize_vector('{}'::pqvec, '_lantern_internal._codebook_sift_base1k_v'::regclass); ERROR: pqvector can not be empty at character 26 -SELECT decompress_vector('{1,2,3}'::pqvec, '_lantern_internal._codebook_sift_base1k_v'::regclass); +SELECT dequantize_vector('{1,2,3}'::pqvec, '_lantern_internal._codebook_sift_base1k_v'::regclass); ERROR: Codebook has 32 subvectors, but vector is quantized in 3 subvectors \set ON_ERROR_STOP on -- Compression and Decompression @@ -233,8 +235,8 @@ ERROR: Codebook has 32 subvectors, but vector is quantized in 3 subvectors SELECT v as v1 FROM sift_base1k WHERE id=1 \gset SELECT v_pq as v1_pq FROM sift_base1k WHERE id=1 \gset SELECT quantize_vector(:'v1', '_lantern_internal._codebook_sift_base1k_v'::regclass, 'l2sq') as compressed \gset -SELECT decompress_vector(:'v1_pq', '_lantern_internal._codebook_sift_base1k_v'::regclass) as decompressed_1 \gset -SELECT decompress_vector(:'compressed', '_lantern_internal._codebook_sift_base1k_v'::regclass) as decompressed_2 \gset +SELECT dequantize_vector(:'v1_pq', '_lantern_internal._codebook_sift_base1k_v'::regclass) as decompressed_1 \gset +SELECT dequantize_vector(:'compressed', '_lantern_internal._codebook_sift_base1k_v'::regclass) as decompressed_2 \gset SELECT l2sq_dist(:'decompressed_1', :'decompressed_2'); l2sq_dist ----------- @@ -243,7 +245,7 @@ SELECT l2sq_dist(:'decompressed_1', :'decompressed_2'); -- Test recall for quantized vs non quantized vectors ALTER TABLE sift_base1k ADD COLUMN v_pq_dec REAL[]; -UPDATE sift_base1k SET v_pq_dec=decompress_vector(v_pq, '_lantern_internal._codebook_sift_base1k_v'); +UPDATE sift_base1k SET v_pq_dec=dequantize_vector(v_pq, '_lantern_internal._codebook_sift_base1k_v'); -- Calculate recall over original vector SELECT (calculate_table_recall('sift_base1k', 'sift_query1k', 'sift_truth1k', 
'v', 10, 100) - calculate_table_recall('sift_base1k', 'sift_query1k', 'sift_truth1k', 'v_pq_dec', 10, 100)) as recall_diff \gset @@ -317,3 +319,96 @@ SELECT array_length(c, 1) FROM _lantern_internal._codebook_sift_base1k_v_pq_dec 4 (1 row) +-- Test that resources are being cleared correctly +SELECT drop_quantization('sift_base1k'::regclass, 'v'); + drop_quantization +------------------- + +(1 row) + +SELECT drop_quantization('sift_base1k'::regclass, 'v_pq_dec'); + drop_quantization +------------------- + +(1 row) + +SELECT column_name FROM information_schema.columns WHERE table_schema = 'public' AND table_name = 'sift_base1k'; + column_name +------------- + id + v + v_pq_dec +(3 rows) + +SELECT table_name FROM information_schema.tables WHERE table_schema = '_lantern_internal'; + table_name +------------ +(0 rows) + +-- Test quantization over subset of data +SELECT quantize_table('sift_base1k'::regclass, 'v', 10, 32, 'l2sq', 500); +INFO: Table scanned. Dataset size 500 +INFO: Starting k-means over dataset with (subvectors=32, clusters=10) +INFO: Codebooks created +INFO: Compressing vectors... + quantize_table +---------------- + +(1 row) + +SELECT COUNT(DISTINCT subvector_id) FROM _lantern_internal._codebook_sift_base1k_v; + count +------- + 32 +(1 row) + +SELECT COUNT(DISTINCT centroid_id) FROM _lantern_internal._codebook_sift_base1k_v; + count +------- + 10 +(1 row) + +SELECT COUNT(*) FROM _lantern_internal._codebook_sift_base1k_v; + count +------- + 320 +(1 row) + +SELECT array_length(c, 1) FROM _lantern_internal._codebook_sift_base1k_v LIMIT 1; + array_length +-------------- + 4 +(1 row) + +-- Test quantization with mixed case and schema qualified table name +SELECT id, v AS "v_New" into "sift_Base1k_NEW" FROM sift_base1k; +SELECT quantize_table('"public"."sift_Base1k_NEW"'::regclass, 'v_New', 10, 32, 'l2sq'); +INFO: Table scanned. 
Dataset size 1001 +INFO: Starting k-means over dataset with (subvectors=32, clusters=10) +INFO: Codebooks created +INFO: Compressing vectors... + quantize_table +---------------- + +(1 row) + +SELECT array_length( + dequantize_vector( + quantize_vector( + (SELECT "v_New" FROM "sift_Base1k_NEW" WHERE id=1), + '_lantern_internal."_codebook_sift_Base1k_NEW_v_New"'::regclass, + 'l2sq'), + '_lantern_internal."_codebook_sift_Base1k_NEW_v_New"'::regclass), + 1 + ); + array_length +-------------- + 128 +(1 row) + +SELECT drop_quantization('"sift_Base1k_NEW"'::regclass, 'v_New'); + drop_quantization +------------------- + +(1 row) + diff --git a/test/sql/hnsw_pq.sql b/test/sql/hnsw_pq.sql index 9813da73b..8133021b1 100644 --- a/test/sql/hnsw_pq.sql +++ b/test/sql/hnsw_pq.sql @@ -13,28 +13,29 @@ SELECT '{84,1,4,128,255}'::INT[]::pqvec; -- Test PQVec type SELECT '{84,1,4,128,256}'::pqvec; -- Verify wrong argument assertions -SELECT _lantern_internal.create_pq_codebook('sift_base1k'::regclass, 'nonexistant', 10, 32, 'l2sq'); -SELECT _lantern_internal.create_pq_codebook('sift_base1k'::regclass, 'v', 1001, 32, 'l2sq'); -SELECT _lantern_internal.create_pq_codebook('sift_base1k'::regclass, 'v', 10, 33, 'l2sq'); -SELECT _lantern_internal.create_pq_codebook('sift_base1k'::regclass, 'v', 10, 32, 'l2sqz'); -SELECT _lantern_internal.create_pq_codebook('sift_base1k'::regclass, 'v', 257, 32, 'l2sq'); -SELECT _lantern_internal.create_pq_codebook('sift_base1k'::regclass, 'v', 257, 0, 'l2sq'); +SELECT _lantern_internal.create_pq_codebook('sift_base1k'::regclass, 'nonexistant', 10, 32, 'l2sq', 0); +SELECT _lantern_internal.create_pq_codebook('sift_base1k'::regclass, 'v', 1001, 32, 'l2sq', 0); +SELECT _lantern_internal.create_pq_codebook('sift_base1k'::regclass, 'v', 10, 33, 'l2sq', 0); +SELECT _lantern_internal.create_pq_codebook('sift_base1k'::regclass, 'v', 10, 32, 'l2sqz', 0); +SELECT _lantern_internal.create_pq_codebook('sift_base1k'::regclass, 'v', 257, 32, 'l2sq', 0); +SELECT 
_lantern_internal.create_pq_codebook('sift_base1k'::regclass, 'v', 257, 0, 'l2sq', 0); +SELECT _lantern_internal.create_pq_codebook('sift_base1k'::regclass, 'v', 256, 0, 'l2sq', 10); \set ON_ERROR_STOP on -- This should create codebook[1][1][128] -SELECT _lantern_internal.create_pq_codebook('sift_base1k'::regclass, 'v', 1, 1, 'l2sq') as codebook \gset +SELECT _lantern_internal.create_pq_codebook('sift_base1k'::regclass, 'v', 1, 1, 'l2sq', 0) as codebook \gset SELECT array_length(:'codebook'::REAL[][][], 1); SELECT array_length(:'codebook'::REAL[][][], 2); SELECT array_length(:'codebook'::REAL[][][], 3); -- This should create codebook[1][10][128] -SELECT _lantern_internal.create_pq_codebook('sift_base1k'::regclass, 'v', 10, 1, 'l2sq') as codebook \gset +SELECT _lantern_internal.create_pq_codebook('sift_base1k'::regclass, 'v', 10, 1, 'l2sq', 0) as codebook \gset SELECT array_length(:'codebook'::REAL[][][], 1); SELECT array_length(:'codebook'::REAL[][][], 2); SELECT array_length(:'codebook'::REAL[][][], 3); -- This should create codebook[32][10][4] -SELECT _lantern_internal.create_pq_codebook('sift_base1k'::regclass, 'v', 10, 32, 'l2sq') as codebook \gset +SELECT _lantern_internal.create_pq_codebook('sift_base1k'::regclass, 'v', 10, 32, 'l2sq', 0) as codebook \gset SELECT array_length(:'codebook'::REAL[][][], 1); SELECT array_length(:'codebook'::REAL[][][], 2); SELECT array_length(:'codebook'::REAL[][][], 3); @@ -55,8 +56,8 @@ UPDATE _lantern_internal._codebook_sift_base1k_v SET centroid_id=2 WHERE centroi INSERT INTO _lantern_internal._codebook_sift_base1k_v (subvector_id, centroid_id, c) VALUES (1, 1, '{1,2,3,4}'); -- Validate that compressing invalid vector raises an error -SELECT decompress_vector('{}'::pqvec, '_lantern_internal._codebook_sift_base1k_v'::regclass); -SELECT decompress_vector('{1,2,3}'::pqvec, '_lantern_internal._codebook_sift_base1k_v'::regclass); +SELECT dequantize_vector('{}'::pqvec, '_lantern_internal._codebook_sift_base1k_v'::regclass); +SELECT 
dequantize_vector('{1,2,3}'::pqvec, '_lantern_internal._codebook_sift_base1k_v'::regclass); \set ON_ERROR_STOP on -- Compression and Decompression @@ -64,13 +65,13 @@ SELECT decompress_vector('{1,2,3}'::pqvec, '_lantern_internal._codebook_sift_bas SELECT v as v1 FROM sift_base1k WHERE id=1 \gset SELECT v_pq as v1_pq FROM sift_base1k WHERE id=1 \gset SELECT quantize_vector(:'v1', '_lantern_internal._codebook_sift_base1k_v'::regclass, 'l2sq') as compressed \gset -SELECT decompress_vector(:'v1_pq', '_lantern_internal._codebook_sift_base1k_v'::regclass) as decompressed_1 \gset -SELECT decompress_vector(:'compressed', '_lantern_internal._codebook_sift_base1k_v'::regclass) as decompressed_2 \gset +SELECT dequantize_vector(:'v1_pq', '_lantern_internal._codebook_sift_base1k_v'::regclass) as decompressed_1 \gset +SELECT dequantize_vector(:'compressed', '_lantern_internal._codebook_sift_base1k_v'::regclass) as decompressed_2 \gset SELECT l2sq_dist(:'decompressed_1', :'decompressed_2'); -- Test recall for quantized vs non quantized vectors ALTER TABLE sift_base1k ADD COLUMN v_pq_dec REAL[]; -UPDATE sift_base1k SET v_pq_dec=decompress_vector(v_pq, '_lantern_internal._codebook_sift_base1k_v'); +UPDATE sift_base1k SET v_pq_dec=dequantize_vector(v_pq, '_lantern_internal._codebook_sift_base1k_v'); -- Calculate recall over original vector SELECT (calculate_table_recall('sift_base1k', 'sift_query1k', 'sift_truth1k', 'v', 10, 100) - calculate_table_recall('sift_base1k', 'sift_query1k', 'sift_truth1k', 'v_pq_dec', 10, 100)) as recall_diff \gset @@ -95,3 +96,30 @@ SELECT COUNT(DISTINCT subvector_id) FROM _lantern_internal._codebook_sift_base1k SELECT COUNT(DISTINCT centroid_id) FROM _lantern_internal._codebook_sift_base1k_v_pq_dec; SELECT COUNT(*) FROM _lantern_internal._codebook_sift_base1k_v_pq_dec; SELECT array_length(c, 1) FROM _lantern_internal._codebook_sift_base1k_v_pq_dec LIMIT 1; + +-- Test that resources are being cleared correctly +SELECT 
drop_quantization('sift_base1k'::regclass, 'v'); +SELECT drop_quantization('sift_base1k'::regclass, 'v_pq_dec'); +SELECT column_name FROM information_schema.columns WHERE table_schema = 'public' AND table_name = 'sift_base1k'; +SELECT table_name FROM information_schema.tables WHERE table_schema = '_lantern_internal'; + +-- Test quantization over subset of data +SELECT quantize_table('sift_base1k'::regclass, 'v', 10, 32, 'l2sq', 500); +SELECT COUNT(DISTINCT subvector_id) FROM _lantern_internal._codebook_sift_base1k_v; +SELECT COUNT(DISTINCT centroid_id) FROM _lantern_internal._codebook_sift_base1k_v; +SELECT COUNT(*) FROM _lantern_internal._codebook_sift_base1k_v; +SELECT array_length(c, 1) FROM _lantern_internal._codebook_sift_base1k_v LIMIT 1; + +-- Test quantization with mixed case and schema qualified table name +SELECT id, v AS "v_New" into "sift_Base1k_NEW" FROM sift_base1k; +SELECT quantize_table('"public"."sift_Base1k_NEW"'::regclass, 'v_New', 10, 32, 'l2sq'); +SELECT array_length( + dequantize_vector( + quantize_vector( + (SELECT "v_New" FROM "sift_Base1k_NEW" WHERE id=1), + '_lantern_internal."_codebook_sift_Base1k_NEW_v_New"'::regclass, + 'l2sq'), + '_lantern_internal."_codebook_sift_Base1k_NEW_v_New"'::regclass), + 1 + ); +SELECT drop_quantization('"sift_Base1k_NEW"'::regclass, 'v_New');