From 0f36bc2112125da2c2b858455ecc6233ad2ce24e Mon Sep 17 00:00:00 2001 From: Paris Morgan Date: Tue, 15 Oct 2024 16:20:59 -0700 Subject: [PATCH] fix bug with incorrect partitions on second train --- apis/python/test/test_index.py | 3 - src/include/detail/ivf/index.h | 4 +- src/include/index/ivf_pq_index.h | 55 +++---- src/include/test/unit_api_ivf_pq_index.cc | 3 +- src/include/test/unit_ivf_pq_index.cc | 177 +++++++++++----------- src/include/test/unit_tdb_io.cc | 25 --- 6 files changed, 107 insertions(+), 160 deletions(-) diff --git a/apis/python/test/test_index.py b/apis/python/test/test_index.py index fc88f6e2c..87e242e31 100644 --- a/apis/python/test/test_index.py +++ b/apis/python/test/test_index.py @@ -35,7 +35,6 @@ def check_default_metadata( uri, expected_vector_type, expected_storage_version, expected_index_type ): group = tiledb.Group(uri, "r", ctx=tiledb.Ctx(None)) - print("[check_default_metadata] group.meta", group.meta) assert "dataset_type" in group.meta assert group.meta["dataset_type"] == DATASET_TYPE assert type(group.meta["dataset_type"]) == str @@ -306,7 +305,6 @@ def test_ivf_pq_index(tmp_path): uri = os.path.join(tmp_path, "array") if os.path.exists(uri): os.rmdir(uri) - vector_type = np.float32 index = ivf_pq_index.create( @@ -342,7 +340,6 @@ def test_ivf_pq_index(tmp_path): vectors=update_vectors, external_ids=np.array([0, 1, 2, 3, 4], dtype=np.dtype(np.uint32)), ) - query_and_check_distances( index, np.array([[2, 2, 2]], dtype=np.float32), 2, [[0, 3]], [[2, 1]] ) diff --git a/src/include/detail/ivf/index.h b/src/include/detail/ivf/index.h index 81388dab2..c3719fba4 100644 --- a/src/include/detail/ivf/index.h +++ b/src/include/detail/ivf/index.h @@ -133,7 +133,7 @@ int ivf_pq_index( // Find the centroid that is closest to each input vector. auto parts = detail::flat::qv_partition(centroids, input_vectors, nthreads); { - scoped_timer _{"shuffling data"}; + scoped_timer _{"index@ivf_pq_index@shuffling_data"}; std::unordered_set deleted_ids_set( deleted_ids.begin(), deleted_ids.end()); auto indices = compute_indices< @@ -274,7 +274,7 @@ int ivf_index( // Find the centroid that is closest to each input vector. auto parts = detail::flat::qv_partition(centroids, input_vectors, nthreads); { - scoped_timer _{"shuffling data"}; + scoped_timer _{"index@ivf_index@shuffling_data"}; std::unordered_set deleted_ids_set( deleted_ids.begin(), deleted_ids.end()); diff --git a/src/include/index/ivf_pq_index.h b/src/include/index/ivf_pq_index.h index 3c75a7a8d..9f6e36858 100644 --- a/src/include/index/ivf_pq_index.h +++ b/src/include/index/ivf_pq_index.h @@ -805,7 +805,19 @@ class ivf_pq_index { num_clusters_, num_subspaces_); - write_group.set_num_partitions(num_partitions_); + if (write_group.get_all_ingestion_timestamps().size() == 1 && + write_group.get_previous_ingestion_timestamp() == 0 && + write_group.get_all_base_sizes().size() == 1 && + write_group.get_previous_base_size() == 0) { + write_group.set_ingestion_timestamp(temporal_policy_.timestamp_end()); + write_group.set_base_size(0); + write_group.set_num_partitions(num_partitions_); + } else { + write_group.append_ingestion_timestamp(temporal_policy_.timestamp_end()); + write_group.append_base_size(0); + write_group.append_num_partitions(num_partitions_); + } + write_group.store_metadata(); // 4. Write the centroids. @@ -945,7 +957,7 @@ class ivf_pq_index { for (size_t partition_id = 0; partition_id < partitions; ++partition_id) { auto slice = std::make_pair( static_cast(prev_index), - static_cast(partial_indexes[i] - 1)); + static_cast(partial_indexes[i]) - 1); if (slice.first <= slice.second && slice.first != std::numeric_limits::max()) { partition_slices[partition_id].push_back(slice); @@ -1052,19 +1064,7 @@ class ivf_pq_index { num_clusters_, num_subspaces_); - if (write_group.get_all_ingestion_timestamps().size() == 1 && - write_group.get_previous_ingestion_timestamp() == 0 && - write_group.get_all_base_sizes().size() == 1 && - write_group.get_previous_base_size() == 0) { - write_group.set_ingestion_timestamp(temporal_policy_.timestamp_end()); - write_group.set_base_size(write_group.get_temp_size()); - write_group.set_num_partitions(num_partitions_); - } else { - write_group.append_ingestion_timestamp(temporal_policy_.timestamp_end()); - write_group.append_base_size(write_group.get_temp_size()); - write_group.append_num_partitions(num_partitions_); - } - + write_group.set_base_size(write_group.get_temp_size()); write_group.store_metadata(); } @@ -1099,19 +1099,7 @@ class ivf_pq_index { dimensions_, num_clusters_, num_subspaces_); - if (write_group.get_all_ingestion_timestamps().size() == 1 && - write_group.get_previous_ingestion_timestamp() == 0 && - write_group.get_all_base_sizes().size() == 1 && - write_group.get_previous_base_size() == 0) { - write_group.set_ingestion_timestamp(temporal_policy_.timestamp_end()); - write_group.set_base_size(::num_vectors(vectors)); - write_group.set_num_partitions(num_partitions_); - } else { - write_group.append_ingestion_timestamp(temporal_policy_.timestamp_end()); - write_group.append_base_size(::num_vectors(vectors)); - write_group.append_num_partitions(num_partitions_); - } - + write_group.set_base_size(::num_vectors(vectors)); write_group.store_metadata(); } @@ -1360,7 +1348,6 @@ class ivf_pq_index { auto&& [active_partitions, active_queries] = detail::ivf::partition_ivf_flat_index( flat_ivf_centroids_, query_vectors, nprobe, num_threads_); - auto query_to_pq_centroid_distance_tables = std::move(*generate_query_to_pq_centroid_distance_tables< Q, @@ -1379,6 +1366,7 @@ class ivf_pq_index { make_pq_distance_query_to_pq_centroid_distance_tables< std::span, decltype(pq_storage_type{}[0])>()); + return rerank( std::move(initial_distances), std::move(initial_ids), @@ -1421,15 +1409,6 @@ class ivf_pq_index { } } - auto all_feature_vectors = - tdbColMajorMatrixWithIds( - group_->cached_ctx(), - group_->feature_vectors_uri(), - group_->ids_uri(), - 100, - temporal_policy_); - all_feature_vectors.load(); - auto feature_vectors = tdbColMajorMatrixMultiRange( group_->cached_ctx(), diff --git a/src/include/test/unit_api_ivf_pq_index.cc b/src/include/test/unit_api_ivf_pq_index.cc index aa7a88c80..1478e566d 100644 --- a/src/include/test/unit_api_ivf_pq_index.cc +++ b/src/include/test/unit_api_ivf_pq_index.cc @@ -823,6 +823,7 @@ TEST_CASE("write and load index with timestamps", "[api_ivf_pq_index]") { // We then load the trained index and don't set a timestamp (which means // we'll load it at timestamp 99). auto index = IndexIVFPQ(ctx, index_uri); + // Check that we can do finite and infinite queries and then train + write // the index. { @@ -879,7 +880,6 @@ TEST_CASE("write and load index with timestamps", "[api_ivf_pq_index]") { {{11, 11, 11}, {22, 22, 22}, {33, 33, 33}, {44, 44, 44}, {55, 55, 55}}}; auto&& [scores, ids] = index.query(FeatureVectorArray(queries), top_k, nprobe); - check_single_vector_equals( scores, ids, {0, 0, 0, 0, 0}, {11, 22, 33, 44, 55}); @@ -976,7 +976,6 @@ TEST_CASE("write and load index with timestamps", "[api_ivf_pq_index]") { all_ingestion_timestamps.end(), std::vector{99, 100}.begin())); } - // Load it at timestamp 5 (before ingestion) and make sure we can query and be // returned fill values. for (auto upper_bound : std::vector{0, 4}) { diff --git a/src/include/test/unit_ivf_pq_index.cc b/src/include/test/unit_ivf_pq_index.cc index 3cbc72454..eeed52bd5 100644 --- a/src/include/test/unit_ivf_pq_index.cc +++ b/src/include/test/unit_ivf_pq_index.cc @@ -298,100 +298,100 @@ TEST_CASE( // #if 0 TEMPLATE_TEST_CASE( - "query stacked hypercube", - "[flativf_index]", - float, - uint8_t) { -size_t k_dist = GENERATE(0, 32); -size_t k_near = k_dist; -size_t k_far = k_dist; + "query stacked hypercube", + "[flativf_index]", + float, + uint8_t) { + size_t k_dist = GENERATE(0, 32); + size_t k_near = k_dist; + size_t k_far = k_dist; -auto hypercube0 = build_hypercube(k_near, k_far, 0xdeadbeef); -auto hypercube1 = build_hypercube(k_near, k_far, 0xbeefdead); + auto hypercube0 = build_hypercube(k_near, k_far, 0xdeadbeef); + auto hypercube1 = build_hypercube(k_near, k_far, 0xbeefdead); -auto hypercube2 = ColMajorMatrix(6, num_vectors(hypercube0)); -auto hypercube4 = ColMajorMatrix(12, num_vectors(hypercube0)); + auto hypercube2 = ColMajorMatrix(6, num_vectors(hypercube0)); + auto hypercube4 = ColMajorMatrix(12, num_vectors(hypercube0)); -std::vector ids(num_vectors(hypercube0)); -std::iota(begin(ids), end(ids), 0); + std::vector ids(num_vectors(hypercube0)); + std::iota(begin(ids), end(ids), 0); -for (size_t j = 0; j < 3; ++j) { - for (size_t i = 0; i < num_vectors(hypercube4); ++i) { - hypercube2(j, i) = hypercube0(j, i); - hypercube2(j + 3, i) = hypercube1(j, i); + for (size_t j = 0; j < 3; ++j) { + for (size_t i = 0; i < num_vectors(hypercube4); ++i) { + hypercube2(j, i) = hypercube0(j, i); + hypercube2(j + 3, i) = hypercube1(j, i); - hypercube4(j, i) = hypercube0(j, i); - hypercube4(j + 3, i) = hypercube1(j, i); - hypercube4(j + 6, i) = hypercube0(j, i); - hypercube4(j + 9, i) = hypercube1(j, i); + hypercube4(j, i) = hypercube0(j, i); + hypercube4(j + 3, i) = hypercube1(j, i); + hypercube4(j + 6, i) = hypercube0(j, i); + hypercube4(j + 9, i) = hypercube1(j, i); + } } -} -SECTION("partitions = 1") { - size_t k_nn = 6; - size_t partitions = 1; + SECTION("partitions = 1") { + size_t k_nn = 6; + size_t partitions = 1; + + auto ivf_idx2 = ivf_pq_index( + /*128,*/ partitions, 2, 4, 1.e-4); + ivf_idx2.train_ivf(hypercube2); + ivf_idx2.add(hypercube2, ids); + auto ivf_idx4 = ivf_pq_index( + /*128,*/ partitions, 2, 4, 1.e-4); + ivf_idx4.train_ivf(hypercube4); + ivf_idx4.add(hypercube4, ids); + + auto top_k_ivf_scores = ColMajorMatrix(); + auto top_k_ivf = ColMajorMatrix(); + auto top_k_scores = ColMajorMatrix(); + auto top_k = ColMajorMatrix(); + auto query2 = ColMajorMatrix(); + auto query4 = ColMajorMatrix(); + + SECTION("query2/4 = 0...") { + query2 = ColMajorMatrix{{0, 0, 0, 0, 0, 0}}; + query4 = ColMajorMatrix{{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}}; + } + SECTION("query2/4 = 127...") { + query2 = ColMajorMatrix{{127, 127, 127, 127, 127, 127}}; + query4 = ColMajorMatrix{ + {127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127}}; + } + SECTION("query2/4 = 0...") { + query2 = ColMajorMatrix{{0, 0, 0, 127, 127, 127}}; + query4 = ColMajorMatrix{ + {0, 0, 0, 0, 0, 0, 127, 127, 127, 127, 127, 127}}; + } + SECTION("query2/4 = 127...") { + query2 = ColMajorMatrix{{127, 127, 127, 0, 0, 0}}; + query4 = ColMajorMatrix{ + {127, 127, 127, 127, 127, 127, 0, 0, 0, 0, 0, 0}}; + } + SECTION("query2/4 = 127...") { + query2 = ColMajorMatrix{ + {127, 0, 127, 0, 127, 0}, {0, 127, 0, 127, 0, 127}}; + query4 = ColMajorMatrix{ + {127, 0, 127, 0, 127, 0, 127, 0, 127, 0, 127, 0}, + {0, 127, 0, 127, 0, 127, 0, 127, 0, 127, 0, 127}}; + } - auto ivf_idx2 = ivf_pq_index( - /*128,*/ partitions, 2, 4, 1.e-4); - ivf_idx2.train_ivf(hypercube2); - ivf_idx2.add(hypercube2, ids); - auto ivf_idx4 = ivf_pq_index( - /*128,*/ partitions, 2, 4, 1.e-4); - ivf_idx4.train_ivf(hypercube4); - ivf_idx4.add(hypercube4, ids); - - auto top_k_ivf_scores = ColMajorMatrix(); - auto top_k_ivf = ColMajorMatrix(); - auto top_k_scores = ColMajorMatrix(); - auto top_k = ColMajorMatrix(); - auto query2 = ColMajorMatrix(); - auto query4 = ColMajorMatrix(); - - SECTION("query2/4 = 0...") { - query2 = ColMajorMatrix{{0, 0, 0, 0, 0, 0}}; - query4 = ColMajorMatrix{{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}}; - } - SECTION("query2/4 = 127...") { - query2 = ColMajorMatrix{{127, 127, 127, 127, 127, 127}}; - query4 = ColMajorMatrix{ - {127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127}}; - } - SECTION("query2/4 = 0...") { - query2 = ColMajorMatrix{{0, 0, 0, 127, 127, 127}}; - query4 = ColMajorMatrix{ - {0, 0, 0, 0, 0, 0, 127, 127, 127, 127, 127, 127}}; - } - SECTION("query2/4 = 127...") { - query2 = ColMajorMatrix{{127, 127, 127, 0, 0, 0}}; - query4 = ColMajorMatrix{ - {127, 127, 127, 127, 127, 127, 0, 0, 0, 0, 0, 0}}; - } - SECTION("query2/4 = 127...") { - query2 = ColMajorMatrix{ - {127, 0, 127, 0, 127, 0}, {0, 127, 0, 127, 0, 127}}; - query4 = ColMajorMatrix{ - {127, 0, 127, 0, 127, 0, 127, 0, 127, 0, 127, 0}, - {0, 127, 0, 127, 0, 127, 0, 127, 0, 127, 0, 127}}; - } - - std::tie(top_k_scores, top_k) = detail::flat::qv_query_heap( - hypercube2, query2, k_nn, 1, sum_of_squares_distance{}); - std::tie(top_k_ivf_scores, top_k_ivf) = - ivf_idx2.query_infinite_ram(query2, k_nn, 1); // k, nprobe - size_t intersections0 = count_intersections(top_k_ivf, top_k, k_nn); - double recall0 = intersections0 / ((double)top_k.num_cols() * k_nn); - CHECK(intersections0 == k_nn * num_vectors(query2)); - CHECK(recall0 == 1.0); - - std::tie(top_k_scores, top_k) = detail::flat::qv_query_heap( - hypercube4, query4, k_nn, 1, sum_of_squares_distance{}); - std::tie(top_k_ivf_scores, top_k_ivf) = - ivf_idx4.query_infinite_ram(query4, k_nn, 1); // k, nprobe - - size_t intersections1 = (long)count_intersections(top_k_ivf, top_k, k_nn); - double recall1 = intersections1 / ((double)top_k.num_cols() * k_nn); - CHECK(intersections1 == k_nn * num_vectors(query4)); - CHECK(recall1 == 1.0); -} + std::tie(top_k_scores, top_k) = detail::flat::qv_query_heap( + hypercube2, query2, k_nn, 1, sum_of_squares_distance{}); + std::tie(top_k_ivf_scores, top_k_ivf) = + ivf_idx2.query_infinite_ram(query2, k_nn, 1); // k, nprobe + size_t intersections0 = count_intersections(top_k_ivf, top_k, k_nn); + double recall0 = intersections0 / ((double)top_k.num_cols() * k_nn); + CHECK(intersections0 == k_nn * num_vectors(query2)); + CHECK(recall0 == 1.0); + + std::tie(top_k_scores, top_k) = detail::flat::qv_query_heap( + hypercube4, query4, k_nn, 1, sum_of_squares_distance{}); + std::tie(top_k_ivf_scores, top_k_ivf) = + ivf_idx4.query_infinite_ram(query4, k_nn, 1); // k, nprobe + + size_t intersections1 = (long)count_intersections(top_k_ivf, top_k, k_nn); + double recall1 = intersections1 / ((double)top_k.num_cols() * k_nn); + CHECK(intersections1 == k_nn * num_vectors(query4)); + CHECK(recall1 == 1.0); + } } #endif @@ -485,8 +485,6 @@ TEST_CASE("query empty index", "[ivf_pq_index]") { ctx, index_uri, dimensions, partitions, dimensions / 2); ivf_pq_index index( ctx, index_uri); - // auto index = ivf_pq_index( - // partitions, dimensions / 2); auto queries = ColMajorMatrix{{{1, 1, 1, 1, 1, 1, 1, 1, 1, 1}}}; @@ -496,7 +494,6 @@ TEST_CASE("query empty index", "[ivf_pq_index]") { ColMajorMatrixWithIds(dimensions, num_vectors); index.train(data); index.ingest(data, data.raveled_ids()); - // CHECK(index.num_vectors() == num_vectors); } // We can query an empty index. { diff --git a/src/include/test/unit_tdb_io.cc b/src/include/test/unit_tdb_io.cc index 342a31ded..c6dcafa36 100644 --- a/src/include/test/unit_tdb_io.cc +++ b/src/include/test/unit_tdb_io.cc @@ -276,31 +276,6 @@ TEST_CASE("create group", "[tdb_io]") { read_group.close(); } -TEMPLATE_TEST_CASE( - "types", "[tdb_io]", int, float, uint8_t, uint32_t, uint64_t) { - size_t N = 10; - - tiledb::Context ctx; - tiledb::Config cfg; - std::string uri = - (std::filesystem::temp_directory_path() / "tmp_group").string(); - tiledb::VFS vfs(ctx); - if (vfs.is_dir(uri)) { - vfs.remove_dir(uri); - } - - static const int32_t default_domain{std::numeric_limits::max() - 1}; - static const int32_t default_tile_extent{100'000}; - static const int32_t tile_size_bytes{64 * 1024 * 1024}; - tiledb_filter_type_t default_compression = TILEDB_FILTER_ZSTD; - create_empty_for_vector( - ctx, uri, default_domain, default_tile_extent, default_compression); - - auto vector = std::vector(N); - std::iota(begin(vector), end(vector), 17); - write_vector(ctx, vector, uri, 0, false); -} - TEST_CASE("read vector slices", "[tdb_io]") { tiledb::Context ctx; std::string uri =