From 4789993334a9736f094067581d6598757f71a19d Mon Sep 17 00:00:00 2001 From: Paris Morgan Date: Tue, 15 Oct 2024 17:07:02 -0700 Subject: [PATCH] Misc small code cleanups --- apis/python/src/tiledb/vector_search/index.py | 17 +++-- .../src/tiledb/vector_search/ingestion.py | 2 - apis/python/test/test_index.py | 20 +++--- apis/python/test/test_type_erased_module.py | 13 ++++ src/include/index/ivf_pq_group.h | 33 +++++---- src/include/index/ivf_pq_index.h | 8 +++ src/include/test/unit_api_ivf_pq_index.cc | 22 +++--- src/include/test/unit_ivf_pq_index.cc | 67 +++---------------- 8 files changed, 83 insertions(+), 99 deletions(-) diff --git a/apis/python/src/tiledb/vector_search/index.py b/apis/python/src/tiledb/vector_search/index.py index d82171a34..e69fe93a7 100644 --- a/apis/python/src/tiledb/vector_search/index.py +++ b/apis/python/src/tiledb/vector_search/index.py @@ -542,10 +542,15 @@ def consolidate_updates(self, retrain_index: bool = False, **kwargs): tiledb.consolidate(self.updates_array_uri, config=conf) # We don't copy the centroids if self.partitions=0 because this means our index was previously empty. - should_pass_copy_centroids_uri = ( - self.index_type == "IVF_FLAT" and not retrain_index and self.partitions > 0 - ) - if should_pass_copy_centroids_uri: + copy_centroids_uri = None + if ( + (self.index_type == "IVF_FLAT" or self.index_type == "IVF_PQ") + and not retrain_index + and self.partitions > 0 + ): + if self.index_type == "IVF_FLAT": + # TODO(paris): Update so that IVF_PQ can also copy the centroids. We also need to pass the PQ-centroids. + copy_centroids_uri = self.centroids_uri # Make sure the user didn't pass an incorrect number of partitions. if "partitions" in kwargs and self.partitions != kwargs["partitions"]: raise ValueError( @@ -565,9 +570,7 @@ def consolidate_updates(self, retrain_index: bool = False, **kwargs): index_timestamp=max_timestamp, distance_metric=self.distance_metric, storage_version=self.storage_version, - copy_centroids_uri=self.centroids_uri - if should_pass_copy_centroids_uri - else None, + copy_centroids_uri=copy_centroids_uri, config=self.config, **kwargs, ) diff --git a/apis/python/src/tiledb/vector_search/ingestion.py b/apis/python/src/tiledb/vector_search/ingestion.py index e8b221e3b..a186e29fb 100644 --- a/apis/python/src/tiledb/vector_search/ingestion.py +++ b/apis/python/src/tiledb/vector_search/ingestion.py @@ -1738,9 +1738,7 @@ def ingest_vectors_udf( if part_end > end: part_end = end - str(part) + "-" + str(part_end) part_id = int(part / batch) - part_id * (partitions + 1) logger.debug("Input vectors start_pos: %d, end_pos: %d", part, part_end) updated_ids = read_updated_ids( diff --git a/apis/python/test/test_index.py b/apis/python/test/test_index.py index 65ed73f31..87e242e31 100644 --- a/apis/python/test/test_index.py +++ b/apis/python/test/test_index.py @@ -495,28 +495,28 @@ def test_index_with_incorrect_num_of_query_columns_complex(tmp_path): # number of columns in the indexed data. size = 1000 indexes = ["FLAT", "IVF_FLAT", "VAMANA", "IVF_PQ"] - num_columns_in_vector = [1, 2, 3, 4, 5, 10] + dimensions_in_ingestion = [1, 2, 3, 4, 5, 10] for index_type in indexes: - for num_columns in num_columns_in_vector: - index_uri = os.path.join(tmp_path, f"array_{index_type}_{num_columns}") - dataset_dir = os.path.join(tmp_path, f"dataset_{index_type}_{num_columns}") + for dimensions in dimensions_in_ingestion: + index_uri = os.path.join(tmp_path, f"array_{index_type}_{dimensions}") + dataset_dir = os.path.join(tmp_path, f"dataset_{index_type}_{dimensions}") create_random_dataset_f32_only_data( - nb=size, d=num_columns, centers=1, path=dataset_dir + nb=size, d=dimensions, centers=1, path=dataset_dir ) index = ingest( index_type=index_type, index_uri=index_uri, source_uri=os.path.join(dataset_dir, "data.f32bin"), - num_subspaces=num_columns, + num_subspaces=dimensions, partitions=1, ) - # We have created a dataset with num_columns in each vector. Let's try creating queries + # We have created a dataset with dimensions in each vector. Let's try creating queries # with different numbers of columns and confirming incorrect ones will throw. - for num_columns_for_query in range(1, num_columns + 2): - query_shape = (1, num_columns_for_query) + for dimensions_in_query in range(1, dimensions + 2): + query_shape = (1, dimensions_in_query) query = np.random.rand(*query_shape).astype(np.float32) - if query.shape[1] == num_columns: + if query.shape[1] == dimensions: index.query(query, k=1, nprobe=1) else: with pytest.raises(TypeError): diff --git a/apis/python/test/test_type_erased_module.py b/apis/python/test/test_type_erased_module.py index e182caeb4..4d3fe803f 100644 --- a/apis/python/test/test_type_erased_module.py +++ b/apis/python/test/test_type_erased_module.py @@ -234,6 +234,19 @@ def test_numpy_to_feature_vector_array_with_ids(): assert b.ids_type_string() == "uint64" +def test_numpy_to_feature_vector_single_item(): + a = np.array([1], dtype=np.float32) + assert a.ndim == 1 + assert a.shape == (1,) + b = vspy.FeatureVector(a) + assert b.dimensions() == 1 + assert b.feature_type_string() == "float32" + c = np.array(b) + assert c.ndim == 1 + assert c.shape == (1,) + assert (a == c).all() + + def test_TemporalPolicy(): temporal_policy = vspy.TemporalPolicy() assert temporal_policy.timestamp_start() == 0 diff --git a/src/include/index/ivf_pq_group.h b/src/include/index/ivf_pq_group.h index 8d8b20b2e..1eee68c6f 100644 --- a/src/include/index/ivf_pq_group.h +++ b/src/include/index/ivf_pq_group.h @@ -163,34 +163,38 @@ class ivf_pq_group : public base_index_group { /***************************************************************************** * Inverted index information: centroids, index, pq_parts, ids ****************************************************************************/ - [[nodiscard]] auto cluster_centroids_uri() const { - return this->array_key_to_uri("cluster_centroids_array_name"); - } [[nodiscard]] auto flat_ivf_centroids_uri() const { return this->array_key_to_uri("flat_ivf_centroids_array_name"); } - [[nodiscard]] auto pq_ivf_indices_uri() const { - return this->array_key_to_uri("pq_ivf_indices_array_name"); - } - [[nodiscard]] auto pq_ivf_ids_uri() const { - return this->array_key_to_uri("pq_ivf_ids_array_name"); - } - [[nodiscard]] auto pq_ivf_vectors_uri() const { - return this->array_key_to_uri("pq_ivf_vectors_array_name"); + [[nodiscard]] auto flat_ivf_centroids_array_name() const { + return this->array_key_to_array_name("flat_ivf_centroids_array_name"); } + [[nodiscard]] auto cluster_centroids_uri() const { + return this->array_key_to_uri("cluster_centroids_array_name"); + } [[nodiscard]] auto cluster_centroids_array_name() const { return this->array_key_to_array_name("cluster_centroids_array_name"); } - [[nodiscard]] auto flat_ivf_centroids_array_name() const { - return this->array_key_to_array_name("flat_ivf_centroids_array_name"); + + [[nodiscard]] auto pq_ivf_indices_uri() const { + return this->array_key_to_uri("pq_ivf_indices_array_name"); } [[nodiscard]] auto pq_ivf_indices_array_name() const { return this->array_key_to_array_name("pq_ivf_indices_array_name"); } + + [[nodiscard]] auto pq_ivf_ids_uri() const { + return this->array_key_to_uri("pq_ivf_ids_array_name"); + } [[nodiscard]] auto pq_ivf_ids_array_name() const { return this->array_key_to_array_name("pq_ivf_ids_array_name"); } + + [[nodiscard]] auto pq_ivf_vectors_uri() const { + return this->array_key_to_uri("pq_ivf_vectors_array_name"); + } + [[nodiscard]] auto pq_ivf_vectors_array_name() const { return this->array_key_to_array_name("pq_ivf_vectors_array_name"); } @@ -205,6 +209,9 @@ class ivf_pq_group : public base_index_group { metadata_.num_subspaces_ = num_subspaces; } + uint32_t get_sub_dimensions() const { + return metadata_.sub_dimensions_; + } void set_sub_dimensions(uint32_t sub_dimensions) { metadata_.sub_dimensions_ = sub_dimensions; } diff --git a/src/include/index/ivf_pq_index.h b/src/include/index/ivf_pq_index.h index f8c6034b5..b4a02f0d7 100644 --- a/src/include/index/ivf_pq_index.h +++ b/src/include/index/ivf_pq_index.h @@ -1665,6 +1665,8 @@ class ivf_pq_index { << " num_vectors(rhs): " << ::num_vectors(rhs) << std::endl; std::cout << "dimensions(lhs): " << ::dimensions(lhs) << " dimensions(rhs): " << ::dimensions(rhs) << std::endl; + debug_matrix(lhs, "[ivf_pq_index@compare_feature_vector_arrays] lhs"); + debug_matrix(rhs, "[ivf_pq_index@compare_feature_vector_arrays] rhs"); return false; } for (size_t i = 0; i < ::num_vectors(lhs); ++i) { @@ -1737,6 +1739,9 @@ class ivf_pq_index { return true; } if (!partitioned_pq_vectors_ || !rhs.partitioned_pq_vectors_) { + std::cout << "[ivf_pq_index@compare_ivf_index] partitioned_pq_vectors_ " + "|| rhs.partitioned_pq_vectors_ is nullptr" + << std::endl; return false; } return compare_feature_vectors( @@ -1750,6 +1755,9 @@ class ivf_pq_index { return true; } if (!partitioned_pq_vectors_ || !rhs.partitioned_pq_vectors_) { + std::cout << "[ivf_pq_index@compare_ivf_ids] partitioned_pq_vectors_ || " + "rhs.partitioned_pq_vectors_ is nullptr" + << std::endl; return false; } return compare_feature_vectors( diff --git a/src/include/test/unit_api_ivf_pq_index.cc b/src/include/test/unit_api_ivf_pq_index.cc index 9936248d1..537c50aae 100644 --- a/src/include/test/unit_api_ivf_pq_index.cc +++ b/src/include/test/unit_api_ivf_pq_index.cc @@ -509,11 +509,11 @@ TEST_CASE("infer dimension", "[api_ivf_pq_index]") { TEST_CASE("write and read", "[api_ivf_pq_index]") { auto ctx = tiledb::Context{}; - std::string api_ivf_pq_index_uri = + std::string index_uri = (std::filesystem::temp_directory_path() / "api_ivf_pq_index").string(); tiledb::VFS vfs(ctx); - if (vfs.is_dir(api_ivf_pq_index_uri)) { - vfs.remove_dir(api_ivf_pq_index_uri); + if (vfs.is_dir(index_uri)) { + vfs.remove_dir(index_uri); } auto a = IndexIVFPQ(std::make_optional( @@ -524,9 +524,9 @@ TEST_CASE("write and read", "[api_ivf_pq_index]") { auto training_set = FeatureVectorArray(ctx, siftsmall_inputs_uri); a.train(training_set); a.add(training_set); - a.write_index(ctx, api_ivf_pq_index_uri); + a.write_index(ctx, index_uri); - auto b = IndexIVFPQ(ctx, api_ivf_pq_index_uri); + auto b = IndexIVFPQ(ctx, index_uri); CHECK(dimensions(a) == dimensions(b)); CHECK(a.feature_type() == b.feature_type()); @@ -561,10 +561,10 @@ TEST_CASE("read index and query", "[api_ivf_pq_index]") { size_t k_nn = 10; - std::string api_ivf_pq_index_uri = + std::string index_uri = (std::filesystem::temp_directory_path() / "api_ivf_pq_index").string(); - if (vfs.is_dir(api_ivf_pq_index_uri)) { - vfs.remove_dir(api_ivf_pq_index_uri); + if (vfs.is_dir(index_uri)) { + vfs.remove_dir(index_uri); } auto a = IndexIVFPQ(std::make_optional( @@ -576,19 +576,19 @@ TEST_CASE("read index and query", "[api_ivf_pq_index]") { auto training_set = FeatureVectorArray(ctx, siftsmall_inputs_uri); a.train(training_set); a.add(training_set); - a.write_index(ctx, api_ivf_pq_index_uri); + a.write_index(ctx, index_uri); auto query_set = FeatureVectorArray(ctx, siftsmall_query_uri); auto groundtruth_set = FeatureVectorArray(ctx, siftsmall_groundtruth_uri); std::unique_ptr b; SECTION("infinite") { - b = std::make_unique(ctx, api_ivf_pq_index_uri); + b = std::make_unique(ctx, index_uri); } SECTION("finite") { size_t upper_bound = GENERATE(500, 1000); b = std::make_unique( - ctx, api_ivf_pq_index_uri, IndexLoadStrategy::PQ_OOC, upper_bound); + ctx, index_uri, IndexLoadStrategy::PQ_OOC, upper_bound); CHECK(b->upper_bound() == upper_bound); } diff --git a/src/include/test/unit_ivf_pq_index.cc b/src/include/test/unit_ivf_pq_index.cc index b726f2bd1..562bb5dc8 100644 --- a/src/include/test/unit_ivf_pq_index.cc +++ b/src/include/test/unit_ivf_pq_index.cc @@ -102,24 +102,16 @@ TEST_CASE("test kmeans initializations", "[ivf_pq_index][init]") { index.set_flat_ivf_centroids(ColMajorMatrix(4, 3)); SECTION("random") { - if (debug) { - std::cout << "random" << std::endl; - } index.kmeans_random_init(training_data); } SECTION("kmeans++") { - if (debug) { - std::cout << "kmeans++" << std::endl; - } index.kmeans_pp(training_data); } CHECK(index.get_flat_ivf_centroids().num_cols() == 3); CHECK(index.get_flat_ivf_centroids().num_rows() == 4); - // debug_centroids(index); - for (size_t i = 0; i < index.get_flat_ivf_centroids().num_cols() - 1; ++i) { for (size_t j = i + 1; j < index.get_flat_ivf_centroids().num_cols(); ++j) { CHECK(!std::equal( @@ -145,8 +137,6 @@ TEST_CASE("test kmeans initializations", "[ivf_pq_index][init]") { } TEST_CASE("test kmeans", "[ivf_pq_index][kmeans]") { - const bool debug = false; - std::vector data = {8, 6, 7, 5, 3, 3, 7, 2, 1, 4, 1, 3, 0, 5, 1, 2, 9, 9, 5, 9, 2, 0, 2, 7, 7, 9, 8, 6, 7, 9, 6, 6}; @@ -157,16 +147,10 @@ TEST_CASE("test kmeans", "[ivf_pq_index][kmeans]") { /*4,*/ 3, 2, 10, 1e-4); SECTION("random") { - if (debug) { - std::cout << "random" << std::endl; - } index.train_ivf(training_data, kmeans_init::random); } SECTION("kmeans++") { - if (debug) { - std::cout << "kmeans++" << std::endl; - } index.train_ivf(training_data, kmeans_init::kmeanspp); } } @@ -190,9 +174,6 @@ TEST_CASE("debug w/ sk", "[ivf_pq_index]") { {0.7306664, 5.7294807}}}; SECTION("one iteration") { - if (debug) { - std::cout << "one iteration" << std::endl; - } auto index = ivf_pq_index( /*sklearn_centroids.num_rows(),*/ sklearn_centroids.num_cols(), @@ -204,37 +185,20 @@ TEST_CASE("debug w/ sk", "[ivf_pq_index]") { } SECTION("two iterations") { - if (debug) { - std::cout << "two iterations" << std::endl; - } auto index = ivf_pq_index( - /*sklearn_centroids.num_rows(),*/ - sklearn_centroids.num_cols(), - 2, - 2, - 1e-4); + sklearn_centroids.num_cols(), 2, 2, 1e-4); index.set_flat_ivf_centroids(sklearn_centroids); index.train_ivf(training_data, kmeans_init::none); } SECTION("five iterations") { - if (debug) { - std::cout << "five iterations" << std::endl; - } auto index = ivf_pq_index( - /* sklearn_centroids.num_rows(), */ - sklearn_centroids.num_cols(), - 2, - 5, - 1e-4); + sklearn_centroids.num_cols(), 2, 5, 1e-4); index.set_flat_ivf_centroids(sklearn_centroids); index.train_ivf(training_data, kmeans_init::none); } SECTION("five iterations, perturbed") { - if (debug) { - std::cout << "five iterations, perturbed" << std::endl; - } for (size_t i = 0; i < sklearn_centroids.num_cols(); ++i) { for (size_t j = 0; j < sklearn_centroids.num_rows(); ++j) { sklearn_centroids(j, i) *= 0.8; @@ -253,9 +217,6 @@ TEST_CASE("debug w/ sk", "[ivf_pq_index]") { } SECTION("five iterations") { - if (debug) { - std::cout << "five iterations" << std::endl; - } auto index = ivf_pq_index( /* sklearn_centroids.num_rows(), */ sklearn_centroids.num_cols(), @@ -470,21 +431,16 @@ TEST_CASE("build index and infinite query in place", "[ivf_pq_index]") { std::tie(top_k_ivf_scores, top_k_ivf) = idx.query(query_set, k_nn, nprobe); - // NOTE: Can be used to debug the results: - // debug_slice(top_k_ivf, "top_k_ivf"); - // debug_slice(top_k_ivf_scores, "top_k_ivf_scores"); - // debug_slice(groundtruth_set, "groundtruth_set"); - init.verify(top_k_ivf); } TEST_CASE("ivf_pq_index write and read", "[ivf_pq_index]") { tiledb::Context ctx; - std::string ivf_pq_index_uri = + std::string index_uri = (std::filesystem::temp_directory_path() / "tmp_ivf_pq_index").string(); tiledb::VFS vfs(ctx); - if (vfs.is_dir(ivf_pq_index_uri)) { - vfs.remove_dir(ivf_pq_index_uri); + if (vfs.is_dir(index_uri)) { + vfs.remove_dir(index_uri); } auto training_set = tdbColMajorMatrix(ctx, siftsmall_inputs_uri, 0); load(training_set); @@ -496,14 +452,13 @@ TEST_CASE("ivf_pq_index write and read", "[ivf_pq_index]") { idx.train(training_set, ids); idx.add(training_set, ids); uint64_t write_timestamp = 1000; - idx.write_index( - ctx, ivf_pq_index_uri, TemporalPolicy(TimeTravel, write_timestamp)); + idx.write_index(ctx, index_uri, TemporalPolicy(TimeTravel, write_timestamp)); CHECK(idx.num_vectors() == ::num_vectors(training_set)); { // Load the index and check metadata. auto idx2 = ivf_pq_index( - ctx, ivf_pq_index_uri); + ctx, index_uri); CHECK(idx2.num_vectors() == ::num_vectors(training_set)); CHECK(idx2.group().get_dimensions() == sift_dimensions); CHECK(idx2.group().get_temp_size() == 0); @@ -521,16 +476,16 @@ TEST_CASE("ivf_pq_index write and read", "[ivf_pq_index]") { // CHECK(idx.compare_group(idx2)); auto idx3 = ivf_pq_index( - ctx, ivf_pq_index_uri); + ctx, index_uri); CHECK(idx2 == idx3); } { // Clear history. ivf_pq_index::clear_history( - ctx, ivf_pq_index_uri, write_timestamp + 10); + ctx, index_uri, write_timestamp + 10); auto idx2 = ivf_pq_index( - ctx, ivf_pq_index_uri); + ctx, index_uri); CHECK(idx2.num_vectors() == 0); CHECK(idx2.group().get_dimensions() == sift_dimensions); @@ -650,7 +605,7 @@ TEST_CASE("query simple", "[ivf_pq_index]") { // We can train, add, query, and then write the index. { - auto training = ColMajorMatrixWithIds{ + auto training = ColMajorMatrixWithIds{ {{1, 1, 1, 1}, {2, 2, 2, 2}, {3, 3, 3, 3}, {4, 4, 4, 4}}, {11, 22, 33, 44}}; index.train(training, training.raveled_ids());