Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Various small code cleanups from IVF_PQ OOC work #552

Merged
merged 3 commits into from
Oct 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 4 additions & 7 deletions apis/python/src/tiledb/vector_search/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -542,10 +542,9 @@ def consolidate_updates(self, retrain_index: bool = False, **kwargs):
tiledb.consolidate(self.updates_array_uri, config=conf)

# We don't copy the centroids if self.partitions=0 because this means our index was previously empty.
should_pass_copy_centroids_uri = (
self.index_type == "IVF_FLAT" and not retrain_index and self.partitions > 0
)
if should_pass_copy_centroids_uri:
copy_centroids_uri = None
if self.index_type == "IVF_FLAT" and not retrain_index and self.partitions > 0:
copy_centroids_uri = self.centroids_uri
# Make sure the user didn't pass an incorrect number of partitions.
if "partitions" in kwargs and self.partitions != kwargs["partitions"]:
raise ValueError(
Expand All @@ -565,9 +564,7 @@ def consolidate_updates(self, retrain_index: bool = False, **kwargs):
index_timestamp=max_timestamp,
distance_metric=self.distance_metric,
storage_version=self.storage_version,
copy_centroids_uri=self.centroids_uri
if should_pass_copy_centroids_uri
else None,
copy_centroids_uri=copy_centroids_uri,
config=self.config,
**kwargs,
)
Expand Down
2 changes: 0 additions & 2 deletions apis/python/src/tiledb/vector_search/ingestion.py
Original file line number Diff line number Diff line change
Expand Up @@ -1738,9 +1738,7 @@ def ingest_vectors_udf(
if part_end > end:
part_end = end

str(part) + "-" + str(part_end)
part_id = int(part / batch)
part_id * (partitions + 1)

logger.debug("Input vectors start_pos: %d, end_pos: %d", part, part_end)
updated_ids = read_updated_ids(
Expand Down
20 changes: 10 additions & 10 deletions apis/python/test/test_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -495,28 +495,28 @@ def test_index_with_incorrect_num_of_query_columns_complex(tmp_path):
# number of columns in the indexed data.
size = 1000
indexes = ["FLAT", "IVF_FLAT", "VAMANA", "IVF_PQ"]
num_columns_in_vector = [1, 2, 3, 4, 5, 10]
dimensions_in_ingestion = [1, 2, 3, 4, 5, 10]
for index_type in indexes:
for num_columns in num_columns_in_vector:
index_uri = os.path.join(tmp_path, f"array_{index_type}_{num_columns}")
dataset_dir = os.path.join(tmp_path, f"dataset_{index_type}_{num_columns}")
for dimensions in dimensions_in_ingestion:
index_uri = os.path.join(tmp_path, f"array_{index_type}_{dimensions}")
dataset_dir = os.path.join(tmp_path, f"dataset_{index_type}_{dimensions}")
create_random_dataset_f32_only_data(
nb=size, d=num_columns, centers=1, path=dataset_dir
nb=size, d=dimensions, centers=1, path=dataset_dir
)
index = ingest(
index_type=index_type,
index_uri=index_uri,
source_uri=os.path.join(dataset_dir, "data.f32bin"),
num_subspaces=num_columns,
num_subspaces=dimensions,
partitions=1,
)

# We have created a dataset with num_columns in each vector. Let's try creating queries
# We have created a dataset with dimensions in each vector. Let's try creating queries
# with different numbers of columns and confirming incorrect ones will throw.
for num_columns_for_query in range(1, num_columns + 2):
query_shape = (1, num_columns_for_query)
for dimensions_in_query in range(1, dimensions + 2):
query_shape = (1, dimensions_in_query)
query = np.random.rand(*query_shape).astype(np.float32)
if query.shape[1] == num_columns:
if query.shape[1] == dimensions:
index.query(query, k=1, nprobe=1)
else:
with pytest.raises(TypeError):
Expand Down
13 changes: 13 additions & 0 deletions apis/python/test/test_type_erased_module.py
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,19 @@ def test_numpy_to_feature_vector_array_with_ids():
assert b.ids_type_string() == "uint64"


def test_numpy_to_feature_vector_single_item():
    # Round-trip a single-element float32 numpy array through
    # vspy.FeatureVector and confirm shape, dtype, and contents survive.
    src = np.array([1], dtype=np.float32)
    assert src.ndim == 1
    assert src.shape == (1,)

    vec = vspy.FeatureVector(src)
    assert vec.dimensions() == 1
    assert vec.feature_type_string() == "float32"

    round_tripped = np.array(vec)
    assert round_tripped.ndim == 1
    assert round_tripped.shape == (1,)
    assert (src == round_tripped).all()


def test_TemporalPolicy():
temporal_policy = vspy.TemporalPolicy()
assert temporal_policy.timestamp_start() == 0
Expand Down
33 changes: 20 additions & 13 deletions src/include/index/ivf_pq_group.h
Original file line number Diff line number Diff line change
Expand Up @@ -163,34 +163,38 @@ class ivf_pq_group : public base_index_group<index_type> {
/*****************************************************************************
* Inverted index information: centroids, index, pq_parts, ids
****************************************************************************/
[[nodiscard]] auto cluster_centroids_uri() const {
return this->array_key_to_uri("cluster_centroids_array_name");
}
[[nodiscard]] auto flat_ivf_centroids_uri() const {
return this->array_key_to_uri("flat_ivf_centroids_array_name");
}
[[nodiscard]] auto pq_ivf_indices_uri() const {
return this->array_key_to_uri("pq_ivf_indices_array_name");
}
[[nodiscard]] auto pq_ivf_ids_uri() const {
return this->array_key_to_uri("pq_ivf_ids_array_name");
}
[[nodiscard]] auto pq_ivf_vectors_uri() const {
return this->array_key_to_uri("pq_ivf_vectors_array_name");
[[nodiscard]] auto flat_ivf_centroids_array_name() const {
return this->array_key_to_array_name("flat_ivf_centroids_array_name");
}

[[nodiscard]] auto cluster_centroids_uri() const {
return this->array_key_to_uri("cluster_centroids_array_name");
}
[[nodiscard]] auto cluster_centroids_array_name() const {
return this->array_key_to_array_name("cluster_centroids_array_name");
}
[[nodiscard]] auto flat_ivf_centroids_array_name() const {
return this->array_key_to_array_name("flat_ivf_centroids_array_name");

[[nodiscard]] auto pq_ivf_indices_uri() const {
return this->array_key_to_uri("pq_ivf_indices_array_name");
}
[[nodiscard]] auto pq_ivf_indices_array_name() const {
return this->array_key_to_array_name("pq_ivf_indices_array_name");
}

[[nodiscard]] auto pq_ivf_ids_uri() const {
return this->array_key_to_uri("pq_ivf_ids_array_name");
}
[[nodiscard]] auto pq_ivf_ids_array_name() const {
return this->array_key_to_array_name("pq_ivf_ids_array_name");
}

[[nodiscard]] auto pq_ivf_vectors_uri() const {
return this->array_key_to_uri("pq_ivf_vectors_array_name");
}

[[nodiscard]] auto pq_ivf_vectors_array_name() const {
return this->array_key_to_array_name("pq_ivf_vectors_array_name");
}
Expand All @@ -205,6 +209,9 @@ class ivf_pq_group : public base_index_group<index_type> {
metadata_.num_subspaces_ = num_subspaces;
}

/// Number of dimensions in each PQ subspace, as recorded in the group
/// metadata. Marked [[nodiscard]] for consistency with the other read-only
/// accessors in this group (e.g. the *_uri() / *_array_name() getters).
[[nodiscard]] uint32_t get_sub_dimensions() const {
  return metadata_.sub_dimensions_;
}

/// Record the per-subspace dimension count in the group metadata.
void set_sub_dimensions(uint32_t sub_dimensions) {
  metadata_.sub_dimensions_ = sub_dimensions;
}
Expand Down
8 changes: 8 additions & 0 deletions src/include/index/ivf_pq_index.h
Original file line number Diff line number Diff line change
Expand Up @@ -1665,6 +1665,8 @@ class ivf_pq_index {
<< " num_vectors(rhs): " << ::num_vectors(rhs) << std::endl;
std::cout << "dimensions(lhs): " << ::dimensions(lhs)
<< " dimensions(rhs): " << ::dimensions(rhs) << std::endl;
debug_matrix(lhs, "[ivf_pq_index@compare_feature_vector_arrays] lhs");
debug_matrix(rhs, "[ivf_pq_index@compare_feature_vector_arrays] rhs");
return false;
}
for (size_t i = 0; i < ::num_vectors(lhs); ++i) {
Expand Down Expand Up @@ -1737,6 +1739,9 @@ class ivf_pq_index {
return true;
}
if (!partitioned_pq_vectors_ || !rhs.partitioned_pq_vectors_) {
std::cout << "[ivf_pq_index@compare_ivf_index] partitioned_pq_vectors_ "
"|| rhs.partitioned_pq_vectors_ is nullptr"
<< std::endl;
return false;
}
return compare_feature_vectors(
Expand All @@ -1750,6 +1755,9 @@ class ivf_pq_index {
return true;
}
if (!partitioned_pq_vectors_ || !rhs.partitioned_pq_vectors_) {
std::cout << "[ivf_pq_index@compare_ivf_ids] partitioned_pq_vectors_ || "
"rhs.partitioned_pq_vectors_ is nullptr"
<< std::endl;
return false;
}
return compare_feature_vectors(
Expand Down
22 changes: 11 additions & 11 deletions src/include/test/unit_api_ivf_pq_index.cc
Original file line number Diff line number Diff line change
Expand Up @@ -509,11 +509,11 @@ TEST_CASE("infer dimension", "[api_ivf_pq_index]") {

TEST_CASE("write and read", "[api_ivf_pq_index]") {
auto ctx = tiledb::Context{};
std::string api_ivf_pq_index_uri =
std::string index_uri =
(std::filesystem::temp_directory_path() / "api_ivf_pq_index").string();
tiledb::VFS vfs(ctx);
if (vfs.is_dir(api_ivf_pq_index_uri)) {
vfs.remove_dir(api_ivf_pq_index_uri);
if (vfs.is_dir(index_uri)) {
vfs.remove_dir(index_uri);
}

auto a = IndexIVFPQ(std::make_optional<IndexOptions>(
Expand All @@ -524,9 +524,9 @@ TEST_CASE("write and read", "[api_ivf_pq_index]") {
auto training_set = FeatureVectorArray(ctx, siftsmall_inputs_uri);
a.train(training_set);
a.add(training_set);
a.write_index(ctx, api_ivf_pq_index_uri);
a.write_index(ctx, index_uri);

auto b = IndexIVFPQ(ctx, api_ivf_pq_index_uri);
auto b = IndexIVFPQ(ctx, index_uri);

CHECK(dimensions(a) == dimensions(b));
CHECK(a.feature_type() == b.feature_type());
Expand Down Expand Up @@ -561,10 +561,10 @@ TEST_CASE("read index and query", "[api_ivf_pq_index]") {

size_t k_nn = 10;

std::string api_ivf_pq_index_uri =
std::string index_uri =
(std::filesystem::temp_directory_path() / "api_ivf_pq_index").string();
if (vfs.is_dir(api_ivf_pq_index_uri)) {
vfs.remove_dir(api_ivf_pq_index_uri);
if (vfs.is_dir(index_uri)) {
vfs.remove_dir(index_uri);
}

auto a = IndexIVFPQ(std::make_optional<IndexOptions>(
Expand All @@ -576,19 +576,19 @@ TEST_CASE("read index and query", "[api_ivf_pq_index]") {
auto training_set = FeatureVectorArray(ctx, siftsmall_inputs_uri);
a.train(training_set);
a.add(training_set);
a.write_index(ctx, api_ivf_pq_index_uri);
a.write_index(ctx, index_uri);

auto query_set = FeatureVectorArray(ctx, siftsmall_query_uri);
auto groundtruth_set = FeatureVectorArray(ctx, siftsmall_groundtruth_uri);

std::unique_ptr<IndexIVFPQ> b;
SECTION("infinite") {
b = std::make_unique<IndexIVFPQ>(ctx, api_ivf_pq_index_uri);
b = std::make_unique<IndexIVFPQ>(ctx, index_uri);
}
SECTION("finite") {
size_t upper_bound = GENERATE(500, 1000);
b = std::make_unique<IndexIVFPQ>(
ctx, api_ivf_pq_index_uri, IndexLoadStrategy::PQ_OOC, upper_bound);
ctx, index_uri, IndexLoadStrategy::PQ_OOC, upper_bound);
CHECK(b->upper_bound() == upper_bound);
}

Expand Down
14 changes: 7 additions & 7 deletions src/include/test/unit_flat_pq_index.cc
Original file line number Diff line number Diff line change
Expand Up @@ -534,13 +534,13 @@ TEMPLATE_TEST_CASE(
scale *= a_vx_pqx2;
}

CHECK(a_vx_pqx2 / scale < 0.00005);
CHECK(a_dpqx_evx2 / scale < 0.00005);
CHECK(s_evx_pqx2 / scale < 0.00005);
CHECK(ss_vx_dpqx2 / scale < 0.00005);
CHECK(s_evx_edpqx2 / scale < 0.00005);
CHECK(a_evx_edpqx2 / scale < 0.00005);
CHECK(ss_devx_dpqx2 / scale < 0.00005);
CHECK(a_vx_pqx2 / scale < 0.00006);
CHECK(a_dpqx_evx2 / scale < 0.00006);
CHECK(s_evx_pqx2 / scale < 0.00006);
CHECK(ss_vx_dpqx2 / scale < 0.00006);
CHECK(s_evx_edpqx2 / scale < 0.00006);
CHECK(a_evx_edpqx2 / scale < 0.00006);
CHECK(ss_devx_dpqx2 / scale < 0.00006);
CHECK(ss_devx_vx2 / scale < 0.000075);
}
}
Expand Down
Loading
Loading