Skip to content

Commit

Permalink
Misc small code cleanups
Browse files — browse the repository at this point in the history
  • Loading branch information
jparismorgan committed Oct 16, 2024
1 parent 36978a6 commit 4789993
Show file tree
Hide file tree
Showing 8 changed files with 83 additions and 99 deletions.
17 changes: 10 additions & 7 deletions apis/python/src/tiledb/vector_search/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -542,10 +542,15 @@ def consolidate_updates(self, retrain_index: bool = False, **kwargs):
tiledb.consolidate(self.updates_array_uri, config=conf)

# We don't copy the centroids if self.partitions=0 because this means our index was previously empty.
should_pass_copy_centroids_uri = (
self.index_type == "IVF_FLAT" and not retrain_index and self.partitions > 0
)
if should_pass_copy_centroids_uri:
copy_centroids_uri = None
if (
(self.index_type == "IVF_FLAT" or self.index_type == "IVF_PQ")
and not retrain_index
and self.partitions > 0
):
if self.index_type == "IVF_FLAT":
# TODO(paris): Update so that IVF_PQ can also copy the centroids. We also need to pass the PQ-centroids.
copy_centroids_uri = self.centroids_uri
# Make sure the user didn't pass an incorrect number of partitions.
if "partitions" in kwargs and self.partitions != kwargs["partitions"]:
raise ValueError(
Expand All @@ -565,9 +570,7 @@ def consolidate_updates(self, retrain_index: bool = False, **kwargs):
index_timestamp=max_timestamp,
distance_metric=self.distance_metric,
storage_version=self.storage_version,
copy_centroids_uri=self.centroids_uri
if should_pass_copy_centroids_uri
else None,
copy_centroids_uri=copy_centroids_uri,
config=self.config,
**kwargs,
)
Expand Down
2 changes: 0 additions & 2 deletions apis/python/src/tiledb/vector_search/ingestion.py
Original file line number Diff line number Diff line change
Expand Up @@ -1738,9 +1738,7 @@ def ingest_vectors_udf(
if part_end > end:
part_end = end

str(part) + "-" + str(part_end)
part_id = int(part / batch)
part_id * (partitions + 1)

logger.debug("Input vectors start_pos: %d, end_pos: %d", part, part_end)
updated_ids = read_updated_ids(
Expand Down
20 changes: 10 additions & 10 deletions apis/python/test/test_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -495,28 +495,28 @@ def test_index_with_incorrect_num_of_query_columns_complex(tmp_path):
# number of columns in the indexed data.
size = 1000
indexes = ["FLAT", "IVF_FLAT", "VAMANA", "IVF_PQ"]
num_columns_in_vector = [1, 2, 3, 4, 5, 10]
dimensions_in_ingestion = [1, 2, 3, 4, 5, 10]
for index_type in indexes:
for num_columns in num_columns_in_vector:
index_uri = os.path.join(tmp_path, f"array_{index_type}_{num_columns}")
dataset_dir = os.path.join(tmp_path, f"dataset_{index_type}_{num_columns}")
for dimensions in dimensions_in_ingestion:
index_uri = os.path.join(tmp_path, f"array_{index_type}_{dimensions}")
dataset_dir = os.path.join(tmp_path, f"dataset_{index_type}_{dimensions}")
create_random_dataset_f32_only_data(
nb=size, d=num_columns, centers=1, path=dataset_dir
nb=size, d=dimensions, centers=1, path=dataset_dir
)
index = ingest(
index_type=index_type,
index_uri=index_uri,
source_uri=os.path.join(dataset_dir, "data.f32bin"),
num_subspaces=num_columns,
num_subspaces=dimensions,
partitions=1,
)

# We have created a dataset with num_columns in each vector. Let's try creating queries
# We have created a dataset with dimensions in each vector. Let's try creating queries
# with different numbers of columns and confirming incorrect ones will throw.
for num_columns_for_query in range(1, num_columns + 2):
query_shape = (1, num_columns_for_query)
for dimensions_in_query in range(1, dimensions + 2):
query_shape = (1, dimensions_in_query)
query = np.random.rand(*query_shape).astype(np.float32)
if query.shape[1] == num_columns:
if query.shape[1] == dimensions:
index.query(query, k=1, nprobe=1)
else:
with pytest.raises(TypeError):
Expand Down
13 changes: 13 additions & 0 deletions apis/python/test/test_type_erased_module.py
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,19 @@ def test_numpy_to_feature_vector_array_with_ids():
assert b.ids_type_string() == "uint64"


def test_numpy_to_feature_vector_single_item():
    """Round-trip a length-1 float32 numpy vector through vspy.FeatureVector.

    Verifies dimensions and dtype string on the wrapped vector, and that
    converting back to numpy reproduces the original array exactly.
    """
    source = np.array([1], dtype=np.float32)
    assert source.ndim == 1
    assert source.shape == (1,)

    vector = vspy.FeatureVector(source)
    assert vector.dimensions() == 1
    assert vector.feature_type_string() == "float32"

    round_tripped = np.array(vector)
    assert round_tripped.ndim == 1
    assert round_tripped.shape == (1,)
    assert (source == round_tripped).all()


def test_TemporalPolicy():
temporal_policy = vspy.TemporalPolicy()
assert temporal_policy.timestamp_start() == 0
Expand Down
33 changes: 20 additions & 13 deletions src/include/index/ivf_pq_group.h
Original file line number Diff line number Diff line change
Expand Up @@ -163,34 +163,38 @@ class ivf_pq_group : public base_index_group<index_type> {
/*****************************************************************************
* Inverted index information: centroids, index, pq_parts, ids
****************************************************************************/
[[nodiscard]] auto cluster_centroids_uri() const {
return this->array_key_to_uri("cluster_centroids_array_name");
}
[[nodiscard]] auto flat_ivf_centroids_uri() const {
return this->array_key_to_uri("flat_ivf_centroids_array_name");
}
[[nodiscard]] auto pq_ivf_indices_uri() const {
return this->array_key_to_uri("pq_ivf_indices_array_name");
}
[[nodiscard]] auto pq_ivf_ids_uri() const {
return this->array_key_to_uri("pq_ivf_ids_array_name");
}
[[nodiscard]] auto pq_ivf_vectors_uri() const {
return this->array_key_to_uri("pq_ivf_vectors_array_name");
// Array name registered under the "flat_ivf_centroids_array_name" key
// (resolved by the base-group helper; presumably the un-prefixed TileDB
// array name, as opposed to the full URI — confirm in base_index_group).
[[nodiscard]] auto flat_ivf_centroids_array_name() const {
  return this->array_key_to_array_name("flat_ivf_centroids_array_name");
}

// Full URI of the cluster-centroids array, resolved via the base-group
// key-to-URI helper.
[[nodiscard]] auto cluster_centroids_uri() const {
  return this->array_key_to_uri("cluster_centroids_array_name");
}
// Array name (without the group URI prefix — TODO confirm) for the same
// cluster-centroids array.
[[nodiscard]] auto cluster_centroids_array_name() const {
  return this->array_key_to_array_name("cluster_centroids_array_name");
}
[[nodiscard]] auto flat_ivf_centroids_array_name() const {
return this->array_key_to_array_name("flat_ivf_centroids_array_name");

// Full URI of the PQ-IVF indices array (partition index offsets —
// presumably; confirm against the writer).
[[nodiscard]] auto pq_ivf_indices_uri() const {
  return this->array_key_to_uri("pq_ivf_indices_array_name");
}
// Bare array name for the PQ-IVF indices array.
[[nodiscard]] auto pq_ivf_indices_array_name() const {
  return this->array_key_to_array_name("pq_ivf_indices_array_name");
}

// Full URI of the PQ-IVF ids array (vector external ids — presumably;
// confirm against the writer).
[[nodiscard]] auto pq_ivf_ids_uri() const {
  return this->array_key_to_uri("pq_ivf_ids_array_name");
}
// Bare array name for the PQ-IVF ids array.
[[nodiscard]] auto pq_ivf_ids_array_name() const {
  return this->array_key_to_array_name("pq_ivf_ids_array_name");
}

// Full URI of the PQ-encoded IVF vectors array.
[[nodiscard]] auto pq_ivf_vectors_uri() const {
  return this->array_key_to_uri("pq_ivf_vectors_array_name");
}

// Bare array name for the PQ-encoded IVF vectors array.
[[nodiscard]] auto pq_ivf_vectors_array_name() const {
  return this->array_key_to_array_name("pq_ivf_vectors_array_name");
}
Expand All @@ -205,6 +209,9 @@ class ivf_pq_group : public base_index_group<index_type> {
metadata_.num_subspaces_ = num_subspaces;
}

// Read the sub_dimensions value cached in the group metadata (dimensions
// per PQ subspace — presumably; confirm against the metadata definition).
uint32_t get_sub_dimensions() const {
  return metadata_.sub_dimensions_;
}
// Store a new sub_dimensions value in the group metadata.
void set_sub_dimensions(uint32_t sub_dimensions) {
  metadata_.sub_dimensions_ = sub_dimensions;
}
Expand Down
8 changes: 8 additions & 0 deletions src/include/index/ivf_pq_index.h
Original file line number Diff line number Diff line change
Expand Up @@ -1665,6 +1665,8 @@ class ivf_pq_index {
<< " num_vectors(rhs): " << ::num_vectors(rhs) << std::endl;
std::cout << "dimensions(lhs): " << ::dimensions(lhs)
<< " dimensions(rhs): " << ::dimensions(rhs) << std::endl;
debug_matrix(lhs, "[ivf_pq_index@compare_feature_vector_arrays] lhs");
debug_matrix(rhs, "[ivf_pq_index@compare_feature_vector_arrays] rhs");
return false;
}
for (size_t i = 0; i < ::num_vectors(lhs); ++i) {
Expand Down Expand Up @@ -1737,6 +1739,9 @@ class ivf_pq_index {
return true;
}
if (!partitioned_pq_vectors_ || !rhs.partitioned_pq_vectors_) {
std::cout << "[ivf_pq_index@compare_ivf_index] partitioned_pq_vectors_ "
"|| rhs.partitioned_pq_vectors_ is nullptr"
<< std::endl;
return false;
}
return compare_feature_vectors(
Expand All @@ -1750,6 +1755,9 @@ class ivf_pq_index {
return true;
}
if (!partitioned_pq_vectors_ || !rhs.partitioned_pq_vectors_) {
std::cout << "[ivf_pq_index@compare_ivf_ids] partitioned_pq_vectors_ || "
"rhs.partitioned_pq_vectors_ is nullptr"
<< std::endl;
return false;
}
return compare_feature_vectors(
Expand Down
22 changes: 11 additions & 11 deletions src/include/test/unit_api_ivf_pq_index.cc
Original file line number Diff line number Diff line change
Expand Up @@ -509,11 +509,11 @@ TEST_CASE("infer dimension", "[api_ivf_pq_index]") {

TEST_CASE("write and read", "[api_ivf_pq_index]") {
auto ctx = tiledb::Context{};
std::string api_ivf_pq_index_uri =
std::string index_uri =
(std::filesystem::temp_directory_path() / "api_ivf_pq_index").string();
tiledb::VFS vfs(ctx);
if (vfs.is_dir(api_ivf_pq_index_uri)) {
vfs.remove_dir(api_ivf_pq_index_uri);
if (vfs.is_dir(index_uri)) {
vfs.remove_dir(index_uri);
}

auto a = IndexIVFPQ(std::make_optional<IndexOptions>(
Expand All @@ -524,9 +524,9 @@ TEST_CASE("write and read", "[api_ivf_pq_index]") {
auto training_set = FeatureVectorArray(ctx, siftsmall_inputs_uri);
a.train(training_set);
a.add(training_set);
a.write_index(ctx, api_ivf_pq_index_uri);
a.write_index(ctx, index_uri);

auto b = IndexIVFPQ(ctx, api_ivf_pq_index_uri);
auto b = IndexIVFPQ(ctx, index_uri);

CHECK(dimensions(a) == dimensions(b));
CHECK(a.feature_type() == b.feature_type());
Expand Down Expand Up @@ -561,10 +561,10 @@ TEST_CASE("read index and query", "[api_ivf_pq_index]") {

size_t k_nn = 10;

std::string api_ivf_pq_index_uri =
std::string index_uri =
(std::filesystem::temp_directory_path() / "api_ivf_pq_index").string();
if (vfs.is_dir(api_ivf_pq_index_uri)) {
vfs.remove_dir(api_ivf_pq_index_uri);
if (vfs.is_dir(index_uri)) {
vfs.remove_dir(index_uri);
}

auto a = IndexIVFPQ(std::make_optional<IndexOptions>(
Expand All @@ -576,19 +576,19 @@ TEST_CASE("read index and query", "[api_ivf_pq_index]") {
auto training_set = FeatureVectorArray(ctx, siftsmall_inputs_uri);
a.train(training_set);
a.add(training_set);
a.write_index(ctx, api_ivf_pq_index_uri);
a.write_index(ctx, index_uri);

auto query_set = FeatureVectorArray(ctx, siftsmall_query_uri);
auto groundtruth_set = FeatureVectorArray(ctx, siftsmall_groundtruth_uri);

std::unique_ptr<IndexIVFPQ> b;
SECTION("infinite") {
b = std::make_unique<IndexIVFPQ>(ctx, api_ivf_pq_index_uri);
b = std::make_unique<IndexIVFPQ>(ctx, index_uri);
}
SECTION("finite") {
size_t upper_bound = GENERATE(500, 1000);
b = std::make_unique<IndexIVFPQ>(
ctx, api_ivf_pq_index_uri, IndexLoadStrategy::PQ_OOC, upper_bound);
ctx, index_uri, IndexLoadStrategy::PQ_OOC, upper_bound);
CHECK(b->upper_bound() == upper_bound);
}

Expand Down
Loading

0 comments on commit 4789993

Please sign in to comment.