Skip to content

Commit

Permalink
fix bug with incorrect partitions on second train
Browse files Browse the repository at this point in the history
  • Loading branch information
jparismorgan committed Oct 15, 2024
1 parent 2f0caf1 commit 0f36bc2
Show file tree
Hide file tree
Showing 6 changed files with 107 additions and 160 deletions.
3 changes: 0 additions & 3 deletions apis/python/test/test_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,6 @@ def check_default_metadata(
uri, expected_vector_type, expected_storage_version, expected_index_type
):
group = tiledb.Group(uri, "r", ctx=tiledb.Ctx(None))
print("[check_default_metadata] group.meta", group.meta)
assert "dataset_type" in group.meta
assert group.meta["dataset_type"] == DATASET_TYPE
assert type(group.meta["dataset_type"]) == str
Expand Down Expand Up @@ -306,7 +305,6 @@ def test_ivf_pq_index(tmp_path):
uri = os.path.join(tmp_path, "array")
if os.path.exists(uri):
os.rmdir(uri)

vector_type = np.float32

index = ivf_pq_index.create(
Expand Down Expand Up @@ -342,7 +340,6 @@ def test_ivf_pq_index(tmp_path):
vectors=update_vectors,
external_ids=np.array([0, 1, 2, 3, 4], dtype=np.dtype(np.uint32)),
)

query_and_check_distances(
index, np.array([[2, 2, 2]], dtype=np.float32), 2, [[0, 3]], [[2, 1]]
)
Expand Down
4 changes: 2 additions & 2 deletions src/include/detail/ivf/index.h
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@ int ivf_pq_index(
// Find the centroid that is closest to each input vector.
auto parts = detail::flat::qv_partition(centroids, input_vectors, nthreads);
{
scoped_timer _{"shuffling data"};
scoped_timer _{"index@ivf_pq_index@shuffling_data"};
std::unordered_set<IdsType> deleted_ids_set(
deleted_ids.begin(), deleted_ids.end());
auto indices = compute_indices<
Expand Down Expand Up @@ -274,7 +274,7 @@ int ivf_index(
// Find the centroid that is closest to each input vector.
auto parts = detail::flat::qv_partition(centroids, input_vectors, nthreads);
{
scoped_timer _{"shuffling data"};
scoped_timer _{"index@ivf_index@shuffling_data"};
std::unordered_set<IdsType> deleted_ids_set(
deleted_ids.begin(), deleted_ids.end());

Expand Down
55 changes: 17 additions & 38 deletions src/include/index/ivf_pq_index.h
Original file line number Diff line number Diff line change
Expand Up @@ -805,7 +805,19 @@ class ivf_pq_index {
num_clusters_,
num_subspaces_);

write_group.set_num_partitions(num_partitions_);
if (write_group.get_all_ingestion_timestamps().size() == 1 &&
write_group.get_previous_ingestion_timestamp() == 0 &&
write_group.get_all_base_sizes().size() == 1 &&
write_group.get_previous_base_size() == 0) {
write_group.set_ingestion_timestamp(temporal_policy_.timestamp_end());
write_group.set_base_size(0);
write_group.set_num_partitions(num_partitions_);
} else {
write_group.append_ingestion_timestamp(temporal_policy_.timestamp_end());
write_group.append_base_size(0);
write_group.append_num_partitions(num_partitions_);
}

write_group.store_metadata();

// 4. Write the centroids.
Expand Down Expand Up @@ -945,7 +957,7 @@ class ivf_pq_index {
for (size_t partition_id = 0; partition_id < partitions; ++partition_id) {
auto slice = std::make_pair(
static_cast<int>(prev_index),
static_cast<int>(partial_indexes[i] - 1));
static_cast<int>(partial_indexes[i]) - 1);
if (slice.first <= slice.second &&
slice.first != std::numeric_limits<uint64_t>::max()) {
partition_slices[partition_id].push_back(slice);
Expand Down Expand Up @@ -1052,19 +1064,7 @@ class ivf_pq_index {
num_clusters_,
num_subspaces_);

if (write_group.get_all_ingestion_timestamps().size() == 1 &&
write_group.get_previous_ingestion_timestamp() == 0 &&
write_group.get_all_base_sizes().size() == 1 &&
write_group.get_previous_base_size() == 0) {
write_group.set_ingestion_timestamp(temporal_policy_.timestamp_end());
write_group.set_base_size(write_group.get_temp_size());
write_group.set_num_partitions(num_partitions_);
} else {
write_group.append_ingestion_timestamp(temporal_policy_.timestamp_end());
write_group.append_base_size(write_group.get_temp_size());
write_group.append_num_partitions(num_partitions_);
}

write_group.set_base_size(write_group.get_temp_size());
write_group.store_metadata();
}

Expand Down Expand Up @@ -1099,19 +1099,7 @@ class ivf_pq_index {
dimensions_,
num_clusters_,
num_subspaces_);
if (write_group.get_all_ingestion_timestamps().size() == 1 &&
write_group.get_previous_ingestion_timestamp() == 0 &&
write_group.get_all_base_sizes().size() == 1 &&
write_group.get_previous_base_size() == 0) {
write_group.set_ingestion_timestamp(temporal_policy_.timestamp_end());
write_group.set_base_size(::num_vectors(vectors));
write_group.set_num_partitions(num_partitions_);
} else {
write_group.append_ingestion_timestamp(temporal_policy_.timestamp_end());
write_group.append_base_size(::num_vectors(vectors));
write_group.append_num_partitions(num_partitions_);
}

write_group.set_base_size(::num_vectors(vectors));
write_group.store_metadata();
}

Expand Down Expand Up @@ -1360,7 +1348,6 @@ class ivf_pq_index {
auto&& [active_partitions, active_queries] =
detail::ivf::partition_ivf_flat_index<indices_type>(
flat_ivf_centroids_, query_vectors, nprobe, num_threads_);

auto query_to_pq_centroid_distance_tables =
std::move(*generate_query_to_pq_centroid_distance_tables<
Q,
Expand All @@ -1379,6 +1366,7 @@ class ivf_pq_index {
make_pq_distance_query_to_pq_centroid_distance_tables<
std::span<float>,
decltype(pq_storage_type{}[0])>());

return rerank(
std::move(initial_distances),
std::move(initial_ids),
Expand Down Expand Up @@ -1421,15 +1409,6 @@ class ivf_pq_index {
}
}

auto all_feature_vectors =
tdbColMajorMatrixWithIds<feature_type, id_type>(
group_->cached_ctx(),
group_->feature_vectors_uri(),
group_->ids_uri(),
100,
temporal_policy_);
all_feature_vectors.load();

auto feature_vectors =
tdbColMajorMatrixMultiRange<feature_type, uint64_t>(
group_->cached_ctx(),
Expand Down
3 changes: 1 addition & 2 deletions src/include/test/unit_api_ivf_pq_index.cc
Original file line number Diff line number Diff line change
Expand Up @@ -823,6 +823,7 @@ TEST_CASE("write and load index with timestamps", "[api_ivf_pq_index]") {
// We then load the trained index and don't set a timestamp (which means
// we'll load it at timestamp 99).
auto index = IndexIVFPQ(ctx, index_uri);

// Check that we can do finite and infinite queries and then train + write
// the index.
{
Expand Down Expand Up @@ -879,7 +880,6 @@ TEST_CASE("write and load index with timestamps", "[api_ivf_pq_index]") {
{{11, 11, 11}, {22, 22, 22}, {33, 33, 33}, {44, 44, 44}, {55, 55, 55}}};
auto&& [scores, ids] =
index.query(FeatureVectorArray(queries), top_k, nprobe);

check_single_vector_equals(
scores, ids, {0, 0, 0, 0, 0}, {11, 22, 33, 44, 55});

Expand Down Expand Up @@ -976,7 +976,6 @@ TEST_CASE("write and load index with timestamps", "[api_ivf_pq_index]") {
all_ingestion_timestamps.end(),
std::vector<uint64_t>{99, 100}.begin()));
}

// Load it at timestamp 5 (before ingestion) and make sure we can query and
// get fill values back.
for (auto upper_bound : std::vector<size_t>{0, 4}) {
Expand Down
177 changes: 87 additions & 90 deletions src/include/test/unit_ivf_pq_index.cc
Original file line number Diff line number Diff line change
Expand Up @@ -298,100 +298,100 @@ TEST_CASE(
//
#if 0
TEMPLATE_TEST_CASE(
"query stacked hypercube",
"[flativf_index]",
float,
uint8_t) {
size_t k_dist = GENERATE(0, 32);
size_t k_near = k_dist;
size_t k_far = k_dist;
"query stacked hypercube",
"[flativf_index]",
float,
uint8_t) {
size_t k_dist = GENERATE(0, 32);
size_t k_near = k_dist;
size_t k_far = k_dist;

auto hypercube0 = build_hypercube<TestType>(k_near, k_far, 0xdeadbeef);
auto hypercube1 = build_hypercube<TestType>(k_near, k_far, 0xbeefdead);
auto hypercube0 = build_hypercube<TestType>(k_near, k_far, 0xdeadbeef);
auto hypercube1 = build_hypercube<TestType>(k_near, k_far, 0xbeefdead);

auto hypercube2 = ColMajorMatrix<TestType>(6, num_vectors(hypercube0));
auto hypercube4 = ColMajorMatrix<TestType>(12, num_vectors(hypercube0));
auto hypercube2 = ColMajorMatrix<TestType>(6, num_vectors(hypercube0));
auto hypercube4 = ColMajorMatrix<TestType>(12, num_vectors(hypercube0));

std::vector<uint32_t> ids(num_vectors(hypercube0));
std::iota(begin(ids), end(ids), 0);
std::vector<uint32_t> ids(num_vectors(hypercube0));
std::iota(begin(ids), end(ids), 0);

for (size_t j = 0; j < 3; ++j) {
for (size_t i = 0; i < num_vectors(hypercube4); ++i) {
hypercube2(j, i) = hypercube0(j, i);
hypercube2(j + 3, i) = hypercube1(j, i);
for (size_t j = 0; j < 3; ++j) {
for (size_t i = 0; i < num_vectors(hypercube4); ++i) {
hypercube2(j, i) = hypercube0(j, i);
hypercube2(j + 3, i) = hypercube1(j, i);

hypercube4(j, i) = hypercube0(j, i);
hypercube4(j + 3, i) = hypercube1(j, i);
hypercube4(j + 6, i) = hypercube0(j, i);
hypercube4(j + 9, i) = hypercube1(j, i);
hypercube4(j, i) = hypercube0(j, i);
hypercube4(j + 3, i) = hypercube1(j, i);
hypercube4(j + 6, i) = hypercube0(j, i);
hypercube4(j + 9, i) = hypercube1(j, i);
}
}
}
SECTION("partitions = 1") {
size_t k_nn = 6;
size_t partitions = 1;
SECTION("partitions = 1") {
size_t k_nn = 6;
size_t partitions = 1;

auto ivf_idx2 = ivf_pq_index<TestType, uint32_t, uint32_t>(
/*128,*/ partitions, 2, 4, 1.e-4);
ivf_idx2.train_ivf(hypercube2);
ivf_idx2.add(hypercube2, ids);
auto ivf_idx4 = ivf_pq_index<TestType, uint32_t, uint32_t>(
/*128,*/ partitions, 2, 4, 1.e-4);
ivf_idx4.train_ivf(hypercube4);
ivf_idx4.add(hypercube4, ids);

auto top_k_ivf_scores = ColMajorMatrix<float>();
auto top_k_ivf = ColMajorMatrix<unsigned>();
auto top_k_scores = ColMajorMatrix<float>();
auto top_k = ColMajorMatrix<uint64_t>();
auto query2 = ColMajorMatrix<TestType>();
auto query4 = ColMajorMatrix<TestType>();

SECTION("query2/4 = 0...") {
query2 = ColMajorMatrix<TestType>{{0, 0, 0, 0, 0, 0}};
query4 = ColMajorMatrix<TestType>{{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}};
}
SECTION("query2/4 = 127...") {
query2 = ColMajorMatrix<TestType>{{127, 127, 127, 127, 127, 127}};
query4 = ColMajorMatrix<TestType>{
{127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127}};
}
SECTION("query2/4 = 0...") {
query2 = ColMajorMatrix<TestType>{{0, 0, 0, 127, 127, 127}};
query4 = ColMajorMatrix<TestType>{
{0, 0, 0, 0, 0, 0, 127, 127, 127, 127, 127, 127}};
}
SECTION("query2/4 = 127...") {
query2 = ColMajorMatrix<TestType>{{127, 127, 127, 0, 0, 0}};
query4 = ColMajorMatrix<TestType>{
{127, 127, 127, 127, 127, 127, 0, 0, 0, 0, 0, 0}};
}
SECTION("query2/4 = 127...") {
query2 = ColMajorMatrix<TestType>{
{127, 0, 127, 0, 127, 0}, {0, 127, 0, 127, 0, 127}};
query4 = ColMajorMatrix<TestType>{
{127, 0, 127, 0, 127, 0, 127, 0, 127, 0, 127, 0},
{0, 127, 0, 127, 0, 127, 0, 127, 0, 127, 0, 127}};
}

auto ivf_idx2 = ivf_pq_index<TestType, uint32_t, uint32_t>(
/*128,*/ partitions, 2, 4, 1.e-4);
ivf_idx2.train_ivf(hypercube2);
ivf_idx2.add(hypercube2, ids);
auto ivf_idx4 = ivf_pq_index<TestType, uint32_t, uint32_t>(
/*128,*/ partitions, 2, 4, 1.e-4);
ivf_idx4.train_ivf(hypercube4);
ivf_idx4.add(hypercube4, ids);

auto top_k_ivf_scores = ColMajorMatrix<float>();
auto top_k_ivf = ColMajorMatrix<unsigned>();
auto top_k_scores = ColMajorMatrix<float>();
auto top_k = ColMajorMatrix<uint64_t>();
auto query2 = ColMajorMatrix<TestType>();
auto query4 = ColMajorMatrix<TestType>();

SECTION("query2/4 = 0...") {
query2 = ColMajorMatrix<TestType>{{0, 0, 0, 0, 0, 0}};
query4 = ColMajorMatrix<TestType>{{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}};
}
SECTION("query2/4 = 127...") {
query2 = ColMajorMatrix<TestType>{{127, 127, 127, 127, 127, 127}};
query4 = ColMajorMatrix<TestType>{
{127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127}};
}
SECTION("query2/4 = 0...") {
query2 = ColMajorMatrix<TestType>{{0, 0, 0, 127, 127, 127}};
query4 = ColMajorMatrix<TestType>{
{0, 0, 0, 0, 0, 0, 127, 127, 127, 127, 127, 127}};
}
SECTION("query2/4 = 127...") {
query2 = ColMajorMatrix<TestType>{{127, 127, 127, 0, 0, 0}};
query4 = ColMajorMatrix<TestType>{
{127, 127, 127, 127, 127, 127, 0, 0, 0, 0, 0, 0}};
}
SECTION("query2/4 = 127...") {
query2 = ColMajorMatrix<TestType>{
{127, 0, 127, 0, 127, 0}, {0, 127, 0, 127, 0, 127}};
query4 = ColMajorMatrix<TestType>{
{127, 0, 127, 0, 127, 0, 127, 0, 127, 0, 127, 0},
{0, 127, 0, 127, 0, 127, 0, 127, 0, 127, 0, 127}};
}

std::tie(top_k_scores, top_k) = detail::flat::qv_query_heap(
hypercube2, query2, k_nn, 1, sum_of_squares_distance{});
std::tie(top_k_ivf_scores, top_k_ivf) =
ivf_idx2.query_infinite_ram(query2, k_nn, 1); // k, nprobe
size_t intersections0 = count_intersections(top_k_ivf, top_k, k_nn);
double recall0 = intersections0 / ((double)top_k.num_cols() * k_nn);
CHECK(intersections0 == k_nn * num_vectors(query2));
CHECK(recall0 == 1.0);

std::tie(top_k_scores, top_k) = detail::flat::qv_query_heap(
hypercube4, query4, k_nn, 1, sum_of_squares_distance{});
std::tie(top_k_ivf_scores, top_k_ivf) =
ivf_idx4.query_infinite_ram(query4, k_nn, 1); // k, nprobe

size_t intersections1 = (long)count_intersections(top_k_ivf, top_k, k_nn);
double recall1 = intersections1 / ((double)top_k.num_cols() * k_nn);
CHECK(intersections1 == k_nn * num_vectors(query4));
CHECK(recall1 == 1.0);
}
std::tie(top_k_scores, top_k) = detail::flat::qv_query_heap(
hypercube2, query2, k_nn, 1, sum_of_squares_distance{});
std::tie(top_k_ivf_scores, top_k_ivf) =
ivf_idx2.query_infinite_ram(query2, k_nn, 1); // k, nprobe
size_t intersections0 = count_intersections(top_k_ivf, top_k, k_nn);
double recall0 = intersections0 / ((double)top_k.num_cols() * k_nn);
CHECK(intersections0 == k_nn * num_vectors(query2));
CHECK(recall0 == 1.0);

std::tie(top_k_scores, top_k) = detail::flat::qv_query_heap(
hypercube4, query4, k_nn, 1, sum_of_squares_distance{});
std::tie(top_k_ivf_scores, top_k_ivf) =
ivf_idx4.query_infinite_ram(query4, k_nn, 1); // k, nprobe

size_t intersections1 = (long)count_intersections(top_k_ivf, top_k, k_nn);
double recall1 = intersections1 / ((double)top_k.num_cols() * k_nn);
CHECK(intersections1 == k_nn * num_vectors(query4));
CHECK(recall1 == 1.0);
}
}
#endif

Expand Down Expand Up @@ -485,8 +485,6 @@ TEST_CASE("query empty index", "[ivf_pq_index]") {
ctx, index_uri, dimensions, partitions, dimensions / 2);
ivf_pq_index<siftsmall_feature_type, siftsmall_ids_type> index(
ctx, index_uri);
// auto index = ivf_pq_index<siftsmall_feature_type, siftsmall_ids_type>(
// partitions, dimensions / 2);
auto queries =
ColMajorMatrix<siftsmall_feature_type>{{{1, 1, 1, 1, 1, 1, 1, 1, 1, 1}}};

Expand All @@ -496,7 +494,6 @@ TEST_CASE("query empty index", "[ivf_pq_index]") {
ColMajorMatrixWithIds<siftsmall_feature_type>(dimensions, num_vectors);
index.train(data);
index.ingest(data, data.raveled_ids());
// CHECK(index.num_vectors() == num_vectors);
}
// We can query an empty index.
{
Expand Down
Loading

0 comments on commit 0f36bc2

Please sign in to comment.