Skip to content

Commit

Permalink
support build sparse graph in ODescent (#345)
Browse files: browse the repository at this point in the history
Signed-off-by: jinjiabao.jjb <[email protected]>
  • Loading branch information
inabao authored Jan 22, 2025
1 parent cf44602 commit b28a8d0
Show file tree
Hide file tree
Showing 3 changed files with 99 additions and 54 deletions.
61 changes: 38 additions & 23 deletions src/impl/odescent_graph_builder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,12 +43,20 @@ class LinearCongruentialGenerator {
};

bool
ODescent::Build() {
ODescent::Build(const uint32_t* valid_ids, int64_t data_num) {
if (is_build_) {
return false;
}
is_build_ = true;
data_num_ = flatten_interface_->TotalCount();
valid_ids_ = valid_ids;
if (valid_ids_ != nullptr) {
data_num_ = data_num;
} else {
data_num_ = flatten_interface_->TotalCount();
}
if (data_num_ <= 1) {
throw std::runtime_error("ODescent cannot build a graph with data_num less than 1");
}
min_in_degree_ = std::min(min_in_degree_, data_num_ - 1);
Vector<std::mutex>(data_num_, allocator_).swap(points_lock_);
Vector<UnorderedSet<uint32_t>> old_neighbors(allocator_);
Expand Down Expand Up @@ -89,35 +97,23 @@ ODescent::SaveGraph(std::stringstream& out) {
// Note: at this point, either _nd == _max_points or any frozen points have
// been temporarily moved to _nd, so _nd + _num_frozen_points is the valid
// location limit.
auto final_graph = GetGraph();
for (uint32_t i = 0; i < data_num_; i++) {
uint32_t gk = (uint32_t)final_graph[i].size();
for (uint32_t i = 0; i < static_cast<uint32_t>(data_num_); i++) {
Vector<uint32_t> edges(allocator_);
edges.resize(graph[i].neighbors.size());
for (int j = 0; j < graph[i].neighbors.size(); ++j) {
edges[j] = graph[i].neighbors[j].id;
}
uint32_t gk = (uint32_t)edges.size();
out.write((char*)&gk, sizeof(uint32_t));
out.write((char*)final_graph[i].data(),
static_cast<std::streamsize>(gk * sizeof(uint32_t)));
max_degree =
final_graph[i].size() > max_degree ? (uint32_t)final_graph[i].size() : max_degree;
out.write((char*)edges.data(), static_cast<std::streamsize>(gk * sizeof(uint32_t)));
max_degree = edges.size() > max_degree ? (uint32_t)edges.size() : max_degree;
index_size += (size_t)(sizeof(uint32_t) * (gk + 1));
}
out.seekp(file_offset, out.beg);
out.write((char*)&index_size, sizeof(uint64_t));
out.write((char*)&max_degree, sizeof(uint32_t));
}

// Copy the neighbor ids of every node out of the internal graph.
//
// Returns one vector per node (data_num_ entries); entry i holds the `id` of each
// element of graph[i].neighbors, in the same order. All vectors are created with
// allocator_.
Vector<Vector<uint32_t>>
ODescent::GetGraph() {
    Vector<Vector<uint32_t>> extract_graph(allocator_);
    extract_graph.resize(data_num_, Vector<uint32_t>(allocator_));
    // data_num_ is assigned from an int64_t and size() is unsigned, so the counters
    // are int64_t / size_t rather than int to avoid truncation and signed/unsigned
    // comparisons.
    for (int64_t i = 0; i < data_num_; ++i) {
        const auto& neighbors = graph[i].neighbors;
        auto& edges = extract_graph[i];
        edges.resize(neighbors.size());
        for (size_t j = 0; j < neighbors.size(); ++j) {
            edges[j] = neighbors[j].id;
        }
    }

    return extract_graph;
}

void
ODescent::init_graph() {
graph.resize(data_num_, Linklist(allocator_));
Expand Down Expand Up @@ -375,4 +371,23 @@ ODescent::parallelize_task(std::function<void(int64_t, int64_t)> task) {
}
}

// Write the built graph into `graph_storage`, issuing one InsertNeighborsById call
// per node (data_num_ calls in total).
//
// When valid_ids_ is set, both the node index and every neighbor index are translated
// through valid_ids_ before being stored; otherwise the internal indices are stored
// unchanged.
void
ODescent::SaveGraph(GraphInterfacePtr& graph_storage) {
    // Single place that maps an internal index to the id written to storage.
    auto to_storage_id = [this](uint32_t inner_id) -> uint32_t {
        return valid_ids_ != nullptr ? valid_ids_[inner_id] : inner_id;
    };

    // data_num_ is assigned from an int64_t and size() is unsigned, so the counters
    // are int64_t / size_t rather than int to avoid truncation and signed/unsigned
    // comparisons.
    for (int64_t i = 0; i < data_num_; ++i) {
        const auto& neighbors = graph[i].neighbors;
        Vector<uint32_t> edges(allocator_);
        edges.resize(neighbors.size());
        for (size_t j = 0; j < neighbors.size(); ++j) {
            edges[j] = to_storage_id(neighbors[j].id);
        }
        uint32_t id = to_storage_id(static_cast<uint32_t>(i));
        graph_storage->InsertNeighborsById(id, edges);
    }
}

} // namespace vsag
14 changes: 10 additions & 4 deletions src/impl/odescent_graph_builder.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@
#include <vector>

#include "data_cell/flatten_datacell.h"
#include "data_cell/graph_datacell.h"
#include "data_cell/sparse_graph_datacell.h"
#include "logger.h"
#include "safe_allocator.h"
#include "simd/simd.h"
Expand Down Expand Up @@ -69,7 +71,6 @@ struct Linklist {
: neighbors(allocator), greast_neighbor_distance(std::numeric_limits<float>::max()) {
}
};

class ODescent {
public:
ODescent(int64_t max_degree,
Expand All @@ -93,17 +94,20 @@ class ODescent {
}

bool
Build();
Build(const uint32_t* valid_ids = nullptr, int64_t data_num = 0);

void
SaveGraph(std::stringstream& out);

Vector<Vector<uint32_t>>
GetGraph();
void
SaveGraph(GraphInterfacePtr& graph_storage);

private:
// Distance between two points addressed by builder-local indices.
// If valid_ids_ is set, each index is first translated through valid_ids_;
// the resulting pair is handed to flatten_interface_->ComputePairVectors.
inline float
get_distance(uint32_t loc1, uint32_t loc2) {
    const bool remap = (valid_ids_ != nullptr);
    const uint32_t first = remap ? valid_ids_[loc1] : loc1;
    const uint32_t second = remap ? valid_ids_[loc2] : loc2;
    return flatten_interface_->ComputePairVectors(first, second);
}

Expand Down Expand Up @@ -145,6 +149,8 @@ class ODescent {
Vector<std::mutex> points_lock_;
SafeThreadPool* thread_pool_;

const uint32_t* valid_ids_{nullptr};

bool pruning_{true};
float sample_rate_{0.3};
Allocator* allocator_;
Expand Down
78 changes: 51 additions & 27 deletions src/impl/odescent_graph_builder_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,13 @@
#include "odescent_graph_builder.h"

#include <catch2/catch_test_macros.hpp>
#include <catch2/generators/catch_generators.hpp>
#include <filesystem>
#include <set>

#include "data_cell/flatten_interface.h"
#include "data_cell/graph_interface.h"
#include "fixtures.h"
#include "io/memory_io_parameter.h"
#include "quantization/fp32_quantizer_parameter.h"
#include "safe_allocator.h"
Expand Down Expand Up @@ -47,38 +50,36 @@ calculate_overlap(const std::vector<uint32_t>& vec1, vsag::Vector<uint32_t>& vec
return intersection.size();
}

TEST_CASE("build nndescent", "[ut][nndescent]") {
int64_t num_vectors = 2000;
TEST_CASE("build nndescent", "[ut][odescent]") {
auto num_vectors = GENERATE(2, 4, 11, 2000);
size_t dim = 128;
int64_t max_degree = 32;
auto partial_data = GENERATE(true, false);

auto vectors = new float[dim * num_vectors];

std::mt19937 rng;
rng.seed(47);
std::uniform_real_distribution<> distrib_real;
for (int64_t i = 0; i < dim * num_vectors; ++i) {
vectors[i] = distrib_real(rng);
}

std::vector<std::vector<std::pair<float, uint32_t>>> ground_truths(num_vectors);
auto [ids, vectors] = fixtures::generate_ids_and_vectors(num_vectors, dim);
// prepare common param
vsag::IndexCommonParam param;
param.dim_ = dim;
param.metric_ = vsag::MetricType::METRIC_TYPE_L2SQR;
param.data_type_ = vsag::DataTypes::DATA_TYPE_FLOAT;
param.allocator_ = vsag::SafeAllocator::FactoryDefaultAllocator();
param.thread_pool_ = vsag::SafeThreadPool::FactoryDefaultThreadPool();

// prepare data param
vsag::FlattenDataCellParamPtr flatten_param =
std::make_shared<vsag::FlattenDataCellParameter>();
flatten_param->quantizer_parameter_ = std::make_shared<vsag::FP32QuantizerParameter>();
flatten_param->io_parameter_ = std::make_shared<vsag::MemoryIOParameter>();
vsag::FlattenInterfacePtr flatten_interface_ptr =
vsag::FlattenInterface::MakeInstance(flatten_param, param);
flatten_interface_ptr->Train(vectors, num_vectors);
flatten_interface_ptr->BatchInsertVector(vectors, num_vectors);

vsag::DatasetPtr dataset = vsag::Dataset::Make();
dataset->NumElements(num_vectors)->Float32Vectors(vectors)->Dim(dim)->Owner(true);
flatten_interface_ptr->Train(vectors.data(), num_vectors);
flatten_interface_ptr->BatchInsertVector(vectors.data(), num_vectors);

// prepare graph param
vsag::GraphDataCellParamPtr graph_param_ptr = std::make_shared<vsag::GraphDataCellParameter>();
graph_param_ptr->io_parameter_ = std::make_shared<vsag::MemoryIOParameter>();
graph_param_ptr->max_degree_ = partial_data ? 2 * max_degree : max_degree;
// build graph
vsag::ODescent graph(max_degree,
1,
30,
Expand All @@ -87,23 +88,46 @@ TEST_CASE("build nndescent", "[ut][nndescent]") {
param.allocator_.get(),
param.thread_pool_.get(),
false);
graph.Build();
std::shared_ptr<uint32_t[]> valid_ids = nullptr;
if (partial_data) {
num_vectors /= 2;
valid_ids.reset(new uint32_t[num_vectors]);
for (int i = 0; i < num_vectors; ++i) {
valid_ids[i] = 2 * i;
}
}
if (num_vectors <= 1) {
REQUIRE_THROWS(graph.Build(valid_ids.get(), num_vectors));
return;
}
graph.Build(valid_ids.get(), num_vectors);

auto extract_graph = graph.GetGraph();
// check result
vsag::GraphInterfacePtr graph_interface = nullptr;
graph_interface = vsag::GraphInterface::MakeInstance(graph_param_ptr, param, partial_data);
graph.SaveGraph(graph_interface);

float hit_edge_count = 0;
int64_t indeed_max_degree = std::min(max_degree, (int64_t)num_vectors - 1);
auto id_map = [&](uint32_t id) -> uint32_t { return partial_data ? valid_ids[id] : id; };
for (int i = 0; i < num_vectors; ++i) {
std::vector<std::pair<float, uint32_t>> ground_truths;
uint32_t i_id = id_map(i);
for (int j = 0; j < num_vectors; ++j) {
if (i != j) {
ground_truths[i].emplace_back(flatten_interface_ptr->ComputePairVectors(i, j), j);
uint32_t j_id = id_map(j);
if (i_id != j_id) {
ground_truths.emplace_back(flatten_interface_ptr->ComputePairVectors(i_id, j_id),
j_id);
}
}
std::sort(ground_truths[i].begin(), ground_truths[i].end());
std::vector<uint32_t> truths_edges(max_degree);
for (int j = 0; j < max_degree; ++j) {
truths_edges[j] = ground_truths[i][j].second;
std::sort(ground_truths.begin(), ground_truths.end());
std::vector<uint32_t> truths_edges(indeed_max_degree);
for (int j = 0; j < indeed_max_degree; ++j) {
truths_edges[j] = ground_truths[j].second;
}
hit_edge_count += calculate_overlap(truths_edges, extract_graph[i], max_degree);
vsag::Vector<uint32_t> edges(param.allocator_.get());
graph_interface->GetNeighbors(i_id, edges);
hit_edge_count += calculate_overlap(truths_edges, edges, indeed_max_degree);
}
REQUIRE(hit_edge_count / (num_vectors * max_degree) > 0.95);
REQUIRE(hit_edge_count / (num_vectors * indeed_max_degree) > 0.95);
}

0 comments on commit b28a8d0

Please sign in to comment.