Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use ef as a function parameter #1

Merged
merged 6 commits into from
Jan 30, 2024
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 0 additions & 3 deletions examples/python/example.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,6 @@

p.init_index(max_elements=num_elements//2, ef_construction=100, M=16)

# Controlling the recall by setting ef:
# higher ef leads to better accuracy, but slower search
p.set_ef(10)

# Set number of threads used during batch search/construction
# By default using all available cores
Expand Down
3 changes: 0 additions & 3 deletions examples/python/example_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,6 @@

hnsw_index.init_index(max_elements=num_elements, ef_construction=100, M=16)

# Controlling the recall by setting ef:
# higher ef leads to better accuracy, but slower search
hnsw_index.set_ef(10)

# Set number of threads used during batch search/construction
# By default using all available cores
Expand Down
3 changes: 0 additions & 3 deletions examples/python/example_replace_deleted.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,6 @@
# Enable replacing of deleted elements
hnsw_index.init_index(max_elements=max_num_elements, ef_construction=200, M=16, allow_replace_deleted=True)

# Controlling the recall by setting ef:
# higher ef leads to better accuracy, but slower search
hnsw_index.set_ef(10)

# Set number of threads used during batch search/construction
# By default using all available cores
Expand Down
3 changes: 0 additions & 3 deletions examples/python/example_serialization.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,6 @@

p.init_index(max_elements=num_elements//2, ef_construction=100, M=16)

# Controlling the recall by setting ef:
# higher ef leads to better accuracy, but slower search
p.set_ef(10)

# Set number of threads used during batch search/construction
# By default using all available cores
Expand Down
6 changes: 2 additions & 4 deletions examples/python/pyw_hnswlib.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,6 @@ def add_items(self, data, ids=None):
start += 1
self.index.add_items(data=data, ids=np.asarray(int_labels))

def set_ef(self, ef):
self.index.set_ef(ef)

def load_index(self, path):
self.index.load_index(path)
Expand All @@ -55,8 +53,8 @@ def save_index(self, path):
def set_num_threads(self, num_threads):
self.index.set_num_threads(num_threads)

def knn_query(self, data, k=1):
labels_int, distances = self.index.knn_query(data=data, k=k)
def knn_query(self, data, k=1, ef=10):
labels_int, distances = self.index.knn_query(data=data, k=k, ef=ef)
labels = []
for li in labels_int:
labels.append(
Expand Down
2 changes: 1 addition & 1 deletion hnswlib/bruteforce.h
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ class BruteforceSearch : public AlgorithmInterface<dist_t> {


std::priority_queue<std::pair<dist_t, labeltype >>
searchKnn(const void *query_data, size_t k, BaseFilterFunctor* isIdAllowed = nullptr) const {
searchKnn(const void *query_data, size_t k, BaseFilterFunctor* isIdAllowed = nullptr, const size_t ef_ = 10) const {
assert(k <= cur_element_count);
std::priority_queue<std::pair<dist_t, labeltype >> topResults;
if (cur_element_count == 0) return topResults;
Expand Down
11 changes: 1 addition & 10 deletions hnswlib/hnswalg.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@ class HierarchicalNSW : public AlgorithmInterface<dist_t> {
size_t maxM_{0};
size_t maxM0_{0};
size_t ef_construction_{0};
size_t ef_{ 0 };

double mult_{0.0}, revSize_{0.0};
int maxlevel_{0};
Expand Down Expand Up @@ -107,7 +106,6 @@ class HierarchicalNSW : public AlgorithmInterface<dist_t> {
maxM_ = M_;
maxM0_ = M_ * 2;
ef_construction_ = std::max(ef_construction, M_);
ef_ = 10;

level_generator_.seed(random_seed);
update_probability_generator_.seed(random_seed + 1);
Expand Down Expand Up @@ -157,12 +155,6 @@ class HierarchicalNSW : public AlgorithmInterface<dist_t> {
}
};


void setEf(size_t ef) {
ef_ = ef;
}


inline std::mutex& getLabelOpMutex(labeltype label) const {
// calculate hash
size_t lock_id = label & (MAX_LABEL_OPERATION_LOCKS - 1);
Expand Down Expand Up @@ -694,7 +686,6 @@ class HierarchicalNSW : public AlgorithmInterface<dist_t> {
throw std::runtime_error("Not enough memory: loadIndex failed to allocate linklists");
element_levels_ = std::vector<int>(max_elements);
revSize_ = 1.0 / mult_;
ef_ = 10;
for (size_t i = 0; i < cur_element_count; i++) {
label_lookup_[getExternalLabel(i)] = i;
unsigned int linkListSize;
Expand Down Expand Up @@ -1175,7 +1166,7 @@ class HierarchicalNSW : public AlgorithmInterface<dist_t> {


std::priority_queue<std::pair<dist_t, labeltype >>
searchKnn(const void *query_data, size_t k, BaseFilterFunctor* isIdAllowed = nullptr) const {
searchKnn(const void *query_data, size_t k, BaseFilterFunctor* isIdAllowed = nullptr, const size_t ef_ = 10) const {
std::priority_queue<std::pair<dist_t, labeltype >> result;
if (cur_element_count == 0) return result;

Expand Down
8 changes: 4 additions & 4 deletions hnswlib/hnswlib.h
Original file line number Diff line number Diff line change
Expand Up @@ -161,11 +161,11 @@ class AlgorithmInterface {
virtual void addPoint(const void *datapoint, labeltype label, bool replace_deleted = false) = 0;

virtual std::priority_queue<std::pair<dist_t, labeltype>>
searchKnn(const void*, size_t, BaseFilterFunctor* isIdAllowed = nullptr) const = 0;
searchKnn(const void*, size_t, BaseFilterFunctor* isIdAllowed = nullptr, const size_t ef_ = 10) const = 0;

// Return the k nearest neighbors in closest-first order
virtual std::vector<std::pair<dist_t, labeltype>>
searchKnnCloserFirst(const void* query_data, size_t k, BaseFilterFunctor* isIdAllowed = nullptr) const;
searchKnnCloserFirst(const void* query_data, size_t k, BaseFilterFunctor* isIdAllowed = nullptr, const size_t ef_ = 10) const;

virtual void saveIndex(const std::string &location) = 0;
virtual ~AlgorithmInterface(){
Expand All @@ -175,11 +175,11 @@ class AlgorithmInterface {
template<typename dist_t>
std::vector<std::pair<dist_t, labeltype>>
AlgorithmInterface<dist_t>::searchKnnCloserFirst(const void* query_data, size_t k,
BaseFilterFunctor* isIdAllowed) const {
BaseFilterFunctor* isIdAllowed, const size_t ef_) const {
std::vector<std::pair<dist_t, labeltype>> result;

// here searchKnn returns the result in the order of further first
auto ret = searchKnn(query_data, k, isIdAllowed);
auto ret = searchKnn(query_data, k, isIdAllowed, ef_);
{
size_t sz = ret.size();
result.resize(sz);
Expand Down
9 changes: 2 additions & 7 deletions python_bindings/LazyIndex.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,20 +24,15 @@ def get_items(self, ids=None):
if self.max_elements==0:
return []
return super().get_items(ids)
def knn_query(self, data,k=1, num_threads=-1):
def knn_query(self, data,k=1, num_threads=-1, ef=10):
if self.max_elements==0:
return [], []
return super().knn_query(data, k, num_threads)
return super().knn_query(data, k, num_threads, ef)
def resize_index(self, size):
if self.max_elements==0:
return self.init_index(size)
else:
return super().resize_index(size)
def set_ef(self, ef):
if self.max_elements==0:
self.init_ef_construction=ef
return
super().set_ef(ef)
def get_max_elements(self):
return self.max_elements
def get_current_count(self):
Expand Down
45 changes: 11 additions & 34 deletions python_bindings/bindings.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,6 @@ class Index {
std::string space_name;
int dim;
size_t seed;
size_t default_ef;

bool index_inited;
bool ep_added;
Expand All @@ -177,8 +176,6 @@ class Index {
ep_added = true;
index_inited = false;
num_threads_default = std::thread::hardware_concurrency();

default_ef = 10;
}


Expand All @@ -202,18 +199,9 @@ class Index {
appr_alg = new hnswlib::HierarchicalNSW<dist_t>(l2space, maxElements, M, efConstruction, random_seed, allow_replace_deleted);
index_inited = true;
ep_added = false;
appr_alg->ef_ = default_ef;
seed = random_seed;
}


void set_ef(size_t ef) {
default_ef = ef;
if (appr_alg)
appr_alg->ef_ = ef;
}


void set_num_threads(int num_threads) {
this->num_threads_default = num_threads;
}
Expand Down Expand Up @@ -412,7 +400,6 @@ class Index {
"M"_a = appr_alg->M_,
"mult"_a = appr_alg->mult_,
"ef_construction"_a = appr_alg->ef_construction_,
"ef"_a = appr_alg->ef_,
"has_deletions"_a = (bool)appr_alg->num_deleted_,
"size_links_per_element"_a = appr_alg->size_links_per_element_,
"allow_replace_deleted"_a = appr_alg->allow_replace_deleted_,
Expand Down Expand Up @@ -462,8 +449,7 @@ class Index {
"seed"_a = seed);

if (index_inited == false)
return py::dict(**params, "ef"_a = default_ef);

return py::dict(**params);
auto ann_params = getAnnData();

return py::dict(**params, **ann_params);
Expand Down Expand Up @@ -497,7 +483,6 @@ class Index {
new_index->index_inited = index_inited_;
new_index->ep_added = d["ep_added"].cast<bool>();
new_index->num_threads_default = d["num_threads"].cast<int>();
new_index->default_ef = d["ef"].cast<size_t>();

if (index_inited_)
new_index->setAnnData(d);
Expand Down Expand Up @@ -532,7 +517,6 @@ class Index {
assert_true(appr_alg->mult_ == d["mult"].cast<double>(), "Invalid value of mult_ ");
assert_true(appr_alg->ef_construction_ == d["ef_construction"].cast<size_t>(), "Invalid value of ef_construction_ ");

appr_alg->ef_ = d["ef"].cast<size_t>();

assert_true(appr_alg->size_links_per_element_ == d["size_links_per_element"].cast<size_t>(), "Invalid value of size_links_per_element_ ");

Expand Down Expand Up @@ -601,7 +585,8 @@ class Index {
py::object input,
size_t k = 1,
int num_threads = -1,
const std::function<bool(hnswlib::labeltype)>& filter = nullptr) {
const std::function<bool(hnswlib::labeltype)>& filter = nullptr,
const size_t ef = 10) {
py::array_t < dist_t, py::array::c_style | py::array::forcecast > items(input);
auto buffer = items.request();
hnswlib::labeltype* data_numpy_l;
Expand Down Expand Up @@ -630,7 +615,7 @@ class Index {
if (normalize == false) {
ParallelFor(0, rows, num_threads, [&](size_t row, size_t threadId) {
std::priority_queue<std::pair<dist_t, hnswlib::labeltype >> result = appr_alg->searchKnn(
(void*)items.data(row), k, p_idFilter);
(void*)items.data(row), k, p_idFilter, ef);
if (result.size() != k)
throw std::runtime_error(
"Cannot return the results in a contigious 2D array. Probably ef or M is too small");
Expand All @@ -650,7 +635,7 @@ class Index {
normalize_vector((float*)items.data(row), (norm_array.data() + start_idx));

std::priority_queue<std::pair<dist_t, hnswlib::labeltype >> result = appr_alg->searchKnn(
(void*)(norm_array.data() + start_idx), k, p_idFilter);
(void*)(norm_array.data() + start_idx), k, p_idFilter, ef);
if (result.size() != k)
throw std::runtime_error(
"Cannot return the results in a contigious 2D array. Probably ef or M is too small");
Expand Down Expand Up @@ -820,7 +805,8 @@ class BFIndex {
py::object knnQuery_return_numpy(
py::object input,
size_t k = 1,
const std::function<bool(hnswlib::labeltype)>& filter = nullptr) {
const std::function<bool(hnswlib::labeltype)>& filter = nullptr,
const size_t ef = 10) {
py::array_t < dist_t, py::array::c_style | py::array::forcecast > items(input);
auto buffer = items.request();
hnswlib::labeltype *data_numpy_l;
Expand All @@ -839,7 +825,7 @@ class BFIndex {

for (size_t row = 0; row < rows; row++) {
std::priority_queue<std::pair<dist_t, hnswlib::labeltype >> result = alg->searchKnn(
(void *) items.data(row), k, p_idFilter);
(void *) items.data(row), k, p_idFilter, ef);
for (int i = k - 1; i >= 0; i--) {
auto &result_tuple = result.top();
data_numpy_d[row * k + i] = result_tuple.first;
Expand Down Expand Up @@ -893,7 +879,8 @@ PYBIND11_PLUGIN(hnswlib) {
py::arg("data"),
py::arg("k") = 1,
py::arg("num_threads") = -1,
py::arg("filter") = py::none())
py::arg("filter") = py::none(),
py::arg("ef") = 10)
.def("add_items",
&Index<float>::addItems,
py::arg("data"),
Expand All @@ -902,7 +889,6 @@ PYBIND11_PLUGIN(hnswlib) {
py::arg("replace_deleted") = false)
.def("get_items", &Index<float, float>::getDataReturnList, py::arg("ids") = py::none())
.def("get_ids_list", &Index<float>::getIdsList)
.def("set_ef", &Index<float>::set_ef, py::arg("ef"))
.def("set_num_threads", &Index<float>::set_num_threads, py::arg("num_threads"))
.def("save_index", &Index<float>::saveIndex, py::arg("path_to_index"))
.def("load_index",
Expand All @@ -918,15 +904,6 @@ PYBIND11_PLUGIN(hnswlib) {
.def_readonly("space", &Index<float>::space_name)
.def_readonly("dim", &Index<float>::dim)
.def_readwrite("num_threads", &Index<float>::num_threads_default)
.def_property("ef",
[](const Index<float> & index) {
return index.index_inited ? index.appr_alg->ef_ : index.default_ef;
},
[](Index<float> & index, const size_t ef_) {
index.default_ef = ef_;
if (index.appr_alg)
index.appr_alg->ef_ = ef_;
})
.def_property_readonly("max_elements", [](const Index<float> & index) {
return index.index_inited ? index.appr_alg->max_elements_ : 0;
})
Expand Down Expand Up @@ -957,7 +934,7 @@ PYBIND11_PLUGIN(hnswlib) {
py::class_<BFIndex<float>>(m, "BFIndex")
.def(py::init<const std::string &, const int>(), py::arg("space"), py::arg("dim"))
.def("init_index", &BFIndex<float>::init_new_index, py::arg("max_elements"))
.def("knn_query", &BFIndex<float>::knnQuery_return_numpy, py::arg("data"), py::arg("k") = 1, py::arg("filter") = py::none())
.def("knn_query", &BFIndex<float>::knnQuery_return_numpy, py::arg("data"), py::arg("k") = 1, py::arg("filter") = py::none(), py::arg("ef") = 10)
.def("add_items", &BFIndex<float>::addItems, py::arg("data"), py::arg("ids") = py::none())
.def("delete_vector", &BFIndex<float>::deleteVector, py::arg("label"))
.def("save_index", &BFIndex<float>::saveIndex, py::arg("path_to_index"))
Expand Down
1 change: 0 additions & 1 deletion tests/cpp/sift_1b.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -219,7 +219,6 @@ test_vs_recall(
efs.push_back(i);
}
for (size_t ef : efs) {
appr_alg.setEf(ef);
StopW stopw = StopW();

float recall = test_approx(massQ, vecsize, qsize, appr_alg, vecdim, answers, k);
Expand Down
8 changes: 4 additions & 4 deletions tests/cpp/sift_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -89,12 +89,13 @@ float test_approx(
HierarchicalNSW<float> &appr_alg,
size_t vecdim,
vector<std::priority_queue<std::pair<float, labeltype>>> &answers,
size_t k) {
size_t k,
size_t ef = 10) {
size_t correct = 0;
size_t total = 0;
//#pragma omp parallel for
for (int i = 0; i < qsize; i++) {
std::priority_queue<std::pair<float, labeltype >> result = appr_alg.searchKnn(massQ + vecdim * i, 10);
std::priority_queue<std::pair<float, labeltype >> result = appr_alg.searchKnn(massQ + vecdim * i, 10, nullptr, ef);
std::priority_queue<std::pair<float, labeltype >> gt(answers[i]);
unordered_set<labeltype> g;
total += gt.size();
Expand Down Expand Up @@ -131,10 +132,9 @@ void test_vs_recall(
efs.push_back(i);
}*/
for (size_t ef : efs) {
appr_alg.setEf(ef);
StopW stopw = StopW();

float recall = test_approx(massQ, vecsize, qsize, appr_alg, vecdim, answers, k);
float recall = test_approx(massQ, vecsize, qsize, appr_alg, vecdim, answers, k, ef);
float time_us_per_query = stopw.getElapsedTimeMicro() / qsize;
cout << ef << "\t" << recall << "\t" << time_us_per_query << " us\n";
if (recall > 1.0) {
Expand Down
8 changes: 3 additions & 5 deletions tests/cpp/updates_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -106,12 +106,12 @@ std::vector<datatype> load_batch(std::string path, int size) {
template <typename d_type>
static float
test_approx(std::vector<float> &queries, size_t qsize, hnswlib::HierarchicalNSW<d_type> &appr_alg, size_t vecdim,
std::vector<std::unordered_set<hnswlib::labeltype>> &answers, size_t K) {
std::vector<std::unordered_set<hnswlib::labeltype>> &answers, size_t K, size_t ef) {
size_t correct = 0;
size_t total = 0;

for (int i = 0; i < qsize; i++) {
std::priority_queue<std::pair<d_type, hnswlib::labeltype>> result = appr_alg.searchKnn((char *)(queries.data() + vecdim * i), K);
std::priority_queue<std::pair<d_type, hnswlib::labeltype>> result = appr_alg.searchKnn((char *)(queries.data() + vecdim * i), K, nullptr, ef);
total += K;
while (result.size()) {
if (answers[i].find(result.top().second) != answers[i].end()) {
Expand Down Expand Up @@ -148,13 +148,11 @@ test_vs_recall(

bool test_passed = false;
for (size_t ef : efs) {
appr_alg.setEf(ef);

appr_alg.metric_hops = 0;
appr_alg.metric_distance_computations = 0;
StopW stopw = StopW();

float recall = test_approx<float>(queries, qsize, appr_alg, vecdim, answers, k);
float recall = test_approx<float>(queries, qsize, appr_alg, vecdim, answers, k, ef);
float time_us_per_query = stopw.getElapsedTimeMicro() / qsize;
float distance_comp_per_query = appr_alg.metric_distance_computations / (1.0f * qsize);
float hops_per_query = appr_alg.metric_hops / (1.0f * qsize);
Expand Down
Loading
Loading