From 89abb5ddb931c0b3610e9ea6d3665f3197056f5c Mon Sep 17 00:00:00 2001 From: ozanarmagan Date: Mon, 29 Jan 2024 14:25:37 +0300 Subject: [PATCH 1/7] Use `ef` as a function parameter --- examples/python/example.py | 3 -- examples/python/example_filter.py | 3 -- examples/python/example_replace_deleted.py | 3 -- examples/python/example_serialization.py | 3 -- examples/python/pyw_hnswlib.py | 6 +-- hnswlib/bruteforce.h | 2 +- hnswlib/hnswalg.h | 11 +---- hnswlib/hnswlib.h | 2 +- python_bindings/LazyIndex.py | 9 +--- python_bindings/bindings.cpp | 45 +++++-------------- tests/cpp/sift_1b.cpp | 1 - tests/cpp/sift_test.cpp | 8 ++-- tests/cpp/updates_test.cpp | 8 ++-- tests/python/bindings_test.py | 3 -- tests/python/bindings_test_filter.py | 4 -- tests/python/bindings_test_getdata.py | 3 -- tests/python/bindings_test_labels.py | 15 +++---- tests/python/bindings_test_metadata.py | 3 -- tests/python/bindings_test_pickle.py | 16 ++----- tests/python/bindings_test_recall.py | 5 +-- tests/python/bindings_test_replace.py | 13 +++--- tests/python/bindings_test_resize.py | 7 +-- tests/python/bindings_test_spaces.py | 2 - .../python/bindings_test_stress_mt_replace.py | 7 ++- tests/python/speedtest.py | 6 +-- 25 files changed, 46 insertions(+), 142 deletions(-) diff --git a/examples/python/example.py b/examples/python/example.py index 3d6d7477..5d360ee7 100644 --- a/examples/python/example.py +++ b/examples/python/example.py @@ -32,9 +32,6 @@ p.init_index(max_elements=num_elements//2, ef_construction=100, M=16) -# Controlling the recall by setting ef: -# higher ef leads to better accuracy, but slower search -p.set_ef(10) # Set number of threads used during batch search/construction # By default using all available cores diff --git a/examples/python/example_filter.py b/examples/python/example_filter.py index add22a3d..396d5a21 100644 --- a/examples/python/example_filter.py +++ b/examples/python/example_filter.py @@ -25,9 +25,6 @@ hnsw_index.init_index(max_elements=num_elements, ef_construction=100, M=16) -# Controlling the recall by setting ef: -# higher ef leads to better accuracy, but slower search -hnsw_index.set_ef(10) # Set number of threads used during batch search/construction # By default using all available cores diff --git a/examples/python/example_replace_deleted.py b/examples/python/example_replace_deleted.py index 3c0b62e7..f309046e 100644 --- a/examples/python/example_replace_deleted.py +++ b/examples/python/example_replace_deleted.py @@ -32,9 +32,6 @@ # Enable replacing of deleted elements hnsw_index.init_index(max_elements=max_num_elements, ef_construction=200, M=16, allow_replace_deleted=True) -# Controlling the recall by setting ef: -# higher ef leads to better accuracy, but slower search -hnsw_index.set_ef(10) # Set number of threads used during batch search/construction # By default using all available cores diff --git a/examples/python/example_serialization.py b/examples/python/example_serialization.py index 76ca1436..a560eb8a 100644 --- a/examples/python/example_serialization.py +++ b/examples/python/example_serialization.py @@ -33,9 +33,6 @@ p.init_index(max_elements=num_elements//2, ef_construction=100, M=16) -# Controlling the recall by setting ef: -# higher ef leads to better accuracy, but slower search -p.set_ef(10) # Set number of threads used during batch search/construction # By default using all available cores diff --git a/examples/python/pyw_hnswlib.py b/examples/python/pyw_hnswlib.py index 0ccfbc5e..cb64bbee 100644 --- a/examples/python/pyw_hnswlib.py +++ b/examples/python/pyw_hnswlib.py @@ -39,8 +39,6 @@ def add_items(self, data, ids=None): start += 1 self.index.add_items(data=data, ids=np.asarray(int_labels)) - def set_ef(self, ef): - self.index.set_ef(ef) def load_index(self, path): self.index.load_index(path) @@ -55,8 +53,8 @@ def save_index(self, path): def set_num_threads(self, num_threads): self.index.set_num_threads(num_threads) - def knn_query(self, data, k=1): - labels_int, distances = self.index.knn_query(data=data, k=k) + def knn_query(self, data, k=1, ef=10): + labels_int, distances = self.index.knn_query(data=data, k=k, ef=ef) labels = [] for li in labels_int: labels.append( diff --git a/hnswlib/bruteforce.h b/hnswlib/bruteforce.h index 30b33ae9..36c99158 100644 --- a/hnswlib/bruteforce.h +++ b/hnswlib/bruteforce.h @@ -98,7 +98,7 @@ class BruteforceSearch : public AlgorithmInterface { std::priority_queue> - searchKnn(const void *query_data, size_t k, BaseFilterFunctor* isIdAllowed = nullptr) const { + searchKnn(const void *query_data, size_t k, BaseFilterFunctor* isIdAllowed = nullptr, const size_t ef_ = 10) const { assert(k <= cur_element_count); std::priority_queue> topResults; if (cur_element_count == 0) return topResults; diff --git a/hnswlib/hnswalg.h b/hnswlib/hnswalg.h index f7d7f264..21f1af0e 100644 --- a/hnswlib/hnswalg.h +++ b/hnswlib/hnswalg.h @@ -28,7 +28,6 @@ class HierarchicalNSW : public AlgorithmInterface { size_t maxM_{0}; size_t maxM0_{0}; size_t ef_construction_{0}; - size_t ef_{ 0 }; double mult_{0.0}, revSize_{0.0}; int maxlevel_{0}; @@ -107,7 +106,6 @@ class HierarchicalNSW : public AlgorithmInterface { maxM_ = M_; maxM0_ = M_ * 2; ef_construction_ = std::max(ef_construction, M_); - ef_ = 10; level_generator_.seed(random_seed); update_probability_generator_.seed(random_seed + 1); @@ -157,12 +155,6 @@ class HierarchicalNSW : public AlgorithmInterface { } }; - - void setEf(size_t ef) { - ef_ = ef; - } - - inline std::mutex& getLabelOpMutex(labeltype label) const { // calculate hash size_t lock_id = label & (MAX_LABEL_OPERATION_LOCKS - 1); @@ -694,7 +686,6 @@ class HierarchicalNSW : public AlgorithmInterface { throw std::runtime_error("Not enough memory: loadIndex failed to allocate linklists"); element_levels_ = std::vector(max_elements); revSize_ = 1.0 / mult_; - ef_ = 10; for (size_t i = 0; i < cur_element_count; i++) { label_lookup_[getExternalLabel(i)] = i; unsigned int linkListSize; @@ -1175,7 +1166,7 @@ class HierarchicalNSW : public AlgorithmInterface { std::priority_queue> - searchKnn(const void *query_data, size_t k, BaseFilterFunctor* isIdAllowed = nullptr) const { + searchKnn(const void *query_data, size_t k, BaseFilterFunctor* isIdAllowed = nullptr, const size_t ef_ = 10) const { std::priority_queue> result; if (cur_element_count == 0) return result; diff --git a/hnswlib/hnswlib.h b/hnswlib/hnswlib.h index fb7118fa..9ebf9119 100644 --- a/hnswlib/hnswlib.h +++ b/hnswlib/hnswlib.h @@ -161,7 +161,7 @@ class AlgorithmInterface { virtual void addPoint(const void *datapoint, labeltype label, bool replace_deleted = false) = 0; virtual std::priority_queue> - searchKnn(const void*, size_t, BaseFilterFunctor* isIdAllowed = nullptr) const = 0; + searchKnn(const void*, size_t, BaseFilterFunctor* isIdAllowed = nullptr, const size_t ef_ = 10) const = 0; // Return k nearest neighbor in the order of closer fist virtual std::vector> diff --git a/python_bindings/LazyIndex.py b/python_bindings/LazyIndex.py index dbaa4673..5b9a57f1 100644 --- a/python_bindings/LazyIndex.py +++ b/python_bindings/LazyIndex.py @@ -24,20 +24,15 @@ def get_items(self, ids=None): if self.max_elements==0: return [] return super().get_items(ids) - def knn_query(self, data,k=1, num_threads=-1): + def knn_query(self, data,k=1, num_threads=-1, ef=10): if self.max_elements==0: return [], [] - return super().knn_query(data, k, num_threads) + return super().knn_query(data, k, num_threads, ef) def resize_index(self, size): if self.max_elements==0: return self.init_index(size) else: return super().resize_index(size) - def set_ef(self, ef): - if self.max_elements==0: - self.init_ef_construction=ef - return - super().set_ef(ef) def get_max_elements(self): return self.max_elements def get_current_count(self): diff --git a/python_bindings/bindings.cpp b/python_bindings/bindings.cpp index 5153bb58..2723e0ed 100644 --- a/python_bindings/bindings.cpp +++ b/python_bindings/bindings.cpp @@ -150,7 +150,6 @@ class Index { std::string space_name; int dim; size_t seed; - size_t default_ef; bool index_inited; bool ep_added; @@ -177,8 +176,6 @@ class Index { ep_added = true; index_inited = false; num_threads_default = std::thread::hardware_concurrency(); - - default_ef = 10; } @@ -202,18 +199,9 @@ class Index { appr_alg = new hnswlib::HierarchicalNSW(l2space, maxElements, M, efConstruction, random_seed, allow_replace_deleted); index_inited = true; ep_added = false; - appr_alg->ef_ = default_ef; seed = random_seed; } - - void set_ef(size_t ef) { - default_ef = ef; - if (appr_alg) - appr_alg->ef_ = ef; - } - - void set_num_threads(int num_threads) { this->num_threads_default = num_threads; } @@ -412,7 +400,6 @@ class Index { "M"_a = appr_alg->M_, "mult"_a = appr_alg->mult_, "ef_construction"_a = appr_alg->ef_construction_, - "ef"_a = appr_alg->ef_, "has_deletions"_a = (bool)appr_alg->num_deleted_, "size_links_per_element"_a = appr_alg->size_links_per_element_, "allow_replace_deleted"_a = appr_alg->allow_replace_deleted_, @@ -462,8 +449,7 @@ class Index { "seed"_a = seed); if (index_inited == false) - return py::dict(**params, "ef"_a = default_ef); - + return py::dict(**params); auto ann_params = getAnnData(); return py::dict(**params, **ann_params); @@ -497,7 +483,6 @@ class Index { new_index->index_inited = index_inited_; new_index->ep_added = d["ep_added"].cast(); new_index->num_threads_default = d["num_threads"].cast(); - new_index->default_ef = d["ef"].cast(); if (index_inited_) new_index->setAnnData(d); @@ -532,7 +517,6 @@ class Index { assert_true(appr_alg->mult_ == d["mult"].cast(), "Invalid value of mult_ "); assert_true(appr_alg->ef_construction_ == d["ef_construction"].cast(), "Invalid value of ef_construction_ "); - appr_alg->ef_ = d["ef"].cast(); assert_true(appr_alg->size_links_per_element_ == d["size_links_per_element"].cast(), "Invalid value of size_links_per_element_ "); @@ -601,7 +585,8 @@ class Index { py::object input, size_t k = 1, int num_threads = -1, - const std::function& filter = nullptr) { + const std::function& filter = nullptr, + const size_t ef = 10) { py::array_t < dist_t, py::array::c_style | py::array::forcecast > items(input); auto buffer = items.request(); hnswlib::labeltype* data_numpy_l; @@ -630,7 +615,7 @@ class Index { if (normalize == false) { ParallelFor(0, rows, num_threads, [&](size_t row, size_t threadId) { std::priority_queue> result = appr_alg->searchKnn( - (void*)items.data(row), k, p_idFilter); + (void*)items.data(row), k, p_idFilter, ef); if (result.size() != k) throw std::runtime_error( "Cannot return the results in a contigious 2D array. Probably ef or M is too small"); @@ -650,7 +635,7 @@ class Index { normalize_vector((float*)items.data(row), (norm_array.data() + start_idx)); std::priority_queue> result = appr_alg->searchKnn( - (void*)(norm_array.data() + start_idx), k, p_idFilter); + (void*)(norm_array.data() + start_idx), k, p_idFilter, ef); if (result.size() != k) throw std::runtime_error( "Cannot return the results in a contigious 2D array. Probably ef or M is too small"); @@ -820,7 +805,8 @@ class BFIndex { py::object knnQuery_return_numpy( py::object input, size_t k = 1, - const std::function& filter = nullptr) { + const std::function& filter = nullptr, + const size_t ef = 10) { py::array_t < dist_t, py::array::c_style | py::array::forcecast > items(input); auto buffer = items.request(); hnswlib::labeltype *data_numpy_l; @@ -839,7 +825,7 @@ class BFIndex { for (size_t row = 0; row < rows; row++) { std::priority_queue> result = alg->searchKnn( - (void *) items.data(row), k, p_idFilter); + (void *) items.data(row), k, p_idFilter, ef); for (int i = k - 1; i >= 0; i--) { auto &result_tuple = result.top(); data_numpy_d[row * k + i] = result_tuple.first; @@ -893,7 +879,8 @@ PYBIND11_PLUGIN(hnswlib) { py::arg("data"), py::arg("k") = 1, py::arg("num_threads") = -1, - py::arg("filter") = py::none()) + py::arg("filter") = py::none(), + py::arg("ef") = 10) .def("add_items", &Index::addItems, py::arg("data"), @@ -902,7 +889,6 @@ PYBIND11_PLUGIN(hnswlib) { py::arg("replace_deleted") = false) .def("get_items", &Index::getDataReturnList, py::arg("ids") = py::none()) .def("get_ids_list", &Index::getIdsList) - .def("set_ef", &Index::set_ef, py::arg("ef")) .def("set_num_threads", &Index::set_num_threads, py::arg("num_threads")) .def("save_index", &Index::saveIndex, py::arg("path_to_index")) .def("load_index", @@ -918,15 +904,6 @@ PYBIND11_PLUGIN(hnswlib) { .def_readonly("space", &Index::space_name) .def_readonly("dim", &Index::dim) .def_readwrite("num_threads", &Index::num_threads_default) - .def_property("ef", - [](const Index & index) { - return index.index_inited ? index.appr_alg->ef_ : index.default_ef; - }, - [](Index & index, const size_t ef_) { - index.default_ef = ef_; - if (index.appr_alg) - index.appr_alg->ef_ = ef_; - }) .def_property_readonly("max_elements", [](const Index & index) { return index.index_inited ? index.appr_alg->max_elements_ : 0; }) @@ -957,7 +934,7 @@ PYBIND11_PLUGIN(hnswlib) { py::class_>(m, "BFIndex") .def(py::init(), py::arg("space"), py::arg("dim")) .def("init_index", &BFIndex::init_new_index, py::arg("max_elements")) - .def("knn_query", &BFIndex::knnQuery_return_numpy, py::arg("data"), py::arg("k") = 1, py::arg("filter") = py::none()) + .def("knn_query", &BFIndex::knnQuery_return_numpy, py::arg("data"), py::arg("k") = 1, py::arg("filter") = py::none(), py::arg("ef") = 10) .def("add_items", &BFIndex::addItems, py::arg("data"), py::arg("ids") = py::none()) .def("delete_vector", &BFIndex::deleteVector, py::arg("label")) .def("save_index", &BFIndex::saveIndex, py::arg("path_to_index")) diff --git a/tests/cpp/sift_1b.cpp b/tests/cpp/sift_1b.cpp index 43777ff6..46a55551 100644 --- a/tests/cpp/sift_1b.cpp +++ b/tests/cpp/sift_1b.cpp @@ -219,7 +219,6 @@ test_vs_recall( efs.push_back(i); } for (size_t ef : efs) { - appr_alg.setEf(ef); StopW stopw = StopW(); float recall = test_approx(massQ, vecsize, qsize, appr_alg, vecdim, answers, k); diff --git a/tests/cpp/sift_test.cpp b/tests/cpp/sift_test.cpp index decdf605..31871367 100644 --- a/tests/cpp/sift_test.cpp +++ b/tests/cpp/sift_test.cpp @@ -89,12 +89,13 @@ float test_approx( HierarchicalNSW &appr_alg, size_t vecdim, vector>> &answers, - size_t k) { + size_t k, + size_t ef = 10) { size_t correct = 0; size_t total = 0; //#pragma omp parallel for for (int i = 0; i < qsize; i++) { - std::priority_queue> result = appr_alg.searchKnn(massQ + vecdim * i, 10); + std::priority_queue> result = appr_alg.searchKnn(massQ + vecdim * i, 10, nullptr, ef); std::priority_queue> gt(answers[i]); unordered_set g; total += gt.size(); @@ -131,10 +132,9 @@ void test_vs_recall( efs.push_back(i); }*/ for (size_t ef : efs) { - appr_alg.setEf(ef); StopW stopw = StopW(); - float recall = test_approx(massQ, vecsize, qsize, appr_alg, vecdim, answers, k); + float recall = test_approx(massQ, vecsize, qsize, appr_alg, vecdim, answers, k, ef); float time_us_per_query = stopw.getElapsedTimeMicro() / qsize; cout << ef << "\t" << recall << "\t" << time_us_per_query << " us\n"; if (recall > 1.0) { diff --git a/tests/cpp/updates_test.cpp b/tests/cpp/updates_test.cpp index 52e1fa14..d9b7c96a 100644 --- a/tests/cpp/updates_test.cpp +++ b/tests/cpp/updates_test.cpp @@ -106,12 +106,12 @@ std::vector load_batch(std::string path, int size) { template static float test_approx(std::vector &queries, size_t qsize, hnswlib::HierarchicalNSW &appr_alg, size_t vecdim, - std::vector> &answers, size_t K) { + std::vector> &answers, size_t K, size_t ef) { size_t correct = 0; size_t total = 0; for (int i = 0; i < qsize; i++) { - std::priority_queue> result = appr_alg.searchKnn((char *)(queries.data() + vecdim * i), K); + std::priority_queue> result = appr_alg.searchKnn((char *)(queries.data() + vecdim * i), K, nullptr, ef); total += K; while (result.size()) { if (answers[i].find(result.top().second) != answers[i].end()) { @@ -148,13 +148,11 @@ test_vs_recall( bool test_passed = false; for (size_t ef : efs) { - appr_alg.setEf(ef); - appr_alg.metric_hops = 0; appr_alg.metric_distance_computations = 0; StopW stopw = StopW(); - float recall = test_approx(queries, qsize, appr_alg, vecdim, answers, k); + float recall = test_approx(queries, qsize, appr_alg, vecdim, answers, k, ef); float time_us_per_query = stopw.getElapsedTimeMicro() / qsize; float distance_comp_per_query = appr_alg.metric_distance_computations / (1.0f * qsize); float hops_per_query = appr_alg.metric_hops / (1.0f * qsize); diff --git a/tests/python/bindings_test.py b/tests/python/bindings_test.py index f9b3092f..d496f199 100644 --- a/tests/python/bindings_test.py +++ b/tests/python/bindings_test.py @@ -28,9 +28,6 @@ def testRandomSelf(self): p.init_index(max_elements=num_elements, ef_construction=100, M=16) - # Controlling the recall by setting ef: - # higher ef leads to better accuracy, but slower search - p.set_ef(10) p.set_num_threads(4) # by default using all available cores diff --git a/tests/python/bindings_test_filter.py b/tests/python/bindings_test_filter.py index 480c8dcd..460f05ae 100644 --- a/tests/python/bindings_test_filter.py +++ b/tests/python/bindings_test_filter.py @@ -30,10 +30,6 @@ def testRandomSelf(self): hnsw_index.init_index(max_elements=num_elements, ef_construction=100, M=16) bf_index.init_index(max_elements=num_elements) - # Controlling the recall by setting ef: - # higher ef leads to better accuracy, but slower search - hnsw_index.set_ef(10) - hnsw_index.set_num_threads(4) # by default using all available cores print("Adding %d elements" % (len(data))) diff --git a/tests/python/bindings_test_getdata.py b/tests/python/bindings_test_getdata.py index 515ecebd..01ed6aa5 100644 --- a/tests/python/bindings_test_getdata.py +++ b/tests/python/bindings_test_getdata.py @@ -29,9 +29,6 @@ def testGettingItems(self): p.init_index(max_elements=num_elements, ef_construction=100, M=16) - # Controlling the recall by setting ef: - # higher ef leads to better accuracy, but slower search - p.set_ef(100) p.set_num_threads(4) # by default using all available cores diff --git a/tests/python/bindings_test_labels.py b/tests/python/bindings_test_labels.py index 524a24d5..d73bb017 100644 --- a/tests/python/bindings_test_labels.py +++ b/tests/python/bindings_test_labels.py @@ -31,10 +31,6 @@ def testRandomSelf(self): p.init_index(max_elements=num_elements, ef_construction=100, M=16) - # Controlling the recall by setting ef: - # higher ef leads to better accuracy, but slower search - p.set_ef(100) - p.set_num_threads(4) # by default using all available cores # We split the data in two batches: @@ -73,13 +69,12 @@ def testRandomSelf(self): print("\nLoading index from '%s'\n" % index_path) p.load_index(index_path) - p.set_ef(100) print("Adding the second batch of %d elements" % (len(data2))) p.add_items(data2) # Query the elements for themselves and measure recall: - labels, distances = p.knn_query(data, k=1) + labels, distances = p.knn_query(data, k=1, ef=100) items = p.get_items(labels) # Check the recall: @@ -94,7 +89,7 @@ def testRandomSelf(self): self.assertEqual(np.sum(~np.asarray(sorted_labels) == np.asarray(range(num_elements))), 0) # Delete data1 - labels1_deleted, _ = p.knn_query(data1, k=1) + labels1_deleted, _ = p.knn_query(data1, k=1, ef=100) # delete probable duplicates from nearest neighbors labels1_deleted_no_dup = set(labels1_deleted.flatten()) for l in labels1_deleted_no_dup: @@ -116,9 +111,9 @@ def testRandomSelf(self): p.save_index(del_index_path) p = hnswlib.Index(space='l2', dim=dim) p.load_index(del_index_path) - p.set_ef(100) - labels1_after, _ = p.knn_query(data1, k=1) + + labels1_after, _ = p.knn_query(data1, k=1, ef=100) for la in labels1_after: if la[0] in labels1_deleted_no_dup: print(f"Found deleted label {la[0]} during knn search after index loading") @@ -127,7 +122,7 @@ def testRandomSelf(self): # Unmark deleted data for l in labels1_deleted_no_dup: p.unmark_deleted(l) - labels_restored, _ = p.knn_query(data1, k=1) + labels_restored, _ = p.knn_query(data1, k=1, ef=100) self.assertAlmostEqual(np.mean(labels_restored.reshape(-1) == np.arange(len(data1))), 1.0, 3) print("All the data in data1 are restored") diff --git a/tests/python/bindings_test_metadata.py b/tests/python/bindings_test_metadata.py index 69dce03d..997390b7 100644 --- a/tests/python/bindings_test_metadata.py +++ b/tests/python/bindings_test_metadata.py @@ -27,9 +27,6 @@ def testMetadata(self): p.init_index(max_elements=num_elements, ef_construction=100, M=16) - # Controlling the recall by setting ef: - # higher ef leads to better accuracy, but slower search - p.set_ef(100) p.set_num_threads(4) # by default using all available cores diff --git a/tests/python/bindings_test_pickle.py b/tests/python/bindings_test_pickle.py index 1fa0e822..1ba97260 100644 --- a/tests/python/bindings_test_pickle.py +++ b/tests/python/bindings_test_pickle.py @@ -64,8 +64,6 @@ def test_space_main(self, space, dim): p.init_index(max_elements=self.num_elements, ef_construction=self.ef_construction, M=self.M) p0.init_index(max_elements=self.num_elements, ef_construction=self.ef_construction, M=self.M) - p.ef = self.ef - p0.ef = self.ef p1 = pickle.loads(pickle.dumps(p)) # pickle Index before adding items @@ -81,10 +79,10 @@ def test_space_main(self, space, dim): self.assertTrue(np.allclose(p1.get_items(), p2.get_items()), "items for p1 and p2 must be same") # Test if returned distances are same - l, d = p.knn_query(test_data, k=self.k) - l0, d0 = p0.knn_query(test_data, k=self.k) - l1, d1 = p1.knn_query(test_data, k=self.k) - l2, d2 = p2.knn_query(test_data, k=self.k) + l, d = p.knn_query(test_data, k=self.k, ef=400) + l0, d0 = p0.knn_query(test_data, k=self.k, ef=400) + l1, d1 = p1.knn_query(test_data, k=self.k, ef=400) + l2, d2 = p2.knn_query(test_data, k=self.k, ef=400) self.assertLessEqual(np.sum(((d-d0)**2.) > 1e-3), self.dists_err_thresh, msg=f"knn distances returned by p and p0 must match") self.assertLessEqual(np.sum(((d0-d1)**2.) > 1e-3), self.dists_err_thresh, msg=f"knn distances returned by p0 and p1 must match") @@ -102,11 +100,6 @@ def test_space_main(self, space, dim): total_thresh=self.item_err_thresh, dists_thresh=self.dists_err_thresh) - # Check ef parameter value - self.assertEqual(p.ef, self.ef, "incorrect value of p.ef") - self.assertEqual(p0.ef, self.ef, "incorrect value of p0.ef") - self.assertEqual(p2.ef, self.ef, "incorrect value of p2.ef") - self.assertEqual(p1.ef, self.ef, "incorrect value of p1.ef") # Check M parameter value self.assertEqual(p.M, self.M, "incorrect value of p.M") @@ -126,7 +119,6 @@ class PickleUnitTests(unittest.TestCase): def setUp(self): self.ef_construction = 200 self.M = 32 - self.ef = 400 self.num_elements = 1000 self.num_test_elements = 100 diff --git a/tests/python/bindings_test_recall.py b/tests/python/bindings_test_recall.py index 2190ba45..eac88e7f 100644 --- a/tests/python/bindings_test_recall.py +++ b/tests/python/bindings_test_recall.py @@ -34,9 +34,6 @@ def testRandomSelf(self): hnsw_index.init_index(max_elements=num_elements, ef_construction=200, M=16) bf_index.init_index(max_elements=num_elements) - # Controlling the recall for hnsw by setting ef: - # higher ef leads to better accuracy, but slower search - hnsw_index.set_ef(200) # Set number of threads used during batch search/construction in hnsw # By default using all available cores @@ -52,7 +49,7 @@ def testRandomSelf(self): query_data = np.float32(np.random.random((num_queries, dim))) # Query the elements and measure recall: - labels_hnsw, distances_hnsw = hnsw_index.knn_query(query_data, k) + labels_hnsw, distances_hnsw = hnsw_index.knn_query(query_data, k, ef=200) labels_bf, distances_bf = bf_index.knn_query(query_data, k) # Measure recall diff --git a/tests/python/bindings_test_replace.py b/tests/python/bindings_test_replace.py index 80003a3a..bd886a28 100644 --- a/tests/python/bindings_test_replace.py +++ b/tests/python/bindings_test_replace.py @@ -46,7 +46,6 @@ def testRandomSelf(self): hnsw_index = hnswlib.Index(space='l2', dim=dim) hnsw_index.init_index(max_elements=max_num_elements, ef_construction=200, M=16, allow_replace_deleted=True) - hnsw_index.set_ef(100) hnsw_index.set_num_threads(4) # Add batch 1 and 2 @@ -57,18 +56,18 @@ def testRandomSelf(self): # Delete nearest neighbors of batch 2 print("Deleting neighbors of batch 2") - labels2_deleted, _ = hnsw_index.knn_query(data2, k=1) + labels2_deleted, _ = hnsw_index.knn_query(data2, k=1, ef=100) # delete probable duplicates from nearest neighbors labels2_deleted_no_dup = set(labels2_deleted.flatten()) num_duplicates = len(labels2_deleted) - len(labels2_deleted_no_dup) for l in labels2_deleted_no_dup: hnsw_index.mark_deleted(l) - labels1_found, _ = hnsw_index.knn_query(data1, k=1) + labels1_found, _ = hnsw_index.knn_query(data1, k=1, ef=100) items = hnsw_index.get_items(labels1_found) diff_with_gt_labels = np.mean(np.abs(data1 - items)) self.assertAlmostEqual(diff_with_gt_labels, 0, delta=1e-3) - labels2_after, _ = hnsw_index.knn_query(data2, k=1) + labels2_after, _ = hnsw_index.knn_query(data2, k=1, ef=100) for la in labels2_after: if la[0] in labels2_deleted_no_dup: print(f"Found deleted label {la[0]} during knn search") @@ -125,7 +124,7 @@ def testRandomSelf(self): # Check recall print("Checking recall") - labels_found, _ = hnsw_index.knn_query(data4_tr, k=1) + labels_found, _ = hnsw_index.knn_query(data4_tr, k=1, ef=100) recall = np.mean(labels_found.reshape(-1) == labels4_tr) print(f"Recall for the 4 batch: {recall}") self.assertGreater(recall, recall_threshold) @@ -144,7 +143,7 @@ def testRandomSelf(self): # Check recall print("Checking recall") - labels_found, _ = hnsw_index_pckl.knn_query(data3_tr, k=1) + labels_found, _ = hnsw_index_pckl.knn_query(data3_tr, k=1, ef=100) recall = np.mean(labels_found.reshape(-1) == labels3_tr) print(f"Recall for the 3 batch: {recall}") self.assertGreater(recall, recall_threshold) @@ -195,9 +194,7 @@ def test_recall_degradation(self): bf_index = hnswlib.BFIndex(space='l2', dim=dim) bf_index.init_index(max_elements=max_num_elements) - hnsw_index_no_replace.set_ef(100) hnsw_index_no_replace.set_num_threads(50) - hnsw_index_with_replace.set_ef(100) hnsw_index_with_replace.set_num_threads(50) # Add data diff --git a/tests/python/bindings_test_resize.py b/tests/python/bindings_test_resize.py index b5bceeb1..1c276abe 100644 --- a/tests/python/bindings_test_resize.py +++ b/tests/python/bindings_test_resize.py @@ -30,9 +30,6 @@ def testRandomSelf(self): p.init_index(max_elements=num_elements//2, ef_construction=100, M=16) - # Controlling the recall by setting ef: - # higher ef leads to better accuracy, but slower search - p.set_ef(20) p.set_num_threads(idx % 8) # by default using all available cores @@ -44,7 +41,7 @@ def testRandomSelf(self): p.add_items(data1) # Query the elements for themselves and measure recall: - labels, distances = p.knn_query(data1, k=1) + labels, distances = p.knn_query(data1, k=1, ef=20) items = p.get_items(list(range(len(data1)))) @@ -62,7 +59,7 @@ def testRandomSelf(self): p.add_items(data2) # Query the elements for themselves and measure recall: - labels, distances = p.knn_query(data, k=1) + labels, distances = p.knn_query(data, k=1, ef=20) items=p.get_items(list(range(num_elements))) # Check the recall: diff --git a/tests/python/bindings_test_spaces.py b/tests/python/bindings_test_spaces.py index c3cceb87..99fa02c4 100644 --- a/tests/python/bindings_test_spaces.py +++ b/tests/python/bindings_test_spaces.py @@ -27,8 +27,6 @@ def testRandomSelf(self): p = hnswlib.Index(space=space, dim=dim) p.init_index(max_elements=5, ef_construction=100, M=16) - p.set_ef(10) - p.add_items(data2) # Query the elements for themselves and measure recall: diff --git a/tests/python/bindings_test_stress_mt_replace.py b/tests/python/bindings_test_stress_mt_replace.py index 8cd3e9bc..bca66013 100644 --- a/tests/python/bindings_test_stress_mt_replace.py +++ b/tests/python/bindings_test_stress_mt_replace.py @@ -33,7 +33,6 @@ def testRandomSelf(self): hnsw_index = hnswlib.Index(space='l2', dim=dim) hnsw_index.init_index(max_elements=max_num_elements, ef_construction=200, M=16, allow_replace_deleted=True) - hnsw_index.set_ef(100) hnsw_index.set_num_threads(50) # Add batch 1 and 2 @@ -41,18 +40,18 @@ def testRandomSelf(self): hnsw_index.add_items(data2, labels2) # maximum number of elements is reached # Delete nearest neighbors of batch 2 - labels2_deleted, _ = hnsw_index.knn_query(data2, k=1) + labels2_deleted, _ = hnsw_index.knn_query(data2, k=1, ef=100) labels2_deleted_flat = labels2_deleted.flatten() # delete probable duplicates from nearest neighbors labels2_deleted_no_dup = set(labels2_deleted_flat) for l in labels2_deleted_no_dup: hnsw_index.mark_deleted(l) - labels1_found, _ = hnsw_index.knn_query(data1, k=1) + labels1_found, _ = hnsw_index.knn_query(data1, k=1, ef=100) items = hnsw_index.get_items(labels1_found) diff_with_gt_labels = np.mean(np.abs(data1 - items)) self.assertAlmostEqual(diff_with_gt_labels, 0, delta=1e-3) - labels2_after, _ = hnsw_index.knn_query(data2, k=1) + labels2_after, _ = hnsw_index.knn_query(data2, k=1, ef=100) labels2_after_flat = labels2_after.flatten() common = np.intersect1d(labels2_after_flat, labels2_deleted_flat) self.assertTrue(common.size == 0) diff --git a/tests/python/speedtest.py b/tests/python/speedtest.py index 8d16cfc3..6582bee6 100644 --- a/tests/python/speedtest.py +++ b/tests/python/speedtest.py @@ -28,9 +28,6 @@ p.init_index(max_elements=num_elements, ef_construction=60, M=16) -# Controlling the recall by setting ef: -# higher ef leads to better accuracy, but slower search -p.set_ef(10) # Set number of threads used during batch search/construction # By default using all available cores @@ -45,13 +42,12 @@ p.set_num_threads(threads) times=[] time.sleep(1) -p.set_ef(15) for _ in range(1): # p.load_index(index_path) for _ in range(3): t0=time.time() qdata=data[:5000*threads] - labels, distances = p.knn_query(qdata, k=1) + labels, distances = p.knn_query(qdata, k=1, ef=15) tt=time.time()-t0 times.append(tt) recall=np.sum(labels.reshape(-1)==np.arange(len(qdata)))/len(qdata) From af3de3a97474278352cddfc0046fe25f88eafb39 Mon Sep 17 00:00:00 2001 From: ozanarmagan Date: Tue, 30 Jan 2024 02:00:07 +0300 Subject: [PATCH 2/7] Update searchKnnCloserFirst method signature --- hnswlib/hnswlib.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/hnswlib/hnswlib.h b/hnswlib/hnswlib.h index 9ebf9119..c6d9ce6b 100644 --- a/hnswlib/hnswlib.h +++ b/hnswlib/hnswlib.h @@ -165,7 +165,7 @@ class AlgorithmInterface { // Return k nearest neighbor in the order of closer fist virtual std::vector> - searchKnnCloserFirst(const void* query_data, size_t k, BaseFilterFunctor* isIdAllowed = nullptr) const; + searchKnnCloserFirst(const void* query_data, size_t k, BaseFilterFunctor* isIdAllowed = nullptr, const size_t ef_ = 10) const; virtual void saveIndex(const std::string &location) = 0; virtual ~AlgorithmInterface(){ @@ -175,11 +175,11 @@ class AlgorithmInterface { template std::vector> AlgorithmInterface::searchKnnCloserFirst(const void* query_data, size_t k, - BaseFilterFunctor* isIdAllowed) const { + BaseFilterFunctor* isIdAllowed, const size_t ef_) const { std::vector> result; // here searchKnn returns the result in the order of further first - auto ret = searchKnn(query_data, k, isIdAllowed); + auto ret = searchKnn(query_data, k, isIdAllowed, ef_); { size_t sz = ret.size(); result.resize(sz); From 53e276ddf8d8dcbc6cc21a5fcb59320e5db75ecf Mon Sep 17 00:00:00 2001 From: ozanarmagan Date: Tue, 30 Jan 2024 09:55:07 +0300 Subject: [PATCH 3/7] Remove set_ef call from example --- examples/python/example_search.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/examples/python/example_search.py b/examples/python/example_search.py index 4581843b..201e2d3f 100644 --- a/examples/python/example_search.py +++ b/examples/python/example_search.py @@ -23,11 +23,9 @@ # Element insertion (can be called several times): p.add_items(data, ids) -# Controlling the recall by setting ef: -p.set_ef(50) # ef should always be > k # Query dataset, k - number of the closest elements (returns 2 numpy arrays) -labels, distances = p.knn_query(data, k=1) +labels, distances = p.knn_query(data, k=1, ef=50) # Index objects support pickling # WARNING: serialization via pickle.dumps(p) or p.__getstate__() is NOT thread-safe with p.add_items method! From 63ddcdd560a86c513821797b2b861bb3520b87c4 Mon Sep 17 00:00:00 2001 From: ozanarmagan Date: Tue, 30 Jan 2024 09:57:38 +0300 Subject: [PATCH 4/7] Remove ef parameter check from Index --- examples/python/example_search.py | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/python/example_search.py b/examples/python/example_search.py index 201e2d3f..3b27cf54 100644 --- a/examples/python/example_search.py +++ b/examples/python/example_search.py @@ -36,4 +36,3 @@ print(f"Parameters passed to constructor: space={p_copy.space}, dim={p_copy.dim}") print(f"Index construction: M={p_copy.M}, ef_construction={p_copy.ef_construction}") print(f"Index size is {p_copy.element_count} and index capacity is {p_copy.max_elements}") -print(f"Search speed/quality trade-off parameter: ef={p_copy.ef}") From 94cd1b30c32f6d0aa6c20c197006c176b4a28055 Mon Sep 17 00:00:00 2001 From: ozanarmagan Date: Tue, 30 Jan 2024 10:14:09 +0300 Subject: [PATCH 5/7] Revert all the changes. --- examples/python/example.py | 3 ++ examples/python/example_filter.py | 3 ++ examples/python/example_replace_deleted.py | 3 ++ examples/python/example_search.py | 5 ++- examples/python/example_serialization.py | 3 ++ examples/python/pyw_hnswlib.py | 6 ++- hnswlib/bruteforce.h | 2 +- hnswlib/hnswalg.h | 11 ++++- hnswlib/hnswlib.h | 8 ++-- python_bindings/LazyIndex.py | 9 +++- python_bindings/bindings.cpp | 45 ++++++++++++++----- tests/cpp/sift_1b.cpp | 1 + tests/cpp/sift_test.cpp | 8 ++-- tests/cpp/updates_test.cpp | 8 ++-- tests/python/bindings_test.py | 3 ++ tests/python/bindings_test_filter.py | 4 ++ tests/python/bindings_test_getdata.py | 3 ++ tests/python/bindings_test_labels.py | 15 ++++--- tests/python/bindings_test_metadata.py | 3 ++ tests/python/bindings_test_pickle.py | 16 +++++-- tests/python/bindings_test_recall.py | 5 ++- tests/python/bindings_test_replace.py | 13 +++--- tests/python/bindings_test_resize.py | 7 ++- tests/python/bindings_test_spaces.py | 2 + .../python/bindings_test_stress_mt_replace.py | 7 +-- tests/python/speedtest.py | 6 ++- 26 files changed, 149 insertions(+), 50 deletions(-) diff --git a/examples/python/example.py b/examples/python/example.py index 5d360ee7..3d6d7477 100644 --- a/examples/python/example.py +++ b/examples/python/example.py @@ -32,6 +32,9 @@ p.init_index(max_elements=num_elements//2, ef_construction=100, M=16) +# Controlling the recall by setting ef: +# higher ef leads to better accuracy, but slower search +p.set_ef(10) # Set number of threads used during batch search/construction # By default using all available cores diff --git a/examples/python/example_filter.py b/examples/python/example_filter.py index 396d5a21..add22a3d 100644 --- a/examples/python/example_filter.py +++ b/examples/python/example_filter.py @@ -25,6 +25,9 @@ hnsw_index.init_index(max_elements=num_elements, ef_construction=100, M=16) +# Controlling the recall by setting ef: +# higher ef leads to better accuracy, but slower search +hnsw_index.set_ef(10) # Set number of threads used during batch search/construction # By default using all available cores diff --git a/examples/python/example_replace_deleted.py b/examples/python/example_replace_deleted.py index f309046e..3c0b62e7 100644 --- a/examples/python/example_replace_deleted.py +++ b/examples/python/example_replace_deleted.py @@ -32,6 +32,9 @@ # Enable replacing of deleted elements hnsw_index.init_index(max_elements=max_num_elements, ef_construction=200, M=16, allow_replace_deleted=True) +# Controlling the recall by setting ef: +# higher ef leads to better accuracy, but slower search +hnsw_index.set_ef(10) # Set number of threads used during batch search/construction # By default using all available cores diff --git a/examples/python/example_search.py b/examples/python/example_search.py index 3b27cf54..4581843b 100644 --- a/examples/python/example_search.py +++ b/examples/python/example_search.py @@ -23,9 +23,11 @@ # Element insertion (can be called several times): p.add_items(data, ids) +# Controlling the recall by setting ef: +p.set_ef(50) # ef should always be > k # Query dataset, k - number of the closest elements (returns 2 numpy arrays) -labels, distances = p.knn_query(data, k=1, ef=50) +labels, distances = p.knn_query(data, k=1) # Index objects support pickling # WARNING: serialization via pickle.dumps(p) or p.__getstate__() is NOT thread-safe with p.add_items method! @@ -36,3 +38,4 @@ print(f"Parameters passed to constructor: space={p_copy.space}, dim={p_copy.dim}") print(f"Index construction: M={p_copy.M}, ef_construction={p_copy.ef_construction}") print(f"Index size is {p_copy.element_count} and index capacity is {p_copy.max_elements}") +print(f"Search speed/quality trade-off parameter: ef={p_copy.ef}") diff --git a/examples/python/example_serialization.py b/examples/python/example_serialization.py index a560eb8a..76ca1436 100644 --- a/examples/python/example_serialization.py +++ b/examples/python/example_serialization.py @@ -33,6 +33,9 @@ p.init_index(max_elements=num_elements//2, ef_construction=100, M=16) +# Controlling the recall by setting ef: +# higher ef leads to better accuracy, but slower search +p.set_ef(10) # Set number of threads used during batch search/construction # By default using all available cores diff --git a/examples/python/pyw_hnswlib.py b/examples/python/pyw_hnswlib.py index cb64bbee..0ccfbc5e 100644 --- a/examples/python/pyw_hnswlib.py +++ b/examples/python/pyw_hnswlib.py @@ -39,6 +39,8 @@ def add_items(self, data, ids=None): start += 1 self.index.add_items(data=data, ids=np.asarray(int_labels)) + def set_ef(self, ef): + self.index.set_ef(ef) def load_index(self, path): self.index.load_index(path) @@ -53,8 +55,8 @@ def save_index(self, path): def set_num_threads(self, num_threads): self.index.set_num_threads(num_threads) - def knn_query(self, data, k=1, ef=10): - labels_int, distances = self.index.knn_query(data=data, k=k, ef=ef) + def knn_query(self, data, k=1): + labels_int, distances = self.index.knn_query(data=data, k=k) labels = [] for li in labels_int: labels.append( diff --git a/hnswlib/bruteforce.h b/hnswlib/bruteforce.h index 36c99158..30b33ae9 100644 --- a/hnswlib/bruteforce.h +++ b/hnswlib/bruteforce.h @@ -98,7 +98,7 @@ class BruteforceSearch : public AlgorithmInterface { std::priority_queue> - searchKnn(const void *query_data, size_t k, BaseFilterFunctor* isIdAllowed = nullptr, const size_t ef_ = 10) const { + searchKnn(const void *query_data, size_t k, BaseFilterFunctor* isIdAllowed = nullptr) const { assert(k <= cur_element_count); std::priority_queue> topResults; if (cur_element_count == 0) return topResults; diff --git a/hnswlib/hnswalg.h b/hnswlib/hnswalg.h index 21f1af0e..f7d7f264 100644 --- a/hnswlib/hnswalg.h +++ b/hnswlib/hnswalg.h @@ -28,6 +28,7 @@ class HierarchicalNSW : public AlgorithmInterface { size_t maxM_{0}; size_t maxM0_{0}; size_t ef_construction_{0}; + size_t ef_{ 0 }; double mult_{0.0}, revSize_{0.0}; int maxlevel_{0}; @@ -106,6 +107,7 @@ class HierarchicalNSW : public AlgorithmInterface { maxM_ = M_; maxM0_ = M_ * 2; ef_construction_ = std::max(ef_construction, M_); + ef_ = 10; level_generator_.seed(random_seed); update_probability_generator_.seed(random_seed + 1); @@ -155,6 +157,12 @@ class HierarchicalNSW : public AlgorithmInterface { } }; + + void setEf(size_t ef) { + ef_ = ef; + } + + inline std::mutex& getLabelOpMutex(labeltype label) const { // calculate hash size_t lock_id = label & (MAX_LABEL_OPERATION_LOCKS - 1); @@ -686,6 +694,7 @@ class HierarchicalNSW : public AlgorithmInterface { throw std::runtime_error("Not enough memory: loadIndex failed to allocate linklists"); element_levels_ = std::vector(max_elements); revSize_ = 1.0 / mult_; + ef_ = 10; for (size_t i = 0; i < cur_element_count; i++) { label_lookup_[getExternalLabel(i)] = i; unsigned int linkListSize; @@ -1166,7 +1175,7 @@ class HierarchicalNSW : public AlgorithmInterface { std::priority_queue> - searchKnn(const void *query_data, size_t k, BaseFilterFunctor* isIdAllowed = nullptr, const size_t ef_ = 10) const { + searchKnn(const void *query_data, size_t k, BaseFilterFunctor* isIdAllowed = nullptr) const { std::priority_queue> result; if (cur_element_count == 0) return result; diff --git a/hnswlib/hnswlib.h b/hnswlib/hnswlib.h index c6d9ce6b..fb7118fa 100644 --- a/hnswlib/hnswlib.h +++ b/hnswlib/hnswlib.h @@ -161,11 +161,11 @@ class AlgorithmInterface { virtual void addPoint(const void *datapoint, labeltype label, bool replace_deleted = false) = 0; virtual std::priority_queue> - searchKnn(const void*, size_t, BaseFilterFunctor* isIdAllowed = nullptr, const size_t ef_ = 10) const = 0; + searchKnn(const void*, size_t, BaseFilterFunctor* isIdAllowed = nullptr) const = 0; // Return k nearest neighbor in the order of closer fist virtual std::vector> - searchKnnCloserFirst(const void* query_data, size_t k, BaseFilterFunctor* isIdAllowed = nullptr, const size_t ef_ = 10) const; + searchKnnCloserFirst(const void* query_data, size_t k, BaseFilterFunctor* isIdAllowed = nullptr) const; virtual void saveIndex(const std::string &location) = 0; virtual ~AlgorithmInterface(){ @@ -175,11 +175,11 @@ class AlgorithmInterface { template std::vector> AlgorithmInterface::searchKnnCloserFirst(const void* query_data, size_t k, - BaseFilterFunctor* isIdAllowed, const size_t ef_) const { + BaseFilterFunctor* isIdAllowed) const { std::vector> result; // here searchKnn returns the result in the order of further first - auto ret = searchKnn(query_data, k, isIdAllowed, ef_); + auto ret = searchKnn(query_data, k, isIdAllowed); { size_t sz = ret.size(); result.resize(sz); diff --git a/python_bindings/LazyIndex.py b/python_bindings/LazyIndex.py index 5b9a57f1..dbaa4673 100644 --- a/python_bindings/LazyIndex.py +++ b/python_bindings/LazyIndex.py @@ -24,15 +24,20 @@ def get_items(self, ids=None): if self.max_elements==0: return [] return super().get_items(ids) - def knn_query(self, data,k=1, num_threads=-1, ef=10): + def knn_query(self, data,k=1, num_threads=-1): if self.max_elements==0: return [], [] - return super().knn_query(data, k, num_threads, ef) + return super().knn_query(data, k, num_threads) def resize_index(self, size): if self.max_elements==0: return self.init_index(size) else: return super().resize_index(size) + def set_ef(self, ef): + if self.max_elements==0: + self.init_ef_construction=ef + return + super().set_ef(ef) def get_max_elements(self): return self.max_elements def get_current_count(self): diff --git a/python_bindings/bindings.cpp b/python_bindings/bindings.cpp index 2723e0ed..5153bb58 100644 --- a/python_bindings/bindings.cpp +++ b/python_bindings/bindings.cpp @@ -150,6 +150,7 @@ class Index { std::string space_name; int dim; size_t seed; + size_t default_ef; bool index_inited; bool ep_added; @@ -176,6 +177,8 @@ class Index { ep_added = true; index_inited = false; num_threads_default = std::thread::hardware_concurrency(); + + default_ef = 10; } @@ -199,9 +202,18 @@ class Index { appr_alg = new hnswlib::HierarchicalNSW(l2space, maxElements, M, efConstruction, random_seed, allow_replace_deleted); index_inited = true; ep_added = false; + appr_alg->ef_ = default_ef; seed = random_seed; } + + void set_ef(size_t ef) { + default_ef = ef; + if (appr_alg) + appr_alg->ef_ = ef; + } + + void set_num_threads(int num_threads) { this->num_threads_default = num_threads; } @@ -400,6 +412,7 @@ class Index { "M"_a = appr_alg->M_, "mult"_a = appr_alg->mult_, "ef_construction"_a = appr_alg->ef_construction_, + "ef"_a = appr_alg->ef_, "has_deletions"_a = (bool)appr_alg->num_deleted_, "size_links_per_element"_a = appr_alg->size_links_per_element_, "allow_replace_deleted"_a = appr_alg->allow_replace_deleted_, @@ -449,7 +462,8 @@ class Index { "seed"_a = seed); if (index_inited == false) - return py::dict(**params); + return py::dict(**params, "ef"_a = default_ef); + auto ann_params = getAnnData(); return py::dict(**params, **ann_params); @@ -483,6 +497,7 @@ class Index { new_index->index_inited = index_inited_; new_index->ep_added = d["ep_added"].cast(); new_index->num_threads_default = d["num_threads"].cast(); + new_index->default_ef = d["ef"].cast(); if (index_inited_) new_index->setAnnData(d); @@ -517,6 +532,7 @@ class Index { assert_true(appr_alg->mult_ == d["mult"].cast(), "Invalid value of mult_ "); assert_true(appr_alg->ef_construction_ == d["ef_construction"].cast(), "Invalid value of ef_construction_ "); + appr_alg->ef_ = d["ef"].cast(); assert_true(appr_alg->size_links_per_element_ == d["size_links_per_element"].cast(), "Invalid value of size_links_per_element_ "); @@ -585,8 +601,7 @@ class Index { py::object input, size_t k = 1, int num_threads = -1, - const std::function& filter = nullptr, - const size_t ef = 10) { + const std::function& filter = nullptr) { py::array_t < dist_t, py::array::c_style | py::array::forcecast > items(input); auto buffer = items.request(); hnswlib::labeltype* data_numpy_l; @@ -615,7 +630,7 @@ class Index { if (normalize == false) { ParallelFor(0, rows, num_threads, [&](size_t row, size_t threadId) { std::priority_queue> result = appr_alg->searchKnn( - (void*)items.data(row), k, p_idFilter, ef); + (void*)items.data(row), k, p_idFilter); if (result.size() != k) throw std::runtime_error( "Cannot return the results in a contigious 2D array. Probably ef or M is too small"); @@ -635,7 +650,7 @@ class Index { normalize_vector((float*)items.data(row), (norm_array.data() + start_idx)); std::priority_queue> result = appr_alg->searchKnn( - (void*)(norm_array.data() + start_idx), k, p_idFilter, ef); + (void*)(norm_array.data() + start_idx), k, p_idFilter); if (result.size() != k) throw std::runtime_error( "Cannot return the results in a contigious 2D array. Probably ef or M is too small"); @@ -805,8 +820,7 @@ class BFIndex { py::object knnQuery_return_numpy( py::object input, size_t k = 1, - const std::function& filter = nullptr, - const size_t ef = 10) { + const std::function& filter = nullptr) { py::array_t < dist_t, py::array::c_style | py::array::forcecast > items(input); auto buffer = items.request(); hnswlib::labeltype *data_numpy_l; @@ -825,7 +839,7 @@ class BFIndex { for (size_t row = 0; row < rows; row++) { std::priority_queue> result = alg->searchKnn( - (void *) items.data(row), k, p_idFilter, ef); + (void *) items.data(row), k, p_idFilter); for (int i = k - 1; i >= 0; i--) { auto &result_tuple = result.top(); data_numpy_d[row * k + i] = result_tuple.first; @@ -879,8 +893,7 @@ PYBIND11_PLUGIN(hnswlib) { py::arg("data"), py::arg("k") = 1, py::arg("num_threads") = -1, - py::arg("filter") = py::none(), - py::arg("ef") = 10) + py::arg("filter") = py::none()) .def("add_items", &Index::addItems, py::arg("data"), @@ -889,6 +902,7 @@ PYBIND11_PLUGIN(hnswlib) { py::arg("replace_deleted") = false) .def("get_items", &Index::getDataReturnList, py::arg("ids") = py::none()) .def("get_ids_list", &Index::getIdsList) + .def("set_ef", &Index::set_ef, py::arg("ef")) .def("set_num_threads", &Index::set_num_threads, py::arg("num_threads")) .def("save_index", &Index::saveIndex, py::arg("path_to_index")) .def("load_index", @@ -904,6 +918,15 @@ PYBIND11_PLUGIN(hnswlib) { .def_readonly("space", &Index::space_name) .def_readonly("dim", &Index::dim) .def_readwrite("num_threads", &Index::num_threads_default) + .def_property("ef", + [](const Index & index) { + return index.index_inited ? index.appr_alg->ef_ : index.default_ef; + }, + [](Index & index, const size_t ef_) { + index.default_ef = ef_; + if (index.appr_alg) + index.appr_alg->ef_ = ef_; + }) .def_property_readonly("max_elements", [](const Index & index) { return index.index_inited ? index.appr_alg->max_elements_ : 0; }) @@ -934,7 +957,7 @@ PYBIND11_PLUGIN(hnswlib) { py::class_>(m, "BFIndex") .def(py::init(), py::arg("space"), py::arg("dim")) .def("init_index", &BFIndex::init_new_index, py::arg("max_elements")) - .def("knn_query", &BFIndex::knnQuery_return_numpy, py::arg("data"), py::arg("k") = 1, py::arg("filter") = py::none(), py::arg("ef") = 10) + .def("knn_query", &BFIndex::knnQuery_return_numpy, py::arg("data"), py::arg("k") = 1, py::arg("filter") = py::none()) .def("add_items", &BFIndex::addItems, py::arg("data"), py::arg("ids") = py::none()) .def("delete_vector", &BFIndex::deleteVector, py::arg("label")) .def("save_index", &BFIndex::saveIndex, py::arg("path_to_index")) diff --git a/tests/cpp/sift_1b.cpp b/tests/cpp/sift_1b.cpp index 46a55551..43777ff6 100644 --- a/tests/cpp/sift_1b.cpp +++ b/tests/cpp/sift_1b.cpp @@ -219,6 +219,7 @@ test_vs_recall( efs.push_back(i); } for (size_t ef : efs) { + appr_alg.setEf(ef); StopW stopw = StopW(); float recall = test_approx(massQ, vecsize, qsize, appr_alg, vecdim, answers, k); diff --git a/tests/cpp/sift_test.cpp b/tests/cpp/sift_test.cpp index 31871367..decdf605 100644 --- a/tests/cpp/sift_test.cpp +++ b/tests/cpp/sift_test.cpp @@ -89,13 +89,12 @@ float test_approx( HierarchicalNSW &appr_alg, size_t vecdim, vector>> &answers, - size_t k, - size_t ef = 10) { + size_t k) { size_t correct = 0; size_t total = 0; //#pragma omp parallel for for (int i = 0; i < qsize; i++) { - std::priority_queue> result = appr_alg.searchKnn(massQ + vecdim * i, 10, nullptr, ef); + std::priority_queue> result = appr_alg.searchKnn(massQ + vecdim * i, 10); std::priority_queue> gt(answers[i]); unordered_set g; total += gt.size(); @@ -132,9 +131,10 @@ void test_vs_recall( efs.push_back(i); }*/ for (size_t ef : efs) { + appr_alg.setEf(ef); StopW stopw = StopW(); - float recall = test_approx(massQ, vecsize, qsize, appr_alg, vecdim, answers, k, ef); + float recall = test_approx(massQ, vecsize, qsize, appr_alg, vecdim, answers, k); float time_us_per_query = stopw.getElapsedTimeMicro() / qsize; cout << ef << "\t" << recall << "\t" << time_us_per_query << " us\n"; if (recall > 1.0) { diff --git a/tests/cpp/updates_test.cpp b/tests/cpp/updates_test.cpp index d9b7c96a..52e1fa14 100644 --- a/tests/cpp/updates_test.cpp +++ b/tests/cpp/updates_test.cpp @@ -106,12 +106,12 @@ std::vector load_batch(std::string path, int size) { template static float test_approx(std::vector &queries, size_t qsize, hnswlib::HierarchicalNSW &appr_alg, size_t vecdim, - std::vector> &answers, size_t K, size_t ef) { + std::vector> &answers, size_t K) { size_t correct = 0; size_t total = 0; for (int i = 0; i < qsize; i++) { - std::priority_queue> result = appr_alg.searchKnn((char *)(queries.data() + vecdim * i), K, nullptr, ef); + std::priority_queue> result = appr_alg.searchKnn((char *)(queries.data() + vecdim * i), K); total += K; while (result.size()) { if (answers[i].find(result.top().second) != answers[i].end()) { @@ -148,11 +148,13 @@ test_vs_recall( bool test_passed = false; for (size_t ef : efs) { + appr_alg.setEf(ef); + appr_alg.metric_hops = 0; appr_alg.metric_distance_computations = 0; StopW stopw = StopW(); - float recall = test_approx(queries, qsize, appr_alg, vecdim, answers, k, ef); + float recall = test_approx(queries, qsize, appr_alg, vecdim, answers, k); float time_us_per_query = stopw.getElapsedTimeMicro() / qsize; float distance_comp_per_query = appr_alg.metric_distance_computations / (1.0f * qsize); float hops_per_query = appr_alg.metric_hops / (1.0f * qsize); diff --git a/tests/python/bindings_test.py b/tests/python/bindings_test.py index d496f199..f9b3092f 100644 --- a/tests/python/bindings_test.py +++ b/tests/python/bindings_test.py @@ -28,6 +28,9 @@ def testRandomSelf(self): p.init_index(max_elements=num_elements, ef_construction=100, M=16) + # Controlling the recall by setting ef: + # higher ef leads to better accuracy, but slower search + p.set_ef(10) p.set_num_threads(4) # by default using all available cores diff --git a/tests/python/bindings_test_filter.py b/tests/python/bindings_test_filter.py index 460f05ae..480c8dcd 100644 --- a/tests/python/bindings_test_filter.py +++ b/tests/python/bindings_test_filter.py @@ -30,6 +30,10 @@ def testRandomSelf(self): hnsw_index.init_index(max_elements=num_elements, ef_construction=100, M=16) bf_index.init_index(max_elements=num_elements) + # Controlling the recall by setting ef: + # higher ef leads to better accuracy, but slower search + hnsw_index.set_ef(10) + hnsw_index.set_num_threads(4) # by default using all available cores print("Adding %d elements" % (len(data))) diff --git a/tests/python/bindings_test_getdata.py b/tests/python/bindings_test_getdata.py index 01ed6aa5..515ecebd 100644 --- a/tests/python/bindings_test_getdata.py +++ b/tests/python/bindings_test_getdata.py @@ -29,6 +29,9 @@ def testGettingItems(self): p.init_index(max_elements=num_elements, ef_construction=100, M=16) + # Controlling the recall by setting ef: + # higher ef leads to better accuracy, but slower search + p.set_ef(100) p.set_num_threads(4) # by default using all available cores diff --git a/tests/python/bindings_test_labels.py b/tests/python/bindings_test_labels.py index d73bb017..524a24d5 100644 --- a/tests/python/bindings_test_labels.py +++ b/tests/python/bindings_test_labels.py @@ -31,6 +31,10 @@ def testRandomSelf(self): p.init_index(max_elements=num_elements, ef_construction=100, M=16) + # Controlling the recall by setting ef: + # higher ef leads to better accuracy, but slower search + p.set_ef(100) + p.set_num_threads(4) # by default using all available cores # We split the data in two batches: @@ -69,12 +73,13 @@ def testRandomSelf(self): print("\nLoading index from '%s'\n" % index_path) p.load_index(index_path) + p.set_ef(100) print("Adding the second batch of %d elements" % (len(data2))) p.add_items(data2) # Query the elements for themselves and measure recall: - labels, distances = p.knn_query(data, k=1, ef=100) + labels, distances = p.knn_query(data, k=1) items = p.get_items(labels) # Check the recall: @@ -89,7 +94,7 @@ def testRandomSelf(self): self.assertEqual(np.sum(~np.asarray(sorted_labels) == np.asarray(range(num_elements))), 0) # Delete data1 - labels1_deleted, _ = p.knn_query(data1, k=1, ef=100) + labels1_deleted, _ = p.knn_query(data1, k=1) # delete probable duplicates from nearest neighbors labels1_deleted_no_dup = set(labels1_deleted.flatten()) for l in labels1_deleted_no_dup: @@ -111,9 +116,9 @@ def testRandomSelf(self): p.save_index(del_index_path) p = hnswlib.Index(space='l2', dim=dim) p.load_index(del_index_path) + p.set_ef(100) - - labels1_after, _ = p.knn_query(data1, k=1, ef=100) + labels1_after, _ = p.knn_query(data1, k=1) for la in labels1_after: if la[0] in labels1_deleted_no_dup: print(f"Found deleted label {la[0]} during knn search after index loading") @@ -122,7 +127,7 @@ def testRandomSelf(self): # Unmark deleted data for l in labels1_deleted_no_dup: p.unmark_deleted(l) - labels_restored, _ = p.knn_query(data1, k=1, ef=100) + labels_restored, _ = p.knn_query(data1, k=1) self.assertAlmostEqual(np.mean(labels_restored.reshape(-1) == np.arange(len(data1))), 1.0, 3) print("All the data in data1 are restored") diff --git a/tests/python/bindings_test_metadata.py b/tests/python/bindings_test_metadata.py index 997390b7..69dce03d 100644 --- a/tests/python/bindings_test_metadata.py +++ b/tests/python/bindings_test_metadata.py @@ -27,6 +27,9 @@ def testMetadata(self): p.init_index(max_elements=num_elements, ef_construction=100, M=16) + # Controlling the recall by setting ef: + # higher ef leads to better accuracy, but slower search + p.set_ef(100) p.set_num_threads(4) # by default using all available cores diff --git a/tests/python/bindings_test_pickle.py b/tests/python/bindings_test_pickle.py index 1ba97260..1fa0e822 100644 --- a/tests/python/bindings_test_pickle.py +++ b/tests/python/bindings_test_pickle.py @@ -64,6 +64,8 @@ def test_space_main(self, space, dim): p.init_index(max_elements=self.num_elements, ef_construction=self.ef_construction, M=self.M) p0.init_index(max_elements=self.num_elements, ef_construction=self.ef_construction, M=self.M) + p.ef = self.ef + p0.ef = self.ef p1 = pickle.loads(pickle.dumps(p)) # pickle Index before adding items @@ -79,10 +81,10 @@ def test_space_main(self, space, dim): self.assertTrue(np.allclose(p1.get_items(), p2.get_items()), "items for p1 and p2 must be same") # Test if returned distances are same - l, d = p.knn_query(test_data, k=self.k, ef=400) - l0, d0 = p0.knn_query(test_data, k=self.k, ef=400) - l1, d1 = p1.knn_query(test_data, k=self.k, ef=400) - l2, d2 = p2.knn_query(test_data, k=self.k, ef=400) + l, d = p.knn_query(test_data, k=self.k) + l0, d0 = p0.knn_query(test_data, k=self.k) + l1, d1 = p1.knn_query(test_data, k=self.k) + l2, d2 = p2.knn_query(test_data, k=self.k) self.assertLessEqual(np.sum(((d-d0)**2.) > 1e-3), self.dists_err_thresh, msg=f"knn distances returned by p and p0 must match") self.assertLessEqual(np.sum(((d0-d1)**2.) > 1e-3), self.dists_err_thresh, msg=f"knn distances returned by p0 and p1 must match") @@ -100,6 +102,11 @@ def test_space_main(self, space, dim): total_thresh=self.item_err_thresh, dists_thresh=self.dists_err_thresh) + # Check ef parameter value + self.assertEqual(p.ef, self.ef, "incorrect value of p.ef") + self.assertEqual(p0.ef, self.ef, "incorrect value of p0.ef") + self.assertEqual(p2.ef, self.ef, "incorrect value of p2.ef") + self.assertEqual(p1.ef, self.ef, "incorrect value of p1.ef") # Check M parameter value self.assertEqual(p.M, self.M, "incorrect value of p.M") @@ -119,6 +126,7 @@ class PickleUnitTests(unittest.TestCase): def setUp(self): self.ef_construction = 200 self.M = 32 + self.ef = 400 self.num_elements = 1000 self.num_test_elements = 100 diff --git a/tests/python/bindings_test_recall.py b/tests/python/bindings_test_recall.py index eac88e7f..2190ba45 100644 --- a/tests/python/bindings_test_recall.py +++ b/tests/python/bindings_test_recall.py @@ -34,6 +34,9 @@ def testRandomSelf(self): hnsw_index.init_index(max_elements=num_elements, ef_construction=200, M=16) bf_index.init_index(max_elements=num_elements) + # Controlling the recall for hnsw by setting ef: + # higher ef leads to better accuracy, but slower search + hnsw_index.set_ef(200) # Set number of threads used during batch search/construction in hnsw # By default using all available cores @@ -49,7 +52,7 @@ def testRandomSelf(self): query_data = np.float32(np.random.random((num_queries, dim))) # Query the elements and measure recall: - labels_hnsw, distances_hnsw = hnsw_index.knn_query(query_data, k, ef=200) + labels_hnsw, distances_hnsw = hnsw_index.knn_query(query_data, k) labels_bf, distances_bf = bf_index.knn_query(query_data, k) # Measure recall diff --git a/tests/python/bindings_test_replace.py b/tests/python/bindings_test_replace.py index bd886a28..80003a3a 100644 --- a/tests/python/bindings_test_replace.py +++ b/tests/python/bindings_test_replace.py @@ -46,6 +46,7 @@ def testRandomSelf(self): hnsw_index = hnswlib.Index(space='l2', dim=dim) hnsw_index.init_index(max_elements=max_num_elements, ef_construction=200, M=16, allow_replace_deleted=True) + hnsw_index.set_ef(100) hnsw_index.set_num_threads(4) # Add batch 1 and 2 @@ -56,18 +57,18 @@ def testRandomSelf(self): # Delete nearest neighbors of batch 2 print("Deleting neighbors of batch 2") - labels2_deleted, _ = hnsw_index.knn_query(data2, k=1, ef=100) + labels2_deleted, _ = hnsw_index.knn_query(data2, k=1) # delete probable duplicates from nearest neighbors labels2_deleted_no_dup = set(labels2_deleted.flatten()) num_duplicates = len(labels2_deleted) - len(labels2_deleted_no_dup) for l in labels2_deleted_no_dup: hnsw_index.mark_deleted(l) - labels1_found, _ = hnsw_index.knn_query(data1, k=1, ef=100) + labels1_found, _ = hnsw_index.knn_query(data1, k=1) items = hnsw_index.get_items(labels1_found) diff_with_gt_labels = np.mean(np.abs(data1 - items)) self.assertAlmostEqual(diff_with_gt_labels, 0, delta=1e-3) - labels2_after, _ = hnsw_index.knn_query(data2, k=1, ef=100) + labels2_after, _ = hnsw_index.knn_query(data2, k=1) for la in labels2_after: if la[0] in labels2_deleted_no_dup: print(f"Found deleted label {la[0]} during knn search") @@ -124,7 +125,7 @@ def testRandomSelf(self): # Check recall print("Checking recall") - labels_found, _ = hnsw_index.knn_query(data4_tr, k=1, ef=100) + labels_found, _ = hnsw_index.knn_query(data4_tr, k=1) recall = np.mean(labels_found.reshape(-1) == labels4_tr) print(f"Recall for the 4 batch: {recall}") self.assertGreater(recall, recall_threshold) @@ -143,7 +144,7 @@ def testRandomSelf(self): # Check recall print("Checking recall") - labels_found, _ = hnsw_index_pckl.knn_query(data3_tr, k=1, ef=100) + labels_found, _ = hnsw_index_pckl.knn_query(data3_tr, k=1) recall = np.mean(labels_found.reshape(-1) == labels3_tr) print(f"Recall for the 3 batch: {recall}") self.assertGreater(recall, recall_threshold) @@ -194,7 +195,9 @@ def test_recall_degradation(self): bf_index = hnswlib.BFIndex(space='l2', dim=dim) bf_index.init_index(max_elements=max_num_elements) + hnsw_index_no_replace.set_ef(100) hnsw_index_no_replace.set_num_threads(50) + hnsw_index_with_replace.set_ef(100) hnsw_index_with_replace.set_num_threads(50) # Add data diff --git a/tests/python/bindings_test_resize.py b/tests/python/bindings_test_resize.py index 1c276abe..b5bceeb1 100644 --- a/tests/python/bindings_test_resize.py +++ b/tests/python/bindings_test_resize.py @@ -30,6 +30,9 @@ def testRandomSelf(self): p.init_index(max_elements=num_elements//2, ef_construction=100, M=16) + # Controlling the recall by setting ef: + # higher ef leads to better accuracy, but slower search + p.set_ef(20) p.set_num_threads(idx % 8) # by default using all available cores @@ -41,7 +44,7 @@ def testRandomSelf(self): p.add_items(data1) # Query the elements for themselves and measure recall: - labels, distances = p.knn_query(data1, k=1, ef=20) + labels, distances = p.knn_query(data1, k=1) items = p.get_items(list(range(len(data1)))) @@ -59,7 +62,7 @@ def testRandomSelf(self): p.add_items(data2) # Query the elements for themselves and measure recall: - labels, distances = p.knn_query(data, k=1, ef=20) + labels, distances = p.knn_query(data, k=1) items=p.get_items(list(range(num_elements))) # Check the recall: diff --git a/tests/python/bindings_test_spaces.py b/tests/python/bindings_test_spaces.py index 99fa02c4..c3cceb87 100644 --- a/tests/python/bindings_test_spaces.py +++ b/tests/python/bindings_test_spaces.py @@ -27,6 +27,8 @@ def testRandomSelf(self): p = hnswlib.Index(space=space, dim=dim) p.init_index(max_elements=5, ef_construction=100, M=16) + p.set_ef(10) + p.add_items(data2) # Query the elements for themselves and measure recall: diff --git a/tests/python/bindings_test_stress_mt_replace.py b/tests/python/bindings_test_stress_mt_replace.py index bca66013..8cd3e9bc 100644 --- a/tests/python/bindings_test_stress_mt_replace.py +++ b/tests/python/bindings_test_stress_mt_replace.py @@ -33,6 +33,7 @@ def testRandomSelf(self): hnsw_index = hnswlib.Index(space='l2', dim=dim) hnsw_index.init_index(max_elements=max_num_elements, ef_construction=200, M=16, allow_replace_deleted=True) + hnsw_index.set_ef(100) hnsw_index.set_num_threads(50) # Add batch 1 and 2 @@ -40,18 +41,18 @@ def testRandomSelf(self): hnsw_index.add_items(data2, labels2) # maximum number of elements is reached # Delete nearest neighbors of batch 2 - labels2_deleted, _ = hnsw_index.knn_query(data2, k=1, ef=100) + labels2_deleted, _ = hnsw_index.knn_query(data2, k=1) labels2_deleted_flat = labels2_deleted.flatten() # delete probable duplicates from nearest neighbors labels2_deleted_no_dup = set(labels2_deleted_flat) for l in labels2_deleted_no_dup: hnsw_index.mark_deleted(l) - labels1_found, _ = hnsw_index.knn_query(data1, k=1, ef=100) + labels1_found, _ = hnsw_index.knn_query(data1, k=1) items = hnsw_index.get_items(labels1_found) diff_with_gt_labels = np.mean(np.abs(data1 - items)) self.assertAlmostEqual(diff_with_gt_labels, 0, delta=1e-3) - labels2_after, _ = hnsw_index.knn_query(data2, k=1, ef=100) + labels2_after, _ = hnsw_index.knn_query(data2, k=1) labels2_after_flat = labels2_after.flatten() common = np.intersect1d(labels2_after_flat, labels2_deleted_flat) self.assertTrue(common.size == 0) diff --git a/tests/python/speedtest.py b/tests/python/speedtest.py index 6582bee6..8d16cfc3 100644 --- a/tests/python/speedtest.py +++ b/tests/python/speedtest.py @@ -28,6 +28,9 @@ p.init_index(max_elements=num_elements, ef_construction=60, M=16) +# Controlling the recall by setting ef: +# higher ef leads to better accuracy, but slower search +p.set_ef(10) # Set number of threads used during batch search/construction # By default using all available cores @@ -42,12 +45,13 @@ p.set_num_threads(threads) times=[] time.sleep(1) +p.set_ef(15) for _ in range(1): # p.load_index(index_path) for _ in range(3): t0=time.time() qdata=data[:5000*threads] - labels, distances = p.knn_query(qdata, k=1, ef=15) + labels, distances = p.knn_query(qdata, k=1) tt=time.time()-t0 times.append(tt) recall=np.sum(labels.reshape(-1)==np.arange(len(qdata)))/len(qdata) From b7940ba090cf6b82835ddaa41d8d1ef12b99ba47 Mon Sep 17 00:00:00 2001 From: ozanarmagan Date: Tue, 30 Jan 2024 10:26:52 +0300 Subject: [PATCH 6/7] Add overloaded searchKnn method with explicit ef parameter --- hnswlib/bruteforce.h | 4 ++++ hnswlib/hnswalg.h | 11 ++++++++--- hnswlib/hnswlib.h | 26 ++++++++++++++++++++++++++ 3 files changed, 38 insertions(+), 3 deletions(-) diff --git a/hnswlib/bruteforce.h b/hnswlib/bruteforce.h index 30b33ae9..8cf4acf9 100644 --- a/hnswlib/bruteforce.h +++ b/hnswlib/bruteforce.h @@ -96,6 +96,10 @@ class BruteforceSearch : public AlgorithmInterface { cur_element_count--; } + std::priority_queue> + searchKnn(const void *query_data, size_t k, const size_t ef, BaseFilterFunctor* isIdAllowed = nullptr) const { + return searchKnn(query_data, k, isIdAllowed); + } std::priority_queue> searchKnn(const void *query_data, size_t k, BaseFilterFunctor* isIdAllowed = nullptr) const { diff --git a/hnswlib/hnswalg.h b/hnswlib/hnswalg.h index f7d7f264..e9e4f7f3 100644 --- a/hnswlib/hnswalg.h +++ b/hnswlib/hnswalg.h @@ -1173,9 +1173,14 @@ class HierarchicalNSW : public AlgorithmInterface { return cur_c; } - std::priority_queue> searchKnn(const void *query_data, size_t k, BaseFilterFunctor* isIdAllowed = nullptr) const { + return searchKnn(query_data, k, this->ef_, isIdAllowed); + } + + + std::priority_queue> + searchKnn(const void *query_data, size_t k, const size_t ef, BaseFilterFunctor* isIdAllowed = nullptr) const { std::priority_queue> result; if (cur_element_count == 0) return result; @@ -1210,10 +1215,10 @@ class HierarchicalNSW : public AlgorithmInterface { std::priority_queue, std::vector>, CompareByFirst> top_candidates; if (num_deleted_) { top_candidates = searchBaseLayerST( - currObj, query_data, std::max(ef_, k), isIdAllowed); + currObj, query_data, std::max(ef, k), isIdAllowed); } else { top_candidates = searchBaseLayerST( - currObj, query_data, std::max(ef_, k), isIdAllowed); + currObj, query_data, std::max(ef, k), isIdAllowed); } while (top_candidates.size() > k) { diff --git a/hnswlib/hnswlib.h b/hnswlib/hnswlib.h index fb7118fa..b93c2599 100644 --- a/hnswlib/hnswlib.h +++ b/hnswlib/hnswlib.h @@ -160,6 +160,9 @@ class AlgorithmInterface { public: virtual void addPoint(const void *datapoint, labeltype label, bool replace_deleted = false) = 0; + virtual std::priority_queue> + searchKnn(const void*, size_t, const size_t ef_, BaseFilterFunctor* isIdAllowed = nullptr) const = 0; + virtual std::priority_queue> searchKnn(const void*, size_t, BaseFilterFunctor* isIdAllowed = nullptr) const = 0; @@ -167,11 +170,34 @@ class AlgorithmInterface { virtual std::vector> searchKnnCloserFirst(const void* query_data, size_t k, BaseFilterFunctor* isIdAllowed = nullptr) const; + virtual std::vector> + searchKnnCloserFirst(const void* query_data, size_t k, const size_t ef, BaseFilterFunctor* isIdAllowed = nullptr) const; + virtual void saveIndex(const std::string &location) = 0; virtual ~AlgorithmInterface(){ } }; +template +std::vector> +AlgorithmInterface::searchKnnCloserFirst(const void* query_data, size_t k, + const size_t ef, BaseFilterFunctor* isIdAllowed) const { + std::vector> result; + + // here searchKnn returns the result in the order of further first + auto ret = searchKnn(query_data, k, ef, isIdAllowed); + { + size_t sz = ret.size(); + result.resize(sz); + while (!ret.empty()) { + result[--sz] = ret.top(); + ret.pop(); + } + } + + return result; +} + template std::vector> AlgorithmInterface::searchKnnCloserFirst(const void* query_data, size_t k, From b9dc4be0ad63d55e373c4e6428c4b6f45e34b26f Mon Sep 17 00:00:00 2001 From: ozanarmagan Date: Sun, 4 Feb 2024 20:29:03 +0300 Subject: [PATCH 7/7] Fix candidate selection bug and handle label deletion --- hnswlib/hnswalg.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/hnswlib/hnswalg.h b/hnswlib/hnswalg.h index e9e4f7f3..02670330 100644 --- a/hnswlib/hnswalg.h +++ b/hnswlib/hnswalg.h @@ -271,9 +271,9 @@ class HierarchicalNSW : public AlgorithmInterface { _mm_prefetch(getDataByInternalId(candidateSet.top().second), _MM_HINT_T0); #endif - if (!isMarkedDeleted(candidate_id)) + if (!isMarkedDeleted(candidate_id)) top_candidates.emplace(dist1, candidate_id); - + if (top_candidates.size() > ef_construction_) top_candidates.pop(); @@ -765,6 +765,8 @@ class HierarchicalNSW : public AlgorithmInterface { lock_table.unlock(); markDeletedInternal(internalId); + lock_table.lock(); + label_lookup_.erase(label); } @@ -884,6 +886,10 @@ class HierarchicalNSW : public AlgorithmInterface { setExternalLabel(internal_id_replaced, label); std::unique_lock lock_table(label_lookup_lock); + // check if the label is already in the index + if (label_lookup_.find(label) != label_lookup_.end() && !isMarkedDeleted(label_lookup_[label])) { + markDeletedInternal(label_lookup_[label]); + } label_lookup_.erase(label_replaced); label_lookup_[label] = internal_id_replaced; lock_table.unlock();