diff --git a/README.md b/README.md index 67d174b..8dee54b 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,7 @@ -NGT +
+ +
+ === Neighborhood Graph and Tree for Indexing High-dimensional Data diff --git a/VERSION b/VERSION index de28578..91c74a5 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.7.6 +1.7.7 diff --git a/lib/NGT/Capi.cpp b/lib/NGT/Capi.cpp index df27bb2..cdf5b52 100644 --- a/lib/NGT/Capi.cpp +++ b/lib/NGT/Capi.cpp @@ -511,6 +511,27 @@ bool ngt_batch_append_index(NGTIndex index, float *obj, uint32_t data_count, NGT } } +bool ngt_batch_insert_index(NGTIndex index, float *obj, uint32_t data_count, uint32_t *ids, NGTError error) { + NGT::Index* pindex = static_cast(index); + int32_t dim = pindex->getObjectSpace().getDimension(); + + bool status = true; + float *objptr = obj; + for (size_t idx = 0; idx < data_count; idx++, objptr += dim) { + try{ + std::vector vobj(objptr, objptr + dim); + ids[idx] = pindex->insert(vobj); + }catch(std::exception &err) { + status = false; + ids[idx] = 0; + std::stringstream ss; + ss << "Capi : " << __FUNCTION__ << "() : Error: " << err.what(); + operate_error_string_(ss, error); + } + } + return status; +} + bool ngt_create_index(NGTIndex index, uint32_t pool_size, NGTError error) { if(index == NULL){ std::stringstream ss; diff --git a/lib/NGT/Capi.h b/lib/NGT/Capi.h index 1c9d8ff..d39222b 100644 --- a/lib/NGT/Capi.h +++ b/lib/NGT/Capi.h @@ -102,6 +102,8 @@ ObjectID ngt_append_index_as_float(NGTIndex, float*, uint32_t, NGTError); bool ngt_batch_append_index(NGTIndex, float*, uint32_t, NGTError); +bool ngt_batch_insert_index(NGTIndex, float*, uint32_t, uint32_t *, NGTError); + bool ngt_create_index(NGTIndex, uint32_t, NGTError); bool ngt_remove_index(NGTIndex, ObjectID, NGTError); diff --git a/lib/NGT/Command.cpp b/lib/NGT/Command.cpp index c7d2522..1adfc50 100644 --- a/lib/NGT/Command.cpp +++ b/lib/NGT/Command.cpp @@ -716,35 +716,10 @@ #endif cerr << "ngt::reconstructGraph: Extract the graph data." << endl; // extract only edges from the index to reduce the memory usage. - NGT::GraphIndex &outGraph = (NGT::GraphIndex&)outIndex.getIndex(); Timer timer; timer.start(); vector graph; - graph.reserve(outGraph.repository.size()); - for (size_t id = 1; id < outGraph.repository.size(); id++) { - if (id % 1000000 == 0) { - cerr << "Processed " << id << " objects." << endl; - } - try { - NGT::GraphNode &node = *outGraph.getNode(id); -#if defined(NGT_SHARED_MEMORY_ALLOCATOR) - NGT::ObjectDistances nd; - nd.reserve(node.size()); - for (auto n = node.begin(outGraph.repository.allocator); n != node.end(outGraph.repository.allocator); ++n) { - nd.push_back(ObjectDistance((*n).id, (*n).distance)); - } - graph.push_back(nd); -#else - graph.push_back(node); -#endif - if (graph.back().size() != graph.back().capacity()) { - cerr << "ngt::reconstructGraph: Warning! The graph size must be the same as the capacity. " << id << endl; - } - } catch(NGT::Exception &err) { - cerr << "ngt::reconstructGraph: Warning! Cannot get the node. ID=" << id << ":" << err.what() << endl; - continue; - } - } + GraphReconstructor::extractGraph(graph, outIndex); char mode = args.getChar("m", 's'); char pamode = args.getChar("P", 'a'); @@ -794,7 +769,8 @@ double gtEpsilon = 0.1; double mergin = 0.2; - NGT::Optimizer optimizer(outIndex); + NGT::Optimizer optimizer(outIndex); + NGT::GraphIndex &outGraph = (NGT::GraphIndex&)outIndex.getIndex(); try { auto param = optimizer.adjustSearchEdgeSize(baseAccuracyRange, rateAccuracyRange, querySize, gtEpsilon, mergin); NeighborhoodGraph::Property &prop = outGraph.getGraphProperty(); diff --git a/lib/NGT/GraphReconstructor.h b/lib/NGT/GraphReconstructor.h index f3ebe42..c6c5620 100644 --- a/lib/NGT/GraphReconstructor.h +++ b/lib/NGT/GraphReconstructor.h @@ -30,6 +30,36 @@ namespace NGT { class GraphReconstructor { public: + static void extractGraph(vector &graph, NGT::Index &index) { + NGT::GraphIndex &graphIndex = static_cast(index.getIndex()); + graph.reserve(graphIndex.repository.size()); + for (size_t id = 1; id < graphIndex.repository.size(); id++) { + if (id % 1000000 == 0) { + cerr << "GraphReconstructor::extractGraph: Processed " << id << " objects." << endl; + } + try { + NGT::GraphNode &node = *graphIndex.getNode(id); +#if defined(NGT_SHARED_MEMORY_ALLOCATOR) + NGT::ObjectDistances nd; + nd.reserve(node.size()); + for (auto n = node.begin(graphIndex.repository.allocator); n != node.end(graphIndex.repository.allocator); ++n) { + nd.push_back(ObjectDistance((*n).id, (*n).distance)); + } + graph.push_back(nd); +#else + graph.push_back(node); +#endif + if (graph.back().size() != graph.back().capacity()) { + cerr << "GraphReconstructor::extractGraph: Warning! The graph size must be the same as the capacity. " << id << endl; + } + } catch(NGT::Exception &err) { + cerr << "GraphReconstructor::extractGraph: Warning! Cannot get the node. ID=" << id << ":" << err.what() << endl; + continue; + } + } + + } + @@ -382,12 +412,14 @@ class GraphReconstructor { } originalEdgeTimer.stop(); - reverseEdgeTimer.start(); + reverseEdgeTimer.start(); + int insufficientNodeCount = 0; for (size_t id = 1; id <= graph.size(); ++id) { try { NGT::ObjectDistances &node = graph[id - 1]; size_t rsize = reverseEdgeSize; if (rsize > node.size()) { + insufficientNodeCount++; rsize = node.size(); } for (size_t i = 0; i < rsize; ++i) { @@ -408,6 +440,9 @@ class GraphReconstructor { } } reverseEdgeTimer.stop(); + if (insufficientNodeCount != 0) { + cerr << "# of the nodes edges of which are in short = " << insufficientNodeCount << endl; + } normalizeEdgeTimer.start(); for (size_t id = 1; id < outGraph.repository.size(); id++) { diff --git a/lib/NGT/Optimizer.h b/lib/NGT/Optimizer.h index cecfa6c..bd88a38 100644 --- a/lib/NGT/Optimizer.h +++ b/lib/NGT/Optimizer.h @@ -16,6 +16,8 @@ #pragma once +#include "Command.h" + #define NGT_LOG_BASED_OPTIMIZATION namespace NGT { @@ -525,9 +527,6 @@ namespace NGT { toOver = fromOver; toOverEpsilon = fromOverEpsilon; } - if (fromOverEpsilon == toOverEpsilon) { - cerr << "Warning!! fromOverEpsilon equals toOverEpsilon " << fromOverEpsilon << ". This might cause some problems." << endl; - } fromUnderEpsilon = fromOverEpsilon - epsilonStep; } sp.beginOfEpsilon = sp.endOfEpsilon = fromUnderEpsilon; @@ -742,7 +741,7 @@ namespace NGT { cerr << "adjustRateSearchEdgeSize::explore for the mergin " << mergin << ", " << rateStart << "..." << endl; for (size_t rateStep = 16; rateStep != 1; rateStep /= 2) { double prevTime = DBL_MAX; - for (size_t rate = rateStart; rate < 200; rate += rateStep) { + for (size_t rate = rateStart; rate < 2000; rate += rateStep) { if (rate > 1000) { stringstream msg; msg << "rate is too large! " << rate; diff --git a/python/setup.py b/python/setup.py index f36c645..40e9b77 100644 --- a/python/setup.py +++ b/python/setup.py @@ -21,7 +21,7 @@ # for pip >= 10.0 from pip._internal import locations -version = '1.4.0' +version = '1.5.0' if static_library: with open('../VERSION', 'r') as fh: diff --git a/python/src/ngtpy.cpp b/python/src/ngtpy.cpp index ea79184..e696a50 100644 --- a/python/src/ngtpy.cpp +++ b/python/src/ngtpy.cpp @@ -15,6 +15,8 @@ // #include "NGT/Index.h" +#include "NGT/GraphReconstructor.h" +#include "NGT/Optimizer.h" #include #include @@ -207,6 +209,8 @@ class Index : public NGT::Index { NGT::SearchContainer sc(*ngtquery); sc.setSize(size); // the number of resultant objects. + NGT::ObjectDistances rs; + sc.setResults(&rs); NGT::Index::linearSearch(sc); @@ -214,34 +218,27 @@ class Index : public NGT::Index { NGT::Index::deleteObject(ngtquery); if (!withDistance) { - NGT::ResultPriorityQueue &r = sc.getWorkingResult(); - py::array_t ids(r.size()); + py::array_t ids(rs.size()); py::buffer_info idsinfo = ids.request(); - int *endptr = reinterpret_cast(idsinfo.ptr); - int *ptr = endptr + (r.size() - 1); + int *ptr = reinterpret_cast(idsinfo.ptr); if (zeroNumbering) { - while (ptr >= endptr) { - *ptr-- = r.top().id - 1; - r.pop(); + for (auto ri = rs.begin(); ri != rs.end(); ++ri) { + *ptr++ = (*ri).id - 1; } } else { - while (ptr >= endptr) { - *ptr-- = r.top().id; - r.pop(); + for (auto ri = rs.begin(); ri != rs.end(); ++ri) { + *ptr++ = (*ri).id; } } - return ids; } py::list results; - NGT::ObjectDistances r; - r.moveFrom(sc.getWorkingResult()); if (zeroNumbering) { - for (auto ri = r.begin(); ri != r.end(); ++ri) { + for (auto ri = rs.begin(); ri != rs.end(); ++ri) { results.append(py::make_tuple((*ri).id - 1, (*ri).distance)); } } else { - for (auto ri = r.begin(); ri != r.end(); ++ri) { + for (auto ri = rs.begin(); ri != rs.end(); ++ri) { results.append(py::make_tuple((*ri).id, (*ri).distance)); } } @@ -287,6 +284,139 @@ class Index : public NGT::Index { size_t numOfDistanceComputations; }; +class Optimizer { +public: + Optimizer() { + numOfOutgoingEdges = 10; + numOfIncomingEdges= 120; + baseAccuracyRange = pair(0.30, 0.50); + rateAccuracyRange = pair(0.80, 0.90); + numOfQueries = 100; + gtEpsilon = 0.1; + mergin = 0.2; + } + + void adjustSearchCoefficients(const string indexPath){ + NGT::Index index(indexPath); + NGT::GraphIndex &graph = static_cast(index.getIndex()); + NGT::Optimizer optimizer(index); + try { + auto coefficients = optimizer.adjustSearchEdgeSize(baseAccuracyRange, rateAccuracyRange, numOfQueries, gtEpsilon, mergin); + NGT::NeighborhoodGraph::Property &prop = graph.getGraphProperty(); + prop.dynamicEdgeSizeBase = coefficients.first; + prop.dynamicEdgeSizeRate = coefficients.second; + } catch(NGT::Exception &err) { + stringstream msg; + msg << "Optimizer::adjustSearchCoefficients: Cannot adjust the search coefficients. " << err.what(); + NGTThrowException(msg); + } + graph.saveIndex(indexPath); + } + + void execute( + const string inIndexPath, + const string outIndexPath + ){ + if ((numOfOutgoingEdges < 0 && numOfIncomingEdges >= 0) || + (numOfOutgoingEdges >= 0 && numOfIncomingEdges < 0)) { + NGTThrowException("Optimizer::execute: Specified any of the number of edges is invalid."); + } +#if defined(NGT_SHARED_MEMORY_ALLOCATOR) + if (access(outIndexPath.c_str(), 0) == 0) { + stringstream msg; + msg << "Optimizer::execute: The specified index exists. " << outIndexPath; + NGTThrowException(msg); + } + const string com = "cp -r " + inIndexPath + " " + outIndexPath; + system(com.c_str()); + NGT::Index outIndex(outIndexPath); +#else + NGT::Index outIndex(inIndexPath); +#endif + cerr << "Optimizer::execute: Extract the graph data." << endl; + // extract only edges from the index to reduce the memory usage. + NGT::GraphIndex &outGraph = static_cast(outIndex.getIndex()); + NGT::Timer timer; + timer.start(); + vector graph; + NGT::GraphReconstructor::extractGraph(graph, outIndex); + + if (numOfOutgoingEdges >= 0) { + NGT::GraphReconstructor::convertToANNG(graph); + NGT::GraphReconstructor::reconstructGraph(graph, outIndex, numOfOutgoingEdges, numOfIncomingEdges); + } + timer.stop(); + cerr << "Optimizer::execute: Graph reconstruction time=" << timer.time << " (sec) " << endl; + timer.reset(); + timer.start(); + NGT::GraphReconstructor::adjustPathsEffectively(outIndex); + timer.stop(); + cerr << "Optimizer::execute: Path adjustment time=" << timer.time << " (sec) " << endl; + + NGT::Optimizer optimizer(outIndex); + try { + auto coefficients = optimizer.adjustSearchEdgeSize(baseAccuracyRange, rateAccuracyRange, numOfQueries, gtEpsilon, mergin); + NGT::NeighborhoodGraph::Property &prop = outGraph.getGraphProperty(); + prop.dynamicEdgeSizeBase = coefficients.first; + prop.dynamicEdgeSizeRate = coefficients.second; + } catch(NGT::Exception &err) { + stringstream msg; + msg << "Optimizer::execute: Cannot adjust the search coefficients. " << err.what(); + NGTThrowException(msg); + } + + outGraph.saveIndex(outIndexPath); + + } + + void set(int outgoing, int incoming, int nofqs, + float baseAccuracyFrom, float baseAccuracyTo, + float rateAccuracyFrom, float rateAccuracyTo, + double qte, double m + ) { + if (outgoing >= 0) { + numOfOutgoingEdges = outgoing; + } + if (incoming >= 0) { + numOfIncomingEdges = incoming; + } + if (nofqs > 0) { + numOfQueries = nofqs; + } + auto range = baseAccuracyRange; + if (baseAccuracyFrom > 0.0) { + range.first = baseAccuracyFrom; + } + if (baseAccuracyTo > 0.0) { + range.second = baseAccuracyTo; + } + baseAccuracyRange = range; + range = rateAccuracyRange; + if (rateAccuracyFrom > 0.0) { + range.first = rateAccuracyFrom; + } + if (rateAccuracyTo > 0.0) { + range.second = rateAccuracyTo; + } + rateAccuracyRange = range; + if (qte != DBL_MIN) { + gtEpsilon = qte; + } + if (m > 0.0) { + mergin = m; + } + } + + size_t numOfOutgoingEdges; + size_t numOfIncomingEdges; + pair baseAccuracyRange; + pair rateAccuracyRange; + size_t numOfQueries; + double gtEpsilon; + double mergin; + +}; + PYBIND11_MODULE(ngtpy, m) { m.doc() = "ngt python"; @@ -330,5 +460,24 @@ PYBIND11_MODULE(ngtpy, m) { .def("insert", &::Index::insert, py::arg("object"), py::arg("debug") = false); + + py::class_(m, "Optimizer") + .def(py::init<>()) + .def("execute", &::Optimizer::execute, + py::arg("in_index_path"), + py::arg("out_index_path")) + .def("adjust_search_coefficients", &::Optimizer::adjustSearchCoefficients, + py::arg("index_path")) + .def("set", &::Optimizer::set, + py::arg("num_of_outgoings") = -1, + py::arg("num_of_incomings") = -1, + py::arg("num_of_queries") = -1, + py::arg("low_accuracy_from") = -1.0, + py::arg("low_accuracy_to") = -1.0, + py::arg("high_accuracy_from") = -1.0, + py::arg("high_accuracy_to") = -1.0, + py::arg("gt_epsilon") = DBL_MIN, + py::arg("merge") = -1.0 + ); }