diff --git a/CMakeLists.txt b/CMakeLists.txt index feb0f98..fb3eb6a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3,7 +3,7 @@ cmake_minimum_required(VERSION 3.1) project( ngt ) set(ngt_VERSION_MAJOR 1 ) -set(ngt_VERSION_MINOR 0 ) +set(ngt_VERSION_MINOR 1 ) set(ngt_VERSION_PATCH 0 ) set( ngt_VERSION ${ngt_VERSION_MAJOR}.${ngt_VERSION_MINOR}.${ngt_VERSION_PATCH} ) @@ -15,7 +15,7 @@ endif (NOT CMAKE_BUILD_TYPE) message(STATUS "CMAKE_BUILD_TYPE: ${CMAKE_BUILD_TYPE}") if( ${UNIX} ) - option(WALL "enable all warnings" ON) + option(WALL "enable all warnings" ON) if( ${WALL} ) add_compile_options(-Wall) endif() @@ -24,6 +24,7 @@ if( ${UNIX} ) set(CMAKE_CXX_STANDARD 11) # for std::unordered_set, std::unique_ptr set(CMAKE_CXX_STANDARD_REQUIRED ON) + add_subdirectory("${PROJECT_SOURCE_DIR}/lib") add_subdirectory("${PROJECT_SOURCE_DIR}/bin") endif( ${UNIX} ) diff --git a/README.jp b/README.jp index 1c76164..f70e763 100644 --- a/README.jp +++ b/README.jp @@ -288,5 +288,14 @@ ngt コマンド使用例 http://www.apache.org/licenses/LICENSE-2.0 +貢献者ライセンス同意(CLA) + + 本ソフトウェアへのソースコードのご提供者は以下の貢献者ライセンスに同意して頂きます。 + + https://gist.github.com/ydnjp/3095832f100d5c3d2592 + + なお、GitHub (https://github.com/yahoojapan/NGT) へのご提供の場合のみ、個別の同意書面なしに、 + 上記貢献者ライセンスに同意して頂いたと見なしますので、ご注意ください。 + --- -Copyright (C) 2015-2016 Yahoo! JAPAN Research +Copyright (C) 2015-2017 Yahoo! JAPAN Research diff --git a/README.md b/README.md index e2a4332..4322ed8 100644 --- a/README.md +++ b/README.md @@ -35,7 +35,7 @@ Note: Since there is no lock function, the index should be used only for referen License ------- -Copyright (C) 2015-2016 Yahoo Japan Corporation +Copyright (C) 2015-2017 Yahoo Japan Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this software except in compliance with the License. @@ -49,6 +49,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. +Contributor License Agreement +----------------------------- + +This project requires contributors to agree to a [Contributor License Agreement (CLA)](https://gist.github.com/ydnjp/3095832f100d5c3d2592). + +Note that only for contributions to the NGT repository on the GitHub (https://github.com/yahoojapan/NGT), the contributors of them shall be deemed to have agreed to the CLA without individual written agreements. + Publications ------------ @@ -63,5 +70,5 @@ Publications ##### ANNG - Iwasaki, M.: Proximity search in metric spaces using approximate k nearest neigh-bor graph (in Japanese). IPSJ Trans. on Database 3(1) (2010) 18-28. ([pdf](http://i.yimg.jp/i/docs/research_lab/articles/miwasaki-ipsj-tod-2010.pdf)) -Copyright © 2015-2016 Yahoo Japan Corporation All Rights Reserved. +Copyright © 2015-2017 Yahoo Japan Corporation All Rights Reserved. diff --git a/bin/ngt/Command.h b/bin/ngt/Command.h new file mode 100644 index 0000000..f94d2a9 --- /dev/null +++ b/bin/ngt/Command.h @@ -0,0 +1,925 @@ +// +// Copyright (C) 2015-2017 Yahoo Japan Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#include "NGT/Index.h" + + +namespace NGT { + +static float roundFloat(float f, int digit) +{ + return roundf(f * pow(10.0, digit)) / pow(10.0, digit); +} + +class Args : public map +{ +public: + Args(int argc, char **argv): + option("a:b:c:d:e:f:g:hi:j:k:l:m:n:o:p:q:r:s:t:u:v:w:x:y:z:" + "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z:") + { + int opt; + while ((opt = getopt(argc, argv, option)) != -1) { + if ((char)opt == 'h') { + string str; + str.append(1, (char)opt); + insert(pair(str, "")); + continue; + } + string str; + str.append(1, (char)opt); + insert(pair(str, string(optarg))); + } + for (int i = 0; optind < argc; optind++, i++) { + stringstream ss; + ss << "#" << i; + insert(pair(ss.str(), string(argv[optind]))); + } + } + string &find(const char *s) { return get(s); } + char getChar(const char *s, char v) { + try { + return get(s)[0]; + } catch (...) { + return v; + } + } + string getString(const char *s, const char *v) { + try { + return get(s); + } catch (...) { + return v; + } + } + string &get(const char *s) { + Args::iterator ai; + ai = map::find(string(s)); + if (ai == this->end()) { + stringstream msg; + msg << s << ": Not specified" << endl; + NGTThrowException(msg.str()); + } + return ai->second; + } + long getl(const char *s, long v) { + char *e; + long val; + try { + val = strtol(get(s).c_str(), &e, 10); + } catch (...) { + return v; + } + if (*e != 0) { + stringstream msg; + msg << "ARGS::getl: Illegal string. Option=-" << s << " Specified value=" << get(s) + << " Illegal string=" << e << endl; + NGTThrowException(msg.str()); + } + return val; + } + float getf(const char *s, float v) { + char *e; + float val; + try { + val = strtof(get(s).c_str(), &e); + } catch (...) { + return v; + } + if (*e != 0) { + stringstream msg; + msg << "ARGS::getf: Illegal string. 
Option=-" << s << " Specified value=" << get(s) + << " Illegal string=" << e << endl; + NGTThrowException(msg.str()); + } + return val; + } + const char *option; +}; + + +class Command { +public: + Command():debugLevel(0) {} + + void + create(Args &args) + { + const string usage = "Usage: ngt create " + "-d dimension [-p #-of-thread] [-i index-type(t|g)] [-g graph-type(a|k|b)] " + "[-t truncation-edge-limit] [-E edge-size] [-S edge-size-for-search] [-L edge-size-limit] " + "[-e epsilon] [-o object-type(f|c)] [-D distance-function] [-n data-size] " + "index(output) data.tsv(input)"; + string database; + try { + database = args.get("#1"); + } catch (...) { + cerr << "ngt: Error: DB is not specified." << endl; + cerr << usage << endl; + return; + } + string data; + try { + data = args.get("#2"); + } catch (...) { + cerr << "ngt: Error: Data is not specified." << endl; + } + + NGT::Property property; + + property.edgeSizeForCreation = args.getl("E", 10); + property.edgeSizeForSearch = args.getl("S", 40); + property.batchSizeForCreation = args.getl("b", 200); + property.insertionRadiusCoefficient = args.getf("e", 0.1) + 1.0; + property.truncationThreshold = args.getl("t", 0); + property.dimension = args.getl("d", 0); + property.threadPoolSize = args.getl("p", 24); + + if (property.dimension <= 0) { + cerr << "ngt: Error: Specify greater than 0 for # of your data dimension by a parameter -d." << endl; + cerr << usage << endl; + return; + } + + char graphType = args.getChar("g", 'a'); + switch(graphType) { + case 'a': property.graphType = NGT::Property::GraphType::GraphTypeANNG; break; + case 'k': property.graphType = NGT::Property::GraphType::GraphTypeKNNG; break; + case 'b': property.graphType = NGT::Property::GraphType::GraphTypeBKNNG; break; + case 'd': property.graphType = NGT::Property::GraphType::GraphTypeDNNG; break; + default: + cerr << "ngt: Error: Invalid graph type. 
" << graphType << endl; + cerr << usage << endl; + return; + } + + char seedType = args.getChar("s", 'r'); + switch(seedType) { + case 'f': property.seedType = NGT::Property::SeedType::SeedTypeFixedNodes; break; + case '1': property.seedType = NGT::Property::SeedType::SeedTypeFirstNode; break; + default: + case 'r': property.seedType = NGT::Property::SeedType::SeedTypeRandomNodes; break; + } + + char objectType = args.getChar("o", 'f'); + char distanceType = args.getChar("D", '2'); + + size_t dataSize = args.getl("n", 0); + char indexType = args.getChar("i", 't'); + + if (debugLevel >= 1) { + cerr << "edgeSizeForCreation=" << property.edgeSizeForCreation << endl; + cerr << "edgeSizeForSearch=" << property.edgeSizeForSearch << endl; + cerr << "edgeSizeLimit=" << property.edgeSizeLimitForCreation << endl; + cerr << "batch size=" << property.batchSizeForCreation << endl; + cerr << "graphType=" << property.graphType << endl; + cerr << "epsilon=" << property.insertionRadiusCoefficient - 1.0 << endl; + cerr << "thread size=" << property.threadPoolSize << endl; + cerr << "dimension=" << property.dimension << endl; + cerr << "indexType=" << indexType << endl; + } + + switch (objectType) { + case 'f': + property.objectType = NGT::Index::Property::ObjectType::Float; + break; + case 'c': + property.objectType = NGT::Index::Property::ObjectType::Uint8; + break; + default: + cerr << "ngt: Error: Invalid object type. 
" << objectType << endl; + cerr << usage << endl; + return; + } + + switch (distanceType) { + case '1': + property.distanceType = NGT::Index::Property::DistanceType::DistanceTypeL1; + break; + case '2': + property.distanceType = NGT::Index::Property::DistanceType::DistanceTypeL2; + break; + case 'a': + property.distanceType = NGT::Index::Property::DistanceType::DistanceTypeAngle; + break; + case 'h': + property.distanceType = NGT::Index::Property::DistanceType::DistanceTypeHamming; + break; + case 'c': + property.distanceType = NGT::Index::Property::DistanceType::DistanceTypeCosine; + break; + default: + cerr << "ngt: Error: Invalid distance type. " << distanceType << endl; + cerr << usage << endl; + return; + } + + switch (indexType) { + case 't': + NGT::Index::createGraphAndTree(database, property, data, dataSize); + break; + case 'g': + NGT::Index::createGraph(database, property, data, dataSize); + break; + } + } + + void + append(Args &args) + { + const string usage = "Usage: ngt append [-p #-of-thread] [-d dimension] [-n data-size] " + "index(output) data.tsv(input)"; + string database; + try { + database = args.get("#1"); + } catch (...) { + cerr << "ngt: Error: DB is not specified." << endl; + cerr << usage << endl; + return; + } + string data; + try { + data = args.get("#2"); + } catch (...) { + cerr << "ngt: Error: Data is not specified." << endl; + cerr << usage << endl; + return; + } + + int threadSize = args.getl("p", 50); + size_t dimension = args.getl("d", 0); + size_t dataSize = args.getl("n", 0); + + if (debugLevel >= 1) { + cerr << "thread size=" << threadSize << endl; + cerr << "dimension=" << dimension << endl; + } + + try { + NGT::Index::append(database, data, threadSize, dataSize); + } catch (NGT::Exception &err) { + cerr << "ngt: Error " << err.what() << endl; + cerr << usage << endl; + } catch (...) 
{ + cerr << "ngt: Error" << endl; + cerr << usage << endl; + } + } + + class SearchParameter { + public: + SearchParameter() {} + SearchParameter(Args &args) { + try { + query = args.get("#2"); + } catch (...) { + NGTThrowException("ngt: Error: Query is not specified"); + } + querySize = args.getl("Q", 0); + indexType = args.getChar("i", 't'); + size = args.getl("n", 20); + // edgeSize + // -1 : using the size which was specified at the index creation. + // 0 : no limitation for the edge size. + edgeSize = args.getl("E", -1); + outputMode = args.getString("o", "-"); + radius = args.getf("r", FLT_MAX); + trial = args.getl("t", 1); + { + beginOfEpsilon = endOfEpsilon = stepOfEpsilon = 0.1; + string epsilon = args.getString("e", "0.1"); + vector tokens; + NGT::Common::tokenize(epsilon, tokens, ":"); + if (tokens.size() >= 1) { beginOfEpsilon = endOfEpsilon = NGT::Common::strtod(tokens[0]); } + if (tokens.size() >= 2) { endOfEpsilon = NGT::Common::strtod(tokens[1]); } + if (tokens.size() >= 3) { stepOfEpsilon = NGT::Common::strtod(tokens[2]); } + step = 0; + if (tokens.size() >= 4) { step = NGT::Common::strtol(tokens[3]); } + } + } + string query; + size_t querySize; + char indexType; + int size; + long edgeSize; + string outputMode; + float radius; + double beginOfEpsilon; + double endOfEpsilon; + double stepOfEpsilon; + size_t step; + size_t trial; + }; + + static void + search(NGT::Index &index, SearchParameter &searchParameter, ostream &stream) + { + + if (searchParameter.outputMode[0] == 'e') { + stream << "# Beginning of Evaluation" << endl; + } + + ifstream is(searchParameter.query); + if (!is) { + cerr << "Cannot open the specified file. 
" << searchParameter.query << endl; + return; + } + string line; + double totalTime = 0; + size_t queryCount = 0; + while(getline(is, line)) { + if (searchParameter.querySize > 0 && queryCount >= searchParameter.querySize) { + break; + } + NGT::Object *object = index.allocateObject(line, " \t"); + queryCount++; + size_t step = searchParameter.step == 0 ? UINT_MAX : searchParameter.step; + for (size_t n = 0; n <= step; n++) { + double epsilon; + if (searchParameter.step != 0) { + epsilon = searchParameter.beginOfEpsilon + (searchParameter.endOfEpsilon - searchParameter.beginOfEpsilon) * n / step; + } else { + epsilon = searchParameter.beginOfEpsilon + searchParameter.stepOfEpsilon * n; + if (epsilon > searchParameter.endOfEpsilon) { + break; + } + } + //epsilon = roundFloat(epsilon, 6); + NGT::SearchContainer sc(*object); + NGT::ObjectDistances objects; + sc.setResults(&objects); + sc.setSize(searchParameter.size); + sc.setRadius(searchParameter.radius); + sc.setEpsilon(epsilon); + NGT::Timer timer; + try { + if (searchParameter.outputMode[0] == 'e') { + double time = 0.0; + uint64_t ntime = 0; + double minTime = DBL_MAX; + size_t trial = searchParameter.trial <= 1 ? 
2 : searchParameter.trial; + for (size_t t = 0; t < trial; t++) { + switch (searchParameter.indexType) { + case 't': timer.start(); index.search(sc); timer.stop(); break; + case 'g': timer.start(); index.searchUsingOnlyGraph(sc); timer.stop(); break; + case 's': timer.start(); index.linearSearch(sc); timer.stop(); break; + } + if (minTime > timer.time) { + minTime = timer.time; + } + time += timer.time; + ntime += timer.ntime; + } + time /= (double)trial; // average over the trials actually executed (trial is forced to be at least 2) + ntime /= trial; // the requested searchParameter.trial may be 0 or 1, so it must not be used as the divisor + timer.time = minTime; + timer.ntime = ntime; + } else { + switch (searchParameter.indexType) { + case 't': timer.start(); index.search(sc); timer.stop(); break; + case 'g': timer.start(); index.searchUsingOnlyGraph(sc); timer.stop(); break; + case 's': timer.start(); index.linearSearch(sc); timer.stop(); break; + } + } + } catch (NGT::Exception &err) { + throw err; + } + totalTime += timer.time; + if (searchParameter.outputMode[0] == 'e') { + stream << "# Query No.=" << queryCount << endl; + stream << "# Query=" << line.substr(0, 20) + " ..." << endl; + stream << "# Index Type=" << searchParameter.indexType << endl; + stream << "# Size=" << searchParameter.size << endl; + stream << "# Radius=" << searchParameter.radius << endl; + stream << "# Epsilon=" << epsilon << endl; + stream << "# Query Time (msec)=" << timer.time * 1000.0 << endl; + } else { + stream << "Query No." 
<< queryCount << endl; + stream << "Rank\tID\tDistance" << endl; + } + for (size_t i = 0; i < objects.size(); i++) { + stream << i + 1 << "\t" << objects[i].id << "\t"; + stream << objects[i].distance << endl; + } + if (searchParameter.outputMode[0] == 'e') { + stream << "# End of Search" << endl; + } else { + stream << "Query Time= " << timer.time << " (sec), " << timer.time * 1000.0 << " (msec)" << endl; + } + } // for + index.deleteObject(object); + if (searchParameter.outputMode[0] == 'e') { + stream << "# End of Query" << endl; + } + } // while + if (searchParameter.outputMode[0] == 'e') { + stream << "# Average Query Time (msec)=" << totalTime * 1000.0 / (double)queryCount << endl; + stream << "# Number of queries=" << queryCount << endl; + stream << "# End of Evaluation" << endl; + + if (searchParameter.outputMode == "e+") { + // show graph information + size_t esize = searchParameter.edgeSize; + long double distance = 0.0; + size_t numberOfNodes = 0; + size_t numberOfEdges = 0; + + NGT::GraphIndex &graph = (NGT::GraphIndex&)index.getIndex(); + for (size_t id = 1; id < graph.repository.size(); id++) { + NGT::GraphNode *node = 0; + try { + node = graph.getNode(id); + } catch(NGT::Exception &err) { + cerr << "Graph::search: Warning. Cannot get the node. ID=" << id << ":" << err.what() << " If the node was removed, no problem." 
<< endl; + continue; + } + numberOfNodes++; + if (numberOfNodes % 1000000 == 0) { + cerr << "Processed " << numberOfNodes << endl; + } + for (size_t i = 0; i < node->size(); i++) { + if (esize != 0 && i >= esize) { + break; + } + numberOfEdges++; +#if defined(NGT_SHARED_MEMORY_ALLOCATOR) + distance += (*node).at(i, graph.repository.allocator).distance; +#else + distance += (*node)[i].distance; +#endif + } + } + + stream << "# # of nodes=" << numberOfNodes << endl; + stream << "# # of edges=" << numberOfEdges << endl; + stream << "# Average number of edges=" << (double)numberOfEdges / (double)numberOfNodes << endl; + stream << "# Average distance of edges=" << setprecision(10) << distance / (double)numberOfEdges << endl; + } + } else { + stream << "Average Query Time= " << totalTime / (double)queryCount << " (sec), " + << totalTime * 1000.0 / (double)queryCount << " (msec), (" + << totalTime << "/" << queryCount << ")" << endl; + } + } + + void + search(Args &args) { + const string usage = "Usage: ngt search [-i g|t|s] [-n result-size] [-e epsilon] [-E edge-size] [-o output-mode] index(input) query.tsv(input)"; + + string database; + try { + database = args.get("#1"); + } catch (...) 
{ + cerr << "ngt: Error: DB is not specified" << endl; + cerr << usage << endl; + return; + } + + SearchParameter searchParameter(args); + + if (debugLevel >= 1) { + cerr << "indexType=" << searchParameter.indexType << endl; + cerr << "size=" << searchParameter.size << endl; + cerr << "edgeSize=" << searchParameter.edgeSize << endl; + cerr << "epsilon=" << searchParameter.beginOfEpsilon << "<->" << searchParameter.endOfEpsilon << "," + << searchParameter.stepOfEpsilon << endl; + } + + try { + NGT::Property property; + property.clear(); + if (searchParameter.edgeSize >= 0) { + property.edgeSizeForSearch = searchParameter.edgeSize; + } + NGT::Index index(database, property); + search(index, searchParameter, cout); + } catch (NGT::Exception &err) { + cerr << "ngt: Error " << err.what() << endl; + cerr << usage << endl; + } catch (...) { + cerr << "ngt: Error" << endl; + cerr << usage << endl; + } + + } + + + void + remove(Args &args) + { + const string usage = "Usage: ngt remove [-d object-ID-type(f|d)] index(input) object-ID(input)"; + string database; + try { + database = args.get("#1"); + } catch (...) { + cerr << "ngt: Error: DB is not specified" << endl; + cerr << usage << endl; + return; + } + try { + args.get("#2"); + } catch (...) { + cerr << "ngt: Error: ID is not specified" << endl; + cerr << usage << endl; + return; + } + char dataType = args.getChar("d", 'f'); + if (debugLevel >= 1) { + cerr << "dataType=" << dataType << endl; + } + + try { + vector objects; + if (dataType == 'f') { + string ids; + try { + ids = args.get("#2"); + } catch (...) { + cerr << "ngt: Error: Data file is not specified" << endl; + cerr << usage << endl; + return; + } + ifstream is(ids); + if (!is) { + cerr << "ngt: Error: Cannot open the specified file. 
" << ids << endl; + cerr << usage << endl; + return; + } + string line; + int count = 0; + while(getline(is, line)) { + count++; + vector tokens; + NGT::Common::tokenize(line, tokens, "\t "); + if (tokens.size() == 0 || tokens[0].size() == 0) { + continue; + } + char *e; + size_t id; + try { + id = strtol(tokens[0].c_str(), &e, 10); + objects.push_back(id); + } catch (...) { + cerr << "Illegal data. " << tokens[0] << endl; + } + if (*e != 0) { + cerr << "Illegal data. " << e << endl; + } + cerr << "removed ID=" << id << endl; + } + } else { + size_t id = args.getl("#2", 0); + cerr << "removed ID=" << id << endl; + objects.push_back(id); + } + NGT::Index::remove(database, objects); + } catch (NGT::Exception &err) { + cerr << "ngt: Error " << err.what() << endl; + cerr << usage << endl; + } catch (...) { + cerr << "ngt: Error" << endl; + cerr << usage << endl; + } + } + + void + exportIndex(Args &args) + { + const string usage = "Usage: ngt export index(input) export-file(output)"; + string database; + try { + database = args.get("#1"); + } catch (...) { + cerr << "ngt: Error: DB is not specified" << endl; + cerr << usage << endl; + return; + } + string exportFile; + try { + exportFile = args.get("#2"); + } catch (...) { + cerr << "ngt: Error: ID is not specified" << endl; + cerr << usage << endl; + return; + } + try { + NGT::Index::exportIndex(database, exportFile); + } catch (NGT::Exception &err) { + cerr << "ngt: Error " << err.what() << endl; + cerr << usage << endl; + } catch (...) { + cerr << "ngt: Error" << endl; + cerr << usage << endl; + } + } + + void + importIndex(Args &args) + { + const string usage = "Usage: ngt import index(output) import-file(input)"; + string database; + try { + database = args.get("#1"); + } catch (...) { + cerr << "ngt: Error: DB is not specified" << endl; + cerr << usage << endl; + return; + } + string importFile; + try { + importFile = args.get("#2"); + } catch (...) 
{ + cerr << "ngt: Error: ID is not specified" << endl; + cerr << usage << endl; + return; + } + + try { + NGT::Index::importIndex(database, importFile); + } catch (NGT::Exception &err) { + cerr << "ngt: Error " << err.what() << endl; + cerr << usage << endl; + } catch (...) { + cerr << "ngt: Error" << endl; + cerr << usage << endl; + } + + } + + void + prune(Args &args) + { + const string usage = "Usage: ngt prune -e #-of-forcedly-pruned-edges -s #-of-selecively-pruned-edge"; + string indexName; + try { + indexName = args.get("#1"); + } catch (...) { + cerr << "Index is not specified" << endl; + cerr << usage << endl; + return; + } + + // the number of forcedly pruned edges + size_t forcedlyPrunedEdgeSize = args.getl("e", 0); + // the number of selectively pruned edges + size_t selectivelyPrunedEdgeSize = args.getl("s", 0); + + cerr << "forcedly pruned edge size=" << forcedlyPrunedEdgeSize << endl; + cerr << "selectively pruned edge size=" << selectivelyPrunedEdgeSize << endl; + + if (selectivelyPrunedEdgeSize == 0 && forcedlyPrunedEdgeSize == 0) { + cerr << "prune: Error! Either of selective edge size or remaining edge size should be specified." << endl; + cerr << usage << endl; + return; + } + + if (forcedlyPrunedEdgeSize != 0 && selectivelyPrunedEdgeSize != 0 && selectivelyPrunedEdgeSize >= forcedlyPrunedEdgeSize) { + cerr << "prune: Error! selective edge size is less than remaining edge size." << endl; + cerr << usage << endl; + return; + } + + NGT::Index index(indexName); + cerr << "loaded the input index." 
<< endl; + + NGT::GraphIndex &graph = (NGT::GraphIndex&)index.getIndex(); + + for (size_t id = 1; id < graph.repository.size(); id++) { + try { + NGT::GraphNode &node = *graph.getNode(id); + if (id % 1000000 == 0) { + cerr << "Processed " << id << endl; + } + if (forcedlyPrunedEdgeSize > 0 && node.size() >= forcedlyPrunedEdgeSize) { +#ifdef NGT_SHARED_MEMORY_ALLOCATOR + node.resize(forcedlyPrunedEdgeSize, graph.repository.allocator); +#else + node.resize(forcedlyPrunedEdgeSize); +#endif + } + if (selectivelyPrunedEdgeSize > 0 && node.size() >= selectivelyPrunedEdgeSize) { +#ifdef NGT_SHARED_MEMORY_ALLOCATOR + cerr << "not implemented" << endl; + abort(); +#else + size_t rank = 0; + for (NGT::GraphNode::iterator i = node.begin(); i != node.end(); ++rank) { + //for (size_t i = 0; i < node.size(); ++i) { + if (rank >= selectivelyPrunedEdgeSize) { + bool found = false; + for (size_t t1 = 0; t1 < node.size() && found == false; ++t1) { + if (t1 >= selectivelyPrunedEdgeSize) { + break; + } + if (rank == t1) { + continue; + } + NGT::GraphNode &node2 = *graph.getNode(node[t1].id); + for (size_t t2 = 0; t2 < node2.size(); ++t2) { + if (t2 >= selectivelyPrunedEdgeSize) { + break; + } + if (node2[t2].id == (*i).id) { + found = true; + break; + } + } // for + } // for + if (found) { + //remove + i = node.erase(i); + continue; + } + } + i++; + } // for +#endif + } + + } catch(NGT::Exception &err) { + cerr << "Graph::search: Warning. Cannot get the node. 
ID=" << id << ":" << err.what() << endl; + continue; + } + } + + graph.saveIndex(indexName); + + } + + + static void showStatisticsOfGraph(NGT::GraphIndex &outGraph, char mode = '-', size_t edgeSize = UINT_MAX) + { + long double distance = 0.0; + size_t numberOfNodes = 0; + size_t numberOfOutdegree = 0; + size_t numberOfNodesWithoutEdges = 0; + size_t maxNumberOfOutdegree = 0; + size_t minNumberOfOutdegree = SIZE_MAX; + vector indegreeCount; + vector outdegreeHistogram; + vector indegreeHistogram; + NGT::GraphRepository &graph = outGraph.repository; + indegreeCount.resize(graph.size(), 0); + for (size_t id = 1; id < graph.size(); id++) { + NGT::GraphNode *node = 0; + try { + node = outGraph.getNode(id); + } catch(NGT::Exception &err) { + cerr << "ngt info: Warning. Cannot get the node. ID=" << id << ":" << err.what() << endl; + continue; + } + numberOfNodes++; + if (numberOfNodes % 1000000 == 0) { + cerr << "Processed " << numberOfNodes << endl; + } + size_t esize = node->size() > edgeSize ? edgeSize : node->size(); + if (esize == 0) { + numberOfNodesWithoutEdges++; + } + if (esize > maxNumberOfOutdegree) { + maxNumberOfOutdegree = esize; + } + if (esize < minNumberOfOutdegree) { + minNumberOfOutdegree = esize; + } + if (outdegreeHistogram.size() <= esize) { + outdegreeHistogram.resize(esize + 1); + } + outdegreeHistogram[esize]++; + if (mode == 'e') { + cout << id << "," << esize << ": "; + } + for (size_t i = 0; i < esize; i++) { +#if defined(NGT_SHARED_MEMORY_ALLOCATOR) + NGT::ObjectDistance &n = (*node).at(i, graph.allocator); +#else + NGT::ObjectDistance &n = (*node)[i]; +#endif + if (n.id == 0) { + cerr << "ngt info: Warning. id is zero." 
<< endl; + } + indegreeCount[n.id]++; + numberOfOutdegree++; + double d = n.distance; + if (mode == 'e') { + cout << d << " "; + } + distance += d; + } + if (mode == 'e') { + cout << endl; + } + } + + // calculate variance + double averageNumberOfOutdegree = (double)numberOfOutdegree / (double)numberOfNodes; + double sumOfSquareOfOutdegree = 0; + double sumOfSquareOfIndegree = 0; + for (size_t id = 1; id < graph.size(); id++) { + NGT::GraphNode *node = 0; + try { + node = outGraph.getNode(id); + } catch(NGT::Exception &err) { + cerr << "ngt info: Warning. Cannot get the node. ID=" << id << ":" << err.what() << endl; + continue; + } + size_t esize = node->size(); + sumOfSquareOfOutdegree += ((double)esize - averageNumberOfOutdegree) * ((double)esize - averageNumberOfOutdegree); + sumOfSquareOfIndegree += ((double)indegreeCount[id] - averageNumberOfOutdegree) * ((double)indegreeCount[id] - averageNumberOfOutdegree); + } + + size_t numberOfNodesWithoutIndegree = 0; + size_t maxNumberOfIndegree = 0; + size_t minNumberOfIndegree = SIZE_MAX; + for (size_t id = 1; id < graph.size(); id++) { + if (indegreeCount[id] == 0) { + numberOfNodesWithoutIndegree++; + } + if (indegreeCount[id] > maxNumberOfIndegree) { + maxNumberOfIndegree = indegreeCount[id]; + } + if (indegreeCount[id] < minNumberOfIndegree) { + minNumberOfIndegree = indegreeCount[id]; + } + if (indegreeHistogram.size() <= indegreeCount[id]) { + indegreeHistogram.resize(indegreeCount[id] + 1); + } + indegreeHistogram[indegreeCount[id]]++; + } + cerr << "# of nodes=" << numberOfNodes << endl; + cerr << "# of edges=" << numberOfOutdegree << endl; + cerr << "# of nodes without edges=" << numberOfNodesWithoutEdges << endl; + cerr << "Max outdegree=" << maxNumberOfOutdegree << endl; + cerr << "Min outdegree=" << minNumberOfOutdegree << endl; + cerr << "Average number of edges=" << (double)numberOfOutdegree / (double)numberOfNodes << endl; + cerr << "Average distance of edges=" << setprecision(10) << distance / 
(double)numberOfOutdegree << endl; + cerr << "# of nodes where indegree is 0=" << numberOfNodesWithoutIndegree << endl; + cerr << "Max indegree=" << maxNumberOfIndegree << endl; + cerr << "Min indegree=" << minNumberOfIndegree << endl; + cerr << "max-out,min-out,v-out,max-in,min-in,v-in:" + << maxNumberOfOutdegree << ":" << minNumberOfOutdegree << ":" << sumOfSquareOfOutdegree / (double)numberOfOutdegree<< ":" + << maxNumberOfIndegree << ":" << minNumberOfIndegree << ":" << sumOfSquareOfIndegree / (double)numberOfOutdegree << endl; + + if (mode == 'h') { + cerr << "#\tout\tin" << endl; + for (size_t i = 0; i < outdegreeHistogram.size() || i < indegreeHistogram.size(); i++) { + size_t out = outdegreeHistogram.size() <= i ? 0 : outdegreeHistogram[i]; + size_t in = indegreeHistogram.size() <= i ? 0 : indegreeHistogram[i]; + cerr << i << "\t" << out << "\t" << in << endl; + } + } + } + + void + info(Args &args) + { + const string usage = "Usage: ngt info [-E #-of edges] [-g virtually-created-graph-type] [-m h|e] index"; + + string database; + try { + database = args.get("#1"); + } catch (...) { + cerr << "ngt: Error: DB is not specified" << endl; + cerr << usage << endl; + return; + } + + size_t edgeSize = args.getl("E", UINT_MAX); + char mode = args.getChar("m", '-'); + + try { + NGT::GraphIndex index(database); + showStatisticsOfGraph(index, mode, edgeSize); + } catch (NGT::Exception &err) { + cerr << "ngt: Error " << err.what() << endl; + cerr << usage << endl; + } catch (...) 
{ + cerr << "ngt: Error" << endl; + cerr << usage << endl; + } + } + + + ////////////////////////////////////////////////////////////// + + void setDebugLevel(int level) { debugLevel = level; } + int getDebugLevel() { return debugLevel; } + +protected: + int debugLevel; + +}; + +}; // NGT diff --git a/bin/ngt/README.md b/bin/ngt/README.md index fd39a99..d5c1ea7 100644 --- a/bin/ngt/README.md +++ b/bin/ngt/README.md @@ -18,13 +18,11 @@ Command **Note:** -The getopt function of the POSIX specification, all options must appear before positional arguments. -For example, the cygwin is one of the these environments. -In these environments, you must write command as below: +When the environment variable POSIXLY_CORRECT is set on some platforms such as Cygwin, you should specify options +before the command as follows. $ ngt [option] command index [data] - ### Description **ngt** provides high-speed nearest neighbor searches against a large volume of data (several million to several 10 million items of data) in high dimensional vector data space (several ten to several thousand dimensions). diff --git a/bin/ngt/ngt.cpp b/bin/ngt/ngt.cpp index 5169f34..f004a1f 100644 --- a/bin/ngt/ngt.cpp +++ b/bin/ngt/ngt.cpp @@ -1,5 +1,5 @@ // -// Copyright (C) 2015-2016 Yahoo Japan Corporation +// Copyright (C) 2015-2017 Yahoo Japan Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -14,864 +14,55 @@ // limitations under the License. 
// -#include +#include "Command.h" -#include "NGT/Index.h" - - -static float roundFloat(float f, int digit) -{ - return roundf(f * pow(10.0, digit)) / pow(10.0, digit); +void help() { + cerr << "Usage : ngt command database data" << endl; + cerr << " command : create search remove append export import" << endl; } -class Args : public map -{ -public: - Args(int argc, char **argv): - option("a:b:c:d:e:f:g:hi:j:k:l:m:n:o:p:q:r:s:t:u:v:w:x:y:z:" - "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z:") - { - int opt; - while ((opt = getopt(argc, argv, option)) != -1) { - if ((char)opt == 'h') { - string str; - str.append(1, (char)opt); - insert(pair(str, "")); - continue; - } - string str; - str.append(1, (char)opt); - insert(pair(str, string(optarg))); - } - for (int i = 0; optind < argc; optind++, i++) { - stringstream ss; - ss << "#" << i; - insert(pair(ss.str(), string(argv[optind]))); - } - } - string &find(const char *s) { return get(s); } - char getChar(const char *s, char v) { - try { - return get(s)[0]; - } catch (...) { - return v; - } - } - string getString(const char *s, const char *v) { - try { - return get(s); - } catch (...) { - return v; - } - } - string &get(const char *s) { - Args::iterator ai; - ai = map::find(string(s)); - if (ai == this->end()) { - stringstream msg; - msg << s << ": Not specified" << endl; - NGTThrowException(msg.str()); - } - return ai->second; - } - long getl(const char *s, long v) { - char *e; - long val; - try { - val = strtol(get(s).c_str(), &e, 10); - } catch (...) { - return v; - } - if (*e != 0) { - stringstream msg; - msg << "ARGS::getl: Illegal string. Option=-" << s << " Specified value=" << get(s) - << " Illegal string=" << e << endl; - NGTThrowException(msg.str()); - } - return val; - } - float getf(const char *s, float v) { - char *e; - float val; - try { - val = strtof(get(s).c_str(), &e); - } catch (...) { - return v; - } - if (*e != 0) { - stringstream msg; - msg << "ARGS::getf: Illegal string. 
Option=-" << s << " Specified value=" << get(s) - << " Illegal string=" << e << endl; - NGTThrowException(msg.str()); - } - return val; - } - const char *option; -}; - -class NGTCommand { -public: - NGTCommand():debugLevel(0) {} - - void - create(Args &args) - { - const string usage = "Usage: ngt create " - "-d dimension [-p #-of-thread] [-i index-type(t|g)] [-g graph-type(a|k|b)] " - "[-t truncation-edge-limit] [-E edge-size] [-S edge-size-for-search] [-L edge-size-limit] " - "[-e epsilon] [-o object-type(f|c)] [-D distance-function] [-n data-size] " - "index(output) data.tsv(input)"; - string database; - try { - database = args.get("#1"); - } catch (...) { - cerr << "ngt: Error: DB is not specified." << endl; - cerr << usage << endl; - return; - } - string data; - try { - data = args.get("#2"); - } catch (...) { - cerr << "ngt: Error: Data is not specified." << endl; - } - - NGT::Property property; - - property.edgeSizeForCreation = args.getl("E", 10); - property.edgeSizeForSearch = args.getl("S", 40); - property.batchSizeForCreation = args.getl("b", 200); - property.insertionRadiusCoefficient = args.getf("e", 0.1) + 1.0; - property.truncationThreshold = args.getl("t", 0); - property.dimension = args.getl("d", 0); - property.threadPoolSize = args.getl("p", 24); - - if (property.dimension <= 0) { - cerr << "ngt: Error: Specify greater than 0 for # of your data dimension by a parameter -d." << endl; - cerr << usage << endl; - return; - } - - char graphType = args.getChar("g", 'a'); - switch(graphType) { - case 'a': property.graphType = NGT::Property::GraphType::GraphTypeANNG; break; - case 'k': property.graphType = NGT::Property::GraphType::GraphTypeKNNG; break; - case 'b': property.graphType = NGT::Property::GraphType::GraphTypeBKNNG; break; - case 'd': property.graphType = NGT::Property::GraphType::GraphTypeDNNG; break; - default: - cerr << "ngt: Error: Invalid graph type. 
" << graphType << endl; - cerr << usage << endl; - return; - } - - char seedType = args.getChar("s", 'r'); - switch(seedType) { - case 'f': property.seedType = NGT::Property::SeedType::SeedTypeFixedNodes; break; - case '1': property.seedType = NGT::Property::SeedType::SeedTypeFirstNode; break; - default: - case 'r': property.seedType = NGT::Property::SeedType::SeedTypeRandomNodes; break; - } - - char objectType = args.getChar("o", 'f'); - char distanceType = args.getChar("D", '2'); - - size_t dataSize = args.getl("n", 0); - char indexType = args.getChar("i", 't'); - - if (debugLevel >= 1) { - cerr << "edgeSizeForCreation=" << property.edgeSizeForCreation << endl; - cerr << "edgeSizeForSearch=" << property.edgeSizeForSearch << endl; - cerr << "edgeSizeLimit=" << property.edgeSizeLimitForCreation << endl; - cerr << "batch size=" << property.batchSizeForCreation << endl; - cerr << "graphType=" << property.graphType << endl; - cerr << "epsilon=" << property.insertionRadiusCoefficient - 1.0 << endl; - cerr << "thread size=" << property.threadPoolSize << endl; - cerr << "dimension=" << property.dimension << endl; - cerr << "indexType=" << indexType << endl; - } - - switch (objectType) { - case 'f': - property.objectType = NGT::Index::Property::ObjectType::Float; - break; - case 'c': - property.objectType = NGT::Index::Property::ObjectType::Uint8; - break; - default: - cerr << "ngt: Error: Invalid object type. " << objectType << endl; - cerr << usage << endl; - return; - } - - switch (distanceType) { - case '1': - property.distanceType = NGT::Index::Property::DistanceType::DistanceTypeL1; - break; - case '2': - property.distanceType = NGT::Index::Property::DistanceType::DistanceTypeL2; - break; - case 'a': - property.distanceType = NGT::Index::Property::DistanceType::DistanceTypeAngle; - break; - case 'h': - property.distanceType = NGT::Index::Property::DistanceType::DistanceTypeHamming; - break; - default: - cerr << "ngt: Error: Invalid distance type. 
" << distanceType << endl; - cerr << usage << endl; - return; - } - - switch (indexType) { - case 't': - NGT::Index::createGraphAndTree(database, property, data, dataSize); - break; - case 'g': - NGT::Index::createGraph(database, property, data, dataSize); - break; - } - } - - void - append(Args &args) - { - const string usage = "Usage: ngt append [-p #-of-thread] [-d dimension] [-n data-size] " - "index(output) data.tsv(input)"; - string database; - try { - database = args.get("#1"); - } catch (...) { - cerr << "ngt: Error: DB is not specified." << endl; - cerr << usage << endl; - return; - } - string data; - try { - data = args.get("#2"); - } catch (...) { - cerr << "ngt: Error: Data is not specified." << endl; - cerr << usage << endl; - return; - } - - int threadSize = args.getl("p", 50); - size_t dimension = args.getl("d", 0); - size_t dataSize = args.getl("n", 0); - - if (debugLevel >= 1) { - cerr << "thread size=" << threadSize << endl; - cerr << "dimension=" << dimension << endl; - } - - try { - NGT::Index::append(database, data, threadSize, dataSize); - } catch (NGT::Exception &err) { - cerr << "ngt: Error " << err.what() << endl; - cerr << usage << endl; - } catch (...) { - cerr << "ngt: Error" << endl; - cerr << usage << endl; - } - } - - void - search(Args &args) - { - const string usage = "Usage: ngt search [-i g|t|s] [-n result-size] [-e epsilon] [-E edge-size] [-o output-mode] index(input) query.tsv(input)"; - string database; - try { - database = args.get("#1"); - } catch (...) { - cerr << "ngt: Error: DB is not specified" << endl; - cerr << usage << endl; - return; - } - string query; - try { - query = args.get("#2"); - } catch (...) 
{ - cerr << "ngt: Error: Query is not specified" << endl; - cerr << usage << endl; - return; - } - - char indexType = args.getChar("i", 't'); - int size = args.getl("n", 20); - size_t edgeSize = args.getl("E", 0); - char outputMode = args.getChar("o", '-'); - float radius = args.getf("r", FLT_MAX); - - float beginOfEpsilon, endOfEpsilon, stepOfEpsilon; - { - beginOfEpsilon = endOfEpsilon = stepOfEpsilon = 0.1; - string epsilon = args.getString("e", "0.1"); - vector tokens; - NGT::Common::tokenize(epsilon, tokens, ":"); - if (tokens.size() >= 1) { beginOfEpsilon = endOfEpsilon = NGT::Common::strtod(tokens[0]); } - if (tokens.size() >= 2) { endOfEpsilon = NGT::Common::strtod(tokens[1]); } - if (tokens.size() >= 3) { stepOfEpsilon = NGT::Common::strtod(tokens[2]); } - } - - if (debugLevel >= 1) { - cerr << "indexType=" << indexType << endl; - cerr << "size=" << size << endl; - cerr << "edgeSize=" << edgeSize << endl; - cerr << "epsilon=" << beginOfEpsilon << "<->" << endOfEpsilon << "," << stepOfEpsilon << endl; - } - - try { - NGT::Property property; - property.load(database); - if (edgeSize != 0) { - property.edgeSizeForSearch = edgeSize; - } - NGT::Index index(database, property); - ifstream is(query); - if (!is) { - cerr << "Cannot open the specified file. 
" << query << endl; - return; - } - if (outputMode == 's') { cout << "# Beginning of Evaluation" << endl; } - string line; - double totalTime = 0; - int queryCount = 0; - while(getline(is, line)) { - NGT::Object *object = index.allocateObject(line, " \t"); - queryCount++; - for (float epsilon = beginOfEpsilon; epsilon <= endOfEpsilon; epsilon += stepOfEpsilon) { - epsilon = roundFloat(epsilon, 6); - NGT::SearchContainer sc(*object); - NGT::ObjectDistances objects; - sc.setResults(&objects); - sc.setSize(size); - sc.setRadius(radius); - sc.setEpsilon(epsilon); - if (debugLevel >= 1) { - cerr << "size=" << sc.size << endl; - cerr << "explorationCoefficient=" << sc.explorationCoefficient << endl; - } - NGT::Timer timer; - try { - if (outputMode == 'e') { - switch (indexType) { - case 't': index.search(sc); break; - case 'g': index.searchUsingOnlyGraph(sc); break; - case 's': index.linearSearch(sc); break; - } - } - switch (indexType) { - case 't': timer.start(); index.search(sc); timer.stop(); break; - case 'g': timer.start(); index.searchUsingOnlyGraph(sc); timer.stop(); break; - case 's': timer.start(); index.linearSearch(sc); timer.stop(); break; - } - } catch (NGT::Exception &err) { - throw err; - } - totalTime += timer.time; - if (outputMode == 'e') { - cout << "# Query No.=" << queryCount << endl; - cout << "# Query=" << line.substr(0, 20) + " ..." << endl; - cout << "# Index Type=" << indexType << endl; - cout << "# Size=" << size << endl; - cout << "# Radius=" << radius << endl; - cout << "# Epsilon=" << epsilon << endl; - cout << "# Query Time (msec)=" << timer.time * 1000.0 << endl; - } else { - cout << "Query No." 
<< queryCount << endl; - cout << "Rank\tID\tDistance" << endl; - } - for (size_t i = 0; i < objects.size(); i++) { - cout << i + 1 << "\t" << objects[i].id << "\t"; - cout << objects[i].distance << endl; - } - if (outputMode == 'e') { - cout << "# End of Search" << endl; - } else { - cout << "Query Time= " << timer.time << " (sec), " << timer.time * 1000.0 << " (msec)" << endl; - } - } // for - if (outputMode == 'e') { - cout << "# End of Query" << endl; - } - index.deleteObject(object); - } // while - if (outputMode == 'e') { - cout << "# Average Query Time (msec)=" << totalTime * 1000.0 / (double)queryCount << endl; - cout << "# Number of queries=" << queryCount << endl; - cout << "# End of Evaluation" << endl; - - // show graph information - size_t esize = edgeSize; - long double distance = 0.0; - size_t numberOfNodes = 0; - size_t numberOfEdges = 0; - - NGT::GraphIndex &graph = (NGT::GraphIndex&)index.getIndex(); - for (size_t id = 1; id < graph.repository.size(); id++) { - NGT::GraphNode *node = 0; - try { - node = graph.getNode(id); - } catch(NGT::Exception &err) { - cerr << "Graph::search: Warning. Cannot get the node. ID=" << id << ":" << err.what() << " If the node was removed, no problem." 
<< endl; - continue; - } - numberOfNodes++; - if (numberOfNodes % 1000000 == 0) { - cerr << "Processed " << numberOfNodes << endl; - } - for (size_t i = 0; i < node->size(); i++) { - if (esize != 0 && i >= esize) { - break; - } - numberOfEdges++; -#if defined(NGT_SHARED_MEMORY_ALLOCATOR) - distance += (*node).at(i, graph.repository.allocator).distance; -#else - distance += (*node)[i].distance; -#endif - } - } - - cerr << "# of nodes=" << numberOfNodes << endl; - cerr << "# of edges=" << numberOfEdges << endl; - cerr << "Average number of edges=" << (double)numberOfEdges / (double)numberOfNodes << endl; - cerr << "Average distance of edges=" << setprecision(10) << distance / (double)numberOfEdges << endl; - - } else { - cout << "Average Query Time= " << totalTime / (double)queryCount << " (sec), " - << totalTime * 1000.0 / (double)queryCount << " (msec), (" - << totalTime << "/" << queryCount << ")" << endl; - } - - - } catch (NGT::Exception &err) { - cerr << "ngt: Error " << err.what() << endl; - cerr << usage << endl; - } catch (...) { - cerr << "ngt: Error" << endl; - cerr << usage << endl; - } - - } - - void - remove(Args &args) - { - const string usage = "Usage: ngt remove [-d object-ID-type(f|d)] index(input) object-ID(input)"; - string database; - try { - database = args.get("#1"); - } catch (...) { - cerr << "ngt: Error: DB is not specified" << endl; - cerr << usage << endl; - return; - } - try { - args.get("#2"); - } catch (...) { - cerr << "ngt: Error: ID is not specified" << endl; - cerr << usage << endl; - return; - } - char dataType = args.getChar("d", 'f'); - if (debugLevel >= 1) { - cerr << "dataType=" << dataType << endl; - } - - try { - vector objects; - if (dataType == 'f') { - string ids; - try { - ids = args.get("#2"); - } catch (...) { - cerr << "ngt: Error: Data file is not specified" << endl; - cerr << usage << endl; - return; - } - ifstream is(ids); - if (!is) { - cerr << "ngt: Error: Cannot open the specified file. 
" << ids << endl; - cerr << usage << endl; - return; - } - string line; - int count = 0; - while(getline(is, line)) { - count++; - vector tokens; - NGT::Common::tokenize(line, tokens, "\t "); - if (tokens.size() == 0 || tokens[0].size() == 0) { - continue; - } - char *e; - size_t id; - try { - id = strtol(tokens[0].c_str(), &e, 10); - objects.push_back(id); - } catch (...) { - cerr << "Illegal data. " << tokens[0] << endl; - } - if (*e != 0) { - cerr << "Illegal data. " << e << endl; - } - cerr << "removed ID=" << id << endl; - } - } else { - size_t id = args.getl("#2", 0); - cerr << "removed ID=" << id << endl; - objects.push_back(id); - } - NGT::Index::remove(database, objects); - } catch (NGT::Exception &err) { - cerr << "ngt: Error " << err.what() << endl; - cerr << usage << endl; - } catch (...) { - cerr << "ngt: Error" << endl; - cerr << usage << endl; - } - } - - void - exportIndex(Args &args) - { - const string usage = "Usage: ngt export index(input) export-file(output)"; - string database; - try { - database = args.get("#1"); - } catch (...) { - cerr << "ngt: Error: DB is not specified" << endl; - cerr << usage << endl; - return; - } - string exportFile; - try { - exportFile = args.get("#2"); - } catch (...) { - cerr << "ngt: Error: ID is not specified" << endl; - cerr << usage << endl; - return; - } - try { - NGT::Index::exportIndex(database, exportFile); - } catch (NGT::Exception &err) { - cerr << "ngt: Error " << err.what() << endl; - cerr << usage << endl; - } catch (...) { - cerr << "ngt: Error" << endl; - cerr << usage << endl; - } - } - - void - importIndex(Args &args) - { - const string usage = "Usage: ngt import index(output) import-file(input)"; - string database; - try { - database = args.get("#1"); - } catch (...) { - cerr << "ngt: Error: DB is not specified" << endl; - cerr << usage << endl; - return; - } - string importFile; - try { - importFile = args.get("#2"); - } catch (...) 
{ - cerr << "ngt: Error: ID is not specified" << endl; - cerr << usage << endl; - return; - } - - try { - NGT::Index::importIndex(database, importFile); - } catch (NGT::Exception &err) { - cerr << "ngt: Error " << err.what() << endl; - cerr << usage << endl; - } catch (...) { - cerr << "ngt: Error" << endl; - cerr << usage << endl; - } - - } - - void - prune(Args &args) - { - const string usage = "Usage: ngt prune -e #-of-forcedly-pruned-edges -s #-of-selecively-pruned-edge"; - string indexName; - try { - indexName = args.get("#1"); - } catch (...) { - cerr << "Index is not specified" << endl; - cerr << usage << endl; - return; - } - - // the number of forcedly pruned edges - size_t forcedlyPrunedEdgeSize = args.getl("e", 0); - // the number of selectively pruned edges - size_t selectivelyPrunedEdgeSize = args.getl("s", 0); - - cerr << "forcedly pruned edge size=" << forcedlyPrunedEdgeSize << endl; - cerr << "selectively pruned edge size=" << selectivelyPrunedEdgeSize << endl; - - if (selectivelyPrunedEdgeSize == 0 && forcedlyPrunedEdgeSize == 0) { - cerr << "prune: Error! Either of selective edge size or remaining edge size should be specified." << endl; - cerr << usage << endl; - return; - } - - if (forcedlyPrunedEdgeSize != 0 && selectivelyPrunedEdgeSize != 0 && selectivelyPrunedEdgeSize >= forcedlyPrunedEdgeSize) { - cerr << "prune: Error! selective edge size is less than remaining edge size." << endl; - cerr << usage << endl; - return; - } - - NGT::Index index(indexName); - cerr << "loaded the input index." 
<< endl; - - NGT::GraphIndex &graph = (NGT::GraphIndex&)index.getIndex(); - - for (size_t id = 1; id < graph.repository.size(); id++) { - try { - NGT::GraphNode &node = *graph.getNode(id); - if (id % 1000000 == 0) { - cerr << "Processed " << id << endl; - } - if (forcedlyPrunedEdgeSize > 0 && node.size() >= forcedlyPrunedEdgeSize) { -#ifdef NGT_SHARED_MEMORY_ALLOCATOR - node.resize(forcedlyPrunedEdgeSize, graph.repository.allocator); -#else - node.resize(forcedlyPrunedEdgeSize); -#endif - } - if (selectivelyPrunedEdgeSize > 0 && node.size() >= selectivelyPrunedEdgeSize) { -#ifdef NGT_SHARED_MEMORY_ALLOCATOR - cerr << "not implemented" << endl; - abort(); -#else - size_t rank = 0; - for (NGT::GraphNode::iterator i = node.begin(); i != node.end(); ++rank) { - //for (size_t i = 0; i < node.size(); ++i) { - if (rank >= selectivelyPrunedEdgeSize) { - bool found = false; - for (size_t t1 = 0; t1 < node.size() && found == false; ++t1) { - if (t1 >= selectivelyPrunedEdgeSize) { - break; - } - if (rank == t1) { - continue; - } - NGT::GraphNode &node2 = *graph.getNode(node[t1].id); - for (size_t t2 = 0; t2 < node2.size(); ++t2) { - if (t2 >= selectivelyPrunedEdgeSize) { - break; - } - if (node2[t2].id == (*i).id) { - found = true; - break; - } - } // for - } // for - if (found) { - //remove - i = node.erase(i); - continue; - } - } - i++; - } // for -#endif - } - - } catch(NGT::Exception &err) { - cerr << "Graph::search: Warning. Cannot get the node. ID=" << id << ":" << err.what() << endl; - continue; - } - } - - graph.saveIndex(indexName); - - } - - - void - info(Args &args) - { - const string usage = "Usage: ngt info [-E #-of edges] [-g virtually-created-graph-type] [-m (h)] index(output)"; - string database; - try { - database = args.get("#1"); - } catch (...) 
{ - cerr << "ngt: Error: DB is not specified" << endl; - cerr << usage << endl; - return; - } - - size_t edgeSize = args.getl("E", UINT_MAX); - char mode = args.getChar("m", '-'); - - try { - NGT::GraphIndex index(database); - long double distance = 0.0; - size_t numberOfNodes = 0; - size_t numberOfOutdegree = 0; - size_t numberOfNodesWithoutEdges = 0; - size_t maxNumberOfOutdegree = 0; - size_t minNumberOfOutdegree = SIZE_MAX; - vector indegreeCount; - vector outdegreeHistogram; - vector indegreeHistogram; - indegreeCount.resize(index.repository.size(), 0); - for (size_t id = 1; id < index.repository.size(); id++) { - NGT::GraphNode *node = 0; - try { - node = index.getNode(id); - } catch(NGT::Exception &err) { - cerr << "ngt info: Warning. Cannot get the node. ID=" << id << ":" << err.what() << endl; - continue; - } - numberOfNodes++; - if (numberOfNodes % 1000000 == 0) { - cerr << "Processed " << numberOfNodes << endl; - } - size_t esize = node->size() > edgeSize ? edgeSize : node->size(); - if (esize == 0) { - numberOfNodesWithoutEdges++; - } - if (esize > maxNumberOfOutdegree) { - maxNumberOfOutdegree = esize; - } - if (esize < minNumberOfOutdegree) { - minNumberOfOutdegree = esize; - } - if (outdegreeHistogram.size() <= esize) { - outdegreeHistogram.resize(esize + 1); - } - outdegreeHistogram[esize]++; - for (size_t i = 0; i < esize; i++) { -#if defined(NGT_SHARED_MEMORY_ALLOCATOR) - NGT::ObjectDistance &n = (*node).at(i, index.repository.allocator); -#else - NGT::ObjectDistance &n = (*node)[i]; -#endif - if (n.id == 0) { - cerr << "ngt info: Warning. id is zero." 
<< endl; - } - indegreeCount[n.id]++; - numberOfOutdegree++; - distance += n.distance; - } - } - size_t numberOfNodesWithoutIndegree = 0; - size_t maxNumberOfIndegree = 0; - size_t minNumberOfIndegree = SIZE_MAX; - for (size_t id = 1; id < index.repository.size(); id++) { - if (indegreeCount[id] == 0) { - numberOfNodesWithoutIndegree++; - } - if (indegreeCount[id] > maxNumberOfIndegree) { - maxNumberOfIndegree = indegreeCount[id]; - } - if (indegreeCount[id] < minNumberOfIndegree) { - minNumberOfIndegree = indegreeCount[id]; - } - if (indegreeHistogram.size() <= indegreeCount[id]) { - indegreeHistogram.resize(indegreeCount[id] + 1); - } - indegreeHistogram[indegreeCount[id]]++; - } - cout << "# of nodes=" << numberOfNodes << endl; - cout << "# of edges=" << numberOfOutdegree << endl; - cout << "# of nodes without edges=" << numberOfNodesWithoutEdges << endl; - cout << "Max outdegree=" << maxNumberOfOutdegree << endl; - cout << "Min outdegree=" << minNumberOfOutdegree << endl; - cout << "Average number of edges=" << (double)numberOfOutdegree / (double)numberOfNodes << endl; - cout << "Average distance of edges=" << setprecision(10) << distance / (double)numberOfOutdegree << endl; - cout << "# of nodes where indegree is 0=" << numberOfNodesWithoutIndegree << endl; - cout << "Max indegree=" << maxNumberOfIndegree << endl; - cout << "Min indegree=" << minNumberOfIndegree << endl; - - if (mode == 'h') { - cout << "#\tout\tin" << endl; - for (size_t i = 0; i < outdegreeHistogram.size() || i < indegreeHistogram.size(); i++) { - size_t out = outdegreeHistogram.size() <= i ? 0 : outdegreeHistogram[i]; - size_t in = indegreeHistogram.size() <= i ? 0 : indegreeHistogram[i]; - cout << i << "\t" << out << "\t" << in << endl; - } - } - - } catch (NGT::Exception &err) { - cerr << "ngt: Error " << err.what() << endl; - cerr << usage << endl; - } catch (...) 
{ - cerr << "ngt: Error" << endl; - cerr << usage << endl; - } - } - - - ////////////////////////////////////////////////////////////// - - void help() { - cerr << "Usage : ngt command database [data]" << endl; - cerr << " commands : create search remove append export import prune" << endl; - } - - void execute(Args args) { - string command; - try { - command = args.get("#0"); - } catch(...) { - help(); - return; - } - - debugLevel = args.getl("X", 0); - - try { - if (debugLevel >= 1) { - cerr << "ngt: command=" << command << endl; - } - if (command == "search") { - search(args); - } else if (command == "create") { - create(args); - } else if (command == "append") { - append(args); - } else if (command == "remove") { - remove(args); - } else if (command == "export") { - exportIndex(args); - } else if (command == "import") { - importIndex(args); - } else if (command == "prune") { - prune(args); - } else if (command == "info") { - info(args); - } else { - cerr << "ngt: Error: Illegal command. " << command << endl; - } - } catch(NGT::Exception &err) { - cerr << "ngt: Error: " << err.what() << endl; - } - } - - int debugLevel; - -}; - int main(int argc, char **argv) { - Args args(argc, argv); - - NGTCommand ngt; - - ngt.execute(args); + NGT::Args args(argc, argv); + + NGT::Command ngt; + + string command; + try { + command = args.get("#0"); + } catch(...) 
{ + help(); + return 0; + } + + ngt.setDebugLevel(args.getl("X", 0)); + + try { + if (ngt.getDebugLevel() >= 1) { + cerr << "ngt: command=" << command << endl; + } + if (command == "search") { + ngt.search(args); + } else if (command == "create") { + ngt.create(args); + } else if (command == "append") { + ngt.append(args); + } else if (command == "remove") { + ngt.remove(args); + } else if (command == "export") { + ngt.exportIndex(args); + } else if (command == "import") { + ngt.importIndex(args); + } else if (command == "prune") { + ngt.prune(args); + } else if (command == "info") { + ngt.info(args); + } else { + cerr << "ngt: Error: Illegal command. " << command << endl; + } + } catch(NGT::Exception &err) { + cerr << "ngt: Error: " << err.what() << endl; + } } - - diff --git a/bin/search/search.cpp b/bin/search/search.cpp index 2501d88..c6548aa 100644 --- a/bin/search/search.cpp +++ b/bin/search/search.cpp @@ -1,5 +1,5 @@ // -// Copyright (C) 2015-2016 Yahoo Japan Corporation +// Copyright (C) 2015-2017 Yahoo Japan Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -19,14 +19,14 @@ int main(int argc, char **argv) { - string database = "index"; // Index. + string indexFile = "index"; // Index. string query = "./data/sift-query-3.tsv"; // Query file. int size = 20; // The number of resultant objects. float radius = FLT_MAX; // Radius of search range. float epsilon = 0.1; // Epsilon to expand explored range. try { - NGT::Index index(database); // open the specified index. + NGT::Index index(indexFile); // open the specified index. ifstream is(query); // open a query file. if (!is) { cerr << "Cannot open the specified file. 
" << query << endl; diff --git a/lib/NGT/Common.h b/lib/NGT/Common.h index eff5b61..ef17a7e 100644 --- a/lib/NGT/Common.h +++ b/lib/NGT/Common.h @@ -1,5 +1,5 @@ // -// Copyright (C) 2015-2016 Yahoo Japan Corporation +// Copyright (C) 2015-2017 Yahoo Japan Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -344,6 +344,7 @@ namespace NGT { uint64_t size; }; + class PropertySet : public map { public: void set(const string &key, const string &value) { @@ -1581,6 +1582,7 @@ namespace NGT { class Container { public: Container(Object &o, ObjectID i):object(o), id(i) {} + Container(Container &c):object(c.object), id(c.id) {} Object &object; ObjectID id; }; @@ -1589,6 +1591,14 @@ namespace NGT { public: SearchContainer(Object &f, ObjectID i): Container(f, i) { initialize(); } SearchContainer(Object &f): Container(f, 0) { initialize(); } + SearchContainer(SearchContainer &sc): Container(sc) { *this = sc; } + SearchContainer &operator=(SearchContainer &sc) { + size = sc.size; + radius = sc.radius; + explorationCoefficient = sc.explorationCoefficient; + result = sc.result; + return *this; + } virtual ~SearchContainer() {} virtual void initialize() { size = 10; @@ -1608,9 +1618,11 @@ namespace NGT { return *result; } + size_t size; Distance radius; float explorationCoefficient; + private: ObjectDistances *result; }; @@ -1625,26 +1637,29 @@ namespace NGT { public: Timer():time(0) {} - void reset() { time = 0; } + void reset() { time = 0; ntime = 0; } void start() { + struct timespec res; + clock_getres(CLOCK_REALTIME, &res); reset(); - gettimeofday(&startTime, 0); + clock_gettime(CLOCK_REALTIME, &startTime); } void restart() { - gettimeofday(&startTime, 0); + clock_gettime(CLOCK_REALTIME, &startTime); } void stop() { - gettimeofday(&stopTime, 0); + clock_gettime(CLOCK_REALTIME, &stopTime); sec = stopTime.tv_sec - startTime.tv_sec; - usec = stopTime.tv_usec - startTime.tv_usec; - 
if (usec < 0) { + nsec = stopTime.tv_nsec - startTime.tv_nsec; + if (nsec < 0) { sec -= 1; - usec += 1000000; + nsec += 1000000000L; } - time += (double)sec + (double)usec / 1000000.0; + time += (double)sec + (double)nsec / 1000000000.0; + ntime += sec * 1000000000L + nsec; } friend ostream &operator<<(ostream &os, Timer &t) { @@ -1652,12 +1667,13 @@ namespace NGT { return os; } - struct timeval startTime; - struct timeval stopTime; + struct timespec startTime; + struct timespec stopTime; - int sec; - int usec; - double time; + int64_t sec; + int64_t nsec; + int64_t ntime; // nano second + double time; // second }; } // namespace NGT diff --git a/lib/NGT/Graph.cpp b/lib/NGT/Graph.cpp index 6c8ea9d..e5e082f 100644 --- a/lib/NGT/Graph.cpp +++ b/lib/NGT/Graph.cpp @@ -1,5 +1,5 @@ // -// Copyright (C) 2015-2016 Yahoo Japan Corporation +// Copyright (C) 2015-2017 Yahoo Japan Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -146,13 +146,6 @@ NeighborhoodGraph::setupSeeds(NGT::SearchContainer &sc, ObjectDistances &seeds, GraphNode &neighbors = *rsptr; if (neighbors.size() == 0) { // When the only one object is in the DB, the node has no neighbors. - if ((repository.size() != 1) && (target.id != 1)) { -#if defined(NGT_SHARED_MEMORY_ALLOCATOR) - cerr << "Graph::search: Warning! The node has no neighbors. Node id=" << target.id << " " << repository.allocator.file << endl; -#else - cerr << "Graph::search: Warning! The node has no neighbors. 
Node id=" << target.id << endl; -#endif - } continue; } #ifdef NGT_GRAPH_UNCHECK_STACK_SORT @@ -186,11 +179,11 @@ NeighborhoodGraph::setupSeeds(NGT::SearchContainer &sc, ObjectDistances &seeds, } distanceChecked.insert(neighbor.id); #endif // NGT_GRAPH_CHECK_VECTOR + #ifdef NGT_EXPLORATION_COEFFICIENT_OPTIMIZATION sc.explorationCoefficient = exp(-(double)distanceChecked.size() / 20000.0) / 10.0 + 1.0; #endif - Distance distance = comparator(sc.object, *getObjectRepository().get(neighbor.id)); ObjectDistance r(neighbor.id, distance); if (distance <= explorationRadius) { @@ -213,9 +206,10 @@ NeighborhoodGraph::setupSeeds(NGT::SearchContainer &sc, ObjectDistances &seeds, sc.radius = results.top().distance; explorationRadius = sc.explorationCoefficient * sc.radius; } - } // if (distance <= sc.radius) { - } // if (distance <= explorationRadius) { - } // for (Results::iterator ni = neighbors.begin(); ni != neighbors.end(); ni++) { + } + } + } + #ifdef NGT_GRAPH_UNCHECK_STACK_SORT // sort is not effectictive. 
std::sort(sort.begin(), sort.end()); @@ -223,7 +217,7 @@ NeighborhoodGraph::setupSeeds(NGT::SearchContainer &sc, ObjectDistances &seeds, unchecked.push(*si); } #endif - } + } { ObjectDistances &qresults = sc.getResult(); qresults.moveFrom(results); @@ -569,7 +563,6 @@ NeighborhoodGraph::truncateEdgesOptimally( for (size_t i = 0; i < delNodes.size(); i++) { GraphNode::iterator j; GraphNode &res = *getNode(delNodes[i].id); - bool find = false; #if defined(NGT_SHARED_MEMORY_ALLOCATOR) for (j = res.begin(repository.allocator); j != res.end(repository.allocator); j++) { #else @@ -581,7 +574,6 @@ NeighborhoodGraph::truncateEdgesOptimally( #else res.erase(j); #endif - find = true; break; } } diff --git a/lib/NGT/Graph.h b/lib/NGT/Graph.h index ca52b5c..f4df75a 100644 --- a/lib/NGT/Graph.h +++ b/lib/NGT/Graph.h @@ -1,5 +1,5 @@ // -// Copyright (C) 2015-2016 Yahoo Japan Corporation +// Copyright (C) 2015-2017 Yahoo Japan Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -212,7 +212,7 @@ namespace NGT { batchSizeForCreation = 200; graphType = GraphTypeANNG; } - void setNotAvailable() { + void clear() { truncationThreshold = -1; edgeSizeForCreation = -1; edgeSizeForSearch = -1; @@ -488,26 +488,6 @@ namespace NGT { bool addEdge(ObjectID target, ObjectID addID, Distance addDistance, bool identityCheck = true) { size_t minsize = 0; GraphNode &node = property.truncationThreshold == 0 ? *getNode(target) : *getNode(target, minsize); -#ifdef NGT_CHECK - bool found = false; - for (int i = 0; i < node.size(); i++) { - if (node[i].id == addID) { - cerr << "already existed. 
" << node[i].id << " " << node[i].distance << ":" << addID << " " << addDistance << endl; - for (int i = 0; i < node.size(); i++) { - cerr << i << " " << node[i].id << ":" << node[i].distance << endl; - } - found = true; - } - } - for (int i = 0; i < (int)node.size() - 1; i++) { - cerr << node[i].distance << ":" << node[i + 1].distance << endl; - assert(node[i].distance <= node[i + 1].distance); - if (node[i].id == node[i + 1].id) { - cerr << "before insert: " << target << " " << node[i].id << ":" << node[i + 1].id << " " << node[i].distance << ":" << node[i + 1].distance << endl; - } - assert(node[i].id != node[i + 1].id); - } -#endif ObjectDistance obj(addID, addDistance); // this seach ocuppies about 1% of total insertion time. #if defined(NGT_SHARED_MEMORY_ALLOCATOR) @@ -527,45 +507,11 @@ namespace NGT { return false; } #endif -#ifdef NGT_CHECK - if (found) { - cerr << "Found!" << endl; - cerr << target << ":" << addID << " " << addDistance << endl; - cerr << (*ni).id << " " << (*ni).distance<< endl; - for (int i = 0; i < node.size(); i++) { - cerr << i << " " << node[i].id << ":" << node[i].distance << endl; - } - } - int nodeSize = node.size(); -#endif #if defined(NGT_SHARED_MEMORY_ALLOCATOR) node.insert(ni, obj, repository.allocator); #else node.insert(ni, obj); #endif -#ifdef NGT_CHECK - assert(nodeSize + 1 == node.size()); - for (int i = 0; i < (int)node.size() - 1; i++) { - if (node[i].distance > node[i + 1].distance) { - cerr << i << ":" << node[i].distance << "," << node[i + 1].distance << endl; - cerr << node[i].id << "," << node[i + 1].id << endl; - } - assert(node[i].distance <= node[i + 1].distance); - if (node[i].id == node[i + 1].id) { - for (int i = 0; i < node.size(); i++) { - cerr << i << " " << node[i].id << ":" << node[i].distance << endl; - } - cerr << "after insert: " << target << " " << node[i].id << ":" << node[i + 1].id << " " << node[i].distance << ":" << node[i + 1].distance << endl; - cerr << (*ni).id << ":" << addID << endl; - cerr 
<< (*ni).distance << ":" << addDistance << endl; - if (ni == node.end()) { - cerr << "object of end" << endl; - } - } - assert(node[i].id != node[i + 1].id); - } -#endif - if ((size_t)property.truncationThreshold != 0 && node.size() - minsize > (size_t)property.truncationThreshold) { return true; diff --git a/lib/NGT/Index.cpp b/lib/NGT/Index.cpp index 5af4310..eee7575 100644 --- a/lib/NGT/Index.cpp +++ b/lib/NGT/Index.cpp @@ -1,5 +1,5 @@ // -// Copyright (C) 2015-2016 Yahoo Japan Corporation +// Copyright (C) 2015-2017 Yahoo Japan Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/lib/NGT/Index.h b/lib/NGT/Index.h index ac46d35..8620bcf 100644 --- a/lib/NGT/Index.h +++ b/lib/NGT/Index.h @@ -1,5 +1,5 @@ // -// Copyright (C) 2015-2016 Yahoo Japan Corporation +// Copyright (C) 2015-2017 Yahoo Japan Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -67,7 +67,7 @@ namespace NGT { void setDefault() { dimension = 0; threadPoolSize = 32; - objectType = ObjectType::Uint8; + objectType = ObjectType::Float; distanceType = DistanceType::DistanceTypeL2; indexType = IndexType::GraphAndTree; #ifdef NGT_SHARED_MEMORY_ALLOCATOR @@ -79,7 +79,7 @@ namespace NGT { databaseType = DatabaseType::Memory; #endif } - void setNotAvailable() { + void clear() { dimension = -1; threadPoolSize = -1; objectType = ObjectTypeNone; @@ -108,6 +108,7 @@ namespace NGT { case DistanceType::DistanceTypeL2: p.set("DistanceType", "L2"); break; case DistanceType::DistanceTypeHamming: p.set("DistanceType", "Hamming"); break; case DistanceType::DistanceTypeAngle: p.set("DistanceType", "Angle"); break; + case DistanceType::DistanceTypeCosine: p.set("DistanceType", "Cosine"); break; default : cerr << "Fatal error. Invalid distance type. 
" << distanceType <second == "Angle") { distanceType = DistanceType::DistanceTypeAngle; + } else if (it->second == "Cosine") { + distanceType = DistanceType::DistanceTypeCosine; } else { cerr << "Invalid Distance Type in the property. " << it->first << ":" << it->second << endl; } @@ -227,6 +230,8 @@ namespace NGT { } static void mkdir(const string &dir) { ::mkdir(dir.c_str(), S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH); } static void createGraphAndTree(const string &database, NGT::Property &prop, const string &dataFile, size_t dataSize = 0); + void createGraphAndTree(const string &database, NGT::Property &prop); + size_t insert(vector &object) ; static void createGraph(const string &database, NGT::Property &prop, const string &dataFile, size_t dataSize = 0); static void append(const string &database, const string &dataFile, size_t threadSize, size_t dataSize); static void remove(const string &database, vector &objects); @@ -451,6 +456,7 @@ namespace NGT { for (si = seeds.begin(); si != seeds.end(); si++) { (*si).distance = -1.0; } +#if 0 NGT::SearchContainer so(sc.object); ObjectDistances &rs = sc.getResult(); @@ -460,6 +466,10 @@ namespace NGT { so.size = sc.size; so.radius = sc.radius; so.explorationCoefficient = sc.explorationCoefficient; +#else + NGT::SearchContainer so(sc); + so.getResult().clear(); +#endif try { NeighborhoodGraph::search(so, seeds); } catch(Exception &err) { @@ -820,9 +830,9 @@ namespace NGT { Index::Property::setDefault(); NeighborhoodGraph::Property::setDefault(); } - void setNotAvailable() { - Index::Property::setNotAvailable(); - NeighborhoodGraph::Property::setNotAvailable(); + void clear() { + Index::Property::clear(); + NeighborhoodGraph::Property::clear(); } void set(NGT::Property &p) { Index::Property::set(p); @@ -897,6 +907,34 @@ inline void delete idx; } +inline void + NGT::Index::createGraphAndTree(const string &database, NGT::Property &prop) { + if (prop.dimension == 0) { + NGTThrowException("Index::createGraphAndTree. 
Dimension is not specified."); + } + prop.indexType = NGT::Index::Property::IndexType::GraphAndTree; +#ifdef NGT_SHARED_MEMORY_ALLOCATOR + mkdir(database); + index = new NGT::GraphAndTreeIndex(database, prop); +#else + index = new NGT::GraphAndTreeIndex(prop); +#endif + assert(index != 0); +} + +inline size_t + NGT::Index::insert(vector &object) +{ + if (getObjectSpace().getRepository().size() == 0) { + getObjectSpace().getRepository().initialize(); + } + + PersistentObject *o = getObjectSpace().getRepository().allocatePersistentObject(object); + getObjectSpace().getRepository().push_back(o); + + return getObjectSpace().getRepository().size() - 1; +} + inline void NGT::Index::createGraph(const string &database, NGT::Property &prop, const string &dataFile, size_t dataSize) { if (prop.dimension == 0) { @@ -939,8 +977,8 @@ NGT::Index::loadAndCreateIndex(Index &index, const string &database, const strin timer.start(); index.createIndex(threadSize); timer.stop(); - cerr << "Index creation time=" << timer.time << " (sec) " << timer.time * 1000.0 << " (msec)" << endl; index.saveIndex(database); + cerr << "Index creation time=" << timer.time << " (sec) " << timer.time * 1000.0 << " (msec)" << endl; } inline void @@ -960,8 +998,8 @@ NGT::Index::append(const string &database, const string &dataFile, size_t thread timer.start(); index.createIndex(threadSize); timer.stop(); - cerr << "Index creation time=" << timer.time << " (sec) " << timer.time * 1000.0 << " (msec)" << endl; index.saveIndex(database); + cerr << "Index creation time=" << timer.time << " (sec) " << timer.time * 1000.0 << " (msec)" << endl; return; } diff --git a/lib/NGT/MmapManager.cpp b/lib/NGT/MmapManager.cpp index b695e27..a0b1034 100644 --- a/lib/NGT/MmapManager.cpp +++ b/lib/NGT/MmapManager.cpp @@ -1,5 +1,5 @@ // -// Copyright (C) 2015-2016 Yahoo Japan Corporation +// Copyright (C) 2015-2017 Yahoo Japan Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use 
this file except in compliance with the License. diff --git a/lib/NGT/MmapManager.h b/lib/NGT/MmapManager.h index bd644ee..e7b5cbf 100644 --- a/lib/NGT/MmapManager.h +++ b/lib/NGT/MmapManager.h @@ -1,5 +1,5 @@ // -// Copyright (C) 2015-2016 Yahoo Japan Corporation +// Copyright (C) 2015-2017 Yahoo Japan Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/lib/NGT/MmapManagerDefs.h b/lib/NGT/MmapManagerDefs.h index 110b06d..239c09a 100644 --- a/lib/NGT/MmapManagerDefs.h +++ b/lib/NGT/MmapManagerDefs.h @@ -1,5 +1,5 @@ // -// Copyright (C) 2015-2016 Yahoo Japan Corporation +// Copyright (C) 2015-2017 Yahoo Japan Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/lib/NGT/MmapManagerException.h b/lib/NGT/MmapManagerException.h index bb61d94..b434202 100644 --- a/lib/NGT/MmapManagerException.h +++ b/lib/NGT/MmapManagerException.h @@ -1,5 +1,5 @@ // -// Copyright (C) 2015-2016 Yahoo Japan Corporation +// Copyright (C) 2015-2017 Yahoo Japan Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/lib/NGT/MmapManagerImpl.hpp b/lib/NGT/MmapManagerImpl.hpp index 615b932..8726f62 100644 --- a/lib/NGT/MmapManagerImpl.hpp +++ b/lib/NGT/MmapManagerImpl.hpp @@ -1,5 +1,5 @@ // -// Copyright (C) 2015-2016 Yahoo Japan Corporation +// Copyright (C) 2015-2017 Yahoo Japan Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/lib/NGT/Node.cpp b/lib/NGT/Node.cpp index e6c026f..b9fcd53 100644 --- a/lib/NGT/Node.cpp +++ b/lib/NGT/Node.cpp @@ -1,5 +1,5 @@ // -// Copyright (C) 2015-2016 Yahoo Japan Corporation +// Copyright (C) 2015-2017 Yahoo Japan Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/lib/NGT/Node.h b/lib/NGT/Node.h index 0fab5c9..7e1e780 100644 --- a/lib/NGT/Node.h +++ b/lib/NGT/Node.h @@ -1,5 +1,5 @@ // -// Copyright (C) 2015-2016 Yahoo Japan Corporation +// Copyright (C) 2015-2017 Yahoo Japan Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/lib/NGT/ObjectSpace.h b/lib/NGT/ObjectSpace.h index aacb101..6a8c02a 100644 --- a/lib/NGT/ObjectSpace.h +++ b/lib/NGT/ObjectSpace.h @@ -1,5 +1,5 @@ // -// Copyright (C) 2015-2016 Yahoo Japan Corporation +// Copyright (C) 2015-2017 Yahoo Japan Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
@@ -16,6 +16,11 @@ #pragma once + +#if !defined(NGT_AVX_DISABLED) && defined(__AVX__) +#include +#endif + #include "Common.h" class ObjectSpace; @@ -35,7 +40,6 @@ namespace NGT { } else { return distance < o.distance; } - } bool operator>(const ObjectDistance &o) const { if (distance == o.distance) { @@ -201,7 +205,8 @@ namespace NGT { DistanceTypeL1 = 0, DistanceTypeL2 = 1, DistanceTypeHamming = 2, - DistanceTypeAngle = 3 + DistanceTypeAngle = 3, + DistanceTypeCosine = 4 }; typedef priority_queue, less > ResultSet; ObjectSpace(size_t d):dimension(d), distanceType(DistanceTypeNone), comparator(0) {} @@ -248,6 +253,8 @@ namespace NGT { virtual void setDistanceType(DistanceType t) = 0; + virtual void *getObject(size_t idx) = 0; + size_t getDimension() { return dimension; } protected: @@ -370,6 +377,8 @@ namespace NGT { construct(s, allocator); } + virtual ~PersistentObject() {} + uint8_t &at(size_t idx, SharedMemoryAllocator &allocator) const { uint8_t *a = (uint8_t *)allocator.getAddr(array); return a[idx]; @@ -377,6 +386,8 @@ namespace NGT { uint8_t &operator[](size_t idx) const { cerr << "not implemented" << endl; assert(0); + uint8_t *a = 0; + return a[idx]; } // set v in objectspace to this object using allocator. 
@@ -733,6 +744,27 @@ namespace NGT { #endif }; + class ComparatorCosineSimilarity : public Comparator { + public: +#ifdef NGT_SHARED_MEMORY_ALLOCATOR + ComparatorCosineSimilarity(size_t d, SharedMemoryAllocator &a) : Comparator(d, a) {} + double operator()(Object &objecta, Object &objectb) { + return ObjectSpaceT::compareCosineSimilarity((OBJECT_TYPE*)&objecta[0], (OBJECT_TYPE*)&objectb[0], dimension); + } + double operator()(Object &objecta, PersistentObject &objectb) { + return ObjectSpaceT::compareCosineSimilarity((OBJECT_TYPE*)&objecta[0], (OBJECT_TYPE*)&objectb.at(0, allocator), dimension); + } + double operator()(PersistentObject &objecta, PersistentObject &objectb) { + return ObjectSpaceT::compareCosineSimilarity((OBJECT_TYPE*)&objecta.at(0, allocator), (OBJECT_TYPE*)&objectb.at(0, allocator), dimension); + } +#else + ComparatorCosineSimilarity(size_t d) : Comparator(d) {} + double operator()(Object &objecta, Object &objectb) { + return ObjectSpaceT::compareAngleDistance((OBJECT_TYPE*)&objecta[0], (OBJECT_TYPE*)&objectb[0], dimension); + } +#endif + }; + ObjectSpaceT(size_t d, const type_info &ot, DistanceType t) : ObjectSpace(d), ObjectRepository(d, ot) { size_t objectSize = 0; if (ot == typeid(uint8_t)) { @@ -804,6 +836,10 @@ namespace NGT { break; case DistanceTypeAngle: comparator = new ObjectSpaceT::ComparatorAngleDistance(ObjectSpace::dimension, ObjectRepository::allocator); + break; + case DistanceTypeCosine: + comparator = new ObjectSpaceT::ComparatorCosineSimilarity(ObjectSpace::dimension, ObjectRepository::allocator); + break; #else case DistanceTypeL1: comparator = new ObjectSpaceT::ComparatorL1(ObjectSpace::dimension); @@ -816,8 +852,11 @@ namespace NGT { break; case DistanceTypeAngle: comparator = new ObjectSpaceT::ComparatorAngleDistance(ObjectSpace::dimension); -#endif break; + case DistanceTypeCosine: + comparator = new ObjectSpaceT::ComparatorCosineSimilarity(ObjectSpace::dimension); + break; +#endif default: cerr << "Distance type is not 
specified" << endl; assert(distanceType != DistanceTypeNone); @@ -827,7 +866,7 @@ namespace NGT { static COMPARE_TYPE absolute(int v) { return abs(v); } static COMPARE_TYPE absolute(double v) { return fabs(v); } - +#if defined(NGT_AVX_DISABLED) || !defined(__AVX__) inline static double compareL2(OBJECT_TYPE *a, OBJECT_TYPE *b, size_t size) { assert(a != 0); assert(b != 0); @@ -850,7 +889,52 @@ namespace NGT { } return sqrt((double)d); } - +#else + inline static double compareL2(float *a, float *b, size_t size) { + __m256 sum = _mm256_setzero_ps(); + float *last = a + size; + float *lastgroup = last - 7; + while (a < lastgroup) { + __m256 v = _mm256_sub_ps(_mm256_loadu_ps(a), _mm256_loadu_ps(b)); + sum = _mm256_add_ps(sum, _mm256_mul_ps(v, v)); + a += 8; + b += 8; + } + __attribute__((aligned(32))) float f[8]; + _mm256_store_ps(f, sum); + double s = f[0] + f[1] + f[2] + f[3] + f[4] + f[5] + f[6] + f[7]; + while (a < last) { + double d = *a++ - *b++; + s += d * d; + } + return sqrt(s); + } + inline static double compareL2(unsigned char *a, unsigned char *b, size_t size) { + __m128 sum = _mm_setzero_ps(); + unsigned char *last = a + size; + OBJECT_TYPE *lastgroup = last - 7; + const __m128i zero = _mm_setzero_si128(); + while (a < lastgroup) { + __m128i x1 = _mm_cvtepu8_epi16(*(__m128i const*)a); + __m128i x2 = _mm_cvtepu8_epi16(*(__m128i const*)b); + x1 = _mm_subs_epi16(x1, x2); + __m128i v = _mm_mullo_epi16(x1, x1); + sum = _mm_add_ps(sum, _mm_cvtepi32_ps(_mm_unpacklo_epi16(v, zero))); + sum = _mm_add_ps(sum, _mm_cvtepi32_ps(_mm_unpackhi_epi16(v, zero))); + a += 8; + b += 8; + } + __attribute__((aligned(32))) float f[4]; + _mm_store_ps(f, sum); + double s = f[0] + f[1] + f[2] + f[3]; + while (a < last) { + int d = (int)*a++ - (int)*b++; + s += d * d; + } + return sqrt(s); + } +#endif +#if defined(NGT_AVX_DISABLED) || !defined(__AVX__) static double compareL1(OBJECT_TYPE *a, OBJECT_TYPE *b, size_t size) { assert(a != 0); assert(b != 0); @@ -873,7 +957,53 @@ namespace 
NGT { } return d; } - +#else + inline static double compareL1(float *a, float *b, size_t size) { + __m256 sum = _mm256_setzero_ps(); + float *last = a + size; + float *lastgroup = last - 7; + while (a < lastgroup) { + __m256 x1 = _mm256_sub_ps(_mm256_loadu_ps(a), _mm256_loadu_ps(b)); + const __m256 mask = _mm256_set1_ps(-0.0f); + __m256 v = _mm256_andnot_ps(mask, x1); + sum = _mm256_add_ps(sum, v); + a += 8; + b += 8; + } + __attribute__((aligned(32))) float f[8]; + _mm256_store_ps(f, sum); + double s = f[0] + f[1] + f[2] + f[3] + f[4] + f[5] + f[6] + f[7]; + while (a < last) { + double d = fabs(*a++ - *b++); + s += d; + } + return s; + } + inline static double compareL1(unsigned char *a, unsigned char *b, size_t size) { + __m128 sum = _mm_setzero_ps(); + unsigned char *last = a + size; + OBJECT_TYPE *lastgroup = last - 7; + const __m128i zero = _mm_setzero_si128(); + while (a < lastgroup) { + __m128i x1 = _mm_cvtepu8_epi16(*(__m128i const*)a); + __m128i x2 = _mm_cvtepu8_epi16(*(__m128i const*)b); + x1 = _mm_subs_epi16(x1, x2); + x1 = _mm_sign_epi16(x1, x1); + sum = _mm_add_ps(sum, _mm_cvtepi32_ps(_mm_unpacklo_epi16(x1, zero))); + sum = _mm_add_ps(sum, _mm_cvtepi32_ps(_mm_unpackhi_epi16(x1, zero))); + a += 8; + b += 8; + } + __attribute__((aligned(32))) float f[4]; + _mm_store_ps(f, sum); + double s = f[0] + f[1] + f[2] + f[3]; + while (a < last) { + double d = fabs((double)*a++ - (double)*b++); + s += d; + } + return s; + } +#endif inline static double popCount(uint32_t x) { x = (x & 0x55555555) + (x >> 1 & 0x55555555); x = (x & 0x33333333) + (x >> 2 & 0x33333333); @@ -899,33 +1029,21 @@ namespace NGT { } inline static double compareAngleDistance(OBJECT_TYPE *a, OBJECT_TYPE *b, size_t size) { - size_t loc = 0; - double cosine = 0.0F; - // Calculate the norm of A + // Calculate the norm of A and B (the supplied vector). 
double normA = 0.0F; - for (loc = 0; loc < size; loc++) { - normA += ((double) a[loc]) * a[loc]; - } - - assert(normA > 0.0F); - normA = sqrt (normA); - - // Calculate the norm of the supplied vector. double normB = 0.0F; - for (loc = 0; loc < size; loc++) { - normB += ((double) b[loc]) * b[loc]; + double sum = 0.0F; + for (size_t loc = 0; loc < size; loc++) { + normA += (double)a[loc] * (double)a[loc]; + normB += (double)b[loc] * (double)b[loc]; + sum += (double)a[loc] * (double)b[loc]; } + assert(normA > 0.0F); assert(normB > 0.0F); - normB = sqrt (normB); // Compute the dot product of the two vectors. - cosine = 0.0F; - - for (loc = 0; loc < size; loc++) { - cosine += (a[loc] / normA) * (b[loc] / normB); - } - + double cosine = sum / (sqrt(normA) * sqrt(normB)); // Compute the vector angle from the cosine value, and return. // Roundoff error could have put the cosine value out of range. // Handle these cases explicitly. @@ -939,6 +1057,26 @@ namespace NGT { } + inline static double compareCosineSimilarity(OBJECT_TYPE *a, OBJECT_TYPE *b, size_t size) { + // Calculate the norm of A and B (the supplied vector). + double normA = 0.0F; + double normB = 0.0F; + double sum = 0.0F; + for (size_t loc = 0; loc < size; loc++) { + normA += (double)a[loc] * (double)a[loc]; + normB += (double)b[loc] * (double)b[loc]; + sum += (double)a[loc] * (double)b[loc]; + } + + assert(normA > 0.0F); + assert(normB > 0.0F); + + // Compute the dot product of the two vectors. 
+ double cosine = sum / (sqrt(normA) * sqrt(normB)); + + return 1.0 - cosine; + } + void serialize(const string &ofile) { ObjectRepository::serialize(ofile, this); } void deserialize(const string &ifile) { ObjectRepository::deserialize(ifile, this); } void serializeAsText(const string &ofile) { ObjectRepository::serializeAsText(ofile, this); } @@ -984,6 +1122,20 @@ namespace NGT { return; } + void *getObject(size_t idx) { + if (idx >= ObjectRepository::size()) { + stringstream msg; + msg << "NGT::ObjectSpaceT: Out of range. " << idx << ":" << ObjectRepository::size() << "."; + NGTThrowException(msg); + } + PersistentObject &obj = *(*this)[idx]; +#ifdef NGT_SHARED_MEMORY_ALLOCATOR + return reinterpret_cast(&obj.at(0, allocator)); +#else + return reinterpret_cast(&obj[0]); +#endif + } + Object *allocateObject() { return ObjectRepository::allocateObject(); } void deleteObject(Object *po) { ObjectRepository::deleteObject(po); } diff --git a/lib/NGT/SharedMemoryAllocator.cpp b/lib/NGT/SharedMemoryAllocator.cpp index 678669d..d0f6d1c 100644 --- a/lib/NGT/SharedMemoryAllocator.cpp +++ b/lib/NGT/SharedMemoryAllocator.cpp @@ -1,5 +1,5 @@ // -// Copyright (C) 2015-2016 Yahoo Japan Corporation +// Copyright (C) 2015-2017 Yahoo Japan Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/lib/NGT/SharedMemoryAllocator.h b/lib/NGT/SharedMemoryAllocator.h index 02bbec1..eda44f1 100644 --- a/lib/NGT/SharedMemoryAllocator.h +++ b/lib/NGT/SharedMemoryAllocator.h @@ -1,5 +1,5 @@ // -// Copyright (C) 2015-2016 Yahoo Japan Corporation +// Copyright (C) 2015-2017 Yahoo Japan Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/lib/NGT/Thread.cpp b/lib/NGT/Thread.cpp index 2ae51c5..80e502a 100644 --- a/lib/NGT/Thread.cpp +++ b/lib/NGT/Thread.cpp @@ -1,5 +1,5 @@ // -// Copyright (C) 2015-2016 Yahoo Japan Corporation +// Copyright (C) 2015-2017 Yahoo Japan Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/lib/NGT/Thread.h b/lib/NGT/Thread.h index 0e7fce7..a9231dc 100644 --- a/lib/NGT/Thread.h +++ b/lib/NGT/Thread.h @@ -1,5 +1,5 @@ // -// Copyright (C) 2015-2016 Yahoo Japan Corporation +// Copyright (C) 2015-2017 Yahoo Japan Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/lib/NGT/Tree.cpp b/lib/NGT/Tree.cpp index f3eb543..30b8f03 100644 --- a/lib/NGT/Tree.cpp +++ b/lib/NGT/Tree.cpp @@ -1,5 +1,5 @@ // -// Copyright (C) 2015-2016 Yahoo Japan Corporation +// Copyright (C) 2015-2017 Yahoo Japan Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -36,7 +36,6 @@ DVPTree::insert(InsertContainer &iobj) { assert(q.nodeID.getType() == Node::ID::Leaf); LeafNode *ln = (LeafNode*)getNode(q.nodeID); - Node::ID lid = q.nodeID; insert(iobj, ln); return; @@ -94,7 +93,7 @@ DVPTree::insert(InsertContainer &iobj, LeafNode *leafNode) } if (leaf.getObjectSize() >= leafObjectsSize) { - Node::ID nid = split(iobj, leaf); + split(iobj, leaf); } else { insertObject(iobj, leaf); } diff --git a/lib/NGT/Tree.h b/lib/NGT/Tree.h index 66d747b..12ffc33 100644 --- a/lib/NGT/Tree.h +++ b/lib/NGT/Tree.h @@ -1,5 +1,5 @@ // -// Copyright (C) 2015-2016 Yahoo Japan Corporation +// Copyright (C) 2015-2017 Yahoo Japan Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/lib/NGT/defines.h.in b/lib/NGT/defines.h.in index fb8625a..b402f2a 100644 --- a/lib/NGT/defines.h.in +++ b/lib/NGT/defines.h.in @@ -1,5 +1,5 @@ // -// Copyright (C) 2015-2016 Yahoo Japan Corporation +// Copyright (C) 2015-2017 Yahoo Japan Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -19,6 +19,7 @@ // Begin of cmake defines #cmakedefine NGT_SHARED_MEMORY_ALLOCATOR // use shared memory for indexes #cmakedefine NGT_GRAPH_CHECK_VECTOR // use vector to check whether accessed +#cmakedefine NGT_AVX_DISABLED // not use avx to compare // End of cmake defines //////////////////////////////////////////////////////////////////////////