diff --git a/VERSION b/VERSION index e010258..157e54f 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.0.5 +2.0.6 diff --git a/bin/qbg/README.md b/bin/qbg/README.md index 99a102f..e1136b2 100644 --- a/bin/qbg/README.md +++ b/bin/qbg/README.md @@ -278,9 +278,9 @@ Build the quantized graph: $ qbg build qbg-index -### Search with the quantized graph +### Search with the quantized blob graph -Search k nearest neighbors with the quantized graph: +Search k nearest neighbors with the quantized blob graph: $ qbg search -n 20 -e 0.02 qbg-index query.tsv diff --git a/lib/NGT/Capi.cpp b/lib/NGT/Capi.cpp index 01a23d6..bd51bab 100644 --- a/lib/NGT/Capi.cpp +++ b/lib/NGT/Capi.cpp @@ -811,7 +811,7 @@ NGTObjectSpace ngt_get_object_space(NGTIndex index, NGTError error) { } } -float* ngt_get_object_as_float(NGTObjectSpace object_space, ObjectID id, NGTError error) { +void* ngt_get_object(NGTObjectSpace object_space, ObjectID id, NGTError error) { if(object_space == NULL){ std::stringstream ss; ss << "Capi : " << __FUNCTION__ << "() : parametor error: object_space = " << object_space; @@ -819,7 +819,7 @@ float* ngt_get_object_as_float(NGTObjectSpace object_space, ObjectID id, NGTErro return NULL; } try{ - return static_cast((static_cast(object_space))->getObject(id)); + return (static_cast(object_space))->getObject(id); }catch(std::exception &err) { std::stringstream ss; ss << "Capi : " << __FUNCTION__ << "() : Error: " << err.what(); @@ -828,21 +828,12 @@ float* ngt_get_object_as_float(NGTObjectSpace object_space, ObjectID id, NGTErro } } +float* ngt_get_object_as_float(NGTObjectSpace object_space, ObjectID id, NGTError error) { + return static_cast(ngt_get_object(object_space, id, error)); +} + uint8_t* ngt_get_object_as_integer(NGTObjectSpace object_space, ObjectID id, NGTError error) { - if(object_space == NULL){ - std::stringstream ss; - ss << "Capi : " << __FUNCTION__ << "() : parametor error: object_space = " << object_space; - operate_error_string_(ss, error); - return NULL; - } - try{ - return static_cast((static_cast(object_space))->getObject(id)); - }catch(std::exception &err) { - std::stringstream ss; - ss << "Capi : " << __FUNCTION__ << "() : Error: " << err.what(); - operate_error_string_(ss, error); - return NULL; - } + return static_cast(ngt_get_object(object_space, id, error)); } void ngt_destroy_results(NGTObjectDistances results) { diff --git a/lib/NGT/Capi.h b/lib/NGT/Capi.h index bf07a68..35bff03 100644 --- a/lib/NGT/Capi.h +++ b/lib/NGT/Capi.h @@ -153,6 +153,8 @@ bool ngt_remove_index(NGTIndex, ObjectID, NGTError); NGTObjectSpace ngt_get_object_space(NGTIndex, NGTError); +void* ngt_get_object(NGTObjectSpace, ObjectID, NGTError); + float* ngt_get_object_as_float(NGTObjectSpace, ObjectID, NGTError); uint8_t* ngt_get_object_as_integer(NGTObjectSpace, ObjectID, NGTError); diff --git a/lib/NGT/NGTQ/Capi.cpp b/lib/NGT/NGTQ/Capi.cpp index 972d076..c3d5ec3 100644 --- a/lib/NGT/NGTQ/Capi.cpp +++ b/lib/NGT/NGTQ/Capi.cpp @@ -125,11 +125,7 @@ void ngtqg_initialize_quantization_parameters(NGTQGQuantizationParameters *param bool ngtqg_quantize(const char *indexPath, NGTQGQuantizationParameters parameters, NGTError error) { try{ -#ifdef NGTQ_QBG - NGTQG::Index::quantize(indexPath, parameters.max_number_of_edges); -#else - NGTQG::Index::quantize(indexPath, parameters.dimension_of_subvector, parameters.max_number_of_edges); -#endif + NGTQG::Index::quantize(indexPath, parameters.dimension_of_subvector, parameters.max_number_of_edges, true); return true; }catch(std::exception &err){ std::stringstream ss; @@ -314,7 +310,7 @@ bool qbg_build_index(const char *index_path, QBGBuildParameters *parameters, QBG return false; } - NGTQ::Optimizer optimizer; + QBG::Optimizer optimizer; optimizer.numberOfObjects = parameters->number_of_objects; optimizer.numberOfClusters = 16; @@ -333,7 +329,7 @@ bool qbg_build_index(const char *index_path, QBGBuildParameters *parameters, QBG optimizer.timelimit *= 60.0 * 60.0; optimizer.rotation = parameters->rotation; optimizer.repositioning = parameters->repositioning; - optimizer.globalType = NGTQ::Optimizer::GlobalTypeNone; + optimizer.globalType = QBG::Optimizer::GlobalTypeNone; optimizer.silence = true; try { diff --git a/lib/NGT/NGTQ/HierarchicalKmeans.h b/lib/NGT/NGTQ/HierarchicalKmeans.h index 7ca640b..2e51905 100644 --- a/lib/NGT/NGTQ/HierarchicalKmeans.h +++ b/lib/NGT/NGTQ/HierarchicalKmeans.h @@ -45,6 +45,31 @@ namespace QBG { silence = true; } + HierarchicalKmeans(QBG::BuildParameters ¶m) { +#ifdef NGTQ_QBG + maxSize = param.hierarchicalClustering.maxSize; + numOfObjects = param.hierarchicalClustering.numOfObjects; + numOfClusters = param.hierarchicalClustering.numOfClusters; + numOfTotalClusters = param.hierarchicalClustering.numOfTotalClusters; + numOfTotalBlobs = param.hierarchicalClustering.numOfTotalBlobs; + clusterID = param.hierarchicalClustering.clusterID; + + initMode = param.hierarchicalClustering.initMode; + + numOfRandomObjects = param.hierarchicalClustering.numOfRandomObjects; + + numOfFirstObjects = param.hierarchicalClustering.numOfFirstObjects; + numOfFirstClusters = param.hierarchicalClustering.numOfFirstClusters; + numOfSecondObjects = param.hierarchicalClustering.numOfSecondObjects; + numOfSecondClusters = param.hierarchicalClustering.numOfSecondClusters; + numOfThirdClusters = param.hierarchicalClustering.numOfThirdClusters; + extractCentroid = param.hierarchicalClustering.extractCentroid; + + threeLayerClustering = param.hierarchicalClustering.threeLayerClustering; + silence = param.silence; +#endif + } + static int32_t searchLeaf(std::vector &nodes, int32_t rootID, float *object) { auto nodeID = rootID; while (true) { @@ -945,7 +970,7 @@ namespace QBG { abort(); } { - std::ofstream of(prefix + "_qcentroid.tsv"); + std::ofstream of(prefix + QBG::Index::getSecondCentroidSuffix()); extractCentroids(of, nodes); } std::vector qNodeIDs; @@ -958,7 +983,7 @@ namespace QBG { hierarchicalKmeansWithNumberOfClustersInParallel(numOfTotalBlobs, numOfObjects, numOfTotalClusters, objectList, objectSpace, nodes, initMode); { - std::ofstream of(prefix + "_btoq_index.tsv"); + std::ofstream of(prefix + QBG::Index::get3rdTo2ndSuffix()); extractBtoQIndex(of, nodes, qNodeIDs); } } @@ -1067,7 +1092,7 @@ namespace QBG { timer.stop(); std::cerr << "subclustering(1) time=" << timer << std::endl; std::cerr << "save quantization centroid" << std::endl; - NGT::Clustering::saveClusters(prefix + "_qcentroid.tsv", secondClusters); + NGT::Clustering::saveClusters(prefix + QBG::Index::getSecondCentroidSuffix(), secondClusters); timer.start(); std::cerr << "Assign for the third. (" << numOfSecondObjects << "-" << numOfObjects << ")..." << std::endl; assignWithNGT(secondClusters, numOfSecondObjects + 1, numOfObjects, objectSpace, objectList); @@ -1087,14 +1112,14 @@ namespace QBG { } } std::cerr << "save bqindex..." << std::endl; - NGT::Clustering::saveVector(prefix + "_bqindex.tsv", bqindex); + NGT::Clustering::saveVector(prefix + QBG::Index::get3rdTo2ndSuffix(), bqindex); } std::vector thirdFlatClusters; flattenClusters(secondClusters, thirdClusters, numOfThirdClusters, thirdFlatClusters); std::cerr << "save centroid..." << std::endl; - NGT::Clustering::saveClusters(prefix + "_centroid.tsv", thirdFlatClusters); + NGT::Clustering::saveClusters(prefix + QBG::Index::getThirdCentroidSuffix(), thirdFlatClusters); { std::vector cindex(numOfObjects); @@ -1105,7 +1130,7 @@ namespace QBG { } } std::cerr << "save index... " << cindex.size() << std::endl; - NGT::Clustering::saveVector(prefix + "_index.tsv", cindex); + NGT::Clustering::saveVector(prefix + QBG::Index::getObjTo3rdSuffix(), cindex); } std::cerr << "end of clustering" << std::endl; return; @@ -1117,103 +1142,122 @@ namespace QBG { NGT::StdOstreamRedirector redirector(silence); redirector.begin(); - QBG::Index index(indexPath, true); + bool readOnly = false; + QBG::Index index(indexPath, readOnly); + index.getQuantizer().objectList.size(); + std::cerr << "clustering... " << std::endl; if (threeLayerClustering) { - - if (prefix.empty()) { - std::cerr << "Prefix is not specified." << std::endl; - prefix = indexPath + "/" + QBG::Index::getWorkspaceName(); - try { - NGT::Index::mkdir(prefix); - } catch(...) {} - prefix +="/kmeans-cluster"; - std::cerr << prefix << " is used" << std::endl; - } - auto &quantizer = static_cast&>(index.getQuantizer()); - auto &objectSpace = quantizer.globalCodebookIndex.getObjectSpace(); - size_t paddedDimension = objectSpace.getPaddedDimension(); - size_t dimension = objectSpace.getDimension(); - if (paddedDimension != dimension) { - std::cerr << "HierarachicalKmeans: Warning! Dimensions are inconsistent. Dimension=" << paddedDimension << ":" << dimension << std::endl; + try { + if (numOfObjects == 0) { + numOfObjects = index.getQuantizer().objectList.size() - 1; + } + if (numOfObjects != index.getQuantizer().objectList.size() - 1) { + std::cerr << "HierarchicalKmeans::clustering: Warning! # of objects is invalid." << std::endl; + std::cerr << " " << index.getQuantizer().objectList.size() - 1 << " is set to # of object instead of " << numOfObjects << std::endl; + numOfObjects = index.getQuantizer().objectList.size() - 1; + } + if (prefix.empty()) { + std::cerr << "Prefix is not specified." << std::endl; + prefix = indexPath + "/" + QBG::Index::getWorkspaceName(); + try { + NGT::Index::mkdir(prefix); + } catch(...) {} + prefix +="/" + QBG::Index::getHierarchicalClusteringPrefix(); + std::cerr << prefix << " is used" << std::endl; + } + auto &quantizer = static_cast&>(index.getQuantizer()); + auto &objectSpace = quantizer.globalCodebookIndex.getObjectSpace(); + size_t paddedDimension = objectSpace.getPaddedDimension(); + size_t dimension = objectSpace.getDimension(); + if (paddedDimension != dimension) { + std::cerr << "HierarachicalKmeans: Warning! Dimensions are inconsistent. Dimension=" << paddedDimension << ":" << dimension << std::endl; + } + multilayerClustering(prefix, index); + } catch(NGT::Exception &err) { + redirector.end(); + throw err; } - multilayerClustering(prefix, index); redirector.end(); return; } + try { + NGT::Clustering::ClusteringType clusteringType = NGT::Clustering::ClusteringTypeKmeansWithoutNGT; - NGT::Clustering::ClusteringType clusteringType = NGT::Clustering::ClusteringTypeKmeansWithoutNGT; - - uint32_t rootID = 0; - std::vector nodes; - nodes.push_back(new HKLeafNode); + uint32_t rootID = 0; + std::vector nodes; + nodes.push_back(new HKLeafNode); - std::vector object; - size_t iteration = 1000; - NGT::Clustering clustering(initMode, clusteringType, iteration, numOfClusters); - auto &quantizer = static_cast&>(index.getQuantizer()); - QBGObjectList &objectList = quantizer.objectList; - if (objectIDsFile.empty()) { - treeBasedTopdownClustering(prefix, index, rootID, object, nodes, clustering); - } else { - std::cerr << "Cluster ID=" << clusterID << std::endl; - if (clusterID < 0) { - std::stringstream msg; - msg << "Any target cluster ID is not specified."; - NGTThrowException(msg); - } - std::ifstream objectIDs(objectIDsFile); - if (!objectIDs) { - std::stringstream msg; - msg << "Cannot open the object id file. " << objectIDsFile; - NGTThrowException(msg); - } - auto &objectSpace = quantizer.globalCodebookIndex.getObjectSpace(); - uint32_t id = 1; - int32_t cid; - size_t ccount = 0; - while (objectIDs >> cid) { - std::cerr << cid << std::endl; - if (id % 100000 == 0) { - std::cerr << "# of processed objects=" << id << std::endl; + std::vector object; + size_t iteration = 1000; + NGT::Clustering clustering(initMode, clusteringType, iteration, numOfClusters); + auto &quantizer = static_cast&>(index.getQuantizer()); + QBGObjectList &objectList = quantizer.objectList; + if (objectIDsFile.empty()) { + treeBasedTopdownClustering(prefix, index, rootID, object, nodes, clustering); + } else { + std::cerr << "Cluster ID=" << clusterID << std::endl; + if (clusterID < 0) { + std::stringstream msg; + msg << "Any target cluster ID is not specified."; + NGTThrowException(msg); } - if (cid == -1) { - continue; + std::ifstream objectIDs(objectIDsFile); + if (!objectIDs) { + std::stringstream msg; + msg << "Cannot open the object id file. " << objectIDsFile; + NGTThrowException(msg); } - if (cid == clusterID) { - ccount++; - hierarchicalKmeans(id, rootID, object, objectList, objectSpace, nodes, clustering, maxSize); + auto &objectSpace = quantizer.globalCodebookIndex.getObjectSpace(); + uint32_t id = 1; + int32_t cid; + size_t ccount = 0; + while (objectIDs >> cid) { + std::cerr << cid << std::endl; + if (id % 100000 == 0) { + std::cerr << "# of processed objects=" << id << std::endl; + } + if (cid == -1) { + continue; + } + if (cid == clusterID) { + ccount++; + hierarchicalKmeans(id, rootID, object, objectList, objectSpace, nodes, clustering, maxSize); + } + id++; } - id++; - } - } - size_t objectCount = 0; - if (prefix.empty()) { - objectCount = extractCentroids(std::cout, nodes); - } else { - { - std::ofstream of(prefix + "_centroid.tsv"); - objectCount = extractCentroids(of, nodes); } - { - std::ofstream of(prefix + "_index.tsv"); - extractIndex(of, nodes, numOfObjects); - } - if (numOfFirstObjects > 0) { - std::ofstream btoqof(prefix + "_btoq.tsv"); - std::ofstream qcof(prefix + "_qcentroid.tsv"); - extractBtoQAndQCentroid(btoqof, qcof, nodes, numOfThirdClusters); - } - if (numOfRandomObjects > 0) { - std::ofstream of(prefix + "_random_object.tsv"); - if (extractCentroid) { - extractRandomObjectsFromEachBlob(of, nodes, numOfObjects, numOfRandomObjects - 1, quantizer, extractCentroid); - } else { - extractRandomObjectsFromEachBlob(of, nodes, numOfObjects, numOfRandomObjects, quantizer, extractCentroid); + size_t objectCount = 0; + if (prefix.empty()) { + objectCount = extractCentroids(std::cout, nodes); + } else { + { + std::ofstream of(prefix + QBG::Index::getThirdCentroidSuffix()); + objectCount = extractCentroids(of, nodes); + } + { + std::ofstream of(prefix + QBG::Index::getObjTo3rdSuffix()); + extractIndex(of, nodes, numOfObjects); + } + if (numOfFirstObjects > 0) { + std::ofstream btoqof(prefix + QBG::Index::get3rdTo2ndSuffix()); + std::ofstream qcof(prefix + QBG::Index::getSecondCentroidSuffix()); + extractBtoQAndQCentroid(btoqof, qcof, nodes, numOfThirdClusters); + } + if (numOfRandomObjects > 0) { + std::ofstream of(prefix + "_random_object.tsv"); + if (extractCentroid) { + extractRandomObjectsFromEachBlob(of, nodes, numOfObjects, numOfRandomObjects - 1, quantizer, extractCentroid); + } else { + extractRandomObjectsFromEachBlob(of, nodes, numOfObjects, numOfRandomObjects, quantizer, extractCentroid); + } } } - } - if (objectCount != numOfObjects) { - std::cerr << "# of objects is invalid. " << objectCount << ":" << numOfObjects << std::endl; + if (objectCount != numOfObjects) { + std::cerr << "# of objects is invalid. " << objectCount << ":" << numOfObjects << std::endl; + } + } catch(NGT::Exception &err) { + redirector.end(); + throw err; } redirector.end(); } diff --git a/lib/NGT/NGTQ/Optimizer.cpp b/lib/NGT/NGTQ/Optimizer.cpp new file mode 100644 index 0000000..7d723d6 --- /dev/null +++ b/lib/NGT/NGTQ/Optimizer.cpp @@ -0,0 +1,493 @@ +// +// Copyright (C) 2021 Yahoo Japan Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "QuantizedBlobGraph.h" +#include "Optimizer.h" + +QBG::Optimizer::Optimizer(QBG::BuildParameters ¶m) { +#ifdef NGTQ_QBG + clusteringType = param.optimization.clusteringType; + initMode = param.optimization.initMode; + + timelimit = param.optimization.timelimit; + iteration = param.optimization.iteration; + clusterIteration = param.optimization.clusterIteration; + clusterSizeConstraint = param.optimization.clusterSizeConstraint; + clusterSizeConstraintCoefficient = param.optimization.clusterSizeConstraintCoefficient; + convergenceLimitTimes = param.optimization.convergenceLimitTimes; + numberOfObjects = param.optimization.numberOfObjects; + numberOfClusters = param.optimization.numberOfClusters; + numberOfSubvectors = param.optimization.numberOfSubvectors; + nOfMatrices = param.optimization.nOfMatrices; + seedStartObjectSizeRate = param.optimization.seedStartObjectSizeRate; + seedStep = param.optimization.seedStep; + reject = param.optimization.reject; + repositioning = param.optimization.repositioning; + rotation = param.optimization.rotation; + globalType = param.optimization.globalType; + randomizedObjectExtraction = param.optimization.randomizedObjectExtraction; + showClusterInfo = param.optimization.showClusterInfo; + silence = param.silence; +#endif +} + + +void QBG::Optimizer::evaluate(string global, vector> &vectors, char clusteringType, string &ofile, size_t &numberOfSubvectors, size_t &subvectorSize) +{ +#if defined(NGT_SHARED_MEMORY_ALLOCATOR) + std::cerr << "evaluate: Not implemented." << std::endl; + abort(); +#else + vector> residualVectors; + { + // compute residual vectors by global centroids. +#ifdef NGT_CLUSTERING + vector globalCentroid; +#else + vector globalCentroid; +#endif + if (!global.empty()) { + cerr << "generate residual vectors." << endl; + try { +#ifdef NGT_CLUSTERING + NGT::Clustering::loadClusters(global, globalCentroid); +#else + loadClusters(global, globalCentroid); +#endif + } catch (...) { + cerr << "Cannot load vectors. " << global << endl; + return; + } + if (clusteringType == 'k') { +#ifdef NGT_CLUSTERING + NGT::Clustering::assign(vectors, globalCentroid); +#else + assign(vectors, globalCentroid); +#endif + } else { + cerr << "Using NGT" << endl; +#ifdef NGT_CLUSTERING + std::cerr << "Not implemented" << std::endl; + abort(); +#else + assignWithNGT(vectors, globalCentroid); +#endif + } + residualVectors.resize(vectors.size()); + cerr << "global centroid size=" << globalCentroid.size() << endl; + for (size_t cidx = 0; cidx < globalCentroid.size(); ++cidx) { + for (auto mit = globalCentroid[cidx].members.begin(); mit != globalCentroid[cidx].members.end(); ++mit) { + size_t vid = (*mit).vectorID; + residualVectors[vid] = vectors[vid]; +#ifdef NGT_CLUSTERING + NGT::Clustering::subtract(residualVectors[vid], globalCentroid[cidx].centroid); +#else + subtract(residualVectors[vid], globalCentroid[cidx].centroid); +#endif + } + } + } + } + + Matrix R; + Matrix::load(ofile + QBG::Index::getRotationFile(), R); + vector> qv(vectors.size()); // quantized vector + vector> xp; // residual vector + if (residualVectors.empty()) { + xp = vectors; + } else { + xp = residualVectors; + } + Matrix::mulSquare(xp, R); + for (size_t m = 0; m < numberOfSubvectors; m++) { +#ifdef NGT_CLUSTERING + vector subClusters; +#else + vector subClusters; +#endif + stringstream str; + str << ofile << "-" << m; +#ifdef NGT_CLUSTERING + NGT::Clustering::loadClusters(str.str(), subClusters); +#else + loadClusters(str.str(), subClusters); +#endif + vector> subVectors; + extractSubvector(xp, subVectors, m * subvectorSize, subvectorSize); + if (clusteringType == 'k') { +#ifdef NGT_CLUSTERING + NGT::Clustering::assign(subVectors, subClusters); +#else + assign(subVectors, subClusters); +#endif + } else { + cerr << "Using NGT for subvector" << endl; +#ifdef NGT_CLUSTERING + std::cerr << "not implemented" << std::endl; + abort(); +#else + assignWithNGT(subVectors, subClusters); +#endif + } +#ifdef NGT_CLUSTERING + double distortion = NGT::Clustering::calculateML2(subVectors, subClusters); +#else + double distortion = calculateML2(subVectors, subClusters); +#endif + cout << "distortion[" << m << "]=" << distortion << endl; + vector> subCentroids(vectors.size()); + for (size_t cidx = 0; cidx < subClusters.size(); ++cidx) { +#ifdef NGT_CLUSTERING + vector &members = subClusters[cidx].members; +#else + vector &members = subClusters[cidx].members; +#endif + for (size_t eidx = 0; eidx < members.size(); ++eidx) { +#ifdef NGT_CLUSTERING + NGT::Clustering::Entry &entry = members[eidx]; +#else + Entry &entry = members[eidx]; +#endif + assert(cidx == entry.centroidID); + subCentroids[entry.vectorID] = subClusters[cidx].centroid; + } + } + catSubvector(qv, subCentroids); + } +#ifdef NGT_CLUSTERING + double distortion = NGT::Clustering::distanceL2(qv, xp); +#else + double distortion = distanceL2(qv, xp); +#endif + cout << "distortion=" << distortion << endl; + return; +#endif +} + +void QBG::Optimizer::evaluate(vector> &vectors, string &ofile, size_t &numberOfSubvectors, size_t &subvectorSize) { +#if defined(NGT_SHARED_MEMORY_ALLOCATOR) + std::cerr << "evaluate: Not implemented." << std::endl; + abort(); +#else + cerr << "Evaluate" << endl; + Matrix R; + Matrix::load(ofile + QBG::Index::getRotationFile(), R); + vector> xp = vectors; + Matrix::mulSquare(xp, R); + for (size_t m = 0; m < numberOfSubvectors; m++) { +#ifdef NGT_CLUSTERING + vector subClusters; +#else + vector subClusters; +#endif + stringstream str; + str << ofile << "-" << m; +#ifdef NGT_CLUSTERING + NGT::Clustering::loadClusters(str.str(), subClusters); +#else + loadClusters(str.str(), subClusters); +#endif + vector> subVectors; + extractSubvector(xp, subVectors, m * subvectorSize, subvectorSize); +#ifdef NGT_CLUSTERING + NGT::Clustering::assign(subVectors, subClusters); + double distortion = NGT::Clustering::calculateML2(subVectors, subClusters); +#else + assign(subVectors, subClusters); + double distortion = calculateML2(subVectors, subClusters); +#endif + cout << "distortion[" << m << "]=" << distortion << endl; + for (size_t cidx = 0; cidx < subClusters.size(); ++cidx) { + cout << " members[" << cidx << "]=" << subClusters[cidx].members.size() << endl; + } + } + return; +#endif +} + +#ifdef NGTQ_QBG +void QBG::Optimizer::optimize(const std::string indexPath, size_t threadSize) { + NGT::StdOstreamRedirector redirector(silence); + redirector.begin(); + try { + QBG::Index index(indexPath); + if (index.getQuantizer().objectList.size() <= 1) { + NGTThrowException("optimize: No objects"); + } + std::cerr << "optimize: # of objects=" << numberOfObjects << std::endl; + if (numberOfObjects == 0) { + numberOfObjects = index.getQuantizer().objectList.size() - 1; + } + std::cerr << "optimize: # of clusters=" << index.getQuantizer().property.localCentroidLimit << ":" << numberOfClusters << std::endl; + if (index.getQuantizer().property.localCentroidLimit == 0 && numberOfClusters == 0) { + std::stringstream msg; + msg << "optimize: # of clusters is illegal. " << index.getQuantizer().property.localCentroidLimit << ":" << numberOfClusters; + NGTThrowException(msg); + } + if (index.getQuantizer().property.localCentroidLimit != 0 && numberOfClusters != 0 && + index.getQuantizer().property.localCentroidLimit != numberOfClusters) { + std::cerr << "optimize: warning! # of clusters is already specified. " << index.getQuantizer().property.localCentroidLimit << ":" << numberOfClusters << std::endl; + } + if (numberOfClusters == 0) { + numberOfClusters = index.getQuantizer().property.localCentroidLimit; + } + + if (numberOfSubvectors == 0 && index.getQuantizer().property.localDivisionNo == 0) { + std::stringstream msg; + msg << "optimize: # of subvectors is illegal. " << numberOfSubvectors << ":" << index.getQuantizer().property.localDivisionNo; + NGTThrowException(msg); + } + if (numberOfSubvectors != 0 && index.getQuantizer().property.localDivisionNo != 0 && + numberOfSubvectors != index.getQuantizer().property.localDivisionNo) { + std::cerr << "optimize: warning! # of subvectros is already specified. " << numberOfSubvectors << ":" << index.getQuantizer().property.localDivisionNo << std::endl; + } + if (numberOfSubvectors == 0) { + numberOfSubvectors = index.getQuantizer().property.localDivisionNo; + } + + const std::string ws = indexPath + "/" + QBG::Index::getWorkspaceName(); + try { + NGT::Index::mkdir(ws); + } catch(...) {} + const std::string object = QBG::Index::getTrainObjectFile(indexPath); + std::ofstream ofs; + ofs.open(object); + index.extract(ofs, numberOfObjects, randomizedObjectExtraction); + if (globalType == GlobalTypeZero) { + assert(index.getQuantizer().objectList.pseudoDimension != 0); + std::vector> global(1); + global[0].resize(index.getQuantizer().property.dimension, 0.0); + NGT::Clustering::saveVectors(QBG::Index::getQuantizerCodebookFile(indexPath), global); + } else if (globalType == GlobalTypeMean) { + std::vector> vectors; + std::string objects = QBG::Index::getTrainObjectFile(indexPath); +#ifdef NGT_CLUSTERING + NGT::Clustering::loadVectors(objects, vectors); +#else + loadVectors(objects, vectors); +#endif + if (vectors.size() == 0 || vectors[0].size() == 0) { + NGTThrowException("Optimizer::optimize: invalid input vectors"); + } + std::vector> global(1); + global[0].resize(index.getQuantizer().property.dimension, 0); + for (auto v = vectors.begin(); v != vectors.end(); ++v) { + for (size_t i = 0; i < (*v).size(); i++) { + global[0][i] += (*v)[i]; + } + } + for (size_t i = 0; i < global[0].size(); i++) { + global[0][i] /= vectors.size(); + } + NGT::Clustering::saveVectors(QBG::Index::getQuantizerCodebookFile(indexPath), global); + } + + optimizeWithinIndex(indexPath); + + } catch(NGT::Exception &err) { + redirector.end(); + throw err; + } + + redirector.end(); +} +#endif + +#ifdef NGTQ_QBG +void QBG::Optimizer::optimizeWithinIndex(std::string indexPath) { + std::string object; + std::string pq; + std::string global; + { + object = QBG::Index::getTrainObjectFile(indexPath); + pq = QBG::Index::getPQFile(indexPath); + global = QBG::Index::getQuantizerCodebookFile(indexPath); + } + + try { + NGT::Index::mkdir(pq); + } catch(...) {} + pq += "/"; + optimize(object, pq, global); +} +#endif + + + +void QBG::Optimizer::optimize(std::string invector, std::string ofile, std::string global) { +#if defined(NGT_SHARED_MEMORY_ALLOCATOR) + std::cerr << "optimize: Not implemented." << std::endl; + abort(); +#else + vector> vectors; + + +#ifdef NGT_CLUSTERING + NGT::Clustering::loadVectors(invector, vectors); +#else + loadVectors(invector, vectors); +#endif + + if (vectors.size() == 0) { + std::stringstream msg; + msg << "Optimizer: error! the specified vetor file is empty. " << invector << ". the size=" << vectors.size(); + NGTThrowException(msg); + } + + dim = vectors[0].size(); + subvectorSize = dim / numberOfSubvectors; + if (dim % numberOfSubvectors != 0) { + std::stringstream msg; + msg << "# of subspaces (m) is illegal. " << dim << ":" << numberOfSubvectors; + NGTThrowException(msg); + } + + + timelimitTimer.start(); + + Matrix reposition; + if (repositioning) { + reposition.zero(dim, dim); + size_t dstidx = 0; + for (size_t didx = 0; didx < numberOfSubvectors; didx++) { + for (size_t sdidx = 0; sdidx < subvectorSize; sdidx++) { + size_t srcidx = didx + sdidx * numberOfSubvectors; + auto col = dstidx; + auto row = srcidx; + reposition.set(row, col, 1.0); + dstidx++; + } + } + std::cerr << "Optimizer: Each axis was repositioned." << std::endl; + } + + + vector>> localClusters; + vector errors; + + bool useEye = false; + nOfMatrices = nOfMatrices == 0 ? 1 : nOfMatrices; + if (!rotation) { + iteration = 1; + seedStartObjectSizeRate = 1.0; + seedStep = 2; + } + useEye = !rotation; + vector> rs(nOfMatrices); + for (auto &r: rs) { + if (useEye) { + r.eye(dim); + } else { + r.randomRotation(dim); + } + } + + for (size_t vsize = static_cast(vectors.size()) * seedStartObjectSizeRate; ; vsize *= seedStep) { + auto partialVectors = vectors; + if (vsize < vectors.size()) { + partialVectors.resize(vsize); + } + + optimize(partialVectors, + global, + ofile, + reposition, + rs, + localClusters, + errors); + if (rs.size() > 1) { + nOfMatrices = static_cast(nOfMatrices) * (1.0 - reject); + nOfMatrices = nOfMatrices == 0 ? 1 : nOfMatrices; + vector*, vector>*>>> sortedErrors; + for (size_t idx = 0; idx < errors.size(); idx++) { + sortedErrors.emplace_back(make_pair(errors[idx], make_pair(&rs[idx], &localClusters[idx]))); + } + sort(sortedErrors.begin(), sortedErrors.end()); + vector> tmpMatrix; + vector>> tmpLocalClusters; + for (size_t idx = 0; idx < nOfMatrices; idx++) { + tmpMatrix.emplace_back(*sortedErrors[idx].second.first); + tmpLocalClusters.emplace_back(*sortedErrors[idx].second.second); + } + if (tmpMatrix.size() != nOfMatrices) { + std::cerr << "something strange. " << tmpMatrix.size() << ":" << nOfMatrices << std::endl; + } + rs = std::move(tmpMatrix); + localClusters = std::move(tmpLocalClusters); + } + if (vsize >= vectors.size()) { + break; + } + } + + if (rs.size() != 1) { + std::cerr << "Optimizer: Warning. rs.size=" << rs.size() << std::endl; + } + auto minR = std::move(rs[0]); + auto minLocalClusters = std::move(localClusters[0]); + //-/size_t pos = std::distance(std::find(ofile.rbegin(), ofile.rend(), '.'), ofile.rend()) - 1; + std::cerr << "pass " << ofile << std::endl; + if (repositioning) { + Matrix repositionedR(reposition); + repositionedR.mul(minR); + Matrix::save(ofile + QBG::Index::getRotationFile(), repositionedR); + } else { + Matrix::save(ofile + QBG::Index::getRotationFile(), minR); + } + if (showClusterInfo) { + if (minLocalClusters.size() != numberOfSubvectors) { + std::stringstream msg; + msg << "Fatal error. minLocalClusters.size() != numberOfSubvectors " << + minLocalClusters.size() << ":" << numberOfSubvectors; + NGTThrowException(msg); + } + float totalRate = 0.0; + for (size_t m = 0; m < minLocalClusters.size(); m++) { + size_t min = std::numeric_limits::max(); + size_t max = 0; + size_t nOfVectors = 0; + for (size_t i = 0; i < minLocalClusters[m].size(); i++) { + nOfVectors += minLocalClusters[m][i].members.size(); + if (minLocalClusters[m][i].members.size() < min) { + min = minLocalClusters[m][i].members.size(); + } + if (minLocalClusters[m][i].members.size() > max) { + max = minLocalClusters[m][i].members.size(); + } + } + float rate = static_cast(max - min) / static_cast(nOfVectors); + totalRate += rate; + std::cout << "cluster " << m << " " << rate << "," << max - min << "," << min << "," << max << " : "; + for (size_t i = 0; i < minLocalClusters[m].size(); i++) { + std::cout << minLocalClusters[m][i].members.size() << " "; + } + std::cout << std::endl; + } + totalRate /= minLocalClusters.size(); + std::cout << "Range rate=" << totalRate << std::endl; + std::cout << "Error=" << errors[0] << std::endl; + } + for (size_t m = 0; m < numberOfSubvectors; m++) { + stringstream str; + str << ofile << QBG::Index::getSubvectorPrefix() << "-" << m; +#ifdef NGT_CLUSTERING + NGT::Clustering::saveClusters(str.str(), minLocalClusters[m]); +#else + saveClusters(str.str(), minLocalClusters[m]); +#endif + } +#endif +} diff --git a/lib/NGT/NGTQ/Optimizer.h b/lib/NGT/NGTQ/Optimizer.h index 297ca92..a67c1b7 100644 --- a/lib/NGT/NGTQ/Optimizer.h +++ b/lib/NGT/NGTQ/Optimizer.h @@ -17,7 +17,6 @@ #define NGT_CLUSTERING -#include "QuantizedBlobGraph.h" #ifdef NGT_CLUSTERING #include "NGT/Clustering.h" #else @@ -31,7 +30,9 @@ #include "Matrix.h" -namespace NGTQ { +namespace QBG { + class BuildParameters; + class Optimizer { public: enum GlobalType { @@ -41,16 +42,18 @@ namespace NGTQ { }; Optimizer() { - numberOfClusters = 0; - numberOfSubvectors = 0; clusteringType = NGT::Clustering::ClusteringTypeKmeansWithNGT; initMode = NGT::Clustering::InitializationModeRandom; - convergenceLimitTimes = 5; + iteration = 100; clusterIteration = 100; clusterSizeConstraint = false; clusterSizeConstraintCoefficient = 10.0; - iteration = 100; + convergenceLimitTimes = 5; + + numberOfClusters = 0; + numberOfSubvectors = 0; + repositioning = false; rotation = true; globalType = GlobalTypeNone; @@ -59,6 +62,8 @@ namespace NGTQ { showClusterInfo = false; } + Optimizer(QBG::BuildParameters ¶m); + static void extractSubvector(vector> &vectors, vector> &subvectors, size_t start , size_t size) { @@ -125,168 +130,9 @@ namespace NGTQ { return distance; } - void evaluate(string global, vector> &vectors, char clusteringType, string &ofile, size_t &numberOfSubvectors, size_t &subvectorSize) - { - vector> residualVectors; - { - // compute residual vectors by global centroids. -#ifdef NGT_CLUSTERING - vector globalCentroid; -#else - vector globalCentroid; -#endif - if (!global.empty()) { - cerr << "generate residual vectors." << endl; - try { -#ifdef NGT_CLUSTERING - NGT::Clustering::loadClusters(global, globalCentroid); -#else - loadClusters(global, globalCentroid); -#endif - } catch (...) { - cerr << "Cannot load vectors. " << global << endl; - return; - } - if (clusteringType == 'k') { -#ifdef NGT_CLUSTERING - NGT::Clustering::assign(vectors, globalCentroid); -#else - assign(vectors, globalCentroid); -#endif - } else { - cerr << "Using NGT" << endl; -#ifdef NGT_CLUSTERING - std::cerr << "Not implemented" << std::endl; - abort(); -#else - assignWithNGT(vectors, globalCentroid); -#endif - } - residualVectors.resize(vectors.size()); - cerr << "global centroid size=" << globalCentroid.size() << endl; - for (size_t cidx = 0; cidx < globalCentroid.size(); ++cidx) { - for (auto mit = globalCentroid[cidx].members.begin(); mit != globalCentroid[cidx].members.end(); ++mit) { - size_t vid = (*mit).vectorID; - residualVectors[vid] = vectors[vid]; -#ifdef NGT_CLUSTERING - NGT::Clustering::subtract(residualVectors[vid], globalCentroid[cidx].centroid); -#else - subtract(residualVectors[vid], globalCentroid[cidx].centroid); -#endif - } - } - } - } - - Matrix R; - Matrix::load(ofile + "_R.tsv", R); - vector> qv(vectors.size()); // quantized vector - vector> xp; // residual vector - if (residualVectors.empty()) { - xp = vectors; - } else { - xp = residualVectors; - } - Matrix::mulSquare(xp, R); - for (size_t m = 0; m < numberOfSubvectors; m++) { -#ifdef NGT_CLUSTERING - vector subClusters; -#else - vector subClusters; -#endif - stringstream str; - str << ofile << "-" << m << ".tsv"; -#ifdef NGT_CLUSTERING - NGT::Clustering::loadClusters(str.str(), subClusters); -#else - loadClusters(str.str(), subClusters); -#endif - vector> subVectors; - extractSubvector(xp, subVectors, m * subvectorSize, subvectorSize); - if (clusteringType == 'k') { -#ifdef NGT_CLUSTERING - NGT::Clustering::assign(subVectors, subClusters); -#else - assign(subVectors, subClusters); -#endif - } else { - cerr << "Using NGT for subvector" << endl; -#ifdef NGT_CLUSTERING - std::cerr << "not implemented" << std::endl; - abort(); -#else - assignWithNGT(subVectors, subClusters); -#endif - } -#ifdef NGT_CLUSTERING - double distortion = NGT::Clustering::calculateML2(subVectors, subClusters); -#else - double distortion = calculateML2(subVectors, subClusters); -#endif - cout << "distortion[" << m << "]=" << distortion << endl; - vector> subCentroids(vectors.size()); - for (size_t cidx = 0; cidx < subClusters.size(); ++cidx) { -#ifdef NGT_CLUSTERING - vector &members = subClusters[cidx].members; -#else - vector &members = subClusters[cidx].members; -#endif - for (size_t eidx = 0; eidx < members.size(); ++eidx) { -#ifdef NGT_CLUSTERING - NGT::Clustering::Entry &entry = members[eidx]; -#else - Entry &entry = members[eidx]; -#endif - assert(cidx == entry.centroidID); - subCentroids[entry.vectorID] = subClusters[cidx].centroid; - } - } - catSubvector(qv, subCentroids); - } -#ifdef NGT_CLUSTERING - double distortion = NGT::Clustering::distanceL2(qv, xp); -#else - double distortion = distanceL2(qv, xp); -#endif - cout << "distortion=" << distortion << endl; - return; - } + void evaluate(string global, vector> &vectors, char clusteringType, string &ofile, size_t &numberOfSubvectors, size_t &subvectorSize); - void evaluate(vector> &vectors, string &ofile, size_t &numberOfSubvectors, size_t &subvectorSize) { - cerr << "Evaluate" << endl; - Matrix R; - Matrix::load(ofile + "_R.tsv", R); - vector> xp = vectors; - Matrix::mulSquare(xp, R); - for (size_t m = 0; m < numberOfSubvectors; m++) { -#ifdef NGT_CLUSTERING - vector subClusters; -#else - vector subClusters; -#endif - stringstream str; - str << ofile << "-" << m << ".tsv"; -#ifdef NGT_CLUSTERING - NGT::Clustering::loadClusters(str.str(), subClusters); -#else - loadClusters(str.str(), subClusters); -#endif - vector> subVectors; - extractSubvector(xp, subVectors, m * subvectorSize, subvectorSize); -#ifdef NGT_CLUSTERING - NGT::Clustering::assign(subVectors, subClusters); - double distortion = NGT::Clustering::calculateML2(subVectors, subClusters); -#else - assign(subVectors, subClusters); - double distortion = calculateML2(subVectors, subClusters); -#endif - cout << "distortion[" << m << "]=" << distortion << endl; - for (size_t cidx = 0; cidx < subClusters.size(); ++cidx) { - cout << " members[" << cidx << "]=" << subClusters[cidx].members.size() << endl; - } - } - return; - } + void evaluate(vector> &vectors, string &ofile, size_t &numberOfSubvectors, size_t &subvectorSize); void generateResidualObjects(string global, vector> &vectors) @@ -520,267 +366,14 @@ namespace NGTQ { } #ifdef NGTQ_QBG - void optimize(const std::string indexPath, size_t threadSize = 0) { - NGT::StdOstreamRedirector redirector(silence); - redirector.begin(); - { - QBG::Index index(indexPath); - if (index.getQuantizer().objectList.size() <= 1) { - NGTThrowException("optimize: No objects"); - } - if (numberOfObjects == 0) { - numberOfObjects = index.getQuantizer().objectList.size() - 1; - } - std::cerr << "optimize: # of clusters=" << index.getQuantizer().property.localCentroidLimit << ":" << numberOfClusters << std::endl; - if (index.getQuantizer().property.localCentroidLimit == 0 && numberOfClusters == 0) { - std::stringstream msg; - msg << "optimize: # of clusters is illegal. " << index.getQuantizer().property.localCentroidLimit << ":" << numberOfClusters; - NGTThrowException(msg); - } - if (index.getQuantizer().property.localCentroidLimit != 0 && numberOfClusters != 0 && - index.getQuantizer().property.localCentroidLimit != numberOfClusters) { - std::cerr << "optimize: warning! # of clusters is already specified. " << index.getQuantizer().property.localCentroidLimit << ":" << numberOfClusters << std::endl; - } - if (numberOfClusters == 0) { - numberOfClusters = index.getQuantizer().property.localCentroidLimit; - } - - if (numberOfSubvectors == 0 && index.getQuantizer().property.localDivisionNo == 0) { - std::stringstream msg; - msg << "optimize: # of subvectors is illegal. " << numberOfSubvectors << ":" << index.getQuantizer().property.localDivisionNo; - NGTThrowException(msg); - } - if (numberOfSubvectors != 0 && index.getQuantizer().property.localDivisionNo != 0 && - numberOfSubvectors != index.getQuantizer().property.localDivisionNo) { - std::cerr << "optimize: warning! # of subvectros is already specified. " << numberOfSubvectors << ":" << index.getQuantizer().property.localDivisionNo << std::endl; - } - if (numberOfSubvectors == 0) { - numberOfSubvectors = index.getQuantizer().property.localDivisionNo; - } - - const std::string ws = indexPath + "/" + QBG::Index::getWorkspaceName(); - try { - NGT::Index::mkdir(ws); - } catch(...) {} - const std::string object = QBG::Index::getTrainObjectFileName(indexPath); - std::ofstream ofs; - ofs.open(object); - index.extract(ofs, numberOfObjects, randomizedObjectExtraction); - if (globalType == GlobalTypeZero) { - assert(index.getQuantizer().objectList.pseudoDimension != 0); - std::vector> global(1); - global[0].resize(index.getQuantizer().property.dimension, 0.0); - NGT::Clustering::saveVectors(QBG::Index::getQuantizerCodebookFileName(indexPath), global); - } else if (globalType == GlobalTypeMean) { - std::vector> vectors; - std::string objects = QBG::Index::getTrainObjectFileName(indexPath); -#ifdef NGT_CLUSTERING - NGT::Clustering::loadVectors(objects, vectors); -#else - loadVectors(objects, vectors); -#endif - if (vectors.size() == 0 || vectors[0].size() == 0) { - NGTThrowException("Optimizer::optimize: invalid input vectors"); - } - std::vector> global(1); - global[0].resize(index.getQuantizer().property.dimension, 0); - for (auto v = vectors.begin(); v != vectors.end(); ++v) { - for (size_t i = 0; i < (*v).size(); i++) { - global[0][i] += (*v)[i]; - } - } - for (size_t i = 0; i < global[0].size(); i++) { - global[0][i] /= vectors.size(); - } - NGT::Clustering::saveVectors(QBG::Index::getQuantizerCodebookFileName(indexPath), global); - } - } - - optimizeWithinIndex(indexPath); - - redirector.end(); - } + void optimize(const std::string indexPath, size_t threadSize = 0); #endif #ifdef NGTQ_QBG - void optimizeWithinIndex(std::string indexPath) { - std::string object; - std::string pq; - std::string global; - { - object = QBG::Index::getTrainObjectFileName(indexPath); - pq = QBG::Index::getPQFileName(indexPath); - global = QBG::Index::getQuantizerCodebookFileName(indexPath); - } - - try { - NGT::Index::mkdir(pq); - } catch(...) {} - pq += "/opt.tsv"; - optimize(object, pq, global); - } + void optimizeWithinIndex(std::string indexPath); #endif - void optimize(std::string invector, std::string ofile, std::string global) { - vector> vectors; - - -#ifdef NGT_CLUSTERING - NGT::Clustering::loadVectors(invector, vectors); -#else - loadVectors(invector, vectors); -#endif - - if (vectors.size() == 0) { - std::stringstream msg; - msg << "Optimizer: error! the specified vetor file is empty. " << invector << ". the size=" << vectors.size(); - NGTThrowException(msg); - } - - dim = vectors[0].size(); - subvectorSize = dim / numberOfSubvectors; - if (dim % numberOfSubvectors != 0) { - std::stringstream msg; - msg << "# of subspaces (m) is illegal. " << dim << ":" << numberOfSubvectors; - NGTThrowException(msg); - } - - - timelimitTimer.start(); - - Matrix reposition; - if (repositioning) { - reposition.zero(dim, dim); - size_t dstidx = 0; - for (size_t didx = 0; didx < numberOfSubvectors; didx++) { - for (size_t sdidx = 0; sdidx < subvectorSize; sdidx++) { - size_t srcidx = didx + sdidx * numberOfSubvectors; - auto col = dstidx; - auto row = srcidx; - reposition.set(row, col, 1.0); - dstidx++; - } - } - std::cerr << "Optimizer: Each axis was repositioned." << std::endl; - } - - - vector>> localClusters; - vector errors; - - bool useEye = false; - nOfMatrices = nOfMatrices == 0 ? 1 : nOfMatrices; - if (!rotation) { - iteration = 1; - seedStartObjectSizeRate = 1.0; - seedStep = 2; - } - useEye = !rotation; - vector> rs(nOfMatrices); - for (auto &r: rs) { - if (useEye) { - r.eye(dim); - } else { - r.randomRotation(dim); - } - } - - for (size_t vsize = static_cast(vectors.size()) * seedStartObjectSizeRate; ; vsize *= seedStep) { - auto partialVectors = vectors; - if (vsize < vectors.size()) { - partialVectors.resize(vsize); - } - - optimize(partialVectors, - global, - ofile, - reposition, - rs, - localClusters, - errors); - if (rs.size() > 1) { - nOfMatrices = static_cast(nOfMatrices) * (1.0 - reject); - nOfMatrices = nOfMatrices == 0 ? 1 : nOfMatrices; - vector*, vector>*>>> sortedErrors; - for (size_t idx = 0; idx < errors.size(); idx++) { - sortedErrors.emplace_back(make_pair(errors[idx], make_pair(&rs[idx], &localClusters[idx]))); - } - sort(sortedErrors.begin(), sortedErrors.end()); - vector> tmpMatrix; - vector>> tmpLocalClusters; - for (size_t idx = 0; idx < nOfMatrices; idx++) { - tmpMatrix.emplace_back(*sortedErrors[idx].second.first); - tmpLocalClusters.emplace_back(*sortedErrors[idx].second.second); - } - if (tmpMatrix.size() != nOfMatrices) { - std::cerr << "something strange. " << tmpMatrix.size() << ":" << nOfMatrices << std::endl; - } - rs = std::move(tmpMatrix); - localClusters = std::move(tmpLocalClusters); - } - if (vsize >= vectors.size()) { - break; - } - } - - if (rs.size() != 1) { - std::cerr << "Optimizer: Warning. rs.size=" << rs.size() << std::endl; - } - auto minR = std::move(rs[0]); - auto minLocalClusters = std::move(localClusters[0]); - size_t pos = std::distance(std::find(ofile.rbegin(), ofile.rend(), '.'), ofile.rend()) - 1; - string of = ofile.substr(0, pos); - if (repositioning) { - Matrix repositionedR(reposition); - repositionedR.mul(minR); - Matrix::save(of + "_R.tsv", repositionedR); - } else { - Matrix::save(of + "_R.tsv", minR); - } - if (showClusterInfo) { - if (minLocalClusters.size() != numberOfSubvectors) { - std::stringstream msg; - msg << "Fatal error. minLocalClusters.size() != numberOfSubvectors " << - minLocalClusters.size() << ":" << numberOfSubvectors; - NGTThrowException(msg); - } - float totalRate = 0.0; - for (size_t m = 0; m < minLocalClusters.size(); m++) { - size_t min = std::numeric_limits::max(); - size_t max = 0; - size_t nOfVectors = 0; - for (size_t i = 0; i < minLocalClusters[m].size(); i++) { - nOfVectors += minLocalClusters[m][i].members.size(); - if (minLocalClusters[m][i].members.size() < min) { - min = minLocalClusters[m][i].members.size(); - } - if (minLocalClusters[m][i].members.size() > max) { - max = minLocalClusters[m][i].members.size(); - } - } - float rate = static_cast(max - min) / static_cast(nOfVectors); - totalRate += rate; - std::cout << "cluster " << m << " " << rate << "," << max - min << "," << min << "," << max << " : "; - for (size_t i = 0; i < minLocalClusters[m].size(); i++) { - std::cout << minLocalClusters[m][i].members.size() << " "; - } - std::cout << std::endl; - } - totalRate /= minLocalClusters.size(); - std::cout << "Range rate=" << totalRate << std::endl; - std::cout << "Error=" << errors[0] << std::endl; - } - for (size_t m = 0; m < numberOfSubvectors; m++) { - stringstream str; - str << of << "-" << m << ".tsv"; -#ifdef NGT_CLUSTERING - NGT::Clustering::saveClusters(str.str(), minLocalClusters[m]); -#else - saveClusters(str.str(), minLocalClusters[m]); -#endif - } - - } + void optimize(std::string invector, std::string ofile, std::string global); NGT::Clustering::ClusteringType clusteringType; NGT::Clustering::InitializationMode initMode; @@ -808,4 +401,4 @@ namespace NGTQ { bool silence; bool showClusterInfo; }; -} // namespace NGTQ +} // namespace QBG diff --git a/lib/NGT/NGTQ/QbgCli.cpp b/lib/NGT/NGTQ/QbgCli.cpp index 4fc38a8..90e8f29 100644 --- a/lib/NGT/NGTQ/QbgCli.cpp +++ b/lib/NGT/NGTQ/QbgCli.cpp @@ -25,47 +25,42 @@ typedef NGTQ::Quantizer::ObjectList QBGObjectList; -class CreateParameters { +class QbgCliBuildParameters : public QBG::BuildParameters { public: - CreateParameters(NGT::Args &args) { - try { - index = args.get("#1"); - } catch (...) { - std::stringstream msg; - msg << "Command::CreateParameters: Error: An index is not specified."; - NGTThrowException(msg); - } - try { - objectPath = args.get("#2"); - } catch (...) {} + QbgCliBuildParameters(NGT::Args &a):args(a){ + args.parse("Zv"); + } + + void getBuildParameters() { + getHierarchicalClustringParameters(); + getOptimizationParameters(); + } + void getCreationParameters() { char objectType = args.getChar("o", 'f'); char distanceType = args.getChar("D", '2'); - numOfObjects = args.getl("n", 0); + creation.numOfObjects = args.getl("n", 0); - property.threadSize = args.getl("p", 24); - property.dimension = args.getl("d", 0); - property.globalRange = args.getf("R", 0); - property.localRange = args.getf("r", 0); - property.globalCentroidLimit = args.getl("C", 0); + creation.threadSize = args.getl("p", 24); + creation.dimension = args.getl("d", 0); #ifdef NGTQ_QBG - property.localCentroidLimit = args.getl("c", 16); + creation.localCentroidLimit = args.getl("c", 16); #else - property.localCentroidLimit = args.getl("c", 65000); + creation.localCentroidLimit = args.getl("c", 65000); #endif - property.localDivisionNo = args.getl("N", 8); - property.batchSize = args.getl("b", 1000); - property.localClusteringSampleCoefficient = args.getl("s", 10); + creation.localDivisionNo = args.getl("N", 0); + creation.batchSize = args.getl("b", 1000); + creation.localClusteringSampleCoefficient = args.getl("s", 10); { char localCentroidType = args.getChar("T", 'f'); - property.singleLocalCodebook = localCentroidType == 't' ? true : false; + creation.singleLocalCodebook = localCentroidType == 't' ? true : false; } { char centroidCreationMode = args.getChar("M", 'l'); switch(centroidCreationMode) { - case 'd': property.centroidCreationMode = NGTQ::CentroidCreationModeDynamic; break; - case 's': property.centroidCreationMode = NGTQ::CentroidCreationModeStatic; break; - case 'l': property.centroidCreationMode = NGTQ::CentroidCreationModeStaticLayer; break; + case 'd': creation.centroidCreationMode = NGTQ::CentroidCreationModeDynamic; break; + case 's': creation.centroidCreationMode = NGTQ::CentroidCreationModeStatic; break; + case 'l': creation.centroidCreationMode = NGTQ::CentroidCreationModeStaticLayer; break; default: std::stringstream msg; msg << "Command::CreateParameters: Error: Invalid centroid creation mode. " << centroidCreationMode; @@ -75,9 +70,9 @@ class CreateParameters { { char localCentroidCreationMode = args.getChar("L", 's'); switch(localCentroidCreationMode) { - case 'd': property.localCentroidCreationMode = NGTQ::CentroidCreationModeDynamic; break; - case 's': property.localCentroidCreationMode = NGTQ::CentroidCreationModeStatic; break; - case 'k': property.localCentroidCreationMode = NGTQ::CentroidCreationModeDynamicKmeans; break; + case 'd': creation.localCentroidCreationMode = NGTQ::CentroidCreationModeDynamic; break; + case 's': creation.localCentroidCreationMode = NGTQ::CentroidCreationModeStatic; break; + case 'k': creation.localCentroidCreationMode = NGTQ::CentroidCreationModeDynamicKmeans; break; default: std::stringstream msg; msg << "Command::CreateParameters: Error: Invalid centroid creation mode. " << localCentroidCreationMode; @@ -85,26 +80,26 @@ class CreateParameters { } } #ifdef NGTQ_QBG - property.localIDByteSize = args.getl("B", 1); + creation.localIDByteSize = args.getl("B", 1); #endif - globalProperty.edgeSizeForCreation = args.getl("E", 10); - globalProperty.edgeSizeForSearch = args.getl("S", 40); + creation.globalEdgeSizeForCreation = args.getl("E", 10); + creation.globalEdgeSizeForSearch = args.getl("S", 40); { char indexType = args.getChar("i", 't'); - globalProperty.indexType = indexType == 't' ? NGT::Property::GraphAndTree : NGT::Property::Graph; - localProperty.indexType = globalProperty.indexType; + creation.globalIndexType = indexType == 't' ? NGT::Property::GraphAndTree : NGT::Property::Graph; + creation.localIndexType = creation.globalIndexType; } - globalProperty.insertionRadiusCoefficient = args.getf("e", 0.1) + 1.0; - localProperty.insertionRadiusCoefficient = globalProperty.insertionRadiusCoefficient; + creation.globalInsertionRadiusCoefficient = args.getf("e", 0.1) + 1.0; + creation.localInsertionRadiusCoefficient = creation.globalInsertionRadiusCoefficient; switch (objectType) { - case 'f': property.dataType = NGTQ::DataTypeFloat; break; + case 'f': creation.dataType = NGTQ::DataTypeFloat; break; #ifdef NGT_HALF_FLOAT - case 'h': property.dataType = NGTQ::DataTypeFloat16; break; + case 'h': creation.dataType = NGTQ::DataTypeFloat16; break; #endif - case 'c': property.dataType = NGTQ::DataTypeUint8; break; + case 'c': creation.dataType = NGTQ::DataTypeUint8; break; default: std::stringstream msg; msg << "Command::CreateParameters: Error: Invalid object type. " << objectType; @@ -112,27 +107,28 @@ class CreateParameters { } switch (distanceType) { - case '2': property.distanceType = NGTQ::DistanceType::DistanceTypeL2; break; - case '1': property.distanceType = NGTQ::DistanceType::DistanceTypeL1; break; - case 'a': property.distanceType = NGTQ::DistanceType::DistanceTypeAngle; break; - case 'C': property.distanceType = NGTQ::DistanceType::DistanceTypeNormalizedCosine; break; - case 'E': property.distanceType = NGTQ::DistanceType::DistanceTypeL2; break; + case '2': creation.distanceType = NGTQ::DistanceType::DistanceTypeL2; break; + case '1': creation.distanceType = NGTQ::DistanceType::DistanceTypeL1; break; + case 'a': creation.distanceType = NGTQ::DistanceType::DistanceTypeAngle; break; + case 'C': creation.distanceType = NGTQ::DistanceType::DistanceTypeNormalizedCosine; break; + case 'E': creation.distanceType = NGTQ::DistanceType::DistanceTypeL2; break; default: std::stringstream msg; msg << "Command::CreateParameters: Error: Invalid distance type. " << distanceType; NGTThrowException(msg); } #ifdef NGTQ_QBG - property.genuineDimension = property.dimension; - property.dimension = args.getl("P", property.genuineDimension); + creation.genuineDimension = creation.dimension; + creation.dimension = args.getl("P", creation.genuineDimension); + creation.dimensionOfSubvector = args.getl("Q", 0); { char objectType = args.getChar("O", 'f'); switch (objectType) { - case 'f': property.genuineDataType = ObjectFile::DataTypeFloat; break; + case 'f': creation.genuineDataType = ObjectFile::DataTypeFloat; break; #ifdef NGT_HALF_FLOAT - case 'h': property.genuineDataType = ObjectFile::DataTypeFloat16; break; + case 'h': creation.genuineDataType = ObjectFile::DataTypeFloat16; break; #endif - case 'c': property.genuineDataType = ObjectFile::DataTypeUint8; break; + case 'c': creation.genuineDataType = ObjectFile::DataTypeUint8; break; default: std::stringstream msg; msg << "Command::CreateParameters: Error: Invalid genuine object type. " << objectType; @@ -141,149 +137,234 @@ class CreateParameters { } #endif + } - std::string index; - std::string objectPath; - size_t numOfObjects; - NGTQ::Property property; - NGT::Property globalProperty; - NGT::Property localProperty; + void getHierarchicalClustringParameters() { + hierarchicalClustering.maxSize = args.getl("r", 1000); + hierarchicalClustering.numOfObjects = args.getl("O", 0); + hierarchicalClustering.numOfClusters = args.getl("E", 2); + try { + hierarchicalClustering.numOfTotalClusters = args.getl("C", 0); + } catch (...) { + hierarchicalClustering.numOfTotalClusters = 0; + } + hierarchicalClustering.numOfTotalBlobs = args.getl("b", 0); + hierarchicalClustering.clusterID = args.getl("c", -1); + silence = !args.getBool("v"); + + char iMode = args.getChar("i", '-'); + hierarchicalClustering.initMode = NGT::Clustering::InitializationModeKmeansPlusPlus; + switch (iMode) { + case 'l': + case 'h': hierarchicalClustering.initMode = NGT::Clustering::InitializationModeHead; break; + case 'r': hierarchicalClustering.initMode = NGT::Clustering::InitializationModeRandom; break; + case 'R': hierarchicalClustering.initMode = NGT::Clustering::InitializationModeRandomFixedSeed; break; + case 'P': hierarchicalClustering.initMode = NGT::Clustering::InitializationModeKmeansPlusPlusFixedSeed; break; + default: + case '-': + case 'p': hierarchicalClustering.initMode = NGT::Clustering::InitializationModeKmeansPlusPlus; break; + } -}; + hierarchicalClustering.numOfRandomObjects = args.getl("r", 0); + char rmode = args.getChar("R", '-'); + if (rmode == 'c') { + hierarchicalClustering.extractCentroid = true; + } else { + hierarchicalClustering.extractCentroid = false; + } -class SearchParameters : public NGT::Command::SearchParameters { -public: - SearchParameters(NGT::Args &args): NGT::Command::SearchParameters(args, "0.02") { - stepOfResultExpansion = 2; - std::string resultExpansion = args.getString("p", "3.0"); - std::vector tokens; - NGT::Common::tokenize(resultExpansion, tokens, ":"); - if (tokens.size() >= 1) { beginOfResultExpansion = endOfResultExpansion = NGT::Common::strtod(tokens[0]); } - if (tokens.size() >= 2) { endOfResultExpansion = NGT::Common::strtod(tokens[1]); } - if (tokens.size() >= 3) { stepOfResultExpansion = NGT::Common::strtod(tokens[2]); } + hierarchicalClustering.numOfFirstObjects = 0; + hierarchicalClustering.numOfFirstClusters = 0; + hierarchicalClustering.numOfSecondObjects = 0; + hierarchicalClustering.numOfSecondClusters = 0; + hierarchicalClustering.numOfThirdClusters = 0; + + hierarchicalClustering.threeLayerClustering = true; + + std::string blob = args.getString("B", ""); + if (blob == "-") { + hierarchicalClustering.threeLayerClustering = false; + } else { + hierarchicalClustering.threeLayerClustering = true; + } + if (hierarchicalClustering.threeLayerClustering) { + std::vector tokens; + NGT::Common::tokenize(blob, tokens, ","); + if (tokens.size() > 0) { + std::vector ftokens; + NGT::Common::tokenize(tokens[0], ftokens, ":"); + if (ftokens.size() >= 1) { + hierarchicalClustering.numOfFirstObjects = NGT::Common::strtof(ftokens[0]); + } + if (ftokens.size() >= 2) { + hierarchicalClustering.numOfFirstClusters = NGT::Common::strtof(ftokens[1]); + } + } + if (tokens.size() > 1) { + std::vector ftokens; + NGT::Common::tokenize(tokens[1], ftokens, ":"); + if (ftokens.size() >= 1) { + hierarchicalClustering.numOfSecondObjects = NGT::Common::strtof(ftokens[0]); + } + if (ftokens.size() >= 2) { + hierarchicalClustering.numOfSecondClusters = NGT::Common::strtof(ftokens[1]); + } + } + if (tokens.size() > 2) { + std::vector ftokens; + NGT::Common::tokenize(tokens[2], ftokens, ":"); + if (ftokens.size() >= 1) { + if (ftokens[0] == "" || ftokens[0] == "-") { + hierarchicalClustering.numOfObjects = 0; + } else { + hierarchicalClustering.numOfObjects = NGT::Common::strtof(ftokens[0]); + } + } + if (ftokens.size() >= 2) { + hierarchicalClustering.numOfThirdClusters = NGT::Common::strtof(ftokens[1]); + } + } + } } - float beginOfResultExpansion; - float endOfResultExpansion; - float stepOfResultExpansion; -}; -static void optimizationParameters(NGT::Args &args, NGTQ::Optimizer &optimizer) { - args.parse("Zv"); - optimizer.numberOfObjects = args.getl("o", 1000); - optimizer.numberOfClusters = args.getl("n", 0); - optimizer.numberOfSubvectors = args.getl("m", 0); + void getOptimizationParameters() { + optimization.numberOfObjects = args.getl("o", 1000); + optimization.numberOfClusters = args.getl("n", 0); + optimization.numberOfSubvectors = args.getl("m", 0); - optimizer.randomizedObjectExtraction = true; + optimization.randomizedObjectExtraction = true; #ifdef NGT_CLUSTERING - string cType; - try { - cType = args.getString("C", "k"); - } catch(...) {} - - optimizer.clusteringType = NGT::Clustering::ClusteringTypeKmeansWithNGT; - if (cType == "k") { - optimizer.clusteringType = NGT::Clustering::ClusteringTypeKmeansWithoutNGT; - } else if (cType == "KS") { - optimizer.clusteringType = NGT::Clustering::ClusteringTypeKmeansWithNGT; - } else if (cType == "i") { - optimizer.clusteringType = NGT::Clustering::ClusteringTypeKmeansWithIteration; - } else { - std::stringstream msg; - msg << "invalid clustering type. " << cType; - NGTThrowException(msg); - } + string cType; + try { + cType = args.getString("C", "k"); + } catch(...) {} + + optimization.clusteringType = NGT::Clustering::ClusteringTypeKmeansWithNGT; + if (cType == "k") { + optimization.clusteringType = NGT::Clustering::ClusteringTypeKmeansWithoutNGT; + } else if (cType == "KS") { + optimization.clusteringType = NGT::Clustering::ClusteringTypeKmeansWithNGT; + } else if (cType == "i") { + optimization.clusteringType = NGT::Clustering::ClusteringTypeKmeansWithIteration; + } else { + std::stringstream msg; + msg << "invalid clustering type. " << cType; + NGTThrowException(msg); + } #else - char clusteringType; - try { - clusteringType = args.getChar("C", 'k'); - } catch(...) {} + char clusteringType; + try { + clusteringType = args.getChar("C", 'k'); + } catch(...) {} #endif #ifdef NGT_CLUSTERING - char iMode = args.getChar("i", '-'); - optimizer.initMode = NGT::Clustering::InitializationModeKmeansPlusPlus; - switch (iMode) { - case 'h': optimizer.initMode = NGT::Clustering::InitializationModeHead; break; - case 'r': optimizer.initMode = NGT::Clustering::InitializationModeRandom; break; - case 'p': optimizer.initMode = NGT::Clustering::InitializationModeKmeansPlusPlus; break; - case 'R': optimizer.initMode = NGT::Clustering::InitializationModeRandomFixedSeed; break; - case 'P': optimizer.initMode = NGT::Clustering::InitializationModeKmeansPlusPlusFixedSeed; break; - default: - case '-': - case 'b': optimizer.initMode = NGT::Clustering::InitializationModeBest; break; - } + char iMode = args.getChar("i", '-'); + optimization.initMode = NGT::Clustering::InitializationModeKmeansPlusPlus; + switch (iMode) { + case 'h': optimization.initMode = NGT::Clustering::InitializationModeHead; break; + case 'r': optimization.initMode = NGT::Clustering::InitializationModeRandom; break; + case 'p': optimization.initMode = NGT::Clustering::InitializationModeKmeansPlusPlus; break; + case 'R': optimization.initMode = NGT::Clustering::InitializationModeRandomFixedSeed; break; + case 'P': optimization.initMode = NGT::Clustering::InitializationModeKmeansPlusPlusFixedSeed; break; + default: + case '-': + case 'b': optimization.initMode = NGT::Clustering::InitializationModeBest; break; + } #else - optimizer.initMode = args.getChar("i", '-'); + optimization.initMode = args.getChar("i", '-'); #endif - optimizer.convergenceLimitTimes = args.getl("c", 5); - optimizer.iteration = args.getl("t", 100); - optimizer.clusterIteration = args.getl("I", 100); - - optimizer.clusterSizeConstraint = false; - if (args.getChar("s", 'f') == 't') { - optimizer.clusterSizeConstraintCoefficient = 5.0; - optimizer.clusterSizeConstraint = true; - } else if (args.getChar("s", 'f') == 'f') { - optimizer.clusterSizeConstraint = false; - } else { - optimizer.clusterSizeConstraint = true; - optimizer.clusterSizeConstraintCoefficient = args.getf("s", 5.0); - } + optimization.convergenceLimitTimes = args.getl("c", 5); + optimization.iteration = args.getl("t", 100); + optimization.clusterIteration = args.getl("I", 100); + + optimization.clusterSizeConstraint = false; + if (args.getChar("s", 'f') == 't') { + optimization.clusterSizeConstraintCoefficient = 5.0; + optimization.clusterSizeConstraint = true; + } else if (args.getChar("s", 'f') == 'f') { + optimization.clusterSizeConstraint = false; + } else { + optimization.clusterSizeConstraint = true; + optimization.clusterSizeConstraintCoefficient = args.getf("s", 5.0); + } - optimizer.nOfMatrices = args.getl("M", 2); - optimizer.seedStartObjectSizeRate = args.getf("S", 0.1); - optimizer.seedStep = args.getl("X", 2); - optimizer.reject = args.getf("R", 0.9); - optimizer.timelimit = args.getf("L", 24 * 1); - optimizer.timelimit *= 60.0 * 60.0; - optimizer.showClusterInfo = args.getBool("Z"); - optimizer.silence = !args.getBool("v"); + optimization.nOfMatrices = args.getl("M", 2); + optimization.seedStartObjectSizeRate = args.getf("S", 0.1); + optimization.seedStep = args.getl("X", 2); + optimization.reject = args.getf("R", 0.9); + optimization.timelimit = args.getf("L", 24 * 1); + optimization.timelimit *= 60.0 * 60.0; + optimization.showClusterInfo = args.getBool("Z"); + silence = !args.getBool("v"); #ifdef NGTQG_NO_ROTATION - char positionMode = args.getChar("P", 'n'); + char positionMode = args.getChar("P", 'n'); #else - char positionMode = args.getChar("P", 'r'); + char positionMode = args.getChar("P", 'r'); #endif - switch (positionMode) { - case 'r': - optimizer.rotation = true; - optimizer.repositioning = false; - break; - case 'R': - optimizer.rotation = true; - optimizer.repositioning = true; - break; - case 'p': - optimizer.rotation = false; - optimizer.repositioning = true; - break; - case 'n': - default: - optimizer.rotation = false; - optimizer.repositioning = false; + switch (positionMode) { + case 'r': + optimization.rotation = true; + optimization.repositioning = false; + break; + case 'R': + optimization.rotation = true; + optimization.repositioning = true; + break; + case 'p': + optimization.rotation = false; + optimization.repositioning = true; + break; + case 'n': + default: + optimization.rotation = false; + optimization.repositioning = false; + } + char globalType = args.getChar("G", '-'); + switch (globalType) { + case 'z': + optimization.globalType = QBG::Optimizer::GlobalTypeZero; break; + case 'm': + optimization.globalType = QBG::Optimizer::GlobalTypeMean; break; + default: + case 'n': + optimization.globalType = QBG::Optimizer::GlobalTypeNone; break; + break; + } } - char globalType = args.getChar("G", '-'); - switch (globalType) { - case 'z': - optimizer.globalType = NGTQ::Optimizer::GlobalTypeZero; break; - case 'm': - optimizer.globalType = NGTQ::Optimizer::GlobalTypeMean; break; - default: - case 'n': - optimizer.globalType = NGTQ::Optimizer::GlobalTypeNone; break; - break; +protected: + NGT::Args &args; +}; + + +class SearchParameters : public NGT::Command::SearchParameters { +public: + SearchParameters(NGT::Args &args): NGT::Command::SearchParameters(args, "0.02") { + stepOfResultExpansion = 2; + std::string resultExpansion = args.getString("p", "3.0"); + std::vector tokens; + NGT::Common::tokenize(resultExpansion, tokens, ":"); + if (tokens.size() >= 1) { beginOfResultExpansion = endOfResultExpansion = NGT::Common::strtod(tokens[0]); } + if (tokens.size() >= 2) { endOfResultExpansion = NGT::Common::strtod(tokens[1]); } + if (tokens.size() >= 3) { stepOfResultExpansion = NGT::Common::strtod(tokens[2]); } } -} + float beginOfResultExpansion; + float endOfResultExpansion; + float stepOfResultExpansion; +}; + void QBG::CLI::buildQG(NGT::Args &args) { const std::string usage = "Usage: qbg build-qg [-Q dimension-of-subvector] [-E max-number-of-edges] index"; - args.parse("Zv"); + QbgCliBuildParameters buildParameters(args); + buildParameters.getBuildParameters(); string indexPath; try { @@ -300,14 +381,8 @@ QBG::CLI::buildQG(NGT::Args &args) const std::string qgPath = indexPath + "/qg"; if (phase == 0 || phase == 1) { - NGTQ::Optimizer optimizer; - optimizer.globalType = NGTQ::Optimizer::GlobalTypeZero; - try { - optimizationParameters(args, optimizer); - } catch(NGT::Exception &err) { - std::cerr << err.what() << std::endl; - cerr << usage << endl; - } + QBG::Optimizer optimizer(buildParameters); + optimizer.globalType = QBG::Optimizer::GlobalTypeZero; #ifdef NGTQG_NO_ROTATION if (optimizer.rotation || optimizer.repositioning) { @@ -317,11 +392,6 @@ QBG::CLI::buildQG(NGT::Args &args) } #endif - if (optimizer.globalType == NGTQ::Optimizer::GlobalTypeNone) { - std::cerr << "build-qg: Warning! None is unavailable for the global type. Zero is set to the global type." << std::endl; - optimizer.globalType = NGTQ::Optimizer::GlobalTypeZero; - } - std::cerr << "optimizing..." << std::endl; optimizer.optimize(qgPath); } @@ -333,7 +403,7 @@ QBG::CLI::buildQG(NGT::Args &args) if (phase == 0 || phase == 3) { std::cerr << "building the quantized graph... " << std::endl; bool silence = true; - NGTQG::Index::quantize(indexPath, maxNumOfEdges, silence); + NGTQG::Index::realign(indexPath, maxNumOfEdges, silence); } } @@ -500,9 +570,11 @@ QBG::CLI::searchQG(NGT::Args &args) { void QBG::CLI::createQG(NGT::Args &args) { - const std::string usage = "Usage: qbg create-qbg [-Q dimension-of-subvector] index"; + const std::string usage = "Usage: qbg create-qg [-Q dimension-of-subvector] index"; - args.parse("v"); + QbgCliBuildParameters buildParameters(args); + buildParameters.getCreationParameters(); + string indexPath; try { indexPath = args.get("#1"); @@ -511,18 +583,9 @@ QBG::CLI::createQG(NGT::Args &args) cerr << usage << endl; return; } - char mode = args.getChar("m", '-'); - size_t dimensionOfSubvector = args.getl("Q", 0); - size_t seudoDimension = args.getl("P", 0); - bool silence = !args.getBool("v"); - std::cerr << "creating..." << std::endl; - NGTQG::Index::create(indexPath, dimensionOfSubvector, seudoDimension); - - if (mode == '-') { - std::cerr << "appending..." << std::endl; - QBG::Index::appendFromObjectRepository(indexPath, indexPath + "/qg", silence); - } + NGTQG::Index::create(indexPath, buildParameters); + NGTQG::Index::append(indexPath, buildParameters); } void @@ -541,32 +604,6 @@ QBG::CLI::appendQG(NGT::Args &args) } -void -QBG::CLI::quantizeQG(NGT::Args &args) -{ -#ifdef NGTQ_QBG - const std::string usage = "Usage: ngtqg quantize [-E max-number-of-edges] index"; -#else - const std::string usage = "Usage: ngtqg quantize [-Q dimension-of-subvector] [-E max-number-of-edges] index"; -#endif - string indexPath; - try { - indexPath = args.get("#1"); - } catch (...) { - cerr << "An index is not specified." << endl; - cerr << usage << endl; - return; - } - size_t maxNumOfEdges = args.getl("E", 128); - -#ifdef NGTQ_QBG - NGTQG::Index::quantize(indexPath, maxNumOfEdges); -#else - size_t dimensionOfSubvector = args.getl("Q", 0); - NGTQG::Index::quantize(indexPath, dimensionOfSubvector, maxNumOfEdges); -#endif -} - void QBG::CLI::create(NGT::Args &args) { @@ -580,10 +617,10 @@ QBG::CLI::create(NGT::Args &args) "index(OUT) data.tsv(IN) rotation(IN)"; try { - CreateParameters createParameters(args); - cerr << "qbg: Create" << endl; -#ifdef NGTQ_QBG + QbgCliBuildParameters buildParameters(args); + buildParameters.getCreationParameters(); + std::vector r; auto *rotation = &r; { @@ -609,20 +646,13 @@ QBG::CLI::create(NGT::Args &args) } std::cerr << "rotation matrix size=" << r.size() << std::endl; } - - QBG::Index::create(createParameters.index, createParameters.property, createParameters.globalProperty, - createParameters.localProperty, rotation, createParameters.objectPath); -#else - QBG::Index::create(createParameters.index, createParameters.property, createParameters.globalProperty, - createParameters.localProperty); -#endif - -#ifndef NGTQ_QBG - if (!createParameters.objectPath.empty()) { - cerr << "qbg: Append" << endl; - QBG::Index::append(createParameters.index, createParameters.objectPath, createParameters.numOfObjects); - } -#endif + std::string indexPath = args.get("#1"); + std::string objectPath; + try { + objectPath = args.get("#2"); + } catch(...) {} + + QBG::Index::create(indexPath, buildParameters, rotation, objectPath); } catch(NGT::Exception &err) { std::cerr << err.what() << std::endl; cerr << usage << endl; @@ -883,98 +913,6 @@ QBG::CLI::append(NGT::Args &args) } } -static void hierarchicalKmeansParameters(NGT::Args &args, QBG::HierarchicalKmeans &hierarchicalKmeans) -{ - args.parse("Zv"); - hierarchicalKmeans.maxSize = args.getl("r", 1000); - hierarchicalKmeans.numOfObjects = args.getl("O", 0); - hierarchicalKmeans.numOfClusters = args.getl("E", 2); - try { - hierarchicalKmeans.numOfTotalClusters = args.getl("C", 0); - } catch (...) { - hierarchicalKmeans.numOfTotalClusters = 0; - } - hierarchicalKmeans.numOfTotalBlobs = args.getl("b", 0); - hierarchicalKmeans.clusterID = args.getl("c", -1); - hierarchicalKmeans.silence = !args.getBool("v"); - - char iMode = args.getChar("i", '-'); - hierarchicalKmeans.initMode = NGT::Clustering::InitializationModeKmeansPlusPlus; - switch (iMode) { - case 'l': - case 'h': hierarchicalKmeans.initMode = NGT::Clustering::InitializationModeHead; break; - case 'r': hierarchicalKmeans.initMode = NGT::Clustering::InitializationModeRandom; break; - case 'R': hierarchicalKmeans.initMode = NGT::Clustering::InitializationModeRandomFixedSeed; break; - case 'P': hierarchicalKmeans.initMode = NGT::Clustering::InitializationModeKmeansPlusPlusFixedSeed; break; - default: - case '-': - case 'p': hierarchicalKmeans.initMode = NGT::Clustering::InitializationModeKmeansPlusPlus; break; - } - - hierarchicalKmeans.numOfRandomObjects = args.getl("r", 0); - char rmode = args.getChar("R", '-'); - if (rmode == 'c') { - hierarchicalKmeans.extractCentroid = true; - } else { - hierarchicalKmeans.extractCentroid = false; - } - - hierarchicalKmeans.numOfFirstObjects = 0; - hierarchicalKmeans.numOfFirstClusters = 0; - hierarchicalKmeans.numOfSecondObjects = 0; - hierarchicalKmeans.numOfSecondClusters = 0; - hierarchicalKmeans.numOfThirdClusters = 0; - - hierarchicalKmeans.threeLayerClustering = true; - - std::string blob = args.getString("B", ""); - if (blob == "-") { - hierarchicalKmeans.threeLayerClustering = false; - } else { - hierarchicalKmeans.threeLayerClustering = true; - } - if (hierarchicalKmeans.threeLayerClustering) { - std::vector tokens; - NGT::Common::tokenize(blob, tokens, ","); - if (tokens.size() > 0) { - std::vector ftokens; - NGT::Common::tokenize(tokens[0], ftokens, ":"); - if (ftokens.size() >= 1) { - hierarchicalKmeans.numOfFirstObjects = NGT::Common::strtof(ftokens[0]); - } - if (ftokens.size() >= 2) { - hierarchicalKmeans.numOfFirstClusters = NGT::Common::strtof(ftokens[1]); - } - } - if (tokens.size() > 1) { - std::vector ftokens; - NGT::Common::tokenize(tokens[1], ftokens, ":"); - if (ftokens.size() >= 1) { - hierarchicalKmeans.numOfSecondObjects = NGT::Common::strtof(ftokens[0]); - } - if (ftokens.size() >= 2) { - hierarchicalKmeans.numOfSecondClusters = NGT::Common::strtof(ftokens[1]); - } - } - if (tokens.size() > 2) { - std::vector ftokens; - NGT::Common::tokenize(tokens[2], ftokens, ":"); - if (ftokens.size() >= 1) { - hierarchicalKmeans.numOfObjects = NGT::Common::strtof(ftokens[0]); - } - if (ftokens.size() >= 2) { - hierarchicalKmeans.numOfThirdClusters = NGT::Common::strtof(ftokens[1]); - } - } - std::cerr << "blob param=:" << std::endl; - std::cerr << "numOfFirstObjects=" << hierarchicalKmeans.numOfFirstObjects << std::endl; - std::cerr << "numOfFirstClusters=" << hierarchicalKmeans.numOfFirstClusters << std::endl; - std::cerr << "numOfSecondClusters=" << hierarchicalKmeans.numOfSecondClusters << std::endl; - std::cerr << "numOfSecondObjects=" << hierarchicalKmeans.numOfSecondObjects << std::endl; - std::cerr << "numOfThirdClusters=" << hierarchicalKmeans.numOfThirdClusters << std::endl; - std::cerr << "numOfThirdObjects=" << hierarchicalKmeans.numOfObjects << std::endl; - } -} void QBG::CLI::buildIndex(NGT::Args &args) @@ -1120,7 +1058,8 @@ QBG::CLI::build(NGT::Args &args) { const std::string usage = "Usage: qbg build [-Q dimension-of-subvector] [-E max-number-of-edges] index"; - args.parse("Zv"); + QbgCliBuildParameters buildParameters(args); + buildParameters.getBuildParameters(); string indexPath; try { @@ -1131,32 +1070,25 @@ QBG::CLI::build(NGT::Args &args) return; } - HierarchicalKmeans hierarchicalKmeans; + size_t phase = args.getl("p", 0); - try { - hierarchicalKmeansParameters(args, hierarchicalKmeans); - } catch (NGT::Exception &err) { - cerr << "Error " << err.what() << endl; - cerr << usage << endl; - return; - } + HierarchicalKmeans hierarchicalKmeans(buildParameters); - hierarchicalKmeans.clustering(indexPath); + if (phase == 0 || phase == 1) { + hierarchicalKmeans.clustering(indexPath); + } - NGTQ::Optimizer optimizer; + QBG::Optimizer optimizer(buildParameters); - try { - optimizationParameters(args, optimizer); - } catch(NGT::Exception &err) { - std::cerr << err.what() << std::endl; - cerr << usage << endl; + if (phase == 0 || phase == 2) { + std::cerr << "optimizing..." << std::endl; + optimizer.optimize(indexPath); } - - std::cerr << "optimizing..." << std::endl; - optimizer.optimize(indexPath); - std::cerr << "building..." << std::endl; - QBG::Index::build(indexPath, optimizer.silence); + if (phase == 0 || phase == 3) { + std::cerr << "building..." << std::endl; + QBG::Index::build(indexPath, optimizer.silence); + } } @@ -1167,6 +1099,8 @@ QBG::CLI::hierarchicalKmeans(NGT::Args &args) const std::string usage = "qbg kmeans -O #-of-objects -B x1:y1,x2,y2,x3 index [prefix] [object-ID-file]"; std::string indexPath; + QbgCliBuildParameters buildParameters(args); + try { indexPath = args.get("#1"); } catch (...) { @@ -1191,15 +1125,8 @@ QBG::CLI::hierarchicalKmeans(NGT::Args &args) cerr << "Object ID file is not specified" << endl; } - HierarchicalKmeans hierarchicalKmeans; + HierarchicalKmeans hierarchicalKmeans(buildParameters); - try { - hierarchicalKmeansParameters(args, hierarchicalKmeans); - } catch (NGT::Exception &err) { - cerr << "Error " << err.what() << endl; - cerr << usage << endl; - return; - } hierarchicalKmeans.clustering(indexPath, prefix, objectIDsFile); } @@ -1525,6 +1452,8 @@ QBG::CLI::optimize(NGT::Args &args) string usage = "Usage: qbg optimize -n number-of-clusters -m number-of subspaces [-O t|f] [-s t|f] [-I cluster-iteration] [-t R-max-iteration] [-c convergence-limit-times] vector-file [output-file-prefix]\n" " qbg optimize -e E -n number-of-clusters -m number-of index [subspaces] [vector-file] [local-centroid-file] [global-centroid-file]"; + QbgCliBuildParameters buildParameters(args); + std::string indexPath; try { indexPath = args.get("#1"); @@ -1549,14 +1478,8 @@ QBG::CLI::optimize(NGT::Args &args) global = args.get("#4"); } catch(...) {} - NGTQ::Optimizer optimizer; + QBG::Optimizer optimizer(buildParameters); - try { - optimizationParameters(args, optimizer); - } catch(NGT::Exception &err) { - std::cerr << err.what() << std::endl; - cerr << usage << endl; - } if (invector.empty() || ofile.empty() || global.empty()) { optimizer.optimize(indexPath); diff --git a/lib/NGT/NGTQ/QbgCli.h b/lib/NGT/NGTQ/QbgCli.h index 1d849cb..aa33c6c 100644 --- a/lib/NGT/NGTQ/QbgCli.h +++ b/lib/NGT/NGTQ/QbgCli.h @@ -39,7 +39,6 @@ namespace QBG { void gtRange(NGT::Args &args) {}; void optimize(NGT::Args &args) {}; void build(NGT::Args &args) {}; - void quantizeQG(NGT::Args &args) {}; void createQG(NGT::Args &args) {}; void buildQG(NGT::Args &args) {}; void appendQG(NGT::Args &args) {}; @@ -58,7 +57,6 @@ namespace QBG { void gtRange(NGT::Args &args); void optimize(NGT::Args &args); void build(NGT::Args &args); - void quantizeQG(NGT::Args &args); void createQG(NGT::Args &args); void buildQG(NGT::Args &args); void appendQG(NGT::Args &args); @@ -115,8 +113,6 @@ namespace QBG { build(args); } else if (command == "create-qg") { createQG(args); - } else if (command == "quantize-qg") { - quantizeQG(args); } else if (command == "build-qg") { buildQG(args); } else if (command == "append-qg") { diff --git a/lib/NGT/NGTQ/QuantizedBlobGraph.h b/lib/NGT/NGTQ/QuantizedBlobGraph.h index 34f45e0..f33ec52 100644 --- a/lib/NGT/NGTQ/QuantizedBlobGraph.h +++ b/lib/NGT/NGTQ/QuantizedBlobGraph.h @@ -21,11 +21,191 @@ #ifdef NGTQ_QBG #include "NGT/NGTQ/QuantizedGraph.h" +#include "NGT/NGTQ/Optimizer.h" #include namespace QBG { + + class BuildParameters { + public: + BuildParameters(){ setDefault(); } + + void setDefault() { + creation.numOfObjects = 0; + creation.threadSize = 24; + creation.localCentroidLimit = 16; + creation.dimension = 0; +#ifdef NGTQ_QBG + creation.genuineDimension = 0; + creation.dimensionOfSubvector = 1; + creation.genuineDataType = ObjectFile::DataTypeFloat; +#endif + creation.dataType = NGTQ::DataTypeFloat; + creation.distanceType = NGTQ::DistanceType::DistanceTypeL2; + creation.singleLocalCodebook = false; + creation.localDivisionNo = 0; + creation.batchSize = 1000; + creation.centroidCreationMode = NGTQ::CentroidCreationModeStaticLayer; + creation.localCentroidCreationMode = NGTQ::CentroidCreationModeStatic; + creation.localIDByteSize = 1; + creation.localClusteringSampleCoefficient = 10; + creation.globalEdgeSizeForCreation = 10; + creation.globalEdgeSizeForSearch = 40; + creation.globalIndexType = NGT::Property::GraphAndTree; + creation.globalInsertionRadiusCoefficient = 1.1; + creation.localIndexType = NGT::Property::GraphAndTree; + creation.localInsertionRadiusCoefficient = 1.1; + hierarchicalClustering.maxSize = 1000; + hierarchicalClustering.numOfObjects = 0; + hierarchicalClustering.numOfClusters = 2; + hierarchicalClustering.numOfTotalClusters = 0; + hierarchicalClustering.numOfTotalBlobs = 0; + hierarchicalClustering.clusterID = -1; + hierarchicalClustering.initMode = NGT::Clustering::InitializationModeKmeansPlusPlus; + hierarchicalClustering.numOfRandomObjects = 0; + hierarchicalClustering.numOfFirstObjects = 0; + hierarchicalClustering.numOfFirstClusters = 0; + hierarchicalClustering.numOfSecondObjects = 0; + hierarchicalClustering.numOfSecondClusters = 0; + hierarchicalClustering.numOfThirdClusters = 0; + hierarchicalClustering.extractCentroid = false; + hierarchicalClustering.threeLayerClustering = true; + + optimization.timelimit = 24 * 1 * 60.0 * 60.0; + optimization.iteration = 100; + optimization.clusterIteration = 100; + optimization.clusterSizeConstraint = false; + optimization.clusterSizeConstraintCoefficient = 5.0; + optimization.convergenceLimitTimes = 5; + optimization.numberOfObjects = 1000; + optimization.numberOfClusters = 0; + optimization.numberOfSubvectors = 0; + optimization.nOfMatrices = 2; + optimization.seedStartObjectSizeRate = 0.1; + optimization.seedStep = 2; + optimization.reject = 0.9; + optimization.repositioning = false; + optimization.rotation = true; + optimization.globalType = QBG::Optimizer::GlobalTypeNone; + optimization.randomizedObjectExtraction = true; + } + + void setProperties(NGTQ::Property &property, NGT::Property &globalProperty, + NGT::Property &localProperty) { + property.threadSize = creation.threadSize; + property.globalCentroidLimit = 0; + property.localCentroidLimit = creation.localCentroidLimit; + property.dimension = creation.dimension; + property.globalRange = 0; + property.localRange = 0; + property.localCentroidLimit = creation.localCentroidLimit; +#ifdef NGTQ_QBG + property.genuineDimension = creation.genuineDimension; + //-/property.dimensionOfSubvector = creation.dimensionOfSubvector; + property.genuineDataType = creation.genuineDataType; +#endif + property.dataType = creation.dataType; + property.distanceType = creation.distanceType; + property.singleLocalCodebook = false; + property.localDivisionNo = creation.localDivisionNo; + property.batchSize = creation.batchSize; + property.centroidCreationMode = creation.centroidCreationMode; + property.localCentroidCreationMode = creation.localCentroidCreationMode; + property.localIDByteSize = creation.localIDByteSize; + property.localClusteringSampleCoefficient = creation.localClusteringSampleCoefficient; + globalProperty.edgeSizeForCreation = creation.globalEdgeSizeForCreation; + globalProperty.edgeSizeForSearch = creation.globalEdgeSizeForSearch; + globalProperty.indexType = creation.globalIndexType; + globalProperty.insertionRadiusCoefficient = creation.globalInsertionRadiusCoefficient; + globalProperty.graphType = creation.globalGraphType; + localProperty.indexType = creation.localIndexType; + localProperty.insertionRadiusCoefficient = creation.localInsertionRadiusCoefficient; + localProperty.graphType = creation.localGraphType; + } + struct { + size_t numOfObjects; + size_t threadSize; + size_t localCentroidLimit; + size_t dimension; +#ifdef NGTQ_QBG + size_t genuineDimension; + size_t dimensionOfSubvector; + ObjectFile::DataType genuineDataType; +#endif + NGTQ::DataType dataType; + NGTQ::DistanceType distanceType; + bool singleLocalCodebook; + size_t localDivisionNo; + size_t batchSize; + NGTQ::CentroidCreationMode centroidCreationMode; + NGTQ::CentroidCreationMode localCentroidCreationMode; + size_t localIDByteSize; + size_t localClusteringSampleCoefficient; + + size_t globalEdgeSizeForCreation; + size_t globalEdgeSizeForSearch; + NGT::Property::IndexType globalIndexType; + float globalInsertionRadiusCoefficient; + NGT::Property::GraphType globalGraphType; + + NGT::Property::IndexType localIndexType; + float localInsertionRadiusCoefficient; + NGT::Property::GraphType localGraphType; + } creation; + + struct { + size_t maxSize; + size_t numOfObjects; + size_t numOfClusters; + size_t numOfTotalClusters; + size_t numOfTotalBlobs; + int32_t clusterID; + + NGT::Clustering::InitializationMode initMode; + + size_t numOfRandomObjects; + + size_t numOfFirstObjects; + size_t numOfFirstClusters; + size_t numOfSecondObjects; + size_t numOfSecondClusters; + size_t numOfThirdClusters; + bool extractCentroid; + + bool threeLayerClustering; + } hierarchicalClustering; + + struct { + NGT::Clustering::ClusteringType clusteringType; + NGT::Clustering::InitializationMode initMode; + + float timelimit; + size_t iteration; + size_t clusterIteration; + bool clusterSizeConstraint; + float clusterSizeConstraintCoefficient; + size_t convergenceLimitTimes; + size_t numberOfObjects; + size_t numberOfClusters; + size_t numberOfSubvectors; + size_t nOfMatrices; + float seedStartObjectSizeRate; + size_t seedStep; + float reject; + bool repositioning; + bool rotation; + QBG::Optimizer::GlobalType globalType; + bool randomizedObjectExtraction; + bool showClusterInfo; + + } optimization; + + bool silence; + }; + + class SearchContainer : public NGT::SearchContainer { public: SearchContainer(NGT::Object &q): NGT::SearchContainer(q), @@ -72,10 +252,10 @@ namespace QBG { abort(); #else - (*this).resize(quantizedIndex.getGlobalCodebookSize()); + (*this).resize(quantizedIndex.getInvertedIndexSize()); NGT::Timer timer; timer.start(); - for (size_t gid = 1; gid < quantizedIndex.getGlobalCodebookSize(); gid++) { + for (size_t gid = 1; gid < quantizedIndex.getInvertedIndexSize(); gid++) { if (gid % 100000 == 0) { timer.stop(); std::cerr << "The number of processed blobs=" << gid << " VmSize=" << NGT::Common::getProcessVmSizeStr() << " Elapsed time=" << timer << std::endl; @@ -116,7 +296,7 @@ namespace QBG { class Index : public NGTQ::Index { public: - Index(const std::string &indexPath, bool readOnly = false, bool silence = false) : + Index(const std::string &indexPath, bool readOnly = false, bool silence = true) : NGTQ::Index(indexPath, readOnly), path(indexPath), quantizedBlobGraph(*this) { searchable = false; NGT::StdOstreamRedirector redirector(silence); @@ -126,8 +306,10 @@ namespace QBG { searchable = true; } catch (NGT::Exception &err) { if (readOnly) { + stringstream msg; + msg << "QBG::Index: No quantized blob graph. " << err.what(); + NGTThrowException(msg); } else { - quantizedBlobGraph.construct(*this); } } redirector.end(); @@ -143,6 +325,22 @@ namespace QBG { objectVector.resize(dim, 0); return globalIndex.allocateObject(objectVector); } + +#ifdef NGTQ_QBG + static void create(const std::string &index, + BuildParameters &buildParameters, + std::vector *rotation,const std::string &objectFile) { + NGTQ::Property property; + NGT::Property globalProperty; + NGT::Property localProperty; + buildParameters.setProperties(property, globalProperty, localProperty); + property.quantizerType = NGTQ::QuantizerTypeQBG; + NGTQ::Index::create(index, property, globalProperty, localProperty, rotation, objectFile); + try { + NGT::Index::mkdir(index + getWorkspaceName()); + } catch(...) {} + } +#endif static void create(const std::string &index, NGTQ::Property &property, NGT::Property &globalProperty, @@ -185,7 +383,7 @@ namespace QBG { static void append(const std::string &indexName, // index file const std::string &data, // data file size_t dataSize = 0, // data size - bool silence = false + bool silence = true ) { NGT::StdOstreamRedirector redirector(silence); redirector.begin(); @@ -233,7 +431,7 @@ namespace QBG { static void appendBinary(const std::string &indexName, // index file const std::string &data, // data file size_t dataSize = 0, // data size - bool silence = false + bool silence = true ) { NGT::StdOstreamRedirector redirector(silence); redirector.begin(); @@ -265,7 +463,7 @@ namespace QBG { static void appendFromObjectRepository(const std::string &ngtIndex, // QG const std::string &qgIndex, // NGT - bool silence = false) { + bool silence = true) { NGT::StdOstreamRedirector redirector(silence); redirector.begin(); @@ -791,15 +989,15 @@ namespace QBG { } } - static void buildNGTQ(const std::string &indexPath, bool silence = false) { - load(indexPath, QBG::Index::getQuantizerCodebookFileName(indexPath), "", ""); + static void buildNGTQ(const std::string &indexPath, bool silence = true) { + load(indexPath, QBG::Index::getQuantizerCodebookFile(indexPath), "", ""); buildNGTQ(indexPath, "", "-", "-", 1, 0, silence); std::cerr << "NGTQ and NGTQBG indices are completed." << std::endl; std::cerr << " vmsize=" << NGT::Common::getProcessVmSizeStr() << std::endl; std::cerr << " peak vmsize=" << NGT::Common::getProcessVmPeakStr() << std::endl; } - static void build(const std::string &indexPath, bool silence = false) { + static void build(const std::string &indexPath, bool silence = true) { load(indexPath, "", "", ""); buildNGTQ(indexPath, "", "", "", 1, 0, silence); buildQBG(indexPath, silence); @@ -812,7 +1010,7 @@ namespace QBG { std::string quantizerCodebookFile = "", std::string codebookIndexFile = "", std::string objectIndexFile = "", - size_t beginID = 1, size_t endID = 0, bool silence = false) { + size_t beginID = 1, size_t endID = 0, bool silence = true) { buildNGTQ(indexPath, quantizerCodebookFile, codebookIndexFile, objectIndexFile, beginID, endID, silence); buildQBG(indexPath, silence); std::cerr << "NGTQ and NGTQBG indices are completed." << std::endl; @@ -836,14 +1034,14 @@ namespace QBG { std::string quantizerCodebookFile = "", std::string codebookIndexFile = "", std::string objectIndexFile = "", - size_t beginID = 1, size_t endID = 0, bool silence = false) { + size_t beginID = 1, size_t endID = 0, bool silence = true) { std::vector> quantizerCodebook; std::vector codebookIndex; std::vector objectIndex; { std::string codebookPath = quantizerCodebookFile; if (codebookPath.empty()) { - codebookPath = QBG::Index::getQuantizerCodebookFileName(indexPath); + codebookPath = QBG::Index::getQuantizerCodebookFile(indexPath); } std::ifstream stream(codebookPath); if (!stream) { @@ -873,7 +1071,7 @@ namespace QBG { { std::string codebookIndexPath = codebookIndexFile; if (codebookIndexPath.empty()) { - codebookIndexPath = QBG::Index::getCodebookIndexFileName(indexPath); + codebookIndexPath = QBG::Index::getCodebookIndexFile(indexPath); } if (codebookIndexPath != "-") { cerr << "buildNGTQ: codebook index is " << codebookIndexPath << "." << endl; @@ -900,7 +1098,7 @@ namespace QBG { { std::string objectIndexPath = objectIndexFile; if (objectIndexPath.empty()) { - objectIndexPath = QBG::Index::getObjectIndexFileName(indexPath); + objectIndexPath = QBG::Index::getObjectIndexFile(indexPath); } if (objectIndexPath != "-") { std::ifstream stream(objectIndexPath); @@ -930,7 +1128,7 @@ namespace QBG { std::vector> &quantizerCodebook, std::vector &codebookIndex, std::vector &objectIndex, - size_t beginID = 1, size_t endID = 0, bool silence = false) { + size_t beginID = 1, size_t endID = 0, bool silence = true) { NGT::StdOstreamRedirector redirector(silence); redirector.begin(); NGT::Timer timer; @@ -954,6 +1152,13 @@ namespace QBG { } index.createIndex(quantizerCodebook, codebookIndex, objectIndex, beginID, endID); } + + const string com = "rm -rf " + indexPath + "/" + getWorkspaceName(); + std::cerr << "pass com=" << com << std::endl; + if (system(com.c_str()) == -1) { + std::cerr << "Warning. cannot remove the workspace directory. " << std::endl; + } + timer.stop(); std::cerr << "NGTQ index is completed." << std::endl; std::cerr << " time=" << timer << std::endl; @@ -964,11 +1169,20 @@ namespace QBG { redirector.end(); } - static void buildQBG(const std::string &indexPath, bool silence = false) { + static void buildQBG(const std::string &indexPath, bool silence = true) { std::cerr << "build QBG" << std::endl; NGT::Timer timer; timer.start(); - QBG::Index index(indexPath, false, silence); + auto readOnly = false; + QBG::Index index(indexPath, readOnly, silence); + try { + index.load(); + stringstream msg; + msg << "QBG::Index::buildQBG: The index is already built. "; + NGTThrowException(msg); + } catch (...) {} + index.quantizedBlobGraph.construct(index); + timer.stop(); std::cerr << "QBG index is completed." << std::endl; std::cerr << " time=" << timer << std::endl; @@ -1086,13 +1300,13 @@ namespace QBG { load(std::string indexPath, std::string blobs = "", std::string localCodebooks = "", std::string rotationPath = "", int threadSize = 0) { if (blobs.empty()) { - blobs = QBG::Index::getBlobFileName(indexPath); + blobs = QBG::Index::getBlobFile(indexPath); } if (localCodebooks.empty()) { - localCodebooks = QBG::Index::getPQFileName(indexPath) + "/opt-@.tsv"; + localCodebooks = QBG::Index::getPQFile(indexPath) + "/" + QBG::Index::getSubvectorPrefix() + "-@"; } if (rotationPath.empty()) { - rotationPath = QBG::Index::getRotationFileName(indexPath); + rotationPath = QBG::Index::getRotationFile(indexPath); } threadSize = threadSize == 0 ? std::thread::hardware_concurrency() : threadSize; @@ -1141,14 +1355,26 @@ namespace QBG { #endif } - static const std::string getTrainObjectFileName(std::string indexPath) { return indexPath + "/" + getWorkspaceName() + "/objects.tsv"; } - static const std::string getPQFileName(std::string indexPath) { return indexPath + "/" + getWorkspaceName() + "/kmeans-cluster_opt"; } - static const std::string getBlobFileName(std::string indexPath) { return indexPath + "/" + getWorkspaceName() + "/kmeans-cluster_centroid.tsv"; } - - static const std::string getQuantizerCodebookFileName(std::string indexPath) { return indexPath + "/" + getWorkspaceName() + "/kmeans-cluster_qcentroid.tsv"; } - static const std::string getCodebookIndexFileName(std::string indexPath) { return indexPath + "/" + getWorkspaceName() + "/kmeans-cluster_bqindex.tsv"; } - static const std::string getObjectIndexFileName(std::string indexPath) { return indexPath + "/" + getWorkspaceName() + "/kmeans-cluster_index.tsv"; } - static const std::string getRotationFileName(std::string indexPath) { return getPQFileName(indexPath) + "/opt_R.tsv"; } + static const std::string getSubvectorPrefix() { return "sv"; } + static const std::string getHierarchicalClusteringPrefix() { return "hkc"; } + static const std::string getSecondCentroidSuffix() { return "_2c"; } + static const std::string getThirdCentroidSuffix() { return "_3c"; } + static const std::string get3rdTo2ndSuffix() { return "_3to2"; } + static const std::string getObjTo3rdSuffix() { return "_oto3"; } + static const std::string getResidualFile() { return "r"; } + static const std::string getRotatedResidualFile() { return "Rr"; } + static const std::string getObjectFile() { return "obj"; } + static const std::string getRotationFile() { return "R"; } + static const std::string getWorkSpacePrefix(std::string indexPath) { return indexPath + "/" + getWorkspaceName(); } + static const std::string getTrainObjectFile(std::string indexPath) { return getWorkSpacePrefix(indexPath) + "/" + getObjectFile(); } + static const std::string getPrefix(std::string indexPath) { return getWorkSpacePrefix(indexPath) + "/" + getHierarchicalClusteringPrefix(); } + static const std::string getPQFile(std::string indexPath) { return getPrefix(indexPath) + "_opt"; } + static const std::string getBlobFile(std::string indexPath) { return getPrefix(indexPath) + getThirdCentroidSuffix(); } + static const std::string getQuantizerCodebookFile(std::string indexPath) { return getPrefix(indexPath) + getSecondCentroidSuffix(); } + static const std::string getCodebookIndexFile(std::string indexPath) { return getPrefix(indexPath) + get3rdTo2ndSuffix(); } + static const std::string getObjectIndexFile(std::string indexPath) { return getPrefix(indexPath) + getObjTo3rdSuffix(); } + static const std::string getRotationFile(std::string indexPath) { return getPQFile(indexPath) + "/" + getRotationFile +(); } static const std::string getWorkspaceName() { return "ws"; } diff --git a/lib/NGT/NGTQ/QuantizedGraph.cpp b/lib/NGT/NGTQ/QuantizedGraph.cpp new file mode 100644 index 0000000..183b314 --- /dev/null +++ b/lib/NGT/NGTQ/QuantizedGraph.cpp @@ -0,0 +1,80 @@ +// +// Copyright (C) 2020 Yahoo Japan Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "NGT/NGTQ/QuantizedGraph.h" +#include "NGT/NGTQ/QuantizedBlobGraph.h" +#include "NGT/NGTQ/Optimizer.h" + +#ifdef NGTQ_QBG +void NGTQG::Index::quantize(const std::string indexPath, size_t dimensionOfSubvector, size_t maxNumOfEdges, bool silence) { + { + NGT::Index index(indexPath); + const std::string quantizedIndexPath = indexPath + "/qg"; + struct stat st; + if (stat(quantizedIndexPath.c_str(), &st) != 0) { + NGT::Property ngtProperty; + index.getProperty(ngtProperty); + QBG::BuildParameters buildParameters; + buildParameters.creation.dimensionOfSubvector = dimensionOfSubvector; + buildParameters.silence = silence; + + NGTQG::Index::create(indexPath, buildParameters); + + NGTQG::Index::append(indexPath, buildParameters); + + QBG::Optimizer optimizer(buildParameters); +#ifdef NGTQG_NO_ROTATION + if (optimizer.rotation || optimizer.repositioning) { + std::cerr << "build-qg: Warning! Although rotation or repositioning is specified, turn off rotation and repositioning because of unavailable options." << std::endl; + optimizer.rotation = false; + optimizer.repositioning = false; + } +#endif + + if (optimizer.globalType == QBG::Optimizer::GlobalTypeNone) { + std::cerr << "build-qg: Warning! None is unavailable for the global type. Zero is set to the global type." << std::endl; + optimizer.globalType = QBG::Optimizer::GlobalTypeZero; + } + + optimizer.optimize(quantizedIndexPath); + + QBG::Index::buildNGTQ(quantizedIndexPath, silence); + + NGTQG::Index::realign(indexPath, maxNumOfEdges, silence); + } + } + +} + +void NGTQG::Index::create(const std::string indexPath, QBG::BuildParameters &buildParameters) { + auto dimensionOfSubvector = buildParameters.creation.dimensionOfSubvector; + auto dimension = buildParameters.creation.dimension; + if (dimension != 0 && buildParameters.creation.localDivisionNo != 0) { + if (dimension % buildParameters.creation.localDivisionNo != 0) { + std::stringstream msg; + msg << "NGTQBG:Index::create: Invalid dimension and local division No. " << dimension << ":" << buildParameters.creation.localDivisionNo; + NGTThrowException(msg); + } + dimensionOfSubvector = dimension / buildParameters.creation.localDivisionNo; + } + create(indexPath, dimensionOfSubvector, dimension); +} + + +void NGTQG::Index::append(const std::string indexPath, QBG::BuildParameters &buildParameters) { + QBG::Index::appendFromObjectRepository(indexPath, indexPath + "/qg", buildParameters.silence); +} +#endif diff --git a/lib/NGT/NGTQ/QuantizedGraph.h b/lib/NGT/NGTQ/QuantizedGraph.h index 29da6dd..90cc907 100644 --- a/lib/NGT/NGTQ/QuantizedGraph.h +++ b/lib/NGT/NGTQ/QuantizedGraph.h @@ -27,6 +27,9 @@ #undef NGTQG_BLOB_GRAPH #endif +namespace QBG { + class BuildParameters; +}; namespace NGTQG { class SearchContainer : public NGT::SearchContainer { @@ -552,6 +555,9 @@ namespace NGTQG { NGTQ::Index::create(quantizedIndexPath, property, globalProperty, localProperty); } +#ifdef NGTQ_QBG + static void create(const std::string indexPath, QBG::BuildParameters &buildParameters); +#endif static void create(const std::string indexPath, size_t dimensionOfSubvector, size_t pseudoDimension) { NGT::Index index(indexPath); @@ -588,15 +594,14 @@ namespace NGTQG { return; } + static void append(const std::string indexPath, QBG::BuildParameters &buildParameters); + #ifdef NGTQ_QBG - static void quantize(const std::string indexPath, size_t maxNumOfEdges, bool silence = false) { -#else - static void quantize(const std::string indexPath, float dimensionOfSubvector, size_t maxNumOfEdges, bool silence = false) { -#endif + static void quantize(const std::string indexPath, size_t dimensionOfSubvector, size_t maxNumOfEdges, bool silence = true); + + static void realign(const std::string indexPath, size_t maxNumOfEdges, bool silence = true) { NGT::StdOstreamRedirector redirector(silence); redirector.begin(); - -#ifdef NGTQ_QBG { std::string quantizedIndexPath = indexPath + "/qg"; struct stat st; @@ -610,7 +615,12 @@ namespace NGTQG { } buildQuantizedGraph(indexPath, maxNumOfEdges); } + redirector.end(); + } #else + static void quantize(const std::string indexPath, float dimensionOfSubvector, size_t maxNumOfEdges, bool silence = true) { + NGT::StdOstreamRedirector redirector(silence); + redirector.begin(); { NGT::Index index(indexPath); NGT::ObjectSpace &objectSpace = index.getObjectSpace(); @@ -619,7 +629,6 @@ namespace NGTQG { if (stat(quantizedIndexPath.c_str(), &st) != 0) { NGT::Property ngtProperty; index.getProperty(ngtProperty); - //NGTQG::Command::CreateParameters createParameters(args, property.dimension); createQuantizedGraphFrame(quantizedIndexPath, ngtProperty.dimension, dimensionOfSubvector); buildQuantizedObjects(quantizedIndexPath, objectSpace); if (maxNumOfEdges != 0) { @@ -627,9 +636,9 @@ namespace NGTQG { } } } -#endif redirector.end(); } +#endif const bool readOnly; const std::string path; diff --git a/lib/NGT/NGTQ/Quantizer.h b/lib/NGT/NGTQ/Quantizer.h index 84d27bd..bc125c1 100644 --- a/lib/NGT/NGTQ/Quantizer.h +++ b/lib/NGT/NGTQ/Quantizer.h @@ -314,7 +314,26 @@ class QuantizationCodebook : public std::vector { uint32_t paddedDimension; NGT::Index *index; }; - + +class GraphNodeToInvertedIndexEntries : public std::vector { + typedef std::vector PARENT; + public: + GraphNodeToInvertedIndexEntries() {} + ~GraphNodeToInvertedIndexEntries() {} + void serialize(std::ofstream &os) { + uint32_t v = PARENT::size(); + NGT::Serializer::write(os, v); + os.write(reinterpret_cast(PARENT::data()), static_cast(v) * sizeof(uint32_t)); + } + void deserialize(std::ifstream &is) { + uint32_t v; + NGT::Serializer::read(is, v); + PARENT::resize(v); + is.read(reinterpret_cast(PARENT::data()), static_cast(v) * sizeof(uint32_t)); + } + +}; + template class InvertedIndexObject { public: @@ -2106,6 +2125,8 @@ class Quantizer { virtual QuantizedObjectDistance &getQuantizedObjectDistance() = 0; + virtual size_t getInvertedIndexSize() = 0; + ObjectList objectList; string rootDirectory; @@ -2643,8 +2664,10 @@ class QuantizerInstance : public Quantizer { } #endif // NGT_SHARED_MEMORY_ALLOCATOR #ifndef NGTQ_SHARED_INVERTED_INDEX - ofstream of(rootDirectory + "/ivt"); - invertedIndex.serialize(of); + { + ofstream of(rootDirectory + "/ivt"); + invertedIndex.serialize(of); + } #endif #ifdef NGTQ_QBG { @@ -3243,7 +3266,7 @@ class QuantizerInstance : public Quantizer { for (size_t idx = 0; idx < objects.size(); idx++) { if (objects[idx].second - 1 >= objectToBlobIndex.size()) { std::cerr << "Quantizer::insert: Fatal Error! Object ID is invalid. " - << idx << ":" << objects[idx].second - 1 << ":" << objectToBlobIndex.size() + << idx << ":" << objects[idx].second - 1 << ":" << objectToBlobIndex.size() << ":" << objects.size()<< std::endl; abort(); } @@ -3543,6 +3566,8 @@ class QuantizerInstance : public Quantizer { #endif invertedIndex.at(gid)->subspaceID = codebookIndex[idx]; } + + quantizationCodebook.setPaddedDimension(globalCodebookIndex.getObjectSpace().getPaddedDimension()); quantizationCodebook = qCodebook; @@ -4311,6 +4336,8 @@ class QuantizerInstance : public Quantizer { QuantizedObjectDistance &getQuantizedObjectDistance() { return *quantizedObjectDistance; } + size_t getInvertedIndexSize() { return invertedIndex.size(); } + #ifdef NGTQ_SHARED_INVERTED_INDEX NGT::PersistentRepository invertedIndex; #else @@ -4547,6 +4574,8 @@ class Quantization { size_t getGlobalCodebookSize() { return quantizer->globalCodebookIndex.getObjectRepositorySize(); } size_t getLocalCodebookSize(size_t idx) { return quantizer->getLocalCodebookSize(idx); } + size_t getInvertedIndexSize() { return quantizer->getInvertedIndexSize(); } + size_t getSharedMemorySize(ostream &os, SharedMemoryAllocator::GetMemorySizeType t = SharedMemoryAllocator::GetTotalMemorySize) { return quantizer->getSharedMemorySize(os, t); }