Skip to content

Commit

Permalink
QBG internal improvements
Browse files Browse the repository at this point in the history
  • Loading branch information
masajiro committed Jul 19, 2023
1 parent 3f9d6f7 commit dcdb156
Show file tree
Hide file tree
Showing 23 changed files with 3,431 additions and 1,629 deletions.
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
2.0.16
2.1.0
43 changes: 38 additions & 5 deletions lib/NGT/Clustering.h
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@ namespace NGT {

class Cluster {
public:
Cluster():radius(0.0) {}
Cluster(std::vector<float> &c):centroid(c), radius(0.0) {}
Cluster(const Cluster &c) { *this = c; }
Cluster &operator=(const Cluster &c) {
Expand All @@ -95,8 +96,8 @@ namespace NGT {
double radius;
};

Clustering(InitializationMode im = InitializationModeHead, ClusteringType ct = ClusteringTypeKmeansWithNGT, size_t mi = 10000, size_t nc = 0):
clusteringType(ct), initializationMode(im), numberOfClusters(nc), maximumIteration(mi) { initialize(); }
Clustering(InitializationMode im = InitializationModeHead, ClusteringType ct = ClusteringTypeKmeansWithNGT, size_t mi = 10000, size_t nc = 0, bool s = true):
clusteringType(ct), initializationMode(im), numberOfClusters(nc), maximumIteration(mi), silence(s) { initialize(); }

void initialize() {
epsilonFrom = 0.12;
Expand Down Expand Up @@ -208,8 +209,9 @@ namespace NGT {
}
}
if ((numberOfClusters != 0) && (clusters.size() < numberOfClusters)) {
std::cerr << "initial cluster data are not enough. " << clusters.size() << ":" << numberOfClusters << std::endl;
exit(1);
std::stringstream msg;
msg << "initial cluster data are not enough. " << clusters.size() << ":" << numberOfClusters;
NGTThrowException(msg);
}
}
#if !defined(NGT_CLUSTER_NO_AVX)
Expand Down Expand Up @@ -247,6 +249,33 @@ namespace NGT {
}
#endif // !defined(NGT_AVX_DISABLED) && defined(__AVX__)

static void
clearMembers(std::vector<Cluster> &clusters) {
for (auto &cluster : clusters) {
cluster.members.clear();
}
}

static size_t
removeEmptyClusters(std::vector<Cluster> &clusters) {
size_t count = 0;
auto dst = clusters.begin();
for (auto src = clusters.begin(); src != clusters.end(); ++src) {
if ((*src).members.size() == 0) {
count++;
continue;
}
if (dst != src) {
*dst = std::move(*src);
}
++dst;
}
if (count != 0) {
clusters.resize(clusters.size() - count);
}
return count;
}

static double
distanceL2(std::vector<float> &vector1, std::vector<float> &vector2) {
return sqrt(sumOfSquares(&vector1[0], &vector2[0], vector1.size()));
Expand Down Expand Up @@ -661,10 +690,13 @@ namespace NGT {
}

static void
saveClusters(const std::string &file, std::vector<Cluster> &clusters)
saveClusters(const std::string &file, std::vector<Cluster> &clusters, bool skipEmptyClusters = false)
{
std::ofstream os(file);
for (auto cit = clusters.begin(); cit != clusters.end(); ++cit) {
if (skipEmptyClusters && (*cit).members.size() == 0) {
continue;
}
std::vector<float> &v = (*cit).centroid;
for (auto it = v.begin(); it != v.end(); ++it) {
os << std::setprecision(9) << (*it);
Expand Down Expand Up @@ -1042,6 +1074,7 @@ namespace NGT {
float epsilonStep;
size_t resultSizeCoefficient;
vector<double> diffHistory;
bool silence;
};

}
6 changes: 4 additions & 2 deletions lib/NGT/Command.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -280,7 +280,7 @@ using namespace std;
if (searchParameters.querySize > 0 && queryCount >= searchParameters.querySize) {
break;
}
NGT::Object *object = index.allocateObject(line, " \t");
NGT::Object *object = index.allocateObject(line, " \t,");
queryCount++;
size_t step = searchParameters.step == 0 ? UINT_MAX : searchParameters.step;
for (size_t n = 0; n <= step; n++) {
Expand Down Expand Up @@ -373,6 +373,8 @@ using namespace std;
if (searchParameters.outputMode[0] == 'e') {
stream << "# Average Query Time (msec)=" << totalTime * 1000.0 / (double)queryCount << endl;
stream << "# Number of queries=" << queryCount << endl;
stream << "# VM size=" << NGT::Common::getProcessVmSizeStr() << std::endl;
stream << "# Peak VM size=" << NGT::Common::getProcessVmPeakStr() << std::endl;
stream << "# End of Evaluation" << endl;

if (searchParameters.outputMode == "e+") {
Expand Down Expand Up @@ -510,7 +512,7 @@ using namespace std;
while(getline(is, line)) {
count++;
vector<string> tokens;
NGT::Common::tokenize(line, tokens, "\t ");
NGT::Common::tokenize(line, tokens, "\t, ");
if (tokens.size() == 0 || tokens[0].size() == 0) {
continue;
}
Expand Down
134 changes: 69 additions & 65 deletions lib/NGT/Common.h
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,69 @@ namespace NGT {
char **argV;
};

// Stopwatch built on clock_gettime(CLOCK_REALTIME) that accumulates the
// elapsed time of one or more start/stop intervals, both as seconds
// (`time`) and as nanoseconds (`ntime`).
class Timer {
 public:
  // Every field starts at zero so that stop() and add() accumulate onto a
  // defined value even when neither reset() nor start() has been called
  // (e.g. the restart()/stop() pattern on a fresh Timer).
  Timer():startTime(), stopTime(), sec(0), nsec(0), ntime(0), time(0) {}

  // Clears the accumulated totals.
  void reset() { time = 0; ntime = 0; }

  // Clears the totals and begins a new interval.
  void start() {
    reset();
    clock_gettime(CLOCK_REALTIME, &startTime);
  }

  // Begins a new interval while keeping the totals accumulated so far.
  void restart() {
    clock_gettime(CLOCK_REALTIME, &startTime);
  }

  // Ends the current interval and adds its length to the totals.
  void stop() {
    clock_gettime(CLOCK_REALTIME, &stopTime);
    sec = stopTime.tv_sec - startTime.tv_sec;
    nsec = stopTime.tv_nsec - startTime.tv_nsec;
    // Borrow one second when the nanosecond part underflows.
    if (nsec < 0) {
      sec -= 1;
      nsec += 1000000000L;
    }
    time += (double)sec + (double)nsec / 1000000000.0;
    ntime += sec * 1000000000L + nsec;
  }

  // Adds the totals accumulated by another timer to this one.
  void add(Timer &t) {
    time += t.time;
    ntime += t.ntime;
  }

  // Prints the accumulated time with a unit chosen by magnitude:
  // milliseconds below 1 s, seconds below 60 s, minutes below 60 min,
  // hours otherwise. Sets the stream precision to 6.
  friend std::ostream &operator<<(std::ostream &os, const Timer &t) {
    auto time = t.time;
    if (time < 1.0) {
      time *= 1000.0;
      os << std::setprecision(6) << time << " (ms)";
      return os;
    }
    if (time < 60.0) {
      os << std::setprecision(6) << time << " (s)";
      return os;
    }
    time /= 60.0;
    if (time < 60.0) {
      os << std::setprecision(6) << time << " (m)";
      return os;
    }
    time /= 60.0;
    os << std::setprecision(6) << time << " (h)";
    return os;
  }

  struct timespec startTime;  // set by start()/restart()
  struct timespec stopTime;   // set by stop()

  int64_t sec;    // whole seconds of the most recent interval
  int64_t nsec;   // nanosecond remainder of the most recent interval
  int64_t ntime;  // accumulated time in nanoseconds
  double  time;   // accumulated time in seconds
};

class Common {
public:
static void tokenize(const std::string &str, std::vector<std::string> &token, const std::string seps) {
Expand Down Expand Up @@ -289,13 +352,13 @@ namespace NGT {
for (idx = 0; idx < tokens.size(); idx++) {
if (tokens[idx].size() == 0) {
std::stringstream msg;
msg << "Common::extractVecotFromText: No data. " << textLine;
msg << "Common::extractVecot: No data. " << textLine;
NGTThrowException(msg);
}
char *e;
double v = ::strtod(tokens[idx].c_str(), &e);
if (*e != 0) {
std::cerr << "ObjectSpace::readText: Warning! Not numerical value. [" << e << "]" << std::endl;
std::cerr << "Common::extractVector: Warning! Not numerical value. [" << e << "] " << std::endl;
break;
}
object.push_back(v);
Expand Down Expand Up @@ -342,7 +405,7 @@ namespace NGT {
}
size = round(size * 100) / 100;
std::stringstream str;
str << size << unit;
str << size << " " << unit;
return str.str();
}
static std::string getProcessVmSizeStr() { return sizeToString(getProcessVmSize()); }
Expand Down Expand Up @@ -1319,7 +1382,7 @@ namespace NGT {
}


size_t size() { return vectorSize; }
size_t size() const { return vectorSize; }

public:
void extend(SharedMemoryAllocator &allocator) {
Expand Down Expand Up @@ -2176,6 +2239,7 @@ namespace NGT {
public:
// Binds the container to an existing object (kept as a reference member, not copied) and stores its ID.
Container(Object &o, ObjectID i):object(o), id(i) {}
// Copy constructor: the new container refers to the same object as `c` and copies its ID.
Container(Container &c):object(c.object), id(c.id) {}
// Returns true when the address of the referenced object compares equal to zero.
// NOTE(review): a reference cannot be null in a well-formed program, so compilers may warn
// here and optimizers may fold the test to `false`. Presumably callers bind `object` to a
// dereferenced null pointer as an "empty" marker — confirm, and consider holding a pointer instead.
bool isEmptyObject() { return &object == 0; }
Object &object;
ObjectID id;
};
Expand Down Expand Up @@ -2213,7 +2277,7 @@ namespace NGT {
useAllNodesInLeaf = false;
expectedAccuracy = -1.0;
}
void setSize(size_t s) { size = s; };
void setSize(size_t s) { size = s; }
void setResults(ObjectDistances *r) { result = r; }
void setRadius(Distance r) { radius = r; }
void setEpsilon(float e) { explorationCoefficient = e + 1.0; }
Expand All @@ -2240,7 +2304,6 @@ namespace NGT {
bool useAllNodesInLeaf;
size_t visitCount;
float expectedAccuracy;

private:
ObjectDistances *result;
};
Expand Down Expand Up @@ -2300,64 +2363,5 @@ namespace NGT {
InsertContainer(Object &f, ObjectID i):Container(f, i) {}
};

// Stopwatch built on clock_gettime(CLOCK_REALTIME) that accumulates the
// elapsed time of one or more start/stop intervals, both as seconds
// (`time`) and as nanoseconds (`ntime`).
class Timer {
 public:
  // Every field starts at zero so that stop() accumulates onto a defined
  // value even when neither reset() nor start() has been called
  // (e.g. the restart()/stop() pattern on a fresh Timer).
  Timer():startTime(), stopTime(), sec(0), nsec(0), ntime(0), time(0) {}

  // Clears the accumulated totals.
  void reset() { time = 0; ntime = 0; }

  // Clears the totals and begins a new interval.
  void start() {
    reset();
    clock_gettime(CLOCK_REALTIME, &startTime);
  }

  // Begins a new interval while keeping the totals accumulated so far.
  void restart() {
    clock_gettime(CLOCK_REALTIME, &startTime);
  }

  // Ends the current interval and adds its length to the totals.
  void stop() {
    clock_gettime(CLOCK_REALTIME, &stopTime);
    sec = stopTime.tv_sec - startTime.tv_sec;
    nsec = stopTime.tv_nsec - startTime.tv_nsec;
    // Borrow one second when the nanosecond part underflows.
    if (nsec < 0) {
      sec -= 1;
      nsec += 1000000000L;
    }
    time += (double)sec + (double)nsec / 1000000000.0;
    ntime += sec * 1000000000L + nsec;
  }

  // Prints the accumulated time with a unit chosen by magnitude:
  // milliseconds below 1 s, seconds below 60 s, minutes below 60 min,
  // hours otherwise. Sets the stream precision to 6.
  friend std::ostream &operator<<(std::ostream &os, const Timer &t) {
    auto time = t.time;
    if (time < 1.0) {
      time *= 1000.0;
      os << std::setprecision(6) << time << " (ms)";
      return os;
    }
    if (time < 60.0) {
      os << std::setprecision(6) << time << " (s)";
      return os;
    }
    time /= 60.0;
    if (time < 60.0) {
      os << std::setprecision(6) << time << " (m)";
      return os;
    }
    time /= 60.0;
    os << std::setprecision(6) << time << " (h)";
    return os;
  }

  struct timespec startTime;  // set by start()/restart()
  struct timespec stopTime;   // set by stop()

  int64_t sec;    // whole seconds of the most recent interval
  int64_t nsec;   // nanosecond remainder of the most recent interval
  int64_t ntime;  // accumulated time in nanoseconds
  double  time;   // accumulated time in seconds
};

} // namespace NGT

2 changes: 2 additions & 0 deletions lib/NGT/GraphOptimizer.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@
// limitations under the License.
//

#pragma once

#include "GraphReconstructor.h"
#include "Optimizer.h"

Expand Down
19 changes: 10 additions & 9 deletions lib/NGT/NGTQ/Capi.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -318,8 +318,9 @@ bool qbg_build_index(const char *index_path, QBGBuildParameters *parameters, QBG
hierarchicalKmeans.numOfSecondClusters = parameters->number_of_second_clusters;
hierarchicalKmeans.numOfThirdClusters = parameters->number_of_third_clusters;
hierarchicalKmeans.numOfObjects = 0;
hierarchicalKmeans.threeLayerClustering = true;
hierarchicalKmeans.silence = true;
//-/hierarchicalKmeans.threeLayerClustering = true;
hierarchicalKmeans.clusteringType = QBG::HierarchicalKmeans::ClusteringTypeThreeLayer;
hierarchicalKmeans.verbose = false;

try {
hierarchicalKmeans.clustering(index_path);
Expand All @@ -341,16 +342,16 @@ bool qbg_build_index(const char *index_path, QBGBuildParameters *parameters, QBG
optimizer.iteration = parameters->rotation_iteration;
optimizer.clusterIteration = parameters->subvector_iteration;
optimizer.clusterSizeConstraint = false;
optimizer.nOfMatrices = parameters->number_of_matrices;
optimizer.seedStartObjectSizeRate = 0.1;
optimizer.seedStep = 2;
optimizer.numberOfMatrices = parameters->number_of_matrices;
optimizer.seedNumberOfSteps = 2;
optimizer.seedStep = 10;
optimizer.reject = 0.9;
optimizer.timelimit = 24 * 2;
optimizer.timelimit *= 60.0 * 60.0;
optimizer.rotation = parameters->rotation;
optimizer.repositioning = parameters->repositioning;
optimizer.globalType = QBG::Optimizer::GlobalTypeNone;
optimizer.silence = true;
optimizer.verbose = false;

try {
auto nthreads = omp_get_max_threads();
Expand All @@ -363,8 +364,8 @@ bool qbg_build_index(const char *index_path, QBGBuildParameters *parameters, QBG
}

try {
auto silence = true;
QBG::Index::build(index_path, silence);
auto verbose = false;
QBG::Index::build(index_path, verbose);
} catch (NGT::Exception &err) {
std::stringstream ss;
ss << "Capi : " << __FUNCTION__ << "() : Error: " << err.what();
Expand Down Expand Up @@ -406,7 +407,7 @@ static bool qbg_search_index_(QBG::Index* pindex, std::vector<float> &query, QBG
sc.setEdgeSize(param.number_of_edges);
sc.setGraphExplorationSize(param.number_of_explored_blobs);

pindex->searchBlobGraph(sc);
pindex->search(sc);

return true;
}
Expand Down
Loading

0 comments on commit dcdb156

Please sign in to comment.