Skip to content

Commit

Permalink
Reduce locks for adhoc BF (#419) (#420)
Browse files Browse the repository at this point in the history
  • Loading branch information
alonre24 authored Aug 29, 2023
1 parent a6609c3 commit 94f2fcd
Show file tree
Hide file tree
Showing 16 changed files with 154 additions and 128 deletions.
5 changes: 3 additions & 2 deletions src/VecSim/algorithms/brute_force/brute_force_multi.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ class BruteForceIndex_Multi : public BruteForceIndex<DataType, DistType> {
int addVector(const void *vector_data, labelType label, void *auxiliaryCtx = nullptr) override;
int deleteVector(labelType labelType) override;
int deleteVectorById(labelType label, idType id) override;
double getDistanceFrom(labelType label, const void *vector_data) const override;
double getDistanceFrom_Unsafe(labelType label, const void *vector_data) const override;
inline size_t indexLabelCount() const override { return this->labelToIdsLookup.size(); }

inline std::unique_ptr<vecsim_stl::abstract_results_container>
Expand Down Expand Up @@ -192,7 +192,8 @@ int BruteForceIndex_Multi<DataType, DistType>::deleteVectorById(labelType label,
}

template <typename DataType, typename DistType>
double BruteForceIndex_Multi<DataType, DistType>::getDistanceFrom(labelType label,
double
BruteForceIndex_Multi<DataType, DistType>::getDistanceFrom_Unsafe(labelType label,
const void *vector_data) const {

auto IDs = this->labelToIdsLookup.find(label);
Expand Down
5 changes: 3 additions & 2 deletions src/VecSim/algorithms/brute_force/brute_force_single.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ class BruteForceIndex_Single : public BruteForceIndex<DataType, DistType> {
int addVector(const void *vector_data, labelType label, void *auxiliaryCtx = nullptr) override;
int deleteVector(labelType label) override;
int deleteVectorById(labelType label, idType id) override;
double getDistanceFrom(labelType label, const void *vector_data) const override;
double getDistanceFrom_Unsafe(labelType label, const void *vector_data) const override;

inline std::unique_ptr<vecsim_stl::abstract_results_container>
getNewResultsContainer(size_t cap) const override {
Expand Down Expand Up @@ -184,7 +184,8 @@ int BruteForceIndex_Single<DataType, DistType>::deleteVectorById(labelType label
}

template <typename DataType, typename DistType>
double BruteForceIndex_Single<DataType, DistType>::getDistanceFrom(labelType label,
double
BruteForceIndex_Single<DataType, DistType>::getDistanceFrom_Unsafe(labelType label,
const void *vector_data) const {

auto optionalId = this->labelToIdLookup.find(label);
Expand Down
13 changes: 12 additions & 1 deletion src/VecSim/algorithms/hnsw/hnsw.h
Original file line number Diff line number Diff line change
Expand Up @@ -311,6 +311,8 @@ class HNSWIndex : public VecSimIndexAbstract<DistType>,
inline auto safeGetEntryPointState() const;
inline void lockIndexDataGuard() const;
inline void unlockIndexDataGuard() const;
inline void lockSharedIndexDataGuard() const;
inline void unlockSharedIndexDataGuard() const;
inline void lockNodeLinks(idType node_id) const;
inline void unlockNodeLinks(idType node_id) const;
inline void lockNodeLinks(ElementGraphData *node_data) const;
Expand Down Expand Up @@ -351,7 +353,6 @@ class HNSWIndex : public VecSimIndexAbstract<DistType>,

// Inline priority queue getter that need to be implemented by derived class.
virtual inline candidatesLabelsMaxHeap<DistType> *getNewMaxPriorityQueue() const = 0;
virtual double safeGetDistanceFrom(labelType label, const void *vector_data) const = 0;

#ifdef BUILD_TESTS
/**
Expand Down Expand Up @@ -520,6 +521,16 @@ void HNSWIndex<DataType, DistType>::unlockIndexDataGuard() const {
indexDataGuard.unlock();
}

// Acquires indexDataGuard for shared ownership by calling lock_shared() on it.
// Counterpart: unlockSharedIndexDataGuard(), which calls unlock_shared() on the same guard.
// Declared const, so it can be invoked through a const index.
template <typename DataType, typename DistType>
void HNSWIndex<DataType, DistType>::lockSharedIndexDataGuard() const {
indexDataGuard.lock_shared();
}

// Releases a shared hold on indexDataGuard by calling unlock_shared() on it.
// Counterpart of lockSharedIndexDataGuard(), which calls lock_shared() on the same guard.
// Declared const, so it can be invoked through a const index.
template <typename DataType, typename DistType>
void HNSWIndex<DataType, DistType>::unlockSharedIndexDataGuard() const {
indexDataGuard.unlock_shared();
}

template <typename DataType, typename DistType>
void HNSWIndex<DataType, DistType>::lockNodeLinks(ElementGraphData *node_data) const {
node_data->neighborsGuard.lock();
Expand Down
28 changes: 3 additions & 25 deletions src/VecSim/algorithms/hnsw/hnsw_multi.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,6 @@ class HNSWIndex_Multi : public HNSWIndex<DataType, DistType> {
return keys;
};

template <bool Safe>
inline double getDistanceFromInternal(labelType label, const void *vector_data) const;

public:
Expand Down Expand Up @@ -91,11 +90,8 @@ class HNSWIndex_Multi : public HNSWIndex<DataType, DistType> {
inline std::vector<idType> markDelete(labelType label) override;
inline bool safeCheckIfLabelExistsInIndex(labelType label,
bool also_done_processing) const override;
double getDistanceFrom(labelType label, const void *vector_data) const override {
return getDistanceFromInternal<false>(label, vector_data);
}
double safeGetDistanceFrom(labelType label, const void *vector_data) const override {
return getDistanceFromInternal<true>(label, vector_data);
double getDistanceFrom_Unsafe(labelType label, const void *vector_data) const override {
return getDistanceFromInternal(label, vector_data);
}
};

Expand All @@ -112,38 +108,20 @@ size_t HNSWIndex_Multi<DataType, DistType>::indexLabelCount() const {
* helper functions
*/

// Depending on the value of the Safe template parameter, this function will either return a copy
// of the argument or a reference to it.
template <bool Safe, typename Arg>
constexpr decltype(auto) getCopyOrReference(Arg &&arg) {
if constexpr (Safe) {
return std::decay_t<Arg>(arg);
} else {
return (arg);
}
}

template <typename DataType, typename DistType>
template <bool Safe>
double HNSWIndex_Multi<DataType, DistType>::getDistanceFromInternal(labelType label,
const void *vector_data) const {
DistType dist = INVALID_SCORE;

// Check if the label exists in the index, return invalid score if not.
if (Safe)
this->indexDataGuard.lock_shared();
auto it = this->labelLookup.find(label);
if (it == this->labelLookup.end()) {
if (Safe)
this->indexDataGuard.unlock_shared();
return dist;
}

// Get the vector of ids associated with the label.
// Get a copy if `Safe` is true, otherwise get a reference.
decltype(auto) IDs = getCopyOrReference<Safe>(it->second);
if (Safe)
this->indexDataGuard.unlock_shared();
auto &IDs = it->second;

// Iterate over the ids and find the minimum distance.
for (auto id : IDs) {
Expand Down
15 changes: 2 additions & 13 deletions src/VecSim/algorithms/hnsw/hnsw_single.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@ class HNSWIndex_Single : public HNSWIndex<DataType, DistType> {
inline void resizeLabelLookup(size_t new_max_elements) override;
inline vecsim_stl::set<labelType> getLabelsSet() const override;

template <bool Safe>
inline double getDistanceFromInternal(labelType label, const void *vector_data) const;

public:
Expand Down Expand Up @@ -73,11 +72,8 @@ class HNSWIndex_Single : public HNSWIndex<DataType, DistType> {
inline bool safeCheckIfLabelExistsInIndex(labelType label,
bool also_done_processing = false) const override;

double getDistanceFrom(labelType label, const void *vector_data) const override {
return getDistanceFromInternal<false>(label, vector_data);
}
double safeGetDistanceFrom(labelType label, const void *vector_data) const override {
return getDistanceFromInternal<true>(label, vector_data);
double getDistanceFrom_Unsafe(labelType label, const void *vector_data) const override {
return getDistanceFromInternal(label, vector_data);
}
};

Expand Down Expand Up @@ -106,22 +102,15 @@ inline vecsim_stl::set<labelType> HNSWIndex_Single<DataType, DistType>::getLabel
};

template <typename DataType, typename DistType>
template <bool Safe>
double
HNSWIndex_Single<DataType, DistType>::getDistanceFromInternal(labelType label,
const void *vector_data) const {
if (Safe)
this->indexDataGuard.lock_shared();

auto it = labelLookup.find(label);
if (it == labelLookup.end()) {
if (Safe)
this->indexDataGuard.unlock_shared();
return INVALID_SCORE;
}
idType id = it->second;
if (Safe)
this->indexDataGuard.unlock_shared();

return this->distFunc(vector_data, this->getDataByInternalId(id), this->dim);
}
Expand Down
35 changes: 24 additions & 11 deletions src/VecSim/algorithms/hnsw/hnsw_tiered.h
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,7 @@ class TieredHNSWIndex : public VecSimTieredIndex<DataType, DistType> {
size_t indexSize() const override;
size_t indexLabelCount() const override;
size_t indexCapacity() const override;
double getDistanceFrom(labelType label, const void *blob) const override;
double getDistanceFrom_Unsafe(labelType label, const void *blob) const override;
// Do nothing here, each tier (flat buffer and HNSW) should increase capacity for itself when
// needed.
VecSimIndexInfo info() const override;
Expand All @@ -210,6 +210,17 @@ class TieredHNSWIndex : public VecSimTieredIndex<DataType, DistType> {
"running asynchronous GC for tiered HNSW index");
this->executeReadySwapJobs(this->pendingSwapJobsThreshold);
}
// Acquires, for shared ownership and in this order: flatIndexGuard, mainIndexGuard, and the
// HNSW index data guard (via lockSharedIndexDataGuard()). Counterpart: releaseSharedLocks().
void acquireSharedLocks() override {
this->flatIndexGuard.lock_shared();
this->mainIndexGuard.lock_shared();
// Taken last, after both tiered-index guards are already held.
this->getHNSWIndex()->lockSharedIndexDataGuard();
}

// Releases the shared locks taken by acquireSharedLocks(), in the reverse of the order in
// which they were acquired: the HNSW index data guard first, then mainIndexGuard, then
// flatIndexGuard. This mirrors the nested lock/unlock pattern used by indexSize(), and means
// the HNSW index is accessed while mainIndexGuard is still held.
void releaseSharedLocks() override {
    this->getHNSWIndex()->unlockSharedIndexDataGuard();
    this->mainIndexGuard.unlock_shared();
    this->flatIndexGuard.unlock_shared();
}
#ifdef BUILD_TESTS
void getDataByLabel(labelType label, std::vector<std::vector<DataType>> &vectors_output) const;
#endif
Expand Down Expand Up @@ -621,9 +632,9 @@ TieredHNSWIndex<DataType, DistType>::~TieredHNSWIndex() {
template <typename DataType, typename DistType>
size_t TieredHNSWIndex<DataType, DistType>::indexSize() const {
this->flatIndexGuard.lock_shared();
this->getHNSWIndex()->lockIndexDataGuard();
this->getHNSWIndex()->lockSharedIndexDataGuard();
size_t res = this->backendIndex->indexSize() + this->frontendIndex->indexSize();
this->getHNSWIndex()->unlockIndexDataGuard();
this->getHNSWIndex()->unlockSharedIndexDataGuard();
this->flatIndexGuard.unlock_shared();
return res;
}
Expand Down Expand Up @@ -803,14 +814,18 @@ int TieredHNSWIndex<DataType, DistType>::deleteVector(labelType label) {
// 3. label exists in both indexes - we may have some of the vectors with the same label in the flat
// buffer only and some in the Main index only (and maybe temporal duplications).
// So, we get the distance from both indexes and return the minimum.

// IMPORTANT: this should be called while the *tiered index locks are held for shared ownership*,
// along with the HNSW index data guard lock. That is because the internal getDistanceFrom_Unsafe
// calls access the indexes' data, and it is not safe to run insert/delete operations in parallel.
// Also, we avoid acquiring the locks internally, since this is usually called for every vector
// individually, and the overhead of acquiring and releasing the locks is significant in that case.
template <typename DataType, typename DistType>
double TieredHNSWIndex<DataType, DistType>::getDistanceFrom(labelType label,
const void *blob) const {
double TieredHNSWIndex<DataType, DistType>::getDistanceFrom_Unsafe(labelType label,
const void *blob) const {
// Try to get the distance from the flat buffer.
// If the label doesn't exist, the distance will be NaN.
this->flatIndexGuard.lock_shared();
auto flat_dist = this->frontendIndex->getDistanceFrom(label, blob);
this->flatIndexGuard.unlock_shared();
auto flat_dist = this->frontendIndex->getDistanceFrom_Unsafe(label, blob);

// Optimization. TODO: consider having different implementations for single and multi indexes,
// to avoid checking the index type on every query.
Expand All @@ -821,9 +836,7 @@ double TieredHNSWIndex<DataType, DistType>::getDistanceFrom(labelType label,
}

// Try to get the distance from the Main index.
this->mainIndexGuard.lock_shared();
auto hnsw_dist = getHNSWIndex()->safeGetDistanceFrom(label, blob);
this->mainIndexGuard.unlock_shared();
auto hnsw_dist = getHNSWIndex()->getDistanceFrom_Unsafe(label, blob);

// Return the minimum distance that is not NaN.
return std::fmin(flat_dist, hnsw_dist);
Expand Down
13 changes: 11 additions & 2 deletions src/VecSim/vec_sim.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -118,8 +118,9 @@ extern "C" int VecSimIndex_DeleteVector(VecSimIndex *index, size_t label) {
return index->deleteVector(label);
}

extern "C" double VecSimIndex_GetDistanceFrom(VecSimIndex *index, size_t label, const void *blob) {
return index->getDistanceFrom(label, blob);
extern "C" double VecSimIndex_GetDistanceFrom_Unsafe(VecSimIndex *index, size_t label,
const void *blob) {
return index->getDistanceFrom_Unsafe(label, blob);
}

extern "C" size_t VecSimIndex_EstimateElementSize(const VecSimParams *params) {
Expand Down Expand Up @@ -241,6 +242,14 @@ extern "C" void VecSimTieredIndex_GC(VecSimIndex *index) {
}
}

// C API entry point: forwards to the index's acquireSharedLocks() member.
// `index` is dereferenced unconditionally, so it must not be null.
extern "C" void VecSimTieredIndex_AcquireSharedLocks(VecSimIndex *index) {
index->acquireSharedLocks();
}

// C API entry point: forwards to the index's releaseSharedLocks() member.
// `index` is dereferenced unconditionally, so it must not be null.
extern "C" void VecSimTieredIndex_ReleaseSharedLocks(VecSimIndex *index) {
index->releaseSharedLocks();
}

extern "C" void VecSim_SetMemoryFunctions(VecSimMemoryFunctions memoryfunctions) {
VecSimAllocator::setMemoryFunctions(memoryfunctions);
}
Expand Down
14 changes: 13 additions & 1 deletion src/VecSim/vec_sim.h
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,9 @@ int VecSimIndex_DeleteVector(VecSimIndex *index, size_t label);
* @brief Calculate the distance of a vector from an index to a vector. This function assumes that
* the vector fits the index - its type and dimension are the same as the index's, and if the
* index's distance metric is cosine, the vector is already normalized.
* IMPORTANT: for tiered index, this should be called while *locks are locked for shared ownership*,
* as we avoid acquiring the locks internally. That is since this is usually called for every vector
* individually, and the overhead of acquiring and releasing the locks is significant in that case.
* @param index the index from which the first vector is located, and that defines the distance
* metric.
* @param label the label of the vector in the index.
Expand All @@ -82,7 +85,7 @@ int VecSimIndex_DeleteVector(VecSimIndex *index, size_t label);
* @return The distance (according to the index's distance metric) between `blob` and the vector
* with label `label`.
*/
double VecSimIndex_GetDistanceFrom(VecSimIndex *index, size_t label, const void *blob);
double VecSimIndex_GetDistanceFrom_Unsafe(VecSimIndex *index, size_t label, const void *blob);

/**
* @brief normalize the vector blob in place.
Expand Down Expand Up @@ -199,6 +202,15 @@ void VecSimTieredIndex_GC(VecSimIndex *index);
bool VecSimIndex_PreferAdHocSearch(VecSimIndex *index, size_t subsetSize, size_t k,
bool initial_check);

/**
* @brief Acquire/Release the required locks of the tiered index externally before executing
* an unsafe *READ* operation (as the locks are acquired for shared ownership).
* @param index the tiered index to protect (does nothing for non-tiered indexes).
*/
void VecSimTieredIndex_AcquireSharedLocks(VecSimIndex *index);

void VecSimTieredIndex_ReleaseSharedLocks(VecSimIndex *index);

/**
* @brief Allow 3rd party memory functions to be used for memory management.
*
Expand Down
4 changes: 3 additions & 1 deletion src/VecSim/vec_sim_index.h
Original file line number Diff line number Diff line change
Expand Up @@ -236,5 +236,7 @@ struct VecSimIndexAbstract : public VecSimIndexInterface {
return this->newBatchIterator(processed_blob, queryParams);
}

void runGC() override {} // Do nothing, relevant for tiered index only.
void runGC() override {} // Do nothing, relevant for tiered index only.
void acquireSharedLocks() override {} // Do nothing, relevant for tiered index only.
void releaseSharedLocks() override {} // Do nothing, relevant for tiered index only.
};
12 changes: 11 additions & 1 deletion src/VecSim/vec_sim_interface.h
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ struct VecSimIndexInterface : public VecsimBaseObject {
* @return The distance (according to the index's distance metric) between `blob` and the vector
* with id `id`.
*/
virtual double getDistanceFrom(labelType id, const void *blob) const = 0;
virtual double getDistanceFrom_Unsafe(labelType id, const void *blob) const = 0;

/**
* @brief Return the number of vectors in the index (including ones that are marked as deleted).
Expand Down Expand Up @@ -220,6 +220,16 @@ struct VecSimIndexInterface : public VecsimBaseObject {
*/
virtual void runGC() = 0;

/**
* @brief Acquire the locks for shared ownership in tiered async index.
*/
virtual void acquireSharedLocks() = 0;

/**
* @brief Release the locks for shared ownership in tiered async index.
*/
virtual void releaseSharedLocks() = 0;

/**
* @brief Allow 3rd party timeout callback to be used for limiting runtime of a query.
*
Expand Down
8 changes: 4 additions & 4 deletions tests/benchmark/bm_batch_iterator.h
Original file line number Diff line number Diff line change
Expand Up @@ -142,8 +142,8 @@ void BM_BatchIterator<index_type_t>::BF_BatchesToAdhocBF(benchmark::State &st) {
VecSimBatchIterator_Free(batchIterator);
// Switch to ad-hoc BF
for (size_t i = 0; i < N_VECTORS; i += step) {
VecSimIndex_GetDistanceFrom(INDICES[VecSimAlgo_BF], i,
QUERIES[iter % N_QUERIES].data());
VecSimIndex_GetDistanceFrom_Unsafe(INDICES[VecSimAlgo_BF], i,
QUERIES[iter % N_QUERIES].data());
}
iter++;
}
Expand Down Expand Up @@ -203,8 +203,8 @@ void BM_BatchIterator<index_type_t>::HNSW_BatchesToAdhocBF(benchmark::State &st)
memory_delta);
// Switch to ad-hoc BF
for (size_t i = 0; i < N_VECTORS; i += step) {
VecSimIndex_GetDistanceFrom(INDICES[VecSimAlgo_HNSWLIB], i,
QUERIES[iter % N_QUERIES].data());
VecSimIndex_GetDistanceFrom_Unsafe(INDICES[VecSimAlgo_HNSWLIB], i,
QUERIES[iter % N_QUERIES].data());
}
iter++;
}
Expand Down
Loading

0 comments on commit 94f2fcd

Please sign in to comment.