Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Create HNSW graph data interface with SpeeDB implementation (by Hilik) #510

Draft
wants to merge 4 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
202 changes: 202 additions & 0 deletions src/VecSim/algorithms/hnsw/abs_graph_data.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,202 @@
#pragma once

#include <sys/resource.h>

#include <algorithm>
#include <atomic>
#include <cassert>
#include <climits>
#include <cstddef>
#include <cstdint>
#include <deque>
#include <fstream>
#include <iostream>
#include <memory>
#include <mutex>
#include <queue>
#include <random>
#include <shared_mutex>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

#include "visited_nodes_handler.h"
#include "VecSim/spaces/spaces.h"
#include "VecSim/memory/vecsim_malloc.h"
#include "VecSim/utils/vecsim_stl.h"
#include "VecSim/utils/vec_utils.h"
#include "VecSim/utils/data_block.h"
#include "VecSim/utils/vecsim_results_container.h"
#include "VecSim/query_result_definitions.h"
#include "VecSim/vec_sim_common.h"
#include "VecSim/vec_sim_index.h"
#include "VecSim/tombstone_interface.h"

#ifdef BUILD_TESTS
#include "hnsw_serialization_utils.h"
#include "VecSim/utils/serializer.h"
#endif

using std::pair;
// A graph node is addressed by the pair (element_id, level).
// `unsigned short` is spelled out instead of the non-standard `ushort` typedef
// (which only exists where <sys/types.h> provides it); the type is identical.
using graphNodeType = pair<idType, unsigned short>; // represented as: (element_id, level)



/**
 * Abstract interface over one edge list (a collection of idType entries).
 *
 * All accessors and mutators are pure virtual; save()/restore() are virtual
 * but not pure and, like the constructor and destructor, are only declared
 * here (their definitions are expected elsewhere).
 *
 * NOTE(review): every member function is non-const, including Get(), so a
 * `const absEdges &` cannot be read through this interface - confirm whether
 * Get() was meant to be const.
 */
class absEdges {
public:
    absEdges();
    virtual ~absEdges();

    // Adds `id` to the edge list (presumably appended - confirm with the
    // concrete implementation).
    virtual void push(idType id) = 0;

    // Removal of a single id. Judging by the names, removeIdIfExists()
    // reports whether the id was present, while removeId() expects it to be
    // there - TODO confirm against the implementations.
    virtual bool removeIdIfExists(idType element_id) = 0;
    virtual void removeId(idType element_id) = 0;

    // Bulk access: the edge list is exchanged as a (size_t, const idType *)
    // pair - presumably (count, pointer to the first id); the lifetime of the
    // pointed-to buffer is not specified by this interface.
    virtual std::pair<size_t, const idType *> Get() = 0;
    virtual void Set(std::pair<size_t, const idType *> inp) = 0;

    // Stream (de)serialization hooks.
    virtual void save(std::ofstream &output);
    virtual void restore(std::ifstream &input);
};


// vector metadata contains all the metadata of the vector;
// this is replacing the id->metadata table and the element graph data
//

struct VectorMetaData
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It should be part of the index, not the generic graph store

{
enum Flags {
DELETE_MARK = 0x1, // element is logically deleted, but still exists in the graph
IN_PROCESS = 0x2, // element is being inserted into the graph
PERMANENT_DELETED = 0x4, // element no longer in the graph
};
VectorMetaData(const labelType &label, uint8_t max_level) :
label_(label), max_level_(max_level), flags_(0) {}

VectorMetaData(const VectorMetaData &src) :
label_(src.label_), max_level_(src.max_level_)
{flags_ = char(src.flags_);}

// mark methods
void mark(Flags flag) {
flags_ |= flag;
}
void unmark(Flags flag) {
flags_ &= ~flag;
}
bool ismarked(Flags flag) const {
return flags_ & flag;
}

labelType label_;
uint8_t max_level_;
std::atomic<uint8_t> flags_ = 0;
std::mutex NodeGuard;
};


class WriteBatch;

/**
 * Abstract interface of the HNSW graph data store.
 *
 * Every operation is pure virtual; concrete stores are obtained through the
 * static New*GraphData() factories declared at the bottom of the class (their
 * definitions are not part of this header). Mutating operations take a
 * WriteBatch * obtained from newWriteBatch() and handed back to
 * CommitWriteBatch().
 *
 * Lines tagged NOTE(review) are the inline pull-request review comments left
 * by the author on this interface; they describe planned changes and are kept
 * verbatim (apart from obvious typos).
 */
class absGraphData {
public:
    absGraphData() = default;
    virtual ~absGraphData() = default;

    // ---- vector methods ----

    // Returns the raw vector blob stored under internal_id.
    virtual const char *getVectorByInternalId(idType internal_id) const = 0;

    // Batched variant of getVectorByInternalId(); the output goes to `results`
    // (presumably one entry per requested id, in order - TODO confirm).
    virtual void multiGetVectors(const std::vector<idType> &,
                                 std::vector<const char *> &results) const = 0;

    // Stores a new vector together with its label and max level and returns
    // an idType for it (presumably the new internal id).
    // NOTE(review): "This is a part of the raw data, should not be part of
    //               the absGraphData"
    virtual idType pushVector(const void *vector_data, int max_level, const labelType &label,
                              WriteBatch *wb) = 0;

    // permanently delete the vector and the edges, "free" the id
    // NOTE(review): "remove only the nodes and edges associated with the id"
    virtual void deleteVectorAndEdges(idType internalId, WriteBatch *wb) = 0;

    // ---- vectorMetaData methods ----
    // NOTE(review): "meta data is part of the index"

    // Read-only access to the metadata record of internal_id.
    virtual const VectorMetaData &vectorMetaDataById(idType internal_id) const = 0;

    // Mutable access to the metadata record of internal_id within `wb`.
    // Declared pure like the rest of the interface: as a non-pure virtual
    // without a definition it would be the class' key function and leave the
    // vtable undefined at link time.
    virtual VectorMetaData &vectorMetaDataById(idType internal_id, WriteBatch *wb) = 0;

    // ---- outgoing edges ----

    // Read-only view of the outgoing edges of the (element_id, level) node.
    virtual const absEdges &GetLevelOutgoingEdges(const graphNodeType &) const = 0;

    // Mutable view of the same edges, to be updated within the write batch.
    // NOTE(review): suggested change - rename this overload to
    //               GetLevelOutgoingEdgesForUpdate(const graphNodeType &, WriteBatch *)
    virtual absEdges &GetLevelOutgoingEdges(const graphNodeType &, WriteBatch *) = 0;

    // ---- incoming edges ----

    // fetch incoming from the database
    virtual const absEdges &GetLevelIncomingEdges(const graphNodeType &) const = 0;

    // NOTE(review): suggested change - rename this overload to
    //               GetLevelIncomingEdgesForUpdate(const graphNodeType &, WriteBatch *)
    virtual absEdges &GetLevelIncomingEdges(const graphNodeType &, WriteBatch *) = 0;

    // support only simple updates (add / delete target) operations
    // may not fetch the data from the database
    // NOTE(review): proposed edge-level API instead of exposing edge lists:
    //                 addEdge(source, dest, level)
    //                 addIncomingEdge(source, dest, level)
    //                 addBidirectionalEdge(id1, id2, level)
    //                 updateAllEdges(source, list_of_dest, level)
    virtual absEdges &GetLevelVirtualIncomingEdges(const graphNodeType &id, WriteBatch *) = 0;

    // ---- helper methods ----

    // scan the database for the first node after starting id that exist at level
    virtual idType getVectorIdByLevel(short level, idType startingId) const = 0;

    // get a pair of candidates to swap for the gc
    // first is a location that is permanent deleted
    // second is a location that is valid
    // start points is the last pair returned in the prev scan
    // NOTE(review): the comment above speaks of a pair, but the declaration
    //               takes and returns a single idType - confirm the intended
    //               contract.
    // NOTE(review): "belongs to meta-data"
    virtual idType getGarbadgeCollectionTarget(idType startPoint) const = 0;

    // ---- write batches ----

    // new and commit write batch
    // Ownership of the returned WriteBatch is not expressed by the types;
    // presumably CommitWriteBatch() consumes it - TODO confirm.
    virtual WriteBatch *newWriteBatch() = 0;
    virtual void CommitWriteBatch(WriteBatch *wb) = 0;

    virtual void shrinkToFit() = 0;

    // ---- (de)serialization ----

    virtual void save(std::ofstream &output) const = 0;
    virtual void restore(std::ifstream &input) = 0;

    // ---- factories ----
    // Each returns a raw absGraphData *; ownership is not expressed by the
    // type - presumably the caller owns the result (TODO confirm).

    // NOTE(review): "vector_size_bytes and vector alignment not needed here"
    static absGraphData *NewRamGraphData(std::shared_ptr<VecSimAllocator> allocator,
                                         size_t block_size,
                                         size_t max_num_outgoing_links,
                                         size_t vector_size_bytes,
                                         size_t initial_capacity,
                                         size_t vector_alignment);

    // NOTE(review): "remove"
    static absGraphData *NewRamWBGraphData(std::shared_ptr<VecSimAllocator> allocator,
                                           size_t block_size,
                                           size_t max_num_outgoing_links,
                                           size_t vector_size_bytes,
                                           size_t initial_capacity,
                                           size_t vector_alignment);

    // NOTE(review): suggested change - rename to
    //               NewSpeedDBGraphData(std::shared_ptr<VecSimAllocator> allocator,
    //                                   std::string db_path)
    static absGraphData *NewDBGraphData(std::shared_ptr<VecSimAllocator> allocator,
                                        std::string db_path);
};

Loading
Loading