Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add metric #606

Merged
merged 1 commit into from
Dec 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions include/abstract_data_store.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@ template <typename data_t> class AbstractDataStore
// align the dimension by padding zeros.
virtual size_t get_aligned_dim() const = 0;

virtual size_t get_data_size() const = 0;

// populate the store with vectors (either from a pointer or bin file),
// potentially after pre-processing the vectors if the metric deems so
// e.g., normalizing vectors for cosine distance over floating-point vectors
Expand Down
2 changes: 2 additions & 0 deletions include/abstract_graph_store.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,8 @@ class AbstractGraphStore
// set during load
virtual size_t get_max_range_of_graph() = 0;

virtual size_t get_graph_size() = 0;

// Total internal points _max_points + _num_frozen_points
size_t get_total_points()
{
Expand Down
2 changes: 2 additions & 0 deletions include/abstract_index.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#include "types.h"
#include "index_config.h"
#include "index_build_params.h"
#include "percentile_stats.h"
#include <any>

namespace diskann
Expand Down Expand Up @@ -108,6 +109,7 @@ class AbstractIndex

virtual bool is_label_valid(const std::string &raw_label) const = 0;
virtual bool is_set_universal_label() const = 0;
virtual TableStats get_table_stats() const = 0;

private:
virtual void _build(const DataType &data, const size_t num_points_to_load, TagVector &tags) = 0;
Expand Down
4 changes: 4 additions & 0 deletions include/in_mem_data_store.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ template <typename data_t> class InMemDataStore : public AbstractDataStore<data_

virtual size_t get_aligned_dim() const override;

virtual size_t get_data_size() const override;

// Populate internal data from unaligned data while doing alignment and any
// normalization that is required.
virtual void populate_data(const data_t *vectors, const location_t num_pts) override;
Expand Down Expand Up @@ -75,6 +77,8 @@ template <typename data_t> class InMemDataStore : public AbstractDataStore<data_

size_t _aligned_dim;

size_t _data_size = 0;

// It may seem weird to put distance metric along with the data store class,
// but this gives us perf benefits as the datastore can do distance
// computations during search and compute norms of vectors internally without
Expand Down
3 changes: 3 additions & 0 deletions include/in_mem_graph_store.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ class InMemGraphStore : public AbstractGraphStore
virtual size_t get_max_range_of_graph() override;
virtual uint32_t get_max_observed_degree() override;

virtual size_t get_graph_size() override;

protected:
virtual std::tuple<uint32_t, uint32_t, size_t> load_impl(const std::string &filename, size_t expected_num_points);
#ifdef EXEC_ENV_OLS
Expand All @@ -44,6 +46,7 @@ class InMemGraphStore : public AbstractGraphStore
private:
size_t _max_range_of_graph = 0;
uint32_t _max_observed_degree = 0;
size_t _graph_size = 0;

std::vector<std::vector<uint32_t>> _graph;
};
Expand Down
3 changes: 3 additions & 0 deletions include/in_mem_static_graph_store.h
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,8 @@ class InMemStaticGraphStore : public AbstractGraphStore
virtual size_t get_max_range_of_graph() override;
virtual uint32_t get_max_observed_degree() override;

virtual size_t get_graph_size() override;

protected:
virtual std::tuple<uint32_t, uint32_t, size_t> load_impl(const std::string& filename, size_t expected_num_points);
#ifdef EXEC_ENV_OLS
Expand All @@ -69,6 +71,7 @@ class InMemStaticGraphStore : public AbstractGraphStore
private:
size_t _max_range_of_graph = 0;
uint32_t _max_observed_degree = 0;
size_t _graph_size = 0;

std::vector<size_t> _node_index;
std::vector<std::uint32_t> _graph;
Expand Down
5 changes: 5 additions & 0 deletions include/index.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
#include "in_mem_data_store.h"
#include "in_mem_graph_store.h"
#include "abstract_index.h"
#include "percentile_stats.h"
#include <bitset>

#include "quantized_distance.h"
Expand Down Expand Up @@ -306,6 +307,8 @@ template <typename T, typename TagT = uint32_t, typename LabelT = uint32_t> clas

DISKANN_DLLEXPORT void count_nodes_at_bfs_levels();

DISKANN_DLLEXPORT TableStats get_table_stats() const override;

// This variable MUST be updated if the number of entries in the metadata
// change.
DISKANN_DLLEXPORT static const int METADATA_ROWS = 5;
Expand Down Expand Up @@ -567,6 +570,8 @@ template <typename T, typename TagT = uint32_t, typename LabelT = uint32_t> clas

simple_bitmask_buf _bitmask_buf;

TableStats _table_stats;

static const float INDEX_GROWTH_FACTOR;
};
} // namespace diskann
17 changes: 17 additions & 0 deletions include/percentile_stats.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,23 @@ struct QueryStats
unsigned n_hops = 0; // # search hops
};

// Plain aggregate of memory-usage and population counters for an index.
// Every counter starts at zero. Field order is part of the layout and of
// positional aggregate initialization, so it must not be rearranged.
struct TableStats
{
    // --- memory accounting ---
    size_t total_mem_usage{0}; // overall footprint
    size_t node_mem_usage{0};  // vector data store
    size_t graph_mem_usage{0}; // graph adjacency data
    size_t label_mem_usage{0}; // label bitmask buffer

    // --- population counters ---
    size_t node_count{0};        // points in the index
    size_t label_count{0};       // entries in the label map
    size_t label_total_count{0}; // label tokens parsed across all points

    // --- streaming (insert/delete) bookkeeping ---
    size_t tag_memory_usage{0}; // tag data and tag<->location maps
    size_t insert_count{0};     // successful inserts
    size_t delete_count{0};     // successful deletes
    size_t active_nodes{0};     // incremented on insert, decremented on delete
};

template <typename T>
inline T get_percentile_stats(QueryStats *stats, uint64_t len, float percentile,
const std::function<T(const QueryStats &)> &member_fn)
Expand Down
4 changes: 4 additions & 0 deletions include/pq_data_store.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ template <typename data_t> class PQDataStore : public AbstractDataStore<data_t>
// for Quantized data stores.
virtual size_t get_aligned_dim() const override;

virtual size_t get_data_size() const override;

// Populate quantized data from unaligned data using PQ functionality
virtual void populate_data(const data_t *vectors, const location_t num_pts) override;
virtual void populate_data(const std::string &filename, const size_t offset) override;
Expand Down Expand Up @@ -86,6 +88,8 @@ template <typename data_t> class PQDataStore : public AbstractDataStore<data_t>
uint8_t *_quantized_data = nullptr;
size_t _num_chunks = 0;

size_t _data_size = 0;

// REFACTOR TODO: Doing this temporarily before refactoring OPQ into
// its own class. Remove later.
bool _use_opq = false;
Expand Down
4 changes: 4 additions & 0 deletions include/pq_flash_index.h
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,8 @@ template <typename T, typename LabelT = uint32_t> class PQFlashIndex

DISKANN_DLLEXPORT uint64_t get_data_dim();

DISKANN_DLLEXPORT TableStats get_table_stats();

std::shared_ptr<AlignedFileReader> &reader;

DISKANN_DLLEXPORT diskann::Metric get_metric();
Expand Down Expand Up @@ -241,6 +243,8 @@ template <typename T, typename LabelT = uint32_t> class PQFlashIndex
tsl::robin_map<uint32_t, std::vector<uint32_t>> _real_to_dummy_map;
std::unordered_map<std::string, LabelT> _label_map;

TableStats _table_stats;

#ifdef EXEC_ENV_OLS
// Set to a larger value than the actual header to accommodate
// any additions we make to the header. This is an outer limit
Expand Down
9 changes: 9 additions & 0 deletions src/in_mem_data_store.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ InMemDataStore<data_t>::InMemDataStore(const location_t num_points, const size_t
_aligned_dim = ROUND_UP(dim, _distance_fn->get_required_alignment());
alloc_aligned(((void **)&_data), this->_capacity * _aligned_dim * sizeof(data_t), 8 * sizeof(data_t));
std::memset(_data, 0, this->_capacity * _aligned_dim * sizeof(data_t));

_data_size = this->_capacity * _aligned_dim * sizeof(data_t);
}

template <typename data_t> InMemDataStore<data_t>::~InMemDataStore()
Expand All @@ -33,6 +35,11 @@ template <typename data_t> size_t InMemDataStore<data_t>::get_aligned_dim() cons
return _aligned_dim;
}

// Returns the cached byte size of the vector buffer. The value is refreshed
// wherever the buffer is (re)allocated: the constructor, expand() and shrink().
template <typename data_t>
size_t InMemDataStore<data_t>::get_data_size() const
{
    return this->_data_size;
}

template <typename data_t> size_t InMemDataStore<data_t>::get_alignment_factor() const
{
return _distance_fn->get_required_alignment();
Expand Down Expand Up @@ -251,6 +258,7 @@ template <typename data_t> location_t InMemDataStore<data_t>::expand(const locat
#else
realloc_aligned((void **)&_data, new_size * _aligned_dim * sizeof(data_t), 8 * sizeof(data_t));
#endif
this->_data_size = new_size * _aligned_dim * sizeof(data_t);
this->_capacity = new_size;
return this->_capacity;
}
Expand All @@ -277,6 +285,7 @@ template <typename data_t> location_t InMemDataStore<data_t>::shrink(const locat
#else
realloc_aligned((void **)&_data, new_size * _aligned_dim * sizeof(data_t), 8 * sizeof(data_t));
#endif
this->_data_size = new_size * _aligned_dim * sizeof(data_t);
this->_capacity = new_size;
return this->_capacity;
}
Expand Down
6 changes: 6 additions & 0 deletions src/in_mem_graph_store.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,7 @@ std::tuple<uint32_t, uint32_t, size_t> InMemGraphStore::load_impl(const std::str
_max_range_of_graph = k;
}
}
_graph_size = cc * sizeof(uint32_t);

diskann::cout << "done. Index has " << nodes_read << " nodes and " << cc << " out-edges, _start is set to " << start
<< std::endl;
Expand Down Expand Up @@ -241,4 +242,9 @@ uint32_t InMemGraphStore::get_max_observed_degree()
return _max_observed_degree;
}

size_t InMemGraphStore::get_graph_size()
{
return _graph_size;
}

} // namespace diskann
6 changes: 6 additions & 0 deletions src/in_mem_static_graph_store.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,7 @@ std::tuple<uint32_t, uint32_t, size_t> InMemStaticGraphStore::load_impl(const st
}
}

_graph_size = cc * sizeof(uint32_t);
diskann::cout << "done. Index has " << nodes_read << " nodes and " << cc << " out-edges, _start is set to " << start
<< std::endl;
return std::make_tuple(nodes_read, start, file_frozen_pts);
Expand All @@ -197,4 +198,9 @@ uint32_t InMemStaticGraphStore::get_max_observed_degree()
return _max_observed_degree;
}

size_t InMemStaticGraphStore::get_graph_size()
{
return _graph_size;
}

} // namespace diskann
37 changes: 36 additions & 1 deletion src/index.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -425,6 +425,10 @@ size_t Index<T, TagT, LabelT>::load_tags(const std::string tag_filename)
#else
load_bin<TagT>(std::string(tag_filename), tag_data, file_num_points, file_dim);
#endif
this->_table_stats.tag_memory_usage =
file_num_points * file_dim * sizeof(TagT)
+ file_num_points * (sizeof(TagT) + sizeof(uint32_t))
+ file_num_points * (sizeof(TagT) + sizeof(uint32_t));

if (file_dim != 1)
{
Expand Down Expand Up @@ -562,6 +566,9 @@ void Index<T, TagT, LabelT>::load(const char *filename, uint32_t num_threads, ui
std::string delete_set_file = std::string(filename) + ".del";
std::string graph_file = std::string(filename);
data_file_num_pts = load_data(data_file);
this->_table_stats.node_count = data_file_num_pts;
this->_table_stats.node_mem_usage = this->_data_store->get_data_size();

if (file_exists(delete_set_file))
{
load_delete_set(delete_set_file);
Expand All @@ -571,6 +578,7 @@ void Index<T, TagT, LabelT>::load(const char *filename, uint32_t num_threads, ui
tags_file_num_pts = load_tags(tags_file);
}
graph_num_pts = load_graph(graph_file, data_file_num_pts);
this->_table_stats.graph_mem_usage = _graph_store->get_graph_size();
#endif
}
else
Expand All @@ -594,8 +602,12 @@ void Index<T, TagT, LabelT>::load(const char *filename, uint32_t num_threads, ui
if (file_exists(labels_file))
{
_label_map = load_label_map(labels_map_file);
this->_table_stats.label_count = _label_map.size();

parse_label_file_in_bitset(labels_file, label_num_pts, _label_map.size());
assert(label_num_pts == data_file_num_pts - _num_frozen_pts);
this->_table_stats.label_mem_usage = _bitmask_buf._buf.size() * sizeof(std::uint64_t);

if (file_exists(labels_to_medoids))
{
std::ifstream medoid_stream(labels_to_medoids);
Expand Down Expand Up @@ -644,8 +656,14 @@ void Index<T, TagT, LabelT>::load(const char *filename, uint32_t num_threads, ui
{
_empty_slots.insert((uint32_t)i);
}

reposition_frozen_point_to_end();

// Roll the per-component byte counts up into the overall total. The sum has
// to be stored in total_mem_usage: assigning it to tag_memory_usage would
// overwrite the tag figure recorded by load_tags() and leave the total at 0.
_table_stats.total_mem_usage = _table_stats.node_mem_usage
+ _table_stats.graph_mem_usage
+ _table_stats.label_mem_usage
+ _table_stats.tag_memory_usage;

diskann::cout << "Num frozen points:" << _num_frozen_pts << " _nd: " << _nd << " _start: " << _start
<< " size(_location_to_tag): " << _location_to_tag.size()
<< " size(_tag_to_location):" << _tag_to_location.size() << " Max points: " << _max_points
Expand Down Expand Up @@ -2024,6 +2042,7 @@ void Index<T, TagT, LabelT>::parse_label_file_in_bitset(const std::string& label
simple_bitmask bm(_bitmask_buf.get_bitmask(line_cnt), _bitmask_buf._bitmask_size);
bm.set(token_as_num);
_labels.insert(token_as_num);
_table_stats.label_total_count++;

lbl_pos = next_lbl_pos + 1;
}
Expand Down Expand Up @@ -3235,6 +3254,11 @@ int Index<T, TagT, LabelT>::insert_point(const T *point, const TagT tag, const s

inter_insert(location, pruned_list, scratch);

// only support single thread insert
_table_stats.insert_count++;
_table_stats.active_nodes++;
_table_stats.node_count++;

return 0;
}

Expand Down Expand Up @@ -3285,6 +3309,11 @@ template <typename T, typename TagT, typename LabelT> int Index<T, TagT, LabelT>
_delete_set->insert(location);
_location_to_tag.erase(location);
_tag_to_location.erase(tag);

//only support single thread delete
_table_stats.delete_count++;
_table_stats.active_nodes--;

return 0;
}

Expand Down Expand Up @@ -3565,6 +3594,12 @@ size_t Index<T, TagT, LabelT>::search_string_range(const std::string& str, char
return std::string::npos;
}

// Returns a by-value snapshot of the statistics gathered so far (during load
// and on insert/delete). The caller gets a copy; later updates to the index
// are not reflected in it.
template <typename T, typename TagT, typename LabelT>
TableStats Index<T, TagT, LabelT>::get_table_stats() const
{
    return this->_table_stats;
}

/* Internals of the library */
template <typename T, typename TagT, typename LabelT> const float Index<T, TagT, LabelT>::INDEX_GROWTH_FACTOR = 1.5f;

Expand Down
7 changes: 7 additions & 0 deletions src/pq_data_store.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -234,9 +234,16 @@ template <typename data_t> location_t PQDataStore<data_t>::load_impl(const std::
auto pivots_file = _pq_distance_fn->get_pivot_data_filename(file_prefix);
_pq_distance_fn->load_pivot_data(pivots_file, _num_chunks);

// The quantized buffer is declared as uint8_t*, i.e. one byte per chunk code,
// so its footprint must not be scaled by sizeof(data_t) (which would overstate
// it 4x for float data).
// NOTE(review): assumes _quantized_data holds num_points * _num_chunks codes — confirm against the load call above.
_data_size = num_points * _num_chunks * sizeof(uint8_t);

return this->_capacity;
}

// Returns the byte size recorded by load_impl() for the quantized data;
// zero until a load has completed.
template <typename data_t>
size_t PQDataStore<data_t>::get_data_size() const
{
    return this->_data_size;
}

template <typename data_t> location_t PQDataStore<data_t>::expand(const location_t new_size)
{
throw std::logic_error("Not implemented yet");
Expand Down
Loading
Loading