diff --git a/apps/search_disk_index.cpp b/apps/search_disk_index.cpp index fdc0e79cd..b3312de58 100644 --- a/apps/search_disk_index.cpp +++ b/apps/search_disk_index.cpp @@ -29,9 +29,101 @@ #endif #define WARMUP false +#define DISKANN_DEBUG_INDIVIDUAL_RESULTS namespace po = boost::program_options; +#ifdef DISKANN_DEBUG_INDIVIDUAL_RESULTS +void dump_individual_results(uint64_t test_id, uint64_t query_num, uint32_t *gt_ids, float *gt_dists, uint64_t gt_dim, + const std::vector &query_result_ids, + const std::vector &query_result_dists, uint64_t recall_at, + const std::string &result_output_prefix) +{ + uint32_t cumulative_dist_matches = 0; + uint32_t cumulative_id_matches = 0; + std::stringstream results_stream; + std::stringstream per_query_stats_stream; + + per_query_stats_stream << "query_id\tid_matches\tdist_matches\ttotal_matches\trecall" << std::endl; + for (int qid = 0; qid < query_num; qid++) + { + results_stream << qid << "\t"; + uint32_t per_query_dist_matches = 0; + uint32_t per_query_id_matches = 0; + + for (uint64_t i = 0; i < recall_at; i++) + { + auto rindex = qid * recall_at + i; + results_stream << "(" << query_result_ids[rindex] << "," << query_result_dists[rindex] << ","; + + bool id_match = false; + bool dist_match = false; + for (uint64_t j = 0; j < recall_at; j++) + { + auto gindex = qid * gt_dim + j; + if (query_result_ids[rindex] == gt_ids[gindex]) + { + per_query_id_matches++; + id_match = true; + break; + } + else if (query_result_dists[rindex] / gt_dists[gindex] <= 1.0f) + { + per_query_dist_matches++; + dist_match = true; + break; + } + } + std::string code = "X"; + if (id_match) + { + code = "I"; + } + else if (dist_match) + { + code = "D"; + } + results_stream << code << "),"; + } + + results_stream << std::endl; + + cumulative_id_matches += per_query_id_matches; + cumulative_dist_matches += per_query_dist_matches; + per_query_stats_stream << qid << "\t" << per_query_id_matches << "\t" << per_query_dist_matches << "\t" + << per_query_id_matches + per_query_dist_matches << "\t" + << (per_query_id_matches + per_query_dist_matches) * 1.0f / recall_at << std::endl; + } + { + + std::string results_file = result_output_prefix + "_L" + std::to_string(test_id) + "_results.tsv"; + std::ofstream out(results_file); + out << results_stream.str() << std::endl; + } + { + std::string per_query_stats_file = result_output_prefix + "_L" + std::to_string(test_id) + "_query_stats.tsv"; + std::ofstream out(per_query_stats_file); + out << per_query_stats_stream.str() << std::endl; + } +} + +void write_gt_to_tsv(const std::string &cur_result_path, uint64_t query_num, uint32_t *gt_ids, float *gt_dists, + uint64_t gt_dim) +{ + std::ofstream gt_out(cur_result_path + "_gt.tsv"); + for (int i = 0; i < query_num; i++) + { + gt_out << i << "\t"; + for (int j = 0; j < gt_dim; j++) + { + gt_out << "(" << gt_ids[i * gt_dim + j] << "," << gt_dists[i * gt_dim + j] << "),"; + } + gt_out << std::endl; + } +} +#endif + + void print_stats(std::string category, std::vector percentiles, std::vector results) { diskann::cout << std::setw(20) << category << ": " << std::flush; @@ -54,7 +146,7 @@ void parse_labels_of_query(const std::string &filters_for_query, std::vector &label_ids_for_query) { std::vector label_strs_for_query; - diskann::split_string(filters_for_query, MULTIPLE_LABEL_SEPARATOR, label_strs_for_query); + diskann::split_string(filters_for_query, FILTER_OR_SEPARATOR, label_strs_for_query); for (auto &label_str_for_query : label_strs_for_query) { label_ids_for_query.push_back(pFlashIndex->get_converted_label(label_str_for_query)); @@ -70,7 +162,7 @@ void populate_label_ids(const std::vector &filters_of_queries, { std::vector label_ids_of_query; parse_labels_of_query(filters_of_queries[0], pFlashIndex, label_ids_of_query); - for (auto i = 0; i < query_count; i++) + for (uint32_t i = 0; i < query_count; i++) { label_ids_of_queries.push_back(label_ids_of_query); } @@ -318,6 +410,10 @@ int search_disk_index(diskann::Metric &metric, const std::string &index_path_pre query_result_ids[test_id].data(), recall_at, recall_at); best_recall = std::max(recall, best_recall); } +#ifdef DISKANN_DEBUG_INDIVIDUAL_RESULTS + dump_individual_results(test_id, query_num, gt_ids, gt_dists, gt_dim, query_result_ids[test_id], + query_result_dists[test_id], recall_at, result_output_prefix); +#endif diskann::cout << std::setw(6) << L << std::setw(12) << optimized_beamwidth << std::setw(16) << qps << std::setw(16) << mean_latency << std::setw(16) << latency_999 << std::setw(16) << mean_ios @@ -327,31 +423,14 @@ int search_disk_index(diskann::Metric &metric, const std::string &index_path_pre diskann::cout << std::setw(16) << recall << std::endl ; } else + { diskann::cout << std::endl; - - //std::stringstream rslts_string; - //for (auto x = 0; x < query_num; x++) - //{ - // rslts_string << "-----------------------------------------" << std::endl; - // rslts_string << "Query: " << x << std::endl; - // rslts_string << "GT: {"; - // for (auto rx = 0; rx < recall_at; rx++) - // { - // rslts_string << "(" << gt_ids[x* gt_dim + rx] << "," << gt_dists[x * gt_dim + rx] << "), "; - // } - // rslts_string << "}" << std::endl; - // rslts_string << "Results: {"; - // for (auto rx = 0; rx < recall_at; rx++) - // { - // rslts_string << "(" << query_result_ids[test_id][x * recall_at + rx] << "," - // << query_result_dists[test_id][x * recall_at + rx] << "), "; - // } - // rslts_string << "}" << std::endl; - // rslts_string << "-----------------------------------------" << std::endl; - //} - //diskann::cout << rslts_string.str() << std::endl; + } delete[] stats; } +#ifdef DISKANN_DEBUG_INDIVIDUAL_RESULTS + write_gt_to_tsv(result_output_prefix, query_num, gt_ids, gt_dists, gt_dim); +#endif diskann::cout << "Done searching. Now saving results " << std::endl; uint64_t test_id = 0; diff --git a/include/neighbor.h b/include/neighbor.h index 7e6b58a65..d7c0c25ed 100644 --- a/include/neighbor.h +++ b/include/neighbor.h @@ -109,11 +109,6 @@ class NeighborPriorityQueue return _cur < _size; } - void sort() - { - std::sort(_data.begin(), _data.begin() + _size); - } - size_t size() const { return _size; diff --git a/include/pq_flash_index.h b/include/pq_flash_index.h index d37988f78..5eaf85a06 100644 --- a/include/pq_flash_index.h +++ b/include/pq_flash_index.h @@ -18,6 +18,11 @@ #include "tsl/robin_set.h" #define FULL_PRECISION_REORDER_MULTIPLIER 3 +#define DEFAULT_VISITED_RESERVE_SIZE 4096 +//default max filters per query is set to the same +//as what we expect Bing to provide. If this is overkill, +//it can be set by clients in the load() function +#define DEFAULT_MAX_FILTERS_PER_QUERY 4096 namespace diskann { @@ -30,24 +35,28 @@ template class PQFlashIndex DISKANN_DLLEXPORT ~PQFlashIndex(); #ifdef EXEC_ENV_OLS - DISKANN_DLLEXPORT int load(diskann::MemoryMappedFiles &files, uint32_t num_threads, const char *index_prefix); + DISKANN_DLLEXPORT int load(diskann::MemoryMappedFiles &files, uint32_t num_threads, const char *index_prefix, + uint32_t max_filters_per_query = DEFAULT_MAX_FILTERS_PER_QUERY); #else // load compressed data, and obtains the handle to the disk-resident index - DISKANN_DLLEXPORT int load(uint32_t num_threads, const char *index_prefix); + DISKANN_DLLEXPORT int load(uint32_t num_threads, const char *index_prefix, + uint32_t max_filters_per_query = DEFAULT_MAX_FILTERS_PER_QUERY); #endif - DISKANN_DLLEXPORT void load_labels(const std::string& disk_index_filepath); - DISKANN_DLLEXPORT void load_label_medoid_map( - const std::string &labels_to_medoids_filepath, std::istream &medoid_stream); - DISKANN_DLLEXPORT void load_dummy_map(const std::string& dummy_map_filepath, std::istream &dummy_map_stream); + DISKANN_DLLEXPORT void load_labels(const std::string &disk_index_filepath); + DISKANN_DLLEXPORT void load_label_medoid_map(const std::string &labels_to_medoids_filepath, + std::istream &medoid_stream); + DISKANN_DLLEXPORT void load_dummy_map(const std::string &dummy_map_filepath, std::istream &dummy_map_stream); #ifdef EXEC_ENV_OLS DISKANN_DLLEXPORT int load_from_separate_paths(diskann::MemoryMappedFiles &files, uint32_t num_threads, const char *index_filepath, const char *pivots_filepath, - const char *compressed_filepath); + const char *compressed_filepath, + uint32_t max_filters_per_query); #else DISKANN_DLLEXPORT int load_from_separate_paths(uint32_t num_threads, const char *index_filepath, - const char *pivots_filepath, const char *compressed_filepath); + const char *pivots_filepath, const char *compressed_filepath, + uint32_t max_filters_per_query); #endif DISKANN_DLLEXPORT void load_cache_list(std::vector &node_list); @@ -116,7 +125,8 @@ template class PQFlashIndex protected: DISKANN_DLLEXPORT void use_medoids_data_as_centroids(); - DISKANN_DLLEXPORT void setup_thread_data(uint64_t nthreads, uint64_t visited_reserve = 4096); + DISKANN_DLLEXPORT void setup_thread_data(uint64_t nthreads, uint64_t visited_reserve = DEFAULT_VISITED_RESERVE_SIZE, + uint64_t max_filters_per_query = DEFAULT_MAX_FILTERS_PER_QUERY); DISKANN_DLLEXPORT void set_universal_label(const LabelT &label); @@ -189,7 +199,7 @@ template class PQFlashIndex // chunk_size = chunk size of each dimension chunk // pq_tables = float* [[2^8 * [chunk_size]] * _n_chunks] uint8_t *data = nullptr; - uint64_t _n_chunks; + uint64_t _n_chunks = 0; FixedChunkPQTable _pq_table; // distance comparator @@ -207,7 +217,7 @@ template class PQFlashIndex // we can optionally have multiple starting points uint32_t *_medoids = nullptr; // defaults to 1 - size_t _num_medoids; + size_t _num_medoids = 1; // by default, it is empty. If there are multiple // centroids, we pick the medoid corresponding to the // closest centroid as the starting point of search diff --git a/include/scratch.h b/include/scratch.h index 2f43e3365..d7a3758aa 100644 --- a/include/scratch.h +++ b/include/scratch.h @@ -150,7 +150,7 @@ template class SSDQueryScratch : public AbstractScratch NeighborPriorityQueue retset; std::vector full_retset; - SSDQueryScratch(size_t aligned_dim, size_t visited_reserve); + SSDQueryScratch(size_t aligned_dim, size_t visited_reserve, size_t max_filters_per_query); ~SSDQueryScratch(); void reset(); @@ -162,7 +162,7 @@ template class SSDThreadData SSDQueryScratch scratch; IOContext ctx; - SSDThreadData(size_t aligned_dim, size_t visited_reserve); + SSDThreadData(size_t aligned_dim, size_t visited_reserve, size_t max_filters_per_query); void clear(); }; diff --git a/include/utils.h b/include/utils.h index 463331435..52137c8b0 100644 --- a/include/utils.h +++ b/include/utils.h @@ -57,7 +57,7 @@ typedef int FileHandle; #define PBSTR "||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||" #define PBWIDTH 60 -#define MULTIPLE_LABEL_SEPARATOR "|" +#define FILTER_OR_SEPARATOR "|" inline bool file_exists_impl(const std::string &name, bool dirCheck = false) { diff --git a/src/disk_utils.cpp b/src/disk_utils.cpp index 016560217..59fc95b06 100644 --- a/src/disk_utils.cpp +++ b/src/disk_utils.cpp @@ -1274,7 +1274,9 @@ int build_disk_index(const char *dataFilePath, const char *indexFilePath, const augmented_labels_file = index_prefix_path + "_augmented_labels.txt"; if (filter_threshold != 0) { - dummy_remap_file = index_prefix_path + "_dummy_remap.txt"; + //changing the dummy map file from _dummy_map.txt to _disk.index_dummy_map.txt to keep with the + //convention that the index files all have the _disk.index prefix. + dummy_remap_file = index_prefix_path + "_disk.index_dummy_map.txt"; breakup_dense_points(data_file_to_use, labels_file_to_use, filter_threshold, augmented_data_file, augmented_labels_file, dummy_remap_file); // RKNOTE: This has large memory footprint, diff --git a/src/pq_flash_index.cpp b/src/pq_flash_index.cpp index a86609f1d..6700666b5 100644 --- a/src/pq_flash_index.cpp +++ b/src/pq_flash_index.cpp @@ -117,7 +117,7 @@ template inline T *PQFlashIndex::offset } template -void PQFlashIndex::setup_thread_data(uint64_t nthreads, uint64_t visited_reserve) +void PQFlashIndex::setup_thread_data(uint64_t nthreads, uint64_t visited_reserve, uint64_t max_filters_per_query) { diskann::cout << "Setting up thread-specific contexts for nthreads: " << nthreads << std::endl; // omp parallel for to generate unique thread IDs @@ -126,7 +126,7 @@ void PQFlashIndex::setup_thread_data(uint64_t nthreads, uint64_t visi { #pragma omp critical { - SSDThreadData *data = new SSDThreadData(this->_aligned_dim, visited_reserve); + SSDThreadData *data = new SSDThreadData(this->_aligned_dim, visited_reserve, max_filters_per_query); this->reader->register_thread(); data->ctx = this->reader->get_ctx(); this->_thread_data.push(data); @@ -598,7 +598,8 @@ LabelT PQFlashIndex::get_converted_label(const std::string &filter_la return _universal_filter_label; } std::stringstream stream; - stream << "Unable to find label in the Label Map"; + stream << "Unable to find label " << filter_label + << " in the Label Map "; diskann::cerr << stream.str() << std::endl; throw diskann::ANNException(stream.str(), -1, __FUNCSIG__, __FILE__, __LINE__); } @@ -674,8 +675,6 @@ inline bool PQFlashIndex::point_has_label(uint32_t point_id, LabelT l template bool PQFlashIndex::point_has_any_label(uint32_t point_id, const std::vector &label_ids) { - uint32_t start_vec = _pts_to_label_offsets[point_id]; - uint32_t num_lbls = _pts_to_label_counts[start_vec]; bool ret_val = false; for (auto &cur_lbl : label_ids) { @@ -950,10 +949,10 @@ template void PQFlashIndex::load_labels #ifdef EXEC_ENV_OLS template -int PQFlashIndex::load(MemoryMappedFiles &files, uint32_t num_threads, const char *index_prefix) +int PQFlashIndex::load(MemoryMappedFiles &files, uint32_t num_threads, const char *index_prefix, uint32_t max_filters_per_query) { #else -template int PQFlashIndex::load(uint32_t num_threads, const char *index_prefix) +template int PQFlashIndex::load(uint32_t num_threads, const char *index_prefix, uint32_t max_filters_per_query) { #endif std::string pq_table_bin = std::string(index_prefix) + "_pq_pivots.bin"; @@ -961,10 +960,10 @@ template int PQFlashIndex::load(uint32_ std::string _disk_index_file = std::string(index_prefix) + "_disk.index"; #ifdef EXEC_ENV_OLS return load_from_separate_paths(files, num_threads, _disk_index_file.c_str(), pq_table_bin.c_str(), - pq_compressed_vectors.c_str()); + pq_compressed_vectors.c_str(), max_filters_per_query); #else return load_from_separate_paths(num_threads, _disk_index_file.c_str(), pq_table_bin.c_str(), - pq_compressed_vectors.c_str()); + pq_compressed_vectors.c_str(), max_filters_per_query); #endif } @@ -972,12 +971,13 @@ template int PQFlashIndex::load(uint32_ template int PQFlashIndex::load_from_separate_paths(diskann::MemoryMappedFiles &files, uint32_t num_threads, const char *index_filepath, const char *pivots_filepath, - const char *compressed_filepath) + const char *compressed_filepath, uint32_t max_filters_per_query) { #else template int PQFlashIndex::load_from_separate_paths(uint32_t num_threads, const char *index_filepath, - const char *pivots_filepath, const char *compressed_filepath) + const char *pivots_filepath, const char *compressed_filepath, + uint32_t max_filters_per_query) { #endif std::string pq_table_bin = pivots_filepath; @@ -1067,7 +1067,7 @@ int PQFlashIndex::load_from_separate_paths(uint32_t num_threads, cons // bytes are needed to store the header and read in that many using our // 'standard' aligned file reader approach. reader->open(_disk_index_file); - this->setup_thread_data(num_threads); + this->setup_thread_data(num_threads, DEFAULT_VISITED_RESERVE_SIZE, max_filters_per_query); this->_max_nthreads = num_threads; char *bytes = getHeaderBytes(); @@ -1294,7 +1294,7 @@ void PQFlashIndex::cached_beam_search(const T *query1, const uint64_t const bool use_filter, const LabelT &filter_label, const bool use_reorder_data, QueryStats *stats) { - std::vector filters(1); + std::vector filters; filters.push_back(filter_label); cached_beam_search(query1, k_search, l_search, indices, distances, beam_width, use_filter, filters, std::numeric_limits::max(), use_reorder_data, stats); @@ -1480,8 +1480,6 @@ void PQFlashIndex::cached_beam_search(const T *query1, const uint64_t const std::vector& label_ids = filter_labels; //avoid renaming. std::vector lbl_vec; - retset.sort(); - while (retset.has_unexpanded_node() && num_ios < max_ios_for_query) { // clear iteration state @@ -1495,8 +1493,7 @@ void PQFlashIndex::cached_beam_search(const T *query1, const uint64_t for (const auto &lbl : label_ids) - { // assuming that number of OR labels is - // less than max frontier size allowed + { uint32_t lbl_marker = 0; while (lbl_marker < cur_list_size) { @@ -1607,25 +1604,13 @@ void PQFlashIndex::cached_beam_search(const T *query1, const uint64_t cur_expanded_dist = _disk_pq_table.l2_distance( // disk_pq does not support OPQ yet query_float, (uint8_t *)node_fp_coords_copy); } - if (use_filters) - { - location_t real_id = cached_nhood.first; - if (_dummy_pts.find(real_id) != _dummy_pts.end()) - { - real_id = _dummy_to_real_map[real_id]; - } - if (full_retset_ids.find(real_id) == full_retset_ids.end()) - { - full_retset.push_back(Neighbor((uint32_t)real_id, cur_expanded_dist)); - full_retset_ids.insert(real_id); - } - } - else + location_t real_id = cached_nhood.first; + if (full_retset_ids.find(real_id) == full_retset_ids.end()) { - full_retset.push_back(Neighbor((unsigned)cached_nhood.first, cur_expanded_dist)); + full_retset.push_back(Neighbor((uint32_t)real_id, cur_expanded_dist)); + full_retset_ids.insert(real_id); } - uint64_t nnbrs = cached_nhood.second.first; uint32_t *node_nbrs = cached_nhood.second.second; @@ -1689,23 +1674,11 @@ void PQFlashIndex::cached_beam_search(const T *query1, const uint64_t else cur_expanded_dist = _disk_pq_table.l2_distance(query_float, (uint8_t *)data_buf); } - if (use_filters) - { - location_t real_id = frontier_nhood.first; - if (_dummy_pts.find(real_id) != _dummy_pts.end()) - { - real_id = _dummy_to_real_map[real_id]; - } - - if (full_retset_ids.find(real_id) == full_retset_ids.end()) - { - full_retset.push_back(Neighbor(real_id, cur_expanded_dist)); - full_retset_ids.insert(real_id); - } - } - else + location_t real_id = frontier_nhood.first; + if (full_retset_ids.find(real_id) == full_retset_ids.end()) { - full_retset.push_back(Neighbor(frontier_nhood.first, cur_expanded_dist)); + full_retset.push_back(Neighbor(real_id, cur_expanded_dist)); + full_retset_ids.insert(real_id); } uint32_t *node_nbrs = (node_buf + 1); diff --git a/src/scratch.cpp b/src/scratch.cpp index c3836ccf1..287d41db8 100644 --- a/src/scratch.cpp +++ b/src/scratch.cpp @@ -93,13 +93,16 @@ template void SSDQueryScratch::reset() full_retset.clear(); } -template SSDQueryScratch::SSDQueryScratch(size_t aligned_dim, size_t visited_reserve) +template SSDQueryScratch::SSDQueryScratch(size_t aligned_dim, size_t visited_reserve, size_t max_filters_per_query) { size_t coord_alloc_size = ROUND_UP(sizeof(T) * aligned_dim, 256); diskann::alloc_aligned((void **)&coord_scratch, coord_alloc_size, 256); - diskann::alloc_aligned((void **)§or_scratch, defaults::MAX_N_SECTOR_READS * defaults::SECTOR_LEN, + + size_t max_sectors_in_scratch = (std::max)(defaults::MAX_N_SECTOR_READS, max_filters_per_query); + diskann::alloc_aligned((void **)§or_scratch, max_sectors_in_scratch * defaults::SECTOR_LEN, defaults::SECTOR_LEN); + diskann::alloc_aligned((void **)&this->_aligned_query_T, aligned_dim * sizeof(T), 8 * sizeof(T)); this->_pq_scratch = new PQScratch(defaults::MAX_GRAPH_DEGREE, aligned_dim); @@ -121,7 +124,7 @@ template SSDQueryScratch::~SSDQueryScratch() } template -SSDThreadData::SSDThreadData(size_t aligned_dim, size_t visited_reserve) : scratch(aligned_dim, visited_reserve) +SSDThreadData::SSDThreadData(size_t aligned_dim, size_t visited_reserve, size_t max_filters_per_query) : scratch(aligned_dim, visited_reserve, max_filters_per_query) { }