From e66952188c97ff2277f6228ad8c7237141882b5d Mon Sep 17 00:00:00 2001 From: Neelam Mahapatro Date: Mon, 29 Jan 2024 22:07:14 +0530 Subject: [PATCH] change raw_labels to populate_labels --- include/abstract_filter_store.h | 5 +--- include/in_mem_filter_store.h | 6 ++--- include/index.h | 7 ----- include/utils.h | 47 --------------------------------- src/in_mem_filter_store.cpp | 17 +++++------- src/index.cpp | 12 +++------ 6 files changed, 13 insertions(+), 81 deletions(-) diff --git a/include/abstract_filter_store.h b/include/abstract_filter_store.h index 09967b7c0..ee9b15dc1 100644 --- a/include/abstract_filter_store.h +++ b/include/abstract_filter_store.h @@ -40,7 +40,7 @@ template class AbstractFilterStore DISKANN_DLLEXPORT virtual std::pair get_universal_label() = 0; // takes raw label file and then genrate internal mapping file and keep the info of mapping - DISKANN_DLLEXPORT virtual size_t load_raw_labels(const std::string &raw_labels_file, + DISKANN_DLLEXPORT virtual size_t populate_labels(const std::string &raw_labels_file, const std::string &raw_universal_label) = 0; DISKANN_DLLEXPORT virtual void save_labels(const std::string &save_path, const size_t total_points) = 0; @@ -59,9 +59,6 @@ template class AbstractFilterStore private: size_t _num_points; - // populates pts_to labels and _labels from given label file - virtual size_t parse_label_file(const std::string &label_file) = 0; - // mark Index as friend so it can access protected loads template friend class Index; }; diff --git a/include/in_mem_filter_store.h b/include/in_mem_filter_store.h index ff187ccff..cbeab45b8 100644 --- a/include/in_mem_filter_store.h +++ b/include/in_mem_filter_store.h @@ -34,7 +34,7 @@ template class InMemFilterStore : public AbstractFilterSto std::pair get_universal_label() override; // ideally takes raw label file and then genrate internal mapping file and keep the info of mapping - size_t load_raw_labels(const std::string &raw_labels_file, const std::string 
&raw_universal_label) override; + size_t populate_labels(const std::string &raw_labels_file, const std::string &raw_universal_label) override; void save_labels(const std::string &save_path, const size_t total_points) override; // For dynamic filtered build, we compact the data and hence location_to_labels, we need the compacted version of @@ -50,6 +50,7 @@ template class InMemFilterStore : public AbstractFilterSto protected: // This is for internal use and only loads already parsed file, used by index in during load(). + // populates _location_to_labels and _labels from given label file size_t load_labels(const std::string &labels_file) override; void load_label_map(const std::string &labels_map_file) override; void load_universal_labels(const std::string &universal_labels_file) override; @@ -69,9 +70,6 @@ template class InMemFilterStore : public AbstractFilterSto // 2. from _label_map and _mapped_universal_label, we can know what is raw universal label. Hence seems duplicate // std::string _raw_universal_label; - // populates _loaction_to labels and _labels from given label file - size_t parse_label_file(const std::string &label_file); - bool detect_common_filters_by_set_intersection(uint32_t point_id, bool search_invocation, const std::vector &incoming_labels); }; diff --git a/include/index.h b/include/index.h index 4e3dedce0..e40fba51b 100644 --- a/include/index.h +++ b/include/index.h @@ -112,11 +112,6 @@ template clas const IndexFilterParams &filter_params, const std::vector &tags = std::vector()); - // Filtered support streaming index - DISKANN_DLLEXPORT void build_filtered_index(const T *data, const size_t num_points_to_load, - const IndexFilterParams &filter_params, - const std::vector &tags = std::vector()); - // DISKANN_DLLEXPORT void set_universal_label(const LabelT &label); DISKANN_DLLEXPORT void set_universal_label(const std::string &raw_labels); @@ -249,8 +244,6 @@ template clas // determines navigating node of the graph by calculating medoid of
datafopt uint32_t calculate_entry_point(); - void parse_label_file(const std::string &label_file, size_t &num_pts_labels); - // Returns the locations of start point and frozen points suitable for use // with iterate_to_fixed_point. std::vector get_init_ids(); diff --git a/include/utils.h b/include/utils.h index b2a574c87..19033d786 100644 --- a/include/utils.h +++ b/include/utils.h @@ -176,53 +176,6 @@ inline int delete_file(const std::string &fileName) } } -inline void convert_label_to_numeric(const std::string &inFileName, const std::string &outFileName, - const std::string &mapFileName, const std::string &unv_label) -{ - std::unordered_map string_int_map; - std::ofstream label_writer(outFileName); - std::ifstream label_reader(inFileName); - if (unv_label != "") - string_int_map[unv_label] = 0; // if universal label is provided map it to 0 always - std::string line, token; - while (std::getline(label_reader, line)) - { - std::istringstream new_iss(line); - std::vector lbls; - while (getline(new_iss, token, ',')) - { - token.erase(std::remove(token.begin(), token.end(), '\n'), token.end()); - token.erase(std::remove(token.begin(), token.end(), '\r'), token.end()); - if (string_int_map.find(token) == string_int_map.end()) - { - uint32_t nextId = (uint32_t)string_int_map.size() + 1; - string_int_map[token] = nextId; // nextId can never be 0 - } - lbls.push_back(string_int_map[token]); - } - if (lbls.size() <= 0) - { - std::cout << "No label found"; - exit(-1); - } - for (size_t j = 0; j < lbls.size(); j++) - { - if (j != lbls.size() - 1) - label_writer << lbls[j] << ","; - else - label_writer << lbls[j] << std::endl; - } - } - label_writer.close(); - - std::ofstream map_writer(mapFileName); - for (auto mp : string_int_map) - { - map_writer << mp.first << "\t" << mp.second << std::endl; - } - map_writer.close(); -} - #ifdef EXEC_ENV_OLS class AlignedFileReader; #endif diff --git a/src/in_mem_filter_store.cpp b/src/in_mem_filter_store.cpp index a98e5c050..e125f8adb 
100644 --- a/src/in_mem_filter_store.cpp +++ b/src/in_mem_filter_store.cpp @@ -94,7 +94,7 @@ template std::pair InMemFilterStore -size_t InMemFilterStore::load_raw_labels(const std::string &raw_labels_file, +size_t InMemFilterStore::populate_labels(const std::string &raw_labels_file, const std::string &raw_universal_label) { std::string raw_label_file_path = @@ -105,13 +105,7 @@ size_t InMemFilterStore::load_raw_labels(const std::string &raw_labe std::string mem_labels_int_map_file = raw_label_file_path + "_labels_map.txt"; _label_map = InMemFilterStore::convert_label_to_numeric(raw_labels_file, labels_file_to_use, mem_labels_int_map_file, raw_universal_label); - return parse_label_file(labels_file_to_use); -} - -template size_t InMemFilterStore::load_labels(const std::string &labels_file) -{ - // parse the generated label file - return parse_label_file(labels_file); + return load_labels(labels_file_to_use); } template void InMemFilterStore::load_label_map(const std::string &labels_map_file) @@ -137,7 +131,7 @@ template void InMemFilterStore::load_label_map // TODO: throw exception from here and also make sure filtered_index is set appropriately for both build and // search of index. 
diskann::cout << "Warning: Can't load label map file please make sure it was generate, either by " - "filter_store->load_raw_labels() " + "filter_store->populate_labels() " "then index->save() or convert_label_to_numeric() method in case of dynamic index" << std::endl; } @@ -269,7 +263,7 @@ template label_type InMemFilterStore::get_nume throw diskann::ANNException(stream.str(), -1, __FUNCSIG__, __FILE__, __LINE__); } -template size_t InMemFilterStore::parse_label_file(const std::string &label_file) +template size_t InMemFilterStore::load_labels(const std::string &label_file) { // Format of Label txt file: filters with comma separators // Format of Label txt file: filters with comma separators @@ -354,6 +348,9 @@ std::unordered_map InMemFilterStore::conver std::ofstream label_writer(outFileName); std::ifstream label_reader(inFileName); std::string line, token; + if (raw_universal_label != "") + string_int_map[raw_universal_label] = 0; // if universal label is provided map it to 0 always + while (std::getline(label_reader, line)) { std::istringstream new_iss(line); diff --git a/src/index.cpp b/src/index.cpp index e8985fc0d..5722683a6 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -1743,12 +1743,6 @@ void Index::build(const std::string &data_file, const size_t nu std::cout << "Indexing time: " << diff.count() << "\n"; } -template -void Index::parse_label_file(const std::string &label_file, size_t &num_points) -{ - num_points = _filter_store->load_labels(label_file); -} - template void Index::set_universal_label(const std::string &raw_label) { @@ -1760,7 +1754,7 @@ void Index::build_filtered_index(const char *filename, const si const IndexFilterParams &filter_params, const std::vector &tags) { _filtered_index = true; - size_t num_points_labels = _filter_store->load_raw_labels(filter_params.label_file, ""); + size_t num_points_labels = _filter_store->populate_labels(filter_params.label_file, ""); if (filter_params.universal_label != "") { 
_filter_store->set_universal_label(filter_params.universal_label); @@ -1769,7 +1763,7 @@ void Index::build_filtered_index(const char *filename, const si this->build(filename, num_points_to_load, tags); } -template +/*template void Index::build_filtered_index(const T *data, const size_t num_points_to_load, const IndexFilterParams &filter_params, const std::vector &tags) { @@ -1781,7 +1775,7 @@ void Index::build_filtered_index(const T *data, const size_t nu } calculate_best_medoids(num_points_to_load, 25); this->build(data, num_points_to_load, tags); -} +}*/ template std::pair Index::_search(const DataType &query, const size_t K, const uint32_t L,