Skip to content

Commit

Permalink
Changes in build to separate brute forceable points for filtered indices
Browse files Browse the repository at this point in the history
  • Loading branch information
gopal-msr committed Nov 21, 2024
1 parent d9eaf13 commit b2485d8
Show file tree
Hide file tree
Showing 12 changed files with 251 additions and 13 deletions.
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
[submodule "gperftools"]
path = gperftools
url = https://github.com/gperftools/gperftools.git
[submodule "CRoaring"]
path = CRoaring
url = https://github.com/RoaringBitmap/CRoaring
25 changes: 25 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,31 @@ endif()

add_definitions(-DMKL_ILP64)


# Roaring Bitmap
#if ( NOT EXISTS "${PROJECT_SOURCE_DIR}/CRoaring/LICENSE")
# message(FATAL_ERROR "The RoaringBitmap submodule was not found. "
# " Please run 'git submodule init' followed by 'git submodule update'")
#endif()
#add_subdirectory(CRoaring)
#include_directories(CRoaring/include/roaring)

#set (CROARING_LIBRARY "${PROJECT_SOURCE_DIR}/CRoaring/build/src/Release/roaring.lib")
#add_custom_target(build_croaring DEPENDS ${CROARING_LIBRARY})
#if (MSVC)
# add_custom_command(OUTPUT ${CROARING_LIBRARY}
# COMMAND ${CMAKE_VS_MSBUILD_COMMAND} build/RoaringBitmap.sln /m /nologo
# /t:roaring /p:Configuration="Release"
# /property:Platform="x64"
# /p:PlatformToolset=v${MSVC_TOOLSET_VERSION}
# /p:WindowsTargetPlatformVersion=${CMAKE_VS_WINDOWS_TARGET_PLATFORM_VERSION}
# WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/CRoaring)
#endif()
#add_library(croaring_lib STATIC IMPORTED)
#set_target_properties(croaring_lib PROPERTIES IMPORTED_LOCATION "${CROARING_LIBRARY}")



# Section for tcmalloc. The DiskANN tools are always linked to tcmalloc. For Windows, they also need to
# force-include the _tcmalloc symbol for enabling tcmalloc.
#
Expand Down
1 change: 1 addition & 0 deletions CRoaring
Submodule CRoaring added at ad487e
1 change: 1 addition & 0 deletions include/common_includes.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,3 +25,4 @@
#include <sys/stat.h>
#include <unordered_map>
#include <vector>
#include <map>
3 changes: 2 additions & 1 deletion include/disk_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,8 @@ DISKANN_DLLEXPORT int build_disk_index(
std::string(""), // default is empty string for no label_file
const std::string &universal_label = "",
const uint32_t filter_threshold = 0,
const uint32_t Lf = 0); // default is empty string for no universal label
const uint32_t Lf = 0,
const uint32_t filter_bf_threshold = 0); // default is empty string for no universal label

template <typename T>
DISKANN_DLLEXPORT void
Expand Down
14 changes: 14 additions & 0 deletions include/filter_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,14 @@ typedef std::tuple<std::vector<label_set>,
typedef std::tuple<std::vector<std::vector<uint32_t>>, uint64_t>
load_label_index_return_values;


namespace diskann {

//CONSTANTS
DISKANN_DLLEXPORT extern const char* NO_LABEL_FOR_POINT;
DISKANN_DLLEXPORT extern const char FILTERS_LABEL_DELIMITER;


template <typename T>
DISKANN_DLLEXPORT void
generate_label_indices(path input_data_path, path final_index_path_prefix,
Expand All @@ -76,6 +83,13 @@ generate_label_specific_vector_files_compat(
tsl::robin_map<std::string, uint32_t> labels_to_number_of_points,
std::vector<label_set> point_ids_to_labels, label_set all_labels);

// Declaration only; the definition is not part of this header.
// NOTE(review): going by the name and by the call site in build_disk_index,
// this presumably separates out the points that can be served by brute-force
// search for filtered indices -- confirm against the definition.
//
//   base_file           - path of the vector data file (build_disk_index
//                         passes the data file it is about to index).
//   label_file          - path of the label file for those points.
//   filter_bf_threshold - threshold controlling the separation; its exact
//                         meaning is not visible here -- TODO confirm.
//   new_lbl_file        - presumably an output path: build_disk_index uses
//                         this file as its label file after the call.
//   bf_data_file        - presumably an output path for the brute-forceable
//                         points -- TODO confirm.
template<typename T>
DISKANN_DLLEXPORT void separate_brute_forceable_points(
const std::string& base_file, const std::string& label_file,
const location_t filter_bf_threshold,
const std::string& new_lbl_file,
const std::string& bf_data_file);

/*
* For each label, generates a file containing all vectors that have said label.
* Also copies data from original bin file to new dimension-aligned file.
Expand Down
2 changes: 1 addition & 1 deletion include/index.h
Original file line number Diff line number Diff line change
Expand Up @@ -438,7 +438,7 @@ class Index : public AbstractIndex {

bool _use_universal_label = false;
LabelT _universal_label = 0;
uint32_t _filterIndexingQueueSize;
uint32_t _filter_indexing_queue_size;
std::unordered_map<std::string, LabelT> _label_map;

// Indexing parameters
Expand Down
27 changes: 27 additions & 0 deletions include/utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,33 @@ inline int delete_file(const std::string &fileName) {
}
}

// Returns a copy of `str` with leading and trailing whitespace removed.
//
// Whitespace is whatever std::isspace reports for the current C locale.
// An empty or all-whitespace input yields an empty string; whitespace in the
// interior of the string is left untouched.
inline std::string trim(const std::string& str) {
  // std::isspace has undefined behaviour for negative arguments other than
  // EOF. Where plain char is signed, any byte >= 0x80 is negative, so each
  // character is converted to unsigned char before classification.
  const auto is_space = [](char c) {
    return std::isspace(static_cast<unsigned char>(c)) != 0;
  };

  // Index of the first non-whitespace character (str.size() if none).
  size_t start = 0;
  while (start < str.size() && is_space(str[start])) {
    ++start;
  }

  // One past the last non-whitespace character; never moves below `start`.
  size_t end = str.size();
  while (end > start && is_space(str[end - 1])) {
    --end;
  }

  return str.substr(start, end - start);
}

// Splits `str` at every occurrence of `sep`, trims each piece with trim(),
// and appends the pieces to `split_strings` (existing entries are kept).
// WARNING: this is a very inefficient, stream-based splitter. Use it at your
// own risk and keep it off hot paths.
inline void split_string(const std::string& str, const char sep, std::vector<std::string>& split_strings) {
  std::istringstream stream(str);
  for (std::string piece; std::getline(stream, piece, sep);) {
    split_strings.push_back(trim(piece));
  }
}

// generates formatted_label and _labels_map file.
inline void convert_labels_string_to_int(const std::string &inFileName,
const std::string &outFileName,
Expand Down
39 changes: 31 additions & 8 deletions src/disk_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@
#include "pq_flash_index.h"
#include "timer.h"
#include "tsl/robin_set.h"
#include "utils.h"
#include "filter_utils.h"

namespace diskann {

Expand Down Expand Up @@ -1114,22 +1116,25 @@ void create_disk_layout(const std::string base_file,
<< std::endl;
}



template <typename T, typename LabelT>
int build_disk_index(const char *dataFilePath, const char *indexFilePath,
const char *indexBuildParameters,
diskann::Metric compareMetric, bool use_opq,
const std::string &codebook_prefix, bool use_filters,
const std::string &label_file,
const std::string &universal_label,
const uint32_t filter_threshold, const uint32_t Lf) {
const uint32_t filter_threshold, const uint32_t Lf,
const uint32_t filter_bf_threshold) {
std::stringstream parser;
parser << std::string(indexBuildParameters);
std::string cur_param;
std::vector<std::string> param_list;
while (parser >> cur_param) {
param_list.push_back(cur_param);
}
if (param_list.size() < 5 || param_list.size() > 9) {
if (param_list.size() < 5 || param_list.size() > 10) {
diskann::cout
<< "Correct usage of parameters is R (max degree)\n"
"L (indexing list size, better if >= R)\n"
Expand Down Expand Up @@ -1210,6 +1215,8 @@ int build_disk_index(const char *dataFilePath, const char *indexFilePath,
disk_index_path +
"_dummy_map.txt"; // remap will be used if we break-up points of
// high label-density to create copies
std::string bf_data_file = disk_index_path + "_brute_force.txt";
std::string bf_excluded_label_file = disk_index_path + "_non_brute_force.txt";

std::string sample_base_prefix = index_prefix_path + "_sample";
// optional, used if disk index file must store pq data
Expand All @@ -1224,6 +1231,16 @@ int build_disk_index(const char *dataFilePath, const char *indexFilePath,
// cosine/ mips metrics
bool created_temp_file_for_processed_data = false;


//Brute force check.
if (use_filters && filter_bf_threshold > 0) {
assert(label_file != "");
diskann::separate_brute_forceable_points<T>(data_file_to_use, label_file,
filter_bf_threshold, bf_excluded_label_file, bf_data_file);
labels_file_original = bf_excluded_label_file;
created_temp_file_for_processed_data = true;
}

// output a new base file which contains extra dimension with sqrt(1 -
// ||x||^2/M^2) for every x, M is max norm of all points. Extra space on
// disk needed!
Expand Down Expand Up @@ -1484,38 +1501,44 @@ template DISKANN_DLLEXPORT int build_disk_index<int8_t, uint32_t>(
const char *indexBuildParameters, diskann::Metric compareMetric,
bool use_opq, const std::string &codebook_prefix, bool use_filters,
const std::string &label_file, const std::string &universal_label,
const uint32_t filter_threshold, const uint32_t Lf);
const uint32_t filter_threshold, const uint32_t Lf,
const uint32_t filter_bf_threshold);
template DISKANN_DLLEXPORT int build_disk_index<uint8_t, uint32_t>(
const char *dataFilePath, const char *indexFilePath,
const char *indexBuildParameters, diskann::Metric compareMetric,
bool use_opq, const std::string &codebook_prefix, bool use_filters,
const std::string &label_file, const std::string &universal_label,
const uint32_t filter_threshold, const uint32_t Lf);
const uint32_t filter_threshold, const uint32_t Lf,
const uint32_t filter_bf_threshold);
template DISKANN_DLLEXPORT int build_disk_index<float, uint32_t>(
const char *dataFilePath, const char *indexFilePath,
const char *indexBuildParameters, diskann::Metric compareMetric,
bool use_opq, const std::string &codebook_prefix, bool use_filters,
const std::string &label_file, const std::string &universal_label,
const uint32_t filter_threshold, const uint32_t Lf);
const uint32_t filter_threshold, const uint32_t Lf,
const uint32_t filter_bf_threshold);
// LabelT = uint16
template DISKANN_DLLEXPORT int build_disk_index<int8_t, uint16_t>(
const char *dataFilePath, const char *indexFilePath,
const char *indexBuildParameters, diskann::Metric compareMetric,
bool use_opq, const std::string &codebook_prefix, bool use_filters,
const std::string &label_file, const std::string &universal_label,
const uint32_t filter_threshold, const uint32_t Lf);
const uint32_t filter_threshold, const uint32_t Lf,
const uint32_t filter_bf_threshold);
template DISKANN_DLLEXPORT int build_disk_index<uint8_t, uint16_t>(
const char *dataFilePath, const char *indexFilePath,
const char *indexBuildParameters, diskann::Metric compareMetric,
bool use_opq, const std::string &codebook_prefix, bool use_filters,
const std::string &label_file, const std::string &universal_label,
const uint32_t filter_threshold, const uint32_t Lf);
const uint32_t filter_threshold, const uint32_t Lf,
const uint32_t filter_bf_threshold);
template DISKANN_DLLEXPORT int build_disk_index<float, uint16_t>(
const char *dataFilePath, const char *indexFilePath,
const char *indexBuildParameters, diskann::Metric compareMetric,
bool use_opq, const std::string &codebook_prefix, bool use_filters,
const std::string &label_file, const std::string &universal_label,
const uint32_t filter_threshold, const uint32_t Lf);
const uint32_t filter_threshold, const uint32_t Lf,
const uint32_t filter_bf_threshold);

template DISKANN_DLLEXPORT int build_merged_vamana_index<int8_t, uint32_t>(
std::string base_file, diskann::Metric compareMetric, uint32_t L,
Expand Down
1 change: 1 addition & 0 deletions src/dll/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ target_include_directories(${PROJECT_NAME} PRIVATE ${DISKANN_MKL_INCLUDE_DIRECTO
# Link settings for the DLL target: linker options first, then the libraries
# it is linked against (MKL, synchronization.lib, CRoaring).
target_link_options(${PROJECT_NAME} PRIVATE /DLL /IMPLIB:${DISKANN_DLL_IMPLIB} /LTCG)
target_link_libraries(${PROJECT_NAME} PRIVATE ${DISKANN_MKL_LINK_LIBRARIES})
target_link_libraries(${PROJECT_NAME} PRIVATE synchronization.lib)
# NOTE(review): in this change the only assignment to CROARING_LIBRARY is
# inside a commented-out block of the top-level CMakeLists.txt. Unless it is
# set somewhere else, this expands to nothing and no CRoaring library is
# actually linked -- confirm.
target_link_libraries(${PROJECT_NAME} PRIVATE ${CROARING_LIBRARY})

if (DISKANN_DLL_TCMALLOC_LINK_OPTIONS)
target_link_libraries(${PROJECT_NAME} PUBLIC ${DISKANN_DLL_TCMALLOC_LINK_OPTIONS})
Expand Down
Loading

0 comments on commit b2485d8

Please sign in to comment.