From 10c1b3abb4e0c5611670065ee8b66aa7dd8f3145 Mon Sep 17 00:00:00 2001 From: Harsha Vardhan Simhadri Date: Sun, 19 Sep 2021 15:31:23 -0700 Subject: [PATCH] Search disk index fix (#20) * change default cache behavior in search_disk_index; scale factors in float to int8 * added int8_to_float_scale file --- tests/search_disk_index.cpp | 6 +-- tests/utils/CMakeLists.txt | 9 +++++ tests/utils/float_bin_to_int8.cpp | 2 +- tests/utils/int8_to_float_scale.cpp | 62 +++++++++++++++++++++++++++++ 4 files changed, 75 insertions(+), 4 deletions(-) create mode 100644 tests/utils/int8_to_float_scale.cpp diff --git a/tests/search_disk_index.cpp b/tests/search_disk_index.cpp index 9dacbc394..b2f12ad60 100644 --- a/tests/search_disk_index.cpp +++ b/tests/search_disk_index.cpp @@ -149,9 +149,9 @@ int search_disk_index(int argc, char** argv) { std::vector node_list; diskann::cout << "Caching " << num_nodes_to_cache << " BFS nodes around medoid(s)" << std::endl; - _pFlashIndex->cache_bfs_levels(num_nodes_to_cache, node_list); - // _pFlashIndex->generate_cache_list_from_sample_queries( - // warmup_query_file, 15, 6, num_nodes_to_cache, num_threads, node_list); + //_pFlashIndex->cache_bfs_levels(num_nodes_to_cache, node_list); + _pFlashIndex->generate_cache_list_from_sample_queries( + warmup_query_file, 15, 6, num_nodes_to_cache, num_threads, node_list); _pFlashIndex->load_cache_list(node_list); node_list.clear(); node_list.shrink_to_fit(); diff --git a/tests/utils/CMakeLists.txt b/tests/utils/CMakeLists.txt index a8171e739..363913618 100644 --- a/tests/utils/CMakeLists.txt +++ b/tests/utils/CMakeLists.txt @@ -47,6 +47,15 @@ else() target_link_libraries(int8_to_float ${PROJECT_NAME}) endif() +add_executable(int8_to_float_scale int8_to_float_scale.cpp) +if(MSVC) + target_link_options(int8_to_float_scale PRIVATE /MACHINE:x64) + target_link_libraries(int8_to_float_scale debug ${CMAKE_LIBRARY_OUTPUT_DIRECTORY_DEBUG}/diskann_dll.lib) + target_link_libraries(int8_to_float_scale optimized 
${CMAKE_LIBRARY_OUTPUT_DIRECTORY_RELEASE}/diskann_dll.lib) +else() + target_link_libraries(int8_to_float_scale ${PROJECT_NAME}) +endif() + add_executable(uint8_to_float uint8_to_float.cpp) if(MSVC) target_link_options(uint8_to_float PRIVATE /MACHINE:x64) diff --git a/tests/utils/float_bin_to_int8.cpp b/tests/utils/float_bin_to_int8.cpp index 0620730a5..b7cf5e8f2 100644 --- a/tests/utils/float_bin_to_int8.cpp +++ b/tests/utils/float_bin_to_int8.cpp @@ -12,7 +12,7 @@ void block_convert(std::ofstream& writer, int8_t* write_buf, for (_u64 i = 0; i < npts; i++) { for (_u64 d = 0; d < ndims; d++) { write_buf[d + i * ndims] = - (int8_t)((read_buf[d + i * ndims] - bias) * (256.0 / scale)); + (int8_t)((read_buf[d + i * ndims] - bias) * (254.0 / scale)); } } writer.write((char*) write_buf, npts * ndims); diff --git a/tests/utils/int8_to_float_scale.cpp b/tests/utils/int8_to_float_scale.cpp new file mode 100644 index 000000000..1dff3edcc --- /dev/null +++ b/tests/utils/int8_to_float_scale.cpp @@ -0,0 +1,62 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. 
+
+#include <algorithm>
+#include <cstdlib>
+#include <fstream>
+#include <iostream>
+#include <vector>
+#include "utils.h"
+
+// Reads one block of `npts` points with `ndims` int8 coordinates each from
+// `reader` into `read_buf`, maps every value v to (v - bias) * scale, and
+// writes the resulting floats to `writer`.
+//
+// Both buffers must have room for at least npts * ndims elements. If the
+// read fails or comes up short, nothing is converted or written and `reader`
+// is left in its failed state so the caller can detect the problem.
+void block_convert(std::ofstream& writer, float* write_buf,
+                   std::ifstream& reader, int8_t* read_buf, _u64 npts,
+                   _u64 ndims, float bias, float scale) {
+  const _u64 nvals = npts * ndims;
+  reader.read((char*) read_buf, nvals * sizeof(int8_t));
+  if (!reader) {
+    // read_buf is only partially filled; do not emit a block built from
+    // stale data.
+    return;
+  }
+
+  // Points are stored contiguously, so the (point, dim) pair collapses to a
+  // single flat index.
+  for (_u64 i = 0; i < nvals; i++) {
+    write_buf[i] = ((float) read_buf[i] - bias) * scale;
+  }
+  writer.write((char*) write_buf, nvals * sizeof(float));
+}
+
+// Converts an int8 .bin file into a float .bin file, mapping each stored
+// value v to (v - bias) * scale.
+//
+// Usage: <prog> input-int8.bin output-float.bin bias scale
+//
+// The input starts with two 32-bit values (number of points, number of
+// dimensions) followed by npts * ndims int8 values. The output repeats the
+// same two header values followed by npts * ndims floats. Returns 0 on
+// success and -1 on bad arguments or any I/O failure.
+int main(int argc, char** argv) {
+  if (argc != 5) {
+    std::cout << "Usage: " << argv[0] << " input-int8.bin output-float.bin bias scale"
+              << std::endl;
+    return -1;
+  }
+
+  std::ifstream reader(argv[1], std::ios::binary);
+  if (!reader) {
+    std::cerr << "Error: could not open input file " << argv[1] << std::endl;
+    return -1;
+  }
+
+  _u32 npts_u32 = 0;
+  _u32 ndims_u32 = 0;
+  reader.read((char*) &npts_u32, sizeof(npts_u32));
+  reader.read((char*) &ndims_u32, sizeof(ndims_u32));
+  if (!reader) {
+    std::cerr << "Error: could not read header from " << argv[1] << std::endl;
+    return -1;
+  }
+
+  // _u64 rather than size_t so that the std::min calls below see identical
+  // argument types on every platform.
+  const _u64 npts = npts_u32;
+  const _u64 ndims = ndims_u32;
+  std::cout << "Dataset: #pts = " << npts << ", # dims = " << ndims
+            << std::endl;
+
+  const _u64 blk_size = 131072;
+  const _u64 nblks = ROUND_UP(npts, blk_size) / blk_size;
+
+  std::ofstream writer(argv[2], std::ios::binary);
+  if (!writer) {
+    std::cerr << "Error: could not open output file " << argv[2] << std::endl;
+    return -1;
+  }
+
+  // One block is the most that is ever resident, and small inputs need even
+  // less than a full block.
+  const _u64 buf_vals = std::min(npts, blk_size) * ndims;
+  std::vector<int8_t> read_buf(buf_vals);
+  std::vector<float>  write_buf(buf_vals);
+  const float bias = (float) atof(argv[3]);
+  const float scale = (float) atof(argv[4]);
+
+  writer.write((char*) &npts_u32, sizeof(npts_u32));
+  writer.write((char*) &ndims_u32, sizeof(ndims_u32));
+
+  for (_u64 i = 0; i < nblks; i++) {
+    // The last block may be shorter than blk_size.
+    const _u64 cblk_size = std::min(npts - i * blk_size, blk_size);
+    block_convert(writer, write_buf.data(), reader, read_buf.data(), cblk_size,
+                  ndims, bias, scale);
+    if (!reader) {
+      std::cerr << "Error: failed to read block #" << i << " from " << argv[1]
+                << " (file shorter than its header claims?)" << std::endl;
+      return -1;
+    }
+    if (!writer) {
+      std::cerr << "Error: failed to write block #" << i << " to " << argv[2]
+                << std::endl;
+      return -1;
+    }
+    std::cout << "Block #" << i << " written" << std::endl;
+  }
+
+  writer.close();
+  reader.close();
+  return 0;
+}