diff --git a/tests/utils/CMakeLists.txt b/tests/utils/CMakeLists.txt index 6f7cb32b8..e69722dcf 100644 --- a/tests/utils/CMakeLists.txt +++ b/tests/utils/CMakeLists.txt @@ -10,6 +10,13 @@ if(MSVC) target_link_libraries(fvecs_to_bin optimized ${CMAKE_LIBRARY_OUTPUT_DIRECTORY_RELEASE}/diskann_dll.lib) endif() +add_executable(float_bin_to_int8 float_bin_to_int8.cpp) +if(MSVC) + target_link_options(float_bin_to_int8 PRIVATE /MACHINE:x64) + target_link_libraries(float_bin_to_int8 debug ${CMAKE_LIBRARY_OUTPUT_DIRECTORY_DEBUG}/diskann_dll.lib) + target_link_libraries(float_bin_to_int8 optimized ${CMAKE_LIBRARY_OUTPUT_DIRECTORY_RELEASE}/diskann_dll.lib) +endif() + add_executable(ivecs_to_bin ivecs_to_bin.cpp) if(MSVC) target_link_options(ivecs_to_bin PRIVATE /MACHINE:x64) @@ -24,6 +31,13 @@ if(MSVC) target_link_libraries(tsv_to_bin optimized ${CMAKE_LIBRARY_OUTPUT_DIRECTORY_RELEASE}/diskann_dll.lib) endif() +add_executable(bin_to_tsv bin_to_tsv.cpp) +if(MSVC) + target_link_options(bin_to_tsv PRIVATE /MACHINE:x64) + target_link_libraries(bin_to_tsv debug ${CMAKE_LIBRARY_OUTPUT_DIRECTORY_DEBUG}/diskann_dll.lib) + target_link_libraries(bin_to_tsv optimized ${CMAKE_LIBRARY_OUTPUT_DIRECTORY_RELEASE}/diskann_dll.lib) +endif() + add_executable(int8_to_float int8_to_float.cpp) if(MSVC) target_link_options(int8_to_float PRIVATE /MACHINE:x64) diff --git a/tests/utils/bin_to_tsv.cpp b/tests/utils/bin_to_tsv.cpp new file mode 100644 index 000000000..37874e243 --- /dev/null +++ b/tests/utils/bin_to_tsv.cpp @@ -0,0 +1,66 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. 
+
+#include <iostream>
+#include "utils.h"
+
+// Reads npts * ndims elements of type T from `reader` into `read_buf`, then
+// writes them to `writer` as text: one point per line, tab-separated values.
+template<typename T>
+void block_convert(std::ofstream& writer, std::ifstream& reader, T* read_buf,
+                   _u64 npts, _u64 ndims) {
+  // Size the read by T: int8/uint8 elements are not sizeof(float) bytes wide.
+  reader.read((char*) read_buf, npts * ndims * sizeof(T));
+  for (_u64 i = 0; i < npts; i++) {
+    for (_u64 d = 0; d < ndims; d++) {
+      // Unary + promotes 8-bit types to int so they print as numbers.
+      writer << +read_buf[d + i * ndims] << (d + 1 < ndims ? '\t' : '\n');
+    }
+  }
+}
+
+// Usage: <prog> <float/int8/uint8> input_bin output_tsv. input_bin starts with
+// two _u32 header values (#pts, #dims); the point data follows them.
+int main(int argc, char** argv) {
+  if (argc != 4) {
+    std::cout << argv[0] << " <float/int8/uint8> input_bin output_tsv"
+              << std::endl;
+    exit(-1);
+  }
+  const std::string type_string(argv[1]);
+  if (type_string != "float" && type_string != "int8" &&
+      type_string != "uint8") {
+    std::cerr << "Error: type not supported. Use float/int8/uint8" << std::endl;
+    exit(-1);  // previously execution continued with an unsupported type
+  }
+
+  std::ifstream reader(argv[2], std::ios::binary);
+  _u32 npts_u32 = 0;
+  _u32 ndims_u32 = 0;
+  reader.read((char*) &npts_u32, sizeof(_u32));
+  reader.read((char*) &ndims_u32, sizeof(_u32));
+  if (!reader) {
+    std::cerr << "Error: cannot read header of " << argv[2] << std::endl;
+    exit(-1);
+  }
+  const _u64 npts = npts_u32;
+  const _u64 ndims = ndims_u32;
+  std::cout << "Dataset: #pts = " << npts << ", # dims = " << ndims
+            << std::endl;
+  const _u64 blk_size = 131072;
+  const _u64 nblks = ROUND_UP(npts, blk_size) / blk_size;
+  std::ofstream writer(argv[3]);
+  // One buffer shared by all types, sized for the widest element (float).
+  char* read_buf = new char[blk_size * ndims * sizeof(float)];
+  for (_u64 i = 0; i < nblks; i++) {
+    const _u64 cblk_size = std::min(npts - i * blk_size, blk_size);
+    if (type_string == "float")
+      block_convert(writer, reader, (float*) read_buf, cblk_size, ndims);
+    else if (type_string == "int8")
+      block_convert(writer, reader, (int8_t*) read_buf, cblk_size, ndims);
+    else
+      block_convert(writer, reader, (uint8_t*) read_buf, cblk_size, ndims);
+    std::cout << "Block #" << i << " written" << std::endl;
+  }
+  delete[] read_buf;
+}
diff --git a/tests/utils/float_bin_to_int8.cpp b/tests/utils/float_bin_to_int8.cpp
new file mode 100644
index 000000000..4f422a233
--- /dev/null
+++ 
b/tests/utils/float_bin_to_int8.cpp
@@ -0,0 +1,87 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT license.
+
+#include <iostream>
+#include "utils.h"
+
+// Quantizes one value as (value - bias) * factor, truncated toward zero and
+// saturated to [-128, 127]. Saturating first avoids the undefined behavior of
+// converting an out-of-range floating-point value to an integer type.
+static int8_t quantize_to_int8(float value, float bias, double factor) {
+  const double scaled = (value - bias) * factor;
+  if (!(scaled > -128.0))  // negated test so that NaN also lands here
+    return -128;
+  if (scaled > 127.0)
+    return 127;
+  return (int8_t) scaled;
+}
+
+// Reads npts * ndims floats from `reader` into `read_buf`, quantizes each one
+// as (value - bias) * (256.0 / scale) into `write_buf`, and writes the
+// resulting npts * ndims bytes to `writer`.
+void block_convert(std::ofstream& writer, int8_t* write_buf,
+                   std::ifstream& reader, float* read_buf, _u64 npts,
+                   _u64 ndims, float bias, float scale) {
+  reader.read((char*) read_buf, npts * ndims * sizeof(float));
+
+  const double factor = 256.0 / scale;  // loop-invariant, computed once
+  const _u64 nvals = npts * ndims;
+  for (_u64 k = 0; k < nvals; k++) {
+    write_buf[k] = quantize_to_int8(read_buf[k], bias, factor);
+  }
+  writer.write((char*) write_buf, nvals);
+}
+
+// Usage: <prog> input_bin output_bin bias scale
+// Copies the two _u32 header values (#pts, #dims) of input_bin to output_bin
+// and converts the float data after them to int8 in blocks of at most 131072
+// points.
+int main(int argc, char** argv) {
+  if (argc != 5) {
+    std::cout << "Usage: " << argv[0] << " input_bin output_bin bias scale"
+              << std::endl;
+    exit(-1);
+  }
+  const float bias = (float) atof(argv[3]);
+  const float scale = (float) atof(argv[4]);
+  if (scale == 0) {
+    // atof also yields 0 for unparsable text; either way 256.0 / scale is
+    // not a usable factor.
+    std::cerr << "Error: scale must be a non-zero number" << std::endl;
+    exit(-1);
+  }
+
+  std::ifstream reader(argv[1], std::ios::binary);
+  _u32 npts_u32 = 0;
+  _u32 ndims_u32 = 0;
+  reader.read((char*) &npts_u32, sizeof(_u32));
+  reader.read((char*) &ndims_u32, sizeof(_u32));
+  if (!reader) {
+    std::cerr << "Error: cannot read header of " << argv[1] << std::endl;
+    exit(-1);
+  }
+  const _u64 npts = npts_u32;
+  const _u64 ndims = ndims_u32;
+  std::cout << "Dataset: #pts = " << npts << ", # dims = " << ndims
+            << std::endl;
+
+  const _u64 blk_size = 131072;
+  const _u64 nblks = ROUND_UP(npts, blk_size) / blk_size;
+
+  std::ofstream writer(argv[2], std::ios::binary);
+  auto read_buf = new float[blk_size * ndims];
+  auto write_buf = new int8_t[blk_size * ndims];
+
+  writer.write((char*) (&npts_u32), sizeof(_u32));
+  writer.write((char*) (&ndims_u32), sizeof(_u32));
+
+  for (_u64 i = 0; i < nblks; i++) {
+    const _u64 cblk_size = std::min(npts - i * blk_size, blk_size);
+    block_convert(writer, write_buf, reader, read_buf, cblk_size, ndims, bias,
+                  scale);
+    std::cout << "Block #" << i << " written" << std::endl;
+  }
+
+  delete[] read_buf;
+  delete[] write_buf;
+}