Skip to content

Commit

Permalink
format conversion tools (#9)
Browse files Browse the repository at this point in the history
* added tsv to bin format converter

* added tool to convert float binary to int8 binary
  • Loading branch information
harsha-simhadri authored Jun 8, 2021
1 parent 90ebe5a commit 4b01372
Show file tree
Hide file tree
Showing 3 changed files with 143 additions and 0 deletions.
14 changes: 14 additions & 0 deletions tests/utils/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,13 @@ if(MSVC)
target_link_libraries(fvecs_to_bin optimized ${CMAKE_LIBRARY_OUTPUT_DIRECTORY_RELEASE}/diskann_dll.lib)
endif()

# Tool that converts a float .bin file into an int8 .bin file.
add_executable(float_bin_to_int8 float_bin_to_int8.cpp)
if(MSVC)
  # MSVC only: target x64 and link the diskann_dll.lib that matches the
  # build configuration (debug vs. optimized).
  target_link_options(float_bin_to_int8 PRIVATE /MACHINE:x64)
  target_link_libraries(float_bin_to_int8
    debug ${CMAKE_LIBRARY_OUTPUT_DIRECTORY_DEBUG}/diskann_dll.lib
    optimized ${CMAKE_LIBRARY_OUTPUT_DIRECTORY_RELEASE}/diskann_dll.lib)
endif()

add_executable(ivecs_to_bin ivecs_to_bin.cpp)
if(MSVC)
target_link_options(ivecs_to_bin PRIVATE /MACHINE:x64)
Expand All @@ -24,6 +31,13 @@ if(MSVC)
target_link_libraries(tsv_to_bin optimized ${CMAKE_LIBRARY_OUTPUT_DIRECTORY_RELEASE}/diskann_dll.lib)
endif()

# Tool that dumps a .bin file as tab-separated text.
add_executable(bin_to_tsv bin_to_tsv.cpp)
if(MSVC)
  # MSVC only: target x64 and link the diskann_dll.lib that matches the
  # build configuration (debug vs. optimized).
  target_link_options(bin_to_tsv PRIVATE /MACHINE:x64)
  target_link_libraries(bin_to_tsv
    debug ${CMAKE_LIBRARY_OUTPUT_DIRECTORY_DEBUG}/diskann_dll.lib
    optimized ${CMAKE_LIBRARY_OUTPUT_DIRECTORY_RELEASE}/diskann_dll.lib)
endif()

add_executable(int8_to_float int8_to_float.cpp)
if(MSVC)
target_link_options(int8_to_float PRIVATE /MACHINE:x64)
Expand Down
66 changes: 66 additions & 0 deletions tests/utils/bin_to_tsv.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.

#include <iostream>
#include "utils.h"

// Reads one block of `npts` points, each with `ndims` coordinates of type T,
// from `reader` into `read_buf`, then writes the block to `writer` as text:
// one point per line, coordinates separated by tabs.
//
// writer   - output text stream receiving the TSV rows.
// reader   - input binary stream positioned at the start of the block.
// read_buf - scratch buffer with room for at least npts * ndims elements of T.
// npts     - number of points in this block.
// ndims    - number of coordinates per point.
template<class T>
void block_convert(std::ofstream& writer, std::ifstream& reader, T* read_buf,
                   _u64 npts, _u64 ndims) {
  // The byte count must follow the element type; using sizeof(float) for
  // int8/uint8 data would consume four times too many bytes per block.
  reader.read(reinterpret_cast<char*>(read_buf), npts * ndims * sizeof(T));

  for (_u64 i = 0; i < npts; i++) {
    const T* point = read_buf + i * ndims;
    for (_u64 d = 0; d < ndims; d++) {
      // Unary plus promotes int8_t/uint8_t to int so they are printed as
      // numbers rather than as raw characters; float values are unaffected.
      writer << +point[d];
      writer << ((d + 1 < ndims) ? '\t' : '\n');
    }
  }
}

// Converts a binary vector file to tab-separated text.
//
// Usage: <prog> <float/int8/uint8> input_bin output_tsv
//
// The input starts with two 32-bit values (number of points, number of
// dimensions) followed by the point data. The data is processed in blocks of
// up to 131072 points. Returns 0 on success and -1 on bad arguments or I/O
// setup failure.
int main(int argc, char** argv) {
  if (argc != 4) {
    std::cout << argv[0] << " <float/int8/uint8> input_bin output_tsv" << std::endl;
    return -1;
  }

  const std::string type_string(argv[1]);
  const bool        is_float = (type_string == "float");
  const bool        is_int8 = (type_string == "int8");
  const bool        is_uint8 = (type_string == "uint8");
  if (!is_float && !is_int8 && !is_uint8) {
    std::cerr << "Error: type not supported. Use float/int8/uint8" << std::endl;
    // Stop here: none of the conversion branches below would run.
    return -1;
  }

  std::ifstream reader(argv[2], std::ios::binary);
  if (!reader.is_open()) {
    std::cerr << "Error: could not open input file " << argv[2] << std::endl;
    return -1;
  }

  _u32 npts_u32 = 0;
  _u32 ndims_u32 = 0;
  reader.read(reinterpret_cast<char*>(&npts_u32), sizeof(npts_u32));
  reader.read(reinterpret_cast<char*>(&ndims_u32), sizeof(ndims_u32));
  if (!reader) {
    std::cerr << "Error: could not read header from " << argv[2] << std::endl;
    return -1;
  }
  size_t npts = npts_u32;
  size_t ndims = ndims_u32;
  std::cout << "Dataset: #pts = " << npts << ", # dims = " << ndims
            << std::endl;

  _u64 blk_size = 131072;
  _u64 nblks = ROUND_UP(npts, blk_size) / blk_size;

  std::ofstream writer(argv[3]);
  if (!writer.is_open()) {
    std::cerr << "Error: could not open output file " << argv[3] << std::endl;
    return -1;
  }

  // Sized for the widest supported element type (float) so the same buffer
  // can be reused for every type.
  char* read_buf = new char[blk_size * ndims * sizeof(float)];
  for (_u64 i = 0; i < nblks; i++) {
    // The last block may be shorter than blk_size.
    _u64 cblk_size = std::min(npts - i * blk_size, blk_size);
    if (is_float)
      block_convert<float>(writer, reader, (float*) read_buf, cblk_size,
                           ndims);
    else if (is_int8)
      block_convert<int8_t>(writer, reader, (int8_t*) read_buf, cblk_size,
                            ndims);
    else
      block_convert<uint8_t>(writer, reader, (uint8_t*) read_buf, cblk_size,
                             ndims);
    std::cout << "Block #" << i << " written" << std::endl;
  }

  delete[] read_buf;

  writer.close();
  reader.close();
  return 0;
}
63 changes: 63 additions & 0 deletions tests/utils/float_bin_to_int8.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.

#include <iostream>
#include "utils.h"


void block_convert(std::ofstream& writer, int8_t* write_buf,
std::ifstream& reader, float* read_buf, _u64 npts,
_u64 ndims, float bias, float scale) {
reader.read((char*) read_buf, npts * ndims * sizeof(float));

for (_u64 i = 0; i < npts; i++) {
for (_u64 d = 0; d < ndims; d++) {
write_buf[d + i * ndims] =
(int8_t)((read_buf[d + i * ndims] - bias) * (256.0 / scale));
}
}
writer.write((char*) write_buf, npts * ndims);
}

// Converts a float binary vector file to an int8 binary vector file.
//
// Usage: <prog> input_bin output_int8_bin bias scale
//
// The input starts with two 32-bit values (number of points, number of
// dimensions) followed by float data. The same header is copied to the
// output, followed by the quantized int8 data produced by block_convert.
// The data is processed in blocks of up to 131072 points. Returns 0 on
// success and -1 on bad arguments or I/O setup failure.
int main(int argc, char** argv) {
  if (argc != 5) {
    // The output is a binary int8 file, not TSV.
    std::cout << "Usage: " << argv[0]
              << " input_bin output_int8_bin bias scale" << std::endl;
    return -1;
  }

  const float bias = static_cast<float>(atof(argv[3]));
  const float scale = static_cast<float>(atof(argv[4]));
  if (scale == 0.0f) {
    // block_convert divides by scale.
    std::cerr << "Error: scale must be non-zero" << std::endl;
    return -1;
  }

  std::ifstream reader(argv[1], std::ios::binary);
  if (!reader.is_open()) {
    std::cerr << "Error: could not open input file " << argv[1] << std::endl;
    return -1;
  }

  _u32 npts_u32 = 0;
  _u32 ndims_u32 = 0;
  reader.read(reinterpret_cast<char*>(&npts_u32), sizeof(npts_u32));
  reader.read(reinterpret_cast<char*>(&ndims_u32), sizeof(ndims_u32));
  if (!reader) {
    std::cerr << "Error: could not read header from " << argv[1] << std::endl;
    return -1;
  }
  size_t npts = npts_u32;
  size_t ndims = ndims_u32;
  std::cout << "Dataset: #pts = " << npts << ", # dims = " << ndims
            << std::endl;

  _u64 blk_size = 131072;
  _u64 nblks = ROUND_UP(npts, blk_size) / blk_size;

  std::ofstream writer(argv[2], std::ios::binary);
  if (!writer.is_open()) {
    std::cerr << "Error: could not open output file " << argv[2] << std::endl;
    return -1;
  }

  auto read_buf = new float[blk_size * ndims];
  auto write_buf = new int8_t[blk_size * ndims];

  // The output keeps the same header as the input.
  writer.write(reinterpret_cast<const char*>(&npts_u32), sizeof(npts_u32));
  writer.write(reinterpret_cast<const char*>(&ndims_u32), sizeof(ndims_u32));

  for (_u64 i = 0; i < nblks; i++) {
    // The last block may be shorter than blk_size.
    _u64 cblk_size = std::min(npts - i * blk_size, blk_size);
    block_convert(writer, write_buf, reader, read_buf, cblk_size, ndims, bias,
                  scale);
    std::cout << "Block #" << i << " written" << std::endl;
  }

  delete[] read_buf;
  delete[] write_buf;

  writer.close();
  reader.close();
  return 0;
}

0 comments on commit 4b01372

Please sign in to comment.