format conversion tools (#9)

* added bin to tsv format converter * added tool to convert float binary to int8 binary
microsoft · Jun 8, 2021 · 4b01372 · 4b01372
1 parent 90ebe5a
commit 4b01372
Show file tree

Hide file tree

Showing 3 changed files with 143 additions and 0 deletions.
diff --git a/tests/utils/CMakeLists.txt b/tests/utils/CMakeLists.txt
@@ -10,6 +10,13 @@ if(MSVC)
 	target_link_libraries(fvecs_to_bin optimized ${CMAKE_LIBRARY_OUTPUT_DIRECTORY_RELEASE}/diskann_dll.lib)
 endif()
 
+add_executable(float_bin_to_int8 float_bin_to_int8.cpp)  # tool built from float_bin_to_int8.cpp
+if(MSVC)  # MSVC only: force an x64 link and link the per-configuration diskann_dll.lib
+	target_link_options(float_bin_to_int8 PRIVATE /MACHINE:x64)
+	target_link_libraries(float_bin_to_int8 debug ${CMAKE_LIBRARY_OUTPUT_DIRECTORY_DEBUG}/diskann_dll.lib)
+	target_link_libraries(float_bin_to_int8 optimized ${CMAKE_LIBRARY_OUTPUT_DIRECTORY_RELEASE}/diskann_dll.lib)
+endif()
+
 add_executable(ivecs_to_bin ivecs_to_bin.cpp)
 if(MSVC)
 	target_link_options(ivecs_to_bin PRIVATE /MACHINE:x64)
@@ -24,6 +31,13 @@ if(MSVC)
 	target_link_libraries(tsv_to_bin optimized ${CMAKE_LIBRARY_OUTPUT_DIRECTORY_RELEASE}/diskann_dll.lib)
 endif()
 
+add_executable(bin_to_tsv bin_to_tsv.cpp)  # tool built from bin_to_tsv.cpp
+if(MSVC)  # MSVC only: force an x64 link and link the per-configuration diskann_dll.lib
+	target_link_options(bin_to_tsv PRIVATE /MACHINE:x64)
+	target_link_libraries(bin_to_tsv debug ${CMAKE_LIBRARY_OUTPUT_DIRECTORY_DEBUG}/diskann_dll.lib)
+	target_link_libraries(bin_to_tsv optimized ${CMAKE_LIBRARY_OUTPUT_DIRECTORY_RELEASE}/diskann_dll.lib)
+endif()
+
 add_executable(int8_to_float int8_to_float.cpp)
 if(MSVC)
 	target_link_options(int8_to_float PRIVATE /MACHINE:x64)

diff --git a/tests/utils/bin_to_tsv.cpp b/tests/utils/bin_to_tsv.cpp
@@ -0,0 +1,66 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT license.
+
+#include <iostream>
+#include "utils.h"
+
+/**
+ * Reads one block of `npts` points, each holding `ndims` values of type T,
+ * from `reader` and appends them to `writer` as tab-separated text, one
+ * point per line.
+ *
+ * @param writer   output text stream that receives the TSV rows
+ * @param reader   input binary stream positioned at the start of the block
+ * @param read_buf scratch buffer with room for at least npts * ndims values
+ * @param npts     number of points in this block
+ * @param ndims    number of values per point
+ */
+template<class T>
+void block_convert(std::ofstream& writer, std::ifstream& reader, T* read_buf,
+                   _u64 npts, _u64 ndims) {
+  // Size the read by the element type. Using sizeof(float) for 1-byte types
+  // would consume bytes that belong to the following blocks.
+  reader.read((char*) read_buf, npts * ndims * sizeof(T));
+
+  for (_u64 i = 0; i < npts; i++) {
+    const T* row = read_buf + i * ndims;
+    for (_u64 d = 0; d < ndims; d++) {
+      // Unary plus promotes int8_t/uint8_t to int so they are printed as
+      // numbers rather than raw characters; float values stay float.
+      writer << +row[d];
+      // Tab between values, newline after the last value of the point.
+      writer << (d + 1 < ndims ? '\t' : '\n');
+    }
+  }
+}
+
+/**
+ * Converts a binary vector file into tab-separated text.
+ *
+ * Arguments: <float/int8/uint8> input_bin output_tsv
+ *
+ * The input begins with two _u32 header values (number of points, then
+ * number of dimensions) followed by the point data. Points are converted in
+ * blocks of up to 131072 points through block_convert<T>().
+ *
+ * Returns 0 on success, and -1 when the arguments are invalid, a file cannot
+ * be opened, or the header cannot be read.
+ */
+int main(int argc, char** argv) {
+  if (argc != 4) {
+    std::cout << argv[0] << " <float/int8/uint8> input_bin output_tsv" << std::endl;
+    return -1;
+  }
+
+  const std::string type_string(argv[1]);
+  const bool        is_float = (type_string == std::string("float"));
+  const bool        is_int8 = (type_string == std::string("int8"));
+  const bool        is_uint8 = (type_string == std::string("uint8"));
+  if (!is_float && !is_int8 && !is_uint8) {
+    std::cerr << "Error: type not supported. Use float/int8/uint8" << std::endl;
+    // Stop here: continuing would only produce an output file with no rows.
+    return -1;
+  }
+
+  std::ifstream reader(argv[2], std::ios::binary);
+  if (!reader) {
+    std::cerr << "Error: could not open input file " << argv[2] << std::endl;
+    return -1;
+  }
+
+  // Header: number of points followed by number of dimensions.
+  _u32 npts_u32 = 0;
+  _u32 ndims_u32 = 0;
+  reader.read((char*) &npts_u32, sizeof(_u32));
+  reader.read((char*) &ndims_u32, sizeof(_u32));
+  if (!reader) {
+    std::cerr << "Error: could not read header from " << argv[2] << std::endl;
+    return -1;
+  }
+  size_t npts = npts_u32;
+  size_t ndims = ndims_u32;
+  std::cout << "Dataset: #pts = " << npts << ", # dims = " << ndims
+            << std::endl;
+
+  _u64 blk_size = 131072;
+  _u64 nblks = ROUND_UP(npts, blk_size) / blk_size;
+
+  std::ofstream writer(argv[3]);
+  if (!writer) {
+    std::cerr << "Error: could not open output file " << argv[3] << std::endl;
+    return -1;
+  }
+
+  // One buffer serves every element type; float is the widest one supported.
+  char* read_buf = new char[blk_size * ndims * sizeof(float)];
+  for (_u64 i = 0; i < nblks; i++) {
+    // The last block may hold fewer than blk_size points.
+    _u64 cblk_size = std::min(npts - i * blk_size, blk_size);
+    if (is_float)
+      block_convert<float>(writer, reader, (float*) read_buf, cblk_size, ndims);
+    else if (is_int8)
+      block_convert<int8_t>(writer, reader, (int8_t*) read_buf, cblk_size, ndims);
+    else
+      block_convert<uint8_t>(writer, reader, (uint8_t*) read_buf, cblk_size,
+                             ndims);
+    std::cout << "Block #" << i << " written" << std::endl;
+  }
+
+  delete[] read_buf;
+
+  writer.close();
+  reader.close();
+  return 0;
+}
diff --git a/tests/utils/float_bin_to_int8.cpp b/tests/utils/float_bin_to_int8.cpp
@@ -0,0 +1,63 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT license.
+
+#include <iostream>
+#include "utils.h"
+
+
+/**
+ * Reads one block of `npts` float points with `ndims` values each from
+ * `reader`, quantizes every value as (value - bias) * (256 / scale), and
+ * writes the resulting int8 block to `writer`.
+ *
+ * Results that fall outside the int8_t range are saturated to -128 or 127.
+ *
+ * @param writer    output binary stream that receives the int8 values
+ * @param write_buf scratch buffer with room for at least npts * ndims int8s
+ * @param reader    input binary stream positioned at the start of the block
+ * @param read_buf  scratch buffer with room for at least npts * ndims floats
+ * @param npts      number of points in this block
+ * @param ndims     number of values per point
+ * @param bias      value subtracted from every input before scaling
+ * @param scale     divisor of the 256 scaling factor; expected to be non-zero
+ */
+void block_convert(std::ofstream& writer, int8_t* write_buf,
+                   std::ifstream& reader, float* read_buf, _u64 npts,
+                   _u64 ndims, float bias, float scale) {
+  reader.read((char*) read_buf, npts * ndims * sizeof(float));
+
+  // The multiplier is the same for every value, so compute it once.
+  const double multiplier = 256.0 / scale;
+  const _u64   nvals = npts * ndims;
+  for (_u64 k = 0; k < nvals; k++) {
+    double quantized = (read_buf[k] - bias) * multiplier;
+    // Converting a floating-point value outside the int8_t range is
+    // undefined behavior, so saturate before the cast.
+    if (quantized > 127.0)
+      quantized = 127.0;
+    else if (quantized < -128.0)
+      quantized = -128.0;
+    write_buf[k] = (int8_t) quantized;
+  }
+  writer.write((char*) write_buf, nvals);
+}
+
+/**
+ * Converts a binary file of float vectors into a binary file of int8 vectors.
+ *
+ * Arguments: input_bin output_bin bias scale
+ *
+ * The input begins with two _u32 header values (number of points, then
+ * number of dimensions) followed by float data. The same header is copied to
+ * the output, followed by the values quantized by block_convert() in blocks
+ * of up to 131072 points.
+ *
+ * Returns 0 on success, and -1 when the arguments are invalid, a file cannot
+ * be opened, or the header cannot be read.
+ */
+int main(int argc, char** argv) {
+  if (argc != 5) {
+    // The second argument is a binary file, not a TSV file.
+    std::cout << "Usage: " << argv[0] << "  input_bin  output_bin  bias  scale"
+              << std::endl;
+    return -1;
+  }
+
+  float bias = (float) atof(argv[3]);
+  float scale = (float) atof(argv[4]);
+  if (scale == 0) {
+    // block_convert divides by scale; atof also yields 0 for unparsable text.
+    std::cerr << "Error: scale must be a non-zero number" << std::endl;
+    return -1;
+  }
+
+  std::ifstream reader(argv[1], std::ios::binary);
+  if (!reader) {
+    std::cerr << "Error: could not open input file " << argv[1] << std::endl;
+    return -1;
+  }
+
+  // Header: number of points followed by number of dimensions.
+  _u32 npts_u32 = 0;
+  _u32 ndims_u32 = 0;
+  reader.read((char*) &npts_u32, sizeof(_u32));
+  reader.read((char*) &ndims_u32, sizeof(_u32));
+  if (!reader) {
+    std::cerr << "Error: could not read header from " << argv[1] << std::endl;
+    return -1;
+  }
+  size_t npts = npts_u32;
+  size_t ndims = ndims_u32;
+  std::cout << "Dataset: #pts = " << npts << ", # dims = " << ndims
+            << std::endl;
+
+  _u64 blk_size = 131072;
+  _u64 nblks = ROUND_UP(npts, blk_size) / blk_size;
+
+  std::ofstream writer(argv[2], std::ios::binary);
+  if (!writer) {
+    std::cerr << "Error: could not open output file " << argv[2] << std::endl;
+    return -1;
+  }
+
+  auto read_buf = new float[blk_size * ndims];
+  auto write_buf = new int8_t[blk_size * ndims];
+
+  // The output keeps the same header as the input.
+  writer.write((char*) (&npts_u32), sizeof(_u32));
+  writer.write((char*) (&ndims_u32), sizeof(_u32));
+
+  for (_u64 i = 0; i < nblks; i++) {
+    // The last block may hold fewer than blk_size points.
+    _u64 cblk_size = std::min(npts - i * blk_size, blk_size);
+    block_convert(writer, write_buf, reader, read_buf, cblk_size, ndims, bias,
+                  scale);
+    std::cout << "Block #" << i << " written" << std::endl;
+  }
+
+  delete[] read_buf;
+  delete[] write_buf;
+
+  writer.close();
+  reader.close();
+  return 0;
+}