From 13df0cf7c7ae32de32b1ea1634fd71afbb683af7 Mon Sep 17 00:00:00 2001 From: rakri <78582691+rakri@users.noreply.github.com> Date: Tue, 6 Feb 2024 18:09:30 +0530 Subject: [PATCH] Rakri/cosine bug fix (#450) * compiles, but need to verify * fixed windows compiler warning * minor typo * added cosine unit test with unnormalized data * minor typo in user prompt cosine/l2 * cosine was already supported in groundtruth, edited the message to say so * clang-format --------- Co-authored-by: rakri --- .github/actions/generate-random/action.yml | 3 ++ .github/workflows/disk-pq.yml | 10 +++++ apps/build_disk_index.cpp | 2 + apps/utils/compute_groundtruth.cpp | 3 +- apps/utils/rand_data_gen.cpp | 52 +++++++++++++++++----- src/disk_utils.cpp | 32 ++++++++++--- src/pq_flash_index.cpp | 27 ++++++----- unit_tester.sh | 16 +++---- 8 files changed, 106 insertions(+), 39 deletions(-) diff --git a/.github/actions/generate-random/action.yml b/.github/actions/generate-random/action.yml index 297209d7b..2755067df 100644 --- a/.github/actions/generate-random/action.yml +++ b/.github/actions/generate-random/action.yml @@ -9,11 +9,13 @@ runs: echo "Generating random vectors for index" dist/bin/rand_data_gen --data_type float --output_file data/rand_float_10D_10K_norm1.0.bin -D 10 -N 10000 --norm 1.0 + dist/bin/rand_data_gen --data_type float --output_file data/rand_float_10D_10K_unnorm.bin -D 10 -N 10000 --rand_scaling 2.0 dist/bin/rand_data_gen --data_type int8 --output_file data/rand_int8_10D_10K_norm50.0.bin -D 10 -N 10000 --norm 50.0 dist/bin/rand_data_gen --data_type uint8 --output_file data/rand_uint8_10D_10K_norm50.0.bin -D 10 -N 10000 --norm 50.0 echo "Generating random vectors for query" dist/bin/rand_data_gen --data_type float --output_file data/rand_float_10D_1K_norm1.0.bin -D 10 -N 1000 --norm 1.0 + dist/bin/rand_data_gen --data_type float --output_file data/rand_float_10D_1K_unnorm.bin -D 10 -N 1000 --rand_scaling 2.0 dist/bin/rand_data_gen --data_type int8 --output_file 
data/rand_int8_10D_1K_norm50.0.bin -D 10 -N 1000 --norm 50.0 dist/bin/rand_data_gen --data_type uint8 --output_file data/rand_uint8_10D_1K_norm50.0.bin -D 10 -N 1000 --norm 50.0 @@ -21,6 +23,7 @@ runs: dist/bin/compute_groundtruth --data_type float --dist_fn l2 --base_file data/rand_float_10D_10K_norm1.0.bin --query_file data/rand_float_10D_1K_norm1.0.bin --gt_file data/l2_rand_float_10D_10K_norm1.0_10D_1K_norm1.0_gt100 --K 100 dist/bin/compute_groundtruth --data_type float --dist_fn mips --base_file data/rand_float_10D_10K_norm1.0.bin --query_file data/rand_float_10D_1K_norm1.0.bin --gt_file data/mips_rand_float_10D_10K_norm1.0_10D_1K_norm1.0_gt100 --K 100 dist/bin/compute_groundtruth --data_type float --dist_fn cosine --base_file data/rand_float_10D_10K_norm1.0.bin --query_file data/rand_float_10D_1K_norm1.0.bin --gt_file data/cosine_rand_float_10D_10K_norm1.0_10D_1K_norm1.0_gt100 --K 100 + dist/bin/compute_groundtruth --data_type float --dist_fn cosine --base_file data/rand_float_10D_10K_unnorm.bin --query_file data/rand_float_10D_1K_unnorm.bin --gt_file data/cosine_rand_float_10D_10K_unnorm_10D_1K_unnorm_gt100 --K 100 echo "Computing ground truth for int8s across l2, mips, and cosine distance functions" dist/bin/compute_groundtruth --data_type int8 --dist_fn l2 --base_file data/rand_int8_10D_10K_norm50.0.bin --query_file data/rand_int8_10D_1K_norm50.0.bin --gt_file data/l2_rand_int8_10D_10K_norm50.0_10D_1K_norm50.0_gt100 --K 100 diff --git a/.github/workflows/disk-pq.yml b/.github/workflows/disk-pq.yml index 35c662184..6e71e7999 100644 --- a/.github/workflows/disk-pq.yml +++ b/.github/workflows/disk-pq.yml @@ -34,6 +34,11 @@ jobs: run: | dist/bin/build_disk_index --data_type float --dist_fn l2 --data_path data/rand_float_10D_10K_norm1.0.bin --index_path_prefix data/disk_index_l2_rand_float_10D_10K_norm1.0_diskfull_oneshot -R 16 -L 32 -B 0.00003 -M 1 dist/bin/search_disk_index --data_type float --dist_fn l2 --fail_if_recall_below 70 --index_path_prefix 
data/disk_index_l2_rand_float_10D_10K_norm1.0_diskfull_oneshot --result_path /tmp/res --query_file data/rand_float_10D_1K_norm1.0.bin --gt_file data/l2_rand_float_10D_10K_norm1.0_10D_1K_norm1.0_gt100 --recall_at 5 -L 5 12 -W 2 --num_nodes_to_cache 10 -T 16 + - name: build and search disk index (one shot graph build, cosine, no diskPQ) (float) + if: success() || failure() + run: | + dist/bin/build_disk_index --data_type float --dist_fn cosine --data_path data/rand_float_10D_10K_unnorm.bin --index_path_prefix data/disk_index_cosine_rand_float_10D_10K_unnorm_diskfull_oneshot -R 16 -L 32 -B 0.00003 -M 1 + dist/bin/search_disk_index --data_type float --dist_fn cosine --fail_if_recall_below 70 --index_path_prefix data/disk_index_cosine_rand_float_10D_10K_unnorm_diskfull_oneshot --result_path /tmp/res --query_file data/rand_float_10D_1K_unnorm.bin --gt_file data/cosine_rand_float_10D_10K_unnorm_10D_1K_unnorm_gt100 --recall_at 5 -L 5 12 -W 2 --num_nodes_to_cache 10 -T 16 - name: build and search disk index (one shot graph build, L2, no diskPQ) (int8) if: success() || failure() run: | @@ -66,6 +71,11 @@ jobs: run: | dist/bin/build_disk_index --data_type float --dist_fn l2 --data_path data/rand_float_10D_10K_norm1.0.bin --index_path_prefix data/disk_index_l2_rand_float_10D_10K_norm1.0_diskfull_sharded -R 16 -L 32 -B 0.00003 -M 0.00006 dist/bin/search_disk_index --data_type float --dist_fn l2 --fail_if_recall_below 70 --index_path_prefix data/disk_index_l2_rand_float_10D_10K_norm1.0_diskfull_sharded --result_path /tmp/res --query_file data/rand_float_10D_1K_norm1.0.bin --gt_file data/l2_rand_float_10D_10K_norm1.0_10D_1K_norm1.0_gt100 --recall_at 5 -L 5 12 -W 2 --num_nodes_to_cache 10 -T 16 + - name: build and search disk index (sharded graph build, cosine, no diskPQ) (float) + if: success() || failure() + run: | + dist/bin/build_disk_index --data_type float --dist_fn cosine --data_path data/rand_float_10D_10K_unnorm.bin --index_path_prefix 
data/disk_index_cosine_rand_float_10D_10K_unnorm_diskfull_sharded -R 16 -L 32 -B 0.00003 -M 0.00006 + dist/bin/search_disk_index --data_type float --dist_fn cosine --fail_if_recall_below 70 --index_path_prefix data/disk_index_cosine_rand_float_10D_10K_unnorm_diskfull_sharded --result_path /tmp/res --query_file data/rand_float_10D_1K_unnorm.bin --gt_file data/cosine_rand_float_10D_10K_unnorm_10D_1K_unnorm_gt100 --recall_at 5 -L 5 12 -W 2 --num_nodes_to_cache 10 -T 16 - name: build and search disk index (sharded graph build, L2, no diskPQ) (int8) run: | dist/bin/build_disk_index --data_type int8 --dist_fn l2 --data_path data/rand_int8_10D_10K_norm50.0.bin --index_path_prefix data/disk_index_l2_rand_int8_10D_10K_norm50.0_diskfull_sharded -R 16 -L 32 -B 0.00003 -M 0.00006 diff --git a/apps/build_disk_index.cpp b/apps/build_disk_index.cpp index b617a5f4a..f48b61726 100644 --- a/apps/build_disk_index.cpp +++ b/apps/build_disk_index.cpp @@ -107,6 +107,8 @@ int main(int argc, char **argv) metric = diskann::Metric::L2; else if (dist_fn == std::string("mips")) metric = diskann::Metric::INNER_PRODUCT; + else if (dist_fn == std::string("cosine")) + metric = diskann::Metric::COSINE; else { std::cout << "Error. 
Only l2 and mips distance functions are supported" << std::endl; diff --git a/apps/utils/compute_groundtruth.cpp b/apps/utils/compute_groundtruth.cpp index f33a26b84..da32fd7c6 100644 --- a/apps/utils/compute_groundtruth.cpp +++ b/apps/utils/compute_groundtruth.cpp @@ -499,7 +499,8 @@ int main(int argc, char **argv) desc.add_options()("help,h", "Print information on arguments"); desc.add_options()("data_type", po::value(&data_type)->required(), "data type "); - desc.add_options()("dist_fn", po::value(&dist_fn)->required(), "distance function "); + desc.add_options()("dist_fn", po::value(&dist_fn)->required(), + "distance function "); desc.add_options()("base_file", po::value(&base_file)->required(), "File containing the base vectors in binary format"); desc.add_options()("query_file", po::value(&query_file)->required(), diff --git a/apps/utils/rand_data_gen.cpp b/apps/utils/rand_data_gen.cpp index a6f9305c8..e89ede800 100644 --- a/apps/utils/rand_data_gen.cpp +++ b/apps/utils/rand_data_gen.cpp @@ -11,23 +11,31 @@ namespace po = boost::program_options; -int block_write_float(std::ofstream &writer, size_t ndims, size_t npts, float norm) +int block_write_float(std::ofstream &writer, size_t ndims, size_t npts, bool normalization, float norm, + float rand_scale) { auto vec = new float[ndims]; std::random_device rd{}; std::mt19937 gen{rd()}; std::normal_distribution<> normal_rand{0, 1}; + std::uniform_real_distribution<> unif_dis(1.0, rand_scale); for (size_t i = 0; i < npts; i++) { float sum = 0; + float scale = 1.0f; + if (rand_scale > 1.0f) + scale = (float)unif_dis(gen); for (size_t d = 0; d < ndims; ++d) - vec[d] = (float)normal_rand(gen); - for (size_t d = 0; d < ndims; ++d) - sum += vec[d] * vec[d]; - for (size_t d = 0; d < ndims; ++d) - vec[d] = vec[d] * norm / std::sqrt(sum); + vec[d] = scale * (float)normal_rand(gen); + if (normalization) + { + for (size_t d = 0; d < ndims; ++d) + sum += vec[d] * vec[d]; + for (size_t d = 0; d < ndims; ++d) + vec[d] = vec[d] * 
norm / std::sqrt(sum); + } writer.write((char *)vec, ndims * sizeof(float)); } @@ -104,8 +112,8 @@ int main(int argc, char **argv) { std::string data_type, output_file; size_t ndims, npts; - float norm; - + float norm, rand_scaling; + bool normalization = false; try { po::options_description desc{"Arguments"}; @@ -117,7 +125,11 @@ int main(int argc, char **argv) "File name for saving the random vectors"); desc.add_options()("ndims,D", po::value(&ndims)->required(), "Dimensoinality of the vector"); desc.add_options()("npts,N", po::value(&npts)->required(), "Number of vectors"); - desc.add_options()("norm", po::value(&norm)->required(), "Norm of the vectors"); + desc.add_options()("norm", po::value(&norm)->default_value(-1.0f), + "Norm of the vectors (if not specified, vectors are not normalized)"); + desc.add_options()("rand_scaling", po::value(&rand_scaling)->default_value(1.0f), + "Each vector will be scaled (if not explicitly normalized) by a factor randomly chosen from " + "[1, rand_scale]. Only applicable for floating point data"); po::variables_map vm; po::store(po::parse_command_line(argc, argv, desc), vm); if (vm.count("help")) @@ -139,9 +151,20 @@ int main(int argc, char **argv) return -1; } - if (norm <= 0.0) + if (norm > 0.0) + { + normalization = true; + } + + if (rand_scaling < 1.0) + { + std::cout << "We will only scale the vector norms randomly in [1, value], so value must be >= 1." << std::endl; + return -1; + } + + if ((rand_scaling > 1.0) && (normalization == true)) { - std::cerr << "Error: Norm must be a positive number" << std::endl; + std::cout << "Data cannot be normalized and randomly scaled at same time. Use one or the other." << std::endl; return -1; } @@ -155,6 +178,11 @@ int main(int argc, char **argv) << std::endl; return -1; } + if (rand_scaling > 1.0) + { + std::cout << "Data scaling only supported for floating point data." 
<< std::endl; + return -1; + } } try @@ -177,7 +205,7 @@ int main(int argc, char **argv) size_t cblk_size = std::min(npts - i * blk_size, blk_size); if (data_type == std::string("float")) { - ret = block_write_float(writer, ndims, cblk_size, norm); + ret = block_write_float(writer, ndims, cblk_size, normalization, norm, rand_scaling); } else if (data_type == std::string("int8")) { diff --git a/src/disk_utils.cpp b/src/disk_utils.cpp index 624eecc65..016560217 100644 --- a/src/disk_utils.cpp +++ b/src/disk_utils.cpp @@ -1129,11 +1129,12 @@ int build_disk_index(const char *dataFilePath, const char *indexFilePath, const return -1; } - if (!std::is_same::value && compareMetric == diskann::Metric::INNER_PRODUCT) + if (!std::is_same::value && + (compareMetric == diskann::Metric::INNER_PRODUCT || compareMetric == diskann::Metric::COSINE)) { std::stringstream stream; - stream << "DiskANN currently only supports floating point data for Max " - "Inner Product Search. " + stream << "Disk-index build currently only supports floating point data for Max " + "Inner Product Search/ cosine similarity. " << std::endl; throw diskann::ANNException(stream.str(), -1); } @@ -1195,6 +1196,10 @@ int build_disk_index(const char *dataFilePath, const char *indexFilePath, const std::string disk_pq_pivots_path = index_prefix_path + "_disk.index_pq_pivots.bin"; // optional, used if disk index must store pq data std::string disk_pq_compressed_vectors_path = index_prefix_path + "_disk.index_pq_compressed.bin"; + std::string prepped_base = + index_prefix_path + + "_prepped_base.bin"; // temp file for storing pre-processed base file for cosine/ mips metrics + bool created_temp_file_for_processed_data = false; // output a new base file which contains extra dimension with sqrt(1 - // ||x||^2/M^2) for every x, M is max norm of all points. 
Extra space on @@ -1205,14 +1210,26 @@ int build_disk_index(const char *dataFilePath, const char *indexFilePath, const std::cout << "Using Inner Product search, so need to pre-process base " "data into temp file. Please ensure there is additional " "(n*(d+1)*4) bytes for storing pre-processed base vectors, " - "apart from the intermin indices and final index." + "apart from the interim indices created by DiskANN and the final index." << std::endl; - std::string prepped_base = index_prefix_path + "_prepped_base.bin"; data_file_to_use = prepped_base; float max_norm_of_base = diskann::prepare_base_for_inner_products(base_file, prepped_base); std::string norm_file = disk_index_path + "_max_base_norm.bin"; diskann::save_bin(norm_file, &max_norm_of_base, 1, 1); diskann::cout << timer.elapsed_seconds_for_step("preprocessing data for inner product") << std::endl; + created_temp_file_for_processed_data = true; + } + else if (compareMetric == diskann::Metric::COSINE) + { + Timer timer; + std::cout << "Normalizing data for cosine to temporary file, please ensure there is additional " + "(n*d*4) bytes for storing normalized base vectors, " + "apart from the interim indices created by DiskANN and the final index." + << std::endl; + data_file_to_use = prepped_base; + diskann::normalize_data_file(base_file, prepped_base); + diskann::cout << timer.elapsed_seconds_for_step("preprocessing data for cosine") << std::endl; + created_temp_file_for_processed_data = true; } uint32_t R = (uint32_t)atoi(param_list[0].c_str()); @@ -1304,7 +1321,7 @@ int build_disk_index(const char *dataFilePath, const char *indexFilePath, const #if defined(DISKANN_RELEASE_UNUSED_TCMALLOC_MEMORY_AT_CHECKPOINTS) && defined(DISKANN_BUILD) MallocExtension::instance()->ReleaseFreeMemory(); #endif - + // Whether it is cosine or inner product, we still use the L2 metric due to the pre-processing.
timer.reset(); diskann::build_merged_vamana_index(data_file_to_use.c_str(), diskann::Metric::L2, L, R, p_val, indexing_ram_budget, mem_index_path, medoids_path, centroids_path, @@ -1345,7 +1362,8 @@ int build_disk_index(const char *dataFilePath, const char *indexFilePath, const std::remove(augmented_labels_file.c_str()); std::remove(labels_file_to_use.c_str()); } - + if (created_temp_file_for_processed_data) + std::remove(prepped_base.c_str()); std::remove(mem_index_path.c_str()); if (use_disk_pq) std::remove(disk_pq_compressed_vectors_path.c_str()); diff --git a/src/pq_flash_index.cpp b/src/pq_flash_index.cpp index ed3a0085e..860ce58f4 100644 --- a/src/pq_flash_index.cpp +++ b/src/pq_flash_index.cpp @@ -32,14 +32,16 @@ template PQFlashIndex::PQFlashIndex(std::shared_ptr &fileReader, diskann::Metric m) : reader(fileReader), metric(m), _thread_data(nullptr) { + diskann::Metric metric_to_invoke = m; if (m == diskann::Metric::COSINE || m == diskann::Metric::INNER_PRODUCT) { if (std::is_floating_point::value) { - diskann::cout << "Cosine metric chosen for (normalized) float data." - "Changing distance to L2 to boost accuracy." + diskann::cout << "Since data is floating point, we assume that it has been appropriately pre-processed " + "(normalization for cosine, and convert-to-l2 by adding extra dimension for MIPS). So we " + "shall invoke an l2 distance function." 
<< std::endl; - metric = diskann::Metric::L2; + metric_to_invoke = diskann::Metric::L2; } else { @@ -49,8 +51,8 @@ PQFlashIndex::PQFlashIndex(std::shared_ptr &fileRe } } - this->_dist_cmp.reset(diskann::get_distance_function(metric)); - this->_dist_cmp_float.reset(diskann::get_distance_function(metric)); + this->_dist_cmp.reset(diskann::get_distance_function(metric_to_invoke)); + this->_dist_cmp_float.reset(diskann::get_distance_function(metric_to_invoke)); } template PQFlashIndex::~PQFlashIndex() @@ -1292,20 +1294,23 @@ void PQFlashIndex::cached_beam_search(const T *query1, const uint64_t float *query_float = pq_query_scratch->aligned_query_float; float *query_rotated = pq_query_scratch->rotated_query; - // if inner product, we laso normalize the query and set the last coordinate - // to 0 (this is the extra coordindate used to convert MIPS to L2 search) - if (metric == diskann::Metric::INNER_PRODUCT) + // normalization step. for cosine, we simply normalize the query + // for mips, we normalize the first d-1 dims, and add a 0 for last dim, since an extra coordinate was used to + // convert MIPS to L2 search + if (metric == diskann::Metric::INNER_PRODUCT || metric == diskann::Metric::COSINE) { - for (size_t i = 0; i < this->_data_dim - 1; i++) + uint64_t inherent_dim = (metric == diskann::Metric::COSINE) ? 
this->_data_dim : (uint64_t)(this->_data_dim - 1); + for (size_t i = 0; i < inherent_dim; i++) { aligned_query_T[i] = query1[i]; query_norm += query1[i] * query1[i]; } - aligned_query_T[this->_data_dim - 1] = 0; + if (metric == diskann::Metric::INNER_PRODUCT) + aligned_query_T[this->_data_dim - 1] = 0; query_norm = std::sqrt(query_norm); - for (size_t i = 0; i < this->_data_dim - 1; i++) + for (size_t i = 0; i < inherent_dim; i++) { aligned_query_T[i] = (T)(aligned_query_T[i] / query_norm); } diff --git a/unit_tester.sh b/unit_tester.sh index d19e62575..1ef96c025 100755 --- a/unit_tester.sh +++ b/unit_tester.sh @@ -43,29 +43,29 @@ while IFS= read -r line; do BUDGETBUILD=`bc <<< "scale=4; 0.0001 + ${FILESIZE}/(5*1024*1024*1024)"` BUDGETSERVE=`bc <<< "scale=4; 0.0001 + ${FILESIZE}/(10*1024*1024*1024)"` echo "=============================================================================================================================================" - echo "Running tests on ${DATASET} dataset, ${TYPE} datatype, $METRIC metric, ${BUDGETBUILD} GiB and ${BUDGETSERVE} GiB build and serve budget" + echo "Running apps on ${DATASET} dataset, ${TYPE} datatype, $METRIC metric, ${BUDGETBUILD} GiB and ${BUDGETSERVE} GiB build and serve budget" echo "=============================================================================================================================================" rm ${DISK}_* #echo "Going to run test on ${BASE} base, ${QUERY} query, ${TYPE} datatype, ${METRIC} metric, saving gt at ${GT}" echo "Computing Groundtruth" - #${BUILD_FOLDER}/tests/utils/compute_groundtruth ${TYPE} ${BASE} ${QUERY} 30 ${GT} ${METRIC} > /dev/null - ${BUILD_FOLDER}/tests/utils/compute_groundtruth --data_type ${TYPE} --base_file ${BASE} --query_file ${QUERY} --K 30 --gt_file ${GT} --dist_fn ${METRIC} > /dev/null + #${BUILD_FOLDER}/apps/utils/compute_groundtruth ${TYPE} ${BASE} ${QUERY} 30 ${GT} ${METRIC} > /dev/null + ${BUILD_FOLDER}/apps/utils/compute_groundtruth --data_type 
${TYPE} --base_file ${BASE} --query_file ${QUERY} --K 30 --gt_file ${GT} --dist_fn ${METRIC} > /dev/null echo "Building Mem Index" -# /usr/bin/time ${BUILD_FOLDER}/tests/build_memory_index ${TYPE} ${METRIC} ${BASE} ${MEM} 32 50 1.2 0 > ${MBLOG} - /usr/bin/time ${BUILD_FOLDER}/tests/build_memory_index --data_type ${TYPE} --dist_fn ${METRIC} --data_path ${BASE} --index_path_prefix ${MEM} -R 32 -L 50 --alpha 1.2 -T 0 > ${MBLOG} +# /usr/bin/time ${BUILD_FOLDER}/apps/build_memory_index ${TYPE} ${METRIC} ${BASE} ${MEM} 32 50 1.2 0 > ${MBLOG} + /usr/bin/time ${BUILD_FOLDER}/apps/build_memory_index --data_type ${TYPE} --dist_fn ${METRIC} --data_path ${BASE} --index_path_prefix ${MEM} -R 32 -L 50 --alpha 1.2 -T 0 > ${MBLOG} awk '/^Degree/' ${MBLOG} awk '/^Indexing/' ${MBLOG} echo "Searching Mem Index" - ${BUILD_FOLDER}/tests/search_memory_index --data_type ${TYPE} --dist_fn ${METRIC} --index_path_prefix ${MEM} -T 16 --query_file ${QUERY} --gt_file ${GT} -K 10 --result_path /tmp/res -L 10 20 30 40 50 60 70 80 90 100 > ${MSLOG} + ${BUILD_FOLDER}/apps/search_memory_index --data_type ${TYPE} --dist_fn ${METRIC} --index_path_prefix ${MEM} -T 16 --query_file ${QUERY} --gt_file ${GT} -K 10 --result_path /tmp/res -L 10 20 30 40 50 60 70 80 90 100 > ${MSLOG} awk '/===/{x=NR+10}(NR<=x){print}' ${MSLOG} echo "Building Disk Index" - ${BUILD_FOLDER}/tests/build_disk_index --data_type ${TYPE} --dist_fn ${METRIC} --data_path ${BASE} --index_path_prefix ${DISK} -R 32 -L 50 -B ${BUDGETSERVE} -M ${BUDGETBUILD} -T 32 --PQ_disk_bytes 0 > ${DBLOG} + ${BUILD_FOLDER}/apps/build_disk_index --data_type ${TYPE} --dist_fn ${METRIC} --data_path ${BASE} --index_path_prefix ${DISK} -R 32 -L 50 -B ${BUDGETSERVE} -M ${BUDGETBUILD} -T 32 --PQ_disk_bytes 0 > ${DBLOG} awk '/^Compressing/' ${DBLOG} echo "#shards in disk index" awk '/^Indexing/' ${DBLOG} echo "Searching Disk Index" - ${BUILD_FOLDER}/tests/search_disk_index --data_type ${TYPE} --dist_fn ${METRIC} --index_path_prefix ${DISK} --num_nodes_to_cache 
10000 -T 10 -W 4 --query_file ${QUERY} --gt_file ${GT} -K 10 --result_path /tmp/res -L 20 40 60 80 100 > ${DSLOG} + ${BUILD_FOLDER}/apps/search_disk_index --data_type ${TYPE} --dist_fn ${METRIC} --index_path_prefix ${DISK} --num_nodes_to_cache 10000 -T 10 -W 4 --query_file ${QUERY} --gt_file ${GT} -K 10 --result_path /tmp/res -L 20 40 60 80 100 > ${DSLOG} echo "# shards used during index construction:" awk '/medoids/{x=NR+1}(NR<=x){print}' ${DSLOG} awk '/===/{x=NR+10}(NR<=x){print}' ${DSLOG}