diff --git a/include/pq.h b/include/pq.h index 3e6119f22..fb7b91797 100644 --- a/include/pq.h +++ b/include/pq.h @@ -67,18 +67,25 @@ DISKANN_DLLEXPORT int generate_opq_pivots(const float *train_data, size_t num_tr unsigned num_pq_chunks, std::string opq_pivots_path, bool make_zero_mean = false); -DISKANN_DLLEXPORT int generate_pq_pivots_simplified(const float *train_data, size_t num_train, size_t dim, - size_t num_pq_chunks, std::vector &pivot_data_vector); + +DISKANN_DLLEXPORT int generate_pq_pivots_simplified(float *train_data, size_t num_train, size_t dim, + size_t num_pq_chunks, std::vector &pivot_data_vector, + std::vector &centroids, std::vector &offsets, + const bool make_zero_mean = true, + const uint32_t kmeans_iters_for_pq = 15); template int generate_pq_data_from_pivots(const std::string &data_file, unsigned num_centers, unsigned num_pq_chunks, const std::string &pq_pivots_path, const std::string &pq_compressed_vectors_path, bool use_opq = false); -DISKANN_DLLEXPORT int generate_pq_data_from_pivots_simplified(const float *data, const size_t num, - const float *pivot_data, const size_t pivots_num, - const size_t dim, const size_t num_pq_chunks, - std::vector &pq); +using PQ_DATA_TYPE = uint8_t; +DISKANN_DLLEXPORT int generate_pq_data_from_pivots_simplified(float *data, const size_t num, const float *pivot_data, + const size_t pivots_num, const size_t dim, + const size_t num_pq_chunks, + const std::vector &centroids, + const std::vector &offsets, + std::vector &pq); template void generate_disk_quantized_data(const std::string &data_file_to_use, const std::string &disk_pq_pivots_path, diff --git a/src/pq.cpp b/src/pq.cpp index d2b545c79..0e473b73e 100644 --- a/src/pq.cpp +++ b/src/pq.cpp @@ -348,29 +348,61 @@ void pq_dist_lookup(const uint8_t *pq_ids, const size_t n_pts, const size_t pq_n // Input is provided in the in-memory buffer train_data. // Output is stored in the in-memory buffer pivot_data_vector. 
// Simplification is based on the following assumptions: -// dim % num_pq_chunks == 0 // num_centers == 256 by default -// KMEANS_ITERS_FOR_PQ == 15 by default -// make_zero_mean is false by default. // These assumptions allow to make the function much simpler and avoid storing // array of chunk_offsets and centroids. // The compiler pragma for multi-threading support is removed from this implementation // for the purpose of integration into systems that strictly control resource allocation. -int generate_pq_pivots_simplified(const float *train_data, size_t num_train, size_t dim, size_t num_pq_chunks, - std::vector &pivot_data_vector) +int generate_pq_pivots_simplified(float *train_data, size_t num_train, size_t dim, size_t num_pq_chunks, + std::vector &pivot_data_vector, std::vector &centroids, + std::vector &offsets, const bool make_zero_mean, + const uint32_t kmeans_iters_for_pq) { - if (num_pq_chunks > dim || dim % num_pq_chunks != 0) + if (num_pq_chunks == 0 || num_pq_chunks > dim) { return -1; } + // Calculate offsets of chunks + { + size_t chunk_size = dim / num_pq_chunks; + std::vector sizes; + + // If dim % num_pq_chunks != 0 we need to adjust chunk_size + // to cover the whole of vectors. 
+ if (dim % num_pq_chunks != 0) + { + do + { + chunk_size++; + } while (chunk_size * num_pq_chunks < dim); + + sizes.resize(num_pq_chunks, chunk_size); + size_t target = chunk_size * num_pq_chunks; + for (auto iter = sizes.rbegin(); iter != sizes.rend() && target > dim; iter++, target--) + { + (*iter)--; + } + } + else + { + sizes.resize(num_pq_chunks, chunk_size); + } + + offsets.resize(num_pq_chunks, 0); + for (size_t i = 0; i < num_pq_chunks - 1; i++) + { + offsets[i + 1] = (uint32_t)(offsets[i] + sizes[i]); + } + offsets.push_back((uint32_t)dim); + } + const size_t num_centers = 256; - const size_t cur_chunk_size = dim / num_pq_chunks; - const uint32_t KMEANS_ITERS_FOR_PQ = 15; + const uint32_t max_chunk_size = offsets[1]; pivot_data_vector.resize(num_centers * dim); - std::vector cur_pivot_data_vector(num_centers * cur_chunk_size); - std::vector cur_data_vector(num_train * cur_chunk_size); + std::vector cur_pivot_data_vector(num_centers * max_chunk_size); + std::vector cur_data_vector(num_train * max_chunk_size); std::vector closest_center_vector(num_train); float *pivot_data = &pivot_data_vector[0]; @@ -378,9 +410,30 @@ int generate_pq_pivots_simplified(const float *train_data, size_t num_train, siz float *cur_data = &cur_data_vector[0]; uint32_t *closest_center = &closest_center_vector[0]; + centroids.clear(); + if (make_zero_mean) + { + centroids.resize(dim, 0.0); + + for (uint64_t d = 0; d < dim; d++) + { + for (uint64_t p = 0; p < num_train; p++) + { + centroids[d] += train_data[p * dim + d]; + } + centroids[d] /= num_train; + + for (uint64_t p = 0; p < num_train; p++) + { + train_data[p * dim + d] -= centroids[d]; + } + } + } + for (size_t i = 0; i < num_pq_chunks; i++) { - size_t chunk_offset = cur_chunk_size * i; + const size_t chunk_offset = offsets[i]; + const size_t cur_chunk_size = offsets[i + 1] - offsets[i]; for (int32_t j = 0; j < num_train; j++) { @@ -390,7 +443,7 @@ int generate_pq_pivots_simplified(const float *train_data, size_t num_train, 
siz kmeans::kmeanspp_selecting_pivots(cur_data, num_train, cur_chunk_size, cur_pivot_data, num_centers); - kmeans::run_lloyds(cur_data, num_train, cur_chunk_size, cur_pivot_data, num_centers, KMEANS_ITERS_FOR_PQ, NULL, + kmeans::run_lloyds(cur_data, num_train, cur_chunk_size, cur_pivot_data, num_centers, kmeans_iters_for_pq, NULL, closest_center); for (uint64_t j = 0; j < num_centers; j++) @@ -776,24 +829,33 @@ int generate_opq_pivots(const float *passed_train_data, size_t num_train, uint32 // Output is stored in the in-memory buffer pq. // Simplification is based on the following assumptions: // supporting only float data type -// dim % num_pq_chunks == 0, which results in a fixed chunk_size // num_centers == 256 by default -// make_zero_mean is false by default. // These assumptions allow to make the function much simpler and avoid using // array of chunk_offsets and centroids. // The compiler pragma for multi-threading support is removed from this implementation // for the purpose of integration into systems that strictly control resource allocation. 
-int generate_pq_data_from_pivots_simplified(const float *data, const size_t num, const float *pivot_data, +int generate_pq_data_from_pivots_simplified(float *data, const size_t num, const float *pivot_data, const size_t pivots_num, const size_t dim, const size_t num_pq_chunks, - std::vector &pq) + const std::vector &centroids, const std::vector &offsets, + std::vector &pq) { - if (num_pq_chunks == 0 || num_pq_chunks > dim || dim % num_pq_chunks != 0) + if (num_pq_chunks == 0 || num_pq_chunks > dim) + { + return -1; + } + + if (!centroids.empty() && centroids.size() != dim) + { + return -1; + } + + if (offsets.size() != num_pq_chunks + 1 || offsets[0] != 0) { return -1; } const size_t num_centers = 256; - const size_t chunk_size = dim / num_pq_chunks; + const uint32_t max_chunk_size = offsets[1]; if (pivots_num != num_centers * dim) { @@ -802,33 +864,43 @@ int generate_pq_data_from_pivots_simplified(const float *data, const size_t num, pq.resize(num * num_pq_chunks); - std::vector cur_pivot_vector(num_centers * chunk_size); - std::vector cur_data_vector(num * chunk_size); + std::vector cur_pivot_vector(num_centers * max_chunk_size); + std::vector cur_data_vector(num * max_chunk_size); std::vector closest_center_vector(num); float *cur_pivot_data = &cur_pivot_vector[0]; float *cur_data = &cur_data_vector[0]; uint32_t *closest_center = &closest_center_vector[0]; + if (!centroids.empty()) + { + for (size_t p = 0; p < num; p++) + { + for (uint64_t d = 0; d < dim; d++) + { + data[p * dim + d] -= centroids[d]; + } + } + } + for (size_t i = 0; i < num_pq_chunks; i++) { - const size_t chunk_offset = chunk_size * i; + const size_t chunk_offset = offsets[i]; + const size_t cur_chunk_size = offsets[i + 1] - offsets[i]; for (int j = 0; j < num_centers; j++) { - std::memcpy(cur_pivot_data + j * chunk_size, pivot_data + j * dim + chunk_offset, - chunk_size * sizeof(float)); + std::memcpy(cur_pivot_data + j * cur_chunk_size, pivot_data + j * dim + chunk_offset, + cur_chunk_size * 
sizeof(float)); } for (int j = 0; j < num; j++) { - for (size_t k = 0; k < chunk_size; k++) - { - cur_data[j * chunk_size + k] = data[j * dim + chunk_offset + k]; - } + std::memcpy(cur_data + j * cur_chunk_size, data + j * dim + chunk_offset, cur_chunk_size * sizeof(float)); } - math_utils::compute_closest_centers(cur_data, num, chunk_size, cur_pivot_data, num_centers, 1, closest_center); + math_utils::compute_closest_centers(cur_data, num, cur_chunk_size, cur_pivot_data, num_centers, 1, + closest_center); for (int j = 0; j < num; j++) {