Skip to content

Commit

Permalink
improve the performance for hgraph build & add
Browse files Browse the repository at this point in the history
- avx512 simd optimization
- memory_block_io optimization

Signed-off-by: LHT129 <[email protected]>
  • Loading branch information
LHT129 committed Dec 14, 2024
1 parent bb3d70c commit 7fccafa
Show file tree
Hide file tree
Showing 2 changed files with 37 additions and 22 deletions.
38 changes: 28 additions & 10 deletions src/io/memory_block_io.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,14 +37,16 @@ class MemoryBlockIO : public BasicIO<MemoryBlockIO> {
public:
/// Builds an empty block store that takes its memory from `allocator`.
/// @param allocator   stored in allocator_ and also passed to the blocks_ container.
/// @param block_size  requested bytes per block (defaults to DEFAULT_BLOCK_SIZE);
///                    update_by_block_size() rounds it down to a power of two.
explicit MemoryBlockIO(Allocator* allocator, uint64_t block_size = DEFAULT_BLOCK_SIZE)
: block_size_(block_size), allocator_(allocator), blocks_(0, allocator) {
// Derive block_bit_ and in_block_mask_ from the requested block size.
this->update_by_block_size();
}

/// Builds the IO object from JSON parameters plus the shared index parameters.
/// When io_param contains BLOCK_IO_BLOCK_SIZE_KEY, that value replaces block_size_.
MemoryBlockIO(const JsonType& io_param, const IndexCommonParam& common_param)
// NOTE(review): this listing looks like a diff view with the +/- markers stripped; the next
// two lines are presumably the removed and the added initializer list - confirm against the repo.
: allocator_(common_param.allocator_), blocks_(0, common_param.allocator_) {
: MemoryBlockIO(common_param.allocator_) {
if (io_param.contains(BLOCK_IO_BLOCK_SIZE_KEY)) {
this->block_size_ =
io_param[BLOCK_IO_BLOCK_SIZE_KEY]; // TODO(LHT): trans str to uint64_t
}
// Recompute block_bit_ / in_block_mask_ because block_size_ may have just been overridden.
this->update_by_block_size();
}

~MemoryBlockIO() override {
Expand Down Expand Up @@ -83,22 +85,34 @@ class MemoryBlockIO : public BasicIO<MemoryBlockIO> {
private:
/// Returns true when `size` bytes fit within the capacity of the currently held blocks.
[[nodiscard]] inline bool
check_valid_offset(uint64_t size) const {
// NOTE(review): this listing looks like a diff view with the +/- markers stripped; the two
// returns below are presumably the removed and the added form of one statement - confirm.
return size <= blocks_.size() * block_size_;
// Shift form of the same capacity test; equal to the multiply when block_size_ == 1 << block_bit_.
return size <= (blocks_.size() << block_bit_);
}

/// Normalises block_size_ to a power of two and derives the shift/mask used for addressing.
///
/// block_size_ is rounded DOWN to the nearest power of two (e.g. 5 -> 4, 8 -> 8), so that
/// `offset >> block_bit_` and `offset & in_block_mask_` can stand in for division and modulo.
/// A block size of 0 is clamped to 1 byte; without the clamp block_bit_ would wrap around to
/// UINT64_MAX and the shift that builds the mask would be undefined behaviour.
inline void
update_by_block_size() {
    // Work on a copy so block_size_ is only written once, with its final value.
    uint64_t remaining = (block_size_ == 0) ? 1 : block_size_;

    // Count how many times the size can be halved: floor(log2(size)).
    block_bit_ = 0;
    while (remaining > 1) {
        remaining >>= 1;
        block_bit_ += 1;
    }

    in_block_mask_ = (1ULL << block_bit_) - 1;
    block_size_ = in_block_mask_ + 1;
}

/// Makes sure `size` bytes are addressable: returns at once when check_valid_offset(size)
/// already holds, otherwise appends newly allocated blocks to blocks_. Defined out of line.
inline void
check_and_realloc(uint64_t size);

/// Translates a flat byte offset into a pointer inside the block that contains it.
/// No bounds check is done here.
[[nodiscard]] inline const uint8_t*
get_data_ptr(uint64_t offset) const {
// NOTE(review): this listing looks like a diff view with the +/- markers stripped; the
// divide/modulo pair is presumably the removed version of the shift/mask pair - confirm.
auto block_no = offset / block_size_;
auto block_off = offset % block_size_;
// High bits select the block, low bits give the position inside that block.
auto block_no = offset >> block_bit_;
auto block_off = offset & in_block_mask_;
return blocks_[block_no] + block_off;
}

/// Returns true when the two byte offsets fall inside the same block.
[[nodiscard]] inline bool
check_in_one_block(uint64_t off1, uint64_t off2) const {
// NOTE(review): this listing looks like a diff view with the +/- markers stripped; the two
// returns below are presumably the removed and the added form of one statement - confirm.
return (off1 / block_size_) == (off2 / block_size_);
// XOR keeps only the differing bits; if none is at or above the block bit, both offsets share
// a block. This relies on block_size_ being a power of two.
return (off1 ^ off2) < block_size_;
}

private:
Expand All @@ -109,14 +123,18 @@ class MemoryBlockIO : public BasicIO<MemoryBlockIO> {
// Allocator set by the constructors; its Allocate() is what check_and_realloc calls for new blocks.
Allocator* const allocator_{nullptr};

static const uint64_t DEFAULT_BLOCK_SIZE = 128 * 1024 * 1024; // 128MB

// log2 of the block size; the default 27 corresponds to DEFAULT_BLOCK_SIZE (1 << 27 bytes).
uint64_t block_bit_ = 27;

// Mask of the low block_bit_ bits (block size - 1); extracts the offset within a block.
uint64_t in_block_mask_ = (1 << 27) - 1;
};

void
MemoryBlockIO::WriteImpl(const uint8_t* data, uint64_t size, uint64_t offset) {
check_and_realloc(size + offset);
uint64_t cur_size = 0;
auto start_no = offset / block_size_;
auto start_off = offset % block_size_;
auto start_no = offset >> block_bit_;
auto start_off = offset & in_block_mask_;
auto max_size = block_size_ - start_off;
while (cur_size < size) {
uint8_t* cur_write = blocks_[start_no] + start_off;
Expand All @@ -134,8 +152,8 @@ MemoryBlockIO::ReadImpl(uint64_t size, uint64_t offset, uint8_t* data) const {
bool ret = check_valid_offset(size + offset);
if (ret) {
uint64_t cur_size = 0;
auto start_no = offset / block_size_;
auto start_off = offset % block_size_;
auto start_no = offset >> block_bit_;
auto start_off = offset & in_block_mask_;
auto max_size = block_size_ - start_off;
while (cur_size < size) {
const uint8_t* cur_read = blocks_[start_no] + start_off;
Expand Down Expand Up @@ -189,7 +207,7 @@ MemoryBlockIO::check_and_realloc(uint64_t size) {
if (check_valid_offset(size)) {
return;
}
const uint64_t new_block_count = (size + this->block_size_ - 1) / block_size_;
const uint64_t new_block_count = (size + this->block_size_ - 1) >> block_bit_;
auto cur_block_size = this->blocks_.size();
while (cur_block_size < new_block_count) {
this->blocks_.emplace_back((uint8_t*)(allocator_->Allocate(block_size_)));
Expand Down
21 changes: 9 additions & 12 deletions src/simd/avx512.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
#include <immintrin.h>

#include <cmath>
#include <iostream>

#include "fp32_simd.h"
#include "normalize.h"
Expand Down Expand Up @@ -299,9 +300,10 @@ SQ8ComputeL2Sqr(const float* query,
for (; i + 15 < dim; i += 16) {
// Load data into registers
__m128i code_values = _mm_loadu_si128(reinterpret_cast<const __m128i*>(codes + i));
__m512 diff_values = _mm512_loadu_ps(diff + i);

__m512i codes_512 = _mm512_cvtepu8_epi32(code_values);
__m512 code_floats = _mm512_div_ps(_mm512_cvtepi32_ps(codes_512), _mm512_set1_ps(255.0f));
__m512 diff_values = _mm512_loadu_ps(diff + i);
__m512 lowerBound_values = _mm512_loadu_ps(lowerBound + i);
__m512 query_values = _mm512_loadu_ps(query + i);

Expand Down Expand Up @@ -373,19 +375,14 @@ SQ8ComputeCodesL2Sqr(const uint8_t* codes1,
for (; i + 15 < dim; i += 16) {
__m128i code1_values = _mm_loadu_si128(reinterpret_cast<const __m128i*>(codes1 + i));
__m128i code2_values = _mm_loadu_si128(reinterpret_cast<const __m128i*>(codes2 + i));
__m512i codes1_512 = _mm512_cvtepu8_epi32(code1_values);
__m512i codes2_512 = _mm512_cvtepu8_epi32(code2_values);
__m512 code1_floats = _mm512_div_ps(_mm512_cvtepi32_ps(codes1_512), _mm512_set1_ps(255.0f));
__m512 code2_floats = _mm512_div_ps(_mm512_cvtepi32_ps(codes2_512), _mm512_set1_ps(255.0f));
__m512 diff_values = _mm512_loadu_ps(diff + i);
__m512 lowerBound_values = _mm512_loadu_ps(lowerBound + i);

// Perform calculations
__m512 scaled_codes1 = _mm512_fmadd_ps(code1_floats, diff_values, lowerBound_values);
__m512 scaled_codes2 = _mm512_fmadd_ps(code2_floats, diff_values, lowerBound_values);
__m512 val = _mm512_sub_ps(scaled_codes1, scaled_codes2);
val = _mm512_mul_ps(val, val);
sum = _mm512_add_ps(sum, val);
__m512i codes1_512 = _mm512_cvtepu8_epi32(code1_values);
__m512i codes2_512 = _mm512_cvtepu8_epi32(code2_values);
__m512 sub = _mm512_cvtepi32_ps(_mm512_sub_epi32(codes1_512, codes2_512));
__m512 scaled = _mm512_mul_ps(sub, _mm512_set1_ps(1.0 / 255.0f));
__m512 val = _mm512_mul_ps(scaled, diff_values);
sum = _mm512_fmadd_ps(val, val, sum);
}

// Horizontal addition
Expand Down

0 comments on commit 7fccafa

Please sign in to comment.