Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
00b29ce
Add SSE4.2 implementation
AntoinePrv Oct 27, 2025
ca857d5
Add unpack uint8_t benchmark
AntoinePrv Oct 28, 2025
d5a7b45
LOCAL: Benchmark all sizes
AntoinePrv Oct 29, 2025
0d0bd0e
Add bool unpack benchmark
AntoinePrv Oct 30, 2025
3d271ba
LOCAL: Remove unecessary benchmark functions
AntoinePrv Oct 29, 2025
cb60229
Add Kernel plan builder
AntoinePrv Oct 20, 2025
3099d4f
Add simd kernel
AntoinePrv Oct 24, 2025
b688519
Handle rshifts on SSE2
AntoinePrv Oct 27, 2025
35c6b6b
Use new kernel when possible in generated 128 code
AntoinePrv Oct 27, 2025
e6558dc
Refactor array to xsimd::batch_constant
AntoinePrv Oct 27, 2025
c029cb7
Refactor right shift
AntoinePrv Oct 27, 2025
e78fbb4
Add oversized plan
AntoinePrv Oct 28, 2025
9d1904a
Add oversized kernel
AntoinePrv Oct 28, 2025
4db976a
Rename kernels
AntoinePrv Oct 28, 2025
d2eda35
Add simd kernel dispatch
AntoinePrv Oct 28, 2025
df80c5c
Call Simd kernel directly
AntoinePrv Oct 28, 2025
6090843
Fix SIMD level None
AntoinePrv Oct 29, 2025
f01f34c
Initialize swizzles to -1
AntoinePrv Oct 29, 2025
c66278a
Doc
AntoinePrv Oct 29, 2025
6b1ce47
Improve test error message
AntoinePrv Oct 29, 2025
85c7f99
Use new kernel in avx2
AntoinePrv Oct 28, 2025
d197da6
AVX2 swizzle fallback
AntoinePrv Oct 29, 2025
8bf6dc4
Remove dead code
AntoinePrv Oct 30, 2025
4d8caa9
Simplify Large masks
AntoinePrv Oct 30, 2025
088a570
Remove bpacking 256 generated file
AntoinePrv Oct 30, 2025
2c13d5b
Remove uint8_t fallback
AntoinePrv Oct 30, 2025
eb19fe1
Add boolean simd implementation
AntoinePrv Oct 30, 2025
ec54191
Use std::is_base_of for arch detection
AntoinePrv Oct 30, 2025
f05569a
LOCAL: Add unpack_naive
AntoinePrv Oct 30, 2025
6e72467
LOCAL: Remove scalar benchmark functions
AntoinePrv Oct 30, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
124 changes: 99 additions & 25 deletions cpp/src/arrow/util/bpacking_benchmark.cc
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
// specific language governing permissions and limitations
// under the License.

#include <memory>
#include <stdexcept>
#include <vector>

Expand Down Expand Up @@ -86,33 +87,48 @@ void BM_Unpack(benchmark::State& state, bool aligned, UnpackFunc<Int> unpack, bo
const uint8_t* packed_ptr =
GetNextAlignedByte(packed.data(), sizeof(Int)) + (aligned ? 0 : 1);

std::vector<Int> unpacked(num_values, 0);
auto unpacked = std::make_unique<Int[]>(num_values);

for (auto _ : state) {
unpack(packed_ptr, unpacked.data(), num_values, bit_width, /* bit_offset = */ 0);
unpack(packed_ptr, unpacked.get(), num_values, bit_width, /* bit_offset = */ 0);
benchmark::ClobberMemory();
}
state.SetItemsProcessed(num_values * state.iterations());
}

constexpr int32_t kMinRange = 64;
constexpr int32_t kMaxRange = 32768;
constexpr std::initializer_list<int64_t> kBitWidths16 = {1, 2, 8, 13};
constexpr std::initializer_list<int64_t> kBitWidths32 = {1, 2, 8, 20};
constexpr std::initializer_list<int64_t> kBitWidths64 = {1, 2, 8, 20, 47};
static const std::vector<std::vector<int64_t>> kBitWidthsNumValuesBool = {
{0, 1},
benchmark::CreateRange(kMinRange, kMaxRange, /*multi=*/32),
};
static const std::vector<std::vector<int64_t>> kBitWidthsNumValues8 = {
benchmark::CreateDenseRange(0, 8, 1),
benchmark::CreateRange(kMinRange, kMaxRange, /*multi=*/32),
};
static const std::vector<std::vector<int64_t>> kBitWidthsNumValues16 = {
kBitWidths16,
benchmark::CreateDenseRange(0, 16, 1),
benchmark::CreateRange(kMinRange, kMaxRange, /*multi=*/32),
};
static const std::vector<std::vector<int64_t>> kBitWidthsNumValues32 = {
kBitWidths32,
benchmark::CreateDenseRange(0, 32, 1),
benchmark::CreateRange(kMinRange, kMaxRange, /*multi=*/32),
};
static const std::vector<std::vector<int64_t>> kBitWidthsNumValues64 = {
kBitWidths64,
benchmark::CreateDenseRange(0, 64, 1),
benchmark::CreateRange(kMinRange, kMaxRange, /*multi=*/32),
};

/// Plain (non-template) benchmark entry point for `bool` output values.
///
/// Exists as a nudge for MSVC, which needs help with a template used inside the
/// BENCHMARK_CAPTURE macro. All arguments are forwarded unchanged to
/// BM_Unpack<bool>; `skip_msg` is moved since it is taken by value.
void BM_UnpackBool(benchmark::State& state, bool aligned, UnpackFunc<bool> unpack,
                   bool skip = false, std::string skip_msg = "") {
  BM_Unpack<bool>(state, aligned, unpack, skip, std::move(skip_msg));
}
/// Plain (non-template) benchmark entry point for `uint8_t` output values.
///
/// Exists as a nudge for MSVC, which needs help with a template used inside the
/// BENCHMARK_CAPTURE macro. All arguments are forwarded unchanged to
/// BM_Unpack<uint8_t>; `skip_msg` is moved since it is taken by value.
void BM_UnpackUint8(benchmark::State& state, bool aligned, UnpackFunc<uint8_t> unpack,
                    bool skip = false, std::string skip_msg = "") {
  BM_Unpack<uint8_t>(state, aligned, unpack, skip, std::move(skip_msg));
}
/// Nudge for MSVC template inside BENCHMARK_CAPTURE macro.
void BM_UnpackUint16(benchmark::State& state, bool aligned, UnpackFunc<uint16_t> unpack,
bool skip = false, std::string skip_msg = "") {
Expand All @@ -129,14 +145,50 @@ void BM_UnpackUint64(benchmark::State& state, bool aligned, UnpackFunc<uint64_t>
return BM_Unpack<uint64_t>(state, aligned, unpack, skip, std::move(skip_msg));
}

BENCHMARK_CAPTURE(BM_UnpackUint16, ScalarUnaligned, false, &unpack_scalar<uint16_t>)
// BENCHMARK_CAPTURE(BM_UnpackBool, NaiveUnaligned, false, &unpack_naive<bool>)
// ->ArgsProduct(kBitWidthsNumValuesBool);
// BENCHMARK_CAPTURE(BM_UnpackUint8, NaiveUnaligned, false, &unpack_naive<uint8_t>)
// ->ArgsProduct(kBitWidthsNumValues8);
// BENCHMARK_CAPTURE(BM_UnpackUint16, NaiveUnaligned, false, &unpack_naive<uint16_t>)
// ->ArgsProduct(kBitWidthsNumValues16);
// BENCHMARK_CAPTURE(BM_UnpackUint32, NaiveUnaligned, false, &unpack_naive<uint32_t>)
// ->ArgsProduct(kBitWidthsNumValues32);
// BENCHMARK_CAPTURE(BM_UnpackUint64, NaiveUnaligned, false, &unpack_naive<uint64_t>)
// ->ArgsProduct(kBitWidthsNumValues64);
//
// BENCHMARK_CAPTURE(BM_UnpackBool, ScalarUnaligned, false, &unpack_scalar<bool>)
// ->ArgsProduct(kBitWidthsNumValuesBool);
// BENCHMARK_CAPTURE(BM_UnpackUint8, ScalarUnaligned, false, &unpack_scalar<uint8_t>)
// ->ArgsProduct(kBitWidthsNumValues8);
// BENCHMARK_CAPTURE(BM_UnpackUint16, ScalarUnaligned, false, &unpack_scalar<uint16_t>)
// ->ArgsProduct(kBitWidthsNumValues16);
// BENCHMARK_CAPTURE(BM_UnpackUint32, ScalarUnaligned, false, &unpack_scalar<uint32_t>)
// ->ArgsProduct(kBitWidthsNumValues32);
// BENCHMARK_CAPTURE(BM_UnpackUint64, ScalarUnaligned, false, &unpack_scalar<uint64_t>)
// ->ArgsProduct(kBitWidthsNumValues64);

#if defined(ARROW_HAVE_SSE4_2)
BENCHMARK_CAPTURE(BM_UnpackBool, Sse42Unaligned, false, &unpack_sse4_2<bool>)
->ArgsProduct(kBitWidthsNumValuesBool);
BENCHMARK_CAPTURE(BM_UnpackUint8, Sse42Unaligned, false, &unpack_sse4_2<uint8_t>)
->ArgsProduct(kBitWidthsNumValues8);
BENCHMARK_CAPTURE(BM_UnpackUint16, Sse42Unaligned, false, &unpack_sse4_2<uint16_t>)
->ArgsProduct(kBitWidthsNumValues16);
BENCHMARK_CAPTURE(BM_UnpackUint32, ScalarUnaligned, false, &unpack_scalar<uint32_t>)
BENCHMARK_CAPTURE(BM_UnpackUint32, Sse42Unaligned, false, &unpack_sse4_2<uint32_t>)
->ArgsProduct(kBitWidthsNumValues32);
BENCHMARK_CAPTURE(BM_UnpackUint64, ScalarUnaligned, false, &unpack_scalar<uint64_t>)
BENCHMARK_CAPTURE(BM_UnpackUint64, Sse42Unaligned, false, &unpack_sse4_2<uint64_t>)
->ArgsProduct(kBitWidthsNumValues64);
#endif

#if defined(ARROW_HAVE_RUNTIME_AVX2)
BENCHMARK_CAPTURE(BM_UnpackBool, Avx2Unaligned, false, &unpack_avx2<bool>,
!CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX2),
"Avx2 not available")
->ArgsProduct(kBitWidthsNumValuesBool);
BENCHMARK_CAPTURE(BM_UnpackUint8, Avx2Unaligned, false, &unpack_avx2<uint8_t>,
!CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX2),
"Avx2 not available")
->ArgsProduct(kBitWidthsNumValues8);
BENCHMARK_CAPTURE(BM_UnpackUint16, Avx2Unaligned, false, &unpack_avx2<uint16_t>,
!CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX2),
"Avx2 not available")
Expand All @@ -152,6 +204,14 @@ BENCHMARK_CAPTURE(BM_UnpackUint64, Avx2Unaligned, false, &unpack_avx2<uint64_t>,
#endif

#if defined(ARROW_HAVE_RUNTIME_AVX512)
BENCHMARK_CAPTURE(BM_UnpackBool, Avx512Unaligned, false, &unpack_avx512<bool>,
!CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX512),
"Avx512 not available")
->ArgsProduct(kBitWidthsNumValuesBool);
BENCHMARK_CAPTURE(BM_UnpackUint8, Avx512Unaligned, false, &unpack_avx512<uint8_t>,
!CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX512),
"Avx512 not available")
->ArgsProduct(kBitWidthsNumValues8);
BENCHMARK_CAPTURE(BM_UnpackUint16, Avx512Unaligned, false, &unpack_avx512<uint16_t>,
!CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX512),
"Avx512 not available")
Expand All @@ -167,6 +227,10 @@ BENCHMARK_CAPTURE(BM_UnpackUint64, Avx512Unaligned, false, &unpack_avx512<uint64
#endif

#if defined(ARROW_HAVE_NEON)
BENCHMARK_CAPTURE(BM_UnpackBool, NeonUnaligned, false, &unpack_neon<bool>)
->ArgsProduct(kBitWidthsNumValuesBool);
BENCHMARK_CAPTURE(BM_UnpackUint8, NeonUnaligned, false, &unpack_neon<uint8_t>)
->ArgsProduct(kBitWidthsNumValues8);
BENCHMARK_CAPTURE(BM_UnpackUint16, NeonUnaligned, false, &unpack_neon<uint16_t>)
->ArgsProduct(kBitWidthsNumValues16);
BENCHMARK_CAPTURE(BM_UnpackUint32, NeonUnaligned, false, &unpack_neon<uint32_t>)
Expand All @@ -175,20 +239,30 @@ BENCHMARK_CAPTURE(BM_UnpackUint64, NeonUnaligned, false, &unpack_neon<uint64_t>)
->ArgsProduct(kBitWidthsNumValues64);
#endif

BENCHMARK_CAPTURE(BM_UnpackUint16, DynamicAligned, true, &unpack<uint16_t>)
->ArgsProduct(kBitWidthsNumValues16);
BENCHMARK_CAPTURE(BM_UnpackUint16, DynamicUnaligned, false, &unpack<uint16_t>)
->ArgsProduct(kBitWidthsNumValues16);

BENCHMARK_CAPTURE(BM_UnpackUint32, DynamicAligned, true, &unpack<uint32_t>)
->ArgsProduct(kBitWidthsNumValues32);
BENCHMARK_CAPTURE(BM_UnpackUint32, DynamicUnaligned, false, &unpack<uint32_t>)
->ArgsProduct(kBitWidthsNumValues32);

BENCHMARK_CAPTURE(BM_UnpackUint64, DynamicAligned, true, &unpack<uint64_t>)
->ArgsProduct(kBitWidthsNumValues64);
BENCHMARK_CAPTURE(BM_UnpackUint64, DynamicUnaligned, false, &unpack<uint64_t>)
->ArgsProduct(kBitWidthsNumValues64);
// BENCHMARK_CAPTURE(BM_UnpackBool, DynamicAligned, true, &unpack<bool>)
// ->ArgsProduct(kBitWidthsNumValuesBool);
// BENCHMARK_CAPTURE(BM_UnpackBool, DynamicUnaligned, false, &unpack<bool>)
// ->ArgsProduct(kBitWidthsNumValuesBool);
//
// BENCHMARK_CAPTURE(BM_UnpackUint8, DynamicAligned, true, &unpack<uint8_t>)
// ->ArgsProduct(kBitWidthsNumValues8);
// BENCHMARK_CAPTURE(BM_UnpackUint8, DynamicUnaligned, false, &unpack<uint8_t>)
// ->ArgsProduct(kBitWidthsNumValues8);
//
// BENCHMARK_CAPTURE(BM_UnpackUint16, DynamicAligned, true, &unpack<uint16_t>)
// ->ArgsProduct(kBitWidthsNumValues16);
// BENCHMARK_CAPTURE(BM_UnpackUint16, DynamicUnaligned, false, &unpack<uint16_t>)
// ->ArgsProduct(kBitWidthsNumValues16);
//
// BENCHMARK_CAPTURE(BM_UnpackUint32, DynamicAligned, true, &unpack<uint32_t>)
// ->ArgsProduct(kBitWidthsNumValues32);
// BENCHMARK_CAPTURE(BM_UnpackUint32, DynamicUnaligned, false, &unpack<uint32_t>)
// ->ArgsProduct(kBitWidthsNumValues32);
//
// BENCHMARK_CAPTURE(BM_UnpackUint64, DynamicAligned, true, &unpack<uint64_t>)
// ->ArgsProduct(kBitWidthsNumValues64);
// BENCHMARK_CAPTURE(BM_UnpackUint64, DynamicUnaligned, false, &unpack<uint64_t>)
// ->ArgsProduct(kBitWidthsNumValues64);

} // namespace
} // namespace arrow::internal
18 changes: 10 additions & 8 deletions cpp/src/arrow/util/bpacking_dispatch_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -190,18 +190,20 @@ void unpack_width(const uint8_t* in, UnpackedUInt* out, int batch_size, int bit_
using UnpackerForWidth = Unpacker<UnpackedUInt, kPackedBitWidth>;
constexpr auto kValuesUnpacked = UnpackerForWidth::kValuesUnpacked;

// Running the optimized kernel for batch extraction
const int unpacker_iter_count = batch_size / kValuesUnpacked;
for (int i = 0; i < unpacker_iter_count; ++i) {
in = UnpackerForWidth::unpack(in, out);
out += kValuesUnpacked;
if constexpr (kValuesUnpacked > 0) {
// Running the optimized kernel for batch extraction
const int unpacker_iter_count = batch_size / kValuesUnpacked;
for (int i = 0; i < unpacker_iter_count; ++i) {
in = UnpackerForWidth::unpack(in, out);
out += kValuesUnpacked;
}
batch_size -= unpacker_iter_count * kValuesUnpacked;
ARROW_DCHECK_LT(batch_size, kValuesUnpacked);
ARROW_COMPILER_ASSUME(batch_size < kValuesUnpacked);
}
batch_size -= unpacker_iter_count * kValuesUnpacked;

// Running the epilog for the remaining values that don't fit in a kernel
ARROW_DCHECK_LT(batch_size, kValuesUnpacked);
ARROW_DCHECK_GE(batch_size, 0);
ARROW_COMPILER_ASSUME(batch_size < kValuesUnpacked);
ARROW_COMPILER_ASSUME(batch_size >= 0);
unpack_exact<kPackedBitWidth, false>(in, out, batch_size, /* bit_offset= */ 0);
}
Expand Down
17 changes: 17 additions & 0 deletions cpp/src/arrow/util/bpacking_scalar.cc
Original file line number Diff line number Diff line change
Expand Up @@ -33,4 +33,21 @@ template void unpack_scalar<uint16_t>(const uint8_t*, uint16_t*, int, int, int);
template void unpack_scalar<uint32_t>(const uint8_t*, uint32_t*, int, int, int);
template void unpack_scalar<uint64_t>(const uint8_t*, uint64_t*, int, int, int);

/// Unpacker stand-in that advertises a kernel batch of zero values.
///
/// Only the `kValuesUnpacked` constant is provided; there is no `unpack`
/// member, so this type is only usable where the kernel call is compiled out
/// for a zero batch size.
template <typename UnpackedUint, int kPackedBitWidth>
struct NoOpUnpacker {
  static constexpr int kValuesUnpacked{0};
};

/// Unpack `batch_size` values by dispatching through unpack_jump with
/// NoOpUnpacker as the kernel, i.e. a kernel whose batch size is zero.
///
/// All arguments are passed through to unpack_jump unchanged.
template <typename Uint>
void unpack_naive(const uint8_t* in, Uint* out, int batch_size, int num_bits,
                  int bit_offset) {
  unpack_jump<NoOpUnpacker>(in, out, batch_size, num_bits, bit_offset);
}

// Explicit instantiations of unpack_naive, one per supported output type
// (bool and the 8/16/32/64-bit unsigned integers).
template void unpack_naive<bool>(const uint8_t*, bool*, int, int, int);
template void unpack_naive<uint8_t>(const uint8_t*, uint8_t*, int, int, int);
template void unpack_naive<uint16_t>(const uint8_t*, uint16_t*, int, int, int);
template void unpack_naive<uint32_t>(const uint8_t*, uint32_t*, int, int, int);
template void unpack_naive<uint64_t>(const uint8_t*, uint64_t*, int, int, int);

} // namespace arrow::internal
21 changes: 21 additions & 0 deletions cpp/src/arrow/util/bpacking_scalar_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,4 +44,25 @@ extern template ARROW_TEMPLATE_EXPORT void unpack_scalar<uint32_t>(
extern template ARROW_TEMPLATE_EXPORT void unpack_scalar<uint64_t>(
const uint8_t* in, uint64_t* out, int batch_size, int num_bits, int bit_offset);

/// Unpack values without an optimized batch kernel.
///
/// Takes the same parameter list as unpack_scalar.
///
/// \param in source bytes (presumably the bit-packed input -- confirm)
/// \param out destination array of unpacked values
/// \param batch_size number of values to unpack (presumably -- confirm)
/// \param num_bits bit width of each packed value (presumably -- confirm)
/// \param bit_offset starting bit offset into `in` (presumably -- confirm)
///
/// NOTE(review): this looks like a benchmark-only baseline; confirm whether it
/// should remain exported.
template <typename Uint>
ARROW_EXPORT void unpack_naive(const uint8_t* in, Uint* out, int batch_size, int num_bits,
int bit_offset);

// Explicit instantiation declarations: one per supported output type.
extern template ARROW_TEMPLATE_EXPORT void unpack_naive<bool>(const uint8_t* in,
bool* out, int batch_size,
int num_bits,
int bit_offset);

extern template ARROW_TEMPLATE_EXPORT void unpack_naive<uint8_t>(
const uint8_t* in, uint8_t* out, int batch_size, int num_bits, int bit_offset);

extern template ARROW_TEMPLATE_EXPORT void unpack_naive<uint16_t>(
const uint8_t* in, uint16_t* out, int batch_size, int num_bits, int bit_offset);

extern template ARROW_TEMPLATE_EXPORT void unpack_naive<uint32_t>(
const uint8_t* in, uint32_t* out, int batch_size, int num_bits, int bit_offset);

extern template ARROW_TEMPLATE_EXPORT void unpack_naive<uint64_t>(
const uint8_t* in, uint64_t* out, int batch_size, int num_bits, int bit_offset);

} // namespace arrow::internal
Loading
Loading