From 293d6970acfa064268929bd364a2e663baf29d11 Mon Sep 17 00:00:00 2001
From: mwish
Date: Sun, 7 Apr 2024 20:24:38 +0800
Subject: [PATCH] unpack16 using avx2 and restrict

---
 cpp/src/arrow/util/bpacking.cc                |  18 +-
 .../bpacking16_simd128_generated_internal.h   |  34 +-
 .../bpacking16_simd256_generated_internal.h   | 331 ++++++++++++++++++
 cpp/src/arrow/util/bpacking16_simd_codegen.py |   6 +-
 cpp/src/arrow/util/bpacking_avx2.cc           |   6 +
 cpp/src/arrow/util/bpacking_simd_internal.h   |   2 +-
 6 files changed, 374 insertions(+), 23 deletions(-)
 create mode 100644 cpp/src/arrow/util/bpacking16_simd256_generated_internal.h

diff --git a/cpp/src/arrow/util/bpacking.cc b/cpp/src/arrow/util/bpacking.cc
index bf904184c259c..9fa4fa5b81408 100644
--- a/cpp/src/arrow/util/bpacking.cc
+++ b/cpp/src/arrow/util/bpacking.cc
@@ -457,14 +457,28 @@ int unpack16_default(const uint8_t* in, uint16_t* out, int batch_size, int num_b
   }
   return batch_size;
 }
+
+struct Unpack16DynamicFunction {
+  using FunctionType = decltype(&unpack16_default);
+
+  static std::vector<std::pair<DispatchLevel, FunctionType>> implementations() {
+    return {{DispatchLevel::NONE, unpack16_default}
+#if defined(ARROW_HAVE_RUNTIME_AVX2)
+            ,
+            {DispatchLevel::AVX2, unpack16_avx2}
+#endif
+    };
+  }
+};
+
 }
 
 int unpack16(const uint8_t* in, uint16_t* out, int batch_size, int num_bits) {
-  // TODO: unpack16_neon, unpack16_avx2
 #if defined(ARROW_HAVE_NEON)
   return unpack16_neon(reinterpret_cast<const uint16_t*>(in), out, batch_size, num_bits);
 #else
-  return unpack16_default(in, out, batch_size, num_bits);
+  static DynamicDispatch<Unpack16DynamicFunction> dispatch;
+  return dispatch.func(in, out, batch_size, num_bits);
 #endif
 }
diff --git a/cpp/src/arrow/util/bpacking16_simd128_generated_internal.h b/cpp/src/arrow/util/bpacking16_simd128_generated_internal.h
index 93d4317846e6d..cd979e9054338 100644
--- a/cpp/src/arrow/util/bpacking16_simd128_generated_internal.h
+++ b/cpp/src/arrow/util/bpacking16_simd128_generated_internal.h
@@ -38,14 +38,14 @@ struct Unpack16Bits128 {
 
 using simd_batch = xsimd::make_sized_batch_t<uint16_t, 8>;
 
-inline static const uint16_t* unpack0_16(const uint16_t* in, uint16_t* out) {
+inline static const uint16_t* unpack0_16(const uint16_t* __restrict__ in, uint16_t* __restrict__ out) {
   memset(out, 0x0, 16 * sizeof(*out));
   out += 16;
 
   return in;
 }
 
-inline static const uint16_t* unpack1_16(const uint16_t* in, uint16_t* out) {
+inline static const uint16_t* unpack1_16(const uint16_t* __restrict__ in, uint16_t* __restrict__ out) {
   uint16_t mask = 0x1;
 
   simd_batch masks(mask);
@@ -70,7 +70,7 @@ inline static const uint16_t* unpack1_16(const uint16_t* in, uint16_t* out) {
   return in;
 }
 
-inline static const uint16_t* unpack2_16(const uint16_t* in, uint16_t* out) {
+inline static const uint16_t* unpack2_16(const uint16_t* __restrict__ in, uint16_t* __restrict__ out) {
   uint16_t mask = 0x3;
 
   simd_batch masks(mask);
@@ -95,7 +95,7 @@ inline static const uint16_t* unpack2_16(const uint16_t* in, uint16_t* out) {
   return in;
 }
 
-inline static const uint16_t* unpack3_16(const uint16_t* in, uint16_t* out) {
+inline static const uint16_t* unpack3_16(const uint16_t* __restrict__ in, uint16_t* __restrict__ out) {
   uint16_t mask = 0x7;
 
   simd_batch masks(mask);
@@ -120,7 +120,7 @@ inline static const uint16_t* unpack3_16(const uint16_t* in, uint16_t* out) {
   return in;
 }
 
-inline static const uint16_t* unpack4_16(const uint16_t* in, uint16_t* out) {
+inline static const uint16_t* unpack4_16(const uint16_t* __restrict__ in, uint16_t* __restrict__ out) {
   uint16_t mask = 0xf;
 
   simd_batch masks(mask);
@@ -145,7 +145,7 @@ inline static const uint16_t* unpack4_16(const 
uint16_t* in, uint16_t* out) { return in; } -inline static const uint16_t* unpack5_16(const uint16_t* in, uint16_t* out) { +inline static const uint16_t* unpack5_16(const uint16_t* __restrict__ in, uint16_t* __restrict__ out) { uint16_t mask = 0x1f; simd_batch masks(mask); @@ -170,7 +170,7 @@ inline static const uint16_t* unpack5_16(const uint16_t* in, uint16_t* out) { return in; } -inline static const uint16_t* unpack6_16(const uint16_t* in, uint16_t* out) { +inline static const uint16_t* unpack6_16(const uint16_t* __restrict__ in, uint16_t* __restrict__ out) { uint16_t mask = 0x3f; simd_batch masks(mask); @@ -195,7 +195,7 @@ inline static const uint16_t* unpack6_16(const uint16_t* in, uint16_t* out) { return in; } -inline static const uint16_t* unpack7_16(const uint16_t* in, uint16_t* out) { +inline static const uint16_t* unpack7_16(const uint16_t* __restrict__ in, uint16_t* __restrict__ out) { uint16_t mask = 0x7f; simd_batch masks(mask); @@ -220,7 +220,7 @@ inline static const uint16_t* unpack7_16(const uint16_t* in, uint16_t* out) { return in; } -inline static const uint16_t* unpack8_16(const uint16_t* in, uint16_t* out) { +inline static const uint16_t* unpack8_16(const uint16_t* __restrict__ in, uint16_t* __restrict__ out) { uint16_t mask = 0xff; simd_batch masks(mask); @@ -245,7 +245,7 @@ inline static const uint16_t* unpack8_16(const uint16_t* in, uint16_t* out) { return in; } -inline static const uint16_t* unpack9_16(const uint16_t* in, uint16_t* out) { +inline static const uint16_t* unpack9_16(const uint16_t* __restrict__ in, uint16_t* __restrict__ out) { uint16_t mask = 0x1ff; simd_batch masks(mask); @@ -270,7 +270,7 @@ inline static const uint16_t* unpack9_16(const uint16_t* in, uint16_t* out) { return in; } -inline static const uint16_t* unpack10_16(const uint16_t* in, uint16_t* out) { +inline static const uint16_t* unpack10_16(const uint16_t* __restrict__ in, uint16_t* __restrict__ out) { uint16_t mask = 0x3ff; simd_batch masks(mask); @@ -295,7 +295,7 @@ inline static const uint16_t* unpack10_16(const uint16_t* in, uint16_t* out) { return in; } -inline static const uint16_t* unpack11_16(const uint16_t* in, uint16_t* out) { +inline static const uint16_t* unpack11_16(const uint16_t* __restrict__ in, uint16_t* __restrict__ out) { uint16_t mask = 0x7ff; simd_batch masks(mask); @@ -320,7 +320,7 @@ inline static const uint16_t* unpack11_16(const uint16_t* in, uint16_t* out) { return in; } -inline static const uint16_t* unpack12_16(const uint16_t* in, uint16_t* out) { +inline static const uint16_t* unpack12_16(const uint16_t* __restrict__ in, uint16_t* __restrict__ out) { uint16_t mask = 0xfff; simd_batch masks(mask); @@ -345,7 +345,7 @@ inline static const uint16_t* unpack12_16(const uint16_t* in, uint16_t* out) { return in; } -inline static const uint16_t* unpack13_16(const uint16_t* in, uint16_t* out) { +inline static const uint16_t* unpack13_16(const uint16_t* __restrict__ in, uint16_t* __restrict__ out) { uint16_t mask = 0x1fff; simd_batch masks(mask); @@ -370,7 +370,7 @@ inline static const uint16_t* unpack13_16(const uint16_t* in, uint16_t* out) { return in; } -inline static const uint16_t* unpack14_16(const uint16_t* in, uint16_t* out) { +inline static const uint16_t* unpack14_16(const uint16_t* __restrict__ in, uint16_t* __restrict__ out) { uint16_t mask = 0x3fff; simd_batch masks(mask); @@ -395,7 +395,7 @@ inline static const uint16_t* unpack14_16(const uint16_t* in, uint16_t* out) { return in; } -inline static const uint16_t* unpack15_16(const uint16_t* in, uint16_t* out) 
{
+inline static const uint16_t* unpack15_16(const uint16_t* __restrict__ in, uint16_t* __restrict__ out) {
   uint16_t mask = 0x7fff;
 
   simd_batch masks(mask);
@@ -420,7 +420,7 @@ inline static const uint16_t* unpack15_16(const uint16_t* in, uint16_t* out) {
   return in;
 }
 
-inline static const uint16_t* unpack16_16(const uint16_t* in, uint16_t* out) {
+inline static const uint16_t* unpack16_16(const uint16_t* __restrict__ in, uint16_t* __restrict__ out) {
   memcpy(out, in, 16 * sizeof(*out));
   in += 16;
   out += 16;
diff --git a/cpp/src/arrow/util/bpacking16_simd256_generated_internal.h b/cpp/src/arrow/util/bpacking16_simd256_generated_internal.h
new file mode 100644
index 0000000000000..bc99fa7af6cb3
--- /dev/null
+++ b/cpp/src/arrow/util/bpacking16_simd256_generated_internal.h
@@ -0,0 +1,331 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Automatically generated file; DO NOT EDIT.
+
+#pragma once
+
+#include <cstdint>
+#include <cstring>
+
+#include <xsimd/xsimd.hpp>
+
+#include "arrow/util/dispatch.h"
+#include "arrow/util/ubsan.h"
+
+namespace arrow {
+namespace internal {
+namespace {
+
+using ::arrow::util::SafeLoad;
+
+template <DispatchLevel level>
+struct Unpack16Bits256 {
+
+using simd_batch = xsimd::make_sized_batch_t<uint16_t, 16>;
+
+inline static const uint16_t* unpack0_16(const uint16_t* __restrict__ in, uint16_t* __restrict__ out) {
+  memset(out, 0x0, 16 * sizeof(*out));
+  out += 16;
+
+  return in;
+}
+
+inline static const uint16_t* unpack1_16(const uint16_t* __restrict__ in, uint16_t* __restrict__ out) {
+  uint16_t mask = 0x1;
+
+  simd_batch masks(mask);
+  simd_batch words, shifts;
+  simd_batch results;
+
+  // extract 1-bit bundles 0 to 15
+  words = simd_batch{ SafeLoad<uint16_t>(in + 0), SafeLoad<uint16_t>(in + 0), SafeLoad<uint16_t>(in + 0), SafeLoad<uint16_t>(in + 0), SafeLoad<uint16_t>(in + 0), SafeLoad<uint16_t>(in + 0), SafeLoad<uint16_t>(in + 0), SafeLoad<uint16_t>(in + 0), SafeLoad<uint16_t>(in + 0), SafeLoad<uint16_t>(in + 0), SafeLoad<uint16_t>(in + 0), SafeLoad<uint16_t>(in + 0), SafeLoad<uint16_t>(in + 0), SafeLoad<uint16_t>(in + 0), SafeLoad<uint16_t>(in + 0), SafeLoad<uint16_t>(in + 0) };
+  shifts = simd_batch{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
+  results = (words >> shifts) & masks;
+  results.store_unaligned(out);
+  out += 16;
+
+  in += 1;
+  return in;
+}
+
+inline static const uint16_t* unpack2_16(const uint16_t* __restrict__ in, uint16_t* __restrict__ out) {
+  uint16_t mask = 0x3;
+
+  simd_batch masks(mask);
+  simd_batch words, shifts;
+  simd_batch results;
+
+  // extract 2-bit bundles 0 to 15
+  words = simd_batch{ SafeLoad<uint16_t>(in + 0), SafeLoad<uint16_t>(in + 0), SafeLoad<uint16_t>(in + 0), SafeLoad<uint16_t>(in + 0), SafeLoad<uint16_t>(in + 0), SafeLoad<uint16_t>(in + 0), SafeLoad<uint16_t>(in + 0), SafeLoad<uint16_t>(in + 0), SafeLoad<uint16_t>(in + 1), SafeLoad<uint16_t>(in + 1), SafeLoad<uint16_t>(in + 1), SafeLoad<uint16_t>(in + 1), SafeLoad<uint16_t>(in + 1), SafeLoad<uint16_t>(in + 1), SafeLoad<uint16_t>(in + 1), SafeLoad<uint16_t>(in + 1) };
+  shifts = simd_batch{ 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14 };
+  results = (words >> shifts) & masks;
+  
results.store_unaligned(out); + out += 16; + + in += 2; + return in; +} + +inline static const uint16_t* unpack3_16(const uint16_t* __restrict__ in, uint16_t* __restrict__ out) { + uint16_t mask = 0x7; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 3-bit bundles 0 to 15 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), static_cast(SafeLoad(in + 0) >> 15 | SafeLoad(in + 1) << 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), static_cast(SafeLoad(in + 1) >> 14 | SafeLoad(in + 2) << 2), SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2) }; + shifts = simd_batch{ 0, 3, 6, 9, 12, 0, 2, 5, 8, 11, 0, 1, 4, 7, 10, 13 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 16; + + in += 3; + return in; +} + +inline static const uint16_t* unpack4_16(const uint16_t* __restrict__ in, uint16_t* __restrict__ out) { + uint16_t mask = 0xf; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 4-bit bundles 0 to 15 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3) }; + shifts = simd_batch{ 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 16; + + in += 4; + return in; +} + +inline static const uint16_t* unpack5_16(const uint16_t* __restrict__ in, uint16_t* __restrict__ out) { + uint16_t mask = 0x1f; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 5-bit bundles 0 to 15 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), static_cast(SafeLoad(in + 0) >> 15 | SafeLoad(in + 1) << 1), SafeLoad(in + 1), SafeLoad(in + 1), static_cast(SafeLoad(in + 1) >> 14 | SafeLoad(in + 2) << 2), SafeLoad(in + 2), SafeLoad(in + 2), static_cast(SafeLoad(in + 2) >> 13 | SafeLoad(in + 3) << 3), SafeLoad(in + 3), SafeLoad(in + 3), static_cast(SafeLoad(in + 3) >> 12 | SafeLoad(in + 4) << 4), SafeLoad(in + 4), SafeLoad(in + 4), SafeLoad(in + 4) }; + shifts = simd_batch{ 0, 5, 10, 0, 4, 9, 0, 3, 8, 0, 2, 7, 0, 1, 6, 11 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 16; + + in += 5; + return in; +} + +inline static const uint16_t* unpack6_16(const uint16_t* __restrict__ in, uint16_t* __restrict__ out) { + uint16_t mask = 0x3f; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 6-bit bundles 0 to 15 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), static_cast(SafeLoad(in + 0) >> 12 | SafeLoad(in + 1) << 4), SafeLoad(in + 1), SafeLoad(in + 1), static_cast(SafeLoad(in + 1) >> 14 | SafeLoad(in + 2) << 2), SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 3), SafeLoad(in + 3), static_cast(SafeLoad(in + 3) >> 12 | SafeLoad(in + 4) << 4), SafeLoad(in + 4), SafeLoad(in + 4), static_cast(SafeLoad(in + 4) >> 14 | SafeLoad(in + 5) << 2), SafeLoad(in + 5), SafeLoad(in + 5) }; + shifts = simd_batch{ 0, 6, 0, 2, 8, 0, 4, 10, 0, 6, 0, 2, 8, 0, 4, 10 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 16; + + in += 6; + return in; +} + +inline static const uint16_t* unpack7_16(const uint16_t* 
__restrict__ in, uint16_t* __restrict__ out) { + uint16_t mask = 0x7f; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 7-bit bundles 0 to 15 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), static_cast(SafeLoad(in + 0) >> 14 | SafeLoad(in + 1) << 2), SafeLoad(in + 1), static_cast(SafeLoad(in + 1) >> 12 | SafeLoad(in + 2) << 4), SafeLoad(in + 2), static_cast(SafeLoad(in + 2) >> 10 | SafeLoad(in + 3) << 6), SafeLoad(in + 3), SafeLoad(in + 3), static_cast(SafeLoad(in + 3) >> 15 | SafeLoad(in + 4) << 1), SafeLoad(in + 4), static_cast(SafeLoad(in + 4) >> 13 | SafeLoad(in + 5) << 3), SafeLoad(in + 5), static_cast(SafeLoad(in + 5) >> 11 | SafeLoad(in + 6) << 5), SafeLoad(in + 6), SafeLoad(in + 6) }; + shifts = simd_batch{ 0, 7, 0, 5, 0, 3, 0, 1, 8, 0, 6, 0, 4, 0, 2, 9 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 16; + + in += 7; + return in; +} + +inline static const uint16_t* unpack8_16(const uint16_t* __restrict__ in, uint16_t* __restrict__ out) { + uint16_t mask = 0xff; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 8-bit bundles 0 to 15 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 4), SafeLoad(in + 4), SafeLoad(in + 5), SafeLoad(in + 5), SafeLoad(in + 6), SafeLoad(in + 6), SafeLoad(in + 7), SafeLoad(in + 7) }; + shifts = simd_batch{ 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 16; + + in += 8; + return in; +} + +inline static const uint16_t* unpack9_16(const uint16_t* __restrict__ in, uint16_t* __restrict__ out) { + uint16_t mask = 0x1ff; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 9-bit bundles 0 to 15 + words = simd_batch{ SafeLoad(in + 0), static_cast(SafeLoad(in + 0) >> 9 | SafeLoad(in + 1) << 7), SafeLoad(in + 1), static_cast(SafeLoad(in + 1) >> 11 | SafeLoad(in + 2) << 5), SafeLoad(in + 2), static_cast(SafeLoad(in + 2) >> 13 | SafeLoad(in + 3) << 3), SafeLoad(in + 3), static_cast(SafeLoad(in + 3) >> 15 | SafeLoad(in + 4) << 1), static_cast(SafeLoad(in + 4) >> 8 | SafeLoad(in + 5) << 8), SafeLoad(in + 5), static_cast(SafeLoad(in + 5) >> 10 | SafeLoad(in + 6) << 6), SafeLoad(in + 6), static_cast(SafeLoad(in + 6) >> 12 | SafeLoad(in + 7) << 4), SafeLoad(in + 7), static_cast(SafeLoad(in + 7) >> 14 | SafeLoad(in + 8) << 2), SafeLoad(in + 8) }; + shifts = simd_batch{ 0, 0, 2, 0, 4, 0, 6, 0, 0, 1, 0, 3, 0, 5, 0, 7 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 16; + + in += 9; + return in; +} + +inline static const uint16_t* unpack10_16(const uint16_t* __restrict__ in, uint16_t* __restrict__ out) { + uint16_t mask = 0x3ff; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 10-bit bundles 0 to 15 + words = simd_batch{ SafeLoad(in + 0), static_cast(SafeLoad(in + 0) >> 10 | SafeLoad(in + 1) << 6), SafeLoad(in + 1), static_cast(SafeLoad(in + 1) >> 14 | SafeLoad(in + 2) << 2), static_cast(SafeLoad(in + 2) >> 8 | SafeLoad(in + 3) << 8), SafeLoad(in + 3), static_cast(SafeLoad(in + 3) >> 12 | SafeLoad(in + 4) << 4), SafeLoad(in + 4), SafeLoad(in + 5), static_cast(SafeLoad(in + 5) >> 10 | SafeLoad(in + 6) << 6), SafeLoad(in + 6), static_cast(SafeLoad(in + 6) >> 14 | SafeLoad(in + 7) << 2), static_cast(SafeLoad(in + 7) >> 8 | 
SafeLoad(in + 8) << 8), SafeLoad(in + 8), static_cast(SafeLoad(in + 8) >> 12 | SafeLoad(in + 9) << 4), SafeLoad(in + 9) }; + shifts = simd_batch{ 0, 0, 4, 0, 0, 2, 0, 6, 0, 0, 4, 0, 0, 2, 0, 6 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 16; + + in += 10; + return in; +} + +inline static const uint16_t* unpack11_16(const uint16_t* __restrict__ in, uint16_t* __restrict__ out) { + uint16_t mask = 0x7ff; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 11-bit bundles 0 to 15 + words = simd_batch{ SafeLoad(in + 0), static_cast(SafeLoad(in + 0) >> 11 | SafeLoad(in + 1) << 5), static_cast(SafeLoad(in + 1) >> 6 | SafeLoad(in + 2) << 10), SafeLoad(in + 2), static_cast(SafeLoad(in + 2) >> 12 | SafeLoad(in + 3) << 4), static_cast(SafeLoad(in + 3) >> 7 | SafeLoad(in + 4) << 9), SafeLoad(in + 4), static_cast(SafeLoad(in + 4) >> 13 | SafeLoad(in + 5) << 3), static_cast(SafeLoad(in + 5) >> 8 | SafeLoad(in + 6) << 8), SafeLoad(in + 6), static_cast(SafeLoad(in + 6) >> 14 | SafeLoad(in + 7) << 2), static_cast(SafeLoad(in + 7) >> 9 | SafeLoad(in + 8) << 7), SafeLoad(in + 8), static_cast(SafeLoad(in + 8) >> 15 | SafeLoad(in + 9) << 1), static_cast(SafeLoad(in + 9) >> 10 | SafeLoad(in + 10) << 6), SafeLoad(in + 10) }; + shifts = simd_batch{ 0, 0, 0, 1, 0, 0, 2, 0, 0, 3, 0, 0, 4, 0, 0, 5 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 16; + + in += 11; + return in; +} + +inline static const uint16_t* unpack12_16(const uint16_t* __restrict__ in, uint16_t* __restrict__ out) { + uint16_t mask = 0xfff; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 12-bit bundles 0 to 15 + words = simd_batch{ SafeLoad(in + 0), static_cast(SafeLoad(in + 0) >> 12 | SafeLoad(in + 1) << 4), static_cast(SafeLoad(in + 1) >> 8 | SafeLoad(in + 2) << 8), SafeLoad(in + 2), SafeLoad(in + 3), static_cast(SafeLoad(in + 3) >> 12 | SafeLoad(in + 4) << 4), static_cast(SafeLoad(in + 4) >> 8 | SafeLoad(in + 5) << 8), SafeLoad(in + 5), SafeLoad(in + 6), static_cast(SafeLoad(in + 6) >> 12 | SafeLoad(in + 7) << 4), static_cast(SafeLoad(in + 7) >> 8 | SafeLoad(in + 8) << 8), SafeLoad(in + 8), SafeLoad(in + 9), static_cast(SafeLoad(in + 9) >> 12 | SafeLoad(in + 10) << 4), static_cast(SafeLoad(in + 10) >> 8 | SafeLoad(in + 11) << 8), SafeLoad(in + 11) }; + shifts = simd_batch{ 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 16; + + in += 12; + return in; +} + +inline static const uint16_t* unpack13_16(const uint16_t* __restrict__ in, uint16_t* __restrict__ out) { + uint16_t mask = 0x1fff; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 13-bit bundles 0 to 15 + words = simd_batch{ SafeLoad(in + 0), static_cast(SafeLoad(in + 0) >> 13 | SafeLoad(in + 1) << 3), static_cast(SafeLoad(in + 1) >> 10 | SafeLoad(in + 2) << 6), static_cast(SafeLoad(in + 2) >> 7 | SafeLoad(in + 3) << 9), static_cast(SafeLoad(in + 3) >> 4 | SafeLoad(in + 4) << 12), SafeLoad(in + 4), static_cast(SafeLoad(in + 4) >> 14 | SafeLoad(in + 5) << 2), static_cast(SafeLoad(in + 5) >> 11 | SafeLoad(in + 6) << 5), static_cast(SafeLoad(in + 6) >> 8 | SafeLoad(in + 7) << 8), static_cast(SafeLoad(in + 7) >> 5 | SafeLoad(in + 8) << 11), SafeLoad(in + 8), static_cast(SafeLoad(in + 8) >> 15 | SafeLoad(in + 9) << 1), static_cast(SafeLoad(in + 9) >> 12 | SafeLoad(in + 10) << 4), static_cast(SafeLoad(in + 10) >> 9 | 
SafeLoad(in + 11) << 7), static_cast(SafeLoad(in + 11) >> 6 | SafeLoad(in + 12) << 10), SafeLoad(in + 12) }; + shifts = simd_batch{ 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 2, 0, 0, 0, 0, 3 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 16; + + in += 13; + return in; +} + +inline static const uint16_t* unpack14_16(const uint16_t* __restrict__ in, uint16_t* __restrict__ out) { + uint16_t mask = 0x3fff; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 14-bit bundles 0 to 15 + words = simd_batch{ SafeLoad(in + 0), static_cast(SafeLoad(in + 0) >> 14 | SafeLoad(in + 1) << 2), static_cast(SafeLoad(in + 1) >> 12 | SafeLoad(in + 2) << 4), static_cast(SafeLoad(in + 2) >> 10 | SafeLoad(in + 3) << 6), static_cast(SafeLoad(in + 3) >> 8 | SafeLoad(in + 4) << 8), static_cast(SafeLoad(in + 4) >> 6 | SafeLoad(in + 5) << 10), static_cast(SafeLoad(in + 5) >> 4 | SafeLoad(in + 6) << 12), SafeLoad(in + 6), SafeLoad(in + 7), static_cast(SafeLoad(in + 7) >> 14 | SafeLoad(in + 8) << 2), static_cast(SafeLoad(in + 8) >> 12 | SafeLoad(in + 9) << 4), static_cast(SafeLoad(in + 9) >> 10 | SafeLoad(in + 10) << 6), static_cast(SafeLoad(in + 10) >> 8 | SafeLoad(in + 11) << 8), static_cast(SafeLoad(in + 11) >> 6 | SafeLoad(in + 12) << 10), static_cast(SafeLoad(in + 12) >> 4 | SafeLoad(in + 13) << 12), SafeLoad(in + 13) }; + shifts = simd_batch{ 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 2 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 16; + + in += 14; + return in; +} + +inline static const uint16_t* unpack15_16(const uint16_t* __restrict__ in, uint16_t* __restrict__ out) { + uint16_t mask = 0x7fff; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 15-bit bundles 0 to 15 + words = simd_batch{ SafeLoad(in + 0), static_cast(SafeLoad(in + 0) >> 15 | SafeLoad(in + 1) << 1), static_cast(SafeLoad(in + 1) >> 14 | SafeLoad(in + 2) << 2), static_cast(SafeLoad(in + 2) >> 13 | SafeLoad(in + 3) << 3), static_cast(SafeLoad(in + 3) >> 12 | SafeLoad(in + 4) << 4), static_cast(SafeLoad(in + 4) >> 11 | SafeLoad(in + 5) << 5), static_cast(SafeLoad(in + 5) >> 10 | SafeLoad(in + 6) << 6), static_cast(SafeLoad(in + 6) >> 9 | SafeLoad(in + 7) << 7), static_cast(SafeLoad(in + 7) >> 8 | SafeLoad(in + 8) << 8), static_cast(SafeLoad(in + 8) >> 7 | SafeLoad(in + 9) << 9), static_cast(SafeLoad(in + 9) >> 6 | SafeLoad(in + 10) << 10), static_cast(SafeLoad(in + 10) >> 5 | SafeLoad(in + 11) << 11), static_cast(SafeLoad(in + 11) >> 4 | SafeLoad(in + 12) << 12), static_cast(SafeLoad(in + 12) >> 3 | SafeLoad(in + 13) << 13), static_cast(SafeLoad(in + 13) >> 2 | SafeLoad(in + 14) << 14), SafeLoad(in + 14) }; + shifts = simd_batch{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 16; + + in += 15; + return in; +} + +inline static const uint16_t* unpack16_16(const uint16_t* __restrict__ in, uint16_t* __restrict__ out) { + memcpy(out, in, 16 * sizeof(*out)); + in += 16; + out += 16; + + return in; +} + +}; // struct Unpack16Bits256 + +} // namespace +} // namespace internal +} // namespace arrow + diff --git a/cpp/src/arrow/util/bpacking16_simd_codegen.py b/cpp/src/arrow/util/bpacking16_simd_codegen.py index ee0a1efd3bcac..7d9abd88be19d 100644 --- a/cpp/src/arrow/util/bpacking16_simd_codegen.py +++ b/cpp/src/arrow/util/bpacking16_simd_codegen.py @@ -37,7 +37,7 @@ def __init__(self, simd_width): def print_unpack_bit0_func(self): 
print(
-            "inline static const uint16_t* unpack0_16(const uint16_t* in, uint16_t* out) {")
+            "inline static const uint16_t* unpack0_16(const uint16_t* __restrict__ in, uint16_t* __restrict__ out) {")
         print("  memset(out, 0x0, 16 * sizeof(*out));")
         print("  out += 16;")
         print("")
@@ -47,7 +47,7 @@ def print_unpack_bit0_func(self):
 
     def print_unpack_bit16_func(self):
         print(
-            "inline static const uint16_t* unpack16_16(const uint16_t* in, uint16_t* out) {")
+            "inline static const uint16_t* unpack16_16(const uint16_t* __restrict__ in, uint16_t* __restrict__ out) {")
         print("  memcpy(out, in, 16 * sizeof(*out));")
         print("  in += 16;")
         print("  out += 16;")
@@ -66,7 +66,7 @@ def p(code):
         mask = (1 << bit) - 1
         bracket = "{"
 
-        print(f"inline static const uint16_t* unpack{bit}_16(const uint16_t* in, uint16_t* out) {{")
+        print(f"inline static const uint16_t* unpack{bit}_16(const uint16_t* __restrict__ in, uint16_t* __restrict__ out) {{")
 
         p(dedent(f"""\
             uint16_t mask = 0x{mask:0x};
 
diff --git a/cpp/src/arrow/util/bpacking_avx2.cc b/cpp/src/arrow/util/bpacking_avx2.cc
index 9105aaa2af411..6258b4e797909 100644
--- a/cpp/src/arrow/util/bpacking_avx2.cc
+++ b/cpp/src/arrow/util/bpacking_avx2.cc
@@ -18,6 +18,7 @@
 #include "arrow/util/bpacking_avx2.h"
 #include "arrow/util/bpacking_simd256_generated_internal.h"
 #include "arrow/util/bpacking_simd_internal.h"
+#include "arrow/util/bpacking16_simd256_generated_internal.h"
 
 namespace arrow {
 namespace internal {
@@ -27,5 +28,10 @@ int unpack32_avx2(const uint32_t* in, uint32_t* out, int batch_size, int num_bit
                                                                   num_bits);
 }
 
+int unpack16_avx2(const uint16_t* in, uint16_t* out, int batch_size, int num_bits) {
+  return unpack16_specialized<Unpack16Bits256<DispatchLevel::AVX2>>(in, out, batch_size,
+                                                                    num_bits);
+}
+
 }  // namespace internal
 }  // namespace arrow
diff --git a/cpp/src/arrow/util/bpacking_simd_internal.h b/cpp/src/arrow/util/bpacking_simd_internal.h
index f34e177a319ce..702336b4a559f 100644
--- a/cpp/src/arrow/util/bpacking_simd_internal.h
+++ b/cpp/src/arrow/util/bpacking_simd_internal.h
@@ -135,7 +135,7 @@ static int unpack32_specialized(const uint32_t* in, uint32_t* out, int batch_siz
 }
 
 template <typename UnpackBits>
-static int unpack16_specialized(const uint16_t* in, uint16_t* out, int batch_size,
+static int unpack16_specialized(const uint16_t* __restrict__ in, uint16_t* __restrict__ out, int batch_size,
                                 int num_bits) {
   batch_size = batch_size / 16 * 16;
   int num_loops = batch_size / 16;
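
Note (illustrative only, not part of the patch to apply): every generated unpack{N}_16 kernel above implements the same shift-and-mask scheme, merely expressed as 16-lane xsimd batches. A scalar sketch of that scheme follows, for 1 <= num_bits <= 16; the helper name is hypothetical, and plain indexed loads stand in for arrow::util::SafeLoad, which the real code uses for unaligned reads.

// Scalar reference for the bit-unpacking the SIMD kernels perform:
// read 16 values of num_bits bits each from little-endian packed uint16_t
// words, splicing values that straddle a word boundary from two adjacent words.
#include <cstdint>

inline const uint16_t* unpack16_scalar_reference(const uint16_t* in, uint16_t* out,
                                                 int num_bits) {
  const uint16_t mask = static_cast<uint16_t>((1u << num_bits) - 1);
  int bit_offset = 0;  // absolute bit position within the packed input
  for (int i = 0; i < 16; ++i) {
    uint32_t word = in[bit_offset / 16];
    const int shift = bit_offset % 16;
    if (shift + num_bits > 16) {
      // value crosses a word boundary: splice the low bits of the next word on top
      word |= static_cast<uint32_t>(in[bit_offset / 16 + 1]) << 16;
    }
    out[i] = static_cast<uint16_t>((word >> shift) & mask);
    bit_offset += num_bits;
  }
  return in + num_bits;  // 16 values of num_bits bits occupy num_bits uint16 words
}

The __restrict__ qualifiers added throughout tell the compiler that in and out never alias, so loaded words can stay in registers across the stores to out.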