@@ -40,6 +40,8 @@ namespace arrow::internal {
4040// - Shifts per swizzle can be improved when self.packed_max_byte_spread == 1 and the
4141// byte can be reused (when val_bit_width divides packed_max_byte_spread).
4242// - Try for uint16_t and uint8_t and bool (currently copy)
43+ // - Add unpack_exact to benchmarks
44+ // - Reduce input size on small bit width using a broadcast.
4345// - For Avx2:
4446// - Inspect how swizzles across lanes are handled: _mm256_shuffle_epi8 not used?
4547// - Investigate AVX2 with 128 bit register
@@ -287,13 +289,14 @@ constexpr auto make_batch_constant() {
287289// Intel x86-64 does not have variable left shifts before AVX2.
288290//
289291// We replace the variable left shift by a variable multiply with a power of two.
292+ // The behaviour is the same as long as there is no overflow.
290293//
291294// This trick is borrowed from Daniel Lemire and Leonid Boytsov, Decoding billions of
292295// integers per second through vectorization, Software Practice & Experience 45 (1), 2015.
293296// http://arxiv.org/abs/1209.2137
294297template <typename Arch, typename Int, Int... kShifts >
295- auto left_shift (const xsimd::batch<Int, Arch>& batch,
296- xsimd::batch_constant<Int, Arch, kShifts ...> shifts) {
298+ auto left_shift_no_overflow (const xsimd::batch<Int, Arch>& batch,
299+ xsimd::batch_constant<Int, Arch, kShifts ...> shifts) {
297300 constexpr bool kHasSse2 = xsimd::supported_architectures::contains<xsimd::sse2>();
298301 constexpr bool kHasAvx2 = xsimd::supported_architectures::contains<xsimd::avx2>();
299302
@@ -324,8 +327,8 @@ auto left_shift(const xsimd::batch<Int, Arch>& batch,
324327// integers per second through vectorization, Software Practice & Experience 45 (1), 2015.
325328// http://arxiv.org/abs/1209.2137
326329template <typename Arch, typename Int, Int... kShifts >
327- auto overflow_right_shift (const xsimd::batch<Int, Arch>& batch,
328- xsimd::batch_constant<Int, Arch, kShifts ...> shifts) {
330+ auto right_shift_by_excess (const xsimd::batch<Int, Arch>& batch,
331+ xsimd::batch_constant<Int, Arch, kShifts ...> shifts) {
329332 constexpr bool kHasSse2 = xsimd::supported_architectures::contains<xsimd::sse2>();
330333 constexpr bool kHasAvx2 = xsimd::supported_architectures::contains<xsimd::avx2>();
331334
@@ -375,7 +378,7 @@ struct MediumKernel {
375378 // Intel x86-64 does not have variable right shifts before AVX2.
376379 // We know the packed value can safely be left shifted up to the largest offset so we
377380 // can use the fallback on these platforms.
378- const auto shifted = overflow_right_shift (words, kRightShifts );
381+ const auto shifted = right_shift_by_excess (words, kRightShifts );
379382 const auto vals = shifted & kMask ;
380383 xsimd::store_unaligned (out + kOutOffset , vals);
381384 }
@@ -515,7 +518,7 @@ struct LargeKernel {
515518 // Intel x86-64 does not have variable right shifts before AVX2.
516519 // We know the packed value can safely be left shifted up to the largest offset so we
517520 // can use the fallback on these platforms.
518- const auto shifted = overflow_right_shift (words, kRightShifts );
521+ const auto shifted = right_shift_by_excess (words, kRightShifts );
519522 const auto vals = shifted & kMask ;
520523 xsimd::store_unaligned (out + kOutOffset , vals);
521524 }
@@ -547,12 +550,12 @@ struct LargeKernel {
547550
548551 const auto low_swizzled = xsimd::swizzle (bytes, kLowSwizzles );
549552 const auto low_words = xsimd::bitwise_cast<unpacked_type>(low_swizzled);
550- const auto low_shifted = overflow_right_shift (low_words, kLowRShifts );
553+ const auto low_shifted = right_shift_by_excess (low_words, kLowRShifts );
551554 const auto low_half_vals = low_shifted & kPlan .low_mask ;
552555
553556 const auto high_swizzled = xsimd::swizzle (bytes, kHighSwizzles );
554557 const auto high_words = xsimd::bitwise_cast<unpacked_type>(high_swizzled);
555- const auto high_shifted = left_shift (high_words, kHighLShifts );
558+ const auto high_shifted = left_shift_no_overflow (high_words, kHighLShifts );
556559 const auto high_half_vals = high_shifted & kPlan .high_mask ;
557560
558561 const auto vals = low_half_vals | high_half_vals;
0 commit comments