From 44cdc9e5e20b9ca122bfb932f8e5d0308a37cb36 Mon Sep 17 00:00:00 2001 From: icejoywoo Date: Thu, 18 Jan 2024 12:39:13 -0800 Subject: [PATCH] Fix simd::gatherBits for Mac M1 or when AVX2 is disabled (#8415) Summary: Fix https://github.com/facebookincubator/velox/issues/8377 When AVX2 is disabled, or when running on Mac M1 (arm64), simd::gatherBits produces incorrect results. The fix is adapted from DecoderUtil::nonNullRowsFromSparse: the loop previously advanced by a hard-coded 8 indices per iteration; it now advances by the xsimd batch size (kStep) and writes each group of gathered bits with bits::storeBitsToByte. Pull Request resolved: https://github.com/facebookincubator/velox/pull/8415 Reviewed By: xiaoxmeng Differential Revision: D52873543 Pulled By: Yuhta fbshipit-source-id: ce8cbeb2069a809410b7a259e05285be2e1a70b5 --- velox/common/base/SimdUtil.cpp | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/velox/common/base/SimdUtil.cpp b/velox/common/base/SimdUtil.cpp index 6b84435dd893..03576ac31ec4 100644 --- a/velox/common/base/SimdUtil.cpp +++ b/velox/common/base/SimdUtil.cpp @@ -23,6 +23,7 @@ void gatherBits( const uint64_t* bits, folly::Range indexRange, uint64_t* result) { + constexpr int32_t kStep = xsimd::batch::size; const auto size = indexRange.size(); auto indices = indexRange.data(); uint8_t* resultPtr = reinterpret_cast(result); @@ -37,14 +38,16 @@ void gatherBits( } int32_t i = 0; - for (; i + 8 < size; i += 8) { - *(resultPtr++) = - simd::gather8Bits(bits, xsimd::load_unaligned(indices + i), 8); + for (; i + kStep < size; i += kStep) { + uint16_t flags = + simd::gather8Bits(bits, xsimd::load_unaligned(indices + i), kStep); + bits::storeBitsToByte(flags, resultPtr, i); } const auto bitsLeft = size - i; if (bitsLeft > 0) { - *resultPtr = + uint16_t flags = simd::gather8Bits(bits, xsimd::load_unaligned(indices + i), bitsLeft); + bits::storeBitsToByte(flags, resultPtr, i); } }