From e4cd07547c8d90ea95e59ecb94099613fe04a8a1 Mon Sep 17 00:00:00 2001 From: zuyu <zuyu@users.noreply.github.com> Date: Sun, 8 Dec 2024 16:18:58 -0800 Subject: [PATCH] refactor: IntDecoder --- velox/dwio/common/IntCodecCommon.h | 1 + velox/dwio/common/IntDecoder.h | 90 ++++++++++++++---------------- 2 files changed, 43 insertions(+), 48 deletions(-) diff --git a/velox/dwio/common/IntCodecCommon.h b/velox/dwio/common/IntCodecCommon.h index a83f663689fb..f17cc12287b7 100644 --- a/velox/dwio/common/IntCodecCommon.h +++ b/velox/dwio/common/IntCodecCommon.h @@ -26,6 +26,7 @@ constexpr uint32_t LONG_BYTE_SIZE = 8; constexpr uint64_t BASE_128_MASK = 0x7f; constexpr uint64_t BASE_256_MASK = 0xff; +constexpr __int128_t INT128_BASE_128_MASK = 0x7f; constexpr __int128_t INT128_BASE_256_MASK = 0xff; // Timezone constants diff --git a/velox/dwio/common/IntDecoder.h b/velox/dwio/common/IntDecoder.h index 79f7bb8d89e5..012845bb53bb 100644 --- a/velox/dwio/common/IntDecoder.h +++ b/velox/dwio/common/IntDecoder.h @@ -293,7 +293,7 @@ FOLLY_ALWAYS_INLINE uint64_t IntDecoder<isSigned>::readVuLong() { } int64_t result = 0; - int64_t offset = 0; + uint32_t offset = 0; signed char ch; do { ch = readByte(); @@ -311,42 +311,44 @@ FOLLY_ALWAYS_INLINE int64_t IntDecoder<isSigned>::readVsLong() { template <bool isSigned> inline int64_t IntDecoder<isSigned>::readLongLE() { VELOX_DCHECK_EQ(pendingSkip_, 0); - int64_t result = 0; - if (bufferStart_ && bufferStart_ + sizeof(int64_t) <= bufferEnd_) { + if (bufferStart_ && bufferStart_ + numBytes_ <= bufferEnd_) { bufferStart_ += numBytes_; if (numBytes_ == 8) { return *reinterpret_cast<const int64_t*>(bufferStart_ - 8); } - if (numBytes_ == 4) { - if (isSigned) { + + if constexpr (isSigned) { + if (numBytes_ == 4) { return *reinterpret_cast<const int32_t*>(bufferStart_ - 4); } - return *reinterpret_cast<const uint32_t*>(bufferStart_ - 4); - } - if (isSigned) { return *reinterpret_cast<const int16_t*>(bufferStart_ - 2); + } else { + if (numBytes_ == 4) { + return *reinterpret_cast<const uint32_t*>(bufferStart_ - 4); + } + return *reinterpret_cast<const uint16_t*>(bufferStart_ - 2); } - return *reinterpret_cast<const uint16_t*>(bufferStart_ - 2); } - char b; - int64_t offset = 0; - for (uint32_t i = 0; i < numBytes_; ++i) { - b = readByte(); + int64_t result = 0; + for (uint32_t offset = 0; offset < numBytes_ * 8; offset += 8) { + const char b = readByte(); result |= (b & BASE_256_MASK) << offset; - offset += 8; } - if (isSigned && numBytes_ < 8) { - if (numBytes_ == 2) { - return static_cast<int16_t>(result); + if constexpr (isSigned) { + if (numBytes_ >= 8) { + return result; } if (numBytes_ == 4) { return static_cast<int32_t>(result); } - VELOX_DCHECK(false, "Bad width for signed fixed width: {}", numBytes_); + VELOX_DCHECK( + numBytes_ == 2, "Bad width for signed fixed width: {}", numBytes_); + return static_cast<int16_t>(result); + } else { + return result; } - return result; } template <bool isSigned> @@ -356,7 +358,7 @@ inline cppType IntDecoder<isSigned>::readLittleEndianFromBigEndian() { cppType bigEndianValue = 0; // Input is in Big Endian layout of size numBytes. - if (bufferStart_ && (bufferStart_ + sizeof(int64_t) <= bufferEnd_)) { + if (bufferStart_ && (bufferStart_ + numBytes_ <= bufferEnd_)) { bufferStart_ += numBytes_; const auto valueOffset = bufferStart_ - numBytes_; // Use first byte to initialize bigEndianValue. @@ -376,18 +378,15 @@ inline cppType IntDecoder<isSigned>::readLittleEndianFromBigEndian() { } } - char b; - cppType offset = 0; - cppType numBytesBigEndian = 0; // Read numBytes input into numBytesBigEndian. - for (uint32_t i = 0; i < numBytes_; ++i) { - b = readByte(); + cppType numBytesBigEndian = 0; + for (uint32_t offset = 0; offset < numBytes_ * 8; offset += 8) { + const char b = readByte(); if constexpr (sizeof(cppType) == 16) { numBytesBigEndian |= (b & INT128_BASE_256_MASK) << offset; } else { numBytesBigEndian |= (b & BASE_256_MASK) << offset; } - offset += 8; } // Use first byte to initialize bigEndianValue. bigEndianValue = @@ -415,19 +414,13 @@ inline uint128_t IntDecoder<isSigned>::readVuHugeInt() { VELOX_DCHECK_EQ(pendingSkip_, 0); uint128_t value = 0; - uint128_t work; uint32_t offset = 0; signed char ch; - while (true) { + do { ch = readByte(); - work = ch & 0x7f; - work <<= offset; - value |= work; + value |= (ch & INT128_BASE_128_MASK) << offset; offset += 7; - if (!(ch & 0x80)) { - break; - } - } + } while (ch < 0); return value; } @@ -439,27 +432,28 @@ inline T IntDecoder<isSigned>::readInt() { } if (bigEndian_) { return readLittleEndianFromBigEndian<T>(); - } else { - if constexpr (std::is_same_v<T, int128_t>) { - if (numBytes_ == 8) { - return readLongLE(); - } - if (numBytes_ == 12) { - VELOX_DCHECK(!useVInts_, "Int96 should not be VInt encoded."); - return readInt96(); - } - VELOX_NYI(); + } + if constexpr (std::is_same_v<T, int128_t>) { + if (numBytes_ == 8) { + return readLongLE(); } + if (numBytes_ == 12) { + return readInt96(); + } + VELOX_NYI(); + } else { return readLongLE(); } } template <bool isSigned> inline int128_t IntDecoder<isSigned>::readInt96() { + VELOX_DCHECK(!useVInts_, "Int96 should not be VInt encoded."); + int128_t result = 0; - for (int i = 0; i < 12; ++i) { - auto ch = readByte(); - result |= static_cast<uint128_t>(ch & BASE_256_MASK) << (i * 8); + for (uint32_t offset = 0; offset < 96; offset += 8) { + const auto ch = readByte(); + result |= (ch & INT128_BASE_256_MASK) << offset; } return result; }