From 83cfc69677f1d585a82200aa17957b14a3d0048e Mon Sep 17 00:00:00 2001 From: Joe Abraham Date: Wed, 2 Oct 2024 22:47:05 +0530 Subject: [PATCH 1/7] Modify the variable names for better readability --- velox/common/encode/Base64.cpp | 326 +++++++++++++++++---------------- velox/common/encode/Base64.h | 144 ++++++--------- 2 files changed, 227 insertions(+), 243 deletions(-) diff --git a/velox/common/encode/Base64.cpp b/velox/common/encode/Base64.cpp index da4e9cdbfcfd..82244c106a4a 100644 --- a/velox/common/encode/Base64.cpp +++ b/velox/common/encode/Base64.cpp @@ -89,11 +89,11 @@ constexpr const Base64::ReverseIndex kBase64UrlReverseIndexTable = { // Validate the character in charset with ReverseIndex table constexpr bool checkForwardIndex( - uint8_t idx, + uint8_t index, const Base64::Charset& charset, const Base64::ReverseIndex& reverseIndex) { - return (reverseIndex[static_cast(charset[idx])] == idx) && - (idx > 0 ? checkForwardIndex(idx - 1, charset, reverseIndex) : true); + return (reverseIndex[static_cast(charset[index])] == index) && + (index > 0 ? checkForwardIndex(index - 1, charset, reverseIndex) : true); } // Verify that for every entry in kBase64Charset, the corresponding entry @@ -117,22 +117,23 @@ static_assert( // Searches for a character within a charset up to a certain index. constexpr bool findCharacterInCharset( const Base64::Charset& charset, - uint8_t idx, - const char c) { - return idx < charset.size() && - ((charset[idx] == c) || findCharacterInCharset(charset, idx + 1, c)); + uint8_t index, + const char targetChar) { + return index < charset.size() && + ((charset[index] == targetChar) || + findCharacterInCharset(charset, index + 1, targetChar)); } // Checks the consistency of a reverse index mapping for a given character // set. constexpr bool checkReverseIndex( - uint8_t idx, + uint8_t index, const Base64::Charset& charset, const Base64::ReverseIndex& reverseIndex) { - return (reverseIndex[idx] == 255 - ? 
!findCharacterInCharset(charset, 0, static_cast(idx)) - : (charset[reverseIndex[idx]] == idx)) && - (idx > 0 ? checkReverseIndex(idx - 1, charset, reverseIndex) : true); + return (reverseIndex[index] == 255 + ? !findCharacterInCharset(charset, 0, static_cast(index)) + : (charset[reverseIndex[index]] == index)) && + (index > 0 ? checkReverseIndex(index - 1, charset, reverseIndex) : true); } // Verify that for every entry in kBase64ReverseIndexTable, the corresponding @@ -158,86 +159,88 @@ static_assert( // Implementation of Base64 encoding and decoding functions. template /* static */ std::string Base64::encodeImpl( - const T& data, + const T& input, const Base64::Charset& charset, - bool include_pad) { - size_t outlen = calculateEncodedSize(data.size(), include_pad); - std::string out; - out.resize(outlen); - encodeImpl(data, charset, include_pad, out.data()); - return out; + bool includePadding) { + size_t encodedSize = calculateEncodedSize(input.size(), includePadding); + std::string encodedResult; + encodedResult.resize(encodedSize); + encodeImpl(input, charset, includePadding, encodedResult.data()); + return encodedResult; } // static -size_t Base64::calculateEncodedSize(size_t size, bool withPadding) { - if (size == 0) { +size_t Base64::calculateEncodedSize(size_t inputSize, bool withPadding) { + if (inputSize == 0) { return 0; } // Calculate the output size assuming that we are including padding. - size_t encodedSize = ((size + 2) / 3) * 4; + size_t encodedSize = ((inputSize + 2) / 3) * 4; if (!withPadding) { // If the padding was not requested, subtract the padding bytes. 
- encodedSize -= (3 - (size % 3)) % 3; + encodedSize -= (3 - (inputSize % 3)) % 3; } return encodedSize; } // static -void Base64::encode(const char* data, size_t len, char* output) { - encodeImpl(folly::StringPiece(data, len), kBase64Charset, true, output); +void Base64::encode(const char* input, size_t inputSize, char* output) { + encodeImpl( + folly::StringPiece(input, inputSize), kBase64Charset, true, output); } // static -void Base64::encodeUrl(const char* data, size_t len, char* output) { - encodeImpl(folly::StringPiece(data, len), kBase64UrlCharset, true, output); +void Base64::encodeUrl(const char* input, size_t inputSize, char* output) { + encodeImpl( + folly::StringPiece(input, inputSize), kBase64UrlCharset, true, output); } template /* static */ void Base64::encodeImpl( - const T& data, + const T& input, const Base64::Charset& charset, - bool include_pad, - char* out) { - auto len = data.size(); - if (len == 0) { + bool includePadding, + char* outputBuffer) { + auto inputSize = input.size(); + if (inputSize == 0) { return; } - auto wp = out; - auto it = data.begin(); + auto outputPointer = outputBuffer; + auto inputIterator = input.begin(); // For each group of 3 bytes (24 bits) in the input, split that into // 4 groups of 6 bits and encode that using the supplied charset lookup - for (; len > 2; len -= 3) { - uint32_t curr = uint8_t(*it++) << 16; - curr |= uint8_t(*it++) << 8; - curr |= uint8_t(*it++); - - *wp++ = charset[(curr >> 18) & 0x3f]; - *wp++ = charset[(curr >> 12) & 0x3f]; - *wp++ = charset[(curr >> 6) & 0x3f]; - *wp++ = charset[curr & 0x3f]; + for (; inputSize > 2; inputSize -= 3) { + uint32_t inputBlock = uint8_t(*inputIterator++) << 16; + inputBlock |= uint8_t(*inputIterator++) << 8; + inputBlock |= uint8_t(*inputIterator++); + + *outputPointer++ = charset[(inputBlock >> 18) & 0x3f]; + *outputPointer++ = charset[(inputBlock >> 12) & 0x3f]; + *outputPointer++ = charset[(inputBlock >> 6) & 0x3f]; + *outputPointer++ = charset[inputBlock & 
0x3f]; } - if (len > 0) { + if (inputSize > 0) { // We have either 1 or 2 input bytes left. Encode this similar to the // above (assuming 0 for all other bytes). Optionally append the '=' // character if it is requested. - uint32_t curr = uint8_t(*it++) << 16; - *wp++ = charset[(curr >> 18) & 0x3f]; - if (len > 1) { - curr |= uint8_t(*it) << 8; - *wp++ = charset[(curr >> 12) & 0x3f]; - *wp++ = charset[(curr >> 6) & 0x3f]; - if (include_pad) { - *wp = kPadding; + uint32_t inputBlock = uint8_t(*inputIterator++) << 16; + *outputPointer++ = charset[(inputBlock >> 18) & 0x3f]; + if (inputSize > 1) { + inputBlock |= uint8_t(*inputIterator) << 8; + *outputPointer++ = charset[(inputBlock >> 12) & 0x3f]; + *outputPointer++ = charset[(inputBlock >> 6) & 0x3f]; + if (includePadding) { + *outputPointer = kPadding; } } else { - *wp++ = charset[(curr >> 12) & 0x3f]; - if (include_pad) { - *wp++ = kPadding; - *wp = kPadding; + *outputPointer++ = charset[(inputBlock >> 12) & 0x3f]; + if (includePadding) { + *outputPointer++ = kPadding; + *outputPointer = kPadding; } } } @@ -249,191 +252,198 @@ std::string Base64::encode(folly::StringPiece text) { } // static -std::string Base64::encode(const char* data, size_t len) { - return encode(folly::StringPiece(data, len)); +std::string Base64::encode(const char* input, size_t inputSize) { + return encode(folly::StringPiece(input, inputSize)); } namespace { /** - * this is a quick and dirty iterator implementation for an IOBuf so that the - * template that uses iterators can work on IOBuf chains. It only implements - * postfix increment because that is all the algorithm needs, and it is a noop - * since the read<>() function already incremented the cursor. + * This is a quick and simple iterator implementation for an IOBuf so that the + * template that uses iterators can work on IOBuf chains. 
It only implements + * postfix increment because that is all the algorithm needs, and it is a no-op + * since the read<>() function already increments the cursor. */ class IOBufWrapper { private: class Iterator { public: - explicit Iterator(const folly::IOBuf* data) : cs_(data) {} + explicit Iterator(const folly::IOBuf* inputBuffer) : cursor_(inputBuffer) {} Iterator& operator++(int32_t) { - // This is a noop since reading from the Cursor has already moved the - // position + // This is a no-op since reading from the Cursor has already moved the + // position. return *this; } uint8_t operator*() { - // This will read _and_ increment - return cs_.read(); + // This will read _and_ increment the cursor. + return cursor_.read(); } private: - folly::io::Cursor cs_; + folly::io::Cursor cursor_; }; public: - explicit IOBufWrapper(const folly::IOBuf* data) : data_(data) {} - + explicit IOBufWrapper(const folly::IOBuf* inputBuffer) + : input_(inputBuffer) {} size_t size() const { - return data_->computeChainDataLength(); + return input_->computeChainDataLength(); } Iterator begin() const { - return Iterator(data_); + return Iterator(input_); } private: - const folly::IOBuf* data_; + const folly::IOBuf* input_; }; } // namespace // static -std::string Base64::encode(const folly::IOBuf* data) { - return encodeImpl(IOBufWrapper(data), kBase64Charset, true); +std::string Base64::encode(const folly::IOBuf* inputBuffer) { + return encodeImpl(IOBufWrapper(inputBuffer), kBase64Charset, true); } // static -std::string Base64::decode(folly::StringPiece encoded) { - std::string output; - Base64::decode(std::make_pair(encoded.data(), encoded.size()), output); - return output; +std::string Base64::decode(folly::StringPiece encodedText) { + std::string decodedResult; + Base64::decode( + std::make_pair(encodedText.data(), encodedText.size()), decodedResult); + return decodedResult; } // static void Base64::decode( const std::pair& payload, - std::string& output) { + std::string& 
decodedOutput) { size_t inputSize = payload.second; - output.resize(calculateDecodedSize(payload.first, inputSize)); - decode(payload.first, inputSize, output.data(), output.size()); + decodedOutput.resize(calculateDecodedSize(payload.first, inputSize)); + decode(payload.first, inputSize, decodedOutput.data(), decodedOutput.size()); } // static -void Base64::decode(const char* data, size_t size, char* output) { - size_t out_len = size / 4 * 3; - Base64::decode(data, size, output, out_len); +void Base64::decode(const char* input, size_t size, char* output) { + size_t expectedOutputSize = size / 4 * 3; + Base64::decode(input, size, output, expectedOutputSize); } // static uint8_t Base64::base64ReverseLookup( - char p, + char encodedChar, const Base64::ReverseIndex& reverseIndex) { - auto curr = reverseIndex[(uint8_t)p]; - if (curr >= 0x40) { + auto reverseLookupValue = reverseIndex[(uint8_t)encodedChar]; + if (reverseLookupValue >= 0x40) { VELOX_USER_FAIL("decode() - invalid input string: invalid characters"); } - return curr; + return reverseLookupValue; } // static -size_t -Base64::decode(const char* src, size_t src_len, char* dst, size_t dst_len) { - return decodeImpl(src, src_len, dst, dst_len, kBase64ReverseIndexTable); +size_t Base64::decode( + const char* input, + size_t inputSize, + char* output, + size_t outputSize) { + return decodeImpl( + input, inputSize, output, outputSize, kBase64ReverseIndexTable); } // static -size_t Base64::calculateDecodedSize(const char* data, size_t& size) { - if (size == 0) { +size_t Base64::calculateDecodedSize(const char* input, size_t& inputSize) { + if (inputSize == 0) { return 0; } - // Check if the input data is padded - if (isPadded(data, size)) { + // Check if the input string is padded + if (isPadded(input, inputSize)) { // If padded, ensure that the string length is a multiple of the encoded // block size - if (size % kEncodedBlockByteSize != 0) { + if (inputSize % kEncodedBlockByteSize != 0) { VELOX_USER_FAIL( 
"Base64::decode() - invalid input string: " "string length is not a multiple of 4."); } - auto needed = (size * kBinaryBlockByteSize) / kEncodedBlockByteSize; - auto padding = numPadding(data, size); - size -= padding; + auto decodedSize = + (inputSize * kBinaryBlockByteSize) / kEncodedBlockByteSize; + auto paddingCount = numPadding(input, inputSize); + inputSize -= paddingCount; // Adjust the needed size by deducting the bytes corresponding to the // padding from the calculated size. - return needed - - ((padding * kBinaryBlockByteSize) + (kEncodedBlockByteSize - 1)) / + return decodedSize - + ((paddingCount * kBinaryBlockByteSize) + (kEncodedBlockByteSize - 1)) / kEncodedBlockByteSize; } // If not padded, Calculate extra bytes, if any - auto extra = size % kEncodedBlockByteSize; - auto needed = (size / kEncodedBlockByteSize) * kBinaryBlockByteSize; + auto extraBytes = inputSize % kEncodedBlockByteSize; + auto decodedSize = (inputSize / kEncodedBlockByteSize) * kBinaryBlockByteSize; // Adjust the needed size for extra bytes, if present - if (extra) { - if (extra == 1) { + if (extraBytes) { + if (extraBytes == 1) { VELOX_USER_FAIL( "Base64::decode() - invalid input string: " "string length cannot be 1 more than a multiple of 4."); } - needed += (extra * kBinaryBlockByteSize) / kEncodedBlockByteSize; + decodedSize += (extraBytes * kBinaryBlockByteSize) / kEncodedBlockByteSize; } - return needed; + return decodedSize; } // static size_t Base64::decodeImpl( - const char* src, - size_t src_len, - char* dst, - size_t dst_len, + const char* input, + size_t inputSize, + char* outputBuffer, + size_t outputSize, const Base64::ReverseIndex& reverseIndex) { - if (!src_len) { + if (!inputSize) { return 0; } - auto needed = calculateDecodedSize(src, src_len); - if (dst_len < needed) { + auto decodedSize = calculateDecodedSize(input, inputSize); + if (outputSize < decodedSize) { VELOX_USER_FAIL( "Base64::decode() - invalid output string: " "output string is too small."); } // 
Handle full groups of 4 characters - for (; src_len > 4; src_len -= 4, src += 4, dst += 3) { - // Each character of the 4 encode 6 bits of the original, grab each with + for (; inputSize > 4; inputSize -= 4, input += 4, outputBuffer += 3) { + // Each character of the 4 encodes 6 bits of the original, grab each with // the appropriate shifts to rebuild the original and then split that back - // into the original 8 bit bytes. - uint32_t last = (base64ReverseLookup(src[0], reverseIndex) << 18) | - (base64ReverseLookup(src[1], reverseIndex) << 12) | - (base64ReverseLookup(src[2], reverseIndex) << 6) | - base64ReverseLookup(src[3], reverseIndex); - dst[0] = (last >> 16) & 0xff; - dst[1] = (last >> 8) & 0xff; - dst[2] = last & 0xff; + // into the original 8-bit bytes. + uint32_t decodedBlock = + (base64ReverseLookup(input[0], reverseIndex) << 18) | + (base64ReverseLookup(input[1], reverseIndex) << 12) | + (base64ReverseLookup(input[2], reverseIndex) << 6) | + base64ReverseLookup(input[3], reverseIndex); + outputBuffer[0] = (decodedBlock >> 16) & 0xff; + outputBuffer[1] = (decodedBlock >> 8) & 0xff; + outputBuffer[2] = decodedBlock & 0xff; } - // Handle the last 2-4 characters. This is similar to the above, but the + // Handle the last 2-4 characters. This is similar to the above, but the // last 2 characters may or may not exist. 
- DCHECK(src_len >= 2); - uint32_t last = (base64ReverseLookup(src[0], reverseIndex) << 18) | - (base64ReverseLookup(src[1], reverseIndex) << 12); - dst[0] = (last >> 16) & 0xff; - if (src_len > 2) { - last |= base64ReverseLookup(src[2], reverseIndex) << 6; - dst[1] = (last >> 8) & 0xff; - if (src_len > 3) { - last |= base64ReverseLookup(src[3], reverseIndex); - dst[2] = last & 0xff; + DCHECK(inputSize >= 2); + uint32_t decodedBlock = (base64ReverseLookup(input[0], reverseIndex) << 18) | + (base64ReverseLookup(input[1], reverseIndex) << 12); + outputBuffer[0] = (decodedBlock >> 16) & 0xff; + if (inputSize > 2) { + decodedBlock |= base64ReverseLookup(input[2], reverseIndex) << 6; + outputBuffer[1] = (decodedBlock >> 8) & 0xff; + if (inputSize > 3) { + decodedBlock |= base64ReverseLookup(input[3], reverseIndex); + outputBuffer[2] = decodedBlock & 0xff; } } - return needed; + return decodedSize; } // static @@ -442,44 +452,46 @@ std::string Base64::encodeUrl(folly::StringPiece text) { } // static -std::string Base64::encodeUrl(const char* data, size_t len) { - return encodeUrl(folly::StringPiece(data, len)); +std::string Base64::encodeUrl(const char* input, size_t inputSize) { + return encodeUrl(folly::StringPiece(input, inputSize)); } // static -std::string Base64::encodeUrl(const folly::IOBuf* data) { - return encodeImpl(IOBufWrapper(data), kBase64UrlCharset, false); +std::string Base64::encodeUrl(const folly::IOBuf* inputBuffer) { + return encodeImpl(IOBufWrapper(inputBuffer), kBase64UrlCharset, false); } // static void Base64::decodeUrl( - const char* src, - size_t src_len, - char* dst, - size_t dst_len) { - decodeImpl(src, src_len, dst, dst_len, kBase64UrlReverseIndexTable); + const char* input, + size_t inputSize, + char* outputBuffer, + size_t outputSize) { + decodeImpl( + input, inputSize, outputBuffer, outputSize, kBase64UrlReverseIndexTable); } // static -std::string Base64::decodeUrl(folly::StringPiece encoded) { - std::string output; - 
Base64::decodeUrl(std::make_pair(encoded.data(), encoded.size()), output); - return output; +std::string Base64::decodeUrl(folly::StringPiece encodedText) { + std::string decodedOutput; + Base64::decodeUrl( + std::make_pair(encodedText.data(), encodedText.size()), decodedOutput); + return decodedOutput; } // static void Base64::decodeUrl( const std::pair& payload, - std::string& output) { - size_t out_len = (payload.second + 3) / 4 * 3; - output.resize(out_len, '\0'); - out_len = Base64::decodeImpl( + std::string& decodedOutput) { + size_t decodedSize = (payload.second + 3) / 4 * 3; + decodedOutput.resize(decodedSize, '\0'); + decodedSize = Base64::decodeImpl( payload.first, payload.second, - &output[0], - out_len, + &decodedOutput[0], + decodedSize, kBase64UrlReverseIndexTable); - output.resize(out_len); + decodedOutput.resize(decodedSize); } } // namespace facebook::velox::encoding diff --git a/velox/common/encode/Base64.h b/velox/common/encode/Base64.h index 13004175379a..a0f35f6a2e45 100644 --- a/velox/common/encode/Base64.h +++ b/velox/common/encode/Base64.h @@ -13,16 +13,13 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + #pragma once #include -#include -#include #include - #include #include - #include "velox/common/base/GTestMacros.h" namespace facebook::velox::encoding { @@ -32,125 +29,100 @@ class Base64 { static const size_t kCharsetSize = 64; static const size_t kReverseIndexSize = 256; - /// Character set used for encoding purposes. - /// Contains specific characters that form the encoding scheme. + /// Character set used for Base64 encoding. using Charset = std::array; - /// Reverse lookup table for decoding purposes. - /// Maps each possible encoded character to its corresponding numeric value - /// within the encoding base. + /// Reverse lookup table for decoding. using ReverseIndex = std::array; /// Padding character used in encoding. 
static const char kPadding = '='; - /// Encodes the specified number of characters from the 'data'. - static std::string encode(const char* data, size_t len); - - /// Encodes the specified text. + // Encoding Functions + /// Encodes the input data using Base64 encoding. + static std::string encode(const char* input, size_t inputSize); static std::string encode(folly::StringPiece text); + static std::string encode(const folly::IOBuf* inputBuffer); + static void encode(const char* input, size_t inputSize, char* outputBuffer); - /// Encodes the specified IOBuf data. - static std::string encode(const folly::IOBuf* text); - - /// Returns encoded size for the input of the specified size. - static size_t calculateEncodedSize(size_t size, bool withPadding = true); - - /// Encodes the specified number of characters from the 'data' and writes the - /// result to the 'output'. The output must have enough space, e.g. as - /// returned by the calculateEncodedSize(). - static void encode(const char* data, size_t size, char* output); - - /// Decodes the specified encoded text. - static std::string decode(folly::StringPiece encoded); - - /// Returns the actual size of the decoded data. Will also remove the padding - /// length from the input data 'size'. - static size_t calculateDecodedSize(const char* data, size_t& size); - - /// Decodes the specified number of characters from the 'data' and writes the - /// result to the 'output'. The output must have enough space, e.g. as - /// returned by the calculateDecodedSize(). - static void decode(const char* data, size_t size, char* output); + /// Encodes the input data using Base64 URL encoding. + static std::string encodeUrl(const char* input, size_t inputSize); + static std::string encodeUrl(folly::StringPiece text); + static std::string encodeUrl(const folly::IOBuf* inputBuffer); + static void + encodeUrl(const char* input, size_t inputSize, char* outputBuffer); + // Decoding Functions + /// Decodes the input Base64 encoded string. 
+ static std::string decode(folly::StringPiece encodedText); static void decode( const std::pair& payload, std::string& output); - - /// Encodes the specified number of characters from the 'data' and writes the - /// result to the 'output' using URL encoding. The output must have enough - /// space as returned by the calculateEncodedSize(). - static void encodeUrl(const char* data, size_t size, char* output); - - /// Encodes the specified number of characters from the 'data' using URL - /// encoding. - static std::string encodeUrl(const char* data, size_t len); - - /// Encodes the specified IOBuf data using URL encoding. - static std::string encodeUrl(const folly::IOBuf* data); - - /// Encodes the specified text using URL encoding. - static std::string encodeUrl(folly::StringPiece text); - - /// Decodes the specified URL encoded payload and writes the result to the - /// 'output'. + static void decode(const char* input, size_t inputSize, char* outputBuffer); + static size_t decode( + const char* input, + size_t inputSize, + char* outputBuffer, + size_t outputSize); + + /// Decodes the input Base64 URL encoded string. + static std::string decodeUrl(folly::StringPiece encodedText); static void decodeUrl( const std::pair& payload, std::string& output); + static void decodeUrl( + const char* input, + size_t inputSize, + char* outputBuffer, + size_t outputSize); - /// Decodes the specified URL encoded text. - static std::string decodeUrl(folly::StringPiece text); - - /// Decodes the specified number of characters from the 'src' and writes the - /// result to the 'dst'. - static size_t - decode(const char* src, size_t src_len, char* dst, size_t dst_len); + // Helper Functions + /// Calculates the encoded size based on input size. + static size_t calculateEncodedSize(size_t inputSize, bool withPadding = true); - /// Decodes the specified number of characters from the 'src' using URL - /// encoding and writes the result to the 'dst'. 
- static void - decodeUrl(const char* src, size_t src_len, char* dst, size_t dst_len); + /// Calculates the decoded size based on encoded input and adjusts the input + /// size for padding. + static size_t calculateDecodedSize(const char* input, size_t& inputSize); private: - /// Checks if there is padding in encoded data. - static inline bool isPadded(const char* data, size_t len) { - return (len > 0 && data[len - 1] == kPadding); + // Checks if the input Base64 string is padded. + static inline bool isPadded(const char* input, size_t inputSize) { + return (inputSize > 0 && input[inputSize - 1] == kPadding); } - /// Counts the number of padding characters in encoded data. - static inline size_t numPadding(const char* src, size_t len) { + // Counts the number of padding characters in encoded input. + static inline size_t numPadding(const char* input, size_t inputSize) { size_t numPadding{0}; - while (len > 0 && src[len - 1] == kPadding) { + while (inputSize > 0 && input[inputSize - 1] == kPadding) { numPadding++; - len--; + inputSize--; } return numPadding; } - /// Performs a reverse lookup in the reverse index to retrieve the original - /// index of a character in the base. - static uint8_t base64ReverseLookup(char p, const ReverseIndex& reverseIndex); + // Reverse lookup helper function to get the original index of a Base64 + // character. + static uint8_t base64ReverseLookup( + char encodedChar, + const ReverseIndex& reverseIndex); - /// Encodes the specified data using the provided charset. template static std::string - encodeImpl(const T& data, const Charset& charset, bool include_pad); + encodeImpl(const T& input, const Charset& charset, bool includePadding); - /// Encodes the specified data using the provided charset. 
template static void encodeImpl( - const T& data, + const T& input, const Charset& charset, - bool include_pad, - char* out); + bool includePadding, + char* outputBuffer); - /// Decodes the specified data using the provided reverse lookup table. static size_t decodeImpl( - const char* src, - size_t src_len, - char* dst, - size_t dst_len, - const ReverseIndex& table); + const char* input, + size_t inputSize, + char* outputBuffer, + size_t outputSize, + const ReverseIndex& reverseIndex); VELOX_FRIEND_TEST(Base64Test, checksPadding); VELOX_FRIEND_TEST(Base64Test, countsPaddingCorrectly); From 22d47652cf43db92a2c3a761f8527e27b4c826cd Mon Sep 17 00:00:00 2001 From: Joe Abraham Date: Thu, 3 Oct 2024 00:22:57 +0530 Subject: [PATCH 2/7] Refactor Base64 APIs as non-throwing APIs --- velox/common/encode/Base64.cpp | 171 ++++++++++++-------- velox/common/encode/Base64.h | 25 +-- velox/common/encode/tests/Base64Test.cpp | 39 +++-- velox/common/encode/tests/CMakeLists.txt | 2 +- velox/docs/functions/presto/binary.rst | 11 +- velox/functions/prestosql/BinaryFunctions.h | 44 ++--- 6 files changed, 172 insertions(+), 120 deletions(-) diff --git a/velox/common/encode/Base64.cpp b/velox/common/encode/Base64.cpp index 82244c106a4a..742e078379f8 100644 --- a/velox/common/encode/Base64.cpp +++ b/velox/common/encode/Base64.cpp @@ -18,9 +18,7 @@ #include #include #include -#include - -#include "velox/common/base/Exceptions.h" +#include namespace facebook::velox::encoding { @@ -157,27 +155,28 @@ static_assert( // "kBase64UrlReverseIndexTable has incorrect entries."); // Implementation of Base64 encoding and decoding functions. 
+// static template -/* static */ std::string Base64::encodeImpl( +std::string Base64::encodeImpl( const T& input, - const Base64::Charset& charset, + const Charset& charset, bool includePadding) { - size_t encodedSize = calculateEncodedSize(input.size(), includePadding); + const size_t encodedSize{calculateEncodedSize(input.size(), includePadding)}; std::string encodedResult; encodedResult.resize(encodedSize); - encodeImpl(input, charset, includePadding, encodedResult.data()); + (void)encodeImpl(input, charset, includePadding, encodedResult.data()); return encodedResult; } // static -size_t Base64::calculateEncodedSize(size_t inputSize, bool withPadding) { +size_t Base64::calculateEncodedSize(size_t inputSize, bool includePadding) { if (inputSize == 0) { return 0; } // Calculate the output size assuming that we are including padding. size_t encodedSize = ((inputSize + 2) / 3) * 4; - if (!withPadding) { + if (!includePadding) { // If the padding was not requested, subtract the padding bytes. 
encodedSize -= (3 - (inputSize % 3)) % 3; } @@ -185,26 +184,31 @@ size_t Base64::calculateEncodedSize(size_t inputSize, bool withPadding) { } // static -void Base64::encode(const char* input, size_t inputSize, char* output) { - encodeImpl( +Status Base64::encode(const char* input, size_t inputSize, char* output) { + return encodeImpl( folly::StringPiece(input, inputSize), kBase64Charset, true, output); } // static -void Base64::encodeUrl(const char* input, size_t inputSize, char* output) { - encodeImpl( - folly::StringPiece(input, inputSize), kBase64UrlCharset, true, output); +Status +Base64::encodeUrl(const char* input, size_t inputSize, char* outputBuffer) { + return encodeImpl( + folly::StringPiece(input, inputSize), + kBase64UrlCharset, + true, + outputBuffer); } +// static template -/* static */ void Base64::encodeImpl( +Status Base64::encodeImpl( const T& input, const Base64::Charset& charset, bool includePadding, char* outputBuffer) { auto inputSize = input.size(); if (inputSize == 0) { - return; + return Status::OK(); } auto outputPointer = outputBuffer; @@ -213,9 +217,9 @@ template // For each group of 3 bytes (24 bits) in the input, split that into // 4 groups of 6 bits and encode that using the supplied charset lookup for (; inputSize > 2; inputSize -= 3) { - uint32_t inputBlock = uint8_t(*inputIterator++) << 16; - inputBlock |= uint8_t(*inputIterator++) << 8; - inputBlock |= uint8_t(*inputIterator++); + uint32_t inputBlock = static_cast(*inputIterator++) << 16; + inputBlock |= static_cast(*inputIterator++) << 8; + inputBlock |= static_cast(*inputIterator++); *outputPointer++ = charset[(inputBlock >> 18) & 0x3f]; *outputPointer++ = charset[(inputBlock >> 12) & 0x3f]; @@ -227,10 +231,10 @@ template // We have either 1 or 2 input bytes left. Encode this similar to the // above (assuming 0 for all other bytes). Optionally append the '=' // character if it is requested. 
- uint32_t inputBlock = uint8_t(*inputIterator++) << 16; + uint32_t inputBlock = static_cast(*inputIterator++) << 16; *outputPointer++ = charset[(inputBlock >> 18) & 0x3f]; if (inputSize > 1) { - inputBlock |= uint8_t(*inputIterator) << 8; + inputBlock |= static_cast(*inputIterator) << 8; *outputPointer++ = charset[(inputBlock >> 12) & 0x3f]; *outputPointer++ = charset[(inputBlock >> 6) & 0x3f]; if (includePadding) { @@ -244,6 +248,8 @@ template } } } + + return Status::OK(); } // static @@ -320,29 +326,35 @@ void Base64::decode( const std::pair& payload, std::string& decodedOutput) { size_t inputSize = payload.second; - decodedOutput.resize(calculateDecodedSize(payload.first, inputSize)); - decode(payload.first, inputSize, decodedOutput.data(), decodedOutput.size()); + size_t decodedSize; + (void)calculateDecodedSize(payload.first, inputSize, decodedSize); + decodedOutput.resize(decodedSize); + (void)decode( + payload.first, inputSize, decodedOutput.data(), decodedOutput.size()); } // static -void Base64::decode(const char* input, size_t size, char* output) { - size_t expectedOutputSize = size / 4 * 3; - Base64::decode(input, size, output, expectedOutputSize); +void Base64::decode(const char* input, size_t inputSize, char* outputBuffer) { + size_t outputSize; + (void)calculateDecodedSize(input, inputSize, outputSize); + (void)Base64::decode(input, inputSize, outputBuffer, outputSize); } // static uint8_t Base64::base64ReverseLookup( char encodedChar, - const Base64::ReverseIndex& reverseIndex) { - auto reverseLookupValue = reverseIndex[(uint8_t)encodedChar]; + const Base64::ReverseIndex& reverseIndex, + Status& status) { + auto reverseLookupValue = reverseIndex[static_cast(encodedChar)]; if (reverseLookupValue >= 0x40) { - VELOX_USER_FAIL("decode() - invalid input string: invalid characters"); + status = Status::UserError( + "decode() - invalid input string: invalid characters"); } return reverseLookupValue; } // static -size_t Base64::decode( +Status 
Base64::decode( const char* input, size_t inputSize, char* output, @@ -352,9 +364,13 @@ size_t Base64::decode( } // static -size_t Base64::calculateDecodedSize(const char* input, size_t& inputSize) { +Status Base64::calculateDecodedSize( + const char* input, + size_t& inputSize, + size_t& decodedSize) { if (inputSize == 0) { - return 0; + decodedSize = 0; + return Status::OK(); } // Check if the input string is padded @@ -362,88 +378,106 @@ size_t Base64::calculateDecodedSize(const char* input, size_t& inputSize) { // If padded, ensure that the string length is a multiple of the encoded // block size if (inputSize % kEncodedBlockByteSize != 0) { - VELOX_USER_FAIL( + return Status::UserError( "Base64::decode() - invalid input string: " "string length is not a multiple of 4."); } - auto decodedSize = - (inputSize * kBinaryBlockByteSize) / kEncodedBlockByteSize; + decodedSize = (inputSize * kBinaryBlockByteSize) / kEncodedBlockByteSize; auto paddingCount = numPadding(input, inputSize); inputSize -= paddingCount; // Adjust the needed size by deducting the bytes corresponding to the // padding from the calculated size. 
- return decodedSize - + decodedSize -= ((paddingCount * kBinaryBlockByteSize) + (kEncodedBlockByteSize - 1)) / kEncodedBlockByteSize; + return Status::OK(); } // If not padded, Calculate extra bytes, if any auto extraBytes = inputSize % kEncodedBlockByteSize; - auto decodedSize = (inputSize / kEncodedBlockByteSize) * kBinaryBlockByteSize; + decodedSize = (inputSize / kEncodedBlockByteSize) * kBinaryBlockByteSize; // Adjust the needed size for extra bytes, if present if (extraBytes) { if (extraBytes == 1) { - VELOX_USER_FAIL( + return Status::UserError( "Base64::decode() - invalid input string: " "string length cannot be 1 more than a multiple of 4."); } decodedSize += (extraBytes * kBinaryBlockByteSize) / kEncodedBlockByteSize; } - return decodedSize; + return Status::OK(); } // static -size_t Base64::decodeImpl( +Status Base64::decodeImpl( const char* input, size_t inputSize, char* outputBuffer, size_t outputSize, const Base64::ReverseIndex& reverseIndex) { - if (!inputSize) { - return 0; + if (inputSize == 0) { + return Status::OK(); } - auto decodedSize = calculateDecodedSize(input, inputSize); + size_t decodedSize; + auto status = calculateDecodedSize(input, inputSize, decodedSize); + if (!status.ok()) { + return status; + } if (outputSize < decodedSize) { - VELOX_USER_FAIL( - "Base64::decode() - invalid output string: " - "output string is too small."); + return Status::UserError( + "Base64::decode() - invalid output string: output string is too small."); } + Status lookupStatus; // Handle full groups of 4 characters for (; inputSize > 4; inputSize -= 4, input += 4, outputBuffer += 3) { // Each character of the 4 encodes 6 bits of the original, grab each with // the appropriate shifts to rebuild the original and then split that back // into the original 8-bit bytes. 
uint32_t decodedBlock = - (base64ReverseLookup(input[0], reverseIndex) << 18) | - (base64ReverseLookup(input[1], reverseIndex) << 12) | - (base64ReverseLookup(input[2], reverseIndex) << 6) | - base64ReverseLookup(input[3], reverseIndex); - outputBuffer[0] = (decodedBlock >> 16) & 0xff; - outputBuffer[1] = (decodedBlock >> 8) & 0xff; - outputBuffer[2] = decodedBlock & 0xff; + (base64ReverseLookup(input[0], reverseIndex, lookupStatus) << 18) | + (base64ReverseLookup(input[1], reverseIndex, lookupStatus) << 12) | + (base64ReverseLookup(input[2], reverseIndex, lookupStatus) << 6) | + base64ReverseLookup(input[3], reverseIndex, lookupStatus); + if (!lookupStatus.ok()) { + return lookupStatus; + } + outputBuffer[0] = static_cast((decodedBlock >> 16) & 0xff); + outputBuffer[1] = static_cast((decodedBlock >> 8) & 0xff); + outputBuffer[2] = static_cast(decodedBlock & 0xff); } // Handle the last 2-4 characters. This is similar to the above, but the // last 2 characters may or may not exist. DCHECK(inputSize >= 2); - uint32_t decodedBlock = (base64ReverseLookup(input[0], reverseIndex) << 18) | - (base64ReverseLookup(input[1], reverseIndex) << 12); - outputBuffer[0] = (decodedBlock >> 16) & 0xff; + uint32_t decodedBlock = + (base64ReverseLookup(input[0], reverseIndex, lookupStatus) << 18) | + (base64ReverseLookup(input[1], reverseIndex, lookupStatus) << 12); + if (!lookupStatus.ok()) { + return lookupStatus; + } + outputBuffer[0] = static_cast((decodedBlock >> 16) & 0xff); if (inputSize > 2) { - decodedBlock |= base64ReverseLookup(input[2], reverseIndex) << 6; - outputBuffer[1] = (decodedBlock >> 8) & 0xff; + decodedBlock |= base64ReverseLookup(input[2], reverseIndex, lookupStatus) + << 6; + if (!lookupStatus.ok()) { + return lookupStatus; + } + outputBuffer[1] = static_cast((decodedBlock >> 8) & 0xff); if (inputSize > 3) { - decodedBlock |= base64ReverseLookup(input[3], reverseIndex); - outputBuffer[2] = decodedBlock & 0xff; + decodedBlock |= base64ReverseLookup(input[3], 
reverseIndex, lookupStatus); + if (!lookupStatus.ok()) { + return lookupStatus; + } + outputBuffer[2] = static_cast(decodedBlock & 0xff); } } - return decodedSize; + return Status::OK(); } // static @@ -462,12 +496,12 @@ std::string Base64::encodeUrl(const folly::IOBuf* inputBuffer) { } // static -void Base64::decodeUrl( +Status Base64::decodeUrl( const char* input, size_t inputSize, char* outputBuffer, size_t outputSize) { - decodeImpl( + return decodeImpl( input, inputSize, outputBuffer, outputSize, kBase64UrlReverseIndexTable); } @@ -483,15 +517,16 @@ std::string Base64::decodeUrl(folly::StringPiece encodedText) { void Base64::decodeUrl( const std::pair& payload, std::string& decodedOutput) { - size_t decodedSize = (payload.second + 3) / 4 * 3; - decodedOutput.resize(decodedSize, '\0'); - decodedSize = Base64::decodeImpl( + size_t inputSize = payload.second; + size_t decodedSize; + (void)calculateDecodedSize(payload.first, inputSize, decodedSize); + decodedOutput.resize(decodedSize); + (void)Base64::decodeImpl( payload.first, payload.second, &decodedOutput[0], - decodedSize, + decodedOutput.size(), kBase64UrlReverseIndexTable); - decodedOutput.resize(decodedSize); } } // namespace facebook::velox::encoding diff --git a/velox/common/encode/Base64.h b/velox/common/encode/Base64.h index a0f35f6a2e45..e8dd49df1985 100644 --- a/velox/common/encode/Base64.h +++ b/velox/common/encode/Base64.h @@ -16,11 +16,12 @@ #pragma once -#include -#include #include #include +#include +#include #include "velox/common/base/GTestMacros.h" +#include "velox/common/base/Status.h" namespace facebook::velox::encoding { @@ -43,13 +44,13 @@ class Base64 { static std::string encode(const char* input, size_t inputSize); static std::string encode(folly::StringPiece text); static std::string encode(const folly::IOBuf* inputBuffer); - static void encode(const char* input, size_t inputSize, char* outputBuffer); + static Status encode(const char* input, size_t inputSize, char* outputBuffer); /// 
Encodes the input data using Base64 URL encoding. static std::string encodeUrl(const char* input, size_t inputSize); static std::string encodeUrl(folly::StringPiece text); static std::string encodeUrl(const folly::IOBuf* inputBuffer); - static void + static Status encodeUrl(const char* input, size_t inputSize, char* outputBuffer); // Decoding Functions @@ -59,7 +60,7 @@ class Base64 { const std::pair& payload, std::string& output); static void decode(const char* input, size_t inputSize, char* outputBuffer); - static size_t decode( + static Status decode( const char* input, size_t inputSize, char* outputBuffer, @@ -70,7 +71,7 @@ class Base64 { static void decodeUrl( const std::pair& payload, std::string& output); - static void decodeUrl( + static Status decodeUrl( const char* input, size_t inputSize, char* outputBuffer, @@ -82,7 +83,10 @@ class Base64 { /// Calculates the decoded size based on encoded input and adjusts the input /// size for padding. - static size_t calculateDecodedSize(const char* input, size_t& inputSize); + static Status calculateDecodedSize( + const char* input, + size_t& inputSize, + size_t& decodedSize); private: // Checks if the input Base64 string is padded. @@ -104,20 +108,21 @@ class Base64 { // character. 
static uint8_t base64ReverseLookup( char encodedChar, - const ReverseIndex& reverseIndex); + const ReverseIndex& reverseIndex, + Status& status); template static std::string encodeImpl(const T& input, const Charset& charset, bool includePadding); template - static void encodeImpl( + static Status encodeImpl( const T& input, const Charset& charset, bool includePadding, char* outputBuffer); - static size_t decodeImpl( + static Status decodeImpl( const char* input, size_t inputSize, char* outputBuffer, diff --git a/velox/common/encode/tests/Base64Test.cpp b/velox/common/encode/tests/Base64Test.cpp index 9cbbbad47124..ecfbf20a09f2 100644 --- a/velox/common/encode/tests/Base64Test.cpp +++ b/velox/common/encode/tests/Base64Test.cpp @@ -50,43 +50,48 @@ TEST_F(Base64Test, fromBase64) { TEST_F(Base64Test, calculateDecodedSizeProperSize) { size_t encoded_size{0}; + size_t decoded_size{0}; encoded_size = 20; - EXPECT_EQ( - 13, Base64::calculateDecodedSize("SGVsbG8sIFdvcmxkIQ==", encoded_size)); + Base64::calculateDecodedSize( + "SGVsbG8sIFdvcmxkIQ==", encoded_size, decoded_size); EXPECT_EQ(18, encoded_size); + EXPECT_EQ(13, decoded_size); encoded_size = 18; - EXPECT_EQ( - 13, Base64::calculateDecodedSize("SGVsbG8sIFdvcmxkIQ", encoded_size)); + Base64::calculateDecodedSize( + "SGVsbG8sIFdvcmxkIQ", encoded_size, decoded_size); EXPECT_EQ(18, encoded_size); + EXPECT_EQ(13, decoded_size); encoded_size = 21; - VELOX_ASSERT_THROW( - Base64::calculateDecodedSize("SGVsbG8sIFdvcmxkIQ==", encoded_size), - "Base64::decode() - invalid input string: string length cannot be 1 more than a multiple of 4."); - - encoded_size = 32; EXPECT_EQ( - 23, + Status::UserError( + "Base64::decode() - invalid input string: string length is not a multiple of 4."), Base64::calculateDecodedSize( - "QmFzZTY0IGVuY29kaW5nIGlzIGZ1bi4=", encoded_size)); + "SGVsbG8sIFdvcmxkIQ===", encoded_size, decoded_size)); + + encoded_size = 32; + Base64::calculateDecodedSize( + "QmFzZTY0IGVuY29kaW5nIGlzIGZ1bi4=", 
encoded_size, decoded_size); EXPECT_EQ(31, encoded_size); + EXPECT_EQ(23, decoded_size); encoded_size = 31; - EXPECT_EQ( - 23, - Base64::calculateDecodedSize( - "QmFzZTY0IGVuY29kaW5nIGlzIGZ1bi4", encoded_size)); + Base64::calculateDecodedSize( + "QmFzZTY0IGVuY29kaW5nIGlzIGZ1bi4", encoded_size, decoded_size); EXPECT_EQ(31, encoded_size); + EXPECT_EQ(23, decoded_size); encoded_size = 16; - EXPECT_EQ(10, Base64::calculateDecodedSize("MTIzNDU2Nzg5MA==", encoded_size)); + Base64::calculateDecodedSize("MTIzNDU2Nzg5MA==", encoded_size, decoded_size); EXPECT_EQ(14, encoded_size); + EXPECT_EQ(10, decoded_size); encoded_size = 14; - EXPECT_EQ(10, Base64::calculateDecodedSize("MTIzNDU2Nzg5MA", encoded_size)); + Base64::calculateDecodedSize("MTIzNDU2Nzg5MA", encoded_size, decoded_size); EXPECT_EQ(14, encoded_size); + EXPECT_EQ(10, decoded_size); } TEST_F(Base64Test, checksPadding) { diff --git a/velox/common/encode/tests/CMakeLists.txt b/velox/common/encode/tests/CMakeLists.txt index 90c9733ecf22..63f718c24745 100644 --- a/velox/common/encode/tests/CMakeLists.txt +++ b/velox/common/encode/tests/CMakeLists.txt @@ -17,4 +17,4 @@ add_test(velox_common_encode_test velox_common_encode_test) target_link_libraries( velox_common_encode_test PUBLIC Folly::folly - PRIVATE velox_encode velox_exception GTest::gtest GTest::gtest_main) + PRIVATE velox_encode velox_status GTest::gtest GTest::gtest_main) diff --git a/velox/docs/functions/presto/binary.rst b/velox/docs/functions/presto/binary.rst index 8b4ddc26832e..07deb3e4b0e9 100644 --- a/velox/docs/functions/presto/binary.rst +++ b/velox/docs/functions/presto/binary.rst @@ -8,26 +8,25 @@ Binary Functions .. function:: from_base64(string) -> varbinary - Decodes a Base64-encoded ``string`` back into its original binary form. - This function is capable of handling both fully padded and non-padded Base64 encoded strings. - Partially padded Base64 strings are not supported and will result in an error. 
+ Decodes a Base64-encoded ``string`` back into its original binary form. + This function is capable of handling both fully padded and non-padded Base64 encoded strings. + Partially padded Base64 strings are not supported and will result in a "UserError" status being returned. Examples -------- Query with padded Base64 string: :: SELECT from_base64('SGVsbG8gV29ybGQ='); -- [72, 101, 108, 108, 111, 32, 87, 111, 114, 108, 100] - Query with non-padded Base64 string: :: SELECT from_base64('SGVsbG8gV29ybGQ'); -- [72, 101, 108, 108, 111, 32, 87, 111, 114, 108, 100] Query with partial-padded Base64 string: :: - SELECT from_base64('SGVsbG8gV29ybGQgZm9yIHZlbG94IQ='); -- Error : Base64::decode() - invalid input string: string length is not a multiple of 4. + SELECT from_base64('SGVsbG8gV29ybGQgZm9yIHZlbG94IQ='); -- UserError: Base64::decode() - invalid input string: string length is not a multiple of 4. In the above examples, both the fully padded and non-padded Base64 strings ('SGVsbG8gV29ybGQ=' and 'SGVsbG8gV29ybGQ') decode to the binary representation of the text 'Hello World'. - While, partial-padded Base64 string 'SGVsbG8gV29ybGQgZm9yIHZlbG94IQ=' will lead to an velox error. + A partial-padded Base64 string 'SGVsbG8gV29ybGQgZm9yIHZlbG94IQ=' will result in a "UserError" status indicating the Base64 string is invalid. .. 
function:: from_base64url(string) -> varbinary diff --git a/velox/functions/prestosql/BinaryFunctions.h b/velox/functions/prestosql/BinaryFunctions.h index ce153ee349fc..ea151078a26f 100644 --- a/velox/functions/prestosql/BinaryFunctions.h +++ b/velox/functions/prestosql/BinaryFunctions.h @@ -278,11 +278,10 @@ template struct ToBase64Function { VELOX_DEFINE_FUNCTION_TYPES(T); - FOLLY_ALWAYS_INLINE void call( - out_type& result, - const arg_type& input) { + FOLLY_ALWAYS_INLINE Status + call(out_type& result, const arg_type& input) { result.resize(encoding::Base64::calculateEncodedSize(input.size())); - encoding::Base64::encode(input.data(), input.size(), result.data()); + return encoding::Base64::encode(input.data(), input.size(), result.data()); } }; @@ -293,11 +292,16 @@ struct FromBase64Function { // T can be either arg_type or arg_type. These are the // same, but hard-coding one of them might be confusing. template - FOLLY_ALWAYS_INLINE void call(out_type& result, const T& input) { + FOLLY_ALWAYS_INLINE Status call(out_type& result, const T& input) { auto inputSize = input.size(); - result.resize( - encoding::Base64::calculateDecodedSize(input.data(), inputSize)); - encoding::Base64::decode( + size_t decodedSize; + auto status = encoding::Base64::calculateDecodedSize( + input.data(), inputSize, decodedSize); + if (!status.ok()) { + return status; + } + result.resize(decodedSize); + return encoding::Base64::decode( input.data(), inputSize, result.data(), result.size()); } }; @@ -305,13 +309,17 @@ struct FromBase64Function { template struct FromBase64UrlFunction { VELOX_DEFINE_FUNCTION_TYPES(T); - FOLLY_ALWAYS_INLINE void call( - out_type& result, - const arg_type& input) { + FOLLY_ALWAYS_INLINE Status + call(out_type& result, const arg_type& input) { auto inputSize = input.size(); - result.resize( - encoding::Base64::calculateDecodedSize(input.data(), inputSize)); - encoding::Base64::decodeUrl( + size_t decodedSize; + auto status = 
encoding::Base64::calculateDecodedSize( + input.data(), inputSize, decodedSize); + if (!status.ok()) { + return status; + } + result.resize(decodedSize); + return encoding::Base64::decodeUrl( input.data(), inputSize, result.data(), result.size()); } }; @@ -320,11 +328,11 @@ template struct ToBase64UrlFunction { VELOX_DEFINE_FUNCTION_TYPES(T); - FOLLY_ALWAYS_INLINE void call( - out_type& result, - const arg_type& input) { + FOLLY_ALWAYS_INLINE Status + call(out_type& result, const arg_type& input) { result.resize(encoding::Base64::calculateEncodedSize(input.size())); - encoding::Base64::encodeUrl(input.data(), input.size(), result.data()); + return encoding::Base64::encodeUrl( + input.data(), input.size(), result.data()); } }; From 87da0b07c4166e1f0cdcc3355a329e0a143c0f5f Mon Sep 17 00:00:00 2001 From: Joe Abraham Date: Fri, 4 Oct 2024 08:47:52 +0530 Subject: [PATCH 3/7] Refactor Base64 Decode API --- velox/common/encode/Base64.cpp | 157 +++++++++----------- velox/common/encode/Base64.h | 43 +++--- velox/common/encode/tests/Base64Test.cpp | 126 +++++++--------- velox/functions/prestosql/BinaryFunctions.h | 26 ++-- 4 files changed, 156 insertions(+), 196 deletions(-) diff --git a/velox/common/encode/Base64.cpp b/velox/common/encode/Base64.cpp index 742e078379f8..5cd6491a5123 100644 --- a/velox/common/encode/Base64.cpp +++ b/velox/common/encode/Base64.cpp @@ -315,35 +315,32 @@ std::string Base64::encode(const folly::IOBuf* inputBuffer) { // static std::string Base64::decode(folly::StringPiece encodedText) { - std::string decodedResult; - Base64::decode( - std::make_pair(encodedText.data(), encodedText.size()), decodedResult); - return decodedResult; + std::string decodedOutput; + std::string_view input(encodedText.data(), encodedText.size()); + (void)decodeImpl(input, decodedOutput, kBase64ReverseIndexTable); + return decodedOutput; } // static void Base64::decode( const std::pair& payload, std::string& decodedOutput) { - size_t inputSize = payload.second; - size_t 
decodedSize; - (void)calculateDecodedSize(payload.first, inputSize, decodedSize); - decodedOutput.resize(decodedSize); - (void)decode( - payload.first, inputSize, decodedOutput.data(), decodedOutput.size()); + std::string_view input(payload.first, payload.second); + (void)decodeImpl(input, decodedOutput, kBase64ReverseIndexTable); } // static void Base64::decode(const char* input, size_t inputSize, char* outputBuffer) { - size_t outputSize; - (void)calculateDecodedSize(input, inputSize, outputSize); - (void)Base64::decode(input, inputSize, outputBuffer, outputSize); + std::string_view inputView(input, inputSize); + std::string output; + (void)decodeImpl(inputView, output, kBase64ReverseIndexTable); + memcpy(outputBuffer, output.data(), output.size()); } // static uint8_t Base64::base64ReverseLookup( char encodedChar, - const Base64::ReverseIndex& reverseIndex, + const ReverseIndex& reverseIndex, Status& status) { auto reverseLookupValue = reverseIndex[static_cast(encodedChar)]; if (reverseLookupValue >= 0x40) { @@ -354,18 +351,13 @@ uint8_t Base64::base64ReverseLookup( } // static -Status Base64::decode( - const char* input, - size_t inputSize, - char* output, - size_t outputSize) { - return decodeImpl( - input, inputSize, output, outputSize, kBase64ReverseIndexTable); +Status Base64::decode(std::string_view input, std::string& output) { + return decodeImpl(input, output, kBase64ReverseIndexTable); } // static Status Base64::calculateDecodedSize( - const char* input, + std::string_view input, size_t& inputSize, size_t& decodedSize) { if (inputSize == 0) { @@ -374,7 +366,7 @@ Status Base64::calculateDecodedSize( } // Check if the input string is padded - if (isPadded(input, inputSize)) { + if (isPadded(input)) { // If padded, ensure that the string length is a multiple of the encoded // block size if (inputSize % kEncodedBlockByteSize != 0) { @@ -384,7 +376,7 @@ Status Base64::calculateDecodedSize( } decodedSize = (inputSize * kBinaryBlockByteSize) / 
kEncodedBlockByteSize; - auto paddingCount = numPadding(input, inputSize); + auto paddingCount = numPadding(input); inputSize -= paddingCount; // Adjust the needed size by deducting the bytes corresponding to the @@ -394,7 +386,7 @@ Status Base64::calculateDecodedSize( kEncodedBlockByteSize; return Status::OK(); } - // If not padded, Calculate extra bytes, if any + // If not padded, calculate extra bytes, if any auto extraBytes = inputSize % kEncodedBlockByteSize; decodedSize = (inputSize / kEncodedBlockByteSize) * kBinaryBlockByteSize; @@ -413,69 +405,71 @@ Status Base64::calculateDecodedSize( // static Status Base64::decodeImpl( - const char* input, - size_t inputSize, - char* outputBuffer, - size_t outputSize, - const Base64::ReverseIndex& reverseIndex) { + std::string_view input, + std::string& output, + const ReverseIndex& reverseIndex) { + size_t inputSize = input.size(); if (inputSize == 0) { + output.clear(); return Status::OK(); } + // Calculate the decoded size based on the input size size_t decodedSize; auto status = calculateDecodedSize(input, inputSize, decodedSize); if (!status.ok()) { return status; } - if (outputSize < decodedSize) { - return Status::UserError( - "Base64::decode() - invalid output string: output string is too small."); - } + // Resize the output string to fit the decoded data + output.resize(decodedSize); + + // Set up input and output pointers + const char* inputPtr = input.data(); + char* outputPtr = output.data(); Status lookupStatus; - // Handle full groups of 4 characters - for (; inputSize > 4; inputSize -= 4, input += 4, outputBuffer += 3) { - // Each character of the 4 encodes 6 bits of the original, grab each with - // the appropriate shifts to rebuild the original and then split that back - // into the original 8-bit bytes. 
- uint32_t decodedBlock = - (base64ReverseLookup(input[0], reverseIndex, lookupStatus) << 18) | - (base64ReverseLookup(input[1], reverseIndex, lookupStatus) << 12) | - (base64ReverseLookup(input[2], reverseIndex, lookupStatus) << 6) | - base64ReverseLookup(input[3], reverseIndex, lookupStatus); + + // Process full blocks of 4 characters + size_t fullBlockCount = inputSize / 4; + for (size_t i = 0; i < fullBlockCount; ++i) { + uint8_t val0 = base64ReverseLookup(inputPtr[0], reverseIndex, lookupStatus); + uint8_t val1 = base64ReverseLookup(inputPtr[1], reverseIndex, lookupStatus); + uint8_t val2 = base64ReverseLookup(inputPtr[2], reverseIndex, lookupStatus); + uint8_t val3 = base64ReverseLookup(inputPtr[3], reverseIndex, lookupStatus); + if (!lookupStatus.ok()) { return lookupStatus; } - outputBuffer[0] = static_cast((decodedBlock >> 16) & 0xff); - outputBuffer[1] = static_cast((decodedBlock >> 8) & 0xff); - outputBuffer[2] = static_cast(decodedBlock & 0xff); + + uint32_t currentBlock = (val0 << 18) | (val1 << 12) | (val2 << 6) | val3; + outputPtr[0] = static_cast((currentBlock >> 16) & 0xFF); + outputPtr[1] = static_cast((currentBlock >> 8) & 0xFF); + outputPtr[2] = static_cast(currentBlock & 0xFF); + + inputPtr += 4; + outputPtr += 3; + } + + // Handle remaining characters (2 or 3 characters at the end) + size_t remaining = inputSize % 4; + if (remaining > 1) { + uint8_t val0 = base64ReverseLookup(inputPtr[0], reverseIndex, lookupStatus); + uint8_t val1 = base64ReverseLookup(inputPtr[1], reverseIndex, lookupStatus); + uint32_t currentBlock = (val0 << 18) | (val1 << 12); + outputPtr[0] = static_cast((currentBlock >> 16) & 0xFF); + + if (remaining == 3) { + uint8_t val2 = + base64ReverseLookup(inputPtr[2], reverseIndex, lookupStatus); + currentBlock |= (val2 << 6); + outputPtr[1] = static_cast((currentBlock >> 8) & 0xFF); + } } - // Handle the last 2-4 characters. This is similar to the above, but the - // last 2 characters may or may not exist. 
- DCHECK(inputSize >= 2); - uint32_t decodedBlock = - (base64ReverseLookup(input[0], reverseIndex, lookupStatus) << 18) | - (base64ReverseLookup(input[1], reverseIndex, lookupStatus) << 12); + // Check for any lookup errors if (!lookupStatus.ok()) { return lookupStatus; } - outputBuffer[0] = static_cast((decodedBlock >> 16) & 0xff); - if (inputSize > 2) { - decodedBlock |= base64ReverseLookup(input[2], reverseIndex, lookupStatus) - << 6; - if (!lookupStatus.ok()) { - return lookupStatus; - } - outputBuffer[1] = static_cast((decodedBlock >> 8) & 0xff); - if (inputSize > 3) { - decodedBlock |= base64ReverseLookup(input[3], reverseIndex, lookupStatus); - if (!lookupStatus.ok()) { - return lookupStatus; - } - outputBuffer[2] = static_cast(decodedBlock & 0xff); - } - } return Status::OK(); } @@ -496,20 +490,15 @@ std::string Base64::encodeUrl(const folly::IOBuf* inputBuffer) { } // static -Status Base64::decodeUrl( - const char* input, - size_t inputSize, - char* outputBuffer, - size_t outputSize) { - return decodeImpl( - input, inputSize, outputBuffer, outputSize, kBase64UrlReverseIndexTable); +Status Base64::decodeUrl(std::string_view input, std::string& output) { + return decodeImpl(input, output, kBase64UrlReverseIndexTable); } // static std::string Base64::decodeUrl(folly::StringPiece encodedText) { std::string decodedOutput; - Base64::decodeUrl( - std::make_pair(encodedText.data(), encodedText.size()), decodedOutput); + std::string_view input(encodedText.data(), encodedText.size()); + (void)decodeImpl(input, decodedOutput, kBase64UrlReverseIndexTable); return decodedOutput; } @@ -517,16 +506,8 @@ std::string Base64::decodeUrl(folly::StringPiece encodedText) { void Base64::decodeUrl( const std::pair& payload, std::string& decodedOutput) { - size_t inputSize = payload.second; - size_t decodedSize; - (void)calculateDecodedSize(payload.first, inputSize, decodedSize); - decodedOutput.resize(decodedSize); - (void)Base64::decodeImpl( - payload.first, - payload.second, - 
&decodedOutput[0], - decodedOutput.size(), - kBase64UrlReverseIndexTable); + std::string_view inputView(payload.first, payload.second); + (void)decodeImpl(inputView, decodedOutput, kBase64UrlReverseIndexTable); } } // namespace facebook::velox::encoding diff --git a/velox/common/encode/Base64.h b/velox/common/encode/Base64.h index e8dd49df1985..56ff7df84104 100644 --- a/velox/common/encode/Base64.h +++ b/velox/common/encode/Base64.h @@ -60,43 +60,30 @@ class Base64 { const std::pair& payload, std::string& output); static void decode(const char* input, size_t inputSize, char* outputBuffer); - static Status decode( - const char* input, - size_t inputSize, - char* outputBuffer, - size_t outputSize); + static Status decode(std::string_view input, std::string& output); /// Decodes the input Base64 URL encoded string. static std::string decodeUrl(folly::StringPiece encodedText); static void decodeUrl( const std::pair& payload, std::string& output); - static Status decodeUrl( - const char* input, - size_t inputSize, - char* outputBuffer, - size_t outputSize); + static Status decodeUrl(std::string_view input, std::string& output); // Helper Functions /// Calculates the encoded size based on input size. static size_t calculateEncodedSize(size_t inputSize, bool withPadding = true); - /// Calculates the decoded size based on encoded input and adjusts the input - /// size for padding. - static Status calculateDecodedSize( - const char* input, - size_t& inputSize, - size_t& decodedSize); - private: // Checks if the input Base64 string is padded. - static inline bool isPadded(const char* input, size_t inputSize) { + static inline bool isPadded(std::string_view input) { + size_t inputSize{input.size()}; return (inputSize > 0 && input[inputSize - 1] == kPadding); } // Counts the number of padding characters in encoded input. 
- static inline size_t numPadding(const char* input, size_t inputSize) { + static inline size_t numPadding(std::string_view input) { size_t numPadding{0}; + size_t inputSize{input.size()}; while (inputSize > 0 && input[inputSize - 1] == kPadding) { numPadding++; inputSize--; @@ -123,14 +110,20 @@ class Base64 { char* outputBuffer); static Status decodeImpl( - const char* input, - size_t inputSize, - char* outputBuffer, - size_t outputSize, + std::string_view input, + std::string& output, const ReverseIndex& reverseIndex); - VELOX_FRIEND_TEST(Base64Test, checksPadding); - VELOX_FRIEND_TEST(Base64Test, countsPaddingCorrectly); + // Returns the actual size of the decoded data. Will also remove the padding + // length from the 'inputSize'. + static Status calculateDecodedSize( + std::string_view input, + size_t& inputSize, + size_t& decodedSize); + + VELOX_FRIEND_TEST(Base64Test, isPadded); + VELOX_FRIEND_TEST(Base64Test, numPadding); + VELOX_FRIEND_TEST(Base64Test, calculateDecodedSize); }; } // namespace facebook::velox::encoding diff --git a/velox/common/encode/tests/Base64Test.cpp b/velox/common/encode/tests/Base64Test.cpp index ecfbf20a09f2..41f173b7d25c 100644 --- a/velox/common/encode/tests/Base64Test.cpp +++ b/velox/common/encode/tests/Base64Test.cpp @@ -25,83 +25,71 @@ namespace facebook::velox::encoding { class Base64Test : public ::testing::Test {}; TEST_F(Base64Test, fromBase64) { - EXPECT_EQ( - "Hello, World!", - Base64::decode(folly::StringPiece("SGVsbG8sIFdvcmxkIQ=="))); - EXPECT_EQ( - "Base64 encoding is fun.", - Base64::decode(folly::StringPiece("QmFzZTY0IGVuY29kaW5nIGlzIGZ1bi4="))); - EXPECT_EQ( - "Simple text", Base64::decode(folly::StringPiece("U2ltcGxlIHRleHQ="))); - EXPECT_EQ( - "1234567890", Base64::decode(folly::StringPiece("MTIzNDU2Nzg5MA=="))); + // Lambda function to reduce repetition in test cases + auto checkBase64Decode = [](const std::string& expected, + const std::string& encoded) { + EXPECT_EQ(expected, 
Base64::decode(folly::StringPiece(encoded))); + }; + + // Check encoded strings with padding + checkBase64Decode("Hello, World!", "SGVsbG8sIFdvcmxkIQ=="); + checkBase64Decode( + "Base64 encoding is fun.", "QmFzZTY0IGVuY29kaW5nIGlzIGZ1bi4="); + checkBase64Decode("Simple text", "U2ltcGxlIHRleHQ="); + checkBase64Decode("1234567890", "MTIzNDU2Nzg5MA=="); // Check encoded strings without padding - EXPECT_EQ( - "Hello, World!", - Base64::decode(folly::StringPiece("SGVsbG8sIFdvcmxkIQ"))); - EXPECT_EQ( - "Base64 encoding is fun.", - Base64::decode(folly::StringPiece("QmFzZTY0IGVuY29kaW5nIGlzIGZ1bi4"))); - EXPECT_EQ( - "Simple text", Base64::decode(folly::StringPiece("U2ltcGxlIHRleHQ"))); - EXPECT_EQ("1234567890", Base64::decode(folly::StringPiece("MTIzNDU2Nzg5MA"))); + checkBase64Decode("Hello, World!", "SGVsbG8sIFdvcmxkIQ"); + checkBase64Decode( + "Base64 encoding is fun.", "QmFzZTY0IGVuY29kaW5nIGlzIGZ1bi4"); + checkBase64Decode("Simple text", "U2ltcGxlIHRleHQ"); + checkBase64Decode("1234567890", "MTIzNDU2Nzg5MA"); } -TEST_F(Base64Test, calculateDecodedSizeProperSize) { - size_t encoded_size{0}; - size_t decoded_size{0}; - - encoded_size = 20; - Base64::calculateDecodedSize( - "SGVsbG8sIFdvcmxkIQ==", encoded_size, decoded_size); - EXPECT_EQ(18, encoded_size); - EXPECT_EQ(13, decoded_size); - - encoded_size = 18; - Base64::calculateDecodedSize( - "SGVsbG8sIFdvcmxkIQ", encoded_size, decoded_size); - EXPECT_EQ(18, encoded_size); - EXPECT_EQ(13, decoded_size); - - encoded_size = 21; - EXPECT_EQ( +TEST_F(Base64Test, calculateDecodedSize) { + auto checkDecodedSize = [](std::string_view encodedString, + size_t initialEncodedSize, + size_t expectedEncodedSize, + size_t expectedDecodedSize, + Status expectedStatus = Status::OK()) { + size_t encoded_size = initialEncodedSize; + size_t decoded_size = 0; + Status status = + Base64::calculateDecodedSize(encodedString, encoded_size, decoded_size); + + if (expectedStatus.ok()) { + EXPECT_EQ(Status::OK(), status); + 
EXPECT_EQ(expectedEncodedSize, encoded_size); + EXPECT_EQ(expectedDecodedSize, decoded_size); + } else { + EXPECT_EQ(expectedStatus, status); + } + }; + + // Using the lambda to reduce repetitive code + checkDecodedSize("SGVsbG8sIFdvcmxkIQ==", 20, 18, 13); + checkDecodedSize("SGVsbG8sIFdvcmxkIQ", 18, 18, 13); + checkDecodedSize( + "SGVsbG8sIFdvcmxkIQ===", + 21, + 0, + 0, Status::UserError( - "Base64::decode() - invalid input string: string length is not a multiple of 4."), - Base64::calculateDecodedSize( - "SGVsbG8sIFdvcmxkIQ===", encoded_size, decoded_size)); - - encoded_size = 32; - Base64::calculateDecodedSize( - "QmFzZTY0IGVuY29kaW5nIGlzIGZ1bi4=", encoded_size, decoded_size); - EXPECT_EQ(31, encoded_size); - EXPECT_EQ(23, decoded_size); - - encoded_size = 31; - Base64::calculateDecodedSize( - "QmFzZTY0IGVuY29kaW5nIGlzIGZ1bi4", encoded_size, decoded_size); - EXPECT_EQ(31, encoded_size); - EXPECT_EQ(23, decoded_size); - - encoded_size = 16; - Base64::calculateDecodedSize("MTIzNDU2Nzg5MA==", encoded_size, decoded_size); - EXPECT_EQ(14, encoded_size); - EXPECT_EQ(10, decoded_size); - - encoded_size = 14; - Base64::calculateDecodedSize("MTIzNDU2Nzg5MA", encoded_size, decoded_size); - EXPECT_EQ(14, encoded_size); - EXPECT_EQ(10, decoded_size); + "Base64::decode() - invalid input string: string length is not a multiple of 4.")); + checkDecodedSize("QmFzZTY0IGVuY29kaW5nIGlzIGZ1bi4=", 32, 31, 23); + checkDecodedSize("QmFzZTY0IGVuY29kaW5nIGlzIGZ1bi4", 31, 31, 23); + checkDecodedSize("MTIzNDU2Nzg5MA==", 16, 14, 10); + checkDecodedSize("MTIzNDU2Nzg5MA", 14, 14, 10); } -TEST_F(Base64Test, checksPadding) { - EXPECT_TRUE(Base64::isPadded("ABC=", 4)); - EXPECT_FALSE(Base64::isPadded("ABC", 3)); +TEST_F(Base64Test, isPadded) { + EXPECT_TRUE(Base64::isPadded("ABC=")); + EXPECT_FALSE(Base64::isPadded("ABC")); } -TEST_F(Base64Test, countsPaddingCorrectly) { - EXPECT_EQ(0, Base64::numPadding("ABC", 3)); - EXPECT_EQ(1, Base64::numPadding("ABC=", 4)); - EXPECT_EQ(2, 
Base64::numPadding("AB==", 4)); +TEST_F(Base64Test, numPadding) { + EXPECT_EQ(0, Base64::numPadding("ABC")); + EXPECT_EQ(1, Base64::numPadding("ABC=")); + EXPECT_EQ(2, Base64::numPadding("AB==")); } } // namespace facebook::velox::encoding diff --git a/velox/functions/prestosql/BinaryFunctions.h b/velox/functions/prestosql/BinaryFunctions.h index ea151078a26f..d3673add45b0 100644 --- a/velox/functions/prestosql/BinaryFunctions.h +++ b/velox/functions/prestosql/BinaryFunctions.h @@ -293,16 +293,15 @@ struct FromBase64Function { // same, but hard-coding one of them might be confusing. template FOLLY_ALWAYS_INLINE Status call(out_type& result, const T& input) { - auto inputSize = input.size(); - size_t decodedSize; - auto status = encoding::Base64::calculateDecodedSize( - input.data(), inputSize, decodedSize); + std::string_view inputView(input.data(), input.size()); + std::string output; + auto status = encoding::Base64::decode(inputView, output); if (!status.ok()) { return status; } - result.resize(decodedSize); - return encoding::Base64::decode( - input.data(), inputSize, result.data(), result.size()); + result.resize(output.size()); + std::memcpy(result.data(), output.data(), output.size()); + return Status::OK(); } }; @@ -311,16 +310,15 @@ struct FromBase64UrlFunction { VELOX_DEFINE_FUNCTION_TYPES(T); FOLLY_ALWAYS_INLINE Status call(out_type& result, const arg_type& input) { - auto inputSize = input.size(); - size_t decodedSize; - auto status = encoding::Base64::calculateDecodedSize( - input.data(), inputSize, decodedSize); + std::string_view inputView(input.data(), input.size()); + std::string output; + auto status = encoding::Base64::decodeUrl(inputView, output); if (!status.ok()) { return status; } - result.resize(decodedSize); - return encoding::Base64::decodeUrl( - input.data(), inputSize, result.data(), result.size()); + result.resize(output.size()); + std::memcpy(result.data(), output.data(), output.size()); + return Status::OK(); } }; From 
aa85bad3a40c4e6d1646451c316a4643237f2a43 Mon Sep 17 00:00:00 2001 From: Joe Abraham Date: Fri, 4 Oct 2024 11:14:09 +0530 Subject: [PATCH 4/7] Refactor Base64 Encode API --- velox/common/encode/Base64.cpp | 75 ++++++++++----------- velox/common/encode/Base64.h | 16 ++--- velox/functions/prestosql/BinaryFunctions.h | 23 +++++-- 3 files changed, 60 insertions(+), 54 deletions(-) diff --git a/velox/common/encode/Base64.cpp b/velox/common/encode/Base64.cpp index 5cd6491a5123..f5a0a8c54d37 100644 --- a/velox/common/encode/Base64.cpp +++ b/velox/common/encode/Base64.cpp @@ -161,10 +161,8 @@ std::string Base64::encodeImpl( const T& input, const Charset& charset, bool includePadding) { - const size_t encodedSize{calculateEncodedSize(input.size(), includePadding)}; std::string encodedResult; - encodedResult.resize(encodedSize); - (void)encodeImpl(input, charset, includePadding, encodedResult.data()); + (void)encodeImpl(input, charset, includePadding, encodedResult); return encodedResult; } @@ -184,42 +182,41 @@ size_t Base64::calculateEncodedSize(size_t inputSize, bool includePadding) { } // static -Status Base64::encode(const char* input, size_t inputSize, char* output) { - return encodeImpl( - folly::StringPiece(input, inputSize), kBase64Charset, true, output); +Status Base64::encode(std::string_view input, std::string& output) { + return encodeImpl(input, kBase64Charset, true, output); } // static -Status -Base64::encodeUrl(const char* input, size_t inputSize, char* outputBuffer) { - return encodeImpl( - folly::StringPiece(input, inputSize), - kBase64UrlCharset, - true, - outputBuffer); +Status Base64::encodeUrl(std::string_view input, std::string& output) { + return encodeImpl(input, kBase64UrlCharset, true, output); } // static template Status Base64::encodeImpl( const T& input, - const Base64::Charset& charset, + const Charset& charset, bool includePadding, - char* outputBuffer) { + std::string& output) { auto inputSize = input.size(); if (inputSize == 0) { + 
output.clear(); return Status::OK(); } - auto outputPointer = outputBuffer; + // Calculate the output size and resize the string beforehand + size_t outputSize = calculateEncodedSize(inputSize, includePadding); + output.resize(outputSize); // Resize the output string to the required size + + // Use a pointer to write into the pre-allocated buffer + auto outputPointer = output.data(); auto inputIterator = input.begin(); - // For each group of 3 bytes (24 bits) in the input, split that into - // 4 groups of 6 bits and encode that using the supplied charset lookup + // Encode input in chunks of 3 bytes for (; inputSize > 2; inputSize -= 3) { - uint32_t inputBlock = static_cast(*inputIterator++) << 16; - inputBlock |= static_cast(*inputIterator++) << 8; - inputBlock |= static_cast(*inputIterator++); + uint32_t inputBlock = uint8_t(*inputIterator++) << 16; + inputBlock |= uint8_t(*inputIterator++) << 8; + inputBlock |= uint8_t(*inputIterator++); *outputPointer++ = charset[(inputBlock >> 18) & 0x3f]; *outputPointer++ = charset[(inputBlock >> 12) & 0x3f]; @@ -227,24 +224,22 @@ Status Base64::encodeImpl( *outputPointer++ = charset[inputBlock & 0x3f]; } + // Handle remaining bytes (1 or 2 bytes) if (inputSize > 0) { - // We have either 1 or 2 input bytes left. Encode this similar to the - // above (assuming 0 for all other bytes). Optionally append the '=' - // character if it is requested. 
- uint32_t inputBlock = static_cast(*inputIterator++) << 16; + uint32_t inputBlock = uint8_t(*inputIterator++) << 16; *outputPointer++ = charset[(inputBlock >> 18) & 0x3f]; if (inputSize > 1) { - inputBlock |= static_cast(*inputIterator) << 8; + inputBlock |= uint8_t(*inputIterator) << 8; *outputPointer++ = charset[(inputBlock >> 12) & 0x3f]; *outputPointer++ = charset[(inputBlock >> 6) & 0x3f]; if (includePadding) { - *outputPointer = kPadding; + *outputPointer++ = kPadding; } } else { *outputPointer++ = charset[(inputBlock >> 12) & 0x3f]; if (includePadding) { *outputPointer++ = kPadding; - *outputPointer = kPadding; + *outputPointer++ = kPadding; } } } @@ -253,8 +248,8 @@ Status Base64::encodeImpl( } // static -std::string Base64::encode(folly::StringPiece text) { - return encodeImpl(text, kBase64Charset, true); +std::string Base64::encode(folly::StringPiece input) { + return encodeImpl(input, kBase64Charset, true); } // static @@ -426,7 +421,7 @@ Status Base64::decodeImpl( // Set up input and output pointers const char* inputPtr = input.data(); - char* outputPtr = output.data(); + char* outputPointer = output.data(); Status lookupStatus; // Process full blocks of 4 characters @@ -441,13 +436,13 @@ Status Base64::decodeImpl( return lookupStatus; } - uint32_t currentBlock = (val0 << 18) | (val1 << 12) | (val2 << 6) | val3; - outputPtr[0] = static_cast((currentBlock >> 16) & 0xFF); - outputPtr[1] = static_cast((currentBlock >> 8) & 0xFF); - outputPtr[2] = static_cast(currentBlock & 0xFF); + uint32_t inputBlock = (val0 << 18) | (val1 << 12) | (val2 << 6) | val3; + outputPointer[0] = static_cast((inputBlock >> 16) & 0xFF); + outputPointer[1] = static_cast((inputBlock >> 8) & 0xFF); + outputPointer[2] = static_cast(inputBlock & 0xFF); inputPtr += 4; - outputPtr += 3; + outputPointer += 3; } // Handle remaining characters (2 or 3 characters at the end) @@ -455,14 +450,14 @@ Status Base64::decodeImpl( if (remaining > 1) { uint8_t val0 = base64ReverseLookup(inputPtr[0], 
reverseIndex, lookupStatus); uint8_t val1 = base64ReverseLookup(inputPtr[1], reverseIndex, lookupStatus); - uint32_t currentBlock = (val0 << 18) | (val1 << 12); - outputPtr[0] = static_cast((currentBlock >> 16) & 0xFF); + uint32_t inputBlock = (val0 << 18) | (val1 << 12); + outputPointer[0] = static_cast((inputBlock >> 16) & 0xFF); if (remaining == 3) { uint8_t val2 = base64ReverseLookup(inputPtr[2], reverseIndex, lookupStatus); - currentBlock |= (val2 << 6); - outputPtr[1] = static_cast((currentBlock >> 8) & 0xFF); + inputBlock |= (val2 << 6); + outputPointer[1] = static_cast((inputBlock >> 8) & 0xFF); } } diff --git a/velox/common/encode/Base64.h b/velox/common/encode/Base64.h index 56ff7df84104..c45e745c8e8b 100644 --- a/velox/common/encode/Base64.h +++ b/velox/common/encode/Base64.h @@ -42,16 +42,15 @@ class Base64 { // Encoding Functions /// Encodes the input data using Base64 encoding. static std::string encode(const char* input, size_t inputSize); - static std::string encode(folly::StringPiece text); + static std::string encode(folly::StringPiece input); static std::string encode(const folly::IOBuf* inputBuffer); - static Status encode(const char* input, size_t inputSize, char* outputBuffer); + static Status encode(std::string_view input, std::string& outputBuffer); /// Encodes the input data using Base64 URL encoding. static std::string encodeUrl(const char* input, size_t inputSize); static std::string encodeUrl(folly::StringPiece text); static std::string encodeUrl(const folly::IOBuf* inputBuffer); - static Status - encodeUrl(const char* input, size_t inputSize, char* outputBuffer); + static Status encodeUrl(std::string_view input, std::string& output); // Decoding Functions /// Decodes the input Base64 encoded string. @@ -69,10 +68,6 @@ class Base64 { std::string& output); static Status decodeUrl(std::string_view input, std::string& output); - // Helper Functions - /// Calculates the encoded size based on input size. 
- static size_t calculateEncodedSize(size_t inputSize, bool withPadding = true); - private: // Checks if the input Base64 string is padded. static inline bool isPadded(std::string_view input) { @@ -107,7 +102,7 @@ class Base64 { const T& input, const Charset& charset, bool includePadding, - char* outputBuffer); + std::string& output); static Status decodeImpl( std::string_view input, @@ -121,6 +116,9 @@ class Base64 { size_t& inputSize, size_t& decodedSize); + // Calculates the encoded size based on input size. + static size_t calculateEncodedSize(size_t inputSize, bool withPadding = true); + VELOX_FRIEND_TEST(Base64Test, isPadded); VELOX_FRIEND_TEST(Base64Test, numPadding); VELOX_FRIEND_TEST(Base64Test, calculateDecodedSize); diff --git a/velox/functions/prestosql/BinaryFunctions.h b/velox/functions/prestosql/BinaryFunctions.h index d3673add45b0..050ab7a5d42f 100644 --- a/velox/functions/prestosql/BinaryFunctions.h +++ b/velox/functions/prestosql/BinaryFunctions.h @@ -280,8 +280,15 @@ struct ToBase64Function { FOLLY_ALWAYS_INLINE Status call(out_type& result, const arg_type& input) { - result.resize(encoding::Base64::calculateEncodedSize(input.size())); - return encoding::Base64::encode(input.data(), input.size(), result.data()); + std::string_view inputView(input.data(), input.size()); + std::string output; + auto status = encoding::Base64::encode(inputView, output); + if (!status.ok()) { + return status; + } + result.resize(output.size()); + std::memcpy(result.data(), output.data(), output.size()); + return Status::OK(); } }; @@ -328,9 +335,15 @@ struct ToBase64UrlFunction { FOLLY_ALWAYS_INLINE Status call(out_type& result, const arg_type& input) { - result.resize(encoding::Base64::calculateEncodedSize(input.size())); - return encoding::Base64::encodeUrl( - input.data(), input.size(), result.data()); + std::string_view inputView(input.data(), input.size()); + std::string output; + auto status = encoding::Base64::encodeUrl(inputView, output); + if (!status.ok()) 
{ + return status; + } + result.resize(output.size()); + std::memcpy(result.data(), output.data(), output.size()); + return Status::OK(); } }; From 9cf55d52ceb5baf7c1c5b94e6a511b3c4b98d46f Mon Sep 17 00:00:00 2001 From: Joe Abraham Date: Wed, 7 Aug 2024 18:44:55 +0530 Subject: [PATCH 5/7] Introduce utility class for encoding --- velox/common/encode/EncoderUtils.h | 167 ++++++++++++++++++ velox/common/encode/tests/CMakeLists.txt | 2 +- .../common/encode/tests/EncoderUtilsTests.cpp | 35 ++++ 3 files changed, 203 insertions(+), 1 deletion(-) create mode 100644 velox/common/encode/EncoderUtils.h create mode 100644 velox/common/encode/tests/EncoderUtilsTests.cpp diff --git a/velox/common/encode/EncoderUtils.h b/velox/common/encode/EncoderUtils.h new file mode 100644 index 000000000000..7c5a8a5b09e5 --- /dev/null +++ b/velox/common/encode/EncoderUtils.h @@ -0,0 +1,167 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include "velox/common/base/Status.h" + +namespace facebook::velox::encoding { + +/// Padding character used in encoding. +const static char kPadding = '='; + +// Checks if the input Base64 string is padded. +static inline bool isPadded(std::string_view input) { + size_t inputSize{input.size()}; + return (inputSize > 0 && input[inputSize - 1] == kPadding); +} + +// Counts the number of padding characters in encoded input. 
+static inline size_t numPadding(std::string_view input) { + size_t numPadding{0}; + size_t inputSize{input.size()}; + while (inputSize > 0 && input[inputSize - 1] == kPadding) { + numPadding++; + inputSize--; + } + return numPadding; +} + +// Validate the character in charset with ReverseIndex table +template +constexpr bool checkForwardIndex( + uint8_t index, + const Charset& charset, + const ReverseIndex& reverseIndex) { + return (reverseIndex[static_cast(charset[index])] == index) && + (index > 0 ? checkForwardIndex(index - 1, charset, reverseIndex) : true); +} + +// Searches for a character within a charset up to a certain index. +template +constexpr bool findCharacterInCharset( + const Charset& charset, + uint8_t index, + const char targetChar) { + return index < charset.size() && + ((charset[index] == targetChar) || + findCharacterInCharset(charset, index + 1, targetChar)); +} + +// Checks the consistency of a reverse index mapping for a given character set. +template +constexpr bool checkReverseIndex( + uint8_t index, + const Charset& charset, + const ReverseIndex& reverseIndex) { + return (reverseIndex[index] == 255 + ? !findCharacterInCharset(charset, 0, static_cast(index)) + : (charset[reverseIndex[index]] == index)) && + (index > 0 ? checkReverseIndex(index - 1, charset, reverseIndex) : true); +} + +template +uint8_t reverseLookup( + char encodedChar, + const ReverseIndexType& reverseIndex, + Status& status, + uint8_t kBase) { + auto curr = reverseIndex[static_cast(encodedChar)]; + if (curr >= kBase) { + status = + Status::UserError("invalid input string: contains invalid characters."); + return 0; // Return 0 or any other error code indicating failure + } + return curr; +} + +// Returns the actual size of the decoded data. Will also remove the padding +// length from the 'inputSize'. 
+static Status calculateDecodedSize( + std::string_view input, + size_t& inputSize, + size_t& decodedSize, + const int binaryBlockByteSize, + const int encodedBlockByteSize) { + if (inputSize == 0) { + decodedSize = 0; + return Status::OK(); + } + + // Check if the input string is padded + if (isPadded(input)) { + // If padded, ensure that the string length is a multiple of the encoded + // block size + if (inputSize % encodedBlockByteSize != 0) { + return Status::UserError( + "decode() - invalid input string: " + "string length is not a multiple of 4."); + } + + decodedSize = (inputSize * binaryBlockByteSize) / encodedBlockByteSize; + auto paddingCount = numPadding(input); + inputSize -= paddingCount; + + // Adjust the needed size by deducting the bytes corresponding to the + // padding from the calculated size. + decodedSize -= + ((paddingCount * binaryBlockByteSize) + (encodedBlockByteSize - 1)) / + encodedBlockByteSize; + } else { + // If not padded, calculate extra bytes, if any + auto extraBytes = inputSize % encodedBlockByteSize; + decodedSize = (inputSize / encodedBlockByteSize) * binaryBlockByteSize; + // Adjust the needed size for extra bytes, if present + if (extraBytes) { + if (extraBytes == 1) { + return Status::UserError( + "Base64::decode() - invalid input string: " + "string length cannot be 1 more than a multiple of 4."); + } + decodedSize += (extraBytes * binaryBlockByteSize) / encodedBlockByteSize; + } + } + + return Status::OK(); +} + +// Calculates the encoded size based on input size. +static size_t calculateEncodedSize( + size_t inputSize, + bool includePadding, + const int binaryBlockByteSize, + const int encodedBlockByteSize) { + if (inputSize == 0) { + return 0; + } + + // Calculate the output size assuming that we are including padding. 
+ size_t encodedSize = + ((inputSize + binaryBlockByteSize - 1) / binaryBlockByteSize) * + encodedBlockByteSize; + + if (!includePadding) { + // If the padding was not requested, subtract the padding bytes. + size_t remainder = inputSize % binaryBlockByteSize; + if (remainder != 0) { + encodedSize -= (binaryBlockByteSize - remainder); + } + } + + return encodedSize; +} + +} // namespace facebook::velox::encoding diff --git a/velox/common/encode/tests/CMakeLists.txt b/velox/common/encode/tests/CMakeLists.txt index 63f718c24745..663b2413557a 100644 --- a/velox/common/encode/tests/CMakeLists.txt +++ b/velox/common/encode/tests/CMakeLists.txt @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -add_executable(velox_common_encode_test Base64Test.cpp) +add_executable(velox_common_encode_test Base64Test.cpp EncoderUtilsTests.cpp) add_test(velox_common_encode_test velox_common_encode_test) target_link_libraries( velox_common_encode_test diff --git a/velox/common/encode/tests/EncoderUtilsTests.cpp b/velox/common/encode/tests/EncoderUtilsTests.cpp new file mode 100644 index 000000000000..e112f8125349 --- /dev/null +++ b/velox/common/encode/tests/EncoderUtilsTests.cpp @@ -0,0 +1,35 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include "velox/common/base/tests/GTestUtils.h" +#include "velox/common/encode/EncoderUtils.h" + +namespace facebook::velox::encoding { +class EncoderUtilsTest : public ::testing::Test {}; + +TEST_F(EncoderUtilsTest, isPadded) { + EXPECT_TRUE(isPadded("ABC=")); + EXPECT_FALSE(isPadded("ABC")); +} + +TEST_F(EncoderUtilsTest, numPadding) { + EXPECT_EQ(0, numPadding("ABC")); + EXPECT_EQ(1, numPadding("ABC=")); + EXPECT_EQ(2, numPadding("AB==")); +} + +} // namespace facebook::velox::encoding From 0826e7d3407e3643c84a90c82553783fa7b6e04f Mon Sep 17 00:00:00 2001 From: Joe Abraham Date: Wed, 7 Aug 2024 18:44:55 +0530 Subject: [PATCH 6/7] Refactor Base64 to use EncoderUtils --- velox/common/encode/Base64.cpp | 111 ++--------------------- velox/common/encode/Base64.h | 11 +-- velox/common/encode/tests/Base64Test.cpp | 4 +- 3 files changed, 12 insertions(+), 114 deletions(-) diff --git a/velox/common/encode/Base64.cpp b/velox/common/encode/Base64.cpp index f5a0a8c54d37..6e68f01bbaeb 100644 --- a/velox/common/encode/Base64.cpp +++ b/velox/common/encode/Base64.cpp @@ -85,15 +85,6 @@ constexpr const Base64::ReverseIndex kBase64UrlReverseIndexTable = { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255}; -// Validate the character in charset with ReverseIndex table -constexpr bool checkForwardIndex( - uint8_t index, - const Base64::Charset& charset, - const Base64::ReverseIndex& reverseIndex) { - return (reverseIndex[static_cast(charset[index])] == index) && - (index > 0 ? checkForwardIndex(index - 1, charset, reverseIndex) : true); -} - // Verify that for every entry in kBase64Charset, the corresponding entry // in kBase64ReverseIndexTable is correct. static_assert( @@ -112,28 +103,6 @@ static_assert( kBase64UrlReverseIndexTable), "kBase64UrlCharset has incorrect entries"); -// Searches for a character within a charset up to a certain index. 
-constexpr bool findCharacterInCharset( - const Base64::Charset& charset, - uint8_t index, - const char targetChar) { - return index < charset.size() && - ((charset[index] == targetChar) || - findCharacterInCharset(charset, index + 1, targetChar)); -} - -// Checks the consistency of a reverse index mapping for a given character -// set. -constexpr bool checkReverseIndex( - uint8_t index, - const Base64::Charset& charset, - const Base64::ReverseIndex& reverseIndex) { - return (reverseIndex[index] == 255 - ? !findCharacterInCharset(charset, 0, static_cast(index)) - : (charset[reverseIndex[index]] == index)) && - (index > 0 ? checkReverseIndex(index - 1, charset, reverseIndex) : true); -} - // Verify that for every entry in kBase64ReverseIndexTable, the corresponding // entry in kBase64Charset is correct. static_assert( @@ -166,21 +135,6 @@ std::string Base64::encodeImpl( return encodedResult; } -// static -size_t Base64::calculateEncodedSize(size_t inputSize, bool includePadding) { - if (inputSize == 0) { - return 0; - } - - // Calculate the output size assuming that we are including padding. - size_t encodedSize = ((inputSize + 2) / 3) * 4; - if (!includePadding) { - // If the padding was not requested, subtract the padding bytes. 
- encodedSize -= (3 - (inputSize % 3)) % 3; - } - return encodedSize; -} - // static Status Base64::encode(std::string_view input, std::string& output) { return encodeImpl(input, kBase64Charset, true, output); @@ -205,7 +159,8 @@ Status Base64::encodeImpl( } // Calculate the output size and resize the string beforehand - size_t outputSize = calculateEncodedSize(inputSize, includePadding); + size_t outputSize = calculateEncodedSize( + inputSize, includePadding, kBinaryBlockByteSize, kEncodedBlockByteSize); output.resize(outputSize); // Resize the output string to the required size // Use a pointer to write into the pre-allocated buffer @@ -337,12 +292,7 @@ uint8_t Base64::base64ReverseLookup( char encodedChar, const ReverseIndex& reverseIndex, Status& status) { - auto reverseLookupValue = reverseIndex[static_cast(encodedChar)]; - if (reverseLookupValue >= 0x40) { - status = Status::UserError( - "decode() - invalid input string: invalid characters"); - } - return reverseLookupValue; + return reverseLookup(encodedChar, reverseIndex, status, kCharsetSize); } // static @@ -350,54 +300,6 @@ Status Base64::decode(std::string_view input, std::string& output) { return decodeImpl(input, output, kBase64ReverseIndexTable); } -// static -Status Base64::calculateDecodedSize( - std::string_view input, - size_t& inputSize, - size_t& decodedSize) { - if (inputSize == 0) { - decodedSize = 0; - return Status::OK(); - } - - // Check if the input string is padded - if (isPadded(input)) { - // If padded, ensure that the string length is a multiple of the encoded - // block size - if (inputSize % kEncodedBlockByteSize != 0) { - return Status::UserError( - "Base64::decode() - invalid input string: " - "string length is not a multiple of 4."); - } - - decodedSize = (inputSize * kBinaryBlockByteSize) / kEncodedBlockByteSize; - auto paddingCount = numPadding(input); - inputSize -= paddingCount; - - // Adjust the needed size by deducting the bytes corresponding to the - // padding from the 
calculated size. - decodedSize -= - ((paddingCount * kBinaryBlockByteSize) + (kEncodedBlockByteSize - 1)) / - kEncodedBlockByteSize; - return Status::OK(); - } - // If not padded, calculate extra bytes, if any - auto extraBytes = inputSize % kEncodedBlockByteSize; - decodedSize = (inputSize / kEncodedBlockByteSize) * kBinaryBlockByteSize; - - // Adjust the needed size for extra bytes, if present - if (extraBytes) { - if (extraBytes == 1) { - return Status::UserError( - "Base64::decode() - invalid input string: " - "string length cannot be 1 more than a multiple of 4."); - } - decodedSize += (extraBytes * kBinaryBlockByteSize) / kEncodedBlockByteSize; - } - - return Status::OK(); -} - // static Status Base64::decodeImpl( std::string_view input, @@ -411,7 +313,12 @@ Status Base64::decodeImpl( // Calculate the decoded size based on the input size size_t decodedSize; - auto status = calculateDecodedSize(input, inputSize, decodedSize); + auto status = calculateDecodedSize( + input, + inputSize, + decodedSize, + kBinaryBlockByteSize, + kEncodedBlockByteSize); if (!status.ok()) { return status; } diff --git a/velox/common/encode/Base64.h b/velox/common/encode/Base64.h index c45e745c8e8b..a9c515ee6078 100644 --- a/velox/common/encode/Base64.h +++ b/velox/common/encode/Base64.h @@ -22,6 +22,7 @@ #include #include "velox/common/base/GTestMacros.h" #include "velox/common/base/Status.h" +#include "velox/common/encode/EncoderUtils.h" namespace facebook::velox::encoding { @@ -109,16 +110,6 @@ class Base64 { std::string& output, const ReverseIndex& reverseIndex); - // Returns the actual size of the decoded data. Will also remove the padding - // length from the 'inputSize'. - static Status calculateDecodedSize( - std::string_view input, - size_t& inputSize, - size_t& decodedSize); - - // Calculates the encoded size based on input size. 
- static size_t calculateEncodedSize(size_t inputSize, bool withPadding = true); - VELOX_FRIEND_TEST(Base64Test, isPadded); VELOX_FRIEND_TEST(Base64Test, numPadding); VELOX_FRIEND_TEST(Base64Test, calculateDecodedSize); diff --git a/velox/common/encode/tests/Base64Test.cpp b/velox/common/encode/tests/Base64Test.cpp index 41f173b7d25c..ed0bbb7e693c 100644 --- a/velox/common/encode/tests/Base64Test.cpp +++ b/velox/common/encode/tests/Base64Test.cpp @@ -55,7 +55,7 @@ TEST_F(Base64Test, calculateDecodedSize) { size_t encoded_size = initialEncodedSize; size_t decoded_size = 0; Status status = - Base64::calculateDecodedSize(encodedString, encoded_size, decoded_size); + calculateDecodedSize(encodedString, encoded_size, decoded_size, 3, 4); if (expectedStatus.ok()) { EXPECT_EQ(Status::OK(), status); @@ -75,7 +75,7 @@ TEST_F(Base64Test, calculateDecodedSize) { 0, 0, Status::UserError( - "Base64::decode() - invalid input string: string length is not a multiple of 4.")); + "decode() - invalid input string: string length is not a multiple of 4.")); checkDecodedSize("QmFzZTY0IGVuY29kaW5nIGlzIGZ1bi4=", 32, 31, 23); checkDecodedSize("QmFzZTY0IGVuY29kaW5nIGlzIGZ1bi4", 31, 31, 23); checkDecodedSize("MTIzNDU2Nzg5MA==", 16, 14, 10); From c01a22e083194c2c722fe7eb2656cf934adea192 Mon Sep 17 00:00:00 2001 From: Joe Abraham Date: Sat, 5 Oct 2024 10:41:36 +0530 Subject: [PATCH 7/7] Add presto function `to_base32` and `from_base32` --- velox/common/encode/Base32.cpp | 304 ++++++++++++++++++ velox/common/encode/Base32.h | 61 ++++ velox/common/encode/CMakeLists.txt | 2 +- velox/common/encode/EncoderUtils.h | 8 +- velox/docs/functions/presto/binary.rst | 41 +++ velox/functions/prestosql/BinaryFunctions.h | 39 +++ .../BinaryFunctionsRegistration.cpp | 7 + .../prestosql/tests/BinaryFunctionsTest.cpp | 71 ++++ 8 files changed, 526 insertions(+), 7 deletions(-) create mode 100644 velox/common/encode/Base32.cpp create mode 100644 velox/common/encode/Base32.h diff --git 
a/velox/common/encode/Base32.cpp b/velox/common/encode/Base32.cpp new file mode 100644 index 000000000000..846fb113b007 --- /dev/null +++ b/velox/common/encode/Base32.cpp @@ -0,0 +1,304 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "velox/common/encode/Base32.h" + +#include + +namespace facebook::velox::encoding { + +// Constants defining the size in bytes of binary and encoded blocks for Base32 +// encoding. 
+// Size of a binary block in bytes (5 bytes = 40 bits) +constexpr static int kBinaryBlockByteSize = 5; +// Size of an encoded block in bytes (8 bytes = 40 bits) +constexpr static int kEncodedBlockByteSize = 8; + +constexpr Base32::Charset kBase32Charset = { + 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', + 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', + 'W', 'X', 'Y', 'Z', '2', '3', '4', '5', '6', '7'}; + +constexpr Base32::ReverseIndex kBase32ReverseIndexTable = { + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 26, 27, 28, 29, 30, 31, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 25, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255}; + +// Verify that for each 32 entries in kBase32Charset, the corresponding entry +// in kBase32ReverseIndexTable is correct. 
+// The forward check walks down from the index it is given, so start at the
+// last charset entry to validate every character rather than only a prefix.
+static_assert(
+    checkForwardIndex(
+        kBase32Charset.size() - 1,
+        kBase32Charset,
+        kBase32ReverseIndexTable),
+    "kBase32Charset has incorrect entries");
+
+// Verify that for every entry in kBase32ReverseIndexTable, the corresponding
+// entry in kBase32Charset is correct.
+static_assert(
+    checkReverseIndex(
+        sizeof(kBase32ReverseIndexTable) - 1,
+        kBase32Charset,
+        kBase32ReverseIndexTable),
+    "kBase32ReverseIndexTable has incorrect entries.");
+
+// static
+Status Base32::encode(std::string_view input, std::string& output) {
+  // The public entry point always produces padded output.
+  return encodeImpl(input, true, output);
+}
+
+// Encodes 'input' into 'output', replacing whatever 'output' held before.
+// Padding characters are emitted only when 'includePadding' is true. Always
+// returns an OK status.
+// static
+template <class T>
+Status
+Base32::encodeImpl(const T& input, bool includePadding, std::string& output) {
+  auto inputSize = input.size();
+  if (inputSize == 0) {
+    output.clear();
+    return Status::OK();
+  }
+
+  // Size the output once up front so the loops below can write through a raw
+  // pointer without reallocating.
+  const size_t outputSize = calculateEncodedSize(
+      inputSize, includePadding, kBinaryBlockByteSize, kEncodedBlockByteSize);
+  output.resize(outputSize);
+
+  auto outputPointer = output.data();
+  auto inputIterator = input.begin();
+
+  // Reads the next input byte as an unsigned 64-bit value. Going through
+  // uint8_t first keeps bytes >= 0x80 from being sign-extended into the
+  // neighbouring bit groups when the element type is a signed char.
+  const auto nextByte = [&inputIterator]() {
+    return static_cast<uint64_t>(static_cast<uint8_t>(*inputIterator++));
+  };
+
+  // Process 5-byte (40-bit) blocks, split into 8 groups of 5 bits.
+  for (; inputSize > 4; inputSize -= 5) {
+    uint64_t currentBlock = 0;
+    for (int shift = 32; shift >= 0; shift -= 8) {
+      currentBlock |= nextByte() << shift;
+    }
+    for (int shift = 35; shift >= 0; shift -= 5) {
+      *outputPointer++ = kBase32Charset[(currentBlock >> shift) & 0x1f];
+    }
+  }
+
+  // Handle the 1 to 4 bytes left over after the last full block.
+  if (inputSize > 0) {
+    // Pack the leftover bytes into the high end of a 40-bit block.
+    uint64_t currentBlock = 0;
+    int shift = 32;
+    for (size_t i = 0; i < inputSize; ++i, shift -= 8) {
+      currentBlock |= nextByte() << shift;
+    }
+
+    // ceil(8 * inputSize / 5) characters carry data: 2, 4, 5 or 7.
+    const size_t dataChars = (inputSize * 8 + 4) / 5;
+    shift = 35;
+    for (size_t i = 0; i < dataChars; ++i, shift -= 5) {
+      *outputPointer++ = kBase32Charset[(currentBlock >> shift) & 0x1f];
+    }
+
+    // Fill the rest of the 8-character block with padding when requested.
+    if (includePadding) {
+      for (size_t i = dataChars; i < 8; ++i) {
+        *outputPointer++ = kPadding;
+      }
+    }
+  }
+
+  return Status::OK();
+}
+
+// static
+uint8_t Base32::base32ReverseLookup(char encodedChar, Status& status) {
+  return reverseLookup(
+      encodedChar, kBase32ReverseIndexTable, status, kCharsetSize);
+}
+
+// static
+Status Base32::decode(std::string_view input, std::string& output) {
+  return decodeImpl(input, output);
+}
+
+// Decodes the Base32 text in 'input' into 'output'. Returns a UserError
+// status when the input length cannot correspond to whole bytes, and forwards
+// the status set by the reverse lookup when a character cannot be decoded.
+// The contents of 'output' are unspecified when a non-OK status is returned.
+// static
+Status Base32::decodeImpl(std::string_view input, std::string& output) {
+  size_t inputSize = input.size();
+
+  // If input is empty, clear output and return OK status.
+  if (inputSize == 0) {
+    output.clear();
+    return Status::OK();
+  }
+
+  // Calculate the decoded size based on the input size.
+  // NOTE(review): 'inputSize' is presumably reduced by the helper to exclude
+  // trailing padding, since it is not re-read from 'input' below - confirm.
+  size_t decodedSize;
+  auto status = calculateDecodedSize(
+      input,
+      inputSize,
+      decodedSize,
+      kBinaryBlockByteSize,
+      kEncodedBlockByteSize);
+  if (!status.ok()) {
+    return status;
+  }
+
+  // Only 2, 4, 5 or 7 trailing characters encode a whole number of bytes
+  // (1, 2, 3 or 4). Reject every other partial block before touching the
+  // buffers: decoding 3 or 6 characters in pairs would read one character
+  // past the input and write one byte past 'decodedSize'.
+  const size_t remaining = inputSize % 8;
+  if (remaining == 1 || remaining == 3 || remaining == 6) {
+    return Status::UserError("decode() - invalid input string length.");
+  }
+
+  // Resize the output to accommodate the decoded data.
+  output.resize(decodedSize);
+
+  const char* inputPtr = input.data();
+  char* outputPtr = output.data();
+  Status lookupStatus;
+
+  // Process full blocks of 8 characters.
+  const size_t fullBlockCount = inputSize / 8;
+  for (size_t i = 0; i < fullBlockCount; ++i) {
+    uint64_t inputBlock = 0;
+
+    // Decode 8 characters into a 40-bit block.
+    for (int shift = 35, j = 0; j < 8; ++j, shift -= 5) {
+      const uint64_t value = base32ReverseLookup(inputPtr[j], lookupStatus);
+      if (!lookupStatus.ok()) {
+        return lookupStatus;
+      }
+      inputBlock |= (value << shift);
+    }
+
+    // Write the decoded block to the output.
+    outputPtr[0] = static_cast<char>((inputBlock >> 32) & 0xFF);
+    outputPtr[1] = static_cast<char>((inputBlock >> 24) & 0xFF);
+    outputPtr[2] = static_cast<char>((inputBlock >> 16) & 0xFF);
+    outputPtr[3] = static_cast<char>((inputBlock >> 8) & 0xFF);
+    outputPtr[4] = static_cast<char>(inputBlock & 0xFF);
+
+    inputPtr += 8;
+    outputPtr += 5;
+  }
+
+  // Handle the trailing partial block (2, 4, 5 or 7 characters).
+  if (remaining > 0) {
+    uint64_t inputBlock = 0;
+    int shift = 35;
+    for (size_t j = 0; j < remaining; ++j, shift -= 5) {
+      // Widen to 64 bits before shifting so the shift never happens on int.
+      const uint64_t value = base32ReverseLookup(inputPtr[j], lookupStatus);
+      if (!lookupStatus.ok()) {
+        return lookupStatus;
+      }
+      inputBlock |= (value << shift);
+    }
+
+    // floor(5 * remaining / 8) whole bytes are available: 1, 2, 3 or 4.
+    const size_t trailingBytes = (remaining * 5) / 8;
+    shift = 32;
+    for (size_t k = 0; k < trailingBytes; ++k, shift -= 8) {
+      outputPtr[k] = static_cast<char>((inputBlock >> shift) & 0xFF);
+    }
+  }
+
+  return Status::OK();
+}
+
+} // namespace facebook::velox::encoding
diff --git a/velox/common/encode/Base32.h b/velox/common/encode/Base32.h
new file mode 100644
index 000000000000..612f25e69801
--- /dev/null
+++ b/velox/common/encode/Base32.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <array>
+#include <cstdint>
+#include <string>
+#include <string_view>
+
+#include "velox/common/base/Status.h"
+#include "velox/common/encode/EncoderUtils.h"
+
+namespace facebook::velox::encoding {
+
+class Base32 {
+ public:
+  static const size_t kCharsetSize = 32;
+  static const size_t kReverseIndexSize = 256;
+
+  /// Character set used for encoding purposes.
+  /// Contains specific characters that form the encoding scheme.
+  using Charset = std::array<char, kCharsetSize>;
+
+  /// Reverse lookup table for decoding purposes.
+  /// Maps each possible encoded character to its corresponding numeric value
+  /// within the encoding base.
+  using ReverseIndex = std::array<uint8_t, kReverseIndexSize>;
+
+  /// Encodes every byte of 'input' as padded Base32 text and writes the
+  /// result to 'output', replacing its previous contents.
+  static Status encode(std::string_view input, std::string& output);
+
+  /// Decodes the Base32 text in 'input' and writes the resulting bytes to
+  /// 'output'. Returns a non-OK status when the input cannot be decoded.
+  static Status decode(std::string_view input, std::string& output);
+
+ private:
+  // Performs a reverse lookup in the reverse index to retrieve the original
+  // index of a character in the base.
+  static uint8_t base32ReverseLookup(char encodedChar, Status& status);
+
+  // Encodes 'input' with the Base32 charset, emitting padding only when
+  // 'includePadding' is true.
+  template <class T>
+  static Status
+  encodeImpl(const T& input, bool includePadding, std::string& output);
+
+  // Decodes 'input' using the Base32 reverse lookup table.
+  static Status decodeImpl(std::string_view input, std::string& output);
+};
+
+} // namespace facebook::velox::encoding
diff --git a/velox/common/encode/CMakeLists.txt b/velox/common/encode/CMakeLists.txt
index 501c690c476b..b897399daf8a 100644
--- a/velox/common/encode/CMakeLists.txt
+++ b/velox/common/encode/CMakeLists.txt
@@ -16,5 +16,5 @@ if(${VELOX_BUILD_TESTING})
   add_subdirectory(tests)
 endif()
 
-velox_add_library(velox_encode Base64.cpp)
+velox_add_library(velox_encode Base32.cpp Base64.cpp)
 velox_link_libraries(velox_encode PUBLIC Folly::folly)
diff --git a/velox/common/encode/EncoderUtils.h b/velox/common/encode/EncoderUtils.h
index 7c5a8a5b09e5..663ef22fc94d 100644
--- a/velox/common/encode/EncoderUtils.h
+++ b/velox/common/encode/EncoderUtils.h
@@ -106,9 +106,7 @@ static Status calculateDecodedSize(
     // If padded, ensure that the string length is a multiple of the encoded
     // block size
     if (inputSize % encodedBlockByteSize != 0) {
-      return Status::UserError(
-          "decode() - invalid input string: "
-          "string length is not a multiple of 4.");
+      return Status::UserError("decode() - invalid input string length.");
     }
 
     decodedSize = (inputSize * binaryBlockByteSize) / encodedBlockByteSize;
@@ -127,9 +125,7 @@ static Status calculateDecodedSize(
   // Adjust the needed size for extra bytes, if present
   if (extraBytes) {
     if (extraBytes == 1) {
-      return Status::UserError(
-          "Base64::decode() - invalid input string: "
-          "string length cannot be 1 more than a multiple of 4.");
+      return Status::UserError("decode() - invalid input string length.");
     }
     decodedSize += (extraBytes * binaryBlockByteSize) / encodedBlockByteSize;
   }
diff --git a/velox/docs/functions/presto/binary.rst b/velox/docs/functions/presto/binary.rst
index 07deb3e4b0e9..97904d62ed9c 100644
--- a/velox/docs/functions/presto/binary.rst
+++ b/velox/docs/functions/presto/binary.rst
@@ -32,6 +32,36 @@ Binary Functions
 
     Decodes ``string`` data from the base64 encoded representation using the `URL safe alphabet <https://www.rfc-editor.org/rfc/rfc4648#section-5>`_ into a varbinary.
 
+.. function:: from_base32(string) -> varbinary
+
+    Decodes a Base32-encoded ``string`` back into its original binary form.
+    Both padded and non-padded input are accepted. Input with an invalid
+    length, or containing characters outside the Base32 alphabet, results in
+    a "UserError" status being returned.
+
+.. function:: from_base64(string) -> varbinary
+
+    Decodes a Base64-encoded ``string`` back into its original binary form.
+    This function can handle both padded and non-padded Base64 encoded strings.
+    Partially padded Base64 strings will result in a "UserError" status being returned.
+
+    Examples
+    --------
+    Query with padded Base64 string:
+    ::
+        SELECT from_base64('SGVsbG8gV29ybGQ='); -- [72, 101, 108, 108, 111, 32, 87, 111, 114, 108, 100]
+
+    Query with non-padded Base64 string:
+    ::
+        SELECT from_base64('SGVsbG8gV29ybGQ'); -- [72, 101, 108, 108, 111, 32, 87, 111, 114, 108, 100]
+
+    Query with partially padded Base64 string:
+    ::
+        SELECT from_base64('SGVsbG8gV29ybGQgZm9yIHZlbG94IQ='); -- Error: decode() - invalid input string length.
+
+    In the examples above, both fully padded and non-padded Base64 strings ('SGVsbG8gV29ybGQ=' and 'SGVsbG8gV29ybGQ') decode to the binary representation of the text 'Hello World'.
+    The partially padded Base64 string 'SGVsbG8gV29ybGQgZm9yIHZlbG94IQ=' will result in a "UserError" status indicating the Base64 string is invalid.
+
 .. function:: from_big_endian_32(varbinary) -> integer
 
     Decodes ``integer`` value from a 32-bit 2’s complement big endian ``binary``.
@@ -122,6 +152,24 @@ Binary Functions
 
     Encodes ``binary`` into a base64 string representation.
 
+.. function:: to_base32(varbinary) -> varchar
+
+    Encodes a ``varbinary`` value into its Base32 string representation.
+    The result is always padded to a multiple of 8 characters.
+
+    Examples
+    --------
+    Query to encode a binary value to a padded Base32 string:
+    ::
+        SELECT to_base32(ARRAY[72, 101, 108, 108, 111, 32, 87, 111, 114, 108, 100]); -- 'JBSWY3DPEBLW64TMMQ======'
+
+    Query to encode a binary value whose length is a multiple of 5 bytes, so no padding is needed:
+    ::
+        SELECT to_base32(ARRAY[104, 101, 108, 108, 111]); -- 'NBSWY3DP'
+
+    In the above examples, the binary array `[72, 101, 108, 108, 111, 32, 87, 111, 114, 108, 100]` is encoded to the padded Base32 string 'JBSWY3DPEBLW64TMMQ======'.
+    The binary array `[104, 101, 108, 108, 111]` is encoded to 'NBSWY3DP'.
+
 .. function:: to_base64url(binary) -> varchar
 
     Encodes ``binary`` into a base64 string representation using the `URL safe alphabet <https://www.rfc-editor.org/rfc/rfc4648#section-5>`_.
diff --git a/velox/functions/prestosql/BinaryFunctions.h b/velox/functions/prestosql/BinaryFunctions.h
index 050ab7a5d42f..0118765dabb8 100644
--- a/velox/functions/prestosql/BinaryFunctions.h
+++ b/velox/functions/prestosql/BinaryFunctions.h
@@ -21,6 +21,7 @@
 #include "folly/ssl/OpenSSLHash.h"
 
 #include "velox/common/base/BitUtil.h"
+#include "velox/common/encode/Base32.h"
 #include "velox/common/encode/Base64.h"
 #include "velox/external/md5/md5.h"
 #include "velox/functions/Udf.h"
@@ -347,6 +348,44 @@ struct ToBase64UrlFunction {
   }
 };
 
+template
+struct ToBase32Function {
+  VELOX_DEFINE_FUNCTION_TYPES(T);
+
+  FOLLY_ALWAYS_INLINE Status
+  call(out_type& result, const arg_type& input) {
+    std::string_view inputView(input.data(), input.size());
+    std::string output;
+    auto status = encoding::Base32::encode(inputView, output);
+    if (!status.ok()) {
+      return status;
+    }
+    result.resize(output.size());
+    std::memcpy(result.data(), output.data(), output.size());
+    return Status::OK();
+  }
+};
+
+template
+struct FromBase32Function {
+  VELOX_DEFINE_FUNCTION_TYPES(TExec);
+
+  // Decodes Base32 text in 'input' into bytes. from_base32 is registered for
+  // both VARCHAR and VARBINARY input, so 'input' may carry either type.
+  FOLLY_ALWAYS_INLINE Status
+  call(out_type& result, const arg_type& input) {
+    std::string_view inputView(input.data(), input.size());
+    std::string output;
+    auto status = encoding::Base32::decode(inputView, output);
+    if (!status.ok()) {
+      return status;
+    }
+    result.resize(output.size());
+    std::memcpy(result.data(), output.data(), output.size());
+    return Status::OK();
+  }
+};
+
 template
 struct FromBigEndian32 {
   VELOX_DEFINE_FUNCTION_TYPES(T);
diff --git a/velox/functions/prestosql/registration/BinaryFunctionsRegistration.cpp b/velox/functions/prestosql/registration/BinaryFunctionsRegistration.cpp
index 6f098ebadc51..6ac4ff1bce75 100644
--- a/velox/functions/prestosql/registration/BinaryFunctionsRegistration.cpp
+++ b/velox/functions/prestosql/registration/BinaryFunctionsRegistration.cpp
@@ -56,6 +56,13 @@ void registerSimpleFunctions(const std::string& prefix) {
   registerFunction(
       {prefix + "from_base64url"});
 
+  registerFunction(
+      {prefix + "to_base32"});
+  registerFunction(
+      {prefix + "from_base32"});
+  registerFunction(
+      {prefix + "from_base32"});
+
   registerFunction(
       {prefix + "from_big_endian_32"});
   registerFunction(
diff --git a/velox/functions/prestosql/tests/BinaryFunctionsTest.cpp b/velox/functions/prestosql/tests/BinaryFunctionsTest.cpp
index 2ce20c3f58e2..3fe44a8c914b 100644
--- a/velox/functions/prestosql/tests/BinaryFunctionsTest.cpp
+++ b/velox/functions/prestosql/tests/BinaryFunctionsTest.cpp
@@ -481,6 +481,77 @@ TEST_F(BinaryFunctionsTest, fromBase64Url) {
   EXPECT_THROW(fromBase64Url("YQ=/"), VeloxUserError);
 }
 
+TEST_F(BinaryFunctionsTest, toBase32) {
+  const auto toBase32 = [&](std::optional value) {
+    return evaluateOnce("to_base32(cast(c0 as varbinary))", value);
+  };
+
+  EXPECT_EQ(std::nullopt, toBase32(std::nullopt));
+  EXPECT_EQ("", toBase32(""));
+  EXPECT_EQ("ME======", toBase32("a"));
+  EXPECT_EQ("MFRGG===", toBase32("abc"));
+  EXPECT_EQ("NZXQ====", toBase32("no"));
+  EXPECT_EQ("O5SQ====", toBase32("we"));
+  EXPECT_EQ("MRRDE===", toBase32("db2"));
+  EXPECT_EQ("MNQWWZI=", toBase32("cake"));
+  EXPECT_EQ("NNSWK3Q=", toBase32("keen"));
+  EXPECT_EQ("GEZDGNA=", toBase32("1234"));
+  EXPECT_EQ("NBSWY3DPEB3W64TMMQ======", toBase32("hello world"));
+  EXPECT_EQ(
+      "JBSWY3DPEBLW64TMMQQGM4TPNUQFMZLMN54CC===",
+      toBase32("Hello World from Velox!"));
+}
+
+TEST_F(BinaryFunctionsTest, fromBase32) {
+  const auto fromBase32 = [&](std::optional value) {
+    // from_base32 accepts VARCHAR and VARBINARY input; both must agree.
+    auto result =
+        evaluateOnce("from_base32(c0)", VARCHAR(), value);
+    auto otherResult =
+        evaluateOnce("from_base32(c0)", VARBINARY(), value);
+
+    VELOX_CHECK_EQ(result.has_value(), otherResult.has_value());
+
+    if (!result.has_value()) {
+      return result;
+    }
+
+    VELOX_CHECK_EQ(result.value(), otherResult.value());
+    return result;
+  };
+
+  EXPECT_EQ(std::nullopt, fromBase32(std::nullopt));
+  EXPECT_EQ("", fromBase32(""));
+  EXPECT_EQ("a", fromBase32("ME======"));
+  EXPECT_EQ("ab", fromBase32("MFRA===="));
+  EXPECT_EQ("abc", fromBase32("MFRGG==="));
+  EXPECT_EQ("db2", fromBase32("MRRDE==="));
+  EXPECT_EQ("abcd", fromBase32("MFRGGZA="));
+  EXPECT_EQ("hello world", fromBase32("NBSWY3DPEB3W64TMMQ======"));
+  EXPECT_EQ(
+      "Hello World from Velox!",
+      fromBase32("JBSWY3DPEBLW64TMMQQGM4TPNUQFMZLMN54CC==="));
+
+  // Unpadded input must decode the same way as its padded form.
+  EXPECT_EQ("a", fromBase32("ME"));
+  EXPECT_EQ("ab", fromBase32("MFRA"));
+  EXPECT_EQ("abc", fromBase32("MFRGG"));
+  EXPECT_EQ("db2", fromBase32("MRRDE"));
+  EXPECT_EQ("abcd", fromBase32("MFRGGZA"));
+  EXPECT_EQ("1234", fromBase32("GEZDGNA"));
+  EXPECT_EQ("abcde", fromBase32("MFRGGZDF"));
+  EXPECT_EQ("abcdef", fromBase32("MFRGGZDFMY"));
+
+  VELOX_ASSERT_USER_THROW(
+      fromBase32("1="), "decode() - invalid input string length.");
+  VELOX_ASSERT_USER_THROW(
+      fromBase32("M1======"),
+      "invalid input string: contains invalid characters.");
+  VELOX_ASSERT_USER_THROW(
+      fromBase32("J$======"),
+      "invalid input string: contains invalid characters.");
+}
+
TEST_F(BinaryFunctionsTest, fromBigEndian32) { const auto fromBigEndian32 = [&](const std::optional& arg) { return evaluateOnce("from_big_endian_32(c0)", VARBINARY(), arg);