Skip to content

Commit

Permalink
Add from_base32 presto function
Browse files Browse the repository at this point in the history
  • Loading branch information
Joe-Abraham committed Jul 4, 2024
1 parent 2345bae commit 7fd99c3
Show file tree
Hide file tree
Showing 9 changed files with 395 additions and 2 deletions.
191 changes: 191 additions & 0 deletions velox/common/encode/Base32.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,191 @@
/*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "velox/common/encode/Base32.h"

#include <glog/logging.h>

namespace facebook::velox::encoding {

// Number of distinct symbols in the Base32 alphabet; each symbol carries
// 5 bits of payload.
static constexpr int kBase{32};

// Base32 works on fixed-size groups: 5 binary bytes (40 bits) are represented
// by 8 encoded characters of 5 payload bits each (also 40 bits).

// Number of binary bytes in one group.
static constexpr int kBinaryBlockByteSize{5};
// Number of encoded characters in one group.
static constexpr int kEncodedBlockByteSize{8};

// Forward table: entry i is the character that encodes the 5-bit value i.
// Values 0-25 map to 'A'-'Z' and values 26-31 map to '2'-'7', which is the
// standard RFC 4648 Base32 alphabet. Only the first 32 entries are spelled
// out here.
constexpr Charset kBase32Charset = {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
'Y', 'Z', '2', '3', '4', '5', '6', '7'};

// Reverse table with 256 entries, indexed by the byte value of an input
// character. Bytes 50-55 ('2'-'7') hold 26-31 and bytes 65-90 ('A'-'Z') hold
// 0-25; every other byte, including lowercase letters and '=', holds 255.
// NOTE(review): 255 is >= kBase, so it presumably acts as the "invalid
// character" marker for baseReverseLookup() - confirm in EncoderUtils.h.
constexpr ReverseIndex kBase32ReverseIndexTable = {
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 26, 27, 28, 29, 30, 31, 255, 255, 255, 255,
255, 255, 255, 255, 255, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
25, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255};

/// Compile-time check that each of the 32 entries in kBase32Charset maps back
/// to its own index through kBase32ReverseIndexTable.
/// NOTE(review): the first argument is the highest index to check. The
/// `/ 2 - 1` only yields 31 if Charset occupies 64 bytes - confirm against the
/// Charset definition; `kBase - 1` would state the intent directly.
static_assert(
checkForwardIndex(
sizeof(kBase32Charset) / 2 - 1,
kBase32Charset,
kBase32ReverseIndexTable),
"kBase32Charset has incorrect entries");

/// Compile-time check of the opposite direction: every entry of
/// kBase32ReverseIndexTable, starting from the last index, must be consistent
/// with kBase32Charset for an alphabet of kBase symbols.
static_assert(
checkReverseIndex(
sizeof(kBase32ReverseIndexTable) - 1,
kBase32Charset,
kBase,
kBase32ReverseIndexTable),
"kBase32ReverseIndexTable has incorrect entries.");

size_t Base32::calculateDecodedSize(const char* data, size_t& size) {
if (size == 0) {
return 0;
}

// Check if the input data is padded
if (isPadded(data, size)) {
/// If padded, ensure that the string length is a multiple of the encoded
/// block size.
if (size % kEncodedBlockByteSize != 0) {
VELOX_USER_FAIL(
"Base32::decode() - invalid input string: "
"string length is not a multiple of 8.");
}

auto needed = (size * kBinaryBlockByteSize) / kEncodedBlockByteSize;
auto padding = numPadding(data, size);
size -= padding;

// Adjust the needed size by deducting the bytes corresponding to the
// padding from the calculated size.
return needed -
((padding * kBinaryBlockByteSize) + (kEncodedBlockByteSize - 1)) /
kEncodedBlockByteSize;
} else {
// If not padded, calculate extra bytes, if any.
auto extra = size % kEncodedBlockByteSize;
auto needed = (size / kEncodedBlockByteSize) * kBinaryBlockByteSize;

// Adjust the needed size for extra bytes, if present.
if (extra) {
if ((extra == 6) || (extra == 3) || (extra == 1)) {
VELOX_USER_FAIL(
"Base32::decode() - invalid input string: "
"string length cannot be 6, 3 or 1 more than a multiple of 8.");
}
needed += (extra * kBinaryBlockByteSize) / kEncodedBlockByteSize;
}

return needed;
}
}

/// Public entry point for decoding: forwards to decodeImpl() using the
/// Base32 reverse lookup table defined in this file and returns its result.
size_t
Base32::decode(const char* src, size_t src_len, char* dst, size_t dst_len) {
  const auto& reverseIndex = kBase32ReverseIndexTable;
  return decodeImpl(src, src_len, dst, dst_len, reverseIndex);
}

size_t Base32::decodeImpl(
const char* src,
size_t src_len,
char* dst,
size_t dst_len,
const ReverseIndex& reverse_lookup) {
if (!src_len) {
return 0;
}

auto needed = calculateDecodedSize(src, src_len);
if (dst_len < needed) {
VELOX_USER_FAIL(
"Base32::decode() - invalid output string: "
"output string is too small.");
}

// Handle full groups of 8 characters.
for (; src_len > 8; src_len -= 8, src += 8, dst += 5) {
/// Each character of the 8 bytes encode 5 bits of the original, grab each
/// with the appropriate shifts to rebuild the original and then split that
/// back into the original 8 bit bytes.
uint64_t last =
(uint64_t(baseReverseLookup(kBase, src[0], reverse_lookup)) << 35) |
(uint64_t(baseReverseLookup(kBase, src[1], reverse_lookup)) << 30) |
(baseReverseLookup(kBase, src[2], reverse_lookup) << 25) |
(baseReverseLookup(kBase, src[3], reverse_lookup) << 20) |
(baseReverseLookup(kBase, src[4], reverse_lookup) << 15) |
(baseReverseLookup(kBase, src[5], reverse_lookup) << 10) |
(baseReverseLookup(kBase, src[6], reverse_lookup) << 5) |
baseReverseLookup(kBase, src[7], reverse_lookup);
dst[0] = (last >> 32) & 0xff;
dst[1] = (last >> 24) & 0xff;
dst[2] = (last >> 16) & 0xff;
dst[3] = (last >> 8) & 0xff;
dst[4] = last & 0xff;
}

/// Handle the last 2, 4, 5, 7 or 8 characters. This is similar to the above,
/// but the last characters may or may not exist.
DCHECK(src_len >= 2);
uint64_t last =
(uint64_t(baseReverseLookup(kBase, src[0], reverse_lookup)) << 35) |
(uint64_t(baseReverseLookup(kBase, src[1], reverse_lookup)) << 30);
dst[0] = (last >> 32) & 0xff;
if (src_len > 2) {
last |= baseReverseLookup(kBase, src[2], reverse_lookup) << 25;
last |= baseReverseLookup(kBase, src[3], reverse_lookup) << 20;
dst[1] = (last >> 24) & 0xff;
if (src_len > 4) {
last |= baseReverseLookup(kBase, src[4], reverse_lookup) << 15;
dst[2] = (last >> 16) & 0xff;
if (src_len > 5) {
last |= baseReverseLookup(kBase, src[5], reverse_lookup) << 10;
last |= baseReverseLookup(kBase, src[6], reverse_lookup) << 5;
dst[3] = (last >> 8) & 0xff;
if (src_len > 7) {
last |= baseReverseLookup(kBase, src[7], reverse_lookup);
dst[4] = last & 0xff;
}
}
}
}

return needed;
}

} // namespace facebook::velox::encoding
52 changes: 52 additions & 0 deletions velox/common/encode/Base32.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
/*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once

#include <exception>
#include <map>
#include <string>

#include <folly/Range.h>

#include "velox/common/base/Exceptions.h"
#include "velox/common/encode/EncoderUtils.h"

namespace facebook::velox::encoding {

/// Static helpers for decoding Base32 text that uses the alphabet 'A'-'Z',
/// '2'-'7'. Input may be fully padded with '=' or not padded at all.
class Base32 {
 public:
  /// Returns the number of bytes that decoding the specified input produces.
  ///
  /// @param data Encoded input.
  /// @param size In: number of characters in 'data'. Out: the same count
  /// with the length of the trailing padding, if any, subtracted.
  /// @throws VeloxUserError if a padded input is not a multiple of 8
  /// characters long, or if an unpadded input is 1, 3 or 6 characters longer
  /// than a multiple of 8.
  static size_t calculateDecodedSize(const char* data, size_t& size);

  /// Decodes 'src_len' characters from 'src' and writes the result to 'dst'.
  /// 'dst_len' is the capacity of 'dst'; it must be at least the value
  /// returned by calculateDecodedSize() for the same input.
  ///
  /// @return Number of bytes written to 'dst'.
  /// @throws VeloxUserError if the input length is invalid or 'dst' is too
  /// small.
  static size_t
  decode(const char* src, size_t src_len, char* dst, size_t dst_len);

 private:
  /// Decodes 'src_len' Base32 characters from 'src' into 'dst', translating
  /// each character to its 5-bit value through 'reverse_lookup'. Returns the
  /// number of bytes written.
  static size_t decodeImpl(
      const char* src,
      size_t src_len,
      char* dst,
      size_t dst_len,
      const ReverseIndex& reverse_lookup);
};

} // namespace facebook::velox::encoding
2 changes: 1 addition & 1 deletion velox/common/encode/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,5 +16,5 @@ if(${VELOX_BUILD_TESTING})
add_subdirectory(tests)
endif()

add_library(velox_encode Base64.cpp)
add_library(velox_encode Base32.cpp Base64.cpp)
target_link_libraries(velox_encode PUBLIC Folly::folly)
56 changes: 56 additions & 0 deletions velox/common/encode/tests/Base32Test.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
/*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "velox/common/encode/Base32.h"
#include <gtest/gtest.h>
#include "velox/common/base/tests/GTestUtils.h"

namespace facebook::velox::encoding {

// Empty gtest fixture; it only groups the Base32 test cases under one name.
class Base32Test : public ::testing::Test {};

// Checks, for padded and unpadded inputs, both the decoded size returned by
// calculateDecodedSize() and the padding-free length it writes back.
TEST_F(Base32Test, calculateDecodedSizeProperSize) {
  struct TestCase {
    // Base32 input passed to calculateDecodedSize().
    std::string encoded;
    // Expected return value. Kept as size_t so the comparison with the
    // function's size_t result does not mix signed and unsigned operands.
    size_t expectedDecodedSize;
    // Expected value of the in/out size argument after the call.
    size_t expectedEncodedSize;
  };

  const std::vector<TestCase> testCases = {
      {"ME======", 1, 2},
      {"ME", 1, 2},
      {"MFRA====", 2, 4},
      {"MFRGG===", 3, 5},
      {"NBSWY3DPEB3W64TMMQ======", 11, 18},
      {"NBSWY3DPEB3W64TMMQ", 11, 18}};

  for (const auto& testCase : testCases) {
    SCOPED_TRACE(testCase.encoded);
    // Take the length from the string itself so it cannot drift from the
    // literal.
    size_t encodedSize = testCase.encoded.size();
    EXPECT_EQ(
        testCase.expectedDecodedSize,
        Base32::calculateDecodedSize(testCase.encoded.data(), encodedSize));
    EXPECT_EQ(testCase.expectedEncodedSize, encodedSize);
  }
}

// A padded input whose total length is not a multiple of 8 must be rejected.
TEST_F(Base32Test, errorWhenDecodedStringPartiallyPadded) {
  // "MFRA====" with two of its four padding characters removed. The real
  // string length is passed so that the last inspected character is '=' and
  // the padded-input validation is the code path that throws.
  const std::string partiallyPadded = "MFRA==";
  size_t encodedSize = partiallyPadded.size();
  EXPECT_THROW(
      Base32::calculateDecodedSize(partiallyPadded.data(), encodedSize),
      VeloxUserError);
}

} // namespace facebook::velox::encoding
3 changes: 2 additions & 1 deletion velox/common/encode/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.

add_executable(velox_common_encode_test Base64Test.cpp EncoderUtilsTests.cpp)
add_executable(velox_common_encode_test Base32Test.cpp Base64Test.cpp
EncoderUtilsTests.cpp)
add_test(velox_common_encode_test velox_common_encode_test)
target_link_libraries(
velox_common_encode_test
Expand Down
22 changes: 22 additions & 0 deletions velox/docs/functions/presto/binary.rst
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,28 @@ Binary Functions

Decodes ``bigint`` value from a 64-bit 2’s complement big endian ``binary``.

.. function:: from_base32(string) -> varbinary

Decodes a Base32-encoded ``string`` back into its original binary form.
This function can handle both padded and non-padded Base32 encoded strings. Partially padded Base32 strings will result in an error.

Examples
--------
Query with padded Base32 string:
::
SELECT from_base32('JBSWY3DPEBLW64TMMQ======'); -- [72, 101, 108, 108, 111, 32, 87, 111, 114, 108, 100]

Query with non-padded Base32 string:
::
SELECT from_base32('JBSWY3DPEBLW64TMMQ'); -- [72, 101, 108, 108, 111, 32, 87, 111, 114, 108, 100]

Query with partially padded Base32 string:
::
SELECT from_base32('JBSWY3DPEBLW64TM=='); -- Error: Base32::decode() - invalid input string: string length is not a multiple of 8.

In the examples above, both fully padded and non-padded Base32 strings ('JBSWY3DPEBLW64TMMQ======' and 'JBSWY3DPEBLW64TMMQ') decode to the binary representation of the text 'Hello World'.
The partially padded Base32 string 'JBSWY3DPEBLW64TM==' will lead to a decoding error.

.. function:: from_hex(string) -> varbinary

Decodes binary data from the hex encoded ``string``.
Expand Down
17 changes: 17 additions & 0 deletions velox/functions/prestosql/BinaryFunctions.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@

#include "folly/ssl/OpenSSLHash.h"
#include "velox/common/base/BitUtil.h"
#include "velox/common/encode/Base32.h"
#include "velox/common/encode/Base64.h"
#include "velox/external/md5/md5.h"
#include "velox/functions/Udf.h"
Expand Down Expand Up @@ -324,6 +325,22 @@ struct ToBase64UrlFunction {
}
};

/// Presto from_base32(): decodes Base32 text into its binary form.
template <typename TExec>
struct FromBase32Function {
  VELOX_DEFINE_FUNCTION_TYPES(TExec);

  // The input type is a template parameter because the function is
  // registered for both arg_type<Varchar> and arg_type<Varbinary>; naming
  // only one of them in the signature would be misleading.
  template <typename T>
  FOLLY_ALWAYS_INLINE void call(out_type<Varbinary>& result, const T& input) {
    const char* const encoded = input.data();
    // calculateDecodedSize() takes the length by reference and removes any
    // trailing padding from it; decode() is then given that reduced length.
    auto encodedSize = input.size();
    const auto decodedSize =
        encoding::Base32::calculateDecodedSize(encoded, encodedSize);
    result.resize(decodedSize);
    encoding::Base32::decode(
        encoded, encodedSize, result.data(), result.size());
  }
};

template <typename T>
struct FromBigEndian32 {
VELOX_DEFINE_FUNCTION_TYPES(T);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,11 @@ void registerSimpleFunctions(const std::string& prefix) {
registerFunction<FromBase64UrlFunction, Varbinary, Varchar>(
{prefix + "from_base64url"});

registerFunction<FromBase32Function, Varbinary, Varchar>(
{prefix + "from_base32"});
registerFunction<FromBase32Function, Varbinary, Varbinary>(
{prefix + "from_base32"});

registerFunction<FromBigEndian32, int32_t, Varbinary>(
{prefix + "from_big_endian_32"});
registerFunction<ToBigEndian32, Varbinary, int32_t>(
Expand Down
Loading

0 comments on commit 7fd99c3

Please sign in to comment.