Skip to content

Commit

Permalink
Add presto functions to_base and from_base
Browse files Browse the repository at this point in the history
  • Loading branch information
Joe-Abraham committed Dec 8, 2023
1 parent ebe26c3 commit f6f4ed3
Show file tree
Hide file tree
Showing 7 changed files with 465 additions and 2 deletions.
296 changes: 296 additions & 0 deletions velox/common/encode/Base32.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,296 @@
/*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "velox/common/encode/Base32.h"

#include <glog/logging.h>
#include <stdint.h>

namespace facebook::velox::encoding {

/// Forward lookup table for Base32 encoding: entry i is the character emitted
/// for the 5-bit value i. Upper-case letters come first, followed by the
/// digits '2' through '7'.
constexpr const Base32::Charset kBase32Charset = {
    // Values 0 - 7.
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
    // Values 8 - 15.
    'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
    // Values 16 - 23.
    'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
    // Values 24 - 31.
    'Y', 'Z', '2', '3', '4', '5', '6', '7'};

/// Reverse lookup table for Base32 decoding, indexed by the byte value of an
/// input character. Entries for 'A'-'Z' (65-90) hold 0-25 and entries for
/// '2'-'7' (50-55) hold 26-31, mirroring kBase32Charset. Every other position,
/// including the lower-case letters, holds 255.
/// The rows below contain 15 entries each, so row k starts at index 15 * k.
constexpr const Base32::ReverseIndex kBase32ReverseIndexTable = {
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 26, 27, 28, 29, 30, 31, 255, 255, 255, 255, // 45-59: '2'-'7' at 50-55.
255, 255, 255, 255, 255, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, // 60-74: 'A'-'J' at 65-74.
10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, // 75-89: 'K'-'Y'.
25, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 90-104: 'Z' at 90.
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255};

/// Compile-time consistency check of kBase32Charset against
/// kBase32ReverseIndexTable: each of the 32 charset entries is expected to map
/// back to its own index through the reverse table.
/// NOTE(review): the first argument is presumably the highest charset index to
/// check. 'sizeof(kBase32Charset) / 2 - 1' equals 31 only if Base32::Charset
/// holds 64 one-byte entries - confirm against the definition in Base.h.
static_assert(
checkForwardIndex(
sizeof(kBase32Charset) / 2 - 1,
kBase32Charset,
kBase32ReverseIndexTable),
"kBase32Charset has incorrect entries");

/// Compile-time consistency check in the opposite direction: every entry of
/// kBase32ReverseIndexTable is expected to agree with kBase32Charset for the
/// base Base32::kBase.
/// NOTE(review): the first argument is presumably the highest table index to
/// check; 'sizeof(kBase32ReverseIndexTable) - 1' is the last index only if the
/// table elements are one byte wide - confirm against Base.h.
static_assert(
checkReverseIndex(
sizeof(kBase32ReverseIndexTable) - 1,
kBase32Charset,
Base32::kBase,
kBase32ReverseIndexTable),
"kBase32ReverseIndexTable has incorrect entries.");

/// Returns the number of characters produced when 'size' input bytes are
/// Base32 encoded.
///
/// With padding, every started group of 5 input bytes produces a full group
/// of 8 output characters. Without padding, only the characters that carry
/// data are counted, i.e. ceil(size * 8 / 5): a trailing group of 1, 2, 3 or
/// 4 bytes contributes 2, 4, 5 or 7 characters respectively.
///
/// @param size Number of input bytes.
/// @param withPadding Whether trailing '=' padding characters are counted.
/// @return The encoded length in characters; 0 when 'size' is 0.
// static
size_t Base32::calculateEncodedSize(size_t size, bool withPadding) {
  if (size == 0) {
    return 0;
  }

  constexpr size_t kBytesPerGroup = 5;
  constexpr size_t kCharsPerGroup = 8;

  const size_t fullGroups = size / kBytesPerGroup;
  const size_t tailBytes = size % kBytesPerGroup;

  if (withPadding) {
    // A partial trailing group is padded out to a complete group.
    return (fullGroups + (tailBytes != 0 ? 1 : 0)) * kCharsPerGroup;
  }

  // Each character carries 5 bits, so 'tailBytes' bytes need
  // ceil(tailBytes * 8 / 5) characters. Computing the tail separately from the
  // full groups avoids overflowing on 'size * 8' for very large inputs.
  const size_t tailChars =
      (tailBytes * 8 + kBytesPerGroup - 1) / kBytesPerGroup;
  return fullGroups * kCharsPerGroup + tailChars;
}

// static
void Base32::encode(const char* data, size_t len, char* output) {
encodeImpl(folly::StringPiece(data, len), kBase32Charset, true, output);
}

/// Encodes the bytes of 'data' into 'out' using the 5-bit lookup table
/// 'charset'.
///
/// The input is consumed in groups of 5 bytes (40 bits); each group is split
/// into 8 values of 5 bits which are mapped through 'charset'. A trailing
/// group of 1, 2, 3 or 4 bytes produces 2, 4, 5 or 7 characters (missing bits
/// are treated as 0) and, if 'include_pad' is set, is filled up to 8
/// characters with kBasePad.
///
/// @param data Input container; only size(), begin() and dereferencing the
/// iterator to a byte-sized value are used.
/// @param charset Lookup table from 5-bit value to output character.
/// @param include_pad Whether to append kBasePad after a partial last group.
/// @param out Destination buffer; the caller must provide enough space.
/// Nothing is written when 'data' is empty.
template <class T>
/* static */ void Base32::encodeImpl(
    const T& data,
    const Charset& charset,
    bool include_pad,
    char* out) {
  auto len = data.size();
  if (len == 0) {
    return;
  }

  auto wp = out;
  auto it = data.begin();

  // Reads the next input byte as an unsigned 64-bit value. The widening must
  // happen before any shift: 'uint8_t(x) << 24' is computed as a signed int,
  // becomes negative for bytes >= 0x80 and would then set bits 32-63 when
  // OR-ed into the 64-bit accumulator, corrupting the leading characters.
  const auto nextByte = [&it]() -> uint64_t {
    return static_cast<uint64_t>(static_cast<uint8_t>(*it++));
  };

  constexpr int kCharsPerGroup = 8;

  // Full groups: 5 input bytes -> 8 output characters.
  for (; len > 4; len -= 5) {
    uint64_t curr = 0;
    for (int shift = 32; shift >= 0; shift -= 8) {
      curr |= nextByte() << shift;
    }
    for (int shift = 35; shift >= 0; shift -= 5) {
      *wp++ = charset[(curr >> shift) & 0x1f];
    }
  }

  if (len > 0) {
    // 1 to 4 input bytes are left. Place them at the top of the 40-bit group
    // exactly as above and emit only the characters that carry data.
    const int tailBytes = static_cast<int>(len);
    uint64_t curr = 0;
    for (int i = 0; i < tailBytes; ++i) {
      curr |= nextByte() << (32 - 8 * i);
    }

    // ceil(tailBytes * 8 / 5): 1 -> 2, 2 -> 4, 3 -> 5, 4 -> 7 characters.
    const int tailChars = (tailBytes * 8 + 4) / 5;
    for (int i = 0; i < tailChars; ++i) {
      *wp++ = charset[(curr >> (35 - 5 * i)) & 0x1f];
    }

    if (include_pad) {
      for (int i = tailChars; i < kCharsPerGroup; ++i) {
        *wp++ = kBasePad;
      }
    }
  }
}

/// Returns the number of bytes that decoding the 'size' characters at 'data'
/// produces. If the input ends with padding, 'size' is reduced by the number
/// of padding characters so that it covers only the data characters.
///
/// Every full group of 8 characters decodes to 5 bytes. A trailing partial
/// group of 2, 4, 5 or 7 data characters decodes to 1, 2, 3 or 4 bytes
/// respectively. Partial groups of 1, 3 or 6 data characters are rejected,
/// both for unpadded input and for what remains after removing the padding.
///
/// @param data Encoded input.
/// @param size In: number of characters at 'data'. Out: number of data
/// characters, excluding trailing padding.
/// @return Number of decoded bytes; 0 when 'size' is 0.
/// @throws BaseException if the input length is not valid for Base32.
size_t Base32::calculateDecodedSize(const char* data, size_t& size) {
  if (size == 0) {
    return 0;
  }

  // Number of trailing data characters that cannot form a valid partial group.
  const auto isInvalidTail = [](size_t tailChars) {
    return tailChars == 6 || tailChars == 3 || tailChars == 1;
  };

  // Decoded size for 'chars' data characters: 5 bytes per full group of 8
  // plus floor(tail * 5 / 8) bytes for the partial group.
  const auto decodedBytes = [](size_t chars) -> size_t {
    const size_t tailChars = chars % kEncodedBlockSize;
    return (chars / kEncodedBlockSize) * kBinaryBlockSize +
        (tailChars * kBinaryBlockSize) / kEncodedBlockSize;
  };

  if (!isPadded(data, size)) {
    if (isInvalidTail(size % kEncodedBlockSize)) {
      throw BaseException(
          "Base32::decode() - invalid input string: "
          "string length cannot be 6, 3 or 1 more than a multiple of 8.");
    }
    return decodedBytes(size);
  }

  // If the pad characters are included then the source string must be a
  // multiple of the encoded block size.
  if (size % kEncodedBlockSize != 0) {
    throw BaseException(
        "Base32::decode() - invalid input string: "
        "string length is not multiple of encoded block size.");
  }

  const size_t padding = Base::countPadding(data, size);
  const size_t dataChars = size - padding;

  // Apply the same length rule as for unpadded input to the characters that
  // remain once the padding is removed.
  if (isInvalidTail(dataChars % kEncodedBlockSize)) {
    throw BaseException(
        "Base32::decode() - invalid input string: "
        "invalid number of padding characters.");
  }

  size = dataChars;
  // Because the padded length is a multiple of 8, this equals
  // size * 5 / 8 - ceil(padding * 5 / 8) without any floating point math.
  return decodedBytes(dataChars);
}

/// Decodes 'src_len' Base32 characters from 'src' into 'dst' using the
/// standard reverse lookup table and returns the size reported by
/// decodeImpl().
size_t
Base32::decode(const char* src, size_t src_len, char* dst, size_t dst_len) {
  const size_t decodedSize =
      decodeImpl(src, src_len, dst, dst_len, kBase32ReverseIndexTable);
  return decodedSize;
}

/// Decodes 'src_len' Base32 characters from 'src' into 'dst'.
///
/// The decoded size is computed with calculateDecodedSize(), which also
/// removes trailing padding from the length. Each group of 8 characters is
/// turned into a 40-bit value (5 bits per character, first character in the
/// highest bits) and written out as 5 bytes. The last group may hold 2, 4, 5,
/// 7 or 8 data characters and yields 1, 2, 3, 4 or 5 bytes.
///
/// @param src Encoded input.
/// @param src_len Number of characters at 'src', including any padding.
/// @param dst Output buffer.
/// @param dst_len Capacity of 'dst' in bytes.
/// @param reverse_lookup Table mapping an input character to its 5-bit value.
/// @return Number of decoded bytes; 0 when 'src_len' is 0.
/// @throws BaseException if 'dst' is too small or the input length is not a
/// valid Base32 length. Errors raised by calculateDecodedSize() and
/// baseReverseLookup() propagate to the caller.
size_t Base32::decodeImpl(
    const char* src,
    size_t src_len,
    char* dst,
    size_t dst_len,
    const Base::ReverseIndex& reverse_lookup) {
  if (!src_len) {
    return 0;
  }

  // Note: this call shrinks 'src_len' to exclude trailing padding.
  auto needed = calculateDecodedSize(src, src_len);
  if (dst_len < needed) {
    throw BaseException(
        "Base32::decode() - invalid output string: "
        "output string is too small.");
  }

  // Returns the 5-bit value of an input character, widened to 64 bits so it
  // can be shifted into any position of the 40-bit group.
  const auto sextet = [&reverse_lookup](char c) -> uint64_t {
    return baseReverseLookup(kBase, c, reverse_lookup);
  };

  // Handle full groups of 8 characters, keeping the last group (which may be
  // partial) for the code below.
  for (; src_len > 8; src_len -= 8, src += 8, dst += 5) {
    uint64_t group = 0;
    for (int i = 0; i < 8; ++i) {
      group = (group << 5) | sextet(src[i]);
    }
    for (int i = 0; i < 5; ++i) {
      dst[i] = static_cast<char>((group >> (32 - 8 * i)) & 0xff);
    }
  }

  // The last group must hold 2, 4, 5, 7 or 8 data characters. Any other count
  // (possible with malformed padding) is rejected here instead of relying on
  // a debug-only check, so that no more than 'needed' bytes are ever written.
  const bool validTail = src_len == 2 || src_len == 4 || src_len == 5 ||
      src_len == 7 || src_len == 8;
  if (!validTail) {
    throw BaseException(
        "Base32::decode() - invalid input string: "
        "invalid number of characters in the last block.");
  }

  // Same layout as above: character i occupies bits [35 - 5 * i, 39 - 5 * i].
  uint64_t last = 0;
  for (size_t i = 0; i < src_len; ++i) {
    last |= sextet(src[i]) << (35 - 5 * i);
  }

  // floor(src_len * 5 / 8) complete bytes are available: 2 -> 1, 4 -> 2,
  // 5 -> 3, 7 -> 4 and 8 -> 5.
  const size_t tailBytes = (src_len * 5) / 8;
  for (size_t i = 0; i < tailBytes; ++i) {
    dst[i] = static_cast<char>((last >> (32 - 8 * i)) & 0xff);
  }

  return needed;
}

} // namespace facebook::velox::encoding
75 changes: 75 additions & 0 deletions velox/common/encode/Base32.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
/*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once

#include <exception>
#include <map>
#include <string>

#include <folly/Range.h>
#include "velox/common/encode/Base.h"

namespace facebook::velox::encoding {

/// Base32 encoder/decoder built on the shared helpers of 'Base'. All members
/// are static; the class carries no state.
class Base32 : public Base {
public:
/// Returns the encoded size, in characters, for an input of 'size' bytes.
/// Returns 0 for an empty input. When 'withPadding' is true the result counts
/// whole 8-character blocks; otherwise the padding characters are not counted.
static size_t calculateEncodedSize(size_t size, bool withPadding = true);

/// Encodes the specified number of characters from the 'data' and writes the
/// result to the 'output'. Padding is always enabled. The output must have
/// enough space, e.g. as returned by the calculateEncodedSize().
static void encode(const char* data, size_t size, char* output);
/// Returns decoded size for the specified input. Adjusts the 'size' to
/// subtract the length of the padding, if exists. Throws BaseException when
/// the input length is not valid.
static size_t calculateDecodedSize(const char* data, size_t& size);

/// Decodes the specified number of characters from the 'src' and writes the
/// result to the 'dst'. The destination must have enough space, e.g. as
/// returned by the calculateDecodedSize(); BaseException is thrown when
/// 'dst_len' is smaller than that. Returns the decoded size in bytes.
static size_t
decode(const char* src, size_t src_len, char* dst, size_t dst_len);

private:
/// Encodes 'data' into 'out' using 'charset' as the lookup table from 5-bit
/// value to output character. 'include_pad' controls whether a partial last
/// block is filled up with the padding character.
template <class T>
static void encodeImpl(
const T& data,
const Charset& charset,
bool include_pad,
char* out);

/// Decodes the specified number of base 32 encoded characters from the 'src'
/// and writes to 'dst', using 'table' to map input characters back to their
/// 5-bit values. Returns the decoded size in bytes.
static size_t decodeImpl(
const char* src,
size_t src_len,
char* dst,
size_t dst_len,
const ReverseIndex& table);

public:
// Numeric base of the encoding (32). It is passed to the reverse-lookup
// helpers; it is not the padding character.
constexpr static char kBase = 32;

private:
// Size of the binary block before encoding.
constexpr static int kBinaryBlockSize = 5;

// Size of the encoded block after encoding.
constexpr static int kEncodedBlockSize = 8;
};

} // namespace facebook::velox::encoding
4 changes: 2 additions & 2 deletions velox/common/encode/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,5 +12,5 @@
# See the License for the specific language governing permissions and
# limitations under the License.

add_library(velox_encode Base.cpp Base64.cpp)
target_link_libraries(velox_encode PUBLIC Folly::folly)
add_library(velox_encode Base.cpp Base32.cpp Base64.cpp)
target_link_libraries(velox_encode PUBLIC Folly::folly)
Loading

0 comments on commit f6f4ed3

Please sign in to comment.