-
Notifications
You must be signed in to change notification settings - Fork 1.2k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add presto functions to_base and from_base
- Loading branch information
1 parent
ebe26c3
commit f6f4ed3
Showing
7 changed files
with
465 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,296 @@ | ||
/* | ||
* Copyright (c) Facebook, Inc. and its affiliates. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
#include "velox/common/encode/Base32.h" | ||
|
||
#include <glog/logging.h> | ||
#include <stdint.h> | ||
|
||
namespace facebook::velox::encoding { | ||
|
||
/// Base32 alphabet used for encoding: position i holds the character emitted | ||
/// for the 5-bit value i ('A'-'Z' for 0-25, then '2'-'7' for 26-31). | ||
constexpr const Base32::Charset kBase32Charset = { | ||
'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', | ||
'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', | ||
'W', 'X', 'Y', 'Z', '2', '3', '4', '5', '6', '7'}; | ||
|
||
/// Decoding table indexed by character code (256 entries). Entries 50-55 | ||
/// ('2'-'7') hold 26-31 and entries 65-90 ('A'-'Z') hold 0-25, i.e. the | ||
/// position of that character in kBase32Charset. Every other entry, including | ||
/// the lower-case letters, is 255, which is not a valid 5-bit value. | ||
/// NOTE(review): presumably baseReverseLookup() rejects 255 as an invalid | ||
/// character - confirm in Base.h. | ||
constexpr const Base32::ReverseIndex kBase32ReverseIndexTable = { | ||
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, | ||
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, | ||
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, | ||
255, 255, 255, 255, 255, 26, 27, 28, 29, 30, 31, 255, 255, 255, 255, | ||
255, 255, 255, 255, 255, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, | ||
10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, | ||
25, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, | ||
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, | ||
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, | ||
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, | ||
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, | ||
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, | ||
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, | ||
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, | ||
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, | ||
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, | ||
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, | ||
255}; | ||
|
||
/// Compile-time check that each of the 32 entries of kBase32Charset maps back | ||
/// to its own position through kBase32ReverseIndexTable. The first argument | ||
/// is the highest charset index to check. | ||
/// NOTE(review): 'sizeof(kBase32Charset) / 2 - 1' presumably evaluates to 31 | ||
/// because Base32::Charset is sized for 64 characters - confirm in Base.h. | ||
static_assert( | ||
checkForwardIndex( | ||
sizeof(kBase32Charset) / 2 - 1, | ||
kBase32Charset, | ||
kBase32ReverseIndexTable), | ||
"kBase32Charset has incorrect entries"); | ||
|
||
/// Compile-time check of the opposite direction: every entry of | ||
/// kBase32ReverseIndexTable is verified against kBase32Charset for base | ||
/// Base32::kBase. The first argument is the highest table index to check. | ||
static_assert( | ||
checkReverseIndex( | ||
sizeof(kBase32ReverseIndexTable) - 1, | ||
kBase32Charset, | ||
Base32::kBase, | ||
kBase32ReverseIndexTable), | ||
"kBase32ReverseIndexTable has incorrect entries."); | ||
|
||
// static | ||
size_t Base32::calculateEncodedSize(size_t size, bool withPadding) { | ||
if (size == 0) { | ||
return 0; | ||
} | ||
|
||
// Calculate the output size assuming that we are including padding. | ||
size_t encodedSize = ((size + 4) / 5) * 8; | ||
if (!withPadding) { | ||
// If the padding was not requested, subtract the padding bytes. | ||
encodedSize -= (5 - (size % 5)) % 5; | ||
} | ||
return encodedSize; | ||
} | ||
|
||
// static | ||
void Base32::encode(const char* data, size_t len, char* output) { | ||
encodeImpl(folly::StringPiece(data, len), kBase32Charset, true, output); | ||
} | ||
|
||
template <class T> | ||
/* static */ void Base32::encodeImpl( | ||
const T& data, | ||
const Charset& charset, | ||
bool include_pad, | ||
char* out) { | ||
auto len = data.size(); | ||
if (len == 0) { | ||
return; | ||
} | ||
|
||
auto wp = out; | ||
auto it = data.begin(); | ||
|
||
/// For each group of 5 bytes (40 bits) in the input, split that into | ||
/// 8 groups of 5 bits and encode that using the supplied charset lookup | ||
for (; len > 4; len -= 5) { | ||
uint64_t curr = uint64_t(*it++) << 32; | ||
curr |= uint8_t(*it++) << 24; | ||
curr |= uint8_t(*it++) << 16; | ||
curr |= uint8_t(*it++) << 8; | ||
curr |= uint8_t(*it++); | ||
|
||
*wp++ = charset[(curr >> 35) & 0x1f]; | ||
*wp++ = charset[(curr >> 30) & 0x1f]; | ||
*wp++ = charset[(curr >> 25) & 0x1f]; | ||
*wp++ = charset[(curr >> 20) & 0x1f]; | ||
*wp++ = charset[(curr >> 15) & 0x1f]; | ||
*wp++ = charset[(curr >> 10) & 0x1f]; | ||
*wp++ = charset[(curr >> 5) & 0x1f]; | ||
*wp++ = charset[curr & 0x1f]; | ||
} | ||
|
||
if (len > 0) { | ||
/// We have either 1 to 4 input bytes left. Encode this similar to the | ||
/// above (assuming 0 for all other bytes). Optionally append the '=' | ||
/// character if it is requested. | ||
uint64_t curr = uint64_t(*it++) << 32; | ||
*wp++ = charset[(curr >> 35) & 0x1f]; | ||
if (len > 3) { | ||
curr |= uint8_t(*it++) << 24; | ||
curr |= uint8_t(*it++) << 16; | ||
curr |= uint8_t(*it) << 8; | ||
|
||
*wp++ = charset[(curr >> 30) & 0x1f]; | ||
*wp++ = charset[(curr >> 25) & 0x1f]; | ||
*wp++ = charset[(curr >> 20) & 0x1f]; | ||
*wp++ = charset[(curr >> 15) & 0x1f]; | ||
*wp++ = charset[(curr >> 10) & 0x1f]; | ||
*wp++ = charset[(curr >> 5) & 0x1f]; | ||
|
||
if (include_pad) { | ||
*wp = kBasePad; | ||
} | ||
} else if (len > 2) { | ||
curr |= uint8_t(*it++) << 24; | ||
curr |= uint8_t(*it++) << 16; | ||
|
||
*wp++ = charset[(curr >> 30) & 0x1f]; | ||
*wp++ = charset[(curr >> 25) & 0x1f]; | ||
*wp++ = charset[(curr >> 20) & 0x1f]; | ||
*wp++ = charset[(curr >> 15) & 0x1f]; | ||
|
||
if (include_pad) { | ||
*wp++ = kBasePad; | ||
*wp++ = kBasePad; | ||
*wp = kBasePad; | ||
} | ||
} else if (len > 1) { | ||
curr |= uint8_t(*it) << 24; | ||
|
||
*wp++ = charset[(curr >> 30) & 0x1f]; | ||
*wp++ = charset[(curr >> 25) & 0x1f]; | ||
*wp++ = charset[(curr >> 20) & 0x1f]; | ||
|
||
if (include_pad) { | ||
*wp++ = kBasePad; | ||
*wp++ = kBasePad; | ||
*wp++ = kBasePad; | ||
*wp = kBasePad; | ||
} | ||
} else { | ||
*wp++ = charset[(curr >> 30) & 0x1f]; | ||
|
||
if (include_pad) { | ||
*wp++ = kBasePad; | ||
*wp++ = kBasePad; | ||
*wp++ = kBasePad; | ||
*wp++ = kBasePad; | ||
*wp++ = kBasePad; | ||
*wp = kBasePad; | ||
} | ||
} | ||
} | ||
} | ||
|
||
size_t Base32::calculateDecodedSize(const char* data, size_t& size) { | ||
if (size == 0) { | ||
return 0; | ||
} | ||
|
||
// If padding doesn't exist, add count for the extra bytes | ||
if (!isPadded(data, size)) { | ||
/// If padding doesn't exist we need to calculate it from the size - if the | ||
/// size % 8 is 0 then we have an even multiple 5 byte chunks in the result | ||
/// if it is 7 then we need 1 more byte in the output. If it is 5 then we | ||
/// need 3 more bytes in the output. Likewise 4 and 2. But, it should never | ||
/// be 6 or 3 or 1. | ||
auto extra = size % kEncodedBlockSize; | ||
auto needed = (size / kEncodedBlockSize) * kBinaryBlockSize; | ||
if (extra) { | ||
if ((extra == 6) || (extra == 3) || (extra == 1)) { | ||
throw BaseException( | ||
"Base32::decode() - invalid input string: " | ||
"string length cannot be 6, 3 or 1 more than a multiple of 8."); | ||
} | ||
needed += (extra * kBinaryBlockSize) / kEncodedBlockSize; | ||
} | ||
return needed; | ||
} | ||
|
||
/// If the pad characters are included then the source string must be a | ||
/// multiple of encoded block size and we can query the end of the string | ||
/// to see how much padding exists. | ||
if (size % kEncodedBlockSize != 0) { | ||
throw BaseException( | ||
"Base32::decode() - invalid input string: " | ||
"string length is not multiple of encoded block size."); | ||
} | ||
|
||
auto needed = (size * kBinaryBlockSize) / kEncodedBlockSize; | ||
auto padding = Base::countPadding(data, size); | ||
size -= padding; | ||
return needed - | ||
ceil((padding * kBinaryBlockSize) / double(kEncodedBlockSize)); | ||
} | ||
|
||
size_t | ||
Base32::decode(const char* src, size_t src_len, char* dst, size_t dst_len) { | ||
return decodeImpl(src, src_len, dst, dst_len, kBase32ReverseIndexTable); | ||
} | ||
|
||
size_t Base32::decodeImpl( | ||
const char* src, | ||
size_t src_len, | ||
char* dst, | ||
size_t dst_len, | ||
const Base::ReverseIndex& reverse_lookup) { | ||
if (!src_len) { | ||
return 0; | ||
} | ||
|
||
auto needed = calculateDecodedSize(src, src_len); | ||
if (dst_len < needed) { | ||
throw BaseException( | ||
"Base32::decode() - invalid output string: " | ||
"output string is too small."); | ||
} | ||
|
||
// Handle full groups of 8 characters | ||
for (; src_len > 8; src_len -= 8, src += 8, dst += 5) { | ||
/// Each character of the 8 bytes encode 5 bits of the original, grab each | ||
/// with the appropriate shifts to rebuild the original and then split that | ||
/// back into the original 8 bit bytes. | ||
uint64_t last = | ||
(uint64_t(baseReverseLookup(kBase, src[0], reverse_lookup)) << 35) | | ||
(uint64_t(baseReverseLookup(kBase, src[1], reverse_lookup)) << 30) | | ||
(baseReverseLookup(kBase, src[2], reverse_lookup) << 25) | | ||
(baseReverseLookup(kBase, src[3], reverse_lookup) << 20) | | ||
(baseReverseLookup(kBase, src[4], reverse_lookup) << 15) | | ||
(baseReverseLookup(kBase, src[5], reverse_lookup) << 10) | | ||
(baseReverseLookup(kBase, src[6], reverse_lookup) << 5) | | ||
baseReverseLookup(kBase, src[7], reverse_lookup); | ||
dst[0] = (last >> 32) & 0xff; | ||
dst[1] = (last >> 24) & 0xff; | ||
dst[2] = (last >> 16) & 0xff; | ||
dst[3] = (last >> 8) & 0xff; | ||
dst[4] = last & 0xff; | ||
} | ||
|
||
/// Handle the last 2, 4, 5, 7 or 8 characters. This is similar to the above, | ||
/// but the last characters may or may not exist. | ||
DCHECK(src_len >= 2); | ||
uint64_t last = | ||
(uint64_t(baseReverseLookup(kBase, src[0], reverse_lookup)) << 35) | | ||
(uint64_t(baseReverseLookup(kBase, src[1], reverse_lookup)) << 30); | ||
dst[0] = (last >> 32) & 0xff; | ||
if (src_len > 2) { | ||
last |= baseReverseLookup(kBase, src[2], reverse_lookup) << 25; | ||
last |= baseReverseLookup(kBase, src[3], reverse_lookup) << 20; | ||
dst[1] = (last >> 24) & 0xff; | ||
if (src_len > 4) { | ||
last |= baseReverseLookup(kBase, src[4], reverse_lookup) << 15; | ||
dst[2] = (last >> 16) & 0xff; | ||
if (src_len > 5) { | ||
last |= baseReverseLookup(kBase, src[5], reverse_lookup) << 10; | ||
last |= baseReverseLookup(kBase, src[6], reverse_lookup) << 5; | ||
dst[3] = (last >> 8) & 0xff; | ||
if (src_len > 7) { | ||
last |= baseReverseLookup(kBase, src[7], reverse_lookup); | ||
dst[4] = last & 0xff; | ||
} | ||
} | ||
} | ||
} | ||
|
||
return needed; | ||
} | ||
|
||
} // namespace facebook::velox::encoding |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,75 @@ | ||
/* | ||
* Copyright (c) Facebook, Inc. and its affiliates. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
#pragma once | ||
|
||
#include <exception> | ||
#include <map> | ||
#include <string> | ||
|
||
#include <folly/Range.h> | ||
#include "velox/common/encode/Base.h" | ||
|
||
namespace facebook::velox::encoding { | ||
|
||
/// Static helpers for Base32 encoding and decoding: 5 binary bytes map to 8 | ||
/// encoded characters. Shared types and helpers come from Base. | ||
class Base32 : public Base { | ||
public: | ||
/// Returns the encoded size, in characters, for an input of 'size' bytes. | ||
/// Returns 0 for empty input. 'withPadding' selects whether trailing padding | ||
/// characters are included in the count. | ||
static size_t calculateEncodedSize(size_t size, bool withPadding = true); | ||
|
||
/// Encodes the specified number of characters from the 'data' and writes the | ||
/// result to the 'output'. The output must have enough space, e.g. as | ||
/// returned by the calculateEncodedSize(). Padding is always written. | ||
static void encode(const char* data, size_t size, char* output); | ||
/// Returns decoded size for the specified input. Adjusts the 'size' to | ||
/// subtract the length of the padding, if exists. Throws BaseException when | ||
/// the input length is not valid for Base32. | ||
static size_t calculateDecodedSize(const char* data, size_t& size); | ||
|
||
/// Decodes the specified number of characters from the 'src' and writes the | ||
/// result to the 'dst'. The destination must have enough space, e.g. as | ||
/// returned by the calculateDecodedSize(); otherwise BaseException is thrown. | ||
/// Returns the number of decoded bytes. | ||
static size_t | ||
decode(const char* src, size_t src_len, char* dst, size_t dst_len); | ||
|
||
private: | ||
/// Encodes 'data' (any container with size() and begin()) into 'out' using | ||
/// 'charset'. Pads the last block with the padding character when | ||
/// 'include_pad' is true. | ||
template <class T> | ||
static void encodeImpl( | ||
const T& data, | ||
const Charset& charset, | ||
bool include_pad, | ||
char* out); | ||
|
||
/// Decodes the specified number of base 32 encoded characters from the 'src' | ||
/// and writes to 'dst'. 'table' maps a character code to its 5-bit value. | ||
static size_t decodeImpl( | ||
const char* src, | ||
size_t src_len, | ||
char* dst, | ||
size_t dst_len, | ||
const ReverseIndex& table); | ||
|
||
public: | ||
// Radix of the encoding (number of symbols in the alphabet). This is not the | ||
// padding character. | ||
constexpr static char kBase = 32; | ||
|
||
private: | ||
// Size of the binary block before encoding. | ||
constexpr static int kBinaryBlockSize = 5; | ||
|
||
// Size of the encoded block after encoding. | ||
constexpr static int kEncodedBlockSize = 8; | ||
}; | ||
|
||
} // namespace facebook::velox::encoding |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.