Add presto functions to_base and from_base

facebookincubator · Dec 8, 2023 · f6f4ed3 · f6f4ed3
1 parent ebe26c3
commit f6f4ed3
Show file tree

Hide file tree

Showing 7 changed files with 465 additions and 2 deletions.
diff --git a/velox/common/encode/Base32.cpp b/velox/common/encode/Base32.cpp
@@ -0,0 +1,296 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "velox/common/encode/Base32.h"
+
+#include <glog/logging.h>
+#include <stdint.h>
+
+namespace facebook::velox::encoding {
+
+constexpr const Base32::Charset kBase32Charset = { // Symbol for each 5-bit value.
+    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', // values 0..10
+    'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', // values 11..21
+    'W', 'X', 'Y', 'Z', '2', '3', '4', '5', '6', '7'}; // values 22..31
+
+constexpr const Base32::ReverseIndex kBase32ReverseIndexTable = { // Indexed by byte value 0..255.
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 255 fills every slot that is not a charset symbol.
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 26,  27,  28,  29,  30,  31,  255, 255, 255, 255, // '2'..'7' (bytes 50..55) -> 26..31
+    255, 255, 255, 255, 255, 0,   1,   2,   3,   4,   5,   6,   7,   8,   9, // 'A' (byte 65) -> 0 ...
+    10,  11,  12,  13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,
+    25,  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // ... 'Z' (byte 90) -> 25; lowercase letters are not mapped.
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255}; // 256 entries in total.
+
+/// Compile-time check that each of the 32 symbols in kBase32Charset maps back
+/// to its own index through kBase32ReverseIndexTable.
+static_assert(
+    checkForwardIndex(
+        sizeof(kBase32Charset) / 2 - 1, // presumably 31, i.e. Charset holds 64 chars - TODO confirm
+        kBase32Charset,
+        kBase32ReverseIndexTable),
+    "kBase32Charset has incorrect entries");
+
+/// Compile-time check that every entry in kBase32ReverseIndexTable agrees with
+/// the symbol stored at that position in kBase32Charset.
+static_assert(
+    checkReverseIndex(
+        sizeof(kBase32ReverseIndexTable) - 1, // presumably the last table index (255) - TODO confirm
+        kBase32Charset,
+        Base32::kBase,
+        kBase32ReverseIndexTable),
+    "kBase32ReverseIndexTable has incorrect entries.");
+
+/// Returns the number of characters produced by encoding 'size' input bytes.
+///
+/// With padding the output always consists of whole 8-character blocks, one
+/// per started group of 5 input bytes. Without padding a trailing group of
+/// 1, 2, 3 or 4 bytes contributes only 2, 4, 5 or 7 characters respectively,
+/// i.e. ceil(bits / 5).
+// static
+size_t Base32::calculateEncodedSize(size_t size, bool withPadding) {
+  if (size == 0) {
+    return 0;
+  }
+
+  if (withPadding) {
+    return ((size + 4) / 5) * 8;
+  }
+
+  // Full 5-byte groups map to 8 characters each; the remaining 0..4 bytes need
+  // ceil(remaining * 8 / 5) characters. Splitting the computation this way
+  // also avoids multiplying the whole 'size' by 8.
+  const size_t fullGroups = size / 5;
+  const size_t tailBytes = size % 5;
+  return fullGroups * 8 + (tailBytes * 8 + 4) / 5;
+}
+
+/// Encodes 'len' bytes starting at 'data' with the standard Base32 charset and
+/// writes the padded result to 'output'.
+// static
+void Base32::encode(const char* data, size_t len, char* output) {
+  const folly::StringPiece input(data, len);
+  constexpr bool kIncludePadding = true;
+  encodeImpl(input, kBase32Charset, kIncludePadding, output);
+}
+
+/// Encodes the bytes of 'data' into 'out' using 'charset'.
+///
+/// Every group of 5 input bytes (40 bits) becomes 8 output characters of 5
+/// bits each. A trailing group of 1, 2, 3 or 4 bytes becomes 2, 4, 5 or 7
+/// characters; when 'include_pad' is true the group is then filled up to 8
+/// characters with kBasePad. Nothing is written for empty input.
+///
+/// 'out' must have room for all produced characters: 8 per started group when
+/// padding is requested.
+template <class T>
+/* static */ void Base32::encodeImpl(
+    const T& data,
+    const Charset& charset,
+    bool include_pad,
+    char* out) {
+  auto len = data.size();
+  if (len == 0) {
+    return;
+  }
+
+  auto wp = out;
+  auto it = data.begin();
+
+  // Widen every input byte to an unsigned 64-bit value before shifting.
+  // Shifting a promoted 'int' left by 24 yields a negative value for bytes
+  // >= 0x80, and OR-ing that into a uint64_t sign-extends it over bits 32..63,
+  // which corrupts the leading symbols of the group.
+  const auto nextByte = [&it]() -> uint64_t {
+    return static_cast<uint8_t>(*it++);
+  };
+
+  /// For each group of 5 bytes (40 bits) in the input, split that into
+  /// 8 groups of 5 bits and encode that using the supplied charset lookup.
+  for (; len > 4; len -= 5) {
+    uint64_t curr = nextByte() << 32;
+    curr |= nextByte() << 24;
+    curr |= nextByte() << 16;
+    curr |= nextByte() << 8;
+    curr |= nextByte();
+
+    for (int shift = 35; shift >= 0; shift -= 5) {
+      *wp++ = charset[(curr >> shift) & 0x1f];
+    }
+  }
+
+  if (len > 0) {
+    /// 1 to 4 input bytes are left. Pack them into the top of a 40-bit group
+    /// (missing bytes count as 0) and emit only the symbols that carry data:
+    /// ceil(bits / 5), i.e. 2, 4, 5 or 7 characters.
+    const size_t tailBytes = len;
+    const size_t tailChars = (tailBytes * 8 + 4) / 5;
+
+    uint64_t curr = 0;
+    for (size_t i = 0; i < tailBytes; ++i) {
+      curr |= nextByte() << (32 - 8 * i);
+    }
+
+    for (size_t i = 0; i < tailChars; ++i) {
+      *wp++ = charset[(curr >> (35 - 5 * i)) & 0x1f];
+    }
+
+    // Optionally complete the 8-character block with pad characters.
+    if (include_pad) {
+      for (size_t i = tailChars; i < 8; ++i) {
+        *wp++ = kBasePad;
+      }
+    }
+  }
+}
+
+/// Returns the number of bytes that decoding 'size' characters of 'data' will
+/// produce. When the input ends with padding, 'size' is reduced by the number
+/// of pad characters so that it covers only the data-carrying characters.
+///
+/// Throws BaseException when the length cannot belong to a Base32 string:
+/// a padded input that is not a multiple of the encoded block size, a padding
+/// run that is not 1, 3, 4 or 6 characters long, or an unpadded input whose
+/// length is 1, 3 or 6 more than a multiple of 8.
+size_t Base32::calculateDecodedSize(const char* data, size_t& size) {
+  if (size == 0) {
+    return 0;
+  }
+
+  if (isPadded(data, size)) {
+    /// If the pad characters are included then the source string must be a
+    /// multiple of encoded block size and we can query the end of the string
+    /// to see how much padding exists.
+    if (size % kEncodedBlockSize != 0) {
+      throw BaseException(
+          "Base32::decode() - invalid input string: "
+          "string length is not multiple of encoded block size.");
+    }
+
+    // A final group of 4, 3, 2 or 1 bytes is followed by exactly 1, 3, 4 or 6
+    // pad characters. Any other count leaves a data tail of 1, 3 or 6
+    // characters (or none at all), which no input can produce.
+    const auto padding = Base::countPadding(data, size);
+    if (padding != 1 && padding != 3 && padding != 4 && padding != 6) {
+      throw BaseException(
+          "Base32::decode() - invalid input string: "
+          "padding length must be 1, 3, 4 or 6.");
+    }
+
+    size -= padding;
+    // Each remaining character carries 5 bits; only whole bytes count.
+    return (size * kBinaryBlockSize) / kEncodedBlockSize;
+  }
+
+  /// Without padding the tail length tells how many extra bytes follow the
+  /// full blocks: 2 characters -> 1 byte, 4 -> 2 bytes, 5 -> 3 bytes and
+  /// 7 -> 4 bytes. A tail of 1, 3 or 6 characters can never occur.
+  const auto extra = size % kEncodedBlockSize;
+  if ((extra == 6) || (extra == 3) || (extra == 1)) {
+    throw BaseException(
+        "Base32::decode() - invalid input string: "
+        "string length cannot be 6, 3 or 1 more than a multiple of 8.");
+  }
+
+  return (size / kEncodedBlockSize) * kBinaryBlockSize +
+      (extra * kBinaryBlockSize) / kEncodedBlockSize;
+}
+
+/// Decodes 'src_len' characters of 'src' into 'dst' (capacity 'dst_len') using
+/// the standard Base32 reverse index and returns the decoded size.
+size_t
+Base32::decode(const char* src, size_t src_len, char* dst, size_t dst_len) {
+  const auto& reverseIndex = kBase32ReverseIndexTable;
+  const size_t decodedSize =
+      decodeImpl(src, src_len, dst, dst_len, reverseIndex);
+  return decodedSize;
+}
+
+/// Decodes 'src_len' Base32 characters from 'src' into 'dst' and returns the
+/// number of bytes written. Trailing padding, if present, is ignored.
+///
+/// Throws BaseException when 'dst_len' is smaller than the decoded size or
+/// when fewer than two data characters remain once padding is removed.
+/// calculateDecodedSize() may also throw for invalid lengths.
+size_t Base32::decodeImpl(
+    const char* src,
+    size_t src_len,
+    char* dst,
+    size_t dst_len,
+    const Base::ReverseIndex& reverse_lookup) {
+  if (!src_len) {
+    return 0;
+  }
+
+  // Also strips the padding from 'src_len'.
+  auto needed = calculateDecodedSize(src, src_len);
+  if (dst_len < needed) {
+    throw BaseException(
+        "Base32::decode() - invalid output string: "
+        "output string is too small.");
+  }
+
+  // The final group is decoded from at least two characters. The length
+  // comes from the caller's input (e.g. a string made only of padding), so
+  // report it as an error instead of asserting.
+  if (src_len < 2) {
+    throw BaseException(
+        "Base32::decode() - invalid input string: "
+        "at least two encoded characters are required.");
+  }
+
+  // 5-bit value of the i-th character of the current group, widened so that
+  // every shift below is carried out on 64 bits.
+  const auto value = [&](size_t i) -> uint64_t {
+    return baseReverseLookup(kBase, src[i], reverse_lookup);
+  };
+
+  // Handle full groups of 8 characters; the last group (2 to 8 characters) is
+  // always left for the code below.
+  for (; src_len > 8; src_len -= 8, src += 8, dst += 5) {
+    /// Each character of the 8 bytes encode 5 bits of the original, grab each
+    /// with the appropriate shifts to rebuild the original and then split that
+    /// back into the original 8 bit bytes.
+    const uint64_t group = (value(0) << 35) | (value(1) << 30) |
+        (value(2) << 25) | (value(3) << 20) | (value(4) << 15) |
+        (value(5) << 10) | (value(6) << 5) | value(7);
+    dst[0] = (group >> 32) & 0xff;
+    dst[1] = (group >> 24) & 0xff;
+    dst[2] = (group >> 16) & 0xff;
+    dst[3] = (group >> 8) & 0xff;
+    dst[4] = group & 0xff;
+  }
+
+  /// Handle the last 2, 4, 5, 7 or 8 characters.  This is similar to the above,
+  /// but the last characters may or may not exist.
+  uint64_t last = (value(0) << 35) | (value(1) << 30);
+  dst[0] = (last >> 32) & 0xff;
+  if (src_len > 2) {
+    last |= value(2) << 25;
+    last |= value(3) << 20;
+    dst[1] = (last >> 24) & 0xff;
+    if (src_len > 4) {
+      last |= value(4) << 15;
+      dst[2] = (last >> 16) & 0xff;
+      if (src_len > 5) {
+        last |= value(5) << 10;
+        last |= value(6) << 5;
+        dst[3] = (last >> 8) & 0xff;
+        if (src_len > 7) {
+          last |= value(7);
+          dst[4] = last & 0xff;
+        }
+      }
+    }
+  }
+
+  return needed;
+}
+
+} // namespace facebook::velox::encoding
diff --git a/velox/common/encode/Base32.h b/velox/common/encode/Base32.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <exception>
+#include <map>
+#include <string>
+
+#include <folly/Range.h>
+#include "velox/common/encode/Base.h"
+
+namespace facebook::velox::encoding {
+
+class Base32 : public Base { // Static Base32 encode/decode helpers.
+ public:
+  /// Returns the encoded size for 'size' input bytes, with or without padding.
+  static size_t calculateEncodedSize(size_t size, bool withPadding = true);
+
+  /// Encodes 'size' bytes from 'data' and writes the padded result to
+  /// 'output'. The output must have enough space, e.g. as returned by
+  /// calculateEncodedSize().
+  static void encode(const char* data, size_t size, char* output);
+  /// Returns the decoded size for the specified input and shrinks 'size' by the
+  /// length of any trailing padding. Throws BaseException on an invalid length.
+  static size_t calculateDecodedSize(const char* data, size_t& size);
+
+  /// Decodes 'src_len' characters from 'src' into 'dst' and returns the number
+  /// of decoded bytes. 'dst_len' must be at least the value returned by
+  /// calculateDecodedSize(); otherwise BaseException is thrown.
+  static size_t
+  decode(const char* src, size_t src_len, char* dst, size_t dst_len);
+
+ private:
+  template <class T>
+  static void encodeImpl(
+      const T& data,
+      const Charset& charset,
+      bool include_pad,
+      char* out);
+
+  /// Decodes 'src_len' Base32 characters from 'src' into 'dst' using the
+  /// reverse lookup 'table'; returns the number of decoded bytes.
+  static size_t decodeImpl(
+      const char* src,
+      size_t src_len,
+      char* dst,
+      size_t dst_len,
+      const ReverseIndex& table);
+
+ public:
+  // Base (radix) of the encoding: the number of symbols in the alphabet.
+  constexpr static char kBase = 32;
+
+ private:
+  // Size of the binary block before encoding.
+  constexpr static int kBinaryBlockSize = 5;
+
+  // Size of the encoded block after encoding.
+  constexpr static int kEncodedBlockSize = 8;
+};
+
+} // namespace facebook::velox::encoding
diff --git a/velox/common/encode/CMakeLists.txt b/velox/common/encode/CMakeLists.txt
@@ -12,5 +12,5 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-add_library(velox_encode Base.cpp Base64.cpp)
-target_link_libraries(velox_encode PUBLIC Folly::folly)
+add_library(velox_encode Base.cpp Base32.cpp Base64.cpp)
+target_link_libraries(velox_encode PUBLIC Folly::folly)