From 85f031c38bbcf65dbd35d2ddac3fa806d6caa984 Mon Sep 17 00:00:00 2001
From: Krishna Pai <kpai@fb.com>
Date: Sun, 17 Nov 2024 12:32:49 -0800
Subject: [PATCH] Add support for canonicalization of JSON. (#11284)

Summary:
This is preliminary PR that adds support for canonicalization of JSON strings. This initial PR only tackles canonicalization of json_parse. Another diff will handle CAST( _ as JSON) . Canonicalization is required since currently Velox just treats JSON as varchars thus equivalent JSON but having different backing varchar's are treated as separate JSON's which is incorrect and contrary to behavior shown by Presto.


Reviewed By: Yuhta, gggrace14

Differential Revision: D65084925

Pulled By: kgpai
---
 velox/functions/prestosql/JsonFunctions.cpp   | 212 ++++++++++++++++--
 .../prestosql/json/JsonStringUtil.cpp         | 179 +++++++++++++++
 .../functions/prestosql/json/JsonStringUtil.h |  20 ++
 .../prestosql/tests/JsonFunctionsTest.cpp     | 129 ++++++++++-
 velox/type/Conversions.cpp                    |   8 +-
 velox/type/Conversions.h                      |   2 +-
 6 files changed, 526 insertions(+), 24 deletions(-)
diff --git a/velox/functions/prestosql/JsonFunctions.cpp b/velox/functions/prestosql/JsonFunctions.cpp
index 8628a669d828..01ea6a627fe1 100644
--- a/velox/functions/prestosql/JsonFunctions.cpp
+++ b/velox/functions/prestosql/JsonFunctions.cpp
@@ -14,11 +14,95 @@
  * limitations under the License.
  */
 #include "velox/expression/VectorFunction.h"
+#include "velox/functions/prestosql/json/JsonStringUtil.h"
 #include "velox/functions/prestosql/json/SIMDJsonUtil.h"
 #include "velox/functions/prestosql/types/JsonType.h"
 
 namespace facebook::velox::functions {
 
+namespace {
+const std::string_view kArrayStart = "[";
+const std::string_view kArrayEnd = "]";
+const std::string_view kSeparator = ",";
+const std::string_view kObjectStart = "{";
+const std::string_view kObjectEnd = "}";
+const std::string_view kObjectKeySeparator = ":";
+
+using JsonViews = std::vector<std::string_view>;
+
+inline void addOrMergeViews(JsonViews& jsonViews, std::string_view view) {
+  if (jsonViews.empty()) {
+    jsonViews.push_back(view);
+    return;
+  }
+
+  auto& lastView = jsonViews.back();
+
+  if (lastView.data() + lastView.size() == view.data()) {
+    lastView = std::string_view(lastView.data(), lastView.size() + view.size());
+  } else {
+    jsonViews.push_back(view);
+  }
+}
+
+/// Class to keep track of json strings being written
+/// in to a buffer. The size of the backing buffer must be known during
+/// construction time.
+class BufferTracker {
+ public:
+  explicit BufferTracker(BufferPtr buffer) : curPos_(0), currentViewStart_(0) {
+    bufPtr_ = buffer->asMutable<char>();
+    capacity = buffer->capacity();
+  }
+
+  /// Write out all the views to the buffer.
+  auto getCanonicalString(JsonViews& jsonViews) {
+    for (auto view : jsonViews) {
+      trimEscapeWriteToBuffer(view);
+    }
+    return getStringView();
+  }
+
+  /// Sets current view to the end of the previous string.
+  /// Should be called only after getCanonicalString ,
+  /// as after this call the previous view is lost.
+  void startNewString() {
+    currentViewStart_ += curPos_;
+    curPos_ = 0;
+  }
+
+ private:
+  /// Trims whitespace and escapes utf characters before writing to buffer.
+  void trimEscapeWriteToBuffer(std::string_view input) {
+    auto trimmed = velox::util::trimWhiteSpace(input.data(), input.size());
+    auto curBufPtr = getCurrentBufferPtr();
+    auto bytesWritten =
+        prestoJavaEscapeString(trimmed.data(), trimmed.size(), curBufPtr);
+    incrementCounter(bytesWritten);
+  }
+
+  /// Returns current string view against the buffer.
+  std::string_view getStringView() {
+    return std::string_view(bufPtr_ + currentViewStart_, curPos_);
+  }
+
+  inline char* getCurrentBufferPtr() {
+    return bufPtr_ + currentViewStart_ + curPos_;
+  }
+
+  void incrementCounter(size_t increment) {
+    VELOX_DCHECK_LE(curPos_ + currentViewStart_ + increment, capacity);
+    curPos_ += increment;
+  }
+
+  size_t capacity;
+  size_t curPos_;
+  size_t currentViewStart_;
+  char* bufPtr_;
+};
+
+} // namespace
+
 namespace {
 class JsonFormatFunction : public exec::VectorFunction {
  public:
@@ -84,38 +168,75 @@ class JsonParseFunction : public exec::VectorFunction {
       auto value = arg->as<ConstantVector<StringView>>()->valueAt(0);
       paddedInput_.resize(value.size() + simdjson::SIMDJSON_PADDING);
       memcpy(paddedInput_.data(), value.data(), value.size());
-      if (auto error = parse(value.size())) {
+      auto escapeSize = escapedStringSize(value.data(), value.size());
+      auto buffer = AlignedBuffer::allocate<char>(escapeSize, context.pool());
+      BufferTracker bufferTracker{buffer};
+
+      JsonViews jsonViews;
+
+      if (auto error = parse(value.size(), jsonViews)) {
         context.setErrors(rows, errors_[error]);
         return;
       }
-      localResult = std::make_shared<ConstantVector<StringView>>(
-          context.pool(), rows.end(), false, JSON(), std::move(value));
+
+      BufferPtr stringViews =
+          AlignedBuffer::allocate<StringView>(1, context.pool(), StringView());
+      auto rawStringViews = stringViews->asMutable<StringView>();
+      rawStringViews[0] =
+          StringView(bufferTracker.getCanonicalString(jsonViews));
+
+      auto constantBase = std::make_shared<FlatVector<StringView>>(
+          context.pool(),
+          JSON(),
+          nullptr,
+          1,
+          stringViews,
+          std::vector<BufferPtr>{buffer});
+
+      localResult = BaseVector::wrapInConstant(rows.end(), 0, constantBase);
+
     } else {
       auto flatInput = arg->asFlatVector<StringView>();
+      BufferPtr stringViews = AlignedBuffer::allocate<StringView>(
+          rows.end(), context.pool(), StringView());
+      auto rawStringViews = stringViews->asMutable<StringView>();
 
-      auto stringBuffers = flatInput->stringBuffers();
       VELOX_CHECK_LE(rows.end(), flatInput->size());
 
       size_t maxSize = 0;
+      size_t totalOutputSize = 0;
       rows.applyToSelected([&](auto row) {
         auto value = flatInput->valueAt(row);
         maxSize = std::max(maxSize, value.size());
+        totalOutputSize += escapedStringSize(value.data(), value.size());
       });
+
       paddedInput_.resize(maxSize + simdjson::SIMDJSON_PADDING);
+      BufferPtr buffer =
+          AlignedBuffer::allocate<char>(totalOutputSize, context.pool());
+      BufferTracker bufferTracker{buffer};
+
       rows.applyToSelected([&](auto row) {
+        JsonViews jsonViews;
         auto value = flatInput->valueAt(row);
         memcpy(paddedInput_.data(), value.data(), value.size());
-        if (auto error = parse(value.size())) {
+        if (auto error = parse(value.size(), jsonViews)) {
           context.setVeloxExceptionError(row, errors_[error]);
+        } else {
+          auto canonicalString = bufferTracker.getCanonicalString(jsonViews);
+
+          rawStringViews[row] = StringView(canonicalString);
+          bufferTracker.startNewString();
         }
       });
+
       localResult = std::make_shared<FlatVector<StringView>>(
           context.pool(),
           JSON(),
           nullptr,
           rows.end(),
-          flatInput->values(),
-          std::move(stringBuffers));
+          stringViews,
+          std::vector<BufferPtr>{buffer});
     }
 
     context.moveOrCopyResult(localResult, rows, result);
@@ -130,11 +251,11 @@ class JsonParseFunction : public exec::VectorFunction {
   }
 
  private:
-  simdjson::error_code parse(size_t size) const {
+  simdjson::error_code parse(size_t size, JsonViews& jsonViews) const {
     simdjson::padded_string_view paddedInput(
         paddedInput_.data(), size, paddedInput_.size());
     SIMDJSON_ASSIGN_OR_RAISE(auto doc, simdjsonParse(paddedInput));
-    SIMDJSON_TRY(validate<simdjson::ondemand::document&>(doc));
+    SIMDJSON_TRY(validate<simdjson::ondemand::document&>(doc, jsonViews));
     if (!doc.at_end()) {
       return simdjson::TRAILING_CONTENT;
     }
@@ -142,33 +263,94 @@ class JsonParseFunction : public exec::VectorFunction {
   }
 
   template <typename T>
-  static simdjson::error_code validate(T value) {
+  static simdjson::error_code validate(T value, JsonViews& jsonViews) {
     SIMDJSON_ASSIGN_OR_RAISE(auto type, value.type());
     switch (type) {
       case simdjson::ondemand::json_type::array: {
         SIMDJSON_ASSIGN_OR_RAISE(auto array, value.get_array());
+
+        jsonViews.push_back(kArrayStart);
+        auto jsonViewsSize = jsonViews.size();
         for (auto elementOrError : array) {
           SIMDJSON_ASSIGN_OR_RAISE(auto element, elementOrError);
-          SIMDJSON_TRY(validate(element));
+          SIMDJSON_TRY(validate(element, jsonViews));
+          jsonViews.push_back(kSeparator);
+        }
+
+        // If the array is not empty, remove the last separator.
+        if (jsonViews.size() > jsonViewsSize) {
+          jsonViews.pop_back();
         }
+
+        jsonViews.push_back(kArrayEnd);
+
         return simdjson::SUCCESS;
       }
+
       case simdjson::ondemand::json_type::object: {
         SIMDJSON_ASSIGN_OR_RAISE(auto object, value.get_object());
+
+        std::vector<std::pair<std::string_view, JsonViews>> objFields;
         for (auto fieldOrError : object) {
           SIMDJSON_ASSIGN_OR_RAISE(auto field, fieldOrError);
-          SIMDJSON_TRY(validate(field.value()));
+          auto key = field.key_raw_json_token();
+          JsonViews elementArray;
+          SIMDJSON_TRY(validate(field.value(), elementArray));
+          objFields.push_back({key, elementArray});
         }
+
+        std::sort(objFields.begin(), objFields.end(), [](auto& a, auto& b) {
+          // Remove the quotes from the keys before we sort them.
+          auto af = std::string_view{a.first.data() + 1, a.first.size() - 2};
+          auto bf = std::string_view{b.first.data() + 1, b.first.size() - 2};
+          return lessThan(a.first, b.first);
+        });
+
+        jsonViews.push_back(kObjectStart);
+
+        for (auto i = 0; i < objFields.size(); i++) {
+          auto field = objFields[i];
+          addOrMergeViews(jsonViews, field.first);
+          jsonViews.push_back(kObjectKeySeparator);
+
+          for (auto& element : field.second) {
+            addOrMergeViews(jsonViews, element);
+          }
+
+          if (i < objFields.size() - 1) {
+            jsonViews.push_back(kSeparator);
+          }
+        }
+
+        jsonViews.push_back(kObjectEnd);
         return simdjson::SUCCESS;
       }
-      case simdjson::ondemand::json_type::number:
+
+      case simdjson::ondemand::json_type::number: {
+        SIMDJSON_ASSIGN_OR_RAISE(auto rawJson, value.raw_json());
+        addOrMergeViews(jsonViews, rawJson);
+
         return value.get_double().error();
-      case simdjson::ondemand::json_type::string:
+      }
+      case simdjson::ondemand::json_type::string: {
+        SIMDJSON_ASSIGN_OR_RAISE(auto rawJson, value.raw_json());
+        addOrMergeViews(jsonViews, rawJson);
+
         return value.get_string().error();
-      case simdjson::ondemand::json_type::boolean:
+      }
+
+      case simdjson::ondemand::json_type::boolean: {
+        SIMDJSON_ASSIGN_OR_RAISE(auto rawJson, value.raw_json());
+        addOrMergeViews(jsonViews, rawJson);
+
         return value.get_bool().error();
+      }
+
       case simdjson::ondemand::json_type::null: {
         SIMDJSON_ASSIGN_OR_RAISE(auto isNull, value.is_null());
+        SIMDJSON_ASSIGN_OR_RAISE(auto rawJson, value.raw_json());
+        addOrMergeViews(jsonViews, rawJson);
+
         return isNull ? simdjson::SUCCESS : simdjson::N_ATOM_ERROR;
       }
     }
diff --git a/velox/functions/prestosql/json/JsonStringUtil.cpp b/velox/functions/prestosql/json/JsonStringUtil.cpp
index 43be101ec40c..29c14c59c598 100644
--- a/velox/functions/prestosql/json/JsonStringUtil.cpp
+++ b/velox/functions/prestosql/json/JsonStringUtil.cpp
@@ -18,6 +18,7 @@
 #include "folly/Unicode.h"
 
 #include "velox/common/base/Exceptions.h"
+#include "velox/external/utf8proc/utf8procImpl.h"
 #include "velox/functions/lib/Utf8Utils.h"
 #include "velox/functions/prestosql/json/JsonStringUtil.h"
 
@@ -31,6 +32,20 @@ FOLLY_ALWAYS_INLINE char hexDigit(uint8_t c) {
   return c < 10 ? c + '0' : c - 10 + 'A';
 }
 
+FOLLY_ALWAYS_INLINE int32_t digitToHex(char c) {
+  if (c >= '0' && c <= '9') {
+    return c - '0';
+  }
+  if (c >= 'A' && c <= 'F') {
+    return c - 'A' + 10;
+  }
+  if (c >= 'a' && c <= 'f') {
+    return c - 'a' + 10;
+  }
+
+  VELOX_USER_FAIL("Invalid escape digit: {}", c);
+}
+
 FOLLY_ALWAYS_INLINE void writeHex(char16_t value, char*& out) {
   value = folly::Endian::little(value);
   *out++ = '\\';
@@ -181,4 +196,168 @@ size_t escapedStringSize(const char* input, size_t length) {
   return outSize;
 }
 
+namespace {
+int32_t compareChars(
+    const std::string_view& first,
+    const std::string_view& second,
+    size_t& i,
+    size_t& j) {
+  // Both are ASCII.
+  if (FOLLY_LIKELY(!(first[i] & 0x80) && !(second[j] & 0x80))) {
+    // Check if escaped.
+    auto firstChar = first[i] == '\\' ? first[++i] : first[i];
+    auto secondChar = second[j] == '\\' ? second[++j] : second[j];
+    i++;
+    j++;
+    return firstChar - secondChar;
+  } else {
+    // Assume unicode.
+    uint32_t firstCodePoint = 0;
+    uint32_t secondCodePoint = 0;
+    auto firstSize = 0;
+    auto secondSize = 0;
+    if (first[i] & 0x80) {
+      firstCodePoint = utf8proc_codepoint(
+          first.data() + i, first.data() + first.size(), firstSize);
+    } else {
+      firstCodePoint = first[i];
+    }
+
+    if (second[j] & 0x80) {
+      secondCodePoint = utf8proc_codepoint(
+          second.data() + j, second.data() + second.size(), secondSize);
+    } else {
+      secondCodePoint = second[j];
+    }
+
+    i += firstSize > 0 ? firstSize : 1;
+    j += secondSize > 0 ? secondSize : 1;
+    return firstCodePoint - secondCodePoint;
+  }
+}
+} // namespace
+
+bool lessThan(const std::string_view& first, const std::string_view& second) {
+  size_t firstLength = first.size();
+  size_t secondLength = second.size();
+  size_t minLength = std::min(firstLength, secondLength);
+
+  for (size_t i = 0, j = 0; i < minLength && j < minLength;) {
+    auto result = compareChars(first, second, i, j);
+    if (result != 0) {
+      return result < 0;
+    }
+  }
+
+  return firstLength < secondLength;
+}
+
+namespace {
+int32_t parseHex(const std::string_view& hexString) {
+  int32_t result = 0;
+  for (auto c : hexString) {
+    result = (result << 4) + digitToHex(c);
+  }
+
+  return result;
+}
+
+bool isHighSurrogate(std::int32_t code_point) {
+  return code_point >= 0xD800 && code_point <= 0xDBFF;
+}
+
+bool isLowSurrogate(std::int32_t code_point) {
+  return code_point >= 0xDC00 && code_point <= 0xDFFF;
+}
+
+} // namespace
+
+size_t prestoJavaEscapeString(const char* input, size_t length, char* output) {
+  char* pos = output;
+
+  auto* start = reinterpret_cast<const unsigned char*>(input);
+  auto* end = reinterpret_cast<const unsigned char*>(input + length);
+  while (start < end) {
+    int count = validateAndGetNextUtf8Length(start, end);
+    switch (count) {
+      case 1: {
+        // Unescape characters that are escaped by \ character.
+        if (FOLLY_UNLIKELY(*start == '\\')) {
+          if (start + 1 == end) {
+            VELOX_USER_FAIL("Invalid escape sequence at the end of string");
+          }
+          // Presto java implementation only unescapes the / character.
+          switch (*(start + 1)) {
+            case '/':
+              *pos++ = '/';
+              start += 2;
+              continue;
+            case 'u': {
+              if (start + 5 > end) {
+                VELOX_USER_FAIL("Invalid escape sequence at the end of string");
+              }
+
+              // Read 4 hex digits.
+              auto codePoint = parseHex(std::string_view(
+                  reinterpret_cast<const char*>(start) + 2, 4));
+
+              // Presto java implementation doesnt unescape surrogate pairs.
+              // Thus we just write it out in the same way as it is.
+              if (isHighSurrogate(codePoint) || isLowSurrogate(codePoint)) {
+                memcpy(pos, reinterpret_cast<const char*>(start), 6);
+                pos += 6;
+                start += 6;
+                continue;
+              }
+
+              // Otherwise write it as a single code point.
+              auto increment = utf8proc_encode_char(
+                  codePoint, reinterpret_cast<unsigned char*>(pos));
+              pos += increment;
+              start += 6;
+              continue;
+            }
+            default:
+              *pos++ = *start;
+              *pos++ = *(start + 1);
+              start += 2;
+              continue;
+          }
+        } else {
+          *pos++ = *start;
+          start++;
+          continue;
+        }
+      }
+      case 2: {
+        memcpy(pos, reinterpret_cast<const char*>(start), 2);
+        pos += 2;
+        start += 2;
+        continue;
+      }
+      case 3: {
+        memcpy(pos, reinterpret_cast<const char*>(start), 3);
+        pos += 3;
+        start += 3;
+        continue;
+      }
+      case 4: {
+        char32_t codePoint = folly::utf8ToCodePoint(start, end, true);
+        if (codePoint == U'\ufffd') {
+          writeHex(0xFFFDu, pos);
+          continue;
+        }
+        encodeUtf16Hex(codePoint, pos);
+        continue;
+      }
+      default: {
+        writeHex(0xFFFDu, pos);
+        start++;
+      }
+    }
+  }
+
+  return (pos - output);
+}
+
 } // namespace facebook::velox
diff --git a/velox/functions/prestosql/json/JsonStringUtil.h b/velox/functions/prestosql/json/JsonStringUtil.h
index 65cadd86bf68..384f6b768bd1 100644
--- a/velox/functions/prestosql/json/JsonStringUtil.h
+++ b/velox/functions/prestosql/json/JsonStringUtil.h
@@ -40,13 +40,33 @@ namespace facebook::velox {
 ///                responsible to allocate enough space for output.
 void escapeString(const char* input, size_t length, char* output);
 
+/// Unescape the unicode characters of `input` to make it canonical for JSON
+/// The behavior is compatible with Presto Java's json_parse.
+/// Presto java json_parse will unescape the following characters:
+/// \/ and non surrogate unicode characters.
+/// Other behavior is similar to escapeString.
+/// @param input: Input string to escape that is UTF-8 encoded.
+/// @param length: Length of the input string.
+/// @param output: Output string to write the escaped input to. The caller is
+///                responsible to allocate enough space for output.
+/// @return The number of bytes written to the output.
+size_t prestoJavaEscapeString(const char* input, size_t length, char* output);
+
 /// Return the size of string after the unicode characters of `input` are
 /// escaped using the method as in`escapeString`. The function will iterate
 /// over `input` once.
 /// @param input: Input string to escape that is UTF-8 encoded.
 /// @param length: Length of the input string.
+/// @return The size of the string after escaping.
 size_t escapedStringSize(const char* input, size_t length);
 
+/// Compares two string views. The comparison takes into account
+/// escape sequences and also unicode characters.
+/// Returns true if first is less than second else false.
+/// @param first: First string to compare.
+/// @param second: Second string to compare.
+bool lessThan(const std::string_view& first, const std::string_view& second);
+
 /// For test only. Encode `codePoint` value by UTF-16 and write the one or two
 /// prefixed hexadecimals to `out`. Move `out` forward by 6 or 12 chars
 /// accordingly. The caller shall ensure there is enough space in `out`.
diff --git a/velox/functions/prestosql/tests/JsonFunctionsTest.cpp b/velox/functions/prestosql/tests/JsonFunctionsTest.cpp
index 067d374411f3..fc1346325446 100644
--- a/velox/functions/prestosql/tests/JsonFunctionsTest.cpp
+++ b/velox/functions/prestosql/tests/JsonFunctionsTest.cpp
@@ -14,6 +14,7 @@
  * limitations under the License.
  */
 
+#include "folly/Unicode.h"
 #include "velox/common/base/tests/GTestUtils.h"
 #include "velox/functions/prestosql/tests/utils/FunctionBaseTest.h"
 #include "velox/functions/prestosql/types/JsonType.h"
@@ -71,6 +72,13 @@ class JsonFunctionsTest : public functions::test::FunctionBaseTest {
     return makeNullableFlatVector<StringView>({s}, JSON());
   }
 
+  void testJsonParse(std::string json, std::string expectedJson) {
+    auto data = makeRowVector({makeFlatVector<std::string>({json})});
+    auto result = evaluate("json_parse(c0)", data);
+    auto expected = makeFlatVector<std::string>({expectedJson}, JSON());
+    velox::test::assertEqualVectors(expected, result);
+  }
+
   std::pair<VectorPtr, VectorPtr> makeVectors(std::optional<std::string> json) {
     std::optional<StringView> s = json.has_value()
         ? std::make_optional(StringView(json.value()))
@@ -189,13 +197,32 @@ TEST_F(JsonFunctionsTest, jsonParse) {
   };
 
   EXPECT_EQ(jsonParse(std::nullopt), std::nullopt);
+  // Spaces before and after.
+  EXPECT_EQ(jsonParse(R"( "abc"       )"), R"("abc")");
   EXPECT_EQ(jsonParse(R"(true)"), "true");
   EXPECT_EQ(jsonParse(R"(null)"), "null");
   EXPECT_EQ(jsonParse(R"(42)"), "42");
   EXPECT_EQ(jsonParse(R"("abc")"), R"("abc")");
-  EXPECT_EQ(jsonParse(R"([1, 2, 3])"), "[1, 2, 3]");
-  EXPECT_EQ(jsonParse(R"({"k1":"v1"})"), R"({"k1":"v1"})");
-  EXPECT_EQ(jsonParse(R"(["k1", "v1"])"), R"(["k1", "v1"])");
+  EXPECT_EQ(jsonParse("\"abc\u4FE1\""), "\"abc\u4FE1\"");
+  auto utf32cp = folly::codePointToUtf8(U'😀');
+  testJsonParse(fmt::format("\"{}\"", utf32cp), R"("\uD83D\uDE00")");
+  EXPECT_EQ(jsonParse(R"([1, 2, 3])"), "[1,2,3]");
+  EXPECT_EQ(jsonParse(R"({"k1": "v1" })"), R"({"k1":"v1"})");
+  EXPECT_EQ(jsonParse(R"(["k1", "v1"])"), R"(["k1","v1"])");
+  testJsonParse(R"({ "abc" : "\/"})", R"({"abc":"/"})");
+  testJsonParse(R"({ "abc" : "\\/"})", R"({"abc":"\\/"})");
+  testJsonParse(R"({ "abc" : [1, 2, 3, 4    ]})", R"({"abc":[1,2,3,4]})");
+  // Test out with unicodes and empty keys.
+  testJsonParse(
+      R"({"4":0.1,"\"":0.14, "自社在庫":0.1, "٢": 2.0, "١": 1.0, "१": 1.0, "": 3.5})",
+      R"({"":3.5,"\"":0.14,"4":0.1,"١":1.0,"٢":2.0,"१":1.0,"自社在庫":0.1})");
+  testJsonParse(
+      R"({"error":"Falha na configura\u00e7\u00e3o do pagamento"})",
+      R"({"error":"Falha na configuração do pagamento"})");
+  // Test unicode in key and surogate pairs in values.
+  testJsonParse(
+      R"({"utf\u4FE1": "\u4FE1 \uD83D\uDE00 \/ \n abc a\uD834\uDD1Ec \u263Acba "})",
+      R"({"utf信":"信 \uD83D\uDE00 / \n abc a\uD834\uDD1Ec ☺cba "})");
 
   VELOX_ASSERT_THROW(
       jsonParse(R"({"k1":})"), "The JSON document has an improper structure");
@@ -228,7 +255,7 @@ TEST_F(JsonFunctionsTest, jsonParse) {
 
   VELOX_ASSERT_THROW(
       evaluate("json_parse(c0)", data),
-      "Unexpected trailing content in the JSON input");
+      "TAPE_ERROR: The JSON document has an improper structure: missing or superfluous commas, braces, missing keys, etc.");
 
   data = makeRowVector({makeFlatVector<StringView>(
       {R"("This is a long sentence")", R"("This is some other sentence")"})});
@@ -276,6 +303,100 @@ TEST_F(JsonFunctionsTest, jsonParse) {
   }
 }
 
+TEST_F(JsonFunctionsTest, canonicalization) {
+  auto json = R"({
+  "menu": {
+      "id": "file",
+      "value": "File",
+      "popup": {
+          "menuitem": [
+              {
+                  "value": "New",
+                  "onclick": "CreateNewDoc() "
+              },
+              {
+                  "value": "Open",
+                  "onclick": "OpenDoc() "
+              },
+              {
+                  "value": "Close",
+                  "onclick": "CloseDoc() "
+              }
+          ]
+  }
+  }
+  })";
+
+  auto expectedJson =
+      R"({"menu":{"id":"file","popup":{"menuitem":[{"onclick":"CreateNewDoc() ","value":"New"},{"onclick":"OpenDoc() ","value":"Open"},{"onclick":"CloseDoc() ","value":"Close"}]},"value":"File"}})";
+  testJsonParse(json, expectedJson);
+
+  json =
+      "{\n"
+      "  \"name\": \"John Doe\",\n"
+      "  \"address\": {\n"
+      "    \"street\": \"123 Main St\",\n"
+      "    \"city\": \"Anytown\",\n"
+      "    \"state\": \"CA\",\n"
+      "    \"zip\": \"12345\"\n"
+      "  },\n"
+      "  \"phoneNumbers\": [\n"
+      "    {\n"
+      "      \"type\": \"home\",\n"
+      "      \"number\": \"555-1234\"\n"
+      "    },\n"
+      "    {\n"
+      "      \"type\": \"work\",\n"
+      "      \"number\": \"555-5678\"\n"
+      "    }\n"
+      "  ],\n"
+      "  \"familyMembers\": [\n"
+      "    {\n"
+      "      \"name\": \"Jane Doe\",\n"
+      "      \"relationship\": \"wife\"\n"
+      "    },\n"
+      "    {\n"
+      "      \"name\": \"Jimmy Doe\",\n"
+      "      \"relationship\": \"son\"\n"
+      "    }\n"
+      "  ],\n"
+      "  \"hobbies\": [\"golf\", \"reading\", \"traveling\"]\n"
+      "}";
+  expectedJson =
+      R"({"address":{"city":"Anytown","state":"CA","street":"123 Main St","zip":"12345"},"familyMembers":[{"name":"Jane Doe","relationship":"wife"},{"name":"Jimmy Doe","relationship":"son"}],"hobbies":["golf","reading","traveling"],"name":"John Doe","phoneNumbers":[{"number":"555-1234","type":"home"},{"number":"555-5678","type":"work"}]})";
+  testJsonParse(json, expectedJson);
+
+  // Json with spaces in keys
+  json = R"({
+  "menu": {
+      "id": "file",
+      "value": "File",
+      "emptyArray": [],
+      "popup": {
+          "menuitem": [
+              {
+                  "value ": "New ",
+                  "onclick": "CreateNewDoc() ",
+                  " value ": " Space "
+              }
+           ]
+  }
+  }
+  })";
+
+  expectedJson =
+      R"({"menu":{"emptyArray":[],"id":"file","popup":{"menuitem":[{" value ":" Space ","onclick":"CreateNewDoc() ","value ":"New "}]},"value":"File"}})";
+  testJsonParse(json, expectedJson);
+
+  json =
+      R"({"stars":[{"updated_deferred_payout_state":null,"onboard_surface":"MTA_ON_MOBILE","entry_point":"FROM_STARS_DEFAULT_ON_LEARN_MORE_ACTION_SCREEN","task_name":null,"event":"START_APPLICATION","time":1678975122,"user_id":100004750901196},{"updated_deferred_payout_state":null,"onboard_surface":"MTA_ON_MOBILE","entry_point":"FROM_STARS_DEFAULT_ON_LEARN_MORE_ACTION_SCREEN","task_name":"STARS_SIGN_TOS","event":"START_TASK","time":1678975122,"user_id":100004750901196},{"updated_deferred_payout_state":null,"onboard_surface":"MTA_ON_MOBILE","entry_point":"FROM_STARS_DEFAULT_ON_LEARN_MORE_ACTION_SCREEN","task_name":"STARS_SIGN_TOS","event":"COMPLETE_TASK","time":1678975128,"user_id":100004750901196},{"error":null,"updated_deferred_payout_state":null,"onboard_surface":"MTA_ON_MOBILE","entry_point":"FROM_MOBILE_PRO_DASH","task_name":null,"event":"START_APPLICATION","time":1706866395,"user_id":100004750901196},{"error":null,"updated_deferred_payout_state":null,"onboard_surface":"MTA_ON_MOBILE","entry_point":"FROM_MOBILE_PRO_DASH","task_name":"STARS_DEFERRED_PAYOUT_WITH_TOS","event":"START_TASK","time":1706866395,"user_id":100004750901196},{"error":null,"updated_deferred_payout_state":"PAYOUT_SETUP_DEFERRED","onboard_surface":"MTA_ON_MOBILE","entry_point":"FROM_MOBILE_PRO_DASH","task_name":"STARS_DEFERRED_PAYOUT_WITH_TOS","event":"COMPLETE_TASK","time":1706866402,"user_id":100004750901196},{"error":null,"updated_deferred_payout_state":null,"onboard_surface":"MTA_ON_MOBILE","entry_point":"FROM_MOBILE_PRO_DASH","task_name":null,"event":"SUBMIT_APPLICATION","time":1706866402,"user_id":100004750901196},{"error":null,"updated_deferred_payout_state":null,"onboard_surface":"MTA_ON_MOBILE","entry_point":"FROM_MOBILE_PRO_DASH","task_name":null,"event":"APPLICATION_APPROVED","time":1706866402,"user_id":100004750901196}]})";
+
+  expectedJson =
+      R"({"stars":[{"entry_point":"FROM_STARS_DEFAULT_ON_LEARN_MORE_ACTION_SCREEN","event":"START_APPLICATION","onboard_surface":"MTA_ON_MOBILE","task_name":null,"time":1678975122,"updated_deferred_payout_state":null,"user_id":100004750901196},{"entry_point":"FROM_STARS_DEFAULT_ON_LEARN_MORE_ACTION_SCREEN","event":"START_TASK","onboard_surface":"MTA_ON_MOBILE","task_name":"STARS_SIGN_TOS","time":1678975122,"updated_deferred_payout_state":null,"user_id":100004750901196},{"entry_point":"FROM_STARS_DEFAULT_ON_LEARN_MORE_ACTION_SCREEN","event":"COMPLETE_TASK","onboard_surface":"MTA_ON_MOBILE","task_name":"STARS_SIGN_TOS","time":1678975128,"updated_deferred_payout_state":null,"user_id":100004750901196},{"entry_point":"FROM_MOBILE_PRO_DASH","error":null,"event":"START_APPLICATION","onboard_surface":"MTA_ON_MOBILE","task_name":null,"time":1706866395,"updated_deferred_payout_state":null,"user_id":100004750901196},{"entry_point":"FROM_MOBILE_PRO_DASH","error":null,"event":"START_TASK","onboard_surface":"MTA_ON_MOBILE","task_name":"STARS_DEFERRED_PAYOUT_WITH_TOS","time":1706866395,"updated_deferred_payout_state":null,"user_id":100004750901196},{"entry_point":"FROM_MOBILE_PRO_DASH","error":null,"event":"COMPLETE_TASK","onboard_surface":"MTA_ON_MOBILE","task_name":"STARS_DEFERRED_PAYOUT_WITH_TOS","time":1706866402,"updated_deferred_payout_state":"PAYOUT_SETUP_DEFERRED","user_id":100004750901196},{"entry_point":"FROM_MOBILE_PRO_DASH","error":null,"event":"SUBMIT_APPLICATION","onboard_surface":"MTA_ON_MOBILE","task_name":null,"time":1706866402,"updated_deferred_payout_state":null,"user_id":100004750901196},{"entry_point":"FROM_MOBILE_PRO_DASH","error":null,"event":"APPLICATION_APPROVED","onboard_surface":"MTA_ON_MOBILE","task_name":null,"time":1706866402,"updated_deferred_payout_state":null,"user_id":100004750901196}]})";
+
+  testJsonParse(json, expectedJson);
+}
+
 TEST_F(JsonFunctionsTest, isJsonScalarSignatures) {
   auto signatures = getSignatureStrings("is_json_scalar");
   ASSERT_EQ(2, signatures.size());
diff --git a/velox/type/Conversions.cpp b/velox/type/Conversions.cpp
index c2cbdad99289..3b2d87ac659b 100644
--- a/velox/type/Conversions.cpp
+++ b/velox/type/Conversions.cpp
@@ -30,9 +30,9 @@ namespace facebook::velox::util {
 /// folly's tryTo doesn't ignore control characters or other unicode whitespace.
 /// We trim the string for control and unicode whitespace
 /// from both directions and return a StringView of the result.
-StringView trimWhiteSpace(const char* data, size_t length) {
+std::string_view trimWhiteSpace(const char* data, size_t length) {
   if (length == 0) {
-    return StringView(data, 0);
+    return std::string_view(data, 0);
   }
 
   int startIndex = 0;
@@ -67,7 +67,7 @@ StringView trimWhiteSpace(const char* data, size_t length) {
   }
 
   // Trim whitespace from right side.
-  for (auto i = length - 1; i > startIndex;) {
+  for (auto i = length - 1; i >= startIndex;) {
     size = 0;
     auto isWhiteSpaceOrControlChar = false;
 
@@ -100,7 +100,7 @@ StringView trimWhiteSpace(const char* data, size_t length) {
 
   // If we end on a unicode char make sure we add that to the end.
   auto charSize = size > 0 ? size : 1;
-  return StringView(data + startIndex, endIndex - startIndex + charSize);
+  return std::string_view(data + startIndex, endIndex - startIndex + charSize);
 }
 
 } // namespace facebook::velox::util
diff --git a/velox/type/Conversions.h b/velox/type/Conversions.h
index 25847b1a99b5..bab6e2a49e96 100644
--- a/velox/type/Conversions.h
+++ b/velox/type/Conversions.h
@@ -233,7 +233,7 @@ struct Converter<TypeKind::BOOLEAN, void, TPolicy> {
 /// Presto compatible trim of whitespace. This also trims
 /// control characters from both front and back and returns
 /// a StringView of the trimmed string.
-StringView trimWhiteSpace(const char* data, size_t length);
+std::string_view trimWhiteSpace(const char* data, size_t length);
 
 /// To TINYINT, SMALLINT, INTEGER, BIGINT, and HUGEINT converter.
 template <TypeKind KIND, typename TPolicy>