From 30a901e40cbbaa4f965e5ebc347096bb7ad62ee7 Mon Sep 17 00:00:00 2001 From: Krishna Pai Date: Wed, 16 Oct 2024 16:32:03 -0700 Subject: [PATCH] Add support for canonicalization of JSON. --- velox/functions/prestosql/JsonFunctions.cpp | 167 ++++++++++++++++-- .../prestosql/tests/JsonFunctionsTest.cpp | 76 +++++++- 2 files changed, 221 insertions(+), 22 deletions(-) diff --git a/velox/functions/prestosql/JsonFunctions.cpp b/velox/functions/prestosql/JsonFunctions.cpp index 8628a669d828..22b1d00d21f8 100644 --- a/velox/functions/prestosql/JsonFunctions.cpp +++ b/velox/functions/prestosql/JsonFunctions.cpp @@ -16,9 +16,82 @@ #include "velox/expression/VectorFunction.h" #include "velox/functions/prestosql/json/SIMDJsonUtil.h" #include "velox/functions/prestosql/types/JsonType.h" +#include "velox/type/Conversions.h" namespace facebook::velox::functions { +namespace { +constexpr const char* kArrayStart = "["; +constexpr const char* kArrayEnd = "]"; +constexpr const char* kSeparator = ","; +constexpr const char* kObjectStart = "{"; +constexpr const char* kObjectEnd = "}"; +constexpr const char* kObjectKeySeparator = ":"; +constexpr const char* kQuote = "\""; + +class JsonView { + public: + virtual void canonicalize(std::stringstream& stream) = 0; +}; + +using JsonViewPtr = std::shared_ptr; + +struct JsonLeafView : public JsonView { + JsonLeafView(const StringView view) : view_(view){}; + + void canonicalize(std::stringstream& stream) override { + stream << view_; + } + + private: + const StringView view_; +}; + +struct JsonArrayView : public JsonView { + JsonArrayView(const std::vector array) : array_(array){}; + + void canonicalize(std::stringstream& stream) override { + stream << kArrayStart; + for (auto i = 0; i < array_.size(); i++) { + array_[i]->canonicalize(stream); + if (i < array_.size() - 1) { + stream << kSeparator; + } + } + stream << kArrayEnd; + } + + private: + const std::vector array_; +}; + +struct JsonObjView : public JsonView { + JsonObjView(std::vector> objFields) + : objFields_(objFields){}; + + void canonicalize(std::stringstream& stream) override { + std::sort(objFields_.begin(), objFields_.end(), [](auto& a, auto& b) { + return a.first < b.first; + }); + + stream << kObjectStart; + for (auto i = 0; i < objFields_.size(); i++) { + auto field = objFields_[i]; + stream << kQuote << field.first << kQuote << kObjectKeySeparator; + field.second->canonicalize(stream); + if (i < objFields_.size() - 1) { + stream << kSeparator; + } + } + stream << kObjectEnd; + } + + private: + std::vector> objFields_; +}; + +} // namespace + namespace { class JsonFormatFunction : public exec::VectorFunction { public: @@ -84,14 +157,32 @@ class JsonParseFunction : public exec::VectorFunction { auto value = arg->as>()->valueAt(0); paddedInput_.resize(value.size() + simdjson::SIMDJSON_PADDING); memcpy(paddedInput_.data(), value.data(), value.size()); - if (auto error = parse(value.size())) { + auto canonicalStringStream = std::stringstream{}; + JsonViewPtr jsonView; + + if (auto error = parse(value.size(), jsonView)) { context.setErrors(rows, errors_[error]); return; } - localResult = std::make_shared>( - context.pool(), rows.end(), false, JSON(), std::move(value)); + + jsonView->canonicalize(canonicalStringStream); + localResult = BaseVector::createConstant( + JSON(), canonicalStringStream.str(), rows.end(), context.pool()); + } else { auto flatInput = arg->asFlatVector(); + BufferPtr stringViews = AlignedBuffer::allocate( + rows.end(), context.pool(), StringView()); + + // TODO: Optimize this + localResult = std::make_shared>( + context.pool(), + JSON(), + nullptr, + rows.end(), + stringViews, + std::vector{}); + auto flatResult = localResult->asFlatVector(); auto stringBuffers = flatInput->stringBuffers(); VELOX_CHECK_LE(rows.end(), flatInput->size()); @@ -102,20 +193,23 @@ class JsonParseFunction : public exec::VectorFunction { maxSize = std::max(maxSize, value.size()); }); paddedInput_.resize(maxSize + simdjson::SIMDJSON_PADDING); + + auto canonicalStringStream = std::stringstream{}; + JsonViewPtr jsonView; rows.applyToSelected([&](auto row) { auto value = flatInput->valueAt(row); memcpy(paddedInput_.data(), value.data(), value.size()); - if (auto error = parse(value.size())) { + if (auto error = parse(value.size(), jsonView)) { context.setVeloxExceptionError(row, errors_[error]); + } else { + jsonView->canonicalize(canonicalStringStream); + auto canonicalString = canonicalStringStream.str(); + // TODO: This creates a copy, can we optimize. + flatResult->set( + row, StringView(canonicalString.data(), canonicalString.size())); + canonicalStringStream.str(std::string()); } }); - localResult = std::make_shared>( - context.pool(), - JSON(), - nullptr, - rows.end(), - flatInput->values(), - std::move(stringBuffers)); } context.moveOrCopyResult(localResult, rows, result); @@ -130,11 +224,11 @@ class JsonParseFunction : public exec::VectorFunction { } private: - simdjson::error_code parse(size_t size) const { + simdjson::error_code parse(size_t size, JsonViewPtr& jsonView) const { simdjson::padded_string_view paddedInput( paddedInput_.data(), size, paddedInput_.size()); SIMDJSON_ASSIGN_OR_RAISE(auto doc, simdjsonParse(paddedInput)); - SIMDJSON_TRY(validate(doc)); + SIMDJSON_TRY(validate(doc, jsonView)); if (!doc.at_end()) { return simdjson::TRAILING_CONTENT; } @@ -142,33 +236,68 @@ class JsonParseFunction : public exec::VectorFunction { } template - static simdjson::error_code validate(T value) { + static simdjson::error_code validate(T value, JsonViewPtr& jsonView) { SIMDJSON_ASSIGN_OR_RAISE(auto type, value.type()); switch (type) { case simdjson::ondemand::json_type::array: { SIMDJSON_ASSIGN_OR_RAISE(auto array, value.get_array()); + + std::vector arrayPtr; for (auto elementOrError : array) { SIMDJSON_ASSIGN_OR_RAISE(auto element, elementOrError); - SIMDJSON_TRY(validate(element)); + JsonViewPtr elementPtr; + SIMDJSON_TRY(validate(element, elementPtr)); + arrayPtr.push_back(elementPtr); } + + jsonView = std::make_shared(arrayPtr); return simdjson::SUCCESS; } case simdjson::ondemand::json_type::object: { SIMDJSON_ASSIGN_OR_RAISE(auto object, value.get_object()); + + std::vector> objFields; for (auto fieldOrError : object) { SIMDJSON_ASSIGN_OR_RAISE(auto field, fieldOrError); - SIMDJSON_TRY(validate(field.value())); + JsonViewPtr elementPtr; + auto key = field.escaped_key(); + auto trimmedKey = velox::util::trimWhiteSpace(key.data(), key.size()); + SIMDJSON_TRY(validate(field.value(), elementPtr)); + objFields.push_back({trimmedKey, elementPtr}); } + + jsonView = std::make_shared(objFields); return simdjson::SUCCESS; } - case simdjson::ondemand::json_type::number: + case simdjson::ondemand::json_type::number: { + std::string_view rawJsonv = value.raw_json_token(); + + jsonView = std::make_shared( + velox::util::trimWhiteSpace(rawJsonv.data(), rawJsonv.size())); return value.get_double().error(); - case simdjson::ondemand::json_type::string: + } + case simdjson::ondemand::json_type::string: { + std::string_view rawJsonv = value.raw_json_token(); + + auto s = velox::util::trimWhiteSpace(rawJsonv.data(), rawJsonv.size()); + jsonView = std::make_shared(s); return value.get_string().error(); - case simdjson::ondemand::json_type::boolean: + } + + case simdjson::ondemand::json_type::boolean: { + std::string_view rawJsonv = value.raw_json_token(); + + jsonView = std::make_shared( + velox::util::trimWhiteSpace(rawJsonv.data(), rawJsonv.size())); return value.get_bool().error(); + } + case simdjson::ondemand::json_type::null: { SIMDJSON_ASSIGN_OR_RAISE(auto isNull, value.is_null()); + std::string_view rawJsonv = value.raw_json_token(); + + jsonView = std::make_shared( + velox::util::trimWhiteSpace(rawJsonv.data(), rawJsonv.size())); return isNull ? simdjson::SUCCESS : simdjson::N_ATOM_ERROR; } } diff --git a/velox/functions/prestosql/tests/JsonFunctionsTest.cpp b/velox/functions/prestosql/tests/JsonFunctionsTest.cpp index 067d374411f3..6c0a8cbb058b 100644 --- a/velox/functions/prestosql/tests/JsonFunctionsTest.cpp +++ b/velox/functions/prestosql/tests/JsonFunctionsTest.cpp @@ -189,13 +189,15 @@ TEST_F(JsonFunctionsTest, jsonParse) { }; EXPECT_EQ(jsonParse(std::nullopt), std::nullopt); + // Spaces before and after. + EXPECT_EQ(jsonParse(R"( "abc" )"), R"("abc")"); EXPECT_EQ(jsonParse(R"(true)"), "true"); EXPECT_EQ(jsonParse(R"(null)"), "null"); EXPECT_EQ(jsonParse(R"(42)"), "42"); EXPECT_EQ(jsonParse(R"("abc")"), R"("abc")"); - EXPECT_EQ(jsonParse(R"([1, 2, 3])"), "[1, 2, 3]"); - EXPECT_EQ(jsonParse(R"({"k1":"v1"})"), R"({"k1":"v1"})"); - EXPECT_EQ(jsonParse(R"(["k1", "v1"])"), R"(["k1", "v1"])"); + EXPECT_EQ(jsonParse(R"([1, 2, 3])"), "[1,2,3]"); + EXPECT_EQ(jsonParse(R"({"k1": "v1" })"), R"({"k1":"v1"})"); + EXPECT_EQ(jsonParse(R"(["k1", "v1"])"), R"(["k1","v1"])"); VELOX_ASSERT_THROW( jsonParse(R"({"k1":})"), "The JSON document has an improper structure"); @@ -276,6 +278,74 @@ TEST_F(JsonFunctionsTest, jsonParse) { } } +TEST_F(JsonFunctionsTest, canonicalization) { + const auto jsonParse = [&](std::optional value) { + return evaluateOnce("json_parse(c0)", value); + }; + + auto json = R"({ + "menu": { + "id": "file", + "value": "File", + "popup": { + "menuitem": [ + { + "value": "New", + "onclick": "CreateNewDoc() " + }, + { + "value": "Open", + "onclick": "OpenDoc() " + }, + { + "value": "Close", + "onclick": "CloseDoc() " + } + ] + } + } + })"; + + StringView expectedJson = + R"({"menu":{"id":"file","popup":{"menuitem":[{"onclick":"CreateNewDoc() ","value":"New"},{"onclick":"OpenDoc() ","value":"Open"},{"onclick":"CloseDoc() ","value":"Close"}]},"value":"File"}})"; + EXPECT_EQ(jsonParse(json), expectedJson); + + json = + "{\n" + " \"name\": \"John Doe\",\n" + " \"address\": {\n" + " \"street\": \"123 Main St\",\n" + " \"city\": \"Anytown\",\n" + " \"state\": \"CA\",\n" + " \"zip\": \"12345\"\n" + " },\n" + " \"phoneNumbers\": [\n" + " {\n" + " \"type\": \"home\",\n" + " \"number\": \"555-1234\"\n" + " },\n" + " {\n" + " \"type\": \"work\",\n" + " \"number\": \"555-5678\"\n" + " }\n" + " ],\n" + " \"familyMembers\": [\n" + " {\n" + " \"name\": \"Jane Doe\",\n" + " \"relationship\": \"wife\"\n" + " },\n" + " {\n" + " \"name\": \"Jimmy Doe\",\n" + " \"relationship\": \"son\"\n" + " }\n" + " ],\n" + " \"hobbies\": [\"golf\", \"reading\", \"traveling\"]\n" + "}"; + expectedJson = + R"({"address":{"city":"Anytown","state":"CA","street":"123 Main St","zip":"12345"},"familyMembers":[{"name":"Jane Doe","relationship":"wife"},{"name":"Jimmy Doe","relationship":"son"}],"hobbies":["golf","reading","traveling"],"name":"John Doe","phoneNumbers":[{"number":"555-1234","type":"home"},{"number":"555-5678","type":"work"}]})"; + EXPECT_EQ(jsonParse(json), expectedJson); +} + TEST_F(JsonFunctionsTest, isJsonScalarSignatures) { auto signatures = getSignatureStrings("is_json_scalar"); ASSERT_EQ(2, signatures.size());