diff --git a/velox/functions/prestosql/JsonFunctions.cpp b/velox/functions/prestosql/JsonFunctions.cpp index 8628a669d8285..4a39e718688c2 100644 --- a/velox/functions/prestosql/JsonFunctions.cpp +++ b/velox/functions/prestosql/JsonFunctions.cpp @@ -14,11 +14,92 @@ * limitations under the License. */ #include "velox/expression/VectorFunction.h" +#include "velox/functions/prestosql/json/JsonStringUtil.h" #include "velox/functions/prestosql/json/SIMDJsonUtil.h" #include "velox/functions/prestosql/types/JsonType.h" +#include "velox/type/Conversions.h" namespace facebook::velox::functions { +StringView trimAndEscape(const char* data, size_t length) { + return velox::util::trimWhiteSpace(data, length); +} + +namespace { +constexpr const char* kArrayStart = "["; +constexpr const char* kArrayEnd = "]"; +constexpr const char* kSeparator = ","; +constexpr const char* kObjectStart = "{"; +constexpr const char* kObjectEnd = "}"; +constexpr const char* kObjectKeySeparator = ":"; +constexpr const char* kQuote = "\""; + +class JsonView { + public: + virtual void canonicalize(std::stringstream& stream) = 0; +}; + +using JsonViewPtr = std::shared_ptr; + +struct JsonLeafView : public JsonView { + JsonLeafView(const StringView view) : view_(view) {}; + + void canonicalize(std::stringstream& stream) override { + auto canonicalValue = trimAndEscape(view_.data(), view_.size()); + stream << canonicalValue; + } + + private: + const StringView view_; +}; + +struct JsonArrayView : public JsonView { + JsonArrayView(const std::vector array) : array_(array) {}; + + void canonicalize(std::stringstream& stream) override { + stream << kArrayStart; + for (auto i = 0; i < array_.size(); i++) { + array_[i]->canonicalize(stream); + if (i < array_.size() - 1) { + stream << kSeparator; + } + } + stream << kArrayEnd; + } + + private: + const std::vector array_; +}; + +struct JsonObjView : public JsonView { + JsonObjView(std::vector> objFields) + : objFields_(objFields) {}; + + void canonicalize(std::stringstream& stream) override { + std::sort(objFields_.begin(), objFields_.end(), [](auto& a, auto& b) { + return a.first < b.first; + }); + + stream << kObjectStart; + for (auto i = 0; i < objFields_.size(); i++) { + auto field = objFields_[i]; + auto trimmedKey = trimAndEscape(field.first.data(), field.first.size()); + + stream << kQuote << trimmedKey << kQuote << kObjectKeySeparator; + field.second->canonicalize(stream); + if (i < objFields_.size() - 1) { + stream << kSeparator; + } + } + stream << kObjectEnd; + } + + private: + std::vector> objFields_; +}; + +} // namespace + namespace { class JsonFormatFunction : public exec::VectorFunction { public: @@ -84,14 +165,32 @@ class JsonParseFunction : public exec::VectorFunction { auto value = arg->as>()->valueAt(0); paddedInput_.resize(value.size() + simdjson::SIMDJSON_PADDING); memcpy(paddedInput_.data(), value.data(), value.size()); - if (auto error = parse(value.size())) { + auto canonicalStringStream = std::stringstream{}; + JsonViewPtr jsonView; + + if (auto error = parse(value.size(), jsonView)) { context.setErrors(rows, errors_[error]); return; } - localResult = std::make_shared>( - context.pool(), rows.end(), false, JSON(), std::move(value)); + + jsonView->canonicalize(canonicalStringStream); + localResult = BaseVector::createConstant( + JSON(), canonicalStringStream.str(), rows.end(), context.pool()); + } else { auto flatInput = arg->asFlatVector(); + BufferPtr stringViews = AlignedBuffer::allocate( + rows.end(), context.pool(), StringView()); + + // TODO: Optimize this + localResult = std::make_shared>( + context.pool(), + JSON(), + nullptr, + rows.end(), + stringViews, + std::vector{}); + auto flatResult = localResult->asFlatVector(); auto stringBuffers = flatInput->stringBuffers(); VELOX_CHECK_LE(rows.end(), flatInput->size()); @@ -102,20 +201,23 @@ class JsonParseFunction : public exec::VectorFunction { maxSize = std::max(maxSize, value.size()); }); paddedInput_.resize(maxSize + simdjson::SIMDJSON_PADDING); + + auto canonicalStringStream = std::stringstream{}; + JsonViewPtr jsonView; rows.applyToSelected([&](auto row) { auto value = flatInput->valueAt(row); memcpy(paddedInput_.data(), value.data(), value.size()); - if (auto error = parse(value.size())) { + if (auto error = parse(value.size(), jsonView)) { context.setVeloxExceptionError(row, errors_[error]); + } else { + jsonView->canonicalize(canonicalStringStream); + auto canonicalString = canonicalStringStream.str(); + // TODO: This creates a copy, can we optimize. + flatResult->set( + row, StringView(canonicalString.data(), canonicalString.size())); + canonicalStringStream.str(std::string()); } }); - localResult = std::make_shared>( - context.pool(), - JSON(), - nullptr, - rows.end(), - flatInput->values(), - std::move(stringBuffers)); } context.moveOrCopyResult(localResult, rows, result); @@ -130,11 +232,11 @@ class JsonParseFunction : public exec::VectorFunction { } private: - simdjson::error_code parse(size_t size) const { + simdjson::error_code parse(size_t size, JsonViewPtr& jsonView) const { simdjson::padded_string_view paddedInput( paddedInput_.data(), size, paddedInput_.size()); SIMDJSON_ASSIGN_OR_RAISE(auto doc, simdjsonParse(paddedInput)); - SIMDJSON_TRY(validate(doc)); + SIMDJSON_TRY(validate(doc, jsonView)); if (!doc.at_end()) { return simdjson::TRAILING_CONTENT; } @@ -142,33 +244,63 @@ class JsonParseFunction : public exec::VectorFunction { } template - static simdjson::error_code validate(T value) { + static simdjson::error_code validate(T value, JsonViewPtr& jsonView) { SIMDJSON_ASSIGN_OR_RAISE(auto type, value.type()); switch (type) { case simdjson::ondemand::json_type::array: { SIMDJSON_ASSIGN_OR_RAISE(auto array, value.get_array()); + + std::vector arrayPtr; for (auto elementOrError : array) { SIMDJSON_ASSIGN_OR_RAISE(auto element, elementOrError); - SIMDJSON_TRY(validate(element)); + JsonViewPtr elementPtr; + SIMDJSON_TRY(validate(element, elementPtr)); + arrayPtr.push_back(elementPtr); } + + jsonView = std::make_shared(std::move(arrayPtr)); return simdjson::SUCCESS; } case simdjson::ondemand::json_type::object: { SIMDJSON_ASSIGN_OR_RAISE(auto object, value.get_object()); + + std::vector> objFields; for (auto fieldOrError : object) { SIMDJSON_ASSIGN_OR_RAISE(auto field, fieldOrError); - SIMDJSON_TRY(validate(field.value())); + JsonViewPtr elementPtr; + auto key = StringView(field.escaped_key()); + SIMDJSON_TRY(validate(field.value(), elementPtr)); + objFields.push_back({key, elementPtr}); } + + jsonView = std::make_shared(objFields); return simdjson::SUCCESS; } - case simdjson::ondemand::json_type::number: + case simdjson::ondemand::json_type::number: { + std::string_view rawJsonv = value.raw_json_token(); + + jsonView = std::make_shared(StringView(rawJsonv)); return value.get_double().error(); - case simdjson::ondemand::json_type::string: + } + case simdjson::ondemand::json_type::string: { + auto rawJsonv = StringView(value.raw_json_token()); + + jsonView = std::make_shared(rawJsonv); return value.get_string().error(); - case simdjson::ondemand::json_type::boolean: + } + + case simdjson::ondemand::json_type::boolean: { + auto rawJsonv = StringView(value.raw_json_token()); + + jsonView = std::make_shared(rawJsonv); return value.get_bool().error(); + } + case simdjson::ondemand::json_type::null: { SIMDJSON_ASSIGN_OR_RAISE(auto isNull, value.is_null()); + auto rawJsonv = StringView(value.raw_json_token()); + + jsonView = std::make_shared(rawJsonv); return isNull ? simdjson::SUCCESS : simdjson::N_ATOM_ERROR; } } diff --git a/velox/functions/prestosql/tests/JsonFunctionsTest.cpp b/velox/functions/prestosql/tests/JsonFunctionsTest.cpp index 067d374411f30..003fdbe220230 100644 --- a/velox/functions/prestosql/tests/JsonFunctionsTest.cpp +++ b/velox/functions/prestosql/tests/JsonFunctionsTest.cpp @@ -14,6 +14,7 @@ * limitations under the License. */ +#include "folly/Unicode.h" #include "velox/common/base/tests/GTestUtils.h" #include "velox/functions/prestosql/tests/utils/FunctionBaseTest.h" #include "velox/functions/prestosql/types/JsonType.h" @@ -189,13 +190,18 @@ TEST_F(JsonFunctionsTest, jsonParse) { }; EXPECT_EQ(jsonParse(std::nullopt), std::nullopt); + // Spaces before and after. + EXPECT_EQ(jsonParse(R"( "abc" )"), R"("abc")"); EXPECT_EQ(jsonParse(R"(true)"), "true"); EXPECT_EQ(jsonParse(R"(null)"), "null"); EXPECT_EQ(jsonParse(R"(42)"), "42"); EXPECT_EQ(jsonParse(R"("abc")"), R"("abc")"); - EXPECT_EQ(jsonParse(R"([1, 2, 3])"), "[1, 2, 3]"); - EXPECT_EQ(jsonParse(R"({"k1":"v1"})"), R"({"k1":"v1"})"); - EXPECT_EQ(jsonParse(R"(["k1", "v1"])"), R"(["k1", "v1"])"); + EXPECT_EQ(jsonParse("\"abc\u4FE1\""), "\"abc\u4FE1\""); + auto utf32cp = folly::codePointToUtf8(U'😀'); + EXPECT_EQ(jsonParse(fmt::format("\"{}\"", utf32cp)), "\"😀\""); + EXPECT_EQ(jsonParse(R"([1, 2, 3])"), "[1,2,3]"); + EXPECT_EQ(jsonParse(R"({"k1": "v1" })"), R"({"k1":"v1"})"); + EXPECT_EQ(jsonParse(R"(["k1", "v1"])"), R"(["k1","v1"])"); VELOX_ASSERT_THROW( jsonParse(R"({"k1":})"), "The JSON document has an improper structure"); @@ -276,6 +282,74 @@ TEST_F(JsonFunctionsTest, jsonParse) { } } +TEST_F(JsonFunctionsTest, canonicalization) { + const auto jsonParse = [&](std::optional value) { + return evaluateOnce("json_parse(c0)", value); + }; + + auto json = R"({ + "menu": { + "id": "file", + "value": "File", + "popup": { + "menuitem": [ + { + "value": "New", + "onclick": "CreateNewDoc() " + }, + { + "value": "Open", + "onclick": "OpenDoc() " + }, + { + "value": "Close", + "onclick": "CloseDoc() " + } + ] + } + } + })"; + + StringView expectedJson = + R"({"menu":{"id":"file","popup":{"menuitem":[{"onclick":"CreateNewDoc() ","value":"New"},{"onclick":"OpenDoc() ","value":"Open"},{"onclick":"CloseDoc() ","value":"Close"}]},"value":"File"}})"; + EXPECT_EQ(jsonParse(json), expectedJson); + + json = + "{\n" + " \"name\": \"John Doe\",\n" + " \"address\": {\n" + " \"street\": \"123 Main St\",\n" + " \"city\": \"Anytown\",\n" + " \"state\": \"CA\",\n" + " \"zip\": \"12345\"\n" + " },\n" + " \"phoneNumbers\": [\n" + " {\n" + " \"type\": \"home\",\n" + " \"number\": \"555-1234\"\n" + " },\n" + " {\n" + " \"type\": \"work\",\n" + " \"number\": \"555-5678\"\n" + " }\n" + " ],\n" + " \"familyMembers\": [\n" + " {\n" + " \"name\": \"Jane Doe\",\n" + " \"relationship\": \"wife\"\n" + " },\n" + " {\n" + " \"name\": \"Jimmy Doe\",\n" + " \"relationship\": \"son\"\n" + " }\n" + " ],\n" + " \"hobbies\": [\"golf\", \"reading\", \"traveling\"]\n" + "}"; + expectedJson = + R"({"address":{"city":"Anytown","state":"CA","street":"123 Main St","zip":"12345"},"familyMembers":[{"name":"Jane Doe","relationship":"wife"},{"name":"Jimmy Doe","relationship":"son"}],"hobbies":["golf","reading","traveling"],"name":"John Doe","phoneNumbers":[{"number":"555-1234","type":"home"},{"number":"555-5678","type":"work"}]})"; + EXPECT_EQ(jsonParse(json), expectedJson); +} + TEST_F(JsonFunctionsTest, isJsonScalarSignatures) { auto signatures = getSignatureStrings("is_json_scalar"); ASSERT_EQ(2, signatures.size());