diff --git a/velox/functions/prestosql/JsonFunctions.cpp b/velox/functions/prestosql/JsonFunctions.cpp index 8628a669d828..01ea6a627fe1 100644 --- a/velox/functions/prestosql/JsonFunctions.cpp +++ b/velox/functions/prestosql/JsonFunctions.cpp @@ -14,11 +14,95 @@ * limitations under the License. */ #include "velox/expression/VectorFunction.h" +#include "velox/functions/prestosql/json/JsonStringUtil.h" #include "velox/functions/prestosql/json/SIMDJsonUtil.h" #include "velox/functions/prestosql/types/JsonType.h" namespace facebook::velox::functions { +namespace { +const std::string_view kArrayStart = "["; +const std::string_view kArrayEnd = "]"; +const std::string_view kSeparator = ","; +const std::string_view kObjectStart = "{"; +const std::string_view kObjectEnd = "}"; +const std::string_view kObjectKeySeparator = ":"; + +using JsonViews = std::vector; + +inline void addOrMergeViews(JsonViews& jsonViews, std::string_view view) { + if (jsonViews.empty()) { + jsonViews.push_back(view); + return; + } + + auto& lastView = jsonViews.back(); + + if (lastView.data() + lastView.size() == view.data()) { + lastView = std::string_view(lastView.data(), lastView.size() + view.size()); + } else { + jsonViews.push_back(view); + } +} + +/// Class to keep track of json strings being written +/// in to a buffer. The size of the backing buffer must be known during +/// construction time. +class BufferTracker { + public: + explicit BufferTracker(BufferPtr buffer) : curPos_(0), currentViewStart_(0) { + bufPtr_ = buffer->asMutable(); + capacity = buffer->capacity(); + } + + /// Write out all the views to the buffer. + auto getCanonicalString(JsonViews& jsonViews) { + for (auto view : jsonViews) { + trimEscapeWriteToBuffer(view); + } + return getStringView(); + } + + /// Sets current view to the end of the previous string. + /// Should be called only after getCanonicalString , + /// as after this call the previous view is lost. + void startNewString() { + currentViewStart_ += curPos_; + curPos_ = 0; + } + + private: + /// Trims whitespace and escapes utf characters before writing to buffer. + void trimEscapeWriteToBuffer(std::string_view input) { + auto trimmed = velox::util::trimWhiteSpace(input.data(), input.size()); + auto curBufPtr = getCurrentBufferPtr(); + auto bytesWritten = + prestoJavaEscapeString(trimmed.data(), trimmed.size(), curBufPtr); + incrementCounter(bytesWritten); + } + + /// Returns current string view against the buffer. + std::string_view getStringView() { + return std::string_view(bufPtr_ + currentViewStart_, curPos_); + } + + inline char* getCurrentBufferPtr() { + return bufPtr_ + currentViewStart_ + curPos_; + } + + void incrementCounter(size_t increment) { + VELOX_DCHECK_LE(curPos_ + currentViewStart_ + increment, capacity); + curPos_ += increment; + } + + size_t capacity; + size_t curPos_; + size_t currentViewStart_; + char* bufPtr_; +}; + +} // namespace + namespace { class JsonFormatFunction : public exec::VectorFunction { public: @@ -84,38 +168,75 @@ class JsonParseFunction : public exec::VectorFunction { auto value = arg->as>()->valueAt(0); paddedInput_.resize(value.size() + simdjson::SIMDJSON_PADDING); memcpy(paddedInput_.data(), value.data(), value.size()); - if (auto error = parse(value.size())) { + auto escapeSize = escapedStringSize(value.data(), value.size()); + auto buffer = AlignedBuffer::allocate(escapeSize, context.pool()); + BufferTracker bufferTracker{buffer}; + + JsonViews jsonViews; + + if (auto error = parse(value.size(), jsonViews)) { context.setErrors(rows, errors_[error]); return; } - localResult = std::make_shared>( - context.pool(), rows.end(), false, JSON(), std::move(value)); + + BufferPtr stringViews = + AlignedBuffer::allocate(1, context.pool(), StringView()); + auto rawStringViews = stringViews->asMutable(); + rawStringViews[0] = + StringView(bufferTracker.getCanonicalString(jsonViews)); + + auto constantBase = std::make_shared>( + context.pool(), + JSON(), + nullptr, + 1, + stringViews, + std::vector{buffer}); + + localResult = BaseVector::wrapInConstant(rows.end(), 0, constantBase); + } else { auto flatInput = arg->asFlatVector(); + BufferPtr stringViews = AlignedBuffer::allocate( + rows.end(), context.pool(), StringView()); + auto rawStringViews = stringViews->asMutable(); - auto stringBuffers = flatInput->stringBuffers(); VELOX_CHECK_LE(rows.end(), flatInput->size()); size_t maxSize = 0; + size_t totalOutputSize = 0; rows.applyToSelected([&](auto row) { auto value = flatInput->valueAt(row); maxSize = std::max(maxSize, value.size()); + totalOutputSize += escapedStringSize(value.data(), value.size()); }); + paddedInput_.resize(maxSize + simdjson::SIMDJSON_PADDING); + BufferPtr buffer = + AlignedBuffer::allocate(totalOutputSize, context.pool()); + BufferTracker bufferTracker{buffer}; + rows.applyToSelected([&](auto row) { + JsonViews jsonViews; auto value = flatInput->valueAt(row); memcpy(paddedInput_.data(), value.data(), value.size()); - if (auto error = parse(value.size())) { + if (auto error = parse(value.size(), jsonViews)) { context.setVeloxExceptionError(row, errors_[error]); + } else { + auto canonicalString = bufferTracker.getCanonicalString(jsonViews); + + rawStringViews[row] = StringView(canonicalString); + bufferTracker.startNewString(); } }); + localResult = std::make_shared>( context.pool(), JSON(), nullptr, rows.end(), - flatInput->values(), - std::move(stringBuffers)); + stringViews, + std::vector{buffer}); } context.moveOrCopyResult(localResult, rows, result); @@ -130,11 +251,11 @@ class JsonParseFunction : public exec::VectorFunction { } private: - simdjson::error_code parse(size_t size) const { + simdjson::error_code parse(size_t size, JsonViews& jsonViews) const { simdjson::padded_string_view paddedInput( paddedInput_.data(), size, paddedInput_.size()); SIMDJSON_ASSIGN_OR_RAISE(auto doc, simdjsonParse(paddedInput)); - SIMDJSON_TRY(validate(doc)); + SIMDJSON_TRY(validate(doc, jsonViews)); if (!doc.at_end()) { return simdjson::TRAILING_CONTENT; } @@ -142,33 +263,94 @@ class JsonParseFunction : public exec::VectorFunction { } template - static simdjson::error_code validate(T value) { + static simdjson::error_code validate(T value, JsonViews& jsonViews) { SIMDJSON_ASSIGN_OR_RAISE(auto type, value.type()); switch (type) { case simdjson::ondemand::json_type::array: { SIMDJSON_ASSIGN_OR_RAISE(auto array, value.get_array()); + + jsonViews.push_back(kArrayStart); + auto jsonViewsSize = jsonViews.size(); for (auto elementOrError : array) { SIMDJSON_ASSIGN_OR_RAISE(auto element, elementOrError); - SIMDJSON_TRY(validate(element)); + SIMDJSON_TRY(validate(element, jsonViews)); + jsonViews.push_back(kSeparator); + } + + // If the array is not empty, remove the last separator. + if (jsonViews.size() > jsonViewsSize) { + jsonViews.pop_back(); } + + jsonViews.push_back(kArrayEnd); + return simdjson::SUCCESS; } + case simdjson::ondemand::json_type::object: { SIMDJSON_ASSIGN_OR_RAISE(auto object, value.get_object()); + + std::vector> objFields; for (auto fieldOrError : object) { SIMDJSON_ASSIGN_OR_RAISE(auto field, fieldOrError); - SIMDJSON_TRY(validate(field.value())); + auto key = field.key_raw_json_token(); + JsonViews elementArray; + SIMDJSON_TRY(validate(field.value(), elementArray)); + objFields.push_back({key, elementArray}); } + + std::sort(objFields.begin(), objFields.end(), [](auto& a, auto& b) { + // Remove the quotes from the keys before we sort them. + auto af = std::string_view{a.first.data() + 1, a.first.size() - 2}; + auto bf = std::string_view{b.first.data() + 1, b.first.size() - 2}; + return lessThan(a.first, b.first); + }); + + jsonViews.push_back(kObjectStart); + + for (auto i = 0; i < objFields.size(); i++) { + auto field = objFields[i]; + addOrMergeViews(jsonViews, field.first); + jsonViews.push_back(kObjectKeySeparator); + + for (auto& element : field.second) { + addOrMergeViews(jsonViews, element); + } + + if (i < objFields.size() - 1) { + jsonViews.push_back(kSeparator); + } + } + + jsonViews.push_back(kObjectEnd); return simdjson::SUCCESS; } - case simdjson::ondemand::json_type::number: + + case simdjson::ondemand::json_type::number: { + SIMDJSON_ASSIGN_OR_RAISE(auto rawJson, value.raw_json()); + addOrMergeViews(jsonViews, rawJson); + return value.get_double().error(); - case simdjson::ondemand::json_type::string: + } + case simdjson::ondemand::json_type::string: { + SIMDJSON_ASSIGN_OR_RAISE(auto rawJson, value.raw_json()); + addOrMergeViews(jsonViews, rawJson); + return value.get_string().error(); - case simdjson::ondemand::json_type::boolean: + } + + case simdjson::ondemand::json_type::boolean: { + SIMDJSON_ASSIGN_OR_RAISE(auto rawJson, value.raw_json()); + addOrMergeViews(jsonViews, rawJson); + return value.get_bool().error(); + } + case simdjson::ondemand::json_type::null: { SIMDJSON_ASSIGN_OR_RAISE(auto isNull, value.is_null()); + SIMDJSON_ASSIGN_OR_RAISE(auto rawJson, value.raw_json()); + addOrMergeViews(jsonViews, rawJson); + return isNull ? simdjson::SUCCESS : simdjson::N_ATOM_ERROR; } } diff --git a/velox/functions/prestosql/json/JsonStringUtil.cpp b/velox/functions/prestosql/json/JsonStringUtil.cpp index 43be101ec40c..84193d280594 100644 --- a/velox/functions/prestosql/json/JsonStringUtil.cpp +++ b/velox/functions/prestosql/json/JsonStringUtil.cpp @@ -18,6 +18,7 @@ #include "folly/Unicode.h" #include "velox/common/base/Exceptions.h" +#include "velox/external/utf8proc/utf8procImpl.h" #include "velox/functions/lib/Utf8Utils.h" #include "velox/functions/prestosql/json/JsonStringUtil.h" @@ -31,6 +32,20 @@ FOLLY_ALWAYS_INLINE char hexDigit(uint8_t c) { return c < 10 ? c + '0' : c - 10 + 'A'; } +FOLLY_ALWAYS_INLINE int32_t digitToHex(char c) { + if (c >= '0' && c <= '9') { + return c - '0'; + } + if (c >= 'A' && c <= 'F') { + return c - 'A' + 10; + } + if (c >= 'a' && c <= 'f') { + return c - 'a' + 10; + } + + VELOX_USER_FAIL("Invalid escape digit: {}", c); +} + FOLLY_ALWAYS_INLINE void writeHex(char16_t value, char*& out) { value = folly::Endian::little(value); *out++ = '\\'; @@ -181,4 +196,222 @@ size_t escapedStringSize(const char* input, size_t length) { return outSize; } +namespace { + +int32_t parseHex(const std::string_view& hexString) { + int32_t result = 0; + for (auto c : hexString) { + result = (result << 4) + digitToHex(c); + } + + return result; +} + +bool isHighSurrogate(int32_t code_point) { + return code_point >= 0xD800 && code_point <= 0xDBFF; +} + +bool isLowSurrogate(int32_t code_point) { + return code_point >= 0xDC00 && code_point <= 0xDFFF; +} + +bool isSpecialCode(int32_t codePoint) { + // Java implementation ignores some code points + // in the first plane from 0x00 to 0x1F. + + if (codePoint >= 0 && codePoint <= 0x1F) { + return true; + } + + return false; +} + +// Gets codepoint of a char and increments position pos +// in view to next char. +int32_t getEscapedChar(std::string_view view, size_t& pos) { + if (FOLLY_UNLIKELY(view[pos] == '\\')) { + switch (view[pos + 1]) { + case '/': + pos++; + return '/'; + case 'u': { + if (pos + 5 > view.size()) { + VELOX_USER_FAIL("Invalid escape sequence at the end of string"); + } + + // Read 4 hex digits. + auto codePoint = parseHex(std::string_view(view.data() + pos + 2, 4)); + pos += 6; + return codePoint; + } + case '"': + pos += 2; + return '"'; + case 'b': + pos += 2; + return '\b'; + case 'n': + pos += 2; + return '\n'; + case 'f': + pos += 2; + return '\f'; + case 'r': + pos += 2; + return '\r'; + case 't': + pos += 2; + return '\t'; + + default: + VELOX_USER_FAIL("Bad escape character in view {}", view); + } + } + + // Not escaped, just return character. + return view[pos++]; +} + +int32_t compareChars( + const std::string_view& first, + const std::string_view& second, + size_t& i, + size_t& j) { + // Both are ASCII. + if (FOLLY_LIKELY(!(first[i] & 0x80) && !(second[j] & 0x80))) { + // Check if escaped. + auto firstChar = getEscapedChar(first, i); + auto secondChar = getEscapedChar(second, j); + return firstChar - secondChar; + } else { + // Assume unicode. + uint32_t firstCodePoint = 0; + uint32_t secondCodePoint = 0; + auto firstSize = 0; + auto secondSize = 0; + if (first[i] & 0x80) { + firstCodePoint = utf8proc_codepoint( + first.data() + i, first.data() + first.size(), firstSize); + } else { + firstCodePoint = first[i]; + } + + if (second[j] & 0x80) { + secondCodePoint = utf8proc_codepoint( + second.data() + j, second.data() + second.size(), secondSize); + } else { + secondCodePoint = second[j]; + } + + i += firstSize > 0 ? firstSize : 1; + j += secondSize > 0 ? secondSize : 1; + return firstCodePoint - secondCodePoint; + } +} +} // namespace + +bool lessThan(const std::string_view& first, const std::string_view& second) { + size_t firstLength = first.size(); + size_t secondLength = second.size(); + size_t minLength = std::min(firstLength, secondLength); + + for (size_t i = 0, j = 0; i < minLength && j < minLength;) { + auto result = compareChars(first, second, i, j); + if (result != 0) { + return result < 0; + } + } + + return firstLength < secondLength; +} + +size_t prestoJavaEscapeString(const char* input, size_t length, char* output) { + char* pos = output; + + auto* start = reinterpret_cast(input); + auto* end = reinterpret_cast(input + length); + while (start < end) { + int count = validateAndGetNextUtf8Length(start, end); + switch (count) { + case 1: { + // Unescape characters that are escaped by \ character. + if (FOLLY_UNLIKELY(*start == '\\')) { + if (start + 1 == end) { + VELOX_USER_FAIL("Invalid escape sequence at the end of string"); + } + // Presto java implementation only unescapes the / character. + switch (*(start + 1)) { + case '/': + *pos++ = '/'; + start += 2; + continue; + case 'u': { + if (start + 5 > end) { + VELOX_USER_FAIL("Invalid escape sequence at the end of string"); + } + + // Read 4 hex digits. + auto codePoint = parseHex(std::string_view( + reinterpret_cast(start) + 2, 4)); + + // Presto java implementation doesnt unescape surrogate pairs. + // Thus we just write it out in the same way as it is. + if (isHighSurrogate(codePoint) || isLowSurrogate(codePoint) || + isSpecialCode(codePoint)) { + memcpy(pos, reinterpret_cast(start), 6); + pos += 6; + start += 6; + continue; + } + + // Otherwise write it as a single code point. + auto increment = utf8proc_encode_char( + codePoint, reinterpret_cast(pos)); + pos += increment; + start += 6; + continue; + } + default: + *pos++ = *start; + *pos++ = *(start + 1); + start += 2; + continue; + } + } else { + *pos++ = *start; + start++; + continue; + } + } + case 2: { + memcpy(pos, reinterpret_cast(start), 2); + pos += 2; + start += 2; + continue; + } + case 3: { + memcpy(pos, reinterpret_cast(start), 3); + pos += 3; + start += 3; + continue; + } + case 4: { + char32_t codePoint = folly::utf8ToCodePoint(start, end, true); + if (codePoint == U'\ufffd') { + writeHex(0xFFFDu, pos); + continue; + } + encodeUtf16Hex(codePoint, pos); + continue; + } + default: { + writeHex(0xFFFDu, pos); + start++; + } + } + } + + return (pos - output); +} + } // namespace facebook::velox diff --git a/velox/functions/prestosql/json/JsonStringUtil.h b/velox/functions/prestosql/json/JsonStringUtil.h index 65cadd86bf68..384f6b768bd1 100644 --- a/velox/functions/prestosql/json/JsonStringUtil.h +++ b/velox/functions/prestosql/json/JsonStringUtil.h @@ -40,13 +40,33 @@ namespace facebook::velox { /// responsible to allocate enough space for output. void escapeString(const char* input, size_t length, char* output); +/// Unescape the unicode characters of `input` to make it canonical for JSON +/// The behavior is compatible with Presto Java's json_parse. +/// Presto java json_parse will unescape the following characters: +/// \/ and non surrogate unicode characters. +/// Other behavior is similar to escapeString. +/// @param input: Input string to escape that is UTF-8 encoded. +/// @param length: Length of the input string. +/// @param output: Output string to write the escaped input to. The caller is +/// responsible to allocate enough space for output. +/// @return The number of bytes written to the output. +size_t prestoJavaEscapeString(const char* input, size_t length, char* output); + /// Return the size of string after the unicode characters of `input` are /// escaped using the method as in`escapeString`. The function will iterate /// over `input` once. /// @param input: Input string to escape that is UTF-8 encoded. /// @param length: Length of the input string. +/// @return The size of the string after escaping. size_t escapedStringSize(const char* input, size_t length); +/// Compares two string views. The comparison takes into account +/// escape sequences and also unicode characters. +/// Returns true if first is less than second else false. +/// @param first: First string to compare. +/// @param second: Second string to compare. +bool lessThan(const std::string_view& first, const std::string_view& second); + /// For test only. Encode `codePoint` value by UTF-16 and write the one or two /// prefixed hexadecimals to `out`. Move `out` forward by 6 or 12 chars /// accordingly. The caller shall ensure there is enough space in `out`. diff --git a/velox/functions/prestosql/tests/JsonFunctionsTest.cpp b/velox/functions/prestosql/tests/JsonFunctionsTest.cpp index 067d374411f3..b78606ac5b40 100644 --- a/velox/functions/prestosql/tests/JsonFunctionsTest.cpp +++ b/velox/functions/prestosql/tests/JsonFunctionsTest.cpp @@ -14,6 +14,7 @@ * limitations under the License. */ +#include "folly/Unicode.h" #include "velox/common/base/tests/GTestUtils.h" #include "velox/functions/prestosql/tests/utils/FunctionBaseTest.h" #include "velox/functions/prestosql/types/JsonType.h" @@ -71,6 +72,13 @@ class JsonFunctionsTest : public functions::test::FunctionBaseTest { return makeNullableFlatVector({s}, JSON()); } + void testJsonParse(std::string json, std::string expectedJson) { + auto data = makeRowVector({makeFlatVector({json})}); + auto result = evaluate("json_parse(c0)", data); + auto expected = makeFlatVector({expectedJson}, JSON()); + velox::test::assertEqualVectors(expected, result); + } + std::pair makeVectors(std::optional json) { std::optional s = json.has_value() ? std::make_optional(StringView(json.value())) @@ -189,13 +197,39 @@ TEST_F(JsonFunctionsTest, jsonParse) { }; EXPECT_EQ(jsonParse(std::nullopt), std::nullopt); + // Spaces before and after. + EXPECT_EQ(jsonParse(R"( "abc" )"), R"("abc")"); EXPECT_EQ(jsonParse(R"(true)"), "true"); EXPECT_EQ(jsonParse(R"(null)"), "null"); EXPECT_EQ(jsonParse(R"(42)"), "42"); EXPECT_EQ(jsonParse(R"("abc")"), R"("abc")"); - EXPECT_EQ(jsonParse(R"([1, 2, 3])"), "[1, 2, 3]"); - EXPECT_EQ(jsonParse(R"({"k1":"v1"})"), R"({"k1":"v1"})"); - EXPECT_EQ(jsonParse(R"(["k1", "v1"])"), R"(["k1", "v1"])"); + EXPECT_EQ(jsonParse("\"abc\u4FE1\""), "\"abc\u4FE1\""); + auto utf32cp = folly::codePointToUtf8(U'😀'); + testJsonParse(fmt::format("\"{}\"", utf32cp), R"("\uD83D\uDE00")"); + EXPECT_EQ(jsonParse(R"([1, 2, 3])"), "[1,2,3]"); + EXPECT_EQ(jsonParse(R"({"k1": "v1" })"), R"({"k1":"v1"})"); + EXPECT_EQ(jsonParse(R"(["k1", "v1"])"), R"(["k1","v1"])"); + testJsonParse(R"({ "abc" : "\/"})", R"({"abc":"/"})"); + testJsonParse(R"({ "abc" : "\\/"})", R"({"abc":"\\/"})"); + testJsonParse(R"({ "abc" : [1, 2, 3, 4 ]})", R"({"abc":[1,2,3,4]})"); + // Test out with unicodes and empty keys. + testJsonParse( + R"({"4":0.1,"\"":0.14, "自社在庫":0.1, "٢": 2.0, "١": 1.0, "१": 1.0, "": 3.5})", + R"({"":3.5,"\"":0.14,"4":0.1,"١":1.0,"٢":2.0,"१":1.0,"自社在庫":0.1})"); + testJsonParse( + R"({"error":"Falha na configura\u00e7\u00e3o do pagamento"})", + R"({"error":"Falha na configuração do pagamento"})"); + // Test unicode in key and surogate pairs in values. + testJsonParse( + R"({"utf\u4FE1": "\u4FE1 \uD83D\uDE00 \/ \n abc a\uD834\uDD1Ec \u263Acba \u0002 \u001F \u0020"})", + R"({"utf信":"信 \uD83D\uDE00 / \n abc a\uD834\uDD1Ec ☺cba \u0002 \u001F "})"); + testJsonParse( + R"({"v\u06ecfzo-\u04fbyw\u25d6#\u2adc\u27e6\u0494\u090e":0.74,"\u042d\u25eb\u03fe)\u044c\u25cb\u2184e":0.89})", + R"({"v۬fzo-ӻyw◖#⫝̸⟦Ҕऎ":0.74,"Э◫Ͼ)ь○ↄe":0.89})"); + // Test special unicode characters. + testJsonParse( + R"({"utf\u4FE1": "\u0002 \u001F \u0020"})", + R"({"utf信":"\u0002 \u001F "})"); VELOX_ASSERT_THROW( jsonParse(R"({"k1":})"), "The JSON document has an improper structure"); @@ -228,7 +262,7 @@ TEST_F(JsonFunctionsTest, jsonParse) { VELOX_ASSERT_THROW( evaluate("json_parse(c0)", data), - "Unexpected trailing content in the JSON input"); + "TAPE_ERROR: The JSON document has an improper structure: missing or superfluous commas, braces, missing keys, etc."); data = makeRowVector({makeFlatVector( {R"("This is a long sentence")", R"("This is some other sentence")"})}); @@ -276,6 +310,100 @@ TEST_F(JsonFunctionsTest, jsonParse) { } } +TEST_F(JsonFunctionsTest, canonicalization) { + auto json = R"({ + "menu": { + "id": "file", + "value": "File", + "popup": { + "menuitem": [ + { + "value": "New", + "onclick": "CreateNewDoc() " + }, + { + "value": "Open", + "onclick": "OpenDoc() " + }, + { + "value": "Close", + "onclick": "CloseDoc() " + } + ] + } + } + })"; + + auto expectedJson = + R"({"menu":{"id":"file","popup":{"menuitem":[{"onclick":"CreateNewDoc() ","value":"New"},{"onclick":"OpenDoc() ","value":"Open"},{"onclick":"CloseDoc() ","value":"Close"}]},"value":"File"}})"; + testJsonParse(json, expectedJson); + + json = + "{\n" + " \"name\": \"John Doe\",\n" + " \"address\": {\n" + " \"street\": \"123 Main St\",\n" + " \"city\": \"Anytown\",\n" + " \"state\": \"CA\",\n" + " \"zip\": \"12345\"\n" + " },\n" + " \"phoneNumbers\": [\n" + " {\n" + " \"type\": \"home\",\n" + " \"number\": \"555-1234\"\n" + " },\n" + " {\n" + " \"type\": \"work\",\n" + " \"number\": \"555-5678\"\n" + " }\n" + " ],\n" + " \"familyMembers\": [\n" + " {\n" + " \"name\": \"Jane Doe\",\n" + " \"relationship\": \"wife\"\n" + " },\n" + " {\n" + " \"name\": \"Jimmy Doe\",\n" + " \"relationship\": \"son\"\n" + " }\n" + " ],\n" + " \"hobbies\": [\"golf\", \"reading\", \"traveling\"]\n" + "}"; + expectedJson = + R"({"address":{"city":"Anytown","state":"CA","street":"123 Main St","zip":"12345"},"familyMembers":[{"name":"Jane Doe","relationship":"wife"},{"name":"Jimmy Doe","relationship":"son"}],"hobbies":["golf","reading","traveling"],"name":"John Doe","phoneNumbers":[{"number":"555-1234","type":"home"},{"number":"555-5678","type":"work"}]})"; + testJsonParse(json, expectedJson); + + // Json with spaces in keys + json = R"({ + "menu": { + "id": "file", + "value": "File", + "emptyArray": [], + "popup": { + "menuitem": [ + { + "value ": "New ", + "onclick": "CreateNewDoc() ", + " value ": " Space " + } + ] + } + } + })"; + + expectedJson = + R"({"menu":{"emptyArray":[],"id":"file","popup":{"menuitem":[{" value ":" Space ","onclick":"CreateNewDoc() ","value ":"New "}]},"value":"File"}})"; + testJsonParse(json, expectedJson); + + json = + R"({"stars":[{"updated_deferred_payout_state":null,"onboard_surface":"MTA_ON_MOBILE","entry_point":"FROM_STARS","task_name":null,"event":"START_APPLICATION","time":1678975122,"user_id":123456789123456},{"updated_deferred_payout_state":null,"onboard_surface":"MTA_ON_MOBILE","entry_point":"FROM_STARS","task_name":"STARS_SIGN_TOS","event":"START_TASK","time":1678975122,"user_id":123456789123456},{"updated_deferred_payout_state":null,"onboard_surface":"MTA_ON_MOBILE","entry_point":"FROM_STARS","task_name":"STARS_SIGN_TOS","event":"COMPLETE_TASK","time":1678975128,"user_id":123456789123456},{"error":null,"updated_deferred_payout_state":null,"onboard_surface":"MTA_ON_MOBILE","entry_point":"FROM_MOBILE_PRO_DASH","task_name":null,"event":"START_APPLICATION","time":1706866395,"user_id":123456789123456},{"error":null,"updated_deferred_payout_state":null,"onboard_surface":"MTA_ON_MOBILE","entry_point":"FROM_MOBILE_PRO_DASH","task_name":"STARS_DEFERRED_PAYOUT_WITH_TOS","event":"START_TASK","time":1706866395,"user_id":123456789123456},{"error":null,"updated_deferred_payout_state":"PAYOUT_SETUP_DEFERRED","onboard_surface":"MTA_ON_MOBILE","entry_point":"FROM_MOBILE_PRO_DASH","task_name":"STARS_DEFERRED_PAYOUT_WITH_TOS","event":"COMPLETE_TASK","time":1706866402,"user_id":123456789123456},{"error":null,"updated_deferred_payout_state":null,"onboard_surface":"MTA_ON_MOBILE","entry_point":"FROM_MOBILE_PRO_DASH","task_name":null,"event":"SUBMIT_APPLICATION","time":1706866402,"user_id":123456789123456},{"error":null,"updated_deferred_payout_state":null,"onboard_surface":"MTA_ON_MOBILE","entry_point":"FROM_MOBILE_PRO_DASH","task_name":null,"event":"APPLICATION_APPROVED","time":1706866402,"user_id":123456789123456}]})"; + + expectedJson = + R"({"stars":[{"entry_point":"FROM_STARS","event":"START_APPLICATION","onboard_surface":"MTA_ON_MOBILE","task_name":null,"time":1678975122,"updated_deferred_payout_state":null,"user_id":123456789123456},{"entry_point":"FROM_STARS","event":"START_TASK","onboard_surface":"MTA_ON_MOBILE","task_name":"STARS_SIGN_TOS","time":1678975122,"updated_deferred_payout_state":null,"user_id":123456789123456},{"entry_point":"FROM_STARS","event":"COMPLETE_TASK","onboard_surface":"MTA_ON_MOBILE","task_name":"STARS_SIGN_TOS","time":1678975128,"updated_deferred_payout_state":null,"user_id":123456789123456},{"entry_point":"FROM_MOBILE_PRO_DASH","error":null,"event":"START_APPLICATION","onboard_surface":"MTA_ON_MOBILE","task_name":null,"time":1706866395,"updated_deferred_payout_state":null,"user_id":123456789123456},{"entry_point":"FROM_MOBILE_PRO_DASH","error":null,"event":"START_TASK","onboard_surface":"MTA_ON_MOBILE","task_name":"STARS_DEFERRED_PAYOUT_WITH_TOS","time":1706866395,"updated_deferred_payout_state":null,"user_id":123456789123456},{"entry_point":"FROM_MOBILE_PRO_DASH","error":null,"event":"COMPLETE_TASK","onboard_surface":"MTA_ON_MOBILE","task_name":"STARS_DEFERRED_PAYOUT_WITH_TOS","time":1706866402,"updated_deferred_payout_state":"PAYOUT_SETUP_DEFERRED","user_id":123456789123456},{"entry_point":"FROM_MOBILE_PRO_DASH","error":null,"event":"SUBMIT_APPLICATION","onboard_surface":"MTA_ON_MOBILE","task_name":null,"time":1706866402,"updated_deferred_payout_state":null,"user_id":123456789123456},{"entry_point":"FROM_MOBILE_PRO_DASH","error":null,"event":"APPLICATION_APPROVED","onboard_surface":"MTA_ON_MOBILE","task_name":null,"time":1706866402,"updated_deferred_payout_state":null,"user_id":123456789123456}]})"; + + testJsonParse(json, expectedJson); +} + TEST_F(JsonFunctionsTest, isJsonScalarSignatures) { auto signatures = getSignatureStrings("is_json_scalar"); ASSERT_EQ(2, signatures.size()); diff --git a/velox/type/Conversions.cpp b/velox/type/Conversions.cpp index c2cbdad99289..29e71497d2a9 100644 --- a/velox/type/Conversions.cpp +++ b/velox/type/Conversions.cpp @@ -30,9 +30,9 @@ namespace facebook::velox::util { /// folly's tryTo doesn't ignore control characters or other unicode whitespace. /// We trim the string for control and unicode whitespace /// from both directions and return a StringView of the result. -StringView trimWhiteSpace(const char* data, size_t length) { +std::string_view trimWhiteSpace(const char* data, size_t length) { if (length == 0) { - return StringView(data, 0); + return std::string_view(data, 0); } int startIndex = 0; @@ -67,7 +67,7 @@ StringView trimWhiteSpace(const char* data, size_t length) { } // Trim whitespace from right side. - for (auto i = length - 1; i > startIndex;) { + for (auto i = length - 1; i >= startIndex; i--) { size = 0; auto isWhiteSpaceOrControlChar = false; @@ -92,15 +92,11 @@ StringView trimWhiteSpace(const char* data, size_t length) { endIndex = i; break; } - - if (i > 0) { - i--; - } } // If we end on a unicode char make sure we add that to the end. auto charSize = size > 0 ? size : 1; - return StringView(data + startIndex, endIndex - startIndex + charSize); + return std::string_view(data + startIndex, endIndex - startIndex + charSize); } } // namespace facebook::velox::util diff --git a/velox/type/Conversions.h b/velox/type/Conversions.h index 25847b1a99b5..bab6e2a49e96 100644 --- a/velox/type/Conversions.h +++ b/velox/type/Conversions.h @@ -233,7 +233,7 @@ struct Converter { /// Presto compatible trim of whitespace. This also trims /// control characters from both front and back and returns /// a StringView of the trimmed string. -StringView trimWhiteSpace(const char* data, size_t length); +std::string_view trimWhiteSpace(const char* data, size_t length); /// To TINYINT, SMALLINT, INTEGER, BIGINT, and HUGEINT converter. template