Skip to content

Commit

Permalink
Add support for canonicalization of JSON.
Browse files Browse the repository at this point in the history
  • Loading branch information
kgpai committed Oct 28, 2024
1 parent 92779f9 commit 9bc80fb
Show file tree
Hide file tree
Showing 2 changed files with 228 additions and 22 deletions.
170 changes: 151 additions & 19 deletions velox/functions/prestosql/JsonFunctions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,92 @@
* limitations under the License.
*/
#include "velox/expression/VectorFunction.h"
#include "velox/functions/prestosql/json/JsonStringUtil.h"
#include "velox/functions/prestosql/json/SIMDJsonUtil.h"
#include "velox/functions/prestosql/types/JsonType.h"
#include "velox/type/Conversions.h"

namespace facebook::velox::functions {

StringView trimAndEscape(const char* data, size_t length) {
return velox::util::trimWhiteSpace(data, length);
}

namespace {
constexpr const char* kArrayStart = "[";
constexpr const char* kArrayEnd = "]";
constexpr const char* kSeparator = ",";
constexpr const char* kObjectStart = "{";
constexpr const char* kObjectEnd = "}";
constexpr const char* kObjectKeySeparator = ":";
constexpr const char* kQuote = "\"";

class JsonView {
public:
virtual void canonicalize(std::stringstream& stream) = 0;
};

using JsonViewPtr = std::shared_ptr<JsonView>;

struct JsonLeafView : public JsonView {
JsonLeafView(const StringView view) : view_(view) {};

void canonicalize(std::stringstream& stream) override {
auto canonicalValue = trimAndEscape(view_.data(), view_.size());
stream << canonicalValue;
}

private:
const StringView view_;
};

struct JsonArrayView : public JsonView {
JsonArrayView(const std::vector<JsonViewPtr> array) : array_(array) {};

void canonicalize(std::stringstream& stream) override {
stream << kArrayStart;
for (auto i = 0; i < array_.size(); i++) {
array_[i]->canonicalize(stream);
if (i < array_.size() - 1) {
stream << kSeparator;
}
}
stream << kArrayEnd;
}

private:
const std::vector<JsonViewPtr> array_;
};

struct JsonObjView : public JsonView {
JsonObjView(std::vector<std::pair<StringView, JsonViewPtr>> objFields)
: objFields_(objFields) {};

void canonicalize(std::stringstream& stream) override {
std::sort(objFields_.begin(), objFields_.end(), [](auto& a, auto& b) {
return a.first < b.first;
});

stream << kObjectStart;
for (auto i = 0; i < objFields_.size(); i++) {
auto field = objFields_[i];
auto trimmedKey = trimAndEscape(field.first.data(), field.first.size());

stream << kQuote << trimmedKey << kQuote << kObjectKeySeparator;
field.second->canonicalize(stream);
if (i < objFields_.size() - 1) {
stream << kSeparator;
}
}
stream << kObjectEnd;
}

private:
std::vector<std::pair<StringView, JsonViewPtr>> objFields_;
};

} // namespace

namespace {
class JsonFormatFunction : public exec::VectorFunction {
public:
Expand Down Expand Up @@ -84,14 +165,32 @@ class JsonParseFunction : public exec::VectorFunction {
auto value = arg->as<ConstantVector<StringView>>()->valueAt(0);
paddedInput_.resize(value.size() + simdjson::SIMDJSON_PADDING);
memcpy(paddedInput_.data(), value.data(), value.size());
if (auto error = parse(value.size())) {
auto canonicalStringStream = std::stringstream{};
JsonViewPtr jsonView;

if (auto error = parse(value.size(), jsonView)) {
context.setErrors(rows, errors_[error]);
return;
}
localResult = std::make_shared<ConstantVector<StringView>>(
context.pool(), rows.end(), false, JSON(), std::move(value));

jsonView->canonicalize(canonicalStringStream);
localResult = BaseVector::createConstant(
JSON(), canonicalStringStream.str(), rows.end(), context.pool());

} else {
auto flatInput = arg->asFlatVector<StringView>();
BufferPtr stringViews = AlignedBuffer::allocate<StringView>(
rows.end(), context.pool(), StringView());

// TODO: Optimize this
localResult = std::make_shared<FlatVector<StringView>>(
context.pool(),
JSON(),
nullptr,
rows.end(),
stringViews,
std::vector<BufferPtr>{});
auto flatResult = localResult->asFlatVector<StringView>();

auto stringBuffers = flatInput->stringBuffers();
VELOX_CHECK_LE(rows.end(), flatInput->size());
Expand All @@ -102,20 +201,23 @@ class JsonParseFunction : public exec::VectorFunction {
maxSize = std::max(maxSize, value.size());
});
paddedInput_.resize(maxSize + simdjson::SIMDJSON_PADDING);

auto canonicalStringStream = std::stringstream{};
JsonViewPtr jsonView;
rows.applyToSelected([&](auto row) {
auto value = flatInput->valueAt(row);
memcpy(paddedInput_.data(), value.data(), value.size());
if (auto error = parse(value.size())) {
if (auto error = parse(value.size(), jsonView)) {
context.setVeloxExceptionError(row, errors_[error]);
} else {
jsonView->canonicalize(canonicalStringStream);
auto canonicalString = canonicalStringStream.str();
// TODO: This creates a copy, can we optimize.
flatResult->set(
row, StringView(canonicalString.data(), canonicalString.size()));
canonicalStringStream.str(std::string());
}
});
localResult = std::make_shared<FlatVector<StringView>>(
context.pool(),
JSON(),
nullptr,
rows.end(),
flatInput->values(),
std::move(stringBuffers));
}

context.moveOrCopyResult(localResult, rows, result);
Expand All @@ -130,45 +232,75 @@ class JsonParseFunction : public exec::VectorFunction {
}

private:
simdjson::error_code parse(size_t size) const {
simdjson::error_code parse(size_t size, JsonViewPtr& jsonView) const {
simdjson::padded_string_view paddedInput(
paddedInput_.data(), size, paddedInput_.size());
SIMDJSON_ASSIGN_OR_RAISE(auto doc, simdjsonParse(paddedInput));
SIMDJSON_TRY(validate<simdjson::ondemand::document&>(doc));
SIMDJSON_TRY(validate<simdjson::ondemand::document&>(doc, jsonView));
if (!doc.at_end()) {
return simdjson::TRAILING_CONTENT;
}
return simdjson::SUCCESS;
}

template <typename T>
static simdjson::error_code validate(T value) {
static simdjson::error_code validate(T value, JsonViewPtr& jsonView) {
SIMDJSON_ASSIGN_OR_RAISE(auto type, value.type());
switch (type) {
case simdjson::ondemand::json_type::array: {
SIMDJSON_ASSIGN_OR_RAISE(auto array, value.get_array());

std::vector<JsonViewPtr> arrayPtr;
for (auto elementOrError : array) {
SIMDJSON_ASSIGN_OR_RAISE(auto element, elementOrError);
SIMDJSON_TRY(validate(element));
JsonViewPtr elementPtr;
SIMDJSON_TRY(validate(element, elementPtr));
arrayPtr.push_back(elementPtr);
}

jsonView = std::make_shared<JsonArrayView>(std::move(arrayPtr));
return simdjson::SUCCESS;
}
case simdjson::ondemand::json_type::object: {
SIMDJSON_ASSIGN_OR_RAISE(auto object, value.get_object());

std::vector<std::pair<StringView, JsonViewPtr>> objFields;
for (auto fieldOrError : object) {
SIMDJSON_ASSIGN_OR_RAISE(auto field, fieldOrError);
SIMDJSON_TRY(validate(field.value()));
JsonViewPtr elementPtr;
auto key = StringView(field.escaped_key());
SIMDJSON_TRY(validate(field.value(), elementPtr));
objFields.push_back({key, elementPtr});
}

jsonView = std::make_shared<JsonObjView>(objFields);
return simdjson::SUCCESS;
}
case simdjson::ondemand::json_type::number:
case simdjson::ondemand::json_type::number: {
std::string_view rawJsonv = value.raw_json_token();

jsonView = std::make_shared<JsonLeafView>(StringView(rawJsonv));
return value.get_double().error();
case simdjson::ondemand::json_type::string:
}
case simdjson::ondemand::json_type::string: {
auto rawJsonv = StringView(value.raw_json_token());

jsonView = std::make_shared<JsonLeafView>(rawJsonv);
return value.get_string().error();
case simdjson::ondemand::json_type::boolean:
}

case simdjson::ondemand::json_type::boolean: {
auto rawJsonv = StringView(value.raw_json_token());

jsonView = std::make_shared<JsonLeafView>(rawJsonv);
return value.get_bool().error();
}

case simdjson::ondemand::json_type::null: {
SIMDJSON_ASSIGN_OR_RAISE(auto isNull, value.is_null());
auto rawJsonv = StringView(value.raw_json_token());

jsonView = std::make_shared<JsonLeafView>(rawJsonv);
return isNull ? simdjson::SUCCESS : simdjson::N_ATOM_ERROR;
}
}
Expand Down
80 changes: 77 additions & 3 deletions velox/functions/prestosql/tests/JsonFunctionsTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
* limitations under the License.
*/

#include "folly/Unicode.h"
#include "velox/common/base/tests/GTestUtils.h"
#include "velox/functions/prestosql/tests/utils/FunctionBaseTest.h"
#include "velox/functions/prestosql/types/JsonType.h"
Expand Down Expand Up @@ -189,13 +190,18 @@ TEST_F(JsonFunctionsTest, jsonParse) {
};

EXPECT_EQ(jsonParse(std::nullopt), std::nullopt);
// Spaces before and after.
EXPECT_EQ(jsonParse(R"( "abc" )"), R"("abc")");
EXPECT_EQ(jsonParse(R"(true)"), "true");
EXPECT_EQ(jsonParse(R"(null)"), "null");
EXPECT_EQ(jsonParse(R"(42)"), "42");
EXPECT_EQ(jsonParse(R"("abc")"), R"("abc")");
EXPECT_EQ(jsonParse(R"([1, 2, 3])"), "[1, 2, 3]");
EXPECT_EQ(jsonParse(R"({"k1":"v1"})"), R"({"k1":"v1"})");
EXPECT_EQ(jsonParse(R"(["k1", "v1"])"), R"(["k1", "v1"])");
EXPECT_EQ(jsonParse("\"abc\u4FE1\""), "\"abc\u4FE1\"");
auto utf32cp = folly::codePointToUtf8(U'😀');
EXPECT_EQ(jsonParse(fmt::format("\"{}\"", utf32cp)), "\"😀\"");
EXPECT_EQ(jsonParse(R"([1, 2, 3])"), "[1,2,3]");
EXPECT_EQ(jsonParse(R"({"k1": "v1" })"), R"({"k1":"v1"})");
EXPECT_EQ(jsonParse(R"(["k1", "v1"])"), R"(["k1","v1"])");

VELOX_ASSERT_THROW(
jsonParse(R"({"k1":})"), "The JSON document has an improper structure");
Expand Down Expand Up @@ -276,6 +282,74 @@ TEST_F(JsonFunctionsTest, jsonParse) {
}
}

TEST_F(JsonFunctionsTest, canonicalization) {
const auto jsonParse = [&](std::optional<std::string> value) {
return evaluateOnce<StringView>("json_parse(c0)", value);
};

auto json = R"({
"menu": {
"id": "file",
"value": "File",
"popup": {
"menuitem": [
{
"value": "New",
"onclick": "CreateNewDoc() "
},
{
"value": "Open",
"onclick": "OpenDoc() "
},
{
"value": "Close",
"onclick": "CloseDoc() "
}
]
}
}
})";

StringView expectedJson =
R"({"menu":{"id":"file","popup":{"menuitem":[{"onclick":"CreateNewDoc() ","value":"New"},{"onclick":"OpenDoc() ","value":"Open"},{"onclick":"CloseDoc() ","value":"Close"}]},"value":"File"}})";
EXPECT_EQ(jsonParse(json), expectedJson);

json =
"{\n"
" \"name\": \"John Doe\",\n"
" \"address\": {\n"
" \"street\": \"123 Main St\",\n"
" \"city\": \"Anytown\",\n"
" \"state\": \"CA\",\n"
" \"zip\": \"12345\"\n"
" },\n"
" \"phoneNumbers\": [\n"
" {\n"
" \"type\": \"home\",\n"
" \"number\": \"555-1234\"\n"
" },\n"
" {\n"
" \"type\": \"work\",\n"
" \"number\": \"555-5678\"\n"
" }\n"
" ],\n"
" \"familyMembers\": [\n"
" {\n"
" \"name\": \"Jane Doe\",\n"
" \"relationship\": \"wife\"\n"
" },\n"
" {\n"
" \"name\": \"Jimmy Doe\",\n"
" \"relationship\": \"son\"\n"
" }\n"
" ],\n"
" \"hobbies\": [\"golf\", \"reading\", \"traveling\"]\n"
"}";
expectedJson =
R"({"address":{"city":"Anytown","state":"CA","street":"123 Main St","zip":"12345"},"familyMembers":[{"name":"Jane Doe","relationship":"wife"},{"name":"Jimmy Doe","relationship":"son"}],"hobbies":["golf","reading","traveling"],"name":"John Doe","phoneNumbers":[{"number":"555-1234","type":"home"},{"number":"555-5678","type":"work"}]})";
EXPECT_EQ(jsonParse(json), expectedJson);
}

TEST_F(JsonFunctionsTest, isJsonScalarSignatures) {
auto signatures = getSignatureStrings("is_json_scalar");
ASSERT_EQ(2, signatures.size());
Expand Down

0 comments on commit 9bc80fb

Please sign in to comment.