Skip to content

Commit

Permalink
feat: Add Spark get_json_object function (#11691)
Browse files Browse the repository at this point in the history
Summary:
This PR proposes an implementation for Spark get_json_object function
based on simdjson lib. This function returns a json object, represented by
VARCHAR, from json string by searching user-specified path.

Spark source code: [link](https://github.com/apache/spark/blob/v3.5.1/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala#L127).

Pull Request resolved: #11691

Reviewed By: xiaoxmeng

Differential Revision: D67119142

Pulled By: kgpai

fbshipit-source-id: f4a4259a1bd54c6bb6e7811480f764a9f1a0373a
  • Loading branch information
PHILO-HE authored and facebook-github-bot committed Dec 19, 2024
1 parent c9dee4e commit a1adafe
Show file tree
Hide file tree
Showing 5 changed files with 380 additions and 1 deletion.
35 changes: 34 additions & 1 deletion velox/docs/functions/spark/json.rst
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,42 @@
JSON Functions
==============

JSON Format
-----------

JSON is a language-independent data format that represents data as
human-readable text. A JSON text can represent a number, a boolean, a
string, an array, an object, or a null. A JSON text representing a string
must escape all characters and enclose the string in double quotes, e.g.,
``"123\n"``, whereas a JSON text representing a number does not need to,
e.g., ``123``. A JSON text representing an array must enclose the array
elements in square brackets, e.g., ``[1,2,3]``. More detailed grammar can
be found in `this JSON introduction`_.

.. _this JSON introduction: https://www.json.org

JSON Functions
--------------

.. spark:function:: get_json_object(jsonString, path) -> varchar
Returns the JSON value found at ``path`` in ``jsonString``, represented as VARCHAR.
A valid ``path`` must start with '$', followed by "[index]", "['field']" or ".field"
segments that define a JSON path. Here are some examples: "$.a", "$.a.b", "$[0]['a'].b". Returns
``jsonString`` if ``path`` is "$". Returns NULL if ``jsonString`` or ``path`` is malformed.
Returns NULL if ``path`` does not exist. ::

SELECT get_json_object('{"a":"b"}', '$.a'); -- 'b'
SELECT get_json_object('{"a":{"b":"c"}}', '$.a'); -- '{"b":"c"}'
SELECT get_json_object('{"a":3}', '$.b'); -- NULL (nonexistent field)
SELECT get_json_object('{"a"-3}', '$.a'); -- NULL (malformed JSON string)
SELECT get_json_object('{"a":3}', '.a'); -- NULL (malformed JSON path)

.. spark:function:: json_object_keys(jsonString) -> array(string)
Returns all the keys of the outermost JSON object as an array if a valid JSON object is given. If it is any other valid JSON string, an invalid JSON string or an empty string, the function returns null. ::
Returns all the keys of the outermost JSON object as an array if a valid JSON object is given.
If it is any other valid JSON string, an invalid JSON string or an empty string, the function
returns null. ::

SELECT json_object_keys('{}'); -- []
SELECT json_object_keys('{"name": "Alice", "age": 5, "id": "001"}'); -- ['name', 'age', 'id']
Expand Down
219 changes: 219 additions & 0 deletions velox/functions/sparksql/GetJsonObject.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,219 @@
/*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include <optional>
#include <sstream>
#include <string>
#include <vector>

#include "velox/functions/Macros.h"
#include "velox/functions/prestosql/json/SIMDJsonUtil.h"

namespace facebook::velox::functions::sparksql {

/// Spark-compatible get_json_object(jsonString, path) -> value.
///
/// Parses `jsonString` with the simdjson On-Demand API and writes the value
/// located at `path` to the VARCHAR result:
///  - strings are written unquoted and unescaped,
///  - integers and booleans are written in their canonical text form,
///  - floating-point numbers, objects and arrays are written as the JSON text
///    streamed by simdjson.
/// The result is NULL (call() returns false) when `path` does not start with
/// '$', when the JSON or the path is malformed, when the path does not exist,
/// or when the located value is of any other JSON type (e.g. null).
template <typename T>
struct GetJsonObjectFunction {
  VELOX_DEFINE_FUNCTION_TYPES(T);

  // ASCII input always produces ASCII result.
  static constexpr bool is_default_ascii_behavior = true;

  /// Pre-computes the simdjson-compatible path once when a path value is
  /// supplied at initialization time (used for a constant json path), so that
  /// call() does not have to rewrite it for every row.
  FOLLY_ALWAYS_INLINE void initialize(
      const std::vector<TypePtr>& /*inputTypes*/,
      const core::QueryConfig& /*config*/,
      const arg_type<Varchar>* /*json*/,
      const arg_type<Varchar>* jsonPath) {
    if (jsonPath != nullptr && checkJsonPath(*jsonPath)) {
      jsonPath_ = removeSingleQuotes(*jsonPath);
    }
  }

  /// Evaluates `jsonPath` against `json`.
  ///
  /// @param result Receives the extracted value as text.
  /// @param json The JSON document.
  /// @param jsonPath Spark-style JSON path; must start with '$'.
  /// @return true if a value was written to `result`; false if the result is
  /// NULL.
  FOLLY_ALWAYS_INLINE bool call(
      out_type<Varchar>& result,
      const arg_type<Varchar>& json,
      const arg_type<Varchar>& jsonPath) {
    // Spark requires the first char in jsonPath is '$'.
    if (!checkJsonPath(jsonPath)) {
      return false;
    }
    // jsonPath is exactly "$": the input is returned as is, without parsing.
    if (jsonPath.size() == 1) {
      result.append(json);
      return true;
    }

    simdjson::ondemand::document jsonDoc;
    simdjson::padded_string paddedJson(json.data(), json.size());
    if (simdjsonParse(paddedJson).get(jsonDoc)) {
      return false;
    }

    // Bind to the cached path by reference when it is available; the path is
    // rewritten per row only when nothing was cached by initialize().
    std::string rowPath;
    if (!jsonPath_.has_value()) {
      rowPath = removeSingleQuotes(jsonPath);
    }
    const std::string& formattedJsonPath =
        jsonPath_.has_value() ? *jsonPath_ : rowPath;

    try {
      // Can return error result or throw exception possibly.
      auto rawResult = jsonDoc.at_path(formattedJsonPath);
      if (rawResult.error()) {
        return false;
      }
      if (!extractStringResult(rawResult, result)) {
        return false;
      }
    } catch (const simdjson::simdjson_error&) {
      return false;
    }

    // The value has been consumed; make sure that what follows it is
    // something that may legally follow a JSON value.
    const char* currentPos = nullptr;
    if (jsonDoc.current_location().get(currentPos)) {
      return false;
    }
    return isValidEndingCharacter(currentPos);
  }

 private:
  // Returns true if 'jsonPath' is non-empty and starts with '$', which Spark
  // requires of every JSON path.
  static FOLLY_ALWAYS_INLINE bool checkJsonPath(StringView jsonPath) {
    return !jsonPath.empty() && jsonPath.data()[0] == '$';
  }

  // Spark's json path requires field name surrounded by single quotes if it is
  // specified in "[]". But simdjson lib requires not. This method drops the
  // leading '$' and removes such single quotes to adapt to simdjson lib, e.g.,
  // converts "$['a']['b']" to "[a][b]".
  //
  // Returns "-1" if a "['" is not closed by a matching "']". The opening and
  // the closing quote must be two distinct characters, so "[']" is rejected
  // rather than rewritten to "[".
  //
  // 'jsonPath' must be non-empty; callers check it with checkJsonPath() first.
  static std::string removeSingleQuotes(StringView jsonPath) {
    // Skip the initial "$".
    std::string result(jsonPath.data() + 1, jsonPath.size() - 1);
    size_t pairEnd = 0;
    while (true) {
      const auto pairBegin = result.find("['", pairEnd);
      if (pairBegin == std::string::npos) {
        break;
      }
      pairEnd = result.find(']', pairBegin);
      // The closing quote sits at pairEnd - 1 and must come after the opening
      // quote at pairBegin + 1.
      if (pairEnd == std::string::npos || pairEnd < pairBegin + 3 ||
          result[pairEnd - 1] != '\'') {
        return "-1";
      }
      // Erase the closing quote first so that the position of the opening
      // quote is not shifted.
      result.erase(pairEnd - 1, 1);
      result.erase(pairBegin + 1, 1);
      // Two characters were removed before ']'; keep pairEnd pointing at it.
      pairEnd -= 2;
    }
    return result;
  }

  // Appends the JSON text of 'rawResult', as streamed by simdjson's
  // operator<<, to 'result'.
  // NOTE(review): streaming presumably throws simdjson::simdjson_error on
  // failure; call() wraps this in a try/catch.
  static void appendRawJson(
      simdjson::simdjson_result<simdjson::ondemand::value>& rawResult,
      out_type<Varchar>& result) {
    std::stringstream ss;
    ss << rawResult;
    result.append(ss.str());
  }

  // Extracts a string representation from a simdjson result. Handles numbers,
  // booleans, strings, objects and arrays. Returns true if the conversion is
  // successful. Returns false otherwise, including for any other JSON type.
  static bool extractStringResult(
      simdjson::simdjson_result<simdjson::ondemand::value> rawResult,
      out_type<Varchar>& result) {
    switch (rawResult.type()) {
      // For number and bool types, we need to explicitly get the value
      // for specific types instead of only streaming 'rawResult'. Thus, we
      // can make simdjson's internal parsing position moved and then we
      // can check the validity of ending character.
      case simdjson::ondemand::json_type::number: {
        switch (rawResult.get_number_type()) {
          case simdjson::ondemand::number_type::unsigned_integer: {
            uint64_t numberResult = 0;
            if (rawResult.get_uint64().get(numberResult)) {
              return false;
            }
            // std::to_string is not affected by the global locale.
            result.append(std::to_string(numberResult));
            return true;
          }
          case simdjson::ondemand::number_type::signed_integer: {
            int64_t numberResult = 0;
            if (rawResult.get_int64().get(numberResult)) {
              return false;
            }
            result.append(std::to_string(numberResult));
            return true;
          }
          // Any other number kind is handled like a floating-point number
          // instead of failing the whole query.
          // NOTE(review): newer simdjson versions appear to report integers
          // that do not fit in 64 bits as a separate kind - confirm.
          case simdjson::ondemand::number_type::floating_point_number:
          default: {
            // Parsing as double only validates the number and advances the
            // parsing position; the text written is the original JSON text.
            double numberResult = 0;
            if (rawResult.get_double().get(numberResult)) {
              return false;
            }
            appendRawJson(rawResult, result);
            return true;
          }
        }
      }
      case simdjson::ondemand::json_type::boolean: {
        bool boolResult = false;
        if (rawResult.get_bool().get(boolResult)) {
          return false;
        }
        result.append(boolResult ? "true" : "false");
        return true;
      }
      case simdjson::ondemand::json_type::string: {
        std::string_view stringResult;
        if (rawResult.get_string().get(stringResult)) {
          return false;
        }
        result.append(stringResult);
        return true;
      }
      // For nested case, e.g., for "{"my": {"hello": 10}}", "$.my" will
      // return an object type.
      case simdjson::ondemand::json_type::object:
      case simdjson::ondemand::json_type::array: {
        appendRawJson(rawResult, result);
        return true;
      }
      default:
        return false;
    }
  }

  // Checks whether the obtained result is followed by a valid char, i.e.
  // optional whitespace and then ',', '}' or ']'. This is needed because the
  // On-Demand API we are using ignores json format validation for characters
  // following the current parsing position. As json doc is padded with NULL
  // characters, the scan stops at the padding at the latest. The scan is
  // iterative so that a long run of whitespace cannot exhaust the stack.
  static bool isValidEndingCharacter(const char* currentPos) {
    // These chars can be prior to a valid ending char. See reference:
    // https://github.com/simdjson/simdjson/blob/v3.9.0/dependencies/jsoncppdist/jsoncpp.cpp
    while (*currentPos == ' ' || *currentPos == '\r' || *currentPos == '\n' ||
           *currentPos == '\t') {
      ++currentPos;
    }
    return *currentPos == ',' || *currentPos == '}' || *currentPos == ']';
  }

  // Used for constant json path. Holds the path already rewritten by
  // removeSingleQuotes(); empty if initialize() did not receive a valid path.
  std::optional<std::string> jsonPath_;
};

} // namespace facebook::velox::functions::sparksql
3 changes: 3 additions & 0 deletions velox/functions/sparksql/registration/RegisterJson.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,14 @@
* limitations under the License.
*/
#include "velox/functions/lib/RegistrationHelpers.h"
#include "velox/functions/sparksql/GetJsonObject.h"
#include "velox/functions/sparksql/JsonObjectKeys.h"

namespace facebook::velox::functions::sparksql {

void registerJsonFunctions(const std::string& prefix) {
registerFunction<GetJsonObjectFunction, Varchar, Varchar, Varchar>(
{prefix + "get_json_object"});
registerFunction<JsonObjectKeysFunction, Array<Varchar>, Varchar>(
{prefix + "json_object_keys"});
}
Expand Down
1 change: 1 addition & 0 deletions velox/functions/sparksql/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ add_executable(
DecimalRoundTest.cpp
DecimalUtilTest.cpp
ElementAtTest.cpp
GetJsonObjectTest.cpp
HashTest.cpp
InTest.cpp
JsonObjectKeysTest.cpp
Expand Down
Loading

0 comments on commit a1adafe

Please sign in to comment.