Skip to content

Commit

Permalink
feat: implement bigframes.bigquery.json_extract_array (#910)
Browse files Browse the repository at this point in the history
* feat: implement `bigframes.bigquery.json_extract_array`

This is needed to implement support for array return types in remote
functions.

* actually return, make tests pass

* add negative test case
  • Loading branch information
shobsi authored Aug 23, 2024
1 parent e837f6e commit 575a29e
Show file tree
Hide file tree
Showing 4 changed files with 86 additions and 0 deletions.
32 changes: 32 additions & 0 deletions bigframes/bigquery/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -239,6 +239,38 @@ def json_extract(
return series._apply_unary_op(ops.JSONExtract(json_path=json_path))


def json_extract_array(
    series: series.Series,
    json_path: str = "$",
) -> series.Series:
    """Extract a JSON array from each element and return it as a SQL array.

    Every item of the resulting array is a JSON-formatted ``STRING`` or
    ``JSON`` value. Single quotes and brackets are used to escape invalid
    JSONPath characters in JSON keys.

    **Examples:**

        >>> import bigframes.pandas as bpd
        >>> import bigframes.bigquery as bbq
        >>> bpd.options.display.progress_bar = None

        >>> s = bpd.Series(['[1, 2, 3]', '[4, 5]'])
        >>> bbq.json_extract_array(s)
        0    ['1' '2' '3']
        1        ['4' '5']
        dtype: list<item: string>[pyarrow]

    Args:
        series (bigframes.series.Series):
            The Series containing JSON data (as native JSON objects or
            JSON-formatted strings).
        json_path (str):
            The JSON path identifying the array that you want to obtain
            from the input. Defaults to ``"$"``.

    Returns:
        bigframes.series.Series: A new Series in which each value is an
        array of JSON or JSON-formatted STRING items.
    """
    # Build the op first so the Series call reads as a plain application.
    extract_op = ops.JSONExtractArray(json_path=json_path)
    return series._apply_unary_op(extract_op)


# Search functions defined from
# https://cloud.google.com/bigquery/docs/reference/standard-sql/search_functions

Expand Down
12 changes: 12 additions & 0 deletions bigframes/core/compile/scalar_op_compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -947,6 +947,11 @@ def json_extract_op_impl(x: ibis_types.Value, op: ops.JSONExtract):
return json_extract(json_obj=x, json_path=op.json_path)


@scalar_op_compiler.register_unary_op(ops.JSONExtractArray, pass_op=True)
def json_extract_array_op_impl(x: ibis_types.Value, op: ops.JSONExtractArray):
    """Compile a ``JSONExtractArray`` op into a ``json_extract_array`` call.

    The op instance is passed in (``pass_op=True``) so that its
    ``json_path`` can be forwarded together with the input value ``x``.
    """
    path = op.json_path
    return json_extract_array(json_obj=x, json_path=path)


### Binary Ops
def short_circuit_nulls(type_override: typing.Optional[ibis_dtypes.DataType] = None):
"""Wraps a binary operator to generate nulls of the expected type if either input is a null scalar."""
Expand Down Expand Up @@ -1581,6 +1586,13 @@ def json_extract(
"""Extracts a JSON value and converts it to a SQL JSON-formatted STRING or JSON value."""


@ibis.udf.scalar.builtin(name="json_extract_array")
def json_extract_array(
    json_obj: ibis_dtypes.JSON,
    json_path: ibis_dtypes.str,
) -> ibis_dtypes.Array[ibis_dtypes.String]:
    """Declare the ``json_extract_array`` builtin for use in ibis expressions.

    Extracts a JSON array and converts it to a SQL ARRAY of JSON-formatted
    STRINGs or JSON values. The function has no Python body: it is
    registered as a builtin scalar UDF under the name
    ``json_extract_array``, and only its signature is declared here.
    """


@ibis.udf.scalar.builtin(name="ML.DISTANCE")
def vector_distance(vector1, vector2, type: str) -> ibis_dtypes.Float64:
    """Declare the ``ML.DISTANCE`` builtin for use in ibis expressions.

    Computes the distance between two vectors using the specified type
    ("EUCLIDEAN", "MANHATTAN", or "COSINE"). The function has no Python
    body; only its signature is declared here.
    """
17 changes: 17 additions & 0 deletions bigframes/operations/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -652,6 +652,23 @@ def output_type(self, *input_types):
return input_type


@dataclasses.dataclass(frozen=True)
class JSONExtractArray(UnaryOp):
    """Unary op that extracts the JSON array found at ``json_path``."""

    name: typing.ClassVar[str] = "json_extract_array"
    # JSON path selecting the array to extract from the input value.
    json_path: str

    def output_type(self, *input_types):
        """Return the result dtype: an Arrow-backed list of strings.

        Raises:
            TypeError: If the first input type is not JSON-like.
        """
        operand_type = input_types[0]
        if dtypes.is_json_like(operand_type):
            item_type = dtypes.bigframes_dtype_to_arrow_dtype(dtypes.STRING_DTYPE)
            return pd.ArrowDtype(pa.list_(item_type))
        raise TypeError(
            "Input type must be an valid JSON object or JSON-formatted string type."
            + f" Received type: {operand_type}"
        )


# Binary Ops
fillna_op = create_binary_op(name="fillna", type_signature=op_typing.COERCE)
maximum_op = create_binary_op(name="maximum", type_signature=op_typing.COERCE)
Expand Down
25 changes: 25 additions & 0 deletions tests/system/small/bigquery/test_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,3 +139,28 @@ def test_json_extract_from_string():
def test_json_extract_w_invalid_series_type():
    """An integer Series is rejected by ``json_extract`` with a TypeError."""
    with pytest.raises(TypeError):
        int_series = bpd.Series([1, 2])
        bbq.json_extract(int_series, "$.a")


def test_json_extract_array_from_json_strings():
    """Arrays stored under key ``a`` are extracted, including an empty one."""
    json_docs = bpd.Series(['{"a": [1, 2, 3]}', '{"a": []}', '{"a": [4,5]}'])
    extracted = bbq.json_extract_array(json_docs, "$.a")
    wanted = bpd.Series([["1", "2", "3"], [], ["4", "5"]])

    # Compare the materialized pandas objects.
    got_pd = extracted.to_pandas()
    wanted_pd = wanted.to_pandas()
    pd.testing.assert_series_equal(got_pd, wanted_pd)


def test_json_extract_array_from_array_strings():
    """With the default path, top-level JSON array strings are extracted."""
    array_docs = bpd.Series(["[1, 2, 3]", "[]", "[4,5]"])
    extracted = bbq.json_extract_array(array_docs)
    wanted = bpd.Series([["1", "2", "3"], [], ["4", "5"]])

    # Compare the materialized pandas objects.
    got_pd = extracted.to_pandas()
    wanted_pd = wanted.to_pandas()
    pd.testing.assert_series_equal(got_pd, wanted_pd)


def test_json_extract_array_w_invalid_series_type():
    """An integer Series is rejected by ``json_extract_array`` with a TypeError."""
    with pytest.raises(TypeError):
        int_series = bpd.Series([1, 2])
        bbq.json_extract_array(int_series)

0 comments on commit 575a29e

Please sign in to comment.