feat!: Enable reading JSON data with dbjson extension dtype #1139

Draft · wants to merge 2 commits into base: main
5 changes: 0 additions & 5 deletions bigframes/core/__init__.py
@@ -108,11 +108,6 @@ def from_table(
):
if offsets_col and primary_key:
raise ValueError("must set at most one of 'offests', 'primary_key'")
if any(i.field_type == "JSON" for i in table.schema if i.name in schema.names):
warnings.warn(
"Interpreting JSON column(s) as StringDtype. This behavior may change in future versions.",
bigframes.exceptions.PreviewWarning,
)
# define data source only for needed columns, this makes row-hashing cheaper
table_def = nodes.GbqTable.from_table(table, columns=schema.names)

13 changes: 1 addition & 12 deletions bigframes/core/compile/ibis_types.py
@@ -15,11 +15,9 @@

import textwrap
from typing import Any, cast, Dict, Iterable, Optional, Tuple, Union
import warnings

import bigframes_vendored.constants as constants
import bigframes_vendored.ibis.backends.bigquery.datatypes as third_party_ibis_bqtypes
import bigframes_vendored.ibis.expr.operations as vendored_ibis_ops
import geopandas as gpd # type: ignore
import google.cloud.bigquery as bigquery
import ibis
@@ -208,10 +206,6 @@ def ibis_value_to_canonical_type(value: ibis_types.Value) -> ibis_types.Value:
"""
ibis_type = value.type()
name = value.get_name()
if ibis_type.is_json():
value = vendored_ibis_ops.ToJsonString(value).to_expr()
value = value.case().when("null", ibis.null()).else_(value).end()
return value.name(name)
# Allow REQUIRED fields to be joined with NULLABLE fields.
nullable_type = ibis_type.copy(nullable=True)
return value.cast(nullable_type).name(name)
@@ -295,13 +289,8 @@ def ibis_dtype_to_bigframes_dtype(
if isinstance(ibis_dtype, ibis_dtypes.Integer):
return pd.Int64Dtype()

# Temporary: Will eventually support an explicit json type instead of casting to string.
if isinstance(ibis_dtype, ibis_dtypes.JSON):
warnings.warn(
"Interpreting JSON as string. This behavior may change in future versions.",
bigframes.exceptions.PreviewWarning,
)
return bigframes.dtypes.STRING_DTYPE
return bigframes.dtypes.JSON_DTYPE

if ibis_dtype in IBIS_TO_BIGFRAMES:
return IBIS_TO_BIGFRAMES[ibis_dtype]
35 changes: 11 additions & 24 deletions bigframes/core/compile/scalar_op_compiler.py
@@ -18,10 +18,10 @@
import typing

import bigframes_vendored.constants as constants
import bigframes_vendored.ibis.expr.operations as vendored_ibis_ops
import ibis
import ibis.common.exceptions
import ibis.expr.datatypes as ibis_dtypes
import ibis.expr.operations as ibis_ops
import ibis.expr.operations.generic
import ibis.expr.types as ibis_types
import numpy as np
@@ -1113,26 +1113,20 @@ def array_slice_op_impl(x: ibis_types.Value, op: ops.ArraySliceOp):
# JSON Ops
@scalar_op_compiler.register_binary_op(ops.JSONSet, pass_op=True)
def json_set_op_impl(x: ibis_types.Value, y: ibis_types.Value, op: ops.JSONSet):
if x.type().is_json():
return json_set(
json_obj=x,
json_path=op.json_path,
json_value=y,
).to_expr()
else:
# Enabling the JSON type eliminates the need for less efficient string conversions.
return vendored_ibis_ops.ToJsonString(
json_set(
json_obj=parse_json(x),
json_path=op.json_path,
json_value=y,
)
).to_expr()
return json_set(json_obj=x, json_path=op.json_path, json_value=y)
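
For context, a minimal sketch of how this op surfaces through `bigframes.bigquery` (usage modeled on this PR's system tests; the `json_path_value_pairs` keyword is assumed from the current public API):

import bigframes.bigquery as bbq
import bigframes.pandas as bpd

# With JSON columns now read as dbjson instead of STRING, JSONSet compiles
# straight to BigQuery's JSON_SET with no parse/serialize round-trip.
s = bpd.read_gbq("SELECT JSON '{\"a\": 1}' AS json_col")["json_col"]
result = bbq.json_set(s, json_path_value_pairs=[("$.a", 100)])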


@scalar_op_compiler.register_unary_op(ops.JSONExtract, pass_op=True)
def json_extract_op_impl(x: ibis_types.Value, op: ops.JSONExtract):
return json_extract(json_obj=x, json_path=op.json_path)
# Define a user-defined function whose return type dynamically matches the input.
def json_extract(json_or_json_string, json_path: ibis_dtypes.str): # type: ignore
"""Extracts a JSON value and converts it to a SQL JSON-formatted STRING or JSON value."""
...

return_type = x.type()
json_extract.__annotations__["return"] = return_type
json_extract_op = ibis_ops.udf.scalar.builtin(json_extract)
return json_extract_op(json_or_json_string=x, json_path=op.json_path)
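
A note on the pattern above: `ibis_ops.udf.scalar.builtin` takes the SQL signature from the wrapped function's annotations, so patching `__annotations__["return"]` before registration lets a single JSON_EXTRACT wrapper return STRING for STRING inputs and JSON for JSON inputs. A standalone sketch of the same trick (assuming the ibis version bigframes pins):

import ibis.expr.datatypes as ibis_dtypes
import ibis.expr.operations as ibis_ops

def json_extract(json_or_json_string, json_path: ibis_dtypes.str):  # type: ignore
    """The SQL function name is taken from the wrapped function's name."""
    ...

# Choose the return annotation per call site, e.g. from the input column type.
json_extract.__annotations__["return"] = ibis_dtypes.JSON(nullable=True)
json_extract_op = ibis_ops.udf.scalar.builtin(json_extract)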


@scalar_op_compiler.register_unary_op(ops.JSONExtractArray, pass_op=True)
@@ -1794,13 +1788,6 @@ def json_set(
"""Produces a new SQL JSON value with the specified JSON data inserted or replaced."""


@ibis.udf.scalar.builtin(name="json_extract")
def json_extract(
json_obj: ibis_dtypes.JSON, json_path: ibis_dtypes.str
) -> ibis_dtypes.JSON:
"""Extracts a JSON value and converts it to a SQL JSON-formatted STRING or JSON value."""


@ibis.udf.scalar.builtin(name="json_extract_array")
def json_extract_array(
json_obj: ibis_dtypes.JSON, json_path: ibis_dtypes.str
21 changes: 11 additions & 10 deletions bigframes/dtypes.py
@@ -21,6 +21,7 @@
from typing import Dict, Literal, Union

import bigframes_vendored.constants as constants
import db_dtypes # type: ignore
import geopandas as gpd # type: ignore
import google.cloud.bigquery
import numpy as np
@@ -55,6 +56,7 @@
BIGNUMERIC_DTYPE = pd.ArrowDtype(pa.decimal256(76, 38))
# No arrow equivalent
GEO_DTYPE = gpd.array.GeometryDtype()
JSON_DTYPE = db_dtypes.JSONDtype()

# Used when storing Null expressions
DEFAULT_DTYPE = FLOAT_DTYPE
@@ -187,6 +189,12 @@ class SimpleDtypeInfo:
logical_bytes=40,
clusterable=True,
),
# JSON has no corresponding arrow dtype
SimpleDtypeInfo(
dtype=JSON_DTYPE,
arrow_dtype=None,
type_kind=("JSON",),
),
)
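
Registering the dtype with `arrow_dtype=None` and `type_kind=("JSON",)` is what lets the `_TK_TO_BIGFRAMES` lookup built from `SIMPLE_TYPES` resolve BigQuery's JSON type kind directly, retiring the STRING remapping removed further down. A quick illustrative check (a sketch, assuming the pinned db-dtypes build):

import db_dtypes
import bigframes.dtypes as dtypes

# pandas ExtensionDtype instances compare equal by name, so plain equality
# works for schema assertions.
assert dtypes.JSON_DTYPE == db_dtypes.JSONDtype()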


@@ -271,8 +279,7 @@ def is_struct_like(type_: ExpressionType) -> bool:


def is_json_like(type_: ExpressionType) -> bool:
# TODO: Add JSON type support
return type_ == STRING_DTYPE
return type_ == STRING_DTYPE or type_ == JSON_DTYPE
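
During the migration both representations count as JSON-like, as a quick sketch shows:

import bigframes.dtypes as dtypes

# Legacy string-encoded JSON and the new dbjson extension dtype both qualify.
assert dtypes.is_json_like(dtypes.STRING_DTYPE)
assert dtypes.is_json_like(dtypes.JSON_DTYPE)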


def is_json_encoding_type(type_: ExpressionType) -> bool:
@@ -446,8 +453,6 @@ def infer_literal_arrow_type(literal) -> typing.Optional[pa.DataType]:
return bigframes_dtype_to_arrow_dtype(infer_literal_type(literal))


# Don't have dtype for json, so just end up interpreting as STRING
_REMAPPED_TYPEKINDS = {"JSON": "STRING"}
_TK_TO_BIGFRAMES = {
type_kind: mapping.dtype
for mapping in SIMPLE_TYPES
@@ -471,12 +476,8 @@ def convert_schema_field(
pa_struct = pa.struct(fields)
pa_type = pa.list_(pa_struct) if is_repeated else pa_struct
return field.name, pd.ArrowDtype(pa_type)
elif (
field.field_type in _TK_TO_BIGFRAMES or field.field_type in _REMAPPED_TYPEKINDS
):
singular_type = _TK_TO_BIGFRAMES[
_REMAPPED_TYPEKINDS.get(field.field_type, field.field_type)
]
elif field.field_type in _TK_TO_BIGFRAMES:
singular_type = _TK_TO_BIGFRAMES[field.field_type]
if is_repeated:
pa_type = pa.list_(bigframes_dtype_to_arrow_dtype(singular_type))
return field.name, pd.ArrowDtype(pa_type)
3 changes: 1 addition & 2 deletions bigframes/operations/__init__.py
@@ -871,8 +871,7 @@ def output_type(self, *input_types):
+ f"Received type: {right_type}"
)

# After JSON type implementation, ONLY return JSON data.
return left_type
return dtypes.JSON_DTYPE


# Ternary Ops
3 changes: 3 additions & 0 deletions bigframes/session/_io/pandas.py
@@ -16,6 +16,7 @@
from typing import Union

import bigframes_vendored.constants as constants
import db_dtypes # type: ignore
import geopandas # type: ignore
import pandas
import pandas.arrays
@@ -104,6 +105,8 @@ def arrow_to_pandas(
series = pandas.Series(pd_array, dtype=dtype)
elif isinstance(dtype, pandas.ArrowDtype):
series = _arrow_to_pandas_arrowdtype(column, dtype)
elif isinstance(dtype, db_dtypes.JSONDtype):
series = db_dtypes.JSONArray(column)
else:
series = column.to_pandas(types_mapper=lambda _: dtype)
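
The new branch hands the pyarrow column straight to `db_dtypes.JSONArray`. A standalone repro of what it produces (a sketch, assuming the git-pinned db-dtypes commit, whose `JSONArray` accepts a pyarrow chunked array exactly as the branch above does):

import db_dtypes
import pandas
import pyarrow

# JSON values arrive from BigQuery as arrow strings; wrapping them in
# JSONArray yields a Series carrying db_dtypes.JSONDtype().
column = pyarrow.chunked_array([pyarrow.array(['{"a": 1}', "[1, 2]", None])])
series = pandas.Series(db_dtypes.JSONArray(column))
assert series.dtype == db_dtypes.JSONDtype()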

6 changes: 4 additions & 2 deletions setup.py
@@ -62,8 +62,10 @@
"sqlglot >=23.6.3,<25.2",
"tabulate >= 0.9",
"ipywidgets >=7.7.1",
"humanize >= 4.6.0",
"matplotlib >= 3.7.1",
"humanize >=4.6.0",
"matplotlib >=3.7.1",
# "db-dtypes >=1.3.0",
"db-dtypes @ git+https://github.com/googleapis/python-db-dtypes-pandas.git@4b84e4a6fada5ecfa7f910dca61e6de714abdb9d",
]
extras = {
# Optional test dependencies packages. If they're missed, may skip some tests.
1 change: 1 addition & 0 deletions testing/constraints-3.9.txt
@@ -26,5 +26,6 @@ tabulate==0.9
ipywidgets==7.7.1
humanize==4.6.0
matplotlib==3.7.1
db-dtypes @ git+https://github.com/googleapis/python-db-dtypes-pandas.git@4b84e4a6fada5ecfa7f910dca61e6de714abdb9d
# extras
pandas-gbq==0.19.0
23 changes: 17 additions & 6 deletions tests/system/small/bigquery/test_json.py
@@ -118,7 +118,6 @@ def test_json_set_w_invalid_series_type():
def test_json_extract_from_json():
s = _get_series_from_json([{"a": {"b": [1, 2]}}, {"a": {"c": 1}}, {"a": {"b": 0}}])
actual = bbq.json_extract(s, "$.a.b").to_pandas()
# After the introduction of the JSON type, the output should be a JSON-formatted series.
expected = _get_series_from_json([[1, 2], None, 0]).to_pandas()
pd.testing.assert_series_equal(
actual,
@@ -129,11 +128,10 @@ def test_json_extract_from_json():
def test_json_extract_from_string():
s = bpd.Series(['{"a": {"b": [1, 2]}}', '{"a": {"c": 1}}', '{"a": {"b": 0}}'])
actual = bbq.json_extract(s, "$.a.b")
expected = _get_series_from_json([[1, 2], None, 0])
expected = bpd.Series(["[1,2]", None, "0"])
pd.testing.assert_series_equal(
actual.to_pandas(),
expected.to_pandas(),
check_names=False,
)


@@ -142,17 +140,30 @@ def test_json_extract_w_invalid_series_type():
bbq.json_extract(bpd.Series([1, 2]), "$.a")


# The repeated JSON dtype is not supported because `dbjson` is not an arrow extension dtype.
# def test_json_extract_array_from_json():
# s = _get_series_from_json([{"a": ["ab", "2", "3 xy"]}, {"a": []}, {"a": ["4","5"]}, {}])
# actual = bbq.json_extract_array(s, "$.a")
# expected = _get_series_from_json([["ab", "2", "3 xy"], [], ["4","5"], None])
# pd.testing.assert_series_equal(
# actual.to_pandas(),
# expected.to_pandas(),
# )


def test_json_extract_array_from_json_strings():
s = bpd.Series(['{"a": ["ab", "2", "3 xy"]}', '{"a": []}', '{"a": ["4","5"]}'])
s = bpd.Series(
['{"a": ["ab", "2", "3 xy"]}', '{"a": []}', '{"a": ["4","5"]}', "{}"]
)
actual = bbq.json_extract_array(s, "$.a")
expected = bpd.Series([['"ab"', '"2"', '"3 xy"'], [], ['"4"', '"5"']])
expected = bpd.Series([['"ab"', '"2"', '"3 xy"'], [], ['"4"', '"5"'], None])
pd.testing.assert_series_equal(
actual.to_pandas(),
expected.to_pandas(),
)


def test_json_extract_array_from_array_strings():
def test_json_extract_array_from_json_array_strings():
s = bpd.Series(["[1, 2, 3]", "[]", "[4,5]"])
actual = bbq.json_extract_array(s)
expected = bpd.Series([["1", "2", "3"], [], ["4", "5"]])
25 changes: 12 additions & 13 deletions tests/system/small/test_dataframe_io.py
@@ -14,6 +14,7 @@

from typing import Tuple

import db_dtypes # type:ignore
import google.api_core.exceptions
import pandas as pd
import pandas.testing
@@ -248,22 +249,20 @@


def test_load_json(session):
df = session.read_gbq(
"""SELECT
JSON_OBJECT('foo', 10, 'bar', TRUE) AS json_column
"""
)
df = session.read_gbq("SELECT JSON_OBJECT('foo', 10, 'bar', TRUE) AS json_column")
assert df.dtypes["json_column"] == db_dtypes.JSONDtype()
assert isinstance(df["json_column"][0], dict)
assert df["json_column"][0]["bar"]
assert df["json_column"][0]["foo"] == 10

result = df.to_pandas()
expected = pd.DataFrame(
{
"json_column": ['{"bar":true,"foo":10}'],
},
dtype=pd.StringDtype(storage="pyarrow"),
pd_df = pd.DataFrame(
{"json_column": [{"bar": True, "foo": 10}]},
dtype=db_dtypes.JSONDtype(),
)
expected.index = expected.index.astype("Int64")
pd.testing.assert_series_equal(result.dtypes, expected.dtypes)
pd.testing.assert_series_equal(result["json_column"], expected["json_column"])
pd_df.index = pd_df.index.astype("Int64")
pd.testing.assert_series_equal(result.dtypes, pd_df.dtypes)
pd.testing.assert_series_equal(result["json_column"], pd_df["json_column"])


def test_to_pandas_batches_w_correct_dtypes(scalars_df_default_index):
3 changes: 2 additions & 1 deletion tests/system/small/test_series.py
@@ -17,6 +17,7 @@
import re
import tempfile

import db_dtypes # type: ignore
import geopandas as gpd # type: ignore
import numpy
from packaging.version import Version
@@ -281,7 +282,7 @@ def test_get_column(scalars_dfs, col_name, expected_dtype):
def test_get_column_w_json(json_df, json_pandas_df):
series = json_df["json_col"]
series_pandas = series.to_pandas()
assert series.dtype == pd.StringDtype(storage="pyarrow")
assert series.dtype == db_dtypes.JSONDtype()
assert series_pandas.shape[0] == json_pandas_df.shape[0]

