diff --git a/bigframes/core/__init__.py b/bigframes/core/__init__.py index 71b1214d01..8ef1333583 100644 --- a/bigframes/core/__init__.py +++ b/bigframes/core/__init__.py @@ -108,11 +108,6 @@ def from_table( ): if offsets_col and primary_key: raise ValueError("must set at most one of 'offests', 'primary_key'") - if any(i.field_type == "JSON" for i in table.schema if i.name in schema.names): - warnings.warn( - "Interpreting JSON column(s) as StringDtype. This behavior may change in future versions.", - bigframes.exceptions.PreviewWarning, - ) # define data source only for needed columns, this makes row-hashing cheaper table_def = nodes.GbqTable.from_table(table, columns=schema.names) diff --git a/bigframes/core/compile/ibis_types.py b/bigframes/core/compile/ibis_types.py index a4c37b7c5d..00ff599cba 100644 --- a/bigframes/core/compile/ibis_types.py +++ b/bigframes/core/compile/ibis_types.py @@ -15,11 +15,9 @@ import textwrap from typing import Any, cast, Dict, Iterable, Optional, Tuple, Union -import warnings import bigframes_vendored.constants as constants import bigframes_vendored.ibis.backends.bigquery.datatypes as third_party_ibis_bqtypes -import bigframes_vendored.ibis.expr.operations as vendored_ibis_ops import geopandas as gpd # type: ignore import google.cloud.bigquery as bigquery import ibis @@ -208,10 +206,6 @@ def ibis_value_to_canonical_type(value: ibis_types.Value) -> ibis_types.Value: """ ibis_type = value.type() name = value.get_name() - if ibis_type.is_json(): - value = vendored_ibis_ops.ToJsonString(value).to_expr() - value = value.case().when("null", ibis.null()).else_(value).end() - return value.name(name) # Allow REQUIRED fields to be joined with NULLABLE fields. nullable_type = ibis_type.copy(nullable=True) return value.cast(nullable_type).name(name) @@ -295,13 +289,8 @@ def ibis_dtype_to_bigframes_dtype( if isinstance(ibis_dtype, ibis_dtypes.Integer): return pd.Int64Dtype() - # Temporary: Will eventually support an explicit json type instead of casting to string. if isinstance(ibis_dtype, ibis_dtypes.JSON): - warnings.warn( - "Interpreting JSON as string. This behavior may change in future versions.", - bigframes.exceptions.PreviewWarning, - ) - return bigframes.dtypes.STRING_DTYPE + return bigframes.dtypes.JSON_DTYPE if ibis_dtype in IBIS_TO_BIGFRAMES: return IBIS_TO_BIGFRAMES[ibis_dtype] diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index 80e354aa8c..305251b7d0 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -18,10 +18,10 @@ import typing import bigframes_vendored.constants as constants -import bigframes_vendored.ibis.expr.operations as vendored_ibis_ops import ibis import ibis.common.exceptions import ibis.expr.datatypes as ibis_dtypes +import ibis.expr.operations as ibis_ops import ibis.expr.operations.generic import ibis.expr.types as ibis_types import numpy as np @@ -1113,26 +1113,20 @@ def array_slice_op_impl(x: ibis_types.Value, op: ops.ArraySliceOp): # JSON Ops @scalar_op_compiler.register_binary_op(ops.JSONSet, pass_op=True) def json_set_op_impl(x: ibis_types.Value, y: ibis_types.Value, op: ops.JSONSet): - if x.type().is_json(): - return json_set( - json_obj=x, - json_path=op.json_path, - json_value=y, - ).to_expr() - else: - # Enabling JSON type eliminates the need for less efficient string conversions. - return vendored_ibis_ops.ToJsonString( - json_set( - json_obj=parse_json(x), - json_path=op.json_path, - json_value=y, - ) - ).to_expr() + return json_set(json_obj=x, json_path=op.json_path, json_value=y) @scalar_op_compiler.register_unary_op(ops.JSONExtract, pass_op=True) def json_extract_op_impl(x: ibis_types.Value, op: ops.JSONExtract): - return json_extract(json_obj=x, json_path=op.json_path) + # Define a user-defined function whose returned type is dynamically matching the input. + def json_extract(json_or_json_string, json_path: ibis_dtypes.str): # type: ignore + """Extracts a JSON value and converts it to a SQL JSON-formatted STRING or JSON value.""" + ... + + return_type = x.type() + json_extract.__annotations__["return"] = return_type + json_extract_op = ibis_ops.udf.scalar.builtin(json_extract) + return json_extract_op(json_or_json_string=x, json_path=op.json_path) @scalar_op_compiler.register_unary_op(ops.JSONExtractArray, pass_op=True) @@ -1794,13 +1788,6 @@ def json_set( """Produces a new SQL JSON value with the specified JSON data inserted or replaced.""" -@ibis.udf.scalar.builtin(name="json_extract") -def json_extract( - json_obj: ibis_dtypes.JSON, json_path: ibis_dtypes.str -) -> ibis_dtypes.JSON: - """Extracts a JSON value and converts it to a SQL JSON-formatted STRING or JSON value.""" - - @ibis.udf.scalar.builtin(name="json_extract_array") def json_extract_array( json_obj: ibis_dtypes.JSON, json_path: ibis_dtypes.str diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index c71531f9f3..290c52b33c 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -21,6 +21,7 @@ from typing import Dict, Literal, Union import bigframes_vendored.constants as constants +import db_dtypes # type: ignore import geopandas as gpd # type: ignore import google.cloud.bigquery import numpy as np @@ -55,6 +56,7 @@ BIGNUMERIC_DTYPE = pd.ArrowDtype(pa.decimal256(76, 38)) # No arrow equivalent GEO_DTYPE = gpd.array.GeometryDtype() +JSON_DTYPE = db_dtypes.JSONDtype() # Used when storing Null expressions DEFAULT_DTYPE = FLOAT_DTYPE @@ -187,6 +189,12 @@ class SimpleDtypeInfo: logical_bytes=40, clusterable=True, ), + # JSON has no corresponding arrow dtype + SimpleDtypeInfo( + dtype=JSON_DTYPE, + arrow_dtype=None, + type_kind=("JSON",), + ), ) @@ -271,8 +279,7 @@ def is_struct_like(type_: ExpressionType) -> bool: def is_json_like(type_: ExpressionType) -> bool: - # TODO: Add JSON type support - return type_ == STRING_DTYPE + return type_ == STRING_DTYPE or type_ == JSON_DTYPE def is_json_encoding_type(type_: ExpressionType) -> bool: @@ -446,8 +453,6 @@ def infer_literal_arrow_type(literal) -> typing.Optional[pa.DataType]: return bigframes_dtype_to_arrow_dtype(infer_literal_type(literal)) -# Don't have dtype for json, so just end up interpreting as STRING -_REMAPPED_TYPEKINDS = {"JSON": "STRING"} _TK_TO_BIGFRAMES = { type_kind: mapping.dtype for mapping in SIMPLE_TYPES @@ -471,12 +476,8 @@ def convert_schema_field( pa_struct = pa.struct(fields) pa_type = pa.list_(pa_struct) if is_repeated else pa_struct return field.name, pd.ArrowDtype(pa_type) - elif ( - field.field_type in _TK_TO_BIGFRAMES or field.field_type in _REMAPPED_TYPEKINDS - ): - singular_type = _TK_TO_BIGFRAMES[ - _REMAPPED_TYPEKINDS.get(field.field_type, field.field_type) - ] + elif field.field_type in _TK_TO_BIGFRAMES: + singular_type = _TK_TO_BIGFRAMES[field.field_type] if is_repeated: pa_type = pa.list_(bigframes_dtype_to_arrow_dtype(singular_type)) return field.name, pd.ArrowDtype(pa_type) diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py index 2e2e4a0552..ee40835ea5 100644 --- a/bigframes/operations/__init__.py +++ b/bigframes/operations/__init__.py @@ -871,8 +871,7 @@ def output_type(self, *input_types): + f"Received type: {right_type}" ) - # After JSON type implementation, ONLY return JSON data. - return left_type + return dtypes.JSON_DTYPE # Ternary Ops diff --git a/bigframes/session/_io/pandas.py b/bigframes/session/_io/pandas.py index 6ceaab6915..f11ede7519 100644 --- a/bigframes/session/_io/pandas.py +++ b/bigframes/session/_io/pandas.py @@ -16,6 +16,7 @@ from typing import Union import bigframes_vendored.constants as constants +import db_dtypes # type: ignore import geopandas # type: ignore import pandas import pandas.arrays @@ -104,6 +105,8 @@ def arrow_to_pandas( series = pandas.Series(pd_array, dtype=dtype) elif isinstance(dtype, pandas.ArrowDtype): series = _arrow_to_pandas_arrowdtype(column, dtype) + elif isinstance(dtype, db_dtypes.JSONDtype): + series = db_dtypes.JSONArray(column) else: series = column.to_pandas(types_mapper=lambda _: dtype) diff --git a/setup.py b/setup.py index 833d4fe565..247a2fe0f2 100644 --- a/setup.py +++ b/setup.py @@ -62,8 +62,10 @@ "sqlglot >=23.6.3,<25.2", "tabulate >= 0.9", "ipywidgets >=7.7.1", - "humanize >= 4.6.0", - "matplotlib >= 3.7.1", + "humanize >=4.6.0", + "matplotlib >=3.7.1", + # "db-dtypes >=1.3.0", + "db-dtypes @ git+https://github.com/googleapis/python-db-dtypes-pandas.git@4b84e4a6fada5ecfa7f910dca61e6de714abdb9d", ] extras = { # Optional test dependencies packages. If they're missed, may skip some tests. diff --git a/testing/constraints-3.9.txt b/testing/constraints-3.9.txt index 4a9d1ae281..caf9a8b9c3 100644 --- a/testing/constraints-3.9.txt +++ b/testing/constraints-3.9.txt @@ -26,5 +26,6 @@ tabulate==0.9 ipywidgets==7.7.1 humanize==4.6.0 matplotlib==3.7.1 +db-dtypes @ git+https://github.com/googleapis/python-db-dtypes-pandas.git@4b84e4a6fada5ecfa7f910dca61e6de714abdb9d # extras pandas-gbq==0.19.0 diff --git a/tests/system/small/bigquery/test_json.py b/tests/system/small/bigquery/test_json.py index 75b9345107..4319d47a64 100644 --- a/tests/system/small/bigquery/test_json.py +++ b/tests/system/small/bigquery/test_json.py @@ -118,7 +118,6 @@ def test_json_set_w_invalid_series_type(): def test_json_extract_from_json(): s = _get_series_from_json([{"a": {"b": [1, 2]}}, {"a": {"c": 1}}, {"a": {"b": 0}}]) actual = bbq.json_extract(s, "$.a.b").to_pandas() - # After the introduction of the JSON type, the output should be a JSON-formatted series. expected = _get_series_from_json([[1, 2], None, 0]).to_pandas() pd.testing.assert_series_equal( actual, @@ -129,11 +128,10 @@ def test_json_extract_from_json(): def test_json_extract_from_string(): s = bpd.Series(['{"a": {"b": [1, 2]}}', '{"a": {"c": 1}}', '{"a": {"b": 0}}']) actual = bbq.json_extract(s, "$.a.b") - expected = _get_series_from_json([[1, 2], None, 0]) + expected = bpd.Series(["[1,2]", None, "0"]) pd.testing.assert_series_equal( actual.to_pandas(), expected.to_pandas(), - check_names=False, ) @@ -142,17 +140,30 @@ def test_json_extract_w_invalid_series_type(): bbq.json_extract(bpd.Series([1, 2]), "$.a") +# The repeated JSON dtype is not supported because `dbjson` is not an arrow extension dtype. +# def test_json_extract_array_from_json(): +# s = _get_series_from_json([{"a": ["ab", "2", "3 xy"]}, {"a": []}, {"a": ["4","5"]}, {}]) +# actual = bbq.json_extract_array(s, "$.a") +# expected = _get_series_from_json([["ab", "2", "3 xy"], [], ["4","5"], None]) +# pd.testing.assert_series_equal( +# actual.to_pandas(), +# expected.to_pandas(), +# ) + + def test_json_extract_array_from_json_strings(): - s = bpd.Series(['{"a": ["ab", "2", "3 xy"]}', '{"a": []}', '{"a": ["4","5"]}']) + s = bpd.Series( + ['{"a": ["ab", "2", "3 xy"]}', '{"a": []}', '{"a": ["4","5"]}', "{}"] + ) actual = bbq.json_extract_array(s, "$.a") - expected = bpd.Series([['"ab"', '"2"', '"3 xy"'], [], ['"4"', '"5"']]) + expected = bpd.Series([['"ab"', '"2"', '"3 xy"'], [], ['"4"', '"5"'], None]) pd.testing.assert_series_equal( actual.to_pandas(), expected.to_pandas(), ) -def test_json_extract_array_from_array_strings(): +def test_json_extract_array_from_json_array_strings(): s = bpd.Series(["[1, 2, 3]", "[]", "[4,5]"]) actual = bbq.json_extract_array(s) expected = bpd.Series([["1", "2", "3"], [], ["4", "5"]]) diff --git a/tests/system/small/test_dataframe_io.py b/tests/system/small/test_dataframe_io.py index ab1fdceae5..a7a9f7c46c 100644 --- a/tests/system/small/test_dataframe_io.py +++ b/tests/system/small/test_dataframe_io.py @@ -14,6 +14,7 @@ from typing import Tuple +import db_dtypes # type:ignore import google.api_core.exceptions import pandas as pd import pandas.testing @@ -248,22 +249,20 @@ def test_to_pandas_array_struct_correct_result(session): def test_load_json(session): - df = session.read_gbq( - """SELECT - JSON_OBJECT('foo', 10, 'bar', TRUE) AS json_column - """ - ) + df = session.read_gbq("SELECT JSON_OBJECT('foo', 10, 'bar', TRUE) AS json_column") + assert df.dtypes["json_column"] == db_dtypes.JSONDtype() + assert isinstance(df["json_column"][0], dict) + assert df["json_column"][0]["bar"] + assert df["json_column"][0]["foo"] == 10 result = df.to_pandas() - expected = pd.DataFrame( - { - "json_column": ['{"bar":true,"foo":10}'], - }, - dtype=pd.StringDtype(storage="pyarrow"), + pd_df = pd.DataFrame( + {"json_column": [{"bar": True, "foo": 10}]}, + dtype=db_dtypes.JSONDtype(), ) - expected.index = expected.index.astype("Int64") - pd.testing.assert_series_equal(result.dtypes, expected.dtypes) - pd.testing.assert_series_equal(result["json_column"], expected["json_column"]) + pd_df.index = pd_df.index.astype("Int64") + pd.testing.assert_series_equal(result.dtypes, pd_df.dtypes) + pd.testing.assert_series_equal(result["json_column"], pd_df["json_column"]) def test_to_pandas_batches_w_correct_dtypes(scalars_df_default_index): diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index 5bb20e2714..9e77926eaa 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -17,6 +17,7 @@ import re import tempfile +import db_dtypes # type: ignore import geopandas as gpd # type: ignore import numpy from packaging.version import Version @@ -281,7 +282,7 @@ def test_get_column(scalars_dfs, col_name, expected_dtype): def test_get_column_w_json(json_df, json_pandas_df): series = json_df["json_col"] series_pandas = series.to_pandas() - assert series.dtype == pd.StringDtype(storage="pyarrow") + assert series.dtype == db_dtypes.JSONDtype() assert series_pandas.shape[0] == json_pandas_df.shape[0]