feat!: Enable reading JSON data with dbjson extension dtype #1139

Draft · wants to merge 2 commits into base: main
5 changes: 0 additions & 5 deletions bigframes/core/__init__.py
@@ -108,11 +108,6 @@ def from_table(
):
if offsets_col and primary_key:
raise ValueError("must set at most one of 'offests', 'primary_key'")
if any(i.field_type == "JSON" for i in table.schema if i.name in schema.names):
warnings.warn(
"Interpreting JSON column(s) as StringDtype. This behavior may change in future versions.",
bigframes.exceptions.PreviewWarning,
)
# define data source only for needed columns, this makes row-hashing cheaper
table_def = nodes.GbqTable.from_table(table, columns=schema.names)

13 changes: 1 addition & 12 deletions bigframes/core/compile/ibis_types.py
@@ -15,11 +15,9 @@

import textwrap
from typing import Any, cast, Dict, Iterable, Optional, Tuple, Union
import warnings

import bigframes_vendored.constants as constants
import bigframes_vendored.ibis.backends.bigquery.datatypes as third_party_ibis_bqtypes
import bigframes_vendored.ibis.expr.operations as vendored_ibis_ops
import geopandas as gpd # type: ignore
import google.cloud.bigquery as bigquery
import ibis
@@ -208,10 +206,6 @@ def ibis_value_to_canonical_type(value: ibis_types.Value) -> ibis_types.Value:
"""
ibis_type = value.type()
name = value.get_name()
if ibis_type.is_json():
value = vendored_ibis_ops.ToJsonString(value).to_expr()
value = value.case().when("null", ibis.null()).else_(value).end()
return value.name(name)
# Allow REQUIRED fields to be joined with NULLABLE fields.
nullable_type = ibis_type.copy(nullable=True)
return value.cast(nullable_type).name(name)
@@ -295,13 +289,8 @@ def ibis_dtype_to_bigframes_dtype(
if isinstance(ibis_dtype, ibis_dtypes.Integer):
return pd.Int64Dtype()

# Temporary: Will eventually support an explicit json type instead of casting to string.
if isinstance(ibis_dtype, ibis_dtypes.JSON):
warnings.warn(
"Interpreting JSON as string. This behavior may change in future versions.",
bigframes.exceptions.PreviewWarning,
)
return bigframes.dtypes.STRING_DTYPE
return bigframes.dtypes.JSON_DTYPE

if ibis_dtype in IBIS_TO_BIGFRAMES:
return IBIS_TO_BIGFRAMES[ibis_dtype]
35 changes: 11 additions & 24 deletions bigframes/core/compile/scalar_op_compiler.py
@@ -18,10 +18,10 @@
import typing

import bigframes_vendored.constants as constants
import bigframes_vendored.ibis.expr.operations as vendored_ibis_ops
import ibis
import ibis.common.exceptions
import ibis.expr.datatypes as ibis_dtypes
import ibis.expr.operations as ibis_ops
import ibis.expr.operations.generic
import ibis.expr.types as ibis_types
import numpy as np
@@ -1113,26 +1113,20 @@ def array_slice_op_impl(x: ibis_types.Value, op: ops.ArraySliceOp):
# JSON Ops
@scalar_op_compiler.register_binary_op(ops.JSONSet, pass_op=True)
def json_set_op_impl(x: ibis_types.Value, y: ibis_types.Value, op: ops.JSONSet):
if x.type().is_json():
return json_set(
json_obj=x,
json_path=op.json_path,
json_value=y,
).to_expr()
else:
# Enabling the JSON type eliminates the need for less efficient string conversions.
return vendored_ibis_ops.ToJsonString(
json_set(
json_obj=parse_json(x),
json_path=op.json_path,
json_value=y,
)
).to_expr()
return json_set(json_obj=x, json_path=op.json_path, json_value=y)
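
For context, a minimal sketch of how this op surfaces through `bigframes.bigquery` (usage modeled on this PR's system tests; the `json_path_value_pairs` keyword is assumed from the current public API):

import bigframes.bigquery as bbq
import bigframes.pandas as bpd

# With JSON columns now read as dbjson instead of STRING, JSONSet compiles
# straight to BigQuery's JSON_SET with no parse/serialize round-trip.
s = bpd.read_gbq("SELECT JSON '{\"a\": 1}' AS json_col")["json_col"]
result = bbq.json_set(s, json_path_value_pairs=[("$.a", 100)])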


@scalar_op_compiler.register_unary_op(ops.JSONExtract, pass_op=True)
def json_extract_op_impl(x: ibis_types.Value, op: ops.JSONExtract):
return json_extract(json_obj=x, json_path=op.json_path)
# Define a user-defined function whose return type dynamically matches the input.
def json_extract(json_or_json_string, json_path: ibis_dtypes.str): # type: ignore
"""Extracts a JSON value and converts it to a SQL JSON-formatted STRING or JSON value."""
...

return_type = x.type()
json_extract.__annotations__["return"] = return_type
json_extract_op = ibis_ops.udf.scalar.builtin(json_extract)
return json_extract_op(json_or_json_string=x, json_path=op.json_path)
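
A note on the pattern above: `ibis_ops.udf.scalar.builtin` takes the SQL signature from the wrapped function's annotations, so patching `__annotations__["return"]` before registration lets a single JSON_EXTRACT wrapper return STRING for STRING inputs and JSON for JSON inputs. A standalone sketch of the same trick (assuming the ibis version bigframes pins):

import ibis.expr.datatypes as ibis_dtypes
import ibis.expr.operations as ibis_ops

def json_extract(json_or_json_string, json_path: ibis_dtypes.str):  # type: ignore
    """The SQL function name is taken from the wrapped function's name."""
    ...

# Choose the return annotation per call site, e.g. from the input column type.
json_extract.__annotations__["return"] = ibis_dtypes.JSON(nullable=True)
json_extract_op = ibis_ops.udf.scalar.builtin(json_extract)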


@scalar_op_compiler.register_unary_op(ops.JSONExtractArray, pass_op=True)
@@ -1794,13 +1788,6 @@ def json_set(
"""Produces a new SQL JSON value with the specified JSON data inserted or replaced."""


@ibis.udf.scalar.builtin(name="json_extract")
def json_extract(
json_obj: ibis_dtypes.JSON, json_path: ibis_dtypes.str
) -> ibis_dtypes.JSON:
"""Extracts a JSON value and converts it to a SQL JSON-formatted STRING or JSON value."""


@ibis.udf.scalar.builtin(name="json_extract_array")
def json_extract_array(
json_obj: ibis_dtypes.JSON, json_path: ibis_dtypes.str
21 changes: 11 additions & 10 deletions bigframes/dtypes.py
@@ -21,6 +21,7 @@
from typing import Dict, Literal, Union

import bigframes_vendored.constants as constants
import db_dtypes # type: ignore
import geopandas as gpd # type: ignore
import google.cloud.bigquery
import numpy as np
@@ -55,6 +56,7 @@
BIGNUMERIC_DTYPE = pd.ArrowDtype(pa.decimal256(76, 38))
# No arrow equivalent
GEO_DTYPE = gpd.array.GeometryDtype()
JSON_DTYPE = db_dtypes.JSONDtype()

# Used when storing Null expressions
DEFAULT_DTYPE = FLOAT_DTYPE
@@ -187,6 +189,12 @@ class SimpleDtypeInfo:
logical_bytes=40,
clusterable=True,
),
# JSON has no corresponding arrow dtype
SimpleDtypeInfo(
dtype=JSON_DTYPE,
arrow_dtype=None,
type_kind=("JSON",),
),
)
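
Registering the dtype with `arrow_dtype=None` and `type_kind=("JSON",)` is what lets the `_TK_TO_BIGFRAMES` lookup built from `SIMPLE_TYPES` resolve BigQuery's JSON type kind directly, retiring the STRING remapping removed further down. A quick illustrative check (a sketch, assuming the pinned db-dtypes build):

import db_dtypes
import bigframes.dtypes as dtypes

# pandas ExtensionDtype instances compare equal by name, so plain equality
# works for schema assertions.
assert dtypes.JSON_DTYPE == db_dtypes.JSONDtype()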


@@ -271,8 +279,7 @@ def is_struct_like(type_: ExpressionType) -> bool:


def is_json_like(type_: ExpressionType) -> bool:
# TODO: Add JSON type support
return type_ == STRING_DTYPE
return type_ == STRING_DTYPE or type_ == JSON_DTYPE
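
During the migration both representations count as JSON-like, as a quick sketch shows:

import bigframes.dtypes as dtypes

# Legacy string-encoded JSON and the new dbjson extension dtype both qualify.
assert dtypes.is_json_like(dtypes.STRING_DTYPE)
assert dtypes.is_json_like(dtypes.JSON_DTYPE)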


def is_json_encoding_type(type_: ExpressionType) -> bool:
@@ -446,8 +453,6 @@ def infer_literal_arrow_type(literal) -> typing.Optional[pa.DataType]:
return bigframes_dtype_to_arrow_dtype(infer_literal_type(literal))


# Don't have dtype for json, so just end up interpreting as STRING
_REMAPPED_TYPEKINDS = {"JSON": "STRING"}
_TK_TO_BIGFRAMES = {
type_kind: mapping.dtype
for mapping in SIMPLE_TYPES
@@ -471,12 +476,8 @@ def convert_schema_field(
pa_struct = pa.struct(fields)
pa_type = pa.list_(pa_struct) if is_repeated else pa_struct
return field.name, pd.ArrowDtype(pa_type)
elif (
field.field_type in _TK_TO_BIGFRAMES or field.field_type in _REMAPPED_TYPEKINDS
):
singular_type = _TK_TO_BIGFRAMES[
_REMAPPED_TYPEKINDS.get(field.field_type, field.field_type)
]
elif field.field_type in _TK_TO_BIGFRAMES:
singular_type = _TK_TO_BIGFRAMES[field.field_type]
if is_repeated:
pa_type = pa.list_(bigframes_dtype_to_arrow_dtype(singular_type))
return field.name, pd.ArrowDtype(pa_type)
3 changes: 1 addition & 2 deletions bigframes/operations/__init__.py
@@ -871,8 +871,7 @@ def output_type(self, *input_types):
+ f"Received type: {right_type}"
)

# After JSON type implementation, ONLY return JSON data.
return left_type
return dtypes.JSON_DTYPE


# Ternary Ops
3 changes: 3 additions & 0 deletions bigframes/session/_io/pandas.py
@@ -16,6 +16,7 @@
from typing import Union

import bigframes_vendored.constants as constants
import db_dtypes # type: ignore
import geopandas # type: ignore
import pandas
import pandas.arrays
@@ -104,6 +105,8 @@ def arrow_to_pandas(
series = pandas.Series(pd_array, dtype=dtype)
elif isinstance(dtype, pandas.ArrowDtype):
series = _arrow_to_pandas_arrowdtype(column, dtype)
elif isinstance(dtype, db_dtypes.JSONDtype):
series = db_dtypes.JSONArray(column)
else:
series = column.to_pandas(types_mapper=lambda _: dtype)
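
The new branch hands the pyarrow column straight to `db_dtypes.JSONArray`. A standalone repro of what it produces (a sketch, assuming the git-pinned db-dtypes commit, whose `JSONArray` accepts a pyarrow chunked array exactly as the branch above does):

import db_dtypes
import pandas
import pyarrow

# JSON values arrive from BigQuery as arrow strings; wrapping them in
# JSONArray yields a Series carrying db_dtypes.JSONDtype().
column = pyarrow.chunked_array([pyarrow.array(['{"a": 1}', "[1, 2]", None])])
series = pandas.Series(db_dtypes.JSONArray(column))
assert series.dtype == db_dtypes.JSONDtype()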

6 changes: 4 additions & 2 deletions setup.py
@@ -62,8 +62,10 @@
"sqlglot >=23.6.3,<25.2",
"tabulate >= 0.9",
"ipywidgets >=7.7.1",
"humanize >= 4.6.0",
"matplotlib >= 3.7.1",
"humanize >=4.6.0",
"matplotlib >=3.7.1",
# "db-dtypes >=1.3.0",
"db-dtypes @ git+https://github.com/googleapis/python-db-dtypes-pandas.git@4b84e4a6fada5ecfa7f910dca61e6de714abdb9d",
]
extras = {
# Optional test dependencies packages. If they're missed, may skip some tests.
1 change: 1 addition & 0 deletions testing/constraints-3.9.txt
@@ -26,5 +26,6 @@ tabulate==0.9
ipywidgets==7.7.1
humanize==4.6.0
matplotlib==3.7.1
db-dtypes @ git+https://github.com/googleapis/python-db-dtypes-pandas.git@4b84e4a6fada5ecfa7f910dca61e6de714abdb9d
# extras
pandas-gbq==0.19.0
23 changes: 17 additions & 6 deletions tests/system/small/bigquery/test_json.py
@@ -118,7 +118,6 @@ def test_json_set_w_invalid_series_type():
def test_json_extract_from_json():
s = _get_series_from_json([{"a": {"b": [1, 2]}}, {"a": {"c": 1}}, {"a": {"b": 0}}])
actual = bbq.json_extract(s, "$.a.b").to_pandas()
# After the introduction of the JSON type, the output should be a JSON-formatted series.
expected = _get_series_from_json([[1, 2], None, 0]).to_pandas()
pd.testing.assert_series_equal(
actual,
@@ -129,11 +128,10 @@ def test_json_extract_from_json():
def test_json_extract_from_string():
s = bpd.Series(['{"a": {"b": [1, 2]}}', '{"a": {"c": 1}}', '{"a": {"b": 0}}'])
actual = bbq.json_extract(s, "$.a.b")
expected = _get_series_from_json([[1, 2], None, 0])
expected = bpd.Series(["[1,2]", None, "0"])
pd.testing.assert_series_equal(
actual.to_pandas(),
expected.to_pandas(),
check_names=False,
)


@@ -142,17 +140,30 @@ def test_json_extract_w_invalid_series_type():
bbq.json_extract(bpd.Series([1, 2]), "$.a")


# The repeated JSON dtype is not supported because `dbjson` is not an arrow extension dtype.
# def test_json_extract_array_from_json():
# s = _get_series_from_json([{"a": ["ab", "2", "3 xy"]}, {"a": []}, {"a": ["4","5"]}, {}])
# actual = bbq.json_extract_array(s, "$.a")
# expected = _get_series_from_json([["ab", "2", "3 xy"], [], ["4","5"], None])
# pd.testing.assert_series_equal(
# actual.to_pandas(),
# expected.to_pandas(),
# )


def test_json_extract_array_from_json_strings():
s = bpd.Series(['{"a": ["ab", "2", "3 xy"]}', '{"a": []}', '{"a": ["4","5"]}'])
s = bpd.Series(
['{"a": ["ab", "2", "3 xy"]}', '{"a": []}', '{"a": ["4","5"]}', "{}"]
)
actual = bbq.json_extract_array(s, "$.a")
expected = bpd.Series([['"ab"', '"2"', '"3 xy"'], [], ['"4"', '"5"']])
expected = bpd.Series([['"ab"', '"2"', '"3 xy"'], [], ['"4"', '"5"'], None])
pd.testing.assert_series_equal(
actual.to_pandas(),
expected.to_pandas(),
)


def test_json_extract_array_from_array_strings():
def test_json_extract_array_from_json_array_strings():
s = bpd.Series(["[1, 2, 3]", "[]", "[4,5]"])
actual = bbq.json_extract_array(s)
expected = bpd.Series([["1", "2", "3"], [], ["4", "5"]])
25 changes: 12 additions & 13 deletions tests/system/small/test_dataframe_io.py
@@ -14,6 +14,7 @@

from typing import Tuple

import db_dtypes # type:ignore
import google.api_core.exceptions
import pandas as pd
import pandas.testing
@@ -248,22 +249,20 @@


def test_load_json(session):
df = session.read_gbq(
"""SELECT
JSON_OBJECT('foo', 10, 'bar', TRUE) AS json_column
"""
)
df = session.read_gbq("SELECT JSON_OBJECT('foo', 10, 'bar', TRUE) AS json_column")
assert df.dtypes["json_column"] == db_dtypes.JSONDtype()
assert isinstance(df["json_column"][0], dict)
assert df["json_column"][0]["bar"]
assert df["json_column"][0]["foo"] == 10

result = df.to_pandas()
expected = pd.DataFrame(
{
"json_column": ['{"bar":true,"foo":10}'],
},
dtype=pd.StringDtype(storage="pyarrow"),
pd_df = pd.DataFrame(
{"json_column": [{"bar": True, "foo": 10}]},
dtype=db_dtypes.JSONDtype(),
)
expected.index = expected.index.astype("Int64")
pd.testing.assert_series_equal(result.dtypes, expected.dtypes)
pd.testing.assert_series_equal(result["json_column"], expected["json_column"])
pd_df.index = pd_df.index.astype("Int64")
pd.testing.assert_series_equal(result.dtypes, pd_df.dtypes)
pd.testing.assert_series_equal(result["json_column"], pd_df["json_column"])


def test_to_pandas_batches_w_correct_dtypes(scalars_df_default_index):
3 changes: 2 additions & 1 deletion tests/system/small/test_series.py
@@ -17,6 +17,7 @@
import re
import tempfile

import db_dtypes # type: ignore
import geopandas as gpd # type: ignore
import numpy
from packaging.version import Version
@@ -281,7 +282,7 @@ def test_get_column(scalars_dfs, col_name, expected_dtype):
def test_get_column_w_json(json_df, json_pandas_df):
series = json_df["json_col"]
series_pandas = series.to_pandas()
assert series.dtype == pd.StringDtype(storage="pyarrow")
assert series.dtype == db_dtypes.JSONDtype()
assert series_pandas.shape[0] == json_pandas_df.shape[0]

