Skip to content

Commit

Permalink
GH-44066: [Python] Add Python wrapper for JsonExtensionType (#44070)
Browse files Browse the repository at this point in the history
### Rationale for this change

We [added canonical JsonExtensionType](#13901) and we should make it usable from Python.

### What changes are included in this PR?

Python wrappers for `JsonExtensionType` and `JsonArray` are added on the Python side, and `JsonArray` is added on the C++ side.

### Are these changes tested?

Python tests for the extension type and array are included.

### Are there any user-facing changes?

This adds the canonical JSON extension type to pyarrow.
* GitHub Issue: #44066

Lead-authored-by: Rok Mihevc <[email protected]>
Co-authored-by: Joris Van den Bossche <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
  • Loading branch information
rok and jorisvandenbossche authored Oct 22, 2024
1 parent 817ccbc commit bcb4653
Show file tree
Hide file tree
Showing 10 changed files with 193 additions and 4 deletions.
8 changes: 4 additions & 4 deletions python/pyarrow/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,7 @@ def print_entry(label, value):
union, sparse_union, dense_union,
dictionary,
run_end_encoded,
bool8, fixed_shape_tensor, opaque, uuid,
bool8, fixed_shape_tensor, json_, opaque, uuid,
field,
type_for_alias,
DataType, DictionaryType, StructType,
Expand All @@ -183,7 +183,7 @@ def print_entry(label, value):
FixedSizeBinaryType, Decimal128Type, Decimal256Type,
BaseExtensionType, ExtensionType,
RunEndEncodedType, Bool8Type, FixedShapeTensorType,
OpaqueType, UuidType,
JsonType, OpaqueType, UuidType,
PyExtensionType, UnknownExtensionType,
register_extension_type, unregister_extension_type,
DictionaryMemo,
Expand Down Expand Up @@ -218,7 +218,7 @@ def print_entry(label, value):
MonthDayNanoIntervalArray,
Decimal128Array, Decimal256Array, StructArray, ExtensionArray,
RunEndEncodedArray, Bool8Array, FixedShapeTensorArray,
OpaqueArray, UuidArray,
JsonArray, OpaqueArray, UuidArray,
scalar, NA, _NULL as NULL, Scalar,
NullScalar, BooleanScalar,
Int8Scalar, Int16Scalar, Int32Scalar, Int64Scalar,
Expand All @@ -236,7 +236,7 @@ def print_entry(label, value):
FixedSizeBinaryScalar, DictionaryScalar,
MapScalar, StructScalar, UnionScalar,
RunEndEncodedScalar, Bool8Scalar, ExtensionScalar,
FixedShapeTensorScalar, OpaqueScalar, UuidScalar)
FixedShapeTensorScalar, JsonScalar, OpaqueScalar, UuidScalar)

# Buffers, allocation
from pyarrow.lib import (DeviceAllocationType, Device, MemoryManager,
Expand Down
27 changes: 27 additions & 0 deletions python/pyarrow/array.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -4344,6 +4344,33 @@ cdef class ExtensionArray(Array):
return result


class JsonArray(ExtensionArray):
    """
    Concrete class for Arrow arrays of JSON data type.

    This does not guarantee that the JSON data actually
    is valid JSON.

    Examples
    --------
    Define the extension type for JSON array

    >>> import pyarrow as pa
    >>> json_type = pa.json_(pa.large_utf8())

    Create an extension array

    >>> arr = [None, '{ "id":30, "values":["a", "b"] }']
    >>> storage = pa.array(arr, pa.large_utf8())
    >>> pa.ExtensionArray.from_storage(json_type, storage)
    <pyarrow.lib.JsonArray object at ...>
    [
      null,
      "{ "id":30, "values":["a", "b"] }"
    ]
    """


class UuidArray(ExtensionArray):
"""
Concrete class for Arrow arrays of UUID data type.
Expand Down
7 changes: 7 additions & 0 deletions python/pyarrow/includes/libarrow.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -2871,6 +2871,13 @@ cdef extern from "arrow/extension_type.h" namespace "arrow":
shared_ptr[CArray] storage()


# Cython declaration of the C++ class arrow::extension::JsonExtensionType,
# exposed to Cython code under the name CJsonType.
cdef extern from "arrow/extension/json.h" namespace "arrow::extension" nogil:
    cdef cppclass CJsonType" arrow::extension::JsonExtensionType"(CExtensionType):

        # Static factory: takes the storage data type and returns a CResult
        # wrapping the resulting data type (or an error status).
        @staticmethod
        CResult[shared_ptr[CDataType]] Make(shared_ptr[CDataType]& storage_type)


cdef extern from "arrow/extension/uuid.h" namespace "arrow::extension" nogil:
cdef cppclass CUuidType" arrow::extension::UuidType"(CExtensionType):

Expand Down
5 changes: 5 additions & 0 deletions python/pyarrow/lib.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -226,6 +226,11 @@ cdef class UuidType(BaseExtensionType):
cdef:
const CUuidType* uuid_ext_type

# Declaration of the Python-level wrapper class for the JSON extension type.
cdef class JsonType(BaseExtensionType):
    cdef:
        # Raw pointer to the wrapped C++ JSON extension type object.
        const CJsonType* json_ext_type


cdef class PyExtensionType(ExtensionType):
pass

Expand Down
2 changes: 2 additions & 0 deletions python/pyarrow/public-api.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,8 @@ cdef api object pyarrow_wrap_data_type(
out = OpaqueType.__new__(OpaqueType)
elif extension_name == b"arrow.uuid":
out = UuidType.__new__(UuidType)
elif extension_name == b"arrow.json":
out = JsonType.__new__(JsonType)
else:
out = BaseExtensionType.__new__(BaseExtensionType)
else:
Expand Down
6 changes: 6 additions & 0 deletions python/pyarrow/scalar.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -1044,6 +1044,12 @@ cdef class ExtensionScalar(Scalar):
return pyarrow_wrap_scalar(<shared_ptr[CScalar]> sp_scalar)


class JsonScalar(ExtensionScalar):
    """
    Concrete class for JSON extension scalar.

    This is the scalar class reported by
    ``JsonType.__arrow_ext_scalar_class__``.
    """


class UuidScalar(ExtensionScalar):
"""
Concrete class for Uuid extension scalar.
Expand Down
11 changes: 11 additions & 0 deletions python/pyarrow/tests/parquet/test_data_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -510,3 +510,14 @@ def test_large_binary_overflow():
pa.ArrowInvalid,
match="Parquet cannot store strings with size 2GB or more"):
_write_table(table, writer, use_dictionary=use_dictionary)


@pytest.mark.parametrize("storage_type", (
    pa.string(), pa.large_string()))
def test_json_extension_type(storage_type):
    """Round-trip a single JSON extension column (including a null)
    via ``_simple_table_roundtrip``."""
    json_values = ['{"a": 1}', '{"b": 2}', None]
    json_column = pa.array(json_values, type=pa.json_(storage_type))

    _simple_table_roundtrip(pa.table([json_column], names=["ext"]))
53 changes: 53 additions & 0 deletions python/pyarrow/tests/test_extension_type.py
Original file line number Diff line number Diff line change
Expand Up @@ -1926,3 +1926,56 @@ def test_bool8_scalar():
assert pa.scalar(1, type=pa.bool8()).as_py() is True
assert pa.scalar(2, type=pa.bool8()).as_py() is True
assert pa.scalar(None, type=pa.bool8()).as_py() is None


@pytest.mark.parametrize("storage_type", (
    pa.string(), pa.large_string(), pa.string_view()))
def test_json(storage_type, pickle_module):
    """Exercise the JSON extension type for every supported storage type.

    Covers type construction and equality, array and scalar classes,
    pickling, IPC round-trip, casting to and from the storage type, and
    rejection of unsupported storage types.
    """
    data = ['{"a": 1}', '{"b": 2}', None]
    json_type = pa.json_(storage_type)
    storage = pa.array(data, type=storage_type)
    array = pa.array(data, type=json_type)
    json_arr_class = json_type.__arrow_ext_class__()

    assert pa.json_() == pa.json_(pa.utf8())
    assert json_type.extension_name == "arrow.json"
    assert json_type.storage_type == storage_type
    assert json_type.__class__ is pa.JsonType

    assert json_type == pa.json_(storage_type)
    assert json_type != storage_type

    assert isinstance(array, pa.JsonArray)
    assert isinstance(array, json_arr_class)

    assert array.to_pylist() == data
    assert array[0].as_py() == data[0]
    assert array[2].as_py() is None

    # Pickle roundtrip
    result = pickle_module.loads(pickle_module.dumps(json_type))
    assert result == json_type

    # IPC roundtrip
    buf = ipc_write_batch(pa.RecordBatch.from_arrays([array], ["ext"]))
    batch = ipc_read_batch(buf)
    reconstructed_array = batch.column(0)
    assert reconstructed_array.type == json_type
    assert reconstructed_array == array
    # Check the array that actually went through IPC, not the input array:
    # the extension array class must survive the roundtrip as well.
    assert isinstance(reconstructed_array, json_arr_class)

    assert json_type.__arrow_ext_scalar_class__() == pa.JsonScalar
    assert isinstance(array[0], pa.JsonScalar)

    # cast storage -> extension type
    result = storage.cast(json_type)
    assert result == array

    # cast extension type -> storage type
    inner = array.cast(storage_type)
    assert inner == storage

    # Unsupported storage types must be rejected. A separate loop variable
    # is used so the parametrized ``storage_type`` argument is not rebound.
    for invalid_type in (pa.int32(), pa.large_binary(), pa.float32()):
        msg = f"Invalid storage type for JsonExtensionType: {invalid_type}"
        with pytest.raises(pa.ArrowInvalid, match=msg):
            pa.json_(invalid_type)
3 changes: 3 additions & 0 deletions python/pyarrow/tests/test_misc.py
Original file line number Diff line number Diff line change
Expand Up @@ -253,6 +253,9 @@ def test_set_timezone_db_path_non_windows():
pa.Bool8Array,
pa.Bool8Scalar,
pa.Bool8Type,
pa.JsonArray,
pa.JsonScalar,
pa.JsonType,
])
def test_extension_type_constructor_errors(klass):
# ARROW-2638: prevent calling extension class constructors directly
Expand Down
75 changes: 75 additions & 0 deletions python/pyarrow/types.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -1812,6 +1812,43 @@ cdef class ExtensionType(BaseExtensionType):
return ExtensionScalar


cdef class JsonType(BaseExtensionType):
    """
    Concrete class for JSON extension type.

    Instances are created with the :func:`pyarrow.json_` factory function.

    Examples
    --------
    Define the extension type for JSON array

    >>> import pyarrow as pa
    >>> json_type = pa.json_(pa.large_utf8())

    Create an extension array

    >>> arr = [None, '{ "id":30, "values":["a", "b"] }']
    >>> storage = pa.array(arr, pa.large_utf8())
    >>> pa.ExtensionArray.from_storage(json_type, storage)
    <pyarrow.lib.JsonArray object at ...>
    [
      null,
      "{ "id":30, "values":["a", "b"] }"
    ]
    """

    cdef void init(self, const shared_ptr[CDataType]& type) except *:
        # Let the base class set itself up first, then keep a typed
        # pointer to the same underlying C++ object.
        BaseExtensionType.init(self, type)
        self.json_ext_type = <const CJsonType*> type.get()

    def __arrow_ext_class__(self):
        # Array class used for arrays of this extension type.
        return JsonArray

    def __reduce__(self):
        # Pickle support: rebuild through the json_() factory using the
        # storage type as its only argument.
        factory_args = (self.storage_type,)
        return json_, factory_args

    def __arrow_ext_scalar_class__(self):
        # Scalar class used for scalars of this extension type.
        return JsonScalar


cdef class UuidType(BaseExtensionType):
"""
Concrete class for UUID extension type.
Expand Down Expand Up @@ -5296,6 +5333,44 @@ def run_end_encoded(run_end_type, value_type):
return pyarrow_wrap_data_type(ree_type)


def json_(DataType storage_type not None=utf8()):
    """
    Create instance of JSON extension type.

    Parameters
    ----------
    storage_type : DataType, default pyarrow.utf8()
        The underlying data type. Can be one of the following types:
        string, large_string, string_view. Must not be None.

    Returns
    -------
    type : JsonType

    Raises
    ------
    TypeError
        If ``storage_type`` is None or not a DataType.
    ArrowInvalid
        If ``storage_type`` is not a valid storage type for the
        JSON extension type.

    Examples
    --------
    Create an instance of JSON extension type:

    >>> import pyarrow as pa
    >>> pa.json_(pa.utf8())
    JsonType(extension<arrow.json>)

    Use the JSON type to create an array:

    >>> pa.array(['{"a": 1}', '{"b": 2}'], type=pa.json_(pa.utf8()))
    <pyarrow.lib.JsonArray object at ...>
    [
      "{"a": 1}",
      "{"b": 2}"
    ]
    """

    # ``not None`` in the signature makes Cython reject None up front with a
    # TypeError; otherwise ``storage_type.sp_type`` below would read a C
    # attribute off the None object.
    cdef JsonType out = JsonType.__new__(JsonType)
    # Make() validates the storage type; GetResultValue raises on failure.
    c_json_ext_type = GetResultValue(CJsonType.Make(storage_type.sp_type))
    out.init(c_json_ext_type)
    return out


def uuid():
"""
Create UuidType instance.
Expand Down

0 comments on commit bcb4653

Please sign in to comment.