From 86210e84de931695c625e13d9247f9479f21e301 Mon Sep 17 00:00:00 2001 From: Joel Lubinitsky Date: Thu, 1 Aug 2024 18:38:35 -0500 Subject: [PATCH] add python bindings --- python/pyarrow/__init__.py | 7 ++- python/pyarrow/array.pxi | 24 ++++++++ python/pyarrow/includes/libarrow.pxd | 11 ++++ python/pyarrow/lib.pxd | 3 + python/pyarrow/public-api.pxi | 2 + python/pyarrow/scalar.pxi | 4 ++ python/pyarrow/tests/test_extension_type.py | 58 ++++++++++++++++++++ python/pyarrow/tests/test_misc.py | 3 + python/pyarrow/types.pxi | 61 +++++++++++++++++++++ 9 files changed, 170 insertions(+), 3 deletions(-) diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py index e52e0d242bee5..50371d59866cf 100644 --- a/python/pyarrow/__init__.py +++ b/python/pyarrow/__init__.py @@ -173,6 +173,7 @@ def print_entry(label, value): dictionary, run_end_encoded, fixed_shape_tensor, + bool8, field, type_for_alias, DataType, DictionaryType, StructType, @@ -182,7 +183,7 @@ def print_entry(label, value): TimestampType, Time32Type, Time64Type, DurationType, FixedSizeBinaryType, Decimal128Type, Decimal256Type, BaseExtensionType, ExtensionType, - RunEndEncodedType, FixedShapeTensorType, + RunEndEncodedType, FixedShapeTensorType, Bool8Type, PyExtensionType, UnknownExtensionType, register_extension_type, unregister_extension_type, DictionaryMemo, @@ -216,7 +217,7 @@ def print_entry(label, value): Time32Array, Time64Array, DurationArray, MonthDayNanoIntervalArray, Decimal128Array, Decimal256Array, StructArray, ExtensionArray, - RunEndEncodedArray, FixedShapeTensorArray, + RunEndEncodedArray, FixedShapeTensorArray, Bool8Array, scalar, NA, _NULL as NULL, Scalar, NullScalar, BooleanScalar, Int8Scalar, Int16Scalar, Int32Scalar, Int64Scalar, @@ -233,7 +234,7 @@ def print_entry(label, value): StringScalar, LargeStringScalar, StringViewScalar, FixedSizeBinaryScalar, DictionaryScalar, MapScalar, StructScalar, UnionScalar, - RunEndEncodedScalar, ExtensionScalar) + RunEndEncodedScalar, 
ExtensionScalar, Bool8Scalar) # Buffers, allocation from pyarrow.lib import (DeviceAllocationType, Device, MemoryManager, diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 997f208a5dec4..744da9c8c445f 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -4447,6 +4447,30 @@ cdef class FixedShapeTensorArray(ExtensionArray): FixedSizeListArray.from_arrays(values, shape[1:].prod()) ) +cdef class Bool8Array(ExtensionArray): + """ + Concrete class for bool8 extension arrays. + Examples + -------- + Define the extension type for a bool8 array + >>> import pyarrow as pa + >>> bool8_type = pa.bool8() + Create an extension array + >>> arr = [-1, 0, 1, 2, None] + >>> storage = pa.array(arr, pa.int8()) + >>> pa.ExtensionArray.from_storage(bool8_type, storage) + <pyarrow.lib.Bool8Array object at ...> + [ + -1, + 0, + 1, + 2, + null + ] + """ + + def to_numpy(self): + return self.storage.to_numpy().view(np.bool_) cdef dict _array_classes = { _Type_NA: NullArray, diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 0d871f411b11b..b623317d9a6f8 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -2882,6 +2882,17 @@ cdef extern from "arrow/extension/fixed_shape_tensor.h" namespace "arrow::extens " arrow::extension::FixedShapeTensorArray"(CExtensionArray): const CResult[shared_ptr[CTensor]] ToTensor() const +cdef extern from "arrow/extension/bool8.h" namespace "arrow::extension" nogil: + cdef cppclass CBool8Type \ + " arrow::extension::Bool8Type"(CExtensionType): + + @staticmethod + CResult[shared_ptr[CDataType]] Make() + + cdef cppclass CBool8Array \ + " arrow::extension::Bool8Array"(CExtensionArray): + pass + cdef extern from "arrow/util/compression.h" namespace "arrow" nogil: cdef enum CCompressionType" arrow::Compression::type": CCompressionType_UNCOMPRESSED" arrow::Compression::UNCOMPRESSED" diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd index 082d8470cdbb0..c7aa4e41ea541 
100644 --- a/python/pyarrow/lib.pxd +++ b/python/pyarrow/lib.pxd @@ -214,6 +214,9 @@ cdef class FixedShapeTensorType(BaseExtensionType): cdef: const CFixedShapeTensorType* tensor_ext_type +cdef class Bool8Type(BaseExtensionType): + cdef: + const CBool8Type* bool8_ext_type cdef class PyExtensionType(ExtensionType): pass diff --git a/python/pyarrow/public-api.pxi b/python/pyarrow/public-api.pxi index 966273b4bea84..2addf33e4b77d 100644 --- a/python/pyarrow/public-api.pxi +++ b/python/pyarrow/public-api.pxi @@ -124,6 +124,8 @@ cdef api object pyarrow_wrap_data_type( return cpy_ext_type.GetInstance() elif ext_type.extension_name() == b"arrow.fixed_shape_tensor": out = FixedShapeTensorType.__new__(FixedShapeTensorType) + elif ext_type.extension_name() == b"arrow.bool8": + out = Bool8Type.__new__(Bool8Type) else: out = BaseExtensionType.__new__(BaseExtensionType) else: diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi index 41bfde39adb6f..8c59339f22528 100644 --- a/python/pyarrow/scalar.pxi +++ b/python/pyarrow/scalar.pxi @@ -1084,6 +1084,10 @@ cdef class FixedShapeTensorScalar(ExtensionScalar): ctensor = GetResultValue(c_type.MakeTensor(scalar)) return pyarrow_wrap_tensor(ctensor) +cdef class Bool8Scalar(ExtensionScalar): + """ + Concrete class for bool8 extension scalar. 
+ """ cdef dict _scalar_classes = { _Type_BOOL: BooleanScalar, diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py index 1c4d0175a2d97..186cfe46e4c4b 100644 --- a/python/pyarrow/tests/test_extension_type.py +++ b/python/pyarrow/tests/test_extension_type.py @@ -1661,3 +1661,61 @@ def test_legacy_int_type(): batch = ipc_read_batch(buf) assert isinstance(batch.column(0).type, LegacyIntType) assert batch.column(0) == ext_arr + +def test_bool8_type(pickle_module): + bool8_type = pa.bool8() + storage_type = pa.int8() + assert bool8_type.extension_name == "arrow.bool8" + assert bool8_type.storage_type == storage_type + assert str(bool8_type) == "extension" + + assert bool8_type == bool8_type + assert bool8_type == pa.bool8() + assert bool8_type != storage_type + + # Pickle roundtrip + result = pickle_module.loads(pickle_module.dumps(bool8_type)) + assert result == bool8_type + + # IPC roundtrip + bool8_arr_class = bool8_type.__arrow_ext_class__() + storage = pa.array([-1, 0, 1, 2, None], storage_type) + arr = pa.ExtensionArray.from_storage(bool8_type, storage) + assert isinstance(arr, bool8_arr_class) + + with registered_extension_type(bool8_type): + buf = ipc_write_batch(pa.RecordBatch.from_arrays([arr], ["ext"])) + batch = ipc_read_batch(buf) + + assert batch.column(0).type.extension_name == "arrow.bool8" + assert isinstance(batch.column(0), bool8_arr_class) + + # cast storage -> extension type + result = storage.cast(bool8_type) + assert result == arr + + # cast extension type -> storage type + inner = arr.cast(storage_type) + assert inner == storage + + # cast extension type -> arrow boolean type + bool_type = pa.bool_() + arrow_bool_arr = pa.array([True, False, True, True, None], bool_type) + cast_bool_arr = arr.cast(bool_type) + assert cast_bool_arr == arrow_bool_arr + + # cast arrow boolean type -> extension type, expecting canonical values + cast_bool8_arr = arrow_bool_arr.cast(bool8_type) + canonical_storage = 
pa.array([1, 0, 1, 1, None], storage_type) + canonical_bool8_arr = pa.ExtensionArray.from_storage(bool8_type, canonical_storage) + assert cast_bool8_arr == canonical_bool8_arr + + # zero-copy convert to numpy if non-null + with pytest.raises(pa.ArrowInvalid, match="Needed to copy 1 chunks with 1 nulls, but zero_copy_only was True"): + arr.to_numpy() + + arr_np_bool = np.array([True, False, True, True], dtype=np.bool_) + arr_no_nulls = pa.ExtensionArray.from_storage(bool8_type, pa.array([-1, 0, 1, 2], storage_type)) + arr_to_np = arr_no_nulls.to_numpy() + assert np.array_equal(arr_to_np, arr_np_bool) + assert arr_to_np.ctypes.data == arr_no_nulls.buffers()[1].address # zero-copy diff --git a/python/pyarrow/tests/test_misc.py b/python/pyarrow/tests/test_misc.py index c42e4fbdfc2e8..031bc4dda4317 100644 --- a/python/pyarrow/tests/test_misc.py +++ b/python/pyarrow/tests/test_misc.py @@ -247,6 +247,9 @@ def test_set_timezone_db_path_non_windows(): pa.ProxyMemoryPool, pa.Device, pa.MemoryManager, + pa.Bool8Array, + pa.Bool8Scalar, + pa.Bool8Type, ]) def test_extension_type_constructor_errors(klass): # ARROW-2638: prevent calling extension class constructors directly diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index 4343d7ea300b0..a16ddfdb4caea 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -1809,6 +1809,32 @@ cdef class FixedShapeTensorType(BaseExtensionType): def __arrow_ext_scalar_class__(self): return FixedShapeTensorScalar +cdef class Bool8Type(BaseExtensionType): + """ + Concrete class for bool8 extension type. + Bool8 is an alternate representation for boolean + arrays using 8 bits instead of 1 bit per value. The underlying + storage type is int8. 
+ Examples + -------- + Create an instance of bool8 extension type: + >>> import pyarrow as pa + >>> pa.bool8() + Bool8Type(extension<arrow.bool8>) + """ + + cdef void init(self, const shared_ptr[CDataType]& type) except *: + BaseExtensionType.init(self, type) + self.bool8_ext_type = type.get() + + def __arrow_ext_class__(self): + return Bool8Array + + def __reduce__(self): + return bool8, () + + def __arrow_ext_scalar_class__(self): + return Bool8Scalar _py_extension_type_auto_load = False @@ -5206,6 +5232,41 @@ def fixed_shape_tensor(DataType value_type, shape, dim_names=None, permutation=N return out +def bool8(): + """ + Create an instance of bool8 extension type. + Examples + -------- + Create an instance of bool8 extension type: + >>> import pyarrow as pa + >>> type = pa.bool8() + >>> type + Bool8Type(extension<arrow.bool8>) + Inspect the data type: + >>> type.storage_type + DataType(int8) + Create a table with a bool8 array: + >>> arr = [-1, 0, 1, 2, None] + >>> storage = pa.array(arr, pa.int8()) + >>> other = pa.ExtensionArray.from_storage(type, storage) + >>> pa.table([other], names=["unknown_col"]) + pyarrow.Table + unknown_col: extension<arrow.bool8> + ---- + unknown_col: [[-1,0,1,2,null]] + Returns + ------- + type : Bool8Type + """ + + cdef Bool8Type out = Bool8Type.__new__(Bool8Type) + + with nogil: + c_type = GetResultValue(CBool8Type.Make()) + + out.init(c_type) + + return out cdef dict _type_aliases = { 'null': null,