From 0d4badbc9f3afac5a372c6eb7c4c150c339235a6 Mon Sep 17 00:00:00 2001 From: Kevin Wilson <khwilson@gmail.com> Date: Tue, 17 Sep 2024 13:58:47 -0400 Subject: [PATCH] GH-43809: [Docs] Update extension type examples to not use UUID (#44120) ### Rationale for this change UUID extension types were made canonical in #41299 and are getting native support in C++ and Python in #37298. As such, it makes sense to provide an alternative user-defined extension type as an example that is unlikely to become a canonical extension type anytime soon. After discussion in #43809, we determined a `RationalType` would make sense. Please note that this is a redo of #43849 as I made a blunder and accidentally pushed a branch that was in a wonky state. ### What changes are included in this PR? A change in several doc locations which reference a `UuidType` extension type have been changed to a `RationalType`. For consistency, this PR also changes single quotes (`''`) to double quotes (`""`) throughout the Python examples that it modifies. Also, seemingly unrelated to this change, some doctests began failing as numpy changed the `repr` of `float16`'s between 1.x and 2.x. We have updated the failing doctest so that it supports both styles. ### Are these changes tested? These are documentation changes and `archery docker run conda-python-docs` succeeds locally. ### Are there any user-facing changes? No. cc @ianmcook @rok * GitHub Issue: #43809 --------- Co-authored-by: Ian Cook <ianmcook@gmail.com> --- docs/source/format/Columnar.rst | 12 +- docs/source/format/Integration.rst | 31 +++- docs/source/python/extending_types.rst | 189 +++++++++++++++--------- python/pyarrow/types.pxi | 194 ++++++++++++++++--------- 4 files changed, 281 insertions(+), 145 deletions(-) diff --git a/docs/source/format/Columnar.rst b/docs/source/format/Columnar.rst index 4bd937d760d59..697c39b0cb1d9 100644 --- a/docs/source/format/Columnar.rst +++ b/docs/source/format/Columnar.rst @@ -1596,12 +1596,12 @@ structure. These extension keys are: they should not be used for third-party extension types. This extension metadata can annotate any of the built-in Arrow logical -types. The intent is that an implementation that does not support an -extension type can still handle the underlying data. For example a -16-byte UUID value could be embedded in ``FixedSizeBinary(16)``, and -implementations that do not have this extension type can still work -with the underlying binary values and pass along the -``custom_metadata`` in subsequent Arrow protocol messages. +types. For example, Arrow specifies a canonical extension type that +represents a UUID as a ``FixedSizeBinary(16)``. Arrow implementations are +not required to support canonical extensions, so an implementation that +does not support this UUID type will simply interpret it as a +``FixedSizeBinary(16)`` and pass along the ``custom_metadata`` in +subsequent Arrow protocol messages. Extension types may or may not use the ``'ARROW:extension:metadata'`` field. Let's consider some example diff --git a/docs/source/format/Integration.rst b/docs/source/format/Integration.rst index 0ab5b832ad012..f76aa0fefcf27 100644 --- a/docs/source/format/Integration.rst +++ b/docs/source/format/Integration.rst @@ -390,20 +390,37 @@ but can be of any type. Extension types are, as in the IPC format, represented as their underlying storage type plus some dedicated field metadata to reconstruct the extension -type. For example, assuming a "uuid" extension type backed by a -FixedSizeBinary(16) storage, here is how a "uuid" field would be represented:: +type. For example, assuming a "rational" extension type backed by a +``struct<numer: int32, denom: int32>`` storage, here is how a "rational" field +would be represented:: { "name" : "name_of_the_field", "nullable" : /* boolean */, "type" : { - "name" : "fixedsizebinary", - "byteWidth" : 16 + "name" : "struct" }, - "children" : [], + "children" : [ + { + "name": "numer", + "type": { + "name": "int", + "bitWidth": 32, + "isSigned": true + } + }, + { + "name": "denom", + "type": { + "name": "int", + "bitWidth": 32, + "isSigned": true + } + } + ], "metadata" : [ - {"key": "ARROW:extension:name", "value": "uuid"}, - {"key": "ARROW:extension:metadata", "value": "uuid-serialized"} + {"key": "ARROW:extension:name", "value": "rational"}, + {"key": "ARROW:extension:metadata", "value": "rational-serialized"} ] } diff --git a/docs/source/python/extending_types.rst b/docs/source/python/extending_types.rst index d746505348157..264cc8dca77e1 100644 --- a/docs/source/python/extending_types.rst +++ b/docs/source/python/extending_types.rst @@ -116,73 +116,103 @@ a :class:`~pyarrow.Array` or a :class:`~pyarrow.ChunkedArray`. Defining extension types ("user-defined types") ----------------------------------------------- -Arrow has the notion of extension types in the metadata specification as a -possibility to extend the built-in types. This is done by annotating any of the -built-in Arrow data types (the "storage type") with a custom type name and -optional serialized representation ("ARROW:extension:name" and -"ARROW:extension:metadata" keys in the Field’s custom_metadata of an IPC -message). -See the :ref:`format_metadata_extension_types` section of the metadata -specification for more details. - -Pyarrow allows you to define such extension types from Python by subclassing -:class:`ExtensionType` and giving the derived class its own extension name -and serialization mechanism. The extension name and serialized metadata -can potentially be recognized by other (non-Python) Arrow implementations +Arrow affords a notion of extension types which allow users to annotate data +types with additional semantics. This allows developers both to +specify custom serialization and deserialization routines (for example, +to :ref:`Python scalars <custom-scalar-conversion>` and +:ref:`pandas <conversion-to-pandas>`) and to more easily interpret data. + +In Arrow, :ref:`extension types <format_metadata_extension_types>` +are specified by annotating any of the built-in Arrow data types +(the "storage type") with a custom type name and, optionally, a +bytestring that can be used to provide additional metadata (referred to as +"parameters" in this documentation). These appear as the +``ARROW:extension:name`` and ``ARROW:extension:metadata`` keys in the +Field's ``custom_metadata``. + +Note that since these annotations are part of the Arrow specification, +they can potentially be recognized by other (non-Python) Arrow consumers such as PySpark. -For example, we could define a custom UUID type for 128-bit numbers which can -be represented as ``FixedSizeBinary`` type with 16 bytes:: - - class UuidType(pa.ExtensionType): - - def __init__(self): - super().__init__(pa.binary(16), "my_package.uuid") - - def __arrow_ext_serialize__(self): - # Since we don't have a parameterized type, we don't need extra - # metadata to be deserialized - return b'' +PyArrow allows you to define extension types from Python by subclassing +:class:`ExtensionType` and giving the derived class its own extension name +and mechanism to (de)serialize any parameters. For example, we could define +a custom rational type for fractions which can be represented as a pair of +integers:: + + class RationalType(pa.ExtensionType): + + def __init__(self, data_type: pa.DataType): + if not pa.types.is_integer(data_type): + raise TypeError(f"data_type must be an integer type not {data_type}") + + super().__init__( + pa.struct( + [ + ("numer", data_type), + ("denom", data_type), + ], + ), + "my_package.rational", + ) + + def __arrow_ext_serialize__(self) -> bytes: + # No parameters are necessary + return b"" @classmethod def __arrow_ext_deserialize__(cls, storage_type, serialized): # Sanity checks, not required but illustrate the method signature. - assert storage_type == pa.binary(16) - assert serialized == b'' - # Return an instance of this subclass given the serialized - # metadata. - return UuidType() + assert pa.types.is_struct(storage_type) + assert pa.types.is_integer(storage_type[0].type) + assert storage_type[0].type == storage_type[1].type + assert serialized == b"" + + # return an instance of this subclass + return RationalType(storage_type[0].type) + The special methods ``__arrow_ext_serialize__`` and ``__arrow_ext_deserialize__`` -define the serialization of an extension type instance. For non-parametric -types such as the above, the serialization payload can be left empty. +define the serialization and deserialization of an extension type instance. This can now be used to create arrays and tables holding the extension type:: - >>> uuid_type = UuidType() - >>> uuid_type.extension_name - 'my_package.uuid' - >>> uuid_type.storage_type - FixedSizeBinaryType(fixed_size_binary[16]) - - >>> import uuid - >>> storage_array = pa.array([uuid.uuid4().bytes for _ in range(4)], pa.binary(16)) - >>> arr = pa.ExtensionArray.from_storage(uuid_type, storage_array) + >>> rational_type = RationalType(pa.int32()) + >>> rational_type.extension_name + 'my_package.rational' + >>> rational_type.storage_type + StructType(struct<numer: int32, denom: int32>) + + >>> storage_array = pa.array( + ... [ + ... {"numer": 10, "denom": 17}, + ... {"numer": 20, "denom": 13}, + ... ], + ... type=rational_type.storage_type, + ... ) + >>> arr = rational_type.wrap_array(storage_array) + >>> # or equivalently + >>> arr = pa.ExtensionArray.from_storage(rational_type, storage_array) >>> arr - <pyarrow.lib.ExtensionArray object at 0x7f75c2f300a0> - [ - A6861959108644B797664AEEE686B682, - 718747F48E5F4058A7261E2B6B228BE8, - 7FE201227D624D96A5CD8639DEF2A68B, - C6CA8C7F95744BFD9462A40B3F57A86C - ] + <pyarrow.lib.ExtensionArray object at 0x1067f5420> + -- is_valid: all not null + -- child 0 type: int32 + [ + 10, + 20 + ] + -- child 1 type: int32 + [ + 17, + 13 + ] This array can be included in RecordBatches, sent over IPC and received in another Python process. The receiving process must explicitly register the extension type for deserialization, otherwise it will fall back to the storage type:: - >>> pa.register_extension_type(UuidType()) + >>> pa.register_extension_type(RationalType(pa.int32())) For example, creating a RecordBatch and writing it to a stream using the IPC protocol:: @@ -197,19 +227,45 @@ and then reading it back yields the proper type:: >>> with pa.ipc.open_stream(buf) as reader: ... result = reader.read_all() - >>> result.column('ext').type - UuidType(FixedSizeBinaryType(fixed_size_binary[16])) + >>> result.column("ext").type + RationalType(StructType(struct<numer: int32, denom: int32>)) + +Further, note that while we registered the concrete type +``RationalType(pa.int32())``, the same extension name +(``"my_package.rational"``) is used by ``RationalType(integer_type)`` +for *all* Arrow integer types. As such, the above code also allows users to +(de)serialize these data types:: + + >>> big_rational_type = RationalType(pa.int64()) + >>> storage_array = pa.array( + ... [ + ... {"numer": 10, "denom": 17}, + ... {"numer": 20, "denom": 13}, + ... ], + ... type=big_rational_type.storage_type, + ... ) + >>> arr = big_rational_type.wrap_array(storage_array) + >>> batch = pa.RecordBatch.from_arrays([arr], ["ext"]) + >>> sink = pa.BufferOutputStream() + >>> with pa.RecordBatchStreamWriter(sink, batch.schema) as writer: + ... writer.write_batch(batch) + >>> buf = sink.getvalue() + >>> with pa.ipc.open_stream(buf) as reader: + ... result = reader.read_all() + >>> result.column("ext").type + RationalType(StructType(struct<numer: int64, denom: int64>)) The receiving application doesn't need to be Python but can still recognize -the extension type as a "my_package.uuid" type, if it has implemented its own +the extension type as a "my_package.rational" type if it has implemented its own extension type to receive it. If the type is not registered in the receiving application, it will fall back to the storage type. Parameterized extension type ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -The above example used a fixed storage type with no further metadata. But -more flexible, parameterized extension types are also possible. +The above example illustrated how to construct an extension type that requires +no additional metadata beyond its storage type. But Arrow also provides more +flexible, parameterized extension types. The example given here implements an extension type for the `pandas "period" data type <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#time-span-representation>`__, @@ -225,7 +281,7 @@ of the given frequency since 1970. # attributes need to be set first before calling # super init (as that calls serialize) self._freq = freq - super().__init__(pa.int64(), 'my_package.period') + super().__init__(pa.int64(), "my_package.period") @property def freq(self): @@ -240,7 +296,7 @@ of the given frequency since 1970. # metadata. serialized = serialized.decode() assert serialized.startswith("freq=") - freq = serialized.split('=')[1] + freq = serialized.split("=")[1] return PeriodType(freq) Here, we ensure to store all information in the serialized metadata that is @@ -274,7 +330,7 @@ the data as a 2-D Numpy array ``(N, 3)`` without any copy:: super().__init__(pa.list_(pa.float32(), 3), "my_package.Point3DType") def __arrow_ext_serialize__(self): - return b'' + return b"" @classmethod def __arrow_ext_deserialize__(cls, storage_type, serialized): @@ -313,6 +369,8 @@ This array can be sent over IPC, received in another Python process, and the cus extension array class will be preserved (as long as the receiving process registers the extension type using :func:`register_extension_type` before reading the IPC data). +.. _custom-scalar-conversion: + Custom scalar conversion ~~~~~~~~~~~~~~~~~~~~~~~~ @@ -335,7 +393,7 @@ For example, if we wanted the above example 3D point type to return a custom super().__init__(pa.list_(pa.float32(), 3), "my_package.Point3DType") def __arrow_ext_serialize__(self): - return b'' + return b"" @classmethod def __arrow_ext_deserialize__(cls, storage_type, serialized): @@ -354,6 +412,7 @@ Arrays built using this extension type now provide scalars that convert to our ` >>> arr.to_pylist() [Point3D(x=1.0, y=2.0, z=3.0), Point3D(x=4.0, y=5.0, z=6.0)] +.. _conversion-to-pandas: Conversion to pandas ~~~~~~~~~~~~~~~~~~~~ @@ -436,16 +495,16 @@ Extension arrays can be used as columns in ``pyarrow.Table`` or >>> data = [ ... pa.array([1, 2, 3]), - ... pa.array(['foo', 'bar', None]), + ... pa.array(["foo", "bar", None]), ... pa.array([True, None, True]), ... tensor_array, ... tensor_array_2 ... ] - >>> my_schema = pa.schema([('f0', pa.int8()), - ... ('f1', pa.string()), - ... ('f2', pa.bool_()), - ... ('tensors_int', tensor_type), - ... ('tensors_float', tensor_type_2)]) + >>> my_schema = pa.schema([("f0", pa.int8()), + ... ("f1", pa.string()), + ... ("f2", pa.bool_()), + ... ("tensors_int", tensor_type), + ... ("tensors_float", tensor_type_2)]) >>> table = pa.Table.from_arrays(data, schema=my_schema) >>> table pyarrow.Table @@ -541,7 +600,7 @@ or .. code-block:: python - >>> tensor_type = pa.fixed_shape_tensor(pa.bool_(), [2, 2, 3], dim_names=['C', 'H', 'W']) + >>> tensor_type = pa.fixed_shape_tensor(pa.bool_(), [2, 2, 3], dim_names=["C", "H", "W"]) for ``NCHW`` format where: diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index a46caff1f21a4..70f12e9796e80 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -1627,59 +1627,97 @@ cdef class ExtensionType(BaseExtensionType): Examples -------- - Define a UuidType extension type subclassing ExtensionType: + Define a RationalType extension type subclassing ExtensionType: >>> import pyarrow as pa - >>> class UuidType(pa.ExtensionType): - ... def __init__(self): - ... pa.ExtensionType.__init__(self, pa.binary(16), "my_package.uuid") - ... def __arrow_ext_serialize__(self): - ... # since we don't have a parameterized type, we don't need extra - ... # metadata to be deserialized - ... return b'' - ... @classmethod - ... def __arrow_ext_deserialize__(self, storage_type, serialized): - ... # return an instance of this subclass given the serialized - ... # metadata. - ... return UuidType() - ... + >>> class RationalType(pa.ExtensionType): + ... def __init__(self, data_type: pa.DataType): + ... if not pa.types.is_integer(data_type): + ... raise TypeError(f"data_type must be an integer type not {data_type}") + ... super().__init__( + ... pa.struct( + ... [ + ... ("numer", data_type), + ... ("denom", data_type), + ... ], + ... ), + ... # N.B. This name does _not_ reference `data_type` so deserialization + ... # will work for _any_ integer `data_type` after registration + ... "my_package.rational", + ... ) + ... def __arrow_ext_serialize__(self) -> bytes: + ... # No parameters are necessary + ... return b"" + ... @classmethod + ... def __arrow_ext_deserialize__(cls, storage_type, serialized): + ... # return an instance of this subclass + ... return RationalType(storage_type[0].type) Register the extension type: - >>> pa.register_extension_type(UuidType()) + >>> pa.register_extension_type(RationalType(pa.int64())) - Create an instance of UuidType extension type: + Create an instance of RationalType extension type: - >>> uuid_type = UuidType() + >>> rational_type = RationalType(pa.int32()) Inspect the extension type: - >>> uuid_type.extension_name - 'my_package.uuid' - >>> uuid_type.storage_type - FixedSizeBinaryType(fixed_size_binary[16]) + >>> rational_type.extension_name + 'my_package.rational' + >>> rational_type.storage_type + StructType(struct<numer: int32, denom: int32>) Wrap an array as an extension array: - >>> import uuid - >>> storage_array = pa.array([uuid.uuid4().bytes for _ in range(4)], pa.binary(16)) - >>> uuid_type.wrap_array(storage_array) + >>> storage_array = pa.array( + ... [ + ... {"numer": 10, "denom": 17}, + ... {"numer": 20, "denom": 13}, + ... ], + ... type=rational_type.storage_type + ... ) + >>> rational_array = rational_type.wrap_array(storage_array) + >>> rational_array <pyarrow.lib.ExtensionArray object at ...> - [ - ... - ] + -- is_valid: all not null + -- child 0 type: int32 + [ + 10, + 20 + ] + -- child 1 type: int32 + [ + 17, + 13 + ] Or do the same with creating an ExtensionArray: - >>> pa.ExtensionArray.from_storage(uuid_type, storage_array) + >>> rational_array = pa.ExtensionArray.from_storage(rational_type, storage_array) + >>> rational_array <pyarrow.lib.ExtensionArray object at ...> - [ - ... - ] + -- is_valid: all not null + -- child 0 type: int32 + [ + 10, + 20 + ] + -- child 1 type: int32 + [ + 17, + 13 + ] Unregister the extension type: - >>> pa.unregister_extension_type("my_package.uuid") + >>> pa.unregister_extension_type("my_package.rational") + + Note that even though we registered the concrete type + ``RationalType(pa.int64())``, PyArrow will be able to deserialize + ``RationalType(integer_type)`` for any ``integer_type``, as the deserializer + will reference the name ``my_package.rational`` and the ``@classmethod`` + ``__arrow_ext_deserialize__``. """ def __cinit__(self): @@ -1739,7 +1777,7 @@ cdef class ExtensionType(BaseExtensionType): return NotImplementedError @classmethod - def __arrow_ext_deserialize__(self, storage_type, serialized): + def __arrow_ext_deserialize__(cls, storage_type, serialized): """ Return an extension type instance from the storage type and serialized metadata. @@ -2067,30 +2105,39 @@ def register_extension_type(ext_type): Examples -------- - Define a UuidType extension type subclassing ExtensionType: + Define a RationalType extension type subclassing ExtensionType: >>> import pyarrow as pa - >>> class UuidType(pa.ExtensionType): - ... def __init__(self): - ... pa.ExtensionType.__init__(self, pa.binary(16), "my_package.uuid") - ... def __arrow_ext_serialize__(self): - ... # since we don't have a parameterized type, we don't need extra - ... # metadata to be deserialized - ... return b'' - ... @classmethod - ... def __arrow_ext_deserialize__(self, storage_type, serialized): - ... # return an instance of this subclass given the serialized - ... # metadata. - ... return UuidType() - ... + >>> class RationalType(pa.ExtensionType): + ... def __init__(self, data_type: pa.DataType): + ... if not pa.types.is_integer(data_type): + ... raise TypeError(f"data_type must be an integer type not {data_type}") + ... super().__init__( + ... pa.struct( + ... [ + ... ("numer", data_type), + ... ("denom", data_type), + ... ], + ... ), + ... # N.B. This name does _not_ reference `data_type` so deserialization + ... # will work for _any_ integer `data_type` after registration + ... "my_package.rational", + ... ) + ... def __arrow_ext_serialize__(self) -> bytes: + ... # No parameters are necessary + ... return b"" + ... @classmethod + ... def __arrow_ext_deserialize__(cls, storage_type, serialized): + ... # return an instance of this subclass + ... return RationalType(storage_type[0].type) Register the extension type: - >>> pa.register_extension_type(UuidType()) + >>> pa.register_extension_type(RationalType(pa.int64())) Unregister the extension type: - >>> pa.unregister_extension_type("my_package.uuid") + >>> pa.unregister_extension_type("my_package.rational") """ cdef: DataType _type = ensure_type(ext_type, allow_none=False) @@ -2117,30 +2164,39 @@ def unregister_extension_type(type_name): Examples -------- - Define a UuidType extension type subclassing ExtensionType: + Define a RationalType extension type subclassing ExtensionType: >>> import pyarrow as pa - >>> class UuidType(pa.ExtensionType): - ... def __init__(self): - ... pa.ExtensionType.__init__(self, pa.binary(16), "my_package.uuid") - ... def __arrow_ext_serialize__(self): - ... # since we don't have a parameterized type, we don't need extra - ... # metadata to be deserialized - ... return b'' - ... @classmethod - ... def __arrow_ext_deserialize__(self, storage_type, serialized): - ... # return an instance of this subclass given the serialized - ... # metadata. - ... return UuidType() - ... + >>> class RationalType(pa.ExtensionType): + ... def __init__(self, data_type: pa.DataType): + ... if not pa.types.is_integer(data_type): + ... raise TypeError(f"data_type must be an integer type not {data_type}") + ... super().__init__( + ... pa.struct( + ... [ + ... ("numer", data_type), + ... ("denom", data_type), + ... ], + ... ), + ... # N.B. This name does _not_ reference `data_type` so deserialization + ... # will work for _any_ integer `data_type` after registration + ... "my_package.rational", + ... ) + ... def __arrow_ext_serialize__(self) -> bytes: + ... # No parameters are necessary + ... return b"" + ... @classmethod + ... def __arrow_ext_deserialize__(cls, storage_type, serialized): + ... # return an instance of this subclass + ... return RationalType(storage_type[0].type) Register the extension type: - >>> pa.register_extension_type(UuidType()) + >>> pa.register_extension_type(RationalType(pa.int64())) Unregister the extension type: - >>> pa.unregister_extension_type("my_package.uuid") + >>> pa.unregister_extension_type("my_package.rational") """ cdef: c_string c_type_name = tobytes(type_name) @@ -4318,8 +4374,12 @@ def float16(): 15872, 32256 ] - >>> a.to_pylist() - [np.float16(1.5), np.float16(nan)] + + Note that unlike other float types, if you convert this array + to a python list, the types of its elements will be ``np.float16`` + + >>> [type(val) for val in a.to_pylist()] + [<class 'numpy.float16'>, <class 'numpy.float16'>] """ return primitive_type(_Type_HALF_FLOAT)