diff --git a/docs/integrations/delta-lake-dagster.md b/docs/integrations/delta-lake-dagster.md
index 0fe413c7a0..3aa5a505a6 100644
--- a/docs/integrations/delta-lake-dagster.md
+++ b/docs/integrations/delta-lake-dagster.md
@@ -212,7 +212,7 @@ def iris_dataset() -> pd.DataFrame:
 ```
 
 ## Using Delta Lake and Dagster with Polars
-To read and write data to Delta Lake using pandas, use the `DeltaLakePolarsIOManager()`.
+To read and write data to Delta Lake using polars, use the `DeltaLakePolarsIOManager()`.
 
 You will need to install it using:
 
@@ -223,7 +223,7 @@ pip install dagster-deltalake-polars
 In your `Definitions` object, change the `io_manager` to `DeltaLakePolarsIOManager()`:
 
 ```
-from dagster_polars import DeltaLakePolarsIOManager
+from dagster_deltalake_polars import DeltaLakePolarsIOManager
 
 defs = Definitions(
     assets=all_assets,
diff --git a/python/deltalake/schema.py b/python/deltalake/schema.py
index 8bc5c7e155..2008c43de0 100644
--- a/python/deltalake/schema.py
+++ b/python/deltalake/schema.py
@@ -95,6 +95,8 @@ def dtype_to_delta_dtype(dtype: pa.DataType) -> pa.DataType:
             return pa.timestamp("us", "UTC")
         elif type(dtype) is pa.FixedSizeBinaryType:
             return pa.binary()
+        elif isinstance(dtype, pa.ExtensionType):
+            return dtype.storage_type
         try:
             return dtype_map[dtype]
         except KeyError:
diff --git a/python/stubs/pyarrow/__init__.pyi b/python/stubs/pyarrow/__init__.pyi
index e500d11191..31943db8b8 100644
--- a/python/stubs/pyarrow/__init__.pyi
+++ b/python/stubs/pyarrow/__init__.pyi
@@ -14,6 +14,7 @@ FixedSizeListType: Any
 LargeListViewType: Any
 ListViewType: Any
 FixedSizeBinaryType: Any
+ExtensionType: Any
 schema: Any
 map_: Any
 list_: Any
diff --git a/python/tests/test_schema.py b/python/tests/test_schema.py
index a3ad6b62e1..34c833eb17 100644
--- a/python/tests/test_schema.py
+++ b/python/tests/test_schema.py
@@ -223,6 +223,26 @@ def test_delta_schema():
     assert schema_without_metadata == Schema.from_pyarrow(pa_schema)
 
 
+def _generate_test_type():
+    class UuidType(pa.ExtensionType):
+        def __init__(self):
+            pa.ExtensionType.__init__(self, pa.binary(16), "my_package.uuid")
+
+        def __arrow_ext_serialize__(self):
+            # since we don't have a parameterized type, we don't need extra
+            # metadata to be deserialized
+            return b""
+
+        @classmethod
+        def __arrow_ext_deserialize__(self, storage_type, serialized):
+            # return an instance of this subclass given the serialized
+            # metadata.
+            return UuidType()
+
+    pa.register_extension_type(UuidType())
+    return UuidType()
+
+
 def _generate_test_tuples():
     test_tuples = [
         (
@@ -515,6 +535,11 @@ def _generate_test_tuples():
             ),
             ArrowSchemaConversionMode.NORMAL,
         ),
+        (
+            pa.schema([("uuid", _generate_test_type())]),
+            pa.schema([("uuid", pa.binary(16))]),
+            ArrowSchemaConversionMode.NORMAL,
+        ),
     ]
     return test_tuples
 
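
The schema.py change above makes `dtype_to_delta_dtype` unwrap Arrow extension types to their underlying storage type before the Delta type mapping is applied, which the new test exercises with a UUID extension type backed by `pa.binary(16)`. Below is a minimal standalone sketch of that behaviour using only public pyarrow APIs; `unwrap_extension_type` is a hypothetical helper written for illustration and is not part of the deltalake package.

```python
import pyarrow as pa


class UuidType(pa.ExtensionType):
    """A 16-byte fixed-size binary extension type, mirroring the new test."""

    def __init__(self):
        super().__init__(pa.binary(16), "my_package.uuid")

    def __arrow_ext_serialize__(self):
        # No parameters to serialize for this type.
        return b""

    @classmethod
    def __arrow_ext_deserialize__(cls, storage_type, serialized):
        return UuidType()


def unwrap_extension_type(dtype: pa.DataType) -> pa.DataType:
    # Hypothetical helper mirroring the new branch in dtype_to_delta_dtype:
    # extension types are replaced by their storage type so the usual
    # Delta type mapping can handle them.
    if isinstance(dtype, pa.ExtensionType):
        return dtype.storage_type
    return dtype


# A UUID-typed field becomes plain fixed-size binary after unwrapping.
assert unwrap_extension_type(UuidType()) == pa.binary(16)
```

Note that the patch returns the storage type as-is rather than re-entering the mapping, which is why the expected schema in the test keeps `pa.binary(16)` instead of widening it to variable-width binary.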