GH-39914: [pyarrow] Reorder to_pandas extension dtype mapping (#44720)
### Rationale for this change

This is a long-standing [pandas ticket](pandas-dev/pandas#53011) with some fairly horrible workarounds: complex Arrow types do not serialise well to pandas because the pandas metadata string is not parseable. `types_mapper` effectively always had the highest priority anyway, since it overrode whatever had been set before it, so evaluating it first does not change which dtype wins.

### What changes are included in this PR?

Switching the logical ordering means we no longer need to call `_pandas_api.pandas_dtype(dtype)` when the pyarrow backend is used, which resolves the issue for complex `dtype`s such as `list` or `struct`. It will likely still fail if the numpy backend is used, but at least this gives a working solution rather than an inability to load the files at all.
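
For illustration, a minimal sketch of the path this enables (not part of the diff; it assumes pandas >= 1.5.2 for `pd.ArrowDtype`, and the column/field names are made up):

```python
import pandas as pd
import pyarrow as pa

# A DataFrame with a nested struct column held as a pandas ArrowDtype.
pa_type = pa.struct([("bar", pa.bool_()), ("baz", pa.float32())])
df = pd.DataFrame({"foo": [{"bar": True, "baz": 1.0}, {"bar": False, "baz": None}]})
df = df.astype({"foo": pd.ArrowDtype(pa_type)})

# Round-tripping through Arrow attaches pandas metadata whose dtype string for
# the struct column cannot be parsed by _pandas_api.pandas_dtype; with
# types_mapper consulted first, that parse is skipped for this column.
table = pa.Table.from_pandas(df)
result = table.to_pandas(types_mapper=pd.ArrowDtype)
assert isinstance(result["foo"].dtype, pd.ArrowDtype)
```

Before this change, the pandas-metadata lookup ran first and raised on the struct dtype string before `types_mapper` was ever consulted.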

### Are these changes tested?

Existing tests remain unchanged, and a new test covering the complex type has been added.

### Are there any user-facing changes?

**This PR contains a "Critical Fix".**
This makes `pd.read_parquet(..., dtype_backend="pyarrow")` work with complex data types whose metadata, added by pyarrow during `DataFrame.to_parquet`, cannot be parsed back and currently causes an exception (see the sketch below). This issue currently prevents pyarrow from being used as the default backend for pandas.
* GitHub Issue: #39914
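
As a hedged illustration of that user-facing effect (the file path is hypothetical, and `dtype_backend` requires a pandas version that supports it):

```python
import pandas as pd
import pyarrow as pa

pa_type = pa.struct([("bar", pa.bool_()), ("baz", pa.float32())])
df = pd.DataFrame({"foo": [{"bar": True, "baz": 1.0}]})
df = df.astype({"foo": pd.ArrowDtype(pa_type)})
df.to_parquet("complex.parquet")  # hypothetical path; written with the pyarrow engine

# Previously this raised while reconstructing the struct dtype from the pandas
# metadata; with the reordered mapping it loads with pyarrow-backed dtypes.
roundtripped = pd.read_parquet("complex.parquet", dtype_backend="pyarrow")
```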

Lead-authored-by: bretttully <[email protected]>
Co-authored-by: Joris Van den Bossche <[email protected]>
Co-authored-by: Brett Tully <[email protected]>
Signed-off-by: Joris Van den Bossche <[email protected]>
bretttully and jorisvandenbossche authored Nov 27, 2024
1 parent 26b08a3 commit 8548c22
Showing 2 changed files with 45 additions and 20 deletions.
python/pyarrow/pandas_compat.py (20 additions, 20 deletions)
```diff
@@ -848,6 +848,25 @@ def _get_extension_dtypes(table, columns_metadata, types_mapper=None):
     if _pandas_api.extension_dtype is None:
         return ext_columns
 
+    # use the specified mapping of built-in arrow types to pandas dtypes
+    if types_mapper:
+        for field in table.schema:
+            typ = field.type
+            pandas_dtype = types_mapper(typ)
+            if pandas_dtype is not None:
+                ext_columns[field.name] = pandas_dtype
+
+    # infer from extension type in the schema
+    for field in table.schema:
+        typ = field.type
+        if field.name not in ext_columns and isinstance(typ, pa.BaseExtensionType):
+            try:
+                pandas_dtype = typ.to_pandas_dtype()
+            except NotImplementedError:
+                pass
+            else:
+                ext_columns[field.name] = pandas_dtype
+
     # infer the extension columns from the pandas metadata
     for col_meta in columns_metadata:
         try:
@@ -856,33 +875,14 @@ def _get_extension_dtypes(table, columns_metadata, types_mapper=None):
             name = col_meta['name']
         dtype = col_meta['numpy_type']
 
-        if dtype not in _pandas_supported_numpy_types:
+        if name not in ext_columns and dtype not in _pandas_supported_numpy_types:
             # pandas_dtype is expensive, so avoid doing this for types
             # that are certainly numpy dtypes
             pandas_dtype = _pandas_api.pandas_dtype(dtype)
             if isinstance(pandas_dtype, _pandas_api.extension_dtype):
                 if hasattr(pandas_dtype, "__from_arrow__"):
                     ext_columns[name] = pandas_dtype
 
-    # infer from extension type in the schema
-    for field in table.schema:
-        typ = field.type
-        if isinstance(typ, pa.BaseExtensionType):
-            try:
-                pandas_dtype = typ.to_pandas_dtype()
-            except NotImplementedError:
-                pass
-            else:
-                ext_columns[field.name] = pandas_dtype
-
-    # use the specified mapping of built-in arrow types to pandas dtypes
-    if types_mapper:
-        for field in table.schema:
-            typ = field.type
-            pandas_dtype = types_mapper(typ)
-            if pandas_dtype is not None:
-                ext_columns[field.name] = pandas_dtype
-
     return ext_columns
 
 
```
python/pyarrow/tests/test_pandas.py (25 additions, 0 deletions)
```diff
@@ -4411,6 +4411,31 @@ def test_to_pandas_extension_dtypes_mapping():
     assert isinstance(result['a'].dtype, pd.PeriodDtype)
 
 
+def test_to_pandas_extension_dtypes_mapping_complex_type():
+    # https://github.com/apache/arrow/pull/44720
+    if Version(pd.__version__) < Version("1.5.2"):
+        pytest.skip("Test relies on pd.ArrowDtype")
+    pa_type = pa.struct(
+        [
+            pa.field("bar", pa.bool_(), nullable=False),
+            pa.field("baz", pa.float32(), nullable=True),
+        ],
+    )
+    pd_type = pd.ArrowDtype(pa_type)
+    schema = pa.schema([pa.field("foo", pa_type)])
+    df0 = pd.DataFrame(
+        [
+            {"foo": {"bar": True, "baz": np.float32(1)}},
+            {"foo": {"bar": True, "baz": None}},
+        ],
+    ).astype({"foo": pd_type})
+
+    # Round trip df0 into df1
+    table = pa.Table.from_pandas(df0, schema=schema)
+    df1 = table.to_pandas(types_mapper=pd.ArrowDtype)
+    pd.testing.assert_frame_equal(df0, df1)
+
+
 def test_array_to_pandas():
     if Version(pd.__version__) < Version("1.1"):
         pytest.skip("ExtensionDtype to_pandas method missing")
```
