diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml index f5b5ab53ebab5..93450df155c9a 100644 --- a/dev/tasks/tasks.yml +++ b/dev/tasks/tasks.yml @@ -1426,6 +1426,12 @@ tasks: # ensure we have at least one build with parquet encryption disabled PARQUET_REQUIRE_ENCRYPTION: "OFF" {% endif %} + {% if pandas_version == "nightly" %} + # TODO: remove once this is enabled by default in pandas >= 3. + # Opts in to pandas' future string dtype inference (future.infer_string). + # See: https://github.com/pandas-dev/pandas/pull/58459 + PANDAS_FUTURE_INFER_STRING: "1" + {% endif %} {% if not cache_leaf %} # use the latest pandas release, so prevent reusing any cached layers flags: --no-leaf-cache diff --git a/docker-compose.yml b/docker-compose.yml index 5cb96e62c1163..43dd3511fcf18 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1375,6 +1375,7 @@ services: PYTEST_ARGS: # inherit HYPOTHESIS_PROFILE: # inherit PYARROW_TEST_HYPOTHESIS: # inherit + PANDAS_FUTURE_INFER_STRING: # inherit volumes: *conda-volumes command: *python-conda-command diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index f86caf1433d4e..2ef42051d9ad2 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -117,6 +117,8 @@ def _handle_arrow_array_protocol(obj, type, mask, size): "return a pyarrow Array or ChunkedArray.") if isinstance(res, ChunkedArray) and res.num_chunks==1: res = res.chunk(0) + if type is not None and res.type != type: + res = res.cast(type) return res diff --git a/python/pyarrow/pandas-shim.pxi b/python/pyarrow/pandas-shim.pxi index 74f0d981b52f4..5be6f03f86ed6 100644 --- a/python/pyarrow/pandas-shim.pxi +++ b/python/pyarrow/pandas-shim.pxi @@ -38,7 +38,7 @@ cdef class _PandasAPIShim(object): object _array_like_types, _is_extension_array_dtype, _lock bint has_sparse bint _pd024 - bint _is_v1, _is_ge_v21, _is_ge_v3 + bint _is_v1, _is_ge_v21, _is_ge_v3, _is_ge_v3_strict def __init__(self): self._lock = Lock() @@ -80,6 +80,7 @@ cdef class _PandasAPIShim(object): self._is_v1 
= self._loose_version < Version('2.0.0')
         self._is_ge_v21 = self._loose_version >= Version('2.1.0')
         self._is_ge_v3 = self._loose_version >= Version('3.0.0.dev0')
+        self._is_ge_v3_strict = self._loose_version >= Version('3.0.0')
 
         self._compat_module = pdcompat
         self._data_frame = pd.DataFrame
@@ -174,6 +175,35 @@ cdef class _PandasAPIShim(object):
         self._check_import()
         return self._is_ge_v3
 
+    def is_ge_v3_strict(self):
+        """
+        Return True if the pandas version is at least the 3.0.0 release.
+
+        Compared against Version('3.0.0'), whereas is_ge_v3() compares
+        against Version('3.0.0.dev0').
+        """
+        self._check_import()
+        return self._is_ge_v3_strict
+
+    def uses_string_dtype(self):
+        """
+        Return True if pandas is expected to use its string dtype by default.
+
+        True when is_ge_v3_strict() holds, or when the pandas option
+        ``future.infer_string`` is set to a truthy value; False otherwise.
+        """
+        if self.is_ge_v3_strict():
+            return True
+        try:
+            if self.pd.options.future.infer_string:
+                return True
+        except Exception:
+            # Best effort: presumably the option does not exist in older
+            # pandas. Catch Exception rather than using a bare except so
+            # that KeyboardInterrupt/SystemExit are not swallowed.
+            pass
+        return False
+
     @property
     def categorical_type(self):
         self._check_import()
diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py
index d0582f825b529..e9655914ad767 100644
--- a/python/pyarrow/pandas_compat.py
+++ b/python/pyarrow/pandas_compat.py
@@ -174,7 +174,11 @@ def get_column_metadata(column, name, arrow_type, field_name):
         }
         string_dtype = 'object'
 
-    if name is not None and not isinstance(name, str):
+    if (
+        name is not None
+        and not (isinstance(name, float) and np.isnan(name))
+        and not isinstance(name, str)
+    ):
         raise TypeError(
             'Column name must be a string. 
Got column {} of type {}'.format( name, type(name).__name__ @@ -340,8 +344,8 @@ def _column_name_to_strings(name): return str(tuple(map(_column_name_to_strings, name))) elif isinstance(name, Sequence): raise TypeError("Unsupported type for MultiIndex level") - elif name is None: - return None + elif name is None or (isinstance(name, float) and np.isnan(name)): + return name return str(name) @@ -790,10 +794,12 @@ def table_to_dataframe( table, index = _reconstruct_index(table, index_descriptors, all_columns, types_mapper) ext_columns_dtypes = _get_extension_dtypes( - table, all_columns, types_mapper) + table, all_columns, types_mapper, options, categories) else: index = _pandas_api.pd.RangeIndex(table.num_rows) - ext_columns_dtypes = _get_extension_dtypes(table, [], types_mapper) + ext_columns_dtypes = _get_extension_dtypes( + table, [], types_mapper, options, categories + ) _check_data_column_metadata_consistency(all_columns) columns = _deserialize_column_index(table, all_columns, column_indexes) @@ -838,7 +844,7 @@ def table_to_dataframe( } -def _get_extension_dtypes(table, columns_metadata, types_mapper=None): +def _get_extension_dtypes(table, columns_metadata, types_mapper, options, categories): """ Based on the stored column pandas metadata and the extension types in the arrow schema, infer which columns should be converted to a @@ -851,6 +857,9 @@ def _get_extension_dtypes(table, columns_metadata, types_mapper=None): and then we can check if this dtype supports conversion from arrow. 
""" + strings_to_categorical = options["strings_to_categorical"] + categories = categories or [] + ext_columns = {} # older pandas version that does not yet support extension dtypes @@ -889,9 +898,32 @@ def _get_extension_dtypes(table, columns_metadata, types_mapper=None): # that are certainly numpy dtypes pandas_dtype = _pandas_api.pandas_dtype(dtype) if isinstance(pandas_dtype, _pandas_api.extension_dtype): + if isinstance(pandas_dtype, _pandas_api.pd.StringDtype): + # when the metadata indicate to use the string dtype, + # ignore this in case: + # - it is specified to convert strings / this column to categorical + # - the column itself is dictionary encoded and would otherwise be + # converted to categorical + if strings_to_categorical or name in categories: + continue + try: + if pa.types.is_dictionary(table.schema.field(name).type): + continue + except KeyError: + pass if hasattr(pandas_dtype, "__from_arrow__"): ext_columns[name] = pandas_dtype + # for pandas 3.0+, use pandas' new default string dtype + if _pandas_api.uses_string_dtype() and not strings_to_categorical: + for field in table.schema: + if field.name not in ext_columns and ( + pa.types.is_string(field.type) + or pa.types.is_large_string(field.type) + or pa.types.is_string_view(field.type) + ) and field.name not in categories: + ext_columns[field.name] = _pandas_api.pd.StringDtype(na_value=np.nan) + return ext_columns @@ -1049,9 +1081,9 @@ def get_pandas_logical_type_map(): 'date': 'datetime64[D]', 'datetime': 'datetime64[ns]', 'datetimetz': 'datetime64[ns]', - 'unicode': np.str_, + 'unicode': 'str', 'bytes': np.bytes_, - 'string': np.str_, + 'string': 'str', 'integer': np.int64, 'floating': np.float64, 'decimal': np.object_, @@ -1142,6 +1174,20 @@ def _reconstruct_columns_from_metadata(columns, column_indexes): # GH-41503: if the column index was decimal, restore to decimal elif pandas_dtype == "decimal": level = _pandas_api.pd.Index([decimal.Decimal(i) for i in level]) + elif ( + level.dtype == 
"str" and numpy_dtype == "object" + and ("mixed" in pandas_dtype or pandas_dtype in ["unicode", "string"]) + ): + # the metadata indicate that the original dataframe used object dtype, + # but ignore this and keep string dtype if: + # - the original columns used mixed types -> we don't attempt to faithfully + # roundtrip in this case, but keep the column names as strings + # - the original columns were inferred to be strings but stored in object + # dtype -> we don't restore the object dtype because all metadata + # generated using pandas < 3 will have this case by default, and + # for pandas >= 3 we want to use the default string dtype for .columns + new_levels.append(level) + continue elif level.dtype != dtype: level = level.astype(dtype) # ARROW-9096: if original DataFrame was upcast we keep that diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py index e6fcd6149ee04..6f28205a18e13 100644 --- a/python/pyarrow/tests/test_compute.py +++ b/python/pyarrow/tests/test_compute.py @@ -1020,7 +1020,7 @@ def test_replace_slice(): offsets = range(-3, 4) arr = pa.array([None, '', 'a', 'ab', 'abc', 'abcd', 'abcde']) - series = arr.to_pandas() + series = arr.to_pandas().astype(object).replace({np.nan: None}) for start in offsets: for stop in offsets: expected = series.str.slice_replace(start, stop, 'XX') @@ -1031,7 +1031,7 @@ def test_replace_slice(): assert pc.binary_replace_slice(arr, start, stop, 'XX') == actual arr = pa.array([None, '', 'π', 'πb', 'πbθ', 'πbθd', 'πbθde']) - series = arr.to_pandas() + series = arr.to_pandas().astype(object).replace({np.nan: None}) for start in offsets: for stop in offsets: expected = series.str.slice_replace(start, stop, 'XX') @@ -2132,7 +2132,8 @@ def test_strftime(): for fmt in formats: options = pc.StrftimeOptions(fmt) result = pc.strftime(tsa, options=options) - expected = pa.array(ts.strftime(fmt)) + # cast to the same type as result to ignore string vs large_string + expected = 
pa.array(ts.strftime(fmt)).cast(result.type) assert result.equals(expected) fmt = "%Y-%m-%dT%H:%M:%S" @@ -2140,34 +2141,34 @@ def test_strftime(): # Default format tsa = pa.array(ts, type=pa.timestamp("s", timezone)) result = pc.strftime(tsa, options=pc.StrftimeOptions()) - expected = pa.array(ts.strftime(fmt)) + expected = pa.array(ts.strftime(fmt)).cast(result.type) assert result.equals(expected) # Default format plus timezone tsa = pa.array(ts, type=pa.timestamp("s", timezone)) result = pc.strftime(tsa, options=pc.StrftimeOptions(fmt + "%Z")) - expected = pa.array(ts.strftime(fmt + "%Z")) + expected = pa.array(ts.strftime(fmt + "%Z")).cast(result.type) assert result.equals(expected) # Pandas %S is equivalent to %S in arrow for unit="s" tsa = pa.array(ts, type=pa.timestamp("s", timezone)) options = pc.StrftimeOptions("%S") result = pc.strftime(tsa, options=options) - expected = pa.array(ts.strftime("%S")) + expected = pa.array(ts.strftime("%S")).cast(result.type) assert result.equals(expected) # Pandas %S.%f is equivalent to %S in arrow for unit="us" tsa = pa.array(ts, type=pa.timestamp("us", timezone)) options = pc.StrftimeOptions("%S") result = pc.strftime(tsa, options=options) - expected = pa.array(ts.strftime("%S.%f")) + expected = pa.array(ts.strftime("%S.%f")).cast(result.type) assert result.equals(expected) # Test setting locale tsa = pa.array(ts, type=pa.timestamp("s", timezone)) options = pc.StrftimeOptions(fmt, locale="C") result = pc.strftime(tsa, options=options) - expected = pa.array(ts.strftime(fmt)) + expected = pa.array(ts.strftime(fmt)).cast(result.type) assert result.equals(expected) # Test timestamps without timezone @@ -2175,7 +2176,7 @@ def test_strftime(): ts = pd.to_datetime(times) tsa = pa.array(ts, type=pa.timestamp("s")) result = pc.strftime(tsa, options=pc.StrftimeOptions(fmt)) - expected = pa.array(ts.strftime(fmt)) + expected = pa.array(ts.strftime(fmt)).cast(result.type) # Positional format assert pc.strftime(tsa, fmt) == result diff 
--git a/python/pyarrow/tests/test_feather.py b/python/pyarrow/tests/test_feather.py index 18c8cd5b654e6..249fb621279a6 100644 --- a/python/pyarrow/tests/test_feather.py +++ b/python/pyarrow/tests/test_feather.py @@ -426,7 +426,11 @@ def test_empty_strings(version): @pytest.mark.pandas def test_all_none(version): df = pd.DataFrame({'all_none': [None] * 10}) - _check_pandas_roundtrip(df, version=version) + if version == 1 and pa.pandas_compat._pandas_api.uses_string_dtype(): + expected = df.astype("str") + else: + expected = df + _check_pandas_roundtrip(df, version=version, expected=expected) @pytest.mark.pandas diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py index d5c936df072ae..f356874c576ce 100644 --- a/python/pyarrow/tests/test_pandas.py +++ b/python/pyarrow/tests/test_pandas.py @@ -349,6 +349,17 @@ def test_integer_index_column(self): df = pd.DataFrame([(1, 'a'), (2, 'b'), (3, 'c')]) _check_pandas_roundtrip(df, preserve_index=True) + def test_float_column_index_with_missing(self): + df = pd.DataFrame([(1, 'a'), (2, 'b'), (3, 'c')], columns=[1.5, np.nan]) + _check_pandas_roundtrip(df, preserve_index=True) + + @pytest.mark.filterwarnings( + "ignore:The DataFrame has column names of mixed type:UserWarning" + ) + def test_string_column_index_with_missing(self): + df = pd.DataFrame([(1, 'a'), (2, 'b'), (3, 'c')], columns=["A", None]) + _check_pandas_roundtrip(df, preserve_index=True) + def test_index_metadata_field_name(self): # test None case, and strangely named non-index columns df = pd.DataFrame( @@ -359,8 +370,11 @@ def test_index_metadata_field_name(self): ), columns=['a', None, '__index_level_0__'], ) - with pytest.warns(UserWarning): + if _pandas_api.uses_string_dtype(): t = pa.Table.from_pandas(df, preserve_index=True) + else: + with pytest.warns(UserWarning): + t = pa.Table.from_pandas(df, preserve_index=True) js = t.schema.pandas_metadata col1, col2, col3, idx0, foo = js['columns'] @@ -368,8 +382,12 @@ def 
test_index_metadata_field_name(self): assert col1['name'] == 'a' assert col1['name'] == col1['field_name'] - assert col2['name'] is None - assert col2['field_name'] == 'None' + if _pandas_api.uses_string_dtype(): + assert np.isnan(col2['name']) + assert col2['field_name'] == 'nan' + else: + assert col2['name'] is None + assert col2['field_name'] == 'None' assert col3['name'] == '__index_level_0__' assert col3['name'] == col3['field_name'] @@ -411,7 +429,9 @@ def test_string_column_index(self): column_indexes, = js['column_indexes'] assert column_indexes['name'] == 'stringz' assert column_indexes['name'] == column_indexes['field_name'] - assert column_indexes['numpy_type'] == 'object' + assert column_indexes['numpy_type'] == ( + 'str' if _pandas_api.uses_string_dtype() else 'object' + ) assert column_indexes['pandas_type'] == 'unicode' md = column_indexes['metadata'] @@ -1680,7 +1700,10 @@ def test_pandas_unicode(self): repeats = 1000 values = ['foo', None, 'bar', 'mañana', np.nan] df = pd.DataFrame({'strings': values * repeats}) - field = pa.field('strings', pa.string()) + field = pa.field( + 'strings', + pa.large_string() if _pandas_api.uses_string_dtype() else pa.string() + ) schema = pa.schema([field]) ex_values = ['foo', None, 'bar', 'mañana', None] expected = pd.DataFrame({'strings': ex_values * repeats}) @@ -3323,6 +3346,10 @@ def _assert_nunique(obj, expected): def test_to_pandas_deduplicate_strings_array_types(): + if _pandas_api.uses_string_dtype(): + pytest.skip( + "pandas uses string dtype and not object dtype, keyword has no effect" + ) nunique = 100 repeats = 10 values = _generate_dedup_example(nunique, repeats) @@ -3335,6 +3362,10 @@ def test_to_pandas_deduplicate_strings_array_types(): def test_to_pandas_deduplicate_strings_table_types(): + if _pandas_api.uses_string_dtype(): + pytest.skip( + "pandas uses string dtype and not object dtype, keyword has no effect" + ) nunique = 100 repeats = 10 values = _generate_dedup_example(nunique, repeats) @@ 
-3798,20 +3829,26 @@ def _check_to_pandas_memory_unchanged(obj, **kwargs): x = obj.to_pandas(**kwargs) # noqa # Memory allocation unchanged -- either zero copy or self-destructing - assert pa.total_allocated_bytes() == prior_allocation + if _pandas_api.uses_string_dtype(): + # for the string array of the columns Index + # -> increase the size to account for overallocation for small arrays + max_index_allocation = max(192, x.columns.nbytes * 2) + assert pa.total_allocated_bytes() <= (prior_allocation + max_index_allocation) + else: + assert pa.total_allocated_bytes() == prior_allocation def test_to_pandas_split_blocks(): # ARROW-3789 t = pa.table([ - pa.array([1, 2, 3, 4, 5], type='i1'), - pa.array([1, 2, 3, 4, 5], type='i4'), - pa.array([1, 2, 3, 4, 5], type='i8'), - pa.array([1, 2, 3, 4, 5], type='f4'), - pa.array([1, 2, 3, 4, 5], type='f8'), - pa.array([1, 2, 3, 4, 5], type='f8'), - pa.array([1, 2, 3, 4, 5], type='f8'), - pa.array([1, 2, 3, 4, 5], type='f8'), + pa.array([1, 2, 3, 4, 5]*100, type='i1'), + pa.array([1, 2, 3, 4, 5]*100, type='i4'), + pa.array([1, 2, 3, 4, 5]*100, type='i8'), + pa.array([1, 2, 3, 4, 5]*100, type='f4'), + pa.array([1, 2, 3, 4, 5]*100, type='f8'), + pa.array([1, 2, 3, 4, 5]*100, type='f8'), + pa.array([1, 2, 3, 4, 5]*100, type='f8'), + pa.array([1, 2, 3, 4, 5]*100, type='f8'), ], ['f{}'.format(i) for i in range(8)]) _check_blocks_created(t, 8) @@ -3856,7 +3893,12 @@ def test_table_uses_memory_pool(): prior_allocation = pa.total_allocated_bytes() x = t.to_pandas() - assert pa.total_allocated_bytes() == (prior_allocation + 3 * N * 8) + new_allocation = 3 * N * 8 + if _pandas_api.uses_string_dtype(): + # for the small columns Index + new_allocation += 128 + + assert pa.total_allocated_bytes() == (prior_allocation + new_allocation) # Check successful garbage collection x = None # noqa @@ -4134,7 +4176,10 @@ def test_dictionary_encoded_nested_to_pandas(): def test_dictionary_from_pandas(): cat = pd.Categorical(['a', 'b', 'a']) - 
expected_type = pa.dictionary(pa.int8(), pa.string()) + expected_type = pa.dictionary( + pa.int8(), + pa.large_string() if _pandas_api.uses_string_dtype() else pa.string() + ) result = pa.array(cat) assert result.to_pylist() == ['a', 'b', 'a']