Skip to content

Commit

Permalink
apacheGH-36412: [Python][CI] Fix extra deprecation warnings in the pa…
Browse files Browse the repository at this point in the history
…ndas nightly build (apache#39609)

Fixes the remaining deprecation warnings coming from the pandas development version by updating our test code to avoid the deprecated patterns.
* Closes: apache#36412

Lead-authored-by: AlenkaF <[email protected]>
Co-authored-by: Alenka Frim <[email protected]>
Co-authored-by: Joris Van den Bossche <[email protected]>
Signed-off-by: Joris Van den Bossche <[email protected]>
AlenkaF and jorisvandenbossche authored Jan 17, 2024

Verified

This commit was created on GitHub.com and signed with GitHub’s verified signature. The key has expired.
1 parent f55c0d7 commit 6eeee3b
Showing 5 changed files with 35 additions and 38 deletions.
15 changes: 2 additions & 13 deletions python/pyarrow/pandas_compat.py
Original file line number Diff line number Diff line change
@@ -967,20 +967,9 @@ def _extract_index_level(table, result_table, field_name,
# The serialized index column was removed by the user
return result_table, None, None

pd = _pandas_api.pd

col = table.column(i)
values = col.to_pandas(types_mapper=types_mapper).values

if hasattr(values, 'flags') and not values.flags.writeable:
# ARROW-1054: in pandas 0.19.2, factorize will reject
# non-writeable arrays when calling MultiIndex.from_arrays
values = values.copy()

if isinstance(col.type, pa.lib.TimestampType) and col.type.tz is not None:
index_level = make_tz_aware(pd.Series(values, copy=False), col.type.tz)
else:
index_level = pd.Series(values, dtype=values.dtype, copy=False)
index_level = col.to_pandas(types_mapper=types_mapper)
index_level.name = None
result_table = result_table.remove_column(
result_table.schema.get_field_index(field_name)
)
4 changes: 2 additions & 2 deletions python/pyarrow/tests/parquet/test_datetime.py
Original file line number Diff line number Diff line change
@@ -116,7 +116,7 @@ def test_coerce_timestamps(tempdir):
df_expected = df.copy()
for i, x in enumerate(df_expected['datetime64']):
if isinstance(x, np.ndarray):
df_expected['datetime64'][i] = x.astype('M8[us]')
df_expected.loc[i, 'datetime64'] = x.astype('M8[us]')

tm.assert_frame_equal(df_expected, df_read)

@@ -429,7 +429,7 @@ def test_noncoerced_nanoseconds_written_without_exception(tempdir):
# nanosecond timestamps by default
n = 9
df = pd.DataFrame({'x': range(n)},
index=pd.date_range('2017-01-01', freq='1n', periods=n))
index=pd.date_range('2017-01-01', freq='ns', periods=n))
tb = pa.Table.from_pandas(df)

filename = tempdir / 'written.parquet'
6 changes: 3 additions & 3 deletions python/pyarrow/tests/test_compute.py
Original file line number Diff line number Diff line change
@@ -2360,18 +2360,18 @@ def _check_temporal_rounding(ts, values, unit):
unit_shorthand = {
"nanosecond": "ns",
"microsecond": "us",
"millisecond": "L",
"millisecond": "ms",
"second": "s",
"minute": "min",
"hour": "H",
"hour": "h",
"day": "D"
}
greater_unit = {
"nanosecond": "us",
"microsecond": "ms",
"millisecond": "s",
"second": "min",
"minute": "H",
"minute": "h",
"hour": "d",
}
ta = pa.array(ts)
6 changes: 4 additions & 2 deletions python/pyarrow/tests/test_dataset.py
Original file line number Diff line number Diff line change
@@ -178,12 +178,14 @@ def multisourcefs(request):

# simply split the dataframe into four chunks to construct a data source
# from each chunk into its own directory
df_a, df_b, df_c, df_d = np.array_split(df, 4)
n = len(df)
df_a, df_b, df_c, df_d = [df.iloc[i:i+n//4] for i in range(0, n, n//4)]

# create a directory containing a flat sequence of parquet files without
# any partitioning involved
mockfs.create_dir('plain')
for i, chunk in enumerate(np.array_split(df_a, 10)):
n = len(df_a)
for i, chunk in enumerate([df_a.iloc[i:i+n//10] for i in range(0, n, n//10)]):
path = 'plain/chunk-{}.parquet'.format(i)
with mockfs.open_output_stream(path) as out:
pq.write_table(_table_from_pandas(chunk), out)
42 changes: 24 additions & 18 deletions python/pyarrow/tests/test_pandas.py
Original file line number Diff line number Diff line change
@@ -113,6 +113,10 @@ def _check_pandas_roundtrip(df, expected=None, use_threads=False,
if expected is None:
expected = df

for col in expected.columns:
if expected[col].dtype == 'object':
expected[col] = expected[col].replace({np.nan: None})

with warnings.catch_warnings():
warnings.filterwarnings(
"ignore", "elementwise comparison failed", DeprecationWarning)
@@ -152,6 +156,9 @@ def _check_array_roundtrip(values, expected=None, mask=None,
expected = pd.Series(values).copy()
expected[mask.copy()] = None

if expected.dtype == 'object':
expected = expected.replace({np.nan: None})

tm.assert_series_equal(pd.Series(result), expected, check_names=False)


@@ -478,7 +485,7 @@ def test_mixed_column_names(self):
preserve_index=True)

def test_binary_column_name(self):
if Version("2.0.0") <= Version(pd.__version__) < Version("2.3.0"):
if Version("2.0.0") <= Version(pd.__version__) < Version("3.0.0"):
# TODO: regression in pandas, hopefully fixed in next version
# https://issues.apache.org/jira/browse/ARROW-18394
# https://github.com/pandas-dev/pandas/issues/50127
@@ -3108,7 +3115,7 @@ def _fully_loaded_dataframe_example():

@pytest.mark.parametrize('columns', ([b'foo'], ['foo']))
def test_roundtrip_with_bytes_unicode(columns):
if Version("2.0.0") <= Version(pd.__version__) < Version("2.3.0"):
if Version("2.0.0") <= Version(pd.__version__) < Version("3.0.0"):
# TODO: regression in pandas, hopefully fixed in next version
# https://issues.apache.org/jira/browse/ARROW-18394
# https://github.com/pandas-dev/pandas/issues/50127
@@ -3491,7 +3498,7 @@ def test_table_from_pandas_schema_field_order_metadata():
# ensure that a different field order in specified schema doesn't
# mangle metadata
df = pd.DataFrame({
"datetime": pd.date_range("2020-01-01T00:00:00Z", freq="H", periods=2),
"datetime": pd.date_range("2020-01-01T00:00:00Z", freq="h", periods=2),
"float": np.random.randn(2)
})

@@ -4181,8 +4188,6 @@ def _Int64Dtype__from_arrow__(self, array):


def test_convert_to_extension_array(monkeypatch):
import pandas.core.internals as _int

# table converted from dataframe with extension types (so pandas_metadata
# has this information)
df = pd.DataFrame(
@@ -4193,16 +4198,15 @@ def test_convert_to_extension_array(monkeypatch):
# Int64Dtype is recognized -> convert to extension block by default
# for a proper roundtrip
result = table.to_pandas()
assert not isinstance(_get_mgr(result).blocks[0], _int.ExtensionBlock)
assert _get_mgr(result).blocks[0].values.dtype == np.dtype("int64")
assert isinstance(_get_mgr(result).blocks[1], _int.ExtensionBlock)
assert _get_mgr(result).blocks[1].values.dtype == pd.Int64Dtype()
tm.assert_frame_equal(result, df)

# test with missing values
df2 = pd.DataFrame({'a': pd.array([1, 2, None], dtype='Int64')})
table2 = pa.table(df2)
result = table2.to_pandas()
assert isinstance(_get_mgr(result).blocks[0], _int.ExtensionBlock)
assert _get_mgr(result).blocks[0].values.dtype == pd.Int64Dtype()
tm.assert_frame_equal(result, df2)

# monkeypatch pandas Int64Dtype to *not* have the protocol method
@@ -4215,7 +4219,7 @@ def test_convert_to_extension_array(monkeypatch):
# Int64Dtype has no __from_arrow__ -> use normal conversion
result = table.to_pandas()
assert len(_get_mgr(result).blocks) == 1
assert not isinstance(_get_mgr(result).blocks[0], _int.ExtensionBlock)
assert _get_mgr(result).blocks[0].values.dtype == np.dtype("int64")


class MyCustomIntegerType(pa.ExtensionType):
@@ -4233,21 +4237,19 @@ def to_pandas_dtype(self):

def test_conversion_extensiontype_to_extensionarray(monkeypatch):
# converting extension type to linked pandas ExtensionDtype/Array
import pandas.core.internals as _int

storage = pa.array([1, 2, 3, 4], pa.int64())
arr = pa.ExtensionArray.from_storage(MyCustomIntegerType(), storage)
table = pa.table({'a': arr})

# extension type points to Int64Dtype, which knows how to create a
# pandas ExtensionArray
result = arr.to_pandas()
assert isinstance(_get_mgr(result).blocks[0], _int.ExtensionBlock)
assert _get_mgr(result).blocks[0].values.dtype == pd.Int64Dtype()
expected = pd.Series([1, 2, 3, 4], dtype='Int64')
tm.assert_series_equal(result, expected)

result = table.to_pandas()
assert isinstance(_get_mgr(result).blocks[0], _int.ExtensionBlock)
assert _get_mgr(result).blocks[0].values.dtype == pd.Int64Dtype()
expected = pd.DataFrame({'a': pd.array([1, 2, 3, 4], dtype='Int64')})
tm.assert_frame_equal(result, expected)

@@ -4261,7 +4263,7 @@ def test_conversion_extensiontype_to_extensionarray(monkeypatch):
pd.core.arrays.integer.NumericDtype, "__from_arrow__")

result = arr.to_pandas()
assert not isinstance(_get_mgr(result).blocks[0], _int.ExtensionBlock)
assert _get_mgr(result).blocks[0].values.dtype == np.dtype("int64")
expected = pd.Series([1, 2, 3, 4])
tm.assert_series_equal(result, expected)

@@ -4312,10 +4314,14 @@ def test_array_to_pandas():
def test_roundtrip_empty_table_with_extension_dtype_index():
df = pd.DataFrame(index=pd.interval_range(start=0, end=3))
table = pa.table(df)
table.to_pandas().index == pd.Index([{'left': 0, 'right': 1},
{'left': 1, 'right': 2},
{'left': 2, 'right': 3}],
dtype='object')
if Version(pd.__version__) > Version("1.0"):
tm.assert_index_equal(table.to_pandas().index, df.index)
else:
tm.assert_index_equal(table.to_pandas().index,
pd.Index([{'left': 0, 'right': 1},
{'left': 1, 'right': 2},
{'left': 2, 'right': 3}],
dtype='object'))


@pytest.mark.parametrize("index", ["a", ["a", "b"]])

0 comments on commit 6eeee3b

Please sign in to comment.