From df8e0f9a84f28a5f7c737215a8379669f507a022 Mon Sep 17 00:00:00 2001 From: Konstantin Malanchev Date: Thu, 19 Dec 2024 13:52:09 -0500 Subject: [PATCH 1/2] Fix from_lists for multiple-chunk series --- src/nested_pandas/series/packer.py | 32 ++++++++++++++++++++--- tests/nested_pandas/series/test_packer.py | 18 +++++++++++++ 2 files changed, 46 insertions(+), 4 deletions(-) diff --git a/src/nested_pandas/series/packer.py b/src/nested_pandas/series/packer.py index 1ea8abb..88f0506 100644 --- a/src/nested_pandas/series/packer.py +++ b/src/nested_pandas/series/packer.py @@ -196,10 +196,34 @@ def pack_lists(df: pd.DataFrame, name: str | None = None, *, validate: bool = Tr nested_pandas.series.dtype.NestedDtype : The dtype of the output series. nested_pandas.series.packer.pack_flat : Pack a "flat" dataframe with repeated indexes. """ - struct_array = pa.StructArray.from_arrays( - [df[column] for column in df.columns], - names=df.columns, - ) + # When series is converted to pa.array it may be both Array and ChunkedArray + # We convert it to chunked for the sake of consistency + pa_arrays_maybe_chunked = {column: pa.array(df[column]) for column in df.columns} + pa_chunked_arrays = { + column: arr if isinstance(arr, pa.ChunkedArray) else pa.chunked_array([arr]) + for column, arr in pa_arrays_maybe_chunked.items() + } + + # If all chunk arrays have the same chunk lengths, we can build a chunked struct array with no + # data copying. + chunk_lengths = np.array([[len(chunk) for chunk in arr.chunks] for arr in pa_chunked_arrays.values()]) + if np.all(chunk_lengths == chunk_lengths[0]): + n_chunks = chunk_lengths.shape[1] + chunks = [] + for i in range(n_chunks): + chunks.append( + pa.StructArray.from_arrays( + [arr.chunk(i) for arr in pa_chunked_arrays.values()], + names=pa_chunked_arrays.keys(), + ) + ) + struct_array = pa.chunked_array(chunks) + else: # "flatten" the chunked arrays + struct_array = pa.StructArray.from_arrays( + [arr.combine_chunks() for arr in pa_chunked_arrays.values()], + names=pa_chunked_arrays.keys(), + ) + ext_array = NestedExtensionArray(struct_array, validate=validate) return pd.Series( ext_array, diff --git a/tests/nested_pandas/series/test_packer.py b/tests/nested_pandas/series/test_packer.py index d06f8d2..d8f14db 100644 --- a/tests/nested_pandas/series/test_packer.py +++ b/tests/nested_pandas/series/test_packer.py @@ -286,6 +286,24 @@ def test_pack_lists(): assert_series_equal(series.nest.get_list_series(field_name), packed_df[field_name]) +def test_pack_lists_with_chunked_arrays(): + """Issue https://github.com/lincc-frameworks/nested-pandas/issues/180""" + chunked_a = pd.Series( + pa.chunked_array([pa.array([[1, 2, 3], [4, 5]])] * 3), + dtype=pd.ArrowDtype(pa.list_(pa.int64())), + name="a", + ) + chunked_b = pd.Series( + pa.chunked_array([pa.array([[0.0, 1.0, 2.0], [3.0, 4.0]])] * 3), + dtype=pd.ArrowDtype(pa.list_(pa.float64())), + name="b", + ) + list_df = pd.DataFrame({"a": chunked_a, "b": chunked_b}, index=[0, 1, 2, 3, 4, 5]) + series = packer.pack_lists(list_df) + assert_series_equal(series.nest.get_list_series("a"), chunked_a) + assert_series_equal(series.nest.get_list_series("b"), chunked_b) + + def test_pack_seq_with_dfs_and_index(): """Test pack_seq().""" dfs = [ From 5d39c52e72d330c715dbbb7d80df965ee704d835 Mon Sep 17 00:00:00 2001 From: Konstantin Malanchev Date: Thu, 19 Dec 2024 14:13:13 -0500 Subject: [PATCH 2/2] Fix uneven chunks case --- src/nested_pandas/series/packer.py | 8 ++++---- tests/nested_pandas/series/test_packer.py | 18 ++++++++++++++++++ 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/src/nested_pandas/series/packer.py b/src/nested_pandas/series/packer.py index 88f0506..dedaaca 100644 --- a/src/nested_pandas/series/packer.py +++ b/src/nested_pandas/series/packer.py @@ -206,11 +206,11 @@ def pack_lists(df: pd.DataFrame, name: str | None = None, *, validate: bool = Tr # If all chunk arrays have the same chunk lengths, we can build a chunked struct array with no # data copying. - chunk_lengths = np.array([[len(chunk) for chunk in arr.chunks] for arr in pa_chunked_arrays.values()]) - if np.all(chunk_lengths == chunk_lengths[0]): - n_chunks = chunk_lengths.shape[1] + chunk_lengths = pa.array([[len(chunk) for chunk in arr.chunks] for arr in pa_chunked_arrays.values()]) + if all(chunk_length == chunk_lengths[0] for chunk_length in chunk_lengths): chunks = [] - for i in range(n_chunks): + numpy_chunks = next(iter(pa_chunked_arrays.values())).num_chunks + for i in range(numpy_chunks): chunks.append( pa.StructArray.from_arrays( [arr.chunk(i) for arr in pa_chunked_arrays.values()], diff --git a/tests/nested_pandas/series/test_packer.py b/tests/nested_pandas/series/test_packer.py index d8f14db..b022a17 100644 --- a/tests/nested_pandas/series/test_packer.py +++ b/tests/nested_pandas/series/test_packer.py @@ -304,6 +304,24 @@ def test_pack_lists_with_chunked_arrays(): assert_series_equal(series.nest.get_list_series("b"), chunked_b) +def test_pack_lists_with_uneven_chunked_arrays(): + """Issue https://github.com/lincc-frameworks/nested-pandas/issues/180""" + chunked_a = pd.Series( + pa.chunked_array([pa.array([[1, 2, 3], [4, 5]])] * 3), + dtype=pd.ArrowDtype(pa.list_(pa.int64())), + name="a", + ) + chunked_b = pd.Series( + pa.array([[0.0, 1.0, 2.0], [3.0, 4.0]] * 3), + dtype=pd.ArrowDtype(pa.list_(pa.float64())), + name="b", + ) + list_df = pd.DataFrame({"a": chunked_a, "b": chunked_b}, index=[0, 1, 2, 3, 4, 5]) + series = packer.pack_lists(list_df) + assert_series_equal(series.nest.get_list_series("a"), chunked_a) + assert_series_equal(series.nest.get_list_series("b"), chunked_b) + + def test_pack_seq_with_dfs_and_index(): """Test pack_seq().""" dfs = [