Commit

Merge pull request #181 from lincc-frameworks/fix-from-lists
Fix pack_lists for multiple-chunk series
hombit authored Dec 19, 2024
2 parents 3cb19bd + 5d39c52 commit 27dd430
Showing 2 changed files with 64 additions and 4 deletions.
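For context, a minimal sketch of the failure mode tracked in issue #180 (the column names and values are illustrative, mirroring the new tests in this commit): list columns backed by multi-chunk Arrow arrays, which can arise for example from pd.concat of ArrowDtype series, previously broke pack_lists.

```python
import pandas as pd
import pyarrow as pa

from nested_pandas.series import packer

# List-valued columns backed by multi-chunk Arrow arrays (three chunks of
# two rows each), mirroring the new tests in this commit.
chunked_a = pd.Series(
    pa.chunked_array([pa.array([[1, 2, 3], [4, 5]])] * 3),
    dtype=pd.ArrowDtype(pa.list_(pa.int64())),
    name="a",
)
chunked_b = pd.Series(
    pa.chunked_array([pa.array([[0.0, 1.0, 2.0], [3.0, 4.0]])] * 3),
    dtype=pd.ArrowDtype(pa.list_(pa.float64())),
    name="b",
)
list_df = pd.DataFrame({"a": chunked_a, "b": chunked_b})

# Before this commit, pack_lists could fail on such multi-chunk input;
# with the fix it packs both list columns into one nested series.
nested = packer.pack_lists(list_df)
print(nested.dtype)
```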
32 changes: 28 additions & 4 deletions src/nested_pandas/series/packer.py
@@ -196,10 +196,34 @@ def pack_lists(df: pd.DataFrame, name: str | None = None, *, validate: bool = True
    nested_pandas.series.dtype.NestedDtype : The dtype of the output series.
    nested_pandas.series.packer.pack_flat : Pack a "flat" dataframe with repeated indexes.
    """
    struct_array = pa.StructArray.from_arrays(
        [df[column] for column in df.columns],
        names=df.columns,
    )
    # When a series is converted with pa.array, the result may be either an Array or a ChunkedArray.
    # We convert everything to ChunkedArray for consistency.
    pa_arrays_maybe_chunked = {column: pa.array(df[column]) for column in df.columns}
    pa_chunked_arrays = {
        column: arr if isinstance(arr, pa.ChunkedArray) else pa.chunked_array([arr])
        for column, arr in pa_arrays_maybe_chunked.items()
    }

    # If all chunked arrays have the same chunk lengths, we can build a chunked struct array
    # with no data copying.
    chunk_lengths = pa.array([[len(chunk) for chunk in arr.chunks] for arr in pa_chunked_arrays.values()])
    if all(chunk_length == chunk_lengths[0] for chunk_length in chunk_lengths):
        chunks = []
        num_chunks = next(iter(pa_chunked_arrays.values())).num_chunks
        for i in range(num_chunks):
            chunks.append(
                pa.StructArray.from_arrays(
                    [arr.chunk(i) for arr in pa_chunked_arrays.values()],
                    names=pa_chunked_arrays.keys(),
                )
            )
        struct_array = pa.chunked_array(chunks)
    else:  # "flatten" the chunked arrays
        struct_array = pa.StructArray.from_arrays(
            [arr.combine_chunks() for arr in pa_chunked_arrays.values()],
            names=pa_chunked_arrays.keys(),
        )

    ext_array = NestedExtensionArray(struct_array, validate=validate)
    return pd.Series(
        ext_array,
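For context only (not part of the diff), a minimal pyarrow-only sketch of the behavior the new branch above relies on, using made-up arrays: when every column's ChunkedArray has the same per-chunk lengths, the columns can be zipped chunk by chunk into StructArrays and rechunked with no data copying; otherwise each column is first consolidated with combine_chunks().

```python
import pyarrow as pa

# Two columns with matching chunk layouts: two chunks of lengths 2 and 1.
a = pa.chunked_array([pa.array([1, 2]), pa.array([3])])
b = pa.chunked_array([pa.array([10.0, 20.0]), pa.array([30.0])])

# Aligned chunks: build one StructArray per chunk, then rechunk.
aligned = pa.chunked_array(
    [
        pa.StructArray.from_arrays([a.chunk(i), b.chunk(i)], names=["a", "b"])
        for i in range(a.num_chunks)
    ]
)

# Misaligned chunks (lengths 1 and 2): flatten each column into a single
# contiguous array before building the struct array.
c = pa.chunked_array([pa.array([10.0]), pa.array([20.0, 30.0])])
flattened = pa.StructArray.from_arrays(
    [a.combine_chunks(), c.combine_chunks()], names=["a", "c"]
)

print(aligned.num_chunks, len(flattened))  # 2 3
```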
36 changes: 36 additions & 0 deletions tests/nested_pandas/series/test_packer.py
@@ -286,6 +286,42 @@ def test_pack_lists():
    assert_series_equal(series.nest.get_list_series(field_name), packed_df[field_name])


def test_pack_lists_with_chunked_arrays():
    """Issue https://github.com/lincc-frameworks/nested-pandas/issues/180"""
    chunked_a = pd.Series(
        pa.chunked_array([pa.array([[1, 2, 3], [4, 5]])] * 3),
        dtype=pd.ArrowDtype(pa.list_(pa.int64())),
        name="a",
    )
    chunked_b = pd.Series(
        pa.chunked_array([pa.array([[0.0, 1.0, 2.0], [3.0, 4.0]])] * 3),
        dtype=pd.ArrowDtype(pa.list_(pa.float64())),
        name="b",
    )
    list_df = pd.DataFrame({"a": chunked_a, "b": chunked_b}, index=[0, 1, 2, 3, 4, 5])
    series = packer.pack_lists(list_df)
    assert_series_equal(series.nest.get_list_series("a"), chunked_a)
    assert_series_equal(series.nest.get_list_series("b"), chunked_b)


def test_pack_lists_with_uneven_chunked_arrays():
    """Issue https://github.com/lincc-frameworks/nested-pandas/issues/180"""
    chunked_a = pd.Series(
        pa.chunked_array([pa.array([[1, 2, 3], [4, 5]])] * 3),
        dtype=pd.ArrowDtype(pa.list_(pa.int64())),
        name="a",
    )
    chunked_b = pd.Series(
        pa.array([[0.0, 1.0, 2.0], [3.0, 4.0]] * 3),
        dtype=pd.ArrowDtype(pa.list_(pa.float64())),
        name="b",
    )
    list_df = pd.DataFrame({"a": chunked_a, "b": chunked_b}, index=[0, 1, 2, 3, 4, 5])
    series = packer.pack_lists(list_df)
    assert_series_equal(series.nest.get_list_series("a"), chunked_a)
    assert_series_equal(series.nest.get_list_series("b"), chunked_b)


def test_pack_seq_with_dfs_and_index():
    """Test pack_seq()."""
    dfs = [
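As an illustrative value-level check in the same spirit as the tests above (not part of the diff): the physical chunk layout of the input should not change the packed values, so packing the same lists from chunked and contiguous backings yields identical data.

```python
import pandas as pd
import pyarrow as pa

from nested_pandas.series import packer

values = [[1, 2, 3], [4, 5]] * 3

chunked = pd.Series(
    pa.chunked_array([pa.array([[1, 2, 3], [4, 5]])] * 3),
    dtype=pd.ArrowDtype(pa.list_(pa.int64())),
    name="a",
)
contiguous = pd.Series(
    pa.array(values),
    dtype=pd.ArrowDtype(pa.list_(pa.int64())),
    name="a",
)

packed_from_chunked = packer.pack_lists(pd.DataFrame({"a": chunked}))
packed_from_contiguous = packer.pack_lists(pd.DataFrame({"a": contiguous}))

# Compare by value: the physical chunking differs, the lists do not.
assert (
    packed_from_chunked.nest.get_list_series("a").to_list()
    == packed_from_contiguous.nest.get_list_series("a").to_list()
)
```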

0 comments on commit 27dd430
