diff --git a/src/nested_pandas/series/packer.py b/src/nested_pandas/series/packer.py index a5edab7..c8acb59 100644 --- a/src/nested_pandas/series/packer.py +++ b/src/nested_pandas/series/packer.py @@ -57,36 +57,6 @@ def pack( return pack_seq(obj, name=name, index=index, dtype=dtype) -def pack_flat_into_df(df: pd.DataFrame, name=None) -> pd.DataFrame: - """Pack a "flat" dataframe into a "nested" dataframe. - - For the input dataframe with repeated indexes, make a pandas.DataFrame, - where each original column is replaced by a column of lists, and, - optionally, a "structure" column is added, containing a structure of - lists with the original columns. - - Parameters - ---------- - df : pd.DataFrame - Input dataframe, with repeated indexes. - - name : str, optional - Name of the structure column. The default is None, which means no - structure column is added. - - Returns - ------- - pd.DataFrame - Output dataframe. - """ - # TODO: we can optimize name=None case a bit - struct_series = pack_flat(df, name=name) - packed_df = struct_series.nest.to_lists() - if name is not None: - packed_df[name] = struct_series - return packed_df - - def pack_flat(df: pd.DataFrame, name: str | None = None) -> pd.Series: """Make a structure of lists representation of a "flat" dataframe. @@ -116,7 +86,7 @@ def pack_flat(df: pd.DataFrame, name: str | None = None) -> pd.Series: nested_pandas.series.packer.pack_lists : Pack a dataframe of nested arrays. """ - # TODO: think about the case when the data is pre-sorted and we don't need a data copy. + # pandas knows when index is pre-sorted, so it would do nothing if it is already sorted flat = df.sort_index(kind="stable") return pack_sorted_df_into_struct(flat, name=name) @@ -177,6 +147,9 @@ def pack_sorted_df_into_struct(df: pd.DataFrame, name: str | None = None) -> pd. pd.Series Output series, with unique indexes. """ + if not df.index.is_monotonic_increasing: + raise ValueError("The index of the input dataframe must be sorted") + packed_df = view_sorted_df_as_list_arrays(df) # No need to validate the dataframe, the length of the nested arrays is forced to be the same by # the view_sorted_df_as_list_arrays function. @@ -243,8 +216,11 @@ def view_sorted_df_as_list_arrays(df: pd.DataFrame) -> pd.DataFrame: Output dataframe, with unique indexes. It is a view over the input dataframe, so it would mute the input dataframe if modified. """ + if not df.index.is_monotonic_increasing: + raise ValueError("The index of the input dataframe must be sorted") + offset_array = calculate_sorted_index_offsets(df.index) - unique_index = df.index.values[offset_array[:-1]] + unique_index = df.index[offset_array[:-1]] series_ = { column: view_sorted_series_as_list_array(df[column], offset_array, unique_index) @@ -278,10 +254,13 @@ def view_sorted_series_as_list_array( Output series, with unique indexes. It is a view over the input series, so it would mute the input series if modified. """ + if not series.index.is_monotonic_increasing: + raise ValueError("The index of the input series must be sorted") + if offset is None: offset = calculate_sorted_index_offsets(series.index) if unique_index is None: - unique_index = series.index.values[offset[:-1]] + unique_index = series.index[offset[:-1]] list_array = pa.ListArray.from_arrays( offset, @@ -310,12 +289,12 @@ def calculate_sorted_index_offsets(index: pd.Index) -> np.ndarray: Output array of offsets, one element more than the number of unique index values. """ - # TODO: implement multi-index support - index_diff = np.diff(index.values, prepend=index.values[0] - 1, append=index.values[-1] + 1) - - if np.any(index_diff < 0): - raise ValueError("Table index must be strictly sorted.") + if not index.is_monotonic_increasing: + raise ValueError("The index must be sorted") - offset = np.nonzero(index_diff)[0] + # pd.Index.duplicated returns False for the first occurance and True for all others. + # So our offsets would be indexes of these False values with the array length in the end. + offset_but_last = np.nonzero(~index.duplicated(keep="first"))[0] + offset = np.append(offset_but_last, len(index)) return offset diff --git a/tests/nested_pandas/series/test_packer.py b/tests/nested_pandas/series/test_packer.py index 976661d..a77edf4 100644 --- a/tests/nested_pandas/series/test_packer.py +++ b/tests/nested_pandas/series/test_packer.py @@ -1,6 +1,7 @@ import numpy as np import pandas as pd import pyarrow as pa +import pytest from nested_pandas import NestedDtype from nested_pandas.series import packer from numpy.testing import assert_array_equal @@ -14,7 +15,7 @@ def test_pack_with_flat_df(): "a": [1, 2, 3, 4], "b": [0, 1, 0, 1], }, - index=[1, 2, 1, 2], + index=pd.MultiIndex.from_arrays(([1, 1, 1, 1], [1, 2, 1, 2])), ) series = packer.pack(df, name="series") @@ -23,7 +24,7 @@ def test_pack_with_flat_df(): (np.array([1, 3]), np.array([0, 0])), (np.array([2, 4]), np.array([1, 1])), ], - index=[1, 2], + index=pd.MultiIndex.from_arrays(([1, 1], [1, 2])), dtype=NestedDtype.from_fields(dict(a=pa.int64(), b=pa.int64())), name="series", ) @@ -87,55 +88,6 @@ def test_pack_with_series_of_dfs(): assert_series_equal(series, desired) -def test_pack_flat_into_df(): - """Test pack_flat_into_df().""" - df = pd.DataFrame( - data={ - "a": [7, 8, 9, 1, 2, 3, 4, 5, 6], - "b": [0, 1, 0, 0, 1, 0, 1, 0, 1], - }, - index=[4, 4, 4, 1, 1, 2, 2, 3, 3], - ) - actual = packer.pack_flat_into_df(df, name="struct") - - desired = pd.DataFrame( - data={ - "a": pd.Series( - data=[ - np.array([1, 2]), - np.array([3, 4]), - np.array([5, 6]), - np.array([7, 8, 9]), - ], - dtype=pd.ArrowDtype(pa.list_(pa.int64())), - index=[1, 2, 3, 4], - ), - "b": pd.Series( - data=[ - np.array([0, 1]), - np.array([0, 1]), - np.array([0, 1]), - np.array([0, 1, 0]), - ], - dtype=pd.ArrowDtype(pa.list_(pa.int64())), - index=[1, 2, 3, 4], - ), - "struct": pd.Series( - data=[ - (np.array([1, 2]), np.array([0, 1])), - (np.array([3, 4]), np.array([0, 1])), - (np.array([5, 6]), np.array([0, 1])), - (np.array([7, 8, 9]), np.array([0, 1, 0])), - ], - dtype=NestedDtype.from_fields(dict(a=pa.int64(), b=pa.int64())), - index=[1, 2, 3, 4], - ), - }, - ) - - assert_frame_equal(actual, desired) - - def test_pack_flat(): """Test pack_flat().""" df = pd.DataFrame( @@ -186,6 +138,19 @@ def test_pack_sorted_df_into_struct(): assert_series_equal(actual, desired) +def test_pack_sorted_df_into_struct_raises_when_not_sorted(): + """Test pack_sorted_df_into_struct() raises when not sorted.""" + df = pd.DataFrame( + data={ + "a": [1, 2, 3, 4, 5, 6, 7, 8, 9], + "b": [0, 1, 0, 1, 0, 1, 0, 1, 0], + }, + index=[1, 2, 1, 2, 3, 3, 4, 4, 4], + ) + with pytest.raises(ValueError): + packer.pack_sorted_df_into_struct(df) + + def test_pack_lists(): """Test pack_lists().""" packed_df = pd.DataFrame( @@ -362,6 +327,19 @@ def test_view_sorted_df_as_list_arrays(): assert_frame_equal(nested_df, desired_nested) +def test_view_sorted_df_as_list_arrays_raises_when_not_sorted(): + """Test view_sorted_df_as_list_arrays() raises when not sorted.""" + flat_df = pd.DataFrame( + data={ + "a": [1, 2, 3, 4, 5, 6, 7, 8, 9], + "b": [0, 1, 0, 1, 0, 1, 0, 1, 0], + }, + index=[1, 2, 1, 2, 3, 3, 4, 4, 4], + ) + with pytest.raises(ValueError): + packer.view_sorted_df_as_list_arrays(flat_df) + + def test_view_sorted_series_as_list_array(): """Test view_sorted_series_as_list_array().""" series = pd.Series( @@ -386,3 +364,49 @@ def test_view_sorted_series_as_list_array(): name="my_series", ) assert_series_equal(nested, desired_nested) + + +def test_view_sorted_series_as_list_array_raises_when_not_sorted(): + """Test view_sorted_series_as_list_array() raises when not sorted.""" + series = pd.Series( + data=[1, 2, 3, 4, 5, 6, 7, 8, 9], + index=[1, 2, 1, 2, 3, 3, 4, 4, 4], + ) + with pytest.raises(ValueError): + packer.view_sorted_series_as_list_array(series) + + +@pytest.mark.parametrize( + "index,offsets", + [ + (pd.Index([1, 2, 3, 4]), np.array([0, 1, 2, 3, 4])), + (pd.Index([1, 1, 2, 2, 3, 3, 4, 4, 4]), np.array([0, 2, 4, 6, 9])), + (pd.Index([1, 1, 1, 1, 1, 1, 1, 1, 1]), np.array([0, 9])), + (pd.Index([1, 2, 2, 2, 3, 3, 4]), np.array([0, 1, 4, 6, 7])), + ( + pd.MultiIndex.from_product([[1, 2, 3], ["a", "a", "b", "b", "b"]]), + np.array([0, 2, 5, 7, 10, 12, 15]), + ), + ( + pd.MultiIndex.from_arrays( + ( + [1, 1, 1, 1, 1, 1, 2, 2], + ["a", "a", "a", "a", "b", "b", "z", "z"], + [1, 2, 2, 2, 9, 9, 9, 9], + ), + names=["id1", "id2", "id3"], + ), + np.array([0, 1, 4, 6, 8]), + ), + ], +) +def test_calculate_sorted_index_offsets(index, offsets): + """Test calculate_sorted_index_offsets().""" + assert_array_equal(packer.calculate_sorted_index_offsets(index), offsets) + + +def test_calculate_sorted_index_offsets_raises_when_not_sorted(): + """Test calculate_sorted_index_offsets() raises when not sorted.""" + index = pd.Index([1, 2, 1, 2, 3, 3, 4, 4, 4]) + with pytest.raises(ValueError): + packer.calculate_sorted_index_offsets(index)