Skip to content

Commit 96503cd

Browse files
authored
Merge pull request #168 from lincc-frameworks/non-uniq-idx
Handle non-unique index
2 parents 1a2a1d2 + b652f7a commit 96503cd

File tree

6 files changed

+313
-54
lines changed

6 files changed

+313
-54
lines changed

src/nested_pandas/nestedframe/core.py

Lines changed: 56 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -12,11 +12,9 @@
1212
from pandas.api.extensions import no_default
1313
from pandas.core.computation.expr import PARSERS, PandasExprVisitor
1414

15-
from nested_pandas.series import packer
15+
from nested_pandas.nestedframe.utils import extract_nest_names
1616
from nested_pandas.series.dtype import NestedDtype
17-
18-
from ..series.packer import pack_sorted_df_into_struct
19-
from .utils import extract_nest_names
17+
from nested_pandas.series.packer import pack, pack_lists, pack_sorted_df_into_struct
2018

2119

2220
class NestedPandasExprVisitor(PandasExprVisitor):
@@ -219,10 +217,8 @@ def __setitem__(self, key, value):
219217
"." in key and key.split(".")[0] in self.nested_columns
220218
):
221219
nested, col = key.split(".")
222-
new_flat = self[nested].nest.to_flat()
223-
new_flat[col] = value
224-
packed = packer.pack(new_flat)
225-
return super().__setitem__(nested, packed)
220+
new_nested_series = self[nested].nest.with_flat_field(col, value)
221+
return super().__setitem__(nested, new_nested_series)
226222

227223
# Adding a new nested structure from a column
228224
# Allows statements like ndf["new_nested.t"] = ndf["nested.t"] - 5
@@ -231,8 +227,9 @@ def __setitem__(self, key, value):
231227
if isinstance(value, pd.Series):
232228
value.name = col
233229
value = value.to_frame()
234-
packed = packer.pack(value)
235-
return super().__setitem__(new_nested, packed)
230+
new_df = self.add_nested(value, name=new_nested)
231+
self._update_inplace(new_df)
232+
return None
236233

237234
return super().__setitem__(key, value)
238235

@@ -242,6 +239,7 @@ def add_nested(
242239
name: str,
243240
*,
244241
how: str = "left",
242+
on: None | str | list[str] = None,
245243
dtype: NestedDtype | pd.ArrowDtype | pa.DataType | None = None,
246244
) -> Self: # type: ignore[name-defined] # noqa: F821
247245
"""Packs input object to a nested column and adds it to the NestedFrame
@@ -272,6 +270,8 @@ def add_nested(
272270
index, and sort it lexicographically.
273271
- inner: form intersection of calling frame's index with other
274272
frame's index, preserving the order of the calling index.
273+
on : str, default: None
274+
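Name of the column to join on: `obj` is packed by this column and joined to the same-named column of the calling frame. If None, the index is used. Only a single column name is currently supported.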
275275
dtype : dtype or None
276276
NestedDtype to use for the nested column; pd.ArrowDtype or
277277
pa.DataType can also be used to specify the nested dtype. If None,
@@ -282,13 +282,16 @@ def add_nested(
282282
NestedFrame
283283
A new NestedFrame with the added nested column.
284284
"""
285+
if on is not None and not isinstance(on, str):
286+
raise ValueError("Currently we only support a single column for 'on'")
285287
# Add sources to objects
286-
packed = packer.pack(obj, name=name, dtype=dtype)
288+
packed = pack(obj, name=name, on=on, dtype=dtype)
287289
new_df = self.copy()
288-
return new_df.join(packed, how=how)
290+
res = new_df.join(packed, how=how, on=on)
291+
return res
289292

290293
@classmethod
291-
def from_flat(cls, df, base_columns, nested_columns=None, index=None, name="nested"):
294+
def from_flat(cls, df, base_columns, nested_columns=None, on: str | None = None, name="nested"):
292295
"""Creates a NestedFrame with base and nested columns from a flat
293296
dataframe.
294297
@@ -304,7 +307,7 @@ def from_flat(cls, df, base_columns, nested_columns=None, index=None, name="nest
304307
in the list will attempt to be packed into a single nested column
305308
with the name provided in `name`. If None, is defined as all
306309
columns not in `base_columns`.
307-
index: str, or None
310+
on: str or None
308311
The name of a column to use as the new index. Typically, the index
309312
should have a unique value per row for base columns, and should
310313
repeat for nested columns. For example, a dataframe with two
@@ -330,11 +333,11 @@ def from_flat(cls, df, base_columns, nested_columns=None, index=None, name="nest
330333
"""
331334

332335
# Resolve new index
333-
if index is not None:
336+
if on is not None:
334337
# if a base column is chosen remove it
335-
if index in base_columns:
336-
base_columns = [col for col in base_columns if col != index]
337-
df = df.set_index(index)
338+
if on in base_columns:
339+
base_columns = [col for col in base_columns if col != on]
340+
df = df.set_index(on)
338341

339342
# drop duplicates on index
340343
out_df = df[base_columns][~df.index.duplicated(keep="first")]
@@ -401,7 +404,7 @@ def from_lists(cls, df, base_columns=None, list_columns=None, name="nested"):
401404
raise ValueError("No columns were assigned as list columns.")
402405

403406
# Pack list columns into a nested column
404-
packed_df = packer.pack_lists(df[list_columns])
407+
packed_df = pack_lists(df[list_columns])
405408
packed_df.name = name
406409

407410
# join the nested column to the base_column df
@@ -519,17 +522,33 @@ def query(self, expr: str, *, inplace: bool = False, **kwargs) -> NestedFrame |
519522
# since it operated on the base attributes.
520523
if isinstance(result, _SeriesFromNest):
521524
nest_name, flat_nest = result.nest_name, result.flat_nest
522-
new_flat_nest = flat_nest.loc[result]
523-
result = self.copy()
524-
result[nest_name] = pack_sorted_df_into_struct(new_flat_nest)
525+
# Reset index to "ordinal" like [0, 0, 0, 1, 1, 2, 2, 2]
526+
list_index = self[nest_name].array.get_list_index()
527+
flat_nest = flat_nest.set_index(list_index)
528+
query_result = result.set_axis(list_index)
529+
# Selecting flat values matching the query result
530+
new_flat_nest = flat_nest[query_result]
531+
new_df = self._set_filtered_flat_df(nest_name, new_flat_nest)
525532
else:
526-
result = self.loc[result]
533+
new_df = self.loc[result]
527534

528535
if inplace:
529-
self._update_inplace(result)
536+
self._update_inplace(new_df)
530537
return None
531538
else:
532-
return result
539+
return new_df
540+
541+
def _set_filtered_flat_df(self, nest_name, flat_df):
542+
"""Set a filtered flat dataframe for a nested column
543+
544+
Here we assume that flat_df has a filtered "ordinal" index,
545+
e.g. flat_df.index == [0, 2, 2, 2], while self.index
546+
is arbitrary (e.g. ["a", "b", "a"]),
547+
and self[nest_name].array.list_index is [0, 0, 1, 1, 1, 2, 2, 2, 2].
548+
"""
549+
new_df = self.reset_index(drop=True)
550+
new_df[nest_name] = pack_sorted_df_into_struct(flat_df, name=nest_name)
551+
return new_df.set_index(self.index)
533552

534553
def _resolve_dropna_target(self, on_nested, subset):
535554
"""resolves the target layer for a given set of dropna kwargs"""
@@ -654,34 +673,32 @@ def dropna(
654673
return super().dropna(
655674
axis=axis, how=how, thresh=thresh, subset=subset, inplace=inplace, ignore_index=ignore_index
656675
)
676+
if ignore_index:
677+
raise ValueError("ignore_index is not supported for nested columns")
657678
if subset is not None:
658679
subset = [col.split(".")[-1] for col in subset]
680+
target_flat = self[target].nest.to_flat()
681+
target_flat = target_flat.set_index(self[target].array.get_list_index())
659682
if inplace:
660-
target_flat = self[target].nest.to_flat()
661683
target_flat.dropna(
662684
axis=axis,
663685
how=how,
664686
thresh=thresh,
665687
subset=subset,
666-
inplace=inplace,
667-
ignore_index=ignore_index,
688+
inplace=True,
668689
)
669-
self[target] = packer.pack_flat(target_flat)
670-
return self
671-
# Or if not inplace
672-
new_df = self.copy()
673-
new_df[target] = packer.pack_flat(
674-
new_df[target]
675-
.nest.to_flat()
676-
.dropna(
690+
else:
691+
target_flat = target_flat.dropna(
677692
axis=axis,
678693
how=how,
679694
thresh=thresh,
680695
subset=subset,
681-
inplace=inplace,
682-
ignore_index=ignore_index,
696+
inplace=False,
683697
)
684-
)
698+
new_df = self._set_filtered_flat_df(nest_name=target, flat_df=target_flat)
699+
if inplace:
700+
self._update_inplace(new_df)
701+
return None
685702
return new_df
686703

687704
def reduce(self, func, *args, **kwargs) -> NestedFrame: # type: ignore[override]

src/nested_pandas/series/ext_array.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -648,6 +648,14 @@ def num_chunks(self) -> int:
648648
"""Number of chunks in underlying pyarrow.ChunkedArray"""
649649
return self._chunked_array.num_chunks
650650

651+
def get_list_index(self) -> np.ndarray:
652+
"""Keys mapping values to lists"""
653+
if len(self) == 0:
654+
# Since we have no list offsets, return an empty array
655+
return np.array([], dtype=int)
656+
list_index = np.arange(len(self))
657+
return np.repeat(list_index, np.diff(self.list_offsets))
658+
651659
def iter_field_lists(self, field: str) -> Generator[np.ndarray, None, None]:
652660
"""Iterate over single field nested lists, as numpy arrays
653661

src/nested_pandas/series/packer.py

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ def pack(
2727
name: str | None = None,
2828
*,
2929
index=None,
30+
on: None | str | list[str] = None,
3031
dtype: NestedDtype | pd.ArrowDtype | pa.DataType | None = None,
3132
) -> pd.Series:
3233
"""Pack a "flat" dataframe or a sequence of dataframes into a "nested" series.
@@ -40,6 +41,8 @@ def pack(
4041
index : convertible to pd.Index, optional
4142
Index of the output series. If obj is a pd.DataFrame, it is always nested by the original index,
4243
and this value is used to override the index after the nesting.
44+
on: str or list of str, optional
45+
Column name(s) of obj to set as the index to nest by (applies when obj is a pd.DataFrame). If None, the index is used.
4346
dtype : dtype or None
4447
NestedDtype of the output series, or other type to derive from. If None,
4548
the dtype is inferred from the first non-missing dataframe.
@@ -50,14 +53,14 @@ def pack(
5053
Output series.
5154
"""
5255
if isinstance(obj, pd.DataFrame):
53-
nested = pack_flat(obj, name=name)
56+
nested = pack_flat(obj, name=name, on=on)
5457
if index is not None:
5558
nested.index = index
5659
return nested
5760
return pack_seq(obj, name=name, index=index, dtype=dtype)
5861

5962

60-
def pack_flat(df: pd.DataFrame, name: str | None = None) -> pd.Series:
63+
def pack_flat(df: pd.DataFrame, name: str | None = None, *, on: None | str | list[str] = None) -> pd.Series:
6164
"""Make a structure of lists representation of a "flat" dataframe.
6265
6366
For the input dataframe with repeated indexes, make a pandas.Series,
@@ -73,6 +76,8 @@ def pack_flat(df: pd.DataFrame, name: str | None = None) -> pd.Series:
7376
Input dataframe, with repeated indexes.
7477
name : str, optional
7578
Name of the pd.Series.
79+
on : str or list of str, optional
80+
Column name(s) to set as the index before packing. If None, the df's index is used.
7681
7782
Returns
7883
-------
@@ -86,9 +91,11 @@ def pack_flat(df: pd.DataFrame, name: str | None = None) -> pd.Series:
8691
nested_pandas.series.packer.pack_lists : Pack a dataframe of nested arrays.
8792
"""
8893

94+
if on is not None:
95+
df = df.set_index(on)
8996
# pandas knows when index is pre-sorted, so it would do nothing if it is already sorted
90-
flat = df.sort_index(kind="stable")
91-
return pack_sorted_df_into_struct(flat, name=name)
97+
sorted_flat = df.sort_index(kind="stable")
98+
return pack_sorted_df_into_struct(sorted_flat, name=name)
9299

93100

94101
def pack_seq(

0 commit comments

Comments
 (0)