Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Minor fixes #170

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 6 additions & 9 deletions src/nested_pandas/nestedframe/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
from pandas._libs import lib
from pandas._typing import Any, AnyAll, Axis, IndexLabel
from pandas.api.extensions import no_default
from pandas.api.types import is_bool_dtype
from pandas.core.computation.expr import PARSERS, PandasExprVisitor

from nested_pandas.nestedframe.utils import extract_nest_names
Expand Down Expand Up @@ -272,7 +271,8 @@
- inner: form intersection of calling frame's index with other
frame's index, preserving the order of the calling index.
on : str, list of str, default: None
Columns to join on.
Columns in `obj` frame to use as an index to join on rather than
`obj`'s index.
dtype : dtype or None
NestedDtype to use for the nested column; pd.ArrowDtype or
pa.DataType can also be used to specify the nested dtype. If None,
Expand Down Expand Up @@ -519,14 +519,11 @@
# to the nest and repack. Otherwise, apply it to this instance as usual,
# since it operated on the base attributes.
if isinstance(result, _SeriesFromNest):
if not is_bool_dtype(result.dtype):
raise ValueError("Query condition must evaluate to a boolean Series")

nest_name, flat_nest = result.nest_name, result.flat_nest

# Reset index to "ordinal" like [0, 0, 0, 1, 1, 2, 2, 2]
flat_nest = flat_nest.set_index(self[nest_name].array.list_index)
query_result = result.set_axis(self[nest_name].array.list_index)
list_index = self[nest_name].array.get_list_index()
flat_nest = flat_nest.set_index(list_index)
query_result = result.set_axis(list_index)
# Selecting flat values matching the query result
new_flat_nest = flat_nest[query_result]
new_df = self._set_filtered_flat_df(nest_name, new_flat_nest)
Expand Down Expand Up @@ -675,11 +672,11 @@
axis=axis, how=how, thresh=thresh, subset=subset, inplace=inplace, ignore_index=ignore_index
)
if ignore_index:
raise ValueError("ignore_index is not supported for nested columns")

Check warning on line 675 in src/nested_pandas/nestedframe/core.py

View check run for this annotation

Codecov / codecov/patch

src/nested_pandas/nestedframe/core.py#L675

Added line #L675 was not covered by tests
if subset is not None:
subset = [col.split(".")[-1] for col in subset]
target_flat = self[target].nest.to_flat()
target_flat = target_flat.set_index(self[target].array.list_index)
target_flat = target_flat.set_index(self[target].array.get_list_index())
if inplace:
target_flat.dropna(
axis=axis,
Expand Down
3 changes: 1 addition & 2 deletions src/nested_pandas/series/ext_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -648,8 +648,7 @@ def num_chunks(self) -> int:
"""Number of chunks in underlying pyarrow.ChunkedArray"""
return self._chunked_array.num_chunks

@property
def list_index(self) -> np.ndarray:
def get_list_index(self) -> np.ndarray:
"""Keys mapping values to lists"""
list_index = np.arange(len(self))
return np.repeat(list_index, np.diff(self.list_offsets))
Expand Down
125 changes: 123 additions & 2 deletions tests/nested_pandas/nestedframe/test_nestedframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,11 @@
import pandas as pd
import pyarrow as pa
import pytest
from pandas.testing import assert_frame_equal

from nested_pandas import NestedFrame
from nested_pandas.datasets import generate_data
from nested_pandas.nestedframe.core import _SeriesFromNest
from pandas.testing import assert_frame_equal


def test_nestedframe_construction():
Expand Down Expand Up @@ -184,13 +185,40 @@ def test_add_nested_with_flat_df():
assert_frame_equal(base.nested.nest.to_flat(), nested, check_dtype=False)


def test_add_nested_with_flat_df_on():
    """Verify add_nested packs a flat frame keyed by the column passed via ``on``."""
    base_frame = NestedFrame(data={"a": [1, 2, 3], "b": [2, 4, 6]}, index=[0, 1, 2])

    # The flat frame's own index deliberately differs from "new_index" so the
    # assertion below only passes when the ``on`` column is used for packing.
    flat_columns = {
        "new_index": [0, 0, 1, 1, 2, 2, 2, 2, 2],
        "c": [0, 2, 4, 1, 4, 3, 1, 4, 1],
        "d": [5, 4, 7, 5, 3, 1, 9, 3, 4],
    }
    nested = pd.DataFrame(data=flat_columns, index=[0, 0, 0, 1, 1, 1, 2, 2, 2])

    result = base_frame.add_nested(nested, "nested", on="new_index")
    assert "nested" in result.columns

    # Packing with on="new_index" should yield a flat frame indexed by the
    # "new_index" values; the index name is not preserved, so clear it on the
    # expected frame before comparing.
    expected_flat = nested.set_index("new_index")
    expected_flat.index.name = None

    actual_flat = result.nested.nest.to_flat()
    assert_frame_equal(actual_flat, expected_flat, check_dtype=False)

def test_add_nested_with_flat_df_and_mismatched_index():
"""Test add_nested when index values of base are missing matches in nested"""

base = NestedFrame(data={"a": [1, 2, 3], "b": [2, 4, 6]}, index=[0, 1, 2])

nested = pd.DataFrame(
data={"c": [0, 2, 4, 1, 4, 3, 1, 4, 1], "d": [5, 4, 7, 5, 3, 1, 9, 3, 4]},
data={
"c": [0, 2, 4, 1, 4, 3, 1, 4, 1],
"d": [5, 4, 7, 5, 3, 1, 9, 3, 4],
# A column we can have as an alternative joining index with 'on'
"new_index": [1, 1, 1, 1, 2, 2, 5, 5, 5],
},
# no data for base index value of "2" and introduces new index value "4"
index=[0, 0, 0, 1, 1, 1, 1, 4, 4],
)
Expand All @@ -212,6 +240,23 @@ def test_add_nested_with_flat_df_and_mismatched_index():
default_res = base.add_nested(nested, "nested")
assert_frame_equal(left_res, default_res)

# Test still adding the nested frame in a "left" fashion but on the "new_index" column
left_res_on = base.add_nested(nested, "nested", how="left", on="new_index")
assert "nested" in left_res_on.columns
# Check that the index of the base layer is still being used
assert (left_res_on.index == base.index).all()
# Assert that the new_index column we joined on was dropped from the nested layer
assert "new_index" not in left_res_on["nested"].nest.to_flat().columns
for idx in left_res_on.index:
# Check that the nested column is aligned correctly to the base layer
if idx in nested["new_index"].values:
assert left_res_on.loc[idx]["nested"] is not None
# Check that it is present in the new index we constructed for the nested layer
assert idx in left_res_on["nested"].nest.to_flat().index
else:
assert left_res_on.loc[idx]["nested"] is None
assert idx not in left_res_on["nested"].nest.to_flat().index

# Test adding the nested frame in a "right" fashion, where the index of the "right"
# frame (our nested layer) is preserved
right_res = base.add_nested(nested, "nested", how="right")
Expand All @@ -235,6 +280,36 @@ def test_add_nested_with_flat_df_and_mismatched_index():
else:
assert not pd.isna(right_res.loc[idx][col])

# Test still adding the nested frame in a "right" fashion but on the "new_index" column
right_res_on = base.add_nested(nested, "nested", how="right", on="new_index")
assert "nested" in right_res_on.columns
# Check that the index of the nested layer is being used. Note that, unlike
# a traditional join, this will not be the same as our nested layer index;
# it simply drops values from the base layer that don't have a match in
# the nested layer.
assert (right_res_on.index.values == np.unique(nested.new_index.values)).all()

# Check that the new_index column we joined on was dropped from the nested layer
assert "new_index" not in right_res_on["nested"].nest.to_flat().columns
# Check that the flattened nested layer has the same index as the original column we joined on
all(right_res_on.nested.nest.to_flat().index.values == nested.new_index.values)

# For each index check that the base layer is aligned correctly to the nested layer
for idx in right_res_on.index:
# Check that the nested column is aligned correctly to the base layer. Here
# it should never be None
assert right_res_on.loc[idx]["nested"] is not None
# Check the values for each column in our "base" layer
for col in base.columns:
assert col in right_res_on.columns
if idx not in base.index:
assert idx in nested["new_index"].values
# We expect a NaN value in the base layer due to the "right" join
assert pd.isna(right_res_on.loc[idx][col])
else:
# The index is present in the base layer, so its value should not be NaN
assert not pd.isna(right_res_on.loc[idx][col])

# Test the "outer" behavior
outer_res = base.add_nested(nested, "nested", how="outer")
assert "nested" in outer_res.columns
Expand All @@ -255,6 +330,36 @@ def test_add_nested_with_flat_df_and_mismatched_index():
else:
assert not pd.isna(outer_res.loc[idx][col])

# Test still adding the nested frame in an "outer" fashion but on the "new_index" column
outer_res_on = base.add_nested(nested, "nested", how="outer", on="new_index")
assert "nested" in outer_res_on.columns
# We expect the new index to be the union of the base and nested column we used
# for the 'on' argument
assert set(outer_res_on.index) == set(base.index).union(set(nested.new_index))

# Check that the new_index column we joined on was dropped from the nested layer
assert "new_index" not in right_res_on["nested"].nest.to_flat().columns
# Check that the flattened nested layer has the same index as the original column we joined on
# Note that it does not have index values only present in the base layer since those empty rows
# are dropped when we flatten the nested frame.
all(right_res_on.nested.nest.to_flat().index.values == nested.new_index.values)

for idx in outer_res_on.index:
# Check that the nested column is aligned correctly to the base layer
if idx in nested["new_index"].values:
assert outer_res_on.loc[idx]["nested"] is not None
else:
assert outer_res_on.loc[idx]["nested"] is None
# Check the values for each column in our "base" layer
for col in base.columns:
assert col in outer_res_on.columns
if idx not in base.index:
assert idx in nested["new_index"].values
# We expect a NaN value in the base layer due to the "outer" join
assert pd.isna(outer_res_on.loc[idx][col])
else:
assert not pd.isna(outer_res_on.loc[idx][col])

# Test the "inner" behavior
inner_res = base.add_nested(nested, "nested", how="inner")
assert "nested" in inner_res.columns
Expand All @@ -268,6 +373,22 @@ def test_add_nested_with_flat_df_and_mismatched_index():
assert col in inner_res.columns
assert not pd.isna(inner_res.loc[idx][col])

# Test still adding the nested frame in an "inner" fashion but on the "new_index" column
inner_res_on = base.add_nested(nested, "nested", how="inner", on="new_index")
assert "nested" in inner_res_on.columns
# We expect the new index to be the set intersection of the base and nested column we used
# for the 'on' argument
assert set(inner_res_on.index) == set(base.index).intersection(set(nested.new_index))
# Check that the new_index column we joined on was dropped from the nested layer
assert "new_index" not in right_res_on["nested"].nest.to_flat().columns
for idx in inner_res_on.index:
# None of our nested values should be None
assert inner_res_on.loc[idx]["nested"] is not None
assert idx in nested["new_index"].values
# Check the values for each column in our "base" layer
for col in base.columns:
assert col in inner_res_on.columns
assert not pd.isna(inner_res_on.loc[idx][col])

def test_add_nested_with_series():
"""Test that add_nested correctly adds a nested column to the base df"""
Expand Down
Loading