Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use the Pandas expression tree for query preflighting. #175

Merged
merged 1 commit into from
Nov 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
99 changes: 96 additions & 3 deletions src/nested_pandas/nestedframe/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,11 @@
from pandas._libs import lib
from pandas._typing import Any, AnyAll, Axis, IndexLabel
from pandas.api.extensions import no_default
from pandas.core.computation import ops
from pandas.core.computation.eval import Expr, ensure_scope
from pandas.core.computation.expr import PARSERS, PandasExprVisitor
from pandas.core.computation.parsing import clean_column_name

from nested_pandas.nestedframe.utils import extract_nest_names
from nested_pandas.series.dtype import NestedDtype
from nested_pandas.series.packer import pack, pack_lists, pack_sorted_df_into_struct

Expand Down Expand Up @@ -79,6 +81,22 @@ class _NestResolver(dict):
def __init__(self, outer: NestedFrame):
    """
    Create a resolver bound to ``outer``.

    A field resolver is registered immediately for every nested column
    that ``outer`` reports at construction time.
    """
    super().__init__()
    self._outer = outer
    # Eager registration for each nested column the frame has right now.
    for nested_name in outer.nested_columns:
        self._initialize_field_resolver(nested_name, outer)

def _initialize_field_resolver(self, column: str, outer: NestedFrame):
    """
    Register a field resolver for the nested column ``column``.

    The resolver is always stored under the column's own name. If
    ``clean_column_name`` yields a different key (e.g. for names with
    spaces or that are otherwise not identifier-like), a second resolver
    for the same column is stored under that cleaned key, so references
    coming from the Pandas evaluator can be captured too.
    """
    super().__setitem__(column, _NestedFieldResolver(column, outer))
    alias = clean_column_name(column)
    if alias == column:
        # Already identifier-like: no alias entry is needed.
        return
    super().__setitem__(alias, _NestedFieldResolver(column, outer))

def __contains__(self, item):
top_nest = item if "." not in item else item.split(".")[0].strip()
Expand All @@ -89,7 +107,7 @@ def __getitem__(self, item):
if not super().__contains__(top_nest):
if top_nest not in self._outer.nested_columns:
raise KeyError(f"Unknown nest {top_nest}")
super().__setitem__(top_nest, _NestedFieldResolver(top_nest, self._outer))
self._initialize_field_resolver(top_nest, self._outer)
return super().__getitem__(top_nest)

def __setitem__(self, item, _):
Expand Down Expand Up @@ -133,6 +151,48 @@ def __getattr__(self, item_name: str):
raise AttributeError(f"No attribute {item_name}")


def _subexprs_by_nest(parents: list, node) -> dict[str, list]:
"""
Given an expression which contains references to both base and nested
columns, return a dictionary of the sub-expressions that should be
evaluated independently, keyed by nesting context.

The key of the dictionary is the name of the nested column, and will
be a blank string in the case of base columns. The value is a list
of the parent nodes that lead to sub-expressions that can be evaluated
successfully.

While this is not in use today for automatically splitting expressions,
it can be used to detect whether an expression is suitably structured
for evaluation: the returned dictionary should have a single key.
"""
if isinstance(node, ops.Term) and not isinstance(node, ops.Constant):
if isinstance(node.value, _SeriesFromNest):
return {node.value.nest_name: parents}
return {getattr(node, "upper_name", ""): parents}
if not isinstance(node, ops.Op):
return {}
sources = [getattr(node, "lhs", None), getattr(node, "rhs", None)]
result: dict[str, list] = {}
for source in sources:
child = _subexprs_by_nest(parents, source)
for k, v in child.items():
result.setdefault(k, []).append(v)
# After a complete traversal across sources, check for any necessary splits.
# If it's homogenous, move the split-node up the tree.
if len(result) == 1:
# Let the record of each parent node drift up the tree,
# and merge the subtrees into a single node, since by definition,
# this node is homogeneous over all of its children, and can
# be evaluated in a single step.
result = {k: [node] for k in result}
# If the result is either empty or has more than one key, leave the result
# alone. Each key represents a different nest (with a blank string for the base),
# and the value is the highest point in the expression tree where the expression
# was still within a single nest.
return result


class NestedFrame(pd.DataFrame):
"""A Pandas Dataframe extension with support for nested structure.

Expand Down Expand Up @@ -457,6 +517,39 @@ def eval(self, expr: str, *, inplace: bool = False, **kwargs) -> Any | None:
kwargs["parser"] = "nested-pandas"
return super().eval(expr, **kwargs)

def extract_nest_names(
    self,
    expr: str,
    local_dict=None,
    global_dict=None,
    resolvers=(),
    level: int = 0,
    target=None,
    **kwargs,
) -> set[str]:
    """
    Given a string expression, parse it and visit the resulting expression tree,
    surfacing the nesting types. The purpose is to identify expressions that attempt
    to mix base and nested columns, or columns from two different nests.

    Parameters
    ----------
    expr : str
        The expression to inspect.
    local_dict, global_dict, target
        Passed through unchanged to ``ensure_scope``.
    resolvers : iterable of dict-like, or None
        Extra resolvers, placed ahead of the nest, column, and index
        resolvers of this frame. Any iterable (tuple, list) is accepted,
        and ``None`` is treated as "no extra resolvers".
    level : int, default 0
        Scope level; one is added before it is handed to ``ensure_scope``.
    **kwargs
        Accepted so callers can forward their ``eval``/``query`` keyword
        arguments wholesale; not otherwise used here.

    Returns
    -------
    set[str]
        The keys produced by ``_subexprs_by_nest`` for the parsed expression.
    """
    index_resolvers = self._get_index_resolvers()
    column_resolvers = self._get_cleaned_column_resolvers()
    # Normalise to a tuple first: concatenating a list (or None) with a
    # tuple would raise TypeError.
    resolvers = tuple(resolvers or ()) + (_NestResolver(self), column_resolvers, index_resolvers)
    # Parser needs to be the "nested-pandas" parser.
    # We also need the same variable context that eval() will have, so that
    # backtick-quoted names are substituted as expected.
    env = ensure_scope(
        level + 1,
        global_dict=global_dict,
        local_dict=local_dict,
        resolvers=resolvers,
        target=target,
    )
    parsed_expr = Expr(expr, parser="nested-pandas", env=env)
    separable = _subexprs_by_nest([], parsed_expr.terms)
    return set(separable)

def query(self, expr: str, *, inplace: bool = False, **kwargs) -> NestedFrame | None:
"""
Query the columns of a NestedFrame with a boolean expression. Specified
Expand Down Expand Up @@ -514,7 +607,7 @@ def query(self, expr: str, *, inplace: bool = False, **kwargs) -> NestedFrame |
# At present, the query expression must be either entirely within a
# single nest, or have nothing but base columns. Mixed structures are not
# supported, so preflight the expression.
nest_names = extract_nest_names(expr)
nest_names = self.extract_nest_names(expr, **kwargs)
if len(nest_names) > 1:
raise ValueError("Queries cannot target multiple structs/layers, write a separate query for each")
result = self.eval(expr, **kwargs)
Expand Down
61 changes: 0 additions & 61 deletions src/nested_pandas/nestedframe/utils.py

This file was deleted.

19 changes: 19 additions & 0 deletions tests/nested_pandas/nestedframe/test_nestedframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -594,6 +594,25 @@ def test_query():
assert base["nested.d"].shape == (2,)


def test_query_on_non_identifier_columns():
    """
    Column names usually look like Python identifiers, but nothing forces
    them to. Check that query() copes with names that do not, for both a
    base column and a nested column.
    """
    # Scenario from GH#174
    base_data = {"dog": [1, 2, 3], "good dog": [2, 4, 6]}
    frame = NestedFrame(data=base_data, index=[0, 1, 2])
    flat = pd.DataFrame(
        data={
            "a": [0, 2, 4, 1, 4, 3, 1, 4, 1],
            "b": [5, 4, 7, 5, 3, 1, 9, 3, 4],
        },
        index=[0, 0, 0, 1, 1, 1, 2, 2, 2],
    )
    frame = frame.add_nested(flat, "bad dog")

    # Backtick-quoted base column.
    by_base = frame.query("`good dog` > 3")
    assert frame.shape == (3, 3)
    assert by_base.shape == (2, 3)

    # Backtick-quoted nested column.
    by_nested = frame.query("`bad dog`.a > 2")
    assert by_nested["bad dog"].nest["a"].size == 4


def test_dropna():
"""Test that dropna works on all layers"""

Expand Down
48 changes: 36 additions & 12 deletions tests/nested_pandas/utils/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
import pandas as pd
import pytest
from nested_pandas import NestedFrame
from nested_pandas.nestedframe.utils import extract_nest_names
from nested_pandas.utils import count_nested


Expand Down Expand Up @@ -52,16 +51,41 @@ def test_check_expr_nesting():
used to ensure that an expression-based query does not try to combine base and nested
sub-expressions.
"""
assert extract_nest_names("a > 2 & nested.c > 1") == {"", "nested"}
assert extract_nest_names("(nested.c > 1) and (nested.d>2)") == {"nested"}
assert extract_nest_names("-1.52e-5 < abc < 35.2e2") == {""}
assert extract_nest_names("(n.a > 1) and ((b + c) > (d - 1e-8)) or n.q > c") == {"n", ""}
base = NestedFrame(data={"a": [1, 2, 3], "b": [2, np.nan, 6]}, index=[0, 1, 2])
nested = pd.DataFrame(
data={
"c": [0, 2, 4, 1, np.nan, 3, 1, 4, 1],
"d": [5, 4, 7, 5, 3, 1, 9, 3, 4],
"label": ["b", "a", "b", "b", "a", "a", "b", "a", "b"],
},
index=[0, 0, 0, 1, 1, 1, 2, 2, 2],
)
b1 = base.add_nested(nested, "nested")
assert b1.extract_nest_names("a > 2 & nested.c > 1") == {"", "nested"}
assert b1.extract_nest_names("(nested.c > 1) and (nested.d>2)") == {"nested"}
assert b1.extract_nest_names("-1.52e-5 < b < 35.2e2") == {""}

b2 = base.add_nested(nested.copy(), "n")
assert b2.extract_nest_names("(n.c > 1) and ((b + a) > (b - 1e-8)) or n.d > a") == {"n", ""}

abc = pd.DataFrame(
data={
"c": [3, 1, 4, 1, 5, 9, 2, 6, 5],
"d": [1, 4, 1, 2, 1, 3, 5, 6, 2],
"g": ["a", "b", "c", "d", "e", "f", "g", "h", "i"],
},
index=[0, 0, 0, 1, 1, 1, 2, 2, 2],
)
b3 = base.add_nested(abc, "abc").add_nested(abc, "c")
assert b3.extract_nest_names("abc.c > 2 & c.d < 5") == {"abc", "c"}

assert b3.extract_nest_names("(abc.d > 3) & (abc.c == [2, 5])") == {"abc"}
assert b3.extract_nest_names("(abc.d > 3)&(abc.g == 'f')") == {"abc"}
assert b3.extract_nest_names("(abc.d > 3) & (abc.g == 'f')") == {"abc"}

assert extract_nest_names("a.b > 2 & c.d < 5") == {"a", "c"}
assert b1.extract_nest_names("a>3") == {""}
assert b1.extract_nest_names("a > 3") == {""}

assert extract_nest_names("a>3") == {""}
assert extract_nest_names("a > 3") == {""}
assert extract_nest_names("test.a>5&b==2") == {"test", ""}
assert extract_nest_names("test.a > 5 & b == 2") == {"test", ""}
assert extract_nest_names("(a.b > 3)&(a.c == 'f')") == {"a"}
assert extract_nest_names("(a.b > 3) & (a.c == 'f')") == {"a"}
b4 = base.add_nested(nested, "test")
assert b4.extract_nest_names("test.c>5&b==2") == {"test", ""}
assert b4.extract_nest_names("test.c > 5 & b == 2") == {"test", ""}