Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Query expression preflighting now prohibits crossing nests. #165

Merged
merged 5 commits into from
Oct 31, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions src/nested_pandas/nestedframe/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
from nested_pandas.series.dtype import NestedDtype

from ..series.packer import pack_sorted_df_into_struct
from .utils import NestingType, check_expr_nesting
from .utils import check_expr_nesting


class NestedPandasExprVisitor(PandasExprVisitor):
Expand Down Expand Up @@ -506,11 +506,11 @@ def query(self, expr: str, *, inplace: bool = False, **kwargs) -> NestedFrame |
raise ValueError(msg)
kwargs["level"] = kwargs.pop("level", 0) + 1
kwargs["target"] = None
# At present, the query expression must be either entirely within the
# nested namespace or the base namespace. Mixed structures are not
# At present, the query expression must be either entirely within a
# single nest, or have nothing but base columns. Mixed structures are not
# supported, so preflight the expression.
nesting_types = check_expr_nesting(expr)
if NestingType.NESTED in nesting_types and NestingType.BASE in nesting_types:
nest_names = check_expr_nesting(expr)
if len(nest_names) > 1:
raise ValueError("Queries cannot target multiple structs/layers, write a separate query for each")
result = self.eval(expr, **kwargs)
# If the result is a _SeriesFromNest, then the evaluation has caused unpacking,
Expand Down
47 changes: 37 additions & 10 deletions src/nested_pandas/nestedframe/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,29 +12,56 @@ class NestingType(Enum):
NESTED = "nested"
gitosaurus marked this conversation as resolved.
Show resolved Hide resolved


def _expr_nesting_type(node: ast.expr | None) -> set[NestingType]:
def _actionable_splits(parents: list[ast.expr], node: ast.expr | None) -> dict[str, list]:
"""
Given an expression which contains references to both base and nested columns,
return a dictionary of the sub-expressions that should be evaluated independently.
The key of the dictionary is the name of the nested column, and will be a blank
string in the case of base columns. The value is a list of the parent nodes
that lead to sub-expressions that can be evaluated successfully.

While this is not in use today for automatically splitting expressions, it can
be used to detect whether an expression is suitably structured for evaluation:
the returned dictionary should have a single key.
"""
if not isinstance(node, ast.expr):
return set()
return {}
if isinstance(node, ast.Name):
return {NestingType.BASE}
if isinstance(node, ast.Attribute):
return {NestingType.NESTED}
return {"": parents}
if isinstance(node, ast.Attribute) and isinstance(node.value, ast.Name):
return {node.value.id: parents}
sources = (
[getattr(node, "left", None), getattr(node, "right", None)]
+ getattr(node, "values", [])
+ getattr(node, "comparators", [])
)
result: set[NestingType] = set()
result: dict[str, list] = {}
for s in sources:
result.update(_expr_nesting_type(s))
child = _actionable_splits(parents, s)
for k, v in child.items():
result[k] = result.get(k, []) + v
gitosaurus marked this conversation as resolved.
Show resolved Hide resolved
# After a complete traversal across sources, check for any necessary splits.
# If it's homogenous, move the split-node up the tree.
if len(result) == 1:
# Let the record of each parent node drift up the tree,
# and merge the subtrees into a single node, since by definition,
# this node is homogeneous over all of its children, and can
# be evaluated in a single step.
result = {k: [node] for k in result}
else:
# At this point, we need to split the expression. The idea here is that
# we want a succession of efficient queries, each of which will produce
# a subset of either the base or the nested columns.
pass
gitosaurus marked this conversation as resolved.
Show resolved Hide resolved
return result


def check_expr_nesting(expr: str) -> set[NestingType]:
def check_expr_nesting(expr: str) -> set[str]:
gitosaurus marked this conversation as resolved.
Show resolved Hide resolved
"""
Given a string expression, parse it and visit the resulting AST, surfacing
the nesting types. The purpose is to identify expressions that attempt
to mix base and nested columns, which will need to be handled specially.
to mix base and nested columns, or columns from two different nests.
"""
expr_tree = ast.parse(expr, mode="eval").body
return _expr_nesting_type(expr_tree)
separable = _actionable_splits([], expr_tree)
return set(list(separable.keys()))
gitosaurus marked this conversation as resolved.
Show resolved Hide resolved
29 changes: 12 additions & 17 deletions tests/nested_pandas/utils/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import pandas as pd
import pytest
from nested_pandas import NestedFrame
from nested_pandas.nestedframe.utils import NestingType, check_expr_nesting
from nested_pandas.nestedframe.utils import check_expr_nesting
from nested_pandas.utils import count_nested


Expand Down Expand Up @@ -52,21 +52,16 @@ def test_check_expr_nesting():
used to ensure that an expression-based query does not try to combine base and nested
sub-expressions.
"""
assert check_expr_nesting("a > 2 & nested.c > 1") == {NestingType.NESTED, NestingType.BASE}
assert check_expr_nesting("(nested.c > 1) and (nested.d>2)") == {NestingType.NESTED}
assert check_expr_nesting("-1.52e-5 < abc < 35.2e2") == {NestingType.BASE}
assert check_expr_nesting("(n.a > 1) and ((b + c) > (d - 1e-8)) or n.q > c") == {
NestingType.NESTED,
NestingType.BASE,
}
assert check_expr_nesting("a > 2 & nested.c > 1") == {"", "nested"}
assert check_expr_nesting("(nested.c > 1) and (nested.d>2)") == {"nested"}
assert check_expr_nesting("-1.52e-5 < abc < 35.2e2") == {""}
assert check_expr_nesting("(n.a > 1) and ((b + c) > (d - 1e-8)) or n.q > c") == {"n", ""}

# NOTE: this correctly captures the desired behavior here, but suggests that the two nests
# are interoperable, which is too strong a claim.
assert check_expr_nesting("a.b > 2 & c.d < 5") == {NestingType.NESTED}
assert check_expr_nesting("a.b > 2 & c.d < 5") == {"a", "c"}

assert check_expr_nesting("a>3") == {NestingType.BASE}
assert check_expr_nesting("a > 3") == {NestingType.BASE}
assert check_expr_nesting("test.a>5&b==2") == {NestingType.NESTED, NestingType.BASE}
assert check_expr_nesting("test.a > 5 & b == 2") == {NestingType.NESTED, NestingType.BASE}
assert check_expr_nesting("(a.b > 3)&(a.c == 'f')") == {NestingType.NESTED}
assert check_expr_nesting("(a.b > 3) & (a.c == 'f')") == {NestingType.NESTED}
assert check_expr_nesting("a>3") == {""}
assert check_expr_nesting("a > 3") == {""}
assert check_expr_nesting("test.a>5&b==2") == {"test", ""}
assert check_expr_nesting("test.a > 5 & b == 2") == {"test", ""}
assert check_expr_nesting("(a.b > 3)&(a.c == 'f')") == {"a"}
assert check_expr_nesting("(a.b > 3) & (a.c == 'f')") == {"a"}