From db29663612ff1d473d94d12d738f385155e53eaf Mon Sep 17 00:00:00 2001 From: Derek Jones Date: Fri, 18 Oct 2024 14:32:18 -0700 Subject: [PATCH 1/5] Query expression preflighting now prohibits crossing nests. Doing so never worked, but now it is correctly detected. --- src/nested_pandas/nestedframe/core.py | 10 +++--- src/nested_pandas/nestedframe/utils.py | 47 +++++++++++++++++++------ tests/nested_pandas/utils/test_utils.py | 29 +++++++-------- 3 files changed, 54 insertions(+), 32 deletions(-) diff --git a/src/nested_pandas/nestedframe/core.py b/src/nested_pandas/nestedframe/core.py index 0890732..57ea937 100644 --- a/src/nested_pandas/nestedframe/core.py +++ b/src/nested_pandas/nestedframe/core.py @@ -16,7 +16,7 @@ from nested_pandas.series.dtype import NestedDtype from ..series.packer import pack_sorted_df_into_struct -from .utils import NestingType, check_expr_nesting +from .utils import check_expr_nesting class NestedPandasExprVisitor(PandasExprVisitor): @@ -506,11 +506,11 @@ def query(self, expr: str, *, inplace: bool = False, **kwargs) -> NestedFrame | raise ValueError(msg) kwargs["level"] = kwargs.pop("level", 0) + 1 kwargs["target"] = None - # At present, the query expression must be either entirely within the - # nested namespace or the base namespace. Mixed structures are not + # At present, the query expression must be either entirely within a + # single nest, or have nothing but base columns. Mixed structures are not # supported, so preflight the expression. - nesting_types = check_expr_nesting(expr) - if NestingType.NESTED in nesting_types and NestingType.BASE in nesting_types: + nest_names = check_expr_nesting(expr) + if len(nest_names) > 1: raise ValueError("Queries cannot target multiple structs/layers, write a separate query for each") result = self.eval(expr, **kwargs) # If the result is a _SeriesFromNest, then the evaluation has caused unpacking, diff --git a/src/nested_pandas/nestedframe/utils.py b/src/nested_pandas/nestedframe/utils.py index b219cea..7deb365 100644 --- a/src/nested_pandas/nestedframe/utils.py +++ b/src/nested_pandas/nestedframe/utils.py @@ -12,29 +12,56 @@ class NestingType(Enum): NESTED = "nested" -def _expr_nesting_type(node: ast.expr | None) -> set[NestingType]: +def _actionable_splits(parents: list[ast.expr], node: ast.expr | None) -> dict[str, list]: + """ + Given an expression which contains references to both base and nested columns, + return a dictionary of the sub-expressions that should be evaluated independently. + The key of the dictionary is the name of the nested column, and will be a blank + string in the case of base columns. The value is a list of the parent nodes + that lead to sub-expressions that can be evaluated successfully. + + While this is not in use today for automatically splitting expressions, it can + be used to detect whether an expression is suitably structured for evaluation: + the returned dictionary should have a single key. + """ if not isinstance(node, ast.expr): - return set() + return {} if isinstance(node, ast.Name): - return {NestingType.BASE} - if isinstance(node, ast.Attribute): - return {NestingType.NESTED} + return {"": parents} + if isinstance(node, ast.Attribute) and isinstance(node.value, ast.Name): + return {node.value.id: parents} sources = ( [getattr(node, "left", None), getattr(node, "right", None)] + getattr(node, "values", []) + getattr(node, "comparators", []) ) - result: set[NestingType] = set() + result: dict[str, list] = {} for s in sources: - result.update(_expr_nesting_type(s)) + child = _actionable_splits(parents, s) + for k, v in child.items(): + result[k] = result.get(k, []) + v + # After a complete traversal across sources, check for any necessary splits. + # If it's homogenous, move the split-node up the tree. + if len(result) == 1: + # Let the record of each parent node drift up the tree, + # and merge the subtrees into a single node, since by definition, + # this node is homogeneous over all of its children, and can + # be evaluated in a single step. + result = {k: [node] for k in result} + else: + # At this point, we need to split the expression. The idea here is that + # we want a succession of efficient queries, each of which will produce + # a subset of either the base or the nested columns. + pass return result -def check_expr_nesting(expr: str) -> set[NestingType]: +def check_expr_nesting(expr: str) -> set[str]: """ Given a string expression, parse it and visit the resulting AST, surfacing the nesting types. The purpose is to identify expressions that attempt - to mix base and nested columns, which will need to be handled specially. + to mix base and nested columns, or columns from two different nests. """ expr_tree = ast.parse(expr, mode="eval").body - return _expr_nesting_type(expr_tree) + separable = _actionable_splits([], expr_tree) + return set(list(separable.keys())) diff --git a/tests/nested_pandas/utils/test_utils.py b/tests/nested_pandas/utils/test_utils.py index d76d592..ef33b81 100644 --- a/tests/nested_pandas/utils/test_utils.py +++ b/tests/nested_pandas/utils/test_utils.py @@ -2,7 +2,7 @@ import pandas as pd import pytest from nested_pandas import NestedFrame -from nested_pandas.nestedframe.utils import NestingType, check_expr_nesting +from nested_pandas.nestedframe.utils import check_expr_nesting from nested_pandas.utils import count_nested @@ -52,21 +52,16 @@ def test_check_expr_nesting(): used to ensure that an expression-based query does not try to combine base and nested sub-expressions. """ - assert check_expr_nesting("a > 2 & nested.c > 1") == {NestingType.NESTED, NestingType.BASE} - assert check_expr_nesting("(nested.c > 1) and (nested.d>2)") == {NestingType.NESTED} - assert check_expr_nesting("-1.52e-5 < abc < 35.2e2") == {NestingType.BASE} - assert check_expr_nesting("(n.a > 1) and ((b + c) > (d - 1e-8)) or n.q > c") == { - NestingType.NESTED, - NestingType.BASE, - } + assert check_expr_nesting("a > 2 & nested.c > 1") == {"", "nested"} + assert check_expr_nesting("(nested.c > 1) and (nested.d>2)") == {"nested"} + assert check_expr_nesting("-1.52e-5 < abc < 35.2e2") == {""} + assert check_expr_nesting("(n.a > 1) and ((b + c) > (d - 1e-8)) or n.q > c") == {"n", ""} - # NOTE: this correctly captures the desired behavior here, but suggests that the two nests - # are interoperable, which is too strong a claim. - assert check_expr_nesting("a.b > 2 & c.d < 5") == {NestingType.NESTED} + assert check_expr_nesting("a.b > 2 & c.d < 5") == {"a", "c"} - assert check_expr_nesting("a>3") == {NestingType.BASE} - assert check_expr_nesting("a > 3") == {NestingType.BASE} - assert check_expr_nesting("test.a>5&b==2") == {NestingType.NESTED, NestingType.BASE} - assert check_expr_nesting("test.a > 5 & b == 2") == {NestingType.NESTED, NestingType.BASE} - assert check_expr_nesting("(a.b > 3)&(a.c == 'f')") == {NestingType.NESTED} - assert check_expr_nesting("(a.b > 3) & (a.c == 'f')") == {NestingType.NESTED} + assert check_expr_nesting("a>3") == {""} + assert check_expr_nesting("a > 3") == {""} + assert check_expr_nesting("test.a>5&b==2") == {"test", ""} + assert check_expr_nesting("test.a > 5 & b == 2") == {"test", ""} + assert check_expr_nesting("(a.b > 3)&(a.c == 'f')") == {"a"} + assert check_expr_nesting("(a.b > 3) & (a.c == 'f')") == {"a"} From f617344d13fd4b84e03c8260a550b3dea7004cf7 Mon Sep 17 00:00:00 2001 From: "Derek T. Jones" Date: Thu, 31 Oct 2024 11:50:02 -0700 Subject: [PATCH 2/5] Apply suggestions from code review Co-authored-by: Konstantin Malanchev --- src/nested_pandas/nestedframe/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/nested_pandas/nestedframe/utils.py b/src/nested_pandas/nestedframe/utils.py index 7deb365..3d9de12 100644 --- a/src/nested_pandas/nestedframe/utils.py +++ b/src/nested_pandas/nestedframe/utils.py @@ -39,7 +39,7 @@ def _actionable_splits(parents: list[ast.expr], node: ast.expr | None) -> dict[s for s in sources: child = _actionable_splits(parents, s) for k, v in child.items(): - result[k] = result.get(k, []) + v + result.setdefault(k, []).append(v) # After a complete traversal across sources, check for any necessary splits. # If it's homogenous, move the split-node up the tree. if len(result) == 1: @@ -64,4 +64,4 @@ def check_expr_nesting(expr: str) -> set[str]: """ expr_tree = ast.parse(expr, mode="eval").body separable = _actionable_splits([], expr_tree) - return set(list(separable.keys())) + return set(separable.keys()) From ce37a14859b19004ed8b3db0db75bdade643248a Mon Sep 17 00:00:00 2001 From: Derek Jones Date: Thu, 31 Oct 2024 11:53:41 -0700 Subject: [PATCH 3/5] Remove vestigial NestedType enum; clarify recursive logic. --- src/nested_pandas/nestedframe/utils.py | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/src/nested_pandas/nestedframe/utils.py b/src/nested_pandas/nestedframe/utils.py index 3d9de12..67f6630 100644 --- a/src/nested_pandas/nestedframe/utils.py +++ b/src/nested_pandas/nestedframe/utils.py @@ -2,14 +2,6 @@ from __future__ import annotations import ast -from enum import Enum - - -class NestingType(Enum): - """Types of sub-expressions possible in a NestedFrame string expression.""" - - BASE = "base" - NESTED = "nested" def _actionable_splits(parents: list[ast.expr], node: ast.expr | None) -> dict[str, list]: @@ -48,11 +40,10 @@ def _actionable_splits(parents: list[ast.expr], node: ast.expr | None) -> dict[s # this node is homogeneous over all of its children, and can # be evaluated in a single step. result = {k: [node] for k in result} - else: - # At this point, we need to split the expression. The idea here is that - # we want a succession of efficient queries, each of which will produce - # a subset of either the base or the nested columns. - pass + # If the result is either empty or has more than one key, leave the result + # alone. Each key represents a different nest (with a blank string for the base), + # and the value is the highest point in the expression tree where the expression + # was still within a single nest. return result From 6aff4cf4ea90f88a7272babedc9ba61192e741f1 Mon Sep 17 00:00:00 2001 From: Derek Jones Date: Thu, 31 Oct 2024 11:58:38 -0700 Subject: [PATCH 4/5] Better names for utility functions. --- src/nested_pandas/nestedframe/core.py | 4 ++-- src/nested_pandas/nestedframe/utils.py | 29 ++++++++++++++----------- tests/nested_pandas/utils/test_utils.py | 24 ++++++++++---------- 3 files changed, 30 insertions(+), 27 deletions(-) diff --git a/src/nested_pandas/nestedframe/core.py b/src/nested_pandas/nestedframe/core.py index 57ea937..8736c19 100644 --- a/src/nested_pandas/nestedframe/core.py +++ b/src/nested_pandas/nestedframe/core.py @@ -16,7 +16,7 @@ from nested_pandas.series.dtype import NestedDtype from ..series.packer import pack_sorted_df_into_struct -from .utils import check_expr_nesting +from .utils import extract_nest_names class NestedPandasExprVisitor(PandasExprVisitor): @@ -509,7 +509,7 @@ def query(self, expr: str, *, inplace: bool = False, **kwargs) -> NestedFrame | # At present, the query expression must be either entirely within a # single nest, or have nothing but base columns. Mixed structures are not # supported, so preflight the expression. - nest_names = check_expr_nesting(expr) + nest_names = extract_nest_names(expr) if len(nest_names) > 1: raise ValueError("Queries cannot target multiple structs/layers, write a separate query for each") result = self.eval(expr, **kwargs) diff --git a/src/nested_pandas/nestedframe/utils.py b/src/nested_pandas/nestedframe/utils.py index 67f6630..4e56b5b 100644 --- a/src/nested_pandas/nestedframe/utils.py +++ b/src/nested_pandas/nestedframe/utils.py @@ -4,17 +4,20 @@ import ast -def _actionable_splits(parents: list[ast.expr], node: ast.expr | None) -> dict[str, list]: +def _subexprs_by_nest(parents: list[ast.expr], node: ast.expr | None) -> dict[str, list]: """ - Given an expression which contains references to both base and nested columns, - return a dictionary of the sub-expressions that should be evaluated independently. - The key of the dictionary is the name of the nested column, and will be a blank - string in the case of base columns. The value is a list of the parent nodes - that lead to sub-expressions that can be evaluated successfully. + Given an expression which contains references to both base and nested + columns, return a dictionary of the sub-expressions that should be + evaluated independently, keyed by nesting context. - While this is not in use today for automatically splitting expressions, it can - be used to detect whether an expression is suitably structured for evaluation: - the returned dictionary should have a single key. + The key of the dictionary is the name of the nested column, and will + be a blank string in the case of base columns. The value is a list + of the parent nodes that lead to sub-expressions that can be evaluated + successfully. + + While this is not in use today for automatically splitting expressions, + it can be used to detect whether an expression is suitably structured + for evaluation: the returned dictionary should have a single key. """ if not isinstance(node, ast.expr): return {} @@ -28,8 +31,8 @@ def _actionable_splits(parents: list[ast.expr], node: ast.expr | None) -> dict[s + getattr(node, "comparators", []) ) result: dict[str, list] = {} - for s in sources: - child = _actionable_splits(parents, s) + for source in sources: + child = _subexprs_by_nest(parents, source) for k, v in child.items(): result.setdefault(k, []).append(v) # After a complete traversal across sources, check for any necessary splits. @@ -47,12 +50,12 @@ def _actionable_splits(parents: list[ast.expr], node: ast.expr | None) -> dict[s return result -def check_expr_nesting(expr: str) -> set[str]: +def extract_nest_names(expr: str) -> set[str]: """ Given a string expression, parse it and visit the resulting AST, surfacing the nesting types. The purpose is to identify expressions that attempt to mix base and nested columns, or columns from two different nests. """ expr_tree = ast.parse(expr, mode="eval").body - separable = _actionable_splits([], expr_tree) + separable = _subexprs_by_nest([], expr_tree) return set(separable.keys()) diff --git a/tests/nested_pandas/utils/test_utils.py b/tests/nested_pandas/utils/test_utils.py index ef33b81..e2ec526 100644 --- a/tests/nested_pandas/utils/test_utils.py +++ b/tests/nested_pandas/utils/test_utils.py @@ -2,7 +2,7 @@ import pandas as pd import pytest from nested_pandas import NestedFrame -from nested_pandas.nestedframe.utils import check_expr_nesting +from nested_pandas.nestedframe.utils import extract_nest_names from nested_pandas.utils import count_nested @@ -52,16 +52,16 @@ def test_check_expr_nesting(): used to ensure that an expression-based query does not try to combine base and nested sub-expressions. """ - assert check_expr_nesting("a > 2 & nested.c > 1") == {"", "nested"} - assert check_expr_nesting("(nested.c > 1) and (nested.d>2)") == {"nested"} - assert check_expr_nesting("-1.52e-5 < abc < 35.2e2") == {""} - assert check_expr_nesting("(n.a > 1) and ((b + c) > (d - 1e-8)) or n.q > c") == {"n", ""} + assert extract_nest_names("a > 2 & nested.c > 1") == {"", "nested"} + assert extract_nest_names("(nested.c > 1) and (nested.d>2)") == {"nested"} + assert extract_nest_names("-1.52e-5 < abc < 35.2e2") == {""} + assert extract_nest_names("(n.a > 1) and ((b + c) > (d - 1e-8)) or n.q > c") == {"n", ""} - assert check_expr_nesting("a.b > 2 & c.d < 5") == {"a", "c"} + assert extract_nest_names("a.b > 2 & c.d < 5") == {"a", "c"} - assert check_expr_nesting("a>3") == {""} - assert check_expr_nesting("a > 3") == {""} - assert check_expr_nesting("test.a>5&b==2") == {"test", ""} - assert check_expr_nesting("test.a > 5 & b == 2") == {"test", ""} - assert check_expr_nesting("(a.b > 3)&(a.c == 'f')") == {"a"} - assert check_expr_nesting("(a.b > 3) & (a.c == 'f')") == {"a"} + assert extract_nest_names("a>3") == {""} + assert extract_nest_names("a > 3") == {""} + assert extract_nest_names("test.a>5&b==2") == {"test", ""} + assert extract_nest_names("test.a > 5 & b == 2") == {"test", ""} + assert extract_nest_names("(a.b > 3)&(a.c == 'f')") == {"a"} + assert extract_nest_names("(a.b > 3) & (a.c == 'f')") == {"a"} From a03b23bff330cf91a5ad87c4afb929ad7512c723 Mon Sep 17 00:00:00 2001 From: Derek Jones Date: Thu, 31 Oct 2024 12:22:32 -0700 Subject: [PATCH 5/5] Add a high-level test of `query()`, showing the resolution of #160 --- tests/nested_pandas/nestedframe/test_nestedframe.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/nested_pandas/nestedframe/test_nestedframe.py b/tests/nested_pandas/nestedframe/test_nestedframe.py index fa5da34..151b5f0 100644 --- a/tests/nested_pandas/nestedframe/test_nestedframe.py +++ b/tests/nested_pandas/nestedframe/test_nestedframe.py @@ -447,6 +447,11 @@ def test_query(): # Check for the multi-layer error with pytest.raises(ValueError): base.query("a > 2 & nested.c > 1") + # Create another nest in order to further test the multi-layer error + base_2 = base.eval("nest2.c = nested.c + 1") + assert len(base_2.nested_columns) == 2 + with pytest.raises(ValueError): + base_2.query("nested.c > 1 & nest2.c > 2") # Test nested queries nest_queried = base.query("nested.c > 1")