diff --git a/src/nested_pandas/nestedframe/core.py b/src/nested_pandas/nestedframe/core.py index 2e9e552..4970930 100644 --- a/src/nested_pandas/nestedframe/core.py +++ b/src/nested_pandas/nestedframe/core.py @@ -65,7 +65,7 @@ def _split_query(self, expr) -> dict: """Splits a pandas query into multiple subqueries for nested and base layers""" # Ensure query has needed spacing for upcoming split expr = _ensure_spacing(expr) - nest_exprs = {col: [] for col in self.nested_cols + ["base"]} # type: dict + nest_exprs = {col: [] for col in self.nested_columns + ["base"]} # type: dict split_expr = expr.split(" ") i = 0 @@ -93,7 +93,48 @@ def _split_query(self, expr) -> dict: return {expr: " ".join(nest_exprs[expr]) for expr in nest_exprs if len(nest_exprs[expr]) > 0} def query(self, expr) -> Self: # type: ignore[name-defined] # noqa: F821 - """overwrite query to first check which columns should be queried""" + """ + Query the columns of a NestedFrame with a boolean expression. Specified + queries can target nested columns in addition to the typical column set + + Parameters + ---------- + expr : str + The query string to evaluate. + + Access nested columns using `nested_df.nested_col` (where + `nested_df` refers to a particular nested dataframe and + `nested_col` is a column of that nested dataframe). + + You can refer to variables + in the environment by prefixing them with an '@' character like + ``@a + b``. + + You can refer to column names that are not valid Python variable names + by surrounding them in backticks. Thus, column names containing spaces + or punctuations (besides underscores) or starting with digits must be + surrounded by backticks. (For example, a column named "Area (cm^2)" would + be referenced as ```Area (cm^2)```). Column names which are Python keywords + (like "list", "for", "import", etc) cannot be used. + + For example, if one of your columns is called ``a a`` and you want + to sum it with ``b``, your query should be ```a a` + b``. 
+ + Returns + ------- + DataFrame + DataFrame resulting from the provided query expression. + + Notes + ----- + Queries that target a particular nested structure return a dataframe + with rows of that particular nested structure filtered. For example, + querying the NestedFrame "df" with nested structure "mynested" as + below will return all rows of df, but with mynested filtered by the + condition: + + >>> df.query("mynested.a > 2") + """ # Rebuild queries for each specified nested/base layer exprs_to_use = self._split_query(expr) @@ -111,5 +152,5 @@ def query(self, expr) -> Self: # type: ignore[name-defined] # noqa: F821 result = super().query(exprs_to_use["base"], inplace=False) else: # TODO: does not work with queries that empty the dataframe - result[expr] = result[expr].ts.query_flat(exprs_to_use[expr]) + result[expr] = result[expr].nest.query_flat(exprs_to_use[expr]) return result diff --git a/tests/nested_pandas/nestedframe/test_nestedframe.py b/tests/nested_pandas/nestedframe/test_nestedframe.py index 8c70da3..802db81 100644 --- a/tests/nested_pandas/nestedframe/test_nestedframe.py +++ b/tests/nested_pandas/nestedframe/test_nestedframe.py @@ -1,4 +1,5 @@ import pandas as pd +import pytest from nested_pandas import NestedFrame @@ -74,3 +75,29 @@ def test_add_nested(): assert "nested" in base.columns assert base.nested.nest.to_flat().equals(nested) + + +def test_query(): + """Test that NestedFrame.query handles nested queries correctly""" + + base = NestedFrame(data={"a": [1, 2, 3], "b": [2, 4, 6]}, index=[0, 1, 2]) + + nested = pd.DataFrame( + data={"c": [0, 2, 4, 1, 4, 3, 1, 4, 1], "d": [5, 4, 7, 5, 3, 1, 9, 3, 4]}, + index=[0, 0, 0, 1, 1, 1, 2, 2, 2], + ) + + # Test vanilla queries + base = base.add_nested(nested, "nested") + assert len(base.query("a > 2")) == 1 + + # Check for the multi-layer error + with pytest.raises(ValueError): + base.query("a > 2 & nested.c > 1") + + # Test nested queries + nest_queried = base.query("nested.c > 1") + assert 
len(nest_queried.nested.nest.to_flat()) == 5 + + nest_queried = base.query("nested.c > 1 and nested.d>2") + assert len(nest_queried.nested.nest.to_flat()) == 4 diff --git a/tests/nested_pandas/nestedframe/test_utils.py b/tests/nested_pandas/nestedframe/test_utils.py new file mode 100644 index 0000000..3908cb8 --- /dev/null +++ b/tests/nested_pandas/nestedframe/test_utils.py @@ -0,0 +1,17 @@ +import pytest +from nested_pandas.nestedframe import utils + + +@pytest.mark.parametrize( + "in_out", + [ + ("a>3", "a > 3"), + ("test.a>5&b==2", "test.a > 5 & b == 2"), + ("b > 3", "b > 3"), + ("(a.b > 3)&(a.c == 'f')", "(a.b > 3) & (a.c == 'f')"), + ], +) +def test_ensure_spacing(in_out): + """test a set of input queries to make sure spacing is done correctly""" + expr, output = in_out + assert utils._ensure_spacing(expr) == output