Skip to content

Commit

Permalink
add tests
Browse files Browse the repository at this point in the history
  • Loading branch information
dougbrn committed Apr 5, 2024
1 parent 73e1842 commit 7908233
Show file tree
Hide file tree
Showing 3 changed files with 88 additions and 3 deletions.
47 changes: 44 additions & 3 deletions src/nested_pandas/nestedframe/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ def _split_query(self, expr) -> dict:
"""Splits a pandas query into multiple subqueries for nested and base layers"""
# Ensure query has needed spacing for upcoming split
expr = _ensure_spacing(expr)
nest_exprs = {col: [] for col in self.nested_cols + ["base"]} # type: dict
nest_exprs = {col: [] for col in self.nested_columns + ["base"]} # type: dict
split_expr = expr.split(" ")

i = 0
Expand Down Expand Up @@ -93,7 +93,48 @@ def _split_query(self, expr) -> dict:
return {expr: " ".join(nest_exprs[expr]) for expr in nest_exprs if len(nest_exprs[expr]) > 0}

def query(self, expr) -> Self: # type: ignore[name-defined] # noqa: F821
"""overwrite query to first check which columns should be queried"""
"""
Query the columns of a NestedFrame with a boolean expression. Specified
queries can target nested columns in addition to the typical column set.
Parameters
----------
expr : str
The query string to evaluate.
Access nested columns using `nested_df.nested_col` (where
`nested_df` refers to a particular nested dataframe and
`nested_col` is a column of that nested dataframe).
You can refer to variables
in the environment by prefixing them with an '@' character like
``@a + b``.
You can refer to column names that are not valid Python variable names
by surrounding them in backticks. Thus, column names containing spaces
or punctuation (besides underscores) or starting with digits must be
surrounded by backticks. (For example, a column named "Area (cm^2)" would
be referenced as ```Area (cm^2)```). Column names which are Python keywords
(like "list", "for", "import", etc) cannot be used.
For example, if one of your columns is called ``a a`` and you want
to sum it with ``b``, your query should be ```a a` + b``.
Returns
-------
DataFrame
DataFrame resulting from the provided query expression.
Notes
-----
Queries that target a particular nested structure return a dataframe
with rows of that particular nested structure filtered. For example,
querying the NestedFrame "df" with nested structure "my_nested" as
below will return all rows of df, but with my_nested filtered by the
condition:
>>> df.query("my_nested.a > 2")
"""

# Rebuild queries for each specified nested/base layer
exprs_to_use = self._split_query(expr)
Expand All @@ -111,5 +152,5 @@ def query(self, expr) -> Self: # type: ignore[name-defined] # noqa: F821
result = super().query(exprs_to_use["base"], inplace=False)
else:
# TODO: does not work with queries that empty the dataframe
result[expr] = result[expr].ts.query_flat(exprs_to_use[expr])
result[expr] = result[expr].nest.query_flat(exprs_to_use[expr])
return result
27 changes: 27 additions & 0 deletions tests/nested_pandas/nestedframe/test_nestedframe.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import pandas as pd
import pytest
from nested_pandas import NestedFrame


Expand Down Expand Up @@ -74,3 +75,29 @@ def test_add_nested():

assert "nested" in base.columns
assert base.nested.nest.to_flat().equals(nested)


def test_query():
    """Check NestedFrame.query against base-only, mixed-layer, and nested-only expressions.

    A three-row base frame receives a nine-row nested frame (three nested
    rows per base index) under the column name "nested". The test then
    verifies that:

    * a query on a base column filters base rows,
    * a query mixing base and nested columns raises ``ValueError``,
    * a query on nested columns filters the flattened nested rows.
    """
    base_columns = {"a": [1, 2, 3], "b": [2, 4, 6]}
    frame = NestedFrame(data=base_columns, index=[0, 1, 2])

    nested_columns = {
        "c": [0, 2, 4, 1, 4, 3, 1, 4, 1],
        "d": [5, 4, 7, 5, 3, 1, 9, 3, 4],
    }
    nested_index = [0, 0, 0, 1, 1, 1, 2, 2, 2]
    flat_nested = pd.DataFrame(data=nested_columns, index=nested_index)

    frame = frame.add_nested(flat_nested, "nested")

    # Base-layer query: only the row with a == 3 satisfies "a > 2".
    base_only = frame.query("a > 2")
    assert len(base_only) == 1

    # Combining a base column and a nested column in one expression is rejected.
    with pytest.raises(ValueError):
        frame.query("a > 2 & nested.c > 1")

    # Nested-layer query with a single condition: five nested rows have c > 1.
    single_condition = frame.query("nested.c > 1")
    assert len(single_condition.nested.nest.to_flat()) == 5

    # Nested-layer query with two conditions (second one written without
    # spaces): four nested rows have both c > 1 and d > 2.
    double_condition = frame.query("nested.c > 1 and nested.d>2")
    assert len(double_condition.nested.nest.to_flat()) == 4
17 changes: 17 additions & 0 deletions tests/nested_pandas/nestedframe/test_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
import pytest
from nested_pandas.nestedframe import utils


@pytest.mark.parametrize(
    "in_out",
    [
        ("a>3", "a > 3"),
        ("test.a>5&b==2", "test.a > 5 & b == 2"),
        ("b > 3", "b > 3"),
        ("(a.b > 3)&(a.c == 'f')", "(a.b > 3) & (a.c == 'f')"),
    ],
)
def test_ensure_spacing(in_out):
    """Verify that ``utils._ensure_spacing`` produces the expected spaced query.

    Parameters
    ----------
    in_out : tuple of (str, str)
        The raw query string and the output expected from
        ``_ensure_spacing`` for that string.
    """
    raw_query, expected = in_out
    spaced = utils._ensure_spacing(raw_query)
    assert spaced == expected

0 comments on commit 7908233

Please sign in to comment.