Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for rank window functions #1191

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 18 additions & 2 deletions dask_sql/physical/rel/logical/window.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,10 +198,22 @@ def map_on_each_group(
# Calculate the results
new_columns = {}
for f, new_column_name, temporary_operand_columns in operations:
if f is None:
if f == "row_number":
# This is the row_number operator.
# We do not need to do any windowing
column_result = range(1, len(partitioned_group) + 1)
elif f == "rank":
column_result = partitioned_group.rank(method="min", na_option="top").iloc[
:, 0
]
elif f == "dense_rank":
column_result = partitioned_group.rank(
method="dense", na_option="top"
).iloc[:, 0]
elif f == "percent_rank":
column_result = partitioned_group.rank(
method="min", na_option="top", pct=True
).iloc[:, 0]
else:
column_result = f(windowed_group, *temporary_operand_columns)

Expand All @@ -226,7 +238,6 @@ class DaskWindowPlugin(BaseRelPlugin):
class_name = "Window"

OPERATION_MAPPING = {
"row_number": None, # That is the easiest one: we do not even need to have any windowing. We therefore treat it separately
"$sum0": SumOperation(),
"sum": SumOperation(),
"count": CountOperation(),
Expand All @@ -236,6 +247,11 @@ class DaskWindowPlugin(BaseRelPlugin):
"first_value": FirstValueOperation(),
"last_value": LastValueOperation(),
"avg": AvgOperation(),
# operations that don't require windowing
"row_number": "row_number",
"rank": "rank",
"dense_rank": "dense_rank",
"percent_rank": "percent_rank",
}

def convert(self, rel: "LogicalPlan", context: "dask_sql.Context") -> DataContainer:
Expand Down
13 changes: 4 additions & 9 deletions tests/integration/test_compatibility.py
Original file line number Diff line number Diff line change
Expand Up @@ -586,26 +586,21 @@ def test_window_row_number_partition_by():
)


@pytest.mark.xfail(
reason="Need to implement rank/lead/lag window functions, see https://github.com/dask-contrib/dask-sql/issues/878"
)
def test_window_ranks():
a = make_rand_df(100, a=int, b=(float, 50), c=(str, 50))
eq_sqlite(
"""
SELECT *,
RANK() OVER (PARTITION BY a ORDER BY b DESC NULLS FIRST, c) AS a1,
DENSE_RANK() OVER (ORDER BY a ASC, b DESC NULLS LAST, c DESC) AS a2,
PERCENT_RANK() OVER (ORDER BY a ASC, b ASC NULLS LAST, c) AS a4
PERCENT_RANK() OVER (ORDER BY a ASC, b ASC NULLS LAST, c) AS a3
FROM a
ORDER BY a NULLS FIRST, b NULLS FIRST, c NULLS FIRST
""",
a=a,
)


@pytest.mark.xfail(
reason="Need to implement rank/lead/lag window functions, see https://github.com/dask-contrib/dask-sql/issues/878"
)
def test_window_ranks_partition_by():
a = make_rand_df(100, a=int, b=(float, 50), c=(str, 50))
eq_sqlite(
Expand All @@ -624,7 +619,7 @@ def test_window_ranks_partition_by():


@pytest.mark.xfail(
reason="Need to implement rank/lead/lag window functions, see https://github.com/dask-contrib/dask-sql/issues/878"
reason="Need to implement lead/lag window functions, see https://github.com/dask-contrib/dask-sql/issues/878"
)
def test_window_lead_lag():
a = make_rand_df(100, a=float, b=(int, 50), c=(str, 50))
Expand All @@ -647,7 +642,7 @@ def test_window_lead_lag():


@pytest.mark.xfail(
reason="Need to implement rank/lead/lag window functions, see https://github.com/dask-contrib/dask-sql/issues/878"
reason="Need to implement lead/lag window functions, see https://github.com/dask-contrib/dask-sql/issues/878"
)
def test_window_lead_lag_partition_by():
a = make_rand_df(100, a=float, b=(int, 50), c=(str, 50))
Expand Down
Loading