Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for rank window functions #1191

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 18 additions & 2 deletions dask_sql/physical/rel/logical/window.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,10 +198,22 @@ def map_on_each_group(
# Calculate the results
new_columns = {}
for f, new_column_name, temporary_operand_columns in operations:
if f is None:
if f == "row_number":
# This is the row_number operator.
# We do not need to do any windowing
column_result = range(1, len(partitioned_group) + 1)
elif f == "rank":
column_result = partitioned_group.rank(method="min", na_option="top").iloc[
:, 0
]
elif f == "dense_rank":
column_result = partitioned_group.rank(
method="dense", na_option="top"
).iloc[:, 0]
elif f == "percent_rank":
column_result = partitioned_group.rank(
method="min", na_option="top", pct=True
).iloc[:, 0]
else:
column_result = f(windowed_group, *temporary_operand_columns)

Expand All @@ -226,7 +238,6 @@ class DaskWindowPlugin(BaseRelPlugin):
class_name = "Window"

OPERATION_MAPPING = {
"row_number": None, # That is the easiest one: we do not even need to have any windowing. We therefore treat it separately
"$sum0": SumOperation(),
"sum": SumOperation(),
"count": CountOperation(),
Expand All @@ -236,6 +247,11 @@ class DaskWindowPlugin(BaseRelPlugin):
"first_value": FirstValueOperation(),
"last_value": LastValueOperation(),
"avg": AvgOperation(),
# operations that don't require windowing
"row_number": "row_number",
"rank": "rank",
"dense_rank": "dense_rank",
"percent_rank": "percent_rank",
}

def convert(self, rel: "LogicalPlan", context: "dask_sql.Context") -> DataContainer:
Expand Down
13 changes: 4 additions & 9 deletions tests/integration/test_compatibility.py
Original file line number Diff line number Diff line change
Expand Up @@ -586,26 +586,21 @@ def test_window_row_number_partition_by():
)


@pytest.mark.xfail(
reason="Need to implement rank/lead/lag window functions, see https://github.com/dask-contrib/dask-sql/issues/878"
)
def test_window_ranks():
a = make_rand_df(100, a=int, b=(float, 50), c=(str, 50))
eq_sqlite(
"""
SELECT *,
RANK() OVER (PARTITION BY a ORDER BY b DESC NULLS FIRST, c) AS a1,
DENSE_RANK() OVER (ORDER BY a ASC, b DESC NULLS LAST, c DESC) AS a2,
PERCENT_RANK() OVER (ORDER BY a ASC, b ASC NULLS LAST, c) AS a4
PERCENT_RANK() OVER (ORDER BY a ASC, b ASC NULLS LAST, c) AS a3
FROM a
ORDER BY a NULLS FIRST, b NULLS FIRST, c NULLS FIRST
""",
a=a,
)


@pytest.mark.xfail(
reason="Need to implement rank/lead/lag window functions, see https://github.com/dask-contrib/dask-sql/issues/878"
)
def test_window_ranks_partition_by():
a = make_rand_df(100, a=int, b=(float, 50), c=(str, 50))
eq_sqlite(
Expand All @@ -624,7 +619,7 @@ def test_window_ranks_partition_by():


@pytest.mark.xfail(
reason="Need to implement rank/lead/lag window functions, see https://github.com/dask-contrib/dask-sql/issues/878"
reason="Need to implement lead/lag window functions, see https://github.com/dask-contrib/dask-sql/issues/878"
)
def test_window_lead_lag():
a = make_rand_df(100, a=float, b=(int, 50), c=(str, 50))
Expand All @@ -647,7 +642,7 @@ def test_window_lead_lag():


@pytest.mark.xfail(
reason="Need to implement rank/lead/lag window functions, see https://github.com/dask-contrib/dask-sql/issues/878"
reason="Need to implement lead/lag window functions, see https://github.com/dask-contrib/dask-sql/issues/878"
)
def test_window_lead_lag_partition_by():
a = make_rand_df(100, a=float, b=(int, 50), c=(str, 50))
Expand Down
Loading