Skip to content

Commit

Permalink
Comet pivot
Browse files Browse the repository at this point in the history
  • Loading branch information
SemyonSinchenko committed Jun 4, 2024
1 parent d65d206 commit d0c7f88
Show file tree
Hide file tree
Showing 5 changed files with 76 additions and 22 deletions.
19 changes: 10 additions & 9 deletions docs/benchmark_results.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ See `src/lib.rs` for details of the implementation.
| PySpark Comet case-when | 94.06 |
| PySpark-4 polars-udf | 53.06 |
| PySpark pivot | 104.21 |
| PySpark Comet pivot | 106.69 |


## Small Dataset
Expand All @@ -55,15 +56,15 @@ See `src/lib.rs` for details of the implementation.

| Tool | Time of processing in seconds |
| ---- | ----------------------------- |
| Pandas pivot | 214.67 |
| Polars pivot | 41.20 |
| DuckDB pivot | 28.60 |
| DuckDB case-when | 304.52 |
| PySpark pandas-udf | 516.38 |
| PySpark case-when | 1808.99 |
| PySpark Comet case-when | 729.75 |
| PySpark-4 polars-udf | 356.19 |
| PySpark pivot | 151.60 |
| Pandas pivot | OOM |
| Polars pivot | OOM |
| DuckDB pivot | 2181.59 |
| PySpark pandas-udf | 5983.14 |
| PySpark case-when | 17653.46 |
| PySpark Comet case-when | 4873.54 |
| PySpark-4 polars-udf | 4704.73 |
| PySpark pivot | 455.49 |
| PySpark Comet pivot | 412.17 |



Expand Down
Binary file modified docs/static/results_overview.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
18 changes: 9 additions & 9 deletions results/results_small.json
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
{
"Pandas pivot": 214.66964602470398,
"Polars pivot": 41.2032790184021,
"DuckDB pivot": 28.596801280975342,
"DuckDB case-when": 304.51860308647156,
"PySpark pandas-udf": 516.3818678855896,
"PySpark case-when": 1808.9883704185486,
"PySpark Comet case-when": 729.7482228279114,
"PySpark-4 polars-udf": 356.1914813518524,
"PySpark pivot": 151.59994101524353
"Pandas pivot": -1,
"Polars pivot": -1,
"DuckDB pivot": 2181.5874252319336,
"PySpark pandas-udf": 5983.137866973877,
"PySpark case-when": 17653.461864709854,
"PySpark Comet case-when": 4873.537589073181,
"PySpark-4 polars-udf": 4704.727216005325,
"PySpark pivot": 455.48546409606934,
"PySpark Comet pivot": 412.1713216304779
}
3 changes: 2 additions & 1 deletion results/results_tiny.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,5 +7,6 @@
"DuckDB case-when": 36.58749794960022,
"PySpark Comet case-when": 94.0553047657013,
"PySpark-4 polars-udf": 53.060457944869995,
"PySpark pivot": 104.20818734169006
"PySpark pivot": 104.20818734169006,
"PySpark Comet pivot": 106.69124364852905
}
58 changes: 55 additions & 3 deletions scripts/fill_template.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from pathlib import Path

import jinja2
import matplotlib.pyplot as plt

RESULTS_TINY = "results_tiny.json"
RESULTS_SMALL = "results_small.json"
Expand All @@ -18,20 +19,71 @@
results_small = json.load(proj_root.joinpath("results").joinpath(RESULTS_SMALL).open("r"))
results_medium = json.load(proj_root.joinpath("results").joinpath(RESULTS_MEDIUM).open("r"))

def clean_ooms(dct):
    """Replace the OOM sentinel value (-1) with 0, in place.

    A zero produces an empty bar in the chart, which is then annotated
    with an explanatory label by the plotting code below.
    """
    for tool, seconds in dct.items():
        if seconds == -1:
            dct[tool] = 0

# Normalize the OOM sentinel (-1 -> 0) in every result set before plotting.
for _results in (results_tiny, results_small, results_medium):
    clean_ooms(_results)

# Absolute path of the docs/static directory where generated images live.
images_prefix = str(proj_root.joinpath("docs", "static").absolute())

# One horizontal-bar panel per dataset size, stacked vertically.
f, ax = plt.subplots(nrows=3, ncols=1, figsize=(12, 12))

f.suptitle("Time processing in seconds, less are better")

# Panel 0: tiny dataset (17M rows).
y_pos = list(range(len(results_tiny)))
ax[0].set_title("17M of rows")
ax[0].barh(y_pos, [v for _, v in results_tiny.items()], align="center")
ax[0].set_yticks(y_pos, results_tiny.keys())
# Annotate OOM entries (stored as 0 after clean_ooms) with an explanation.
# BUG FIX: iterate results_tiny, not results_medium, so the bar index `i`
# lines up with this panel's rows and a key missing from results_medium
# cannot raise KeyError or mislabel a bar.
for i, kk in enumerate(results_tiny):
    if results_tiny[kk] == 0:
        # Inspired by https://stackoverflow.com/a/30229062
        ax[0].text(100, i, "Memory error or no space left on device", color="red", verticalalignment="center")

# Panel 1: small dataset (170M rows).
y_pos = list(range(len(results_small)))
ax[1].set_title("170M of rows")
ax[1].barh(y_pos, [v for _, v in results_small.items()], align="center")
ax[1].set_yticks(y_pos, results_small.keys())
# Annotate OOM entries (stored as 0 after clean_ooms) with an explanation.
# BUG FIX: iterate results_small, not results_medium, so the bar index `i`
# lines up with this panel's rows and a key missing from results_medium
# cannot raise KeyError or mislabel a bar.
for i, kk in enumerate(results_small):
    if results_small[kk] == 0:
        # Inspired by https://stackoverflow.com/a/30229062
        ax[1].text(100, i, "Memory error or no space left on device", color="red", verticalalignment="center")

# Panel 2: medium dataset (1.7B rows).
positions = list(range(len(results_medium)))
ax[2].set_title("1.7B of rows")
ax[2].barh(positions, list(results_medium.values()), align="center")
ax[2].set_yticks(positions, results_medium.keys())
# Annotate OOM entries (stored as 0 after clean_ooms) with an explanation.
for idx, tool in enumerate(results_medium):
    if results_medium[tool] == 0:
        # Inspired by https://stackoverflow.com/a/30229062
        ax[2].text(
            100,
            idx,
            "Memory error or no space left on device",
            color="red",
            verticalalignment="center",
        )

# Pack the three panels tightly, then write the overview chart that the
# README/docs embed as results_overview.png.
f.tight_layout()

f.savefig(f"{images_prefix}/results_overview.png")

# Convert tiny-dataset timings to display strings for the template:
# 0 marks an out-of-memory run (set by clean_ooms above), everything
# else is rendered with two decimal places.
# FIX: the scraped diff left both the old `== -1` and new `== 0`
# condition lines stacked here; only the `== 0` check is correct,
# because clean_ooms has already rewritten the -1 sentinel to 0.
for key in results_tiny:
    if results_tiny[key] == 0:
        results_tiny[key] = "OOM"
    else:
        results_tiny[key] = f"{results_tiny[key]:.2f}"

# Convert small-dataset timings to display strings for the template:
# 0 marks an out-of-memory run (set by clean_ooms above), everything
# else is rendered with two decimal places.
# FIX: the scraped diff left both the old `== -1` and new `== 0`
# condition lines stacked here; only the `== 0` check is correct,
# because clean_ooms has already rewritten the -1 sentinel to 0.
for key in results_small:
    if results_small[key] == 0:
        results_small[key] = "OOM"
    else:
        results_small[key] = f"{results_small[key]:.2f}"

# Convert medium-dataset timings to display strings for the template:
# 0 marks an out-of-memory run (set by clean_ooms above), everything
# else is rendered with two decimal places.
# FIX: the scraped diff left both the old `== -1` and new `== 0`
# condition lines stacked here; only the `== 0` check is correct,
# because clean_ooms has already rewritten the -1 sentinel to 0.
for key in results_medium:
    if results_medium[key] == 0:
        results_medium[key] = "OOM"
    else:
        results_medium[key] = f"{results_medium[key]:.2f}"
Expand Down

0 comments on commit d0c7f88

Please sign in to comment.