Skip to content

Commit

Permalink
Comet pivot
Browse files Browse the repository at this point in the history
  • Loading branch information
SemyonSinchenko committed Jun 4, 2024
1 parent d65d206 commit d0c7f88
Show file tree
Hide file tree
Showing 5 changed files with 76 additions and 22 deletions.
19 changes: 10 additions & 9 deletions docs/benchmark_results.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ See `src/lib.rs` for details of the implementation.
| PySpark Comet case-when | 94.06 |
| PySpark-4 polars-udf | 53.06 |
| PySpark pivot | 104.21 |
| PySpark Comet pivot | 106.69 |


## Small Dataset
Expand All @@ -55,15 +56,15 @@ See `src/lib.rs` for details of the implementation.

| Tool | Time of processing in seconds |
| ---- | ----------------------------- |
| Pandas pivot | 214.67 |
| Polars pivot | 41.20 |
| DuckDB pivot | 28.60 |
| DuckDB case-when | 304.52 |
| PySpark pandas-udf | 516.38 |
| PySpark case-when | 1808.99 |
| PySpark Comet case-when | 729.75 |
| PySpark-4 polars-udf | 356.19 |
| PySpark pivot | 151.60 |
| Pandas pivot | OOM |
| Polars pivot | OOM |
| DuckDB pivot | 2181.59 |
| PySpark pandas-udf | 5983.14 |
| PySpark case-when | 17653.46 |
| PySpark Comet case-when | 4873.54 |
| PySpark-4 polars-udf | 4704.73 |
| PySpark pivot | 455.49 |
| PySpark Comet pivot | 412.17 |



Expand Down
Binary file modified docs/static/results_overview.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
18 changes: 9 additions & 9 deletions results/results_small.json
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
{
"Pandas pivot": 214.66964602470398,
"Polars pivot": 41.2032790184021,
"DuckDB pivot": 28.596801280975342,
"DuckDB case-when": 304.51860308647156,
"PySpark pandas-udf": 516.3818678855896,
"PySpark case-when": 1808.9883704185486,
"PySpark Comet case-when": 729.7482228279114,
"PySpark-4 polars-udf": 356.1914813518524,
"PySpark pivot": 151.59994101524353
"Pandas pivot": -1,
"Polars pivot": -1,
"DuckDB pivot": 2181.5874252319336,
"PySpark pandas-udf": 5983.137866973877,
"PySpark case-when": 17653.461864709854,
"PySpark Comet case-when": 4873.537589073181,
"PySpark-4 polars-udf": 4704.727216005325,
"PySpark pivot": 455.48546409606934,
"PySpark Comet pivot": 412.1713216304779
}
3 changes: 2 additions & 1 deletion results/results_tiny.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,5 +7,6 @@
"DuckDB case-when": 36.58749794960022,
"PySpark Comet case-when": 94.0553047657013,
"PySpark-4 polars-udf": 53.060457944869995,
"PySpark pivot": 104.20818734169006
"PySpark pivot": 104.20818734169006,
"PySpark Comet pivot": 106.69124364852905
}
58 changes: 55 additions & 3 deletions scripts/fill_template.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from pathlib import Path

import jinja2
import matplotlib.pyplot as plt

RESULTS_TINY = "results_tiny.json"
RESULTS_SMALL = "results_small.json"
Expand All @@ -18,20 +19,71 @@
results_small = json.load(proj_root.joinpath("results").joinpath(RESULTS_SMALL).open("r"))
results_medium = json.load(proj_root.joinpath("results").joinpath(RESULTS_MEDIUM).open("r"))

def clean_ooms(dct):
    """Replace the OOM sentinel value (-1) with 0, in place.

    A zero produces an empty bar in the chart, which is then annotated
    with an explanatory label by the plotting code below.
    """
    for tool, seconds in dct.items():
        if seconds == -1:
            dct[tool] = 0

# Normalize the OOM sentinel (-1 -> 0) in every result set before plotting.
for _results in (results_tiny, results_small, results_medium):
    clean_ooms(_results)

# Absolute path of the docs/static directory where generated images live.
images_prefix = str(proj_root.joinpath("docs", "static").absolute())

# One horizontal-bar panel per dataset size, stacked vertically.
f, ax = plt.subplots(nrows=3, ncols=1, figsize=(12, 12))

f.suptitle("Time processing in seconds, less are better")

# Panel 0: tiny dataset (17M rows).
y_pos = list(range(len(results_tiny)))
ax[0].set_title("17M of rows")
ax[0].barh(y_pos, [v for _, v in results_tiny.items()], align="center")
ax[0].set_yticks(y_pos, results_tiny.keys())
# Annotate OOM entries (stored as 0 after clean_ooms) with an explanation.
# BUG FIX: iterate results_tiny, not results_medium, so the bar index `i`
# lines up with this panel's rows and a key missing from results_medium
# cannot raise KeyError or mislabel a bar.
for i, kk in enumerate(results_tiny):
    if results_tiny[kk] == 0:
        # Inspired by https://stackoverflow.com/a/30229062
        ax[0].text(100, i, "Memory error or no space left on device", color="red", verticalalignment="center")

# Panel 1: small dataset (170M rows).
y_pos = list(range(len(results_small)))
ax[1].set_title("170M of rows")
ax[1].barh(y_pos, [v for _, v in results_small.items()], align="center")
ax[1].set_yticks(y_pos, results_small.keys())
# Annotate OOM entries (stored as 0 after clean_ooms) with an explanation.
# BUG FIX: iterate results_small, not results_medium, so the bar index `i`
# lines up with this panel's rows and a key missing from results_medium
# cannot raise KeyError or mislabel a bar.
for i, kk in enumerate(results_small):
    if results_small[kk] == 0:
        # Inspired by https://stackoverflow.com/a/30229062
        ax[1].text(100, i, "Memory error or no space left on device", color="red", verticalalignment="center")

# Panel 2: medium dataset (1.7B rows).
positions = list(range(len(results_medium)))
ax[2].set_title("1.7B of rows")
ax[2].barh(positions, list(results_medium.values()), align="center")
ax[2].set_yticks(positions, results_medium.keys())
# Annotate OOM entries (stored as 0 after clean_ooms) with an explanation.
for idx, tool in enumerate(results_medium):
    if results_medium[tool] == 0:
        # Inspired by https://stackoverflow.com/a/30229062
        ax[2].text(
            100,
            idx,
            "Memory error or no space left on device",
            color="red",
            verticalalignment="center",
        )

# Pack the three panels tightly, then write the overview chart that the
# README/docs embed as results_overview.png.
f.tight_layout()

f.savefig(f"{images_prefix}/results_overview.png")

# Convert tiny-dataset timings to display strings for the template:
# 0 marks an out-of-memory run (set by clean_ooms above), everything
# else is rendered with two decimal places.
# FIX: the scraped diff left both the old `== -1` and new `== 0`
# condition lines stacked here; only the `== 0` check is correct,
# because clean_ooms has already rewritten the -1 sentinel to 0.
for key in results_tiny:
    if results_tiny[key] == 0:
        results_tiny[key] = "OOM"
    else:
        results_tiny[key] = f"{results_tiny[key]:.2f}"

# Convert small-dataset timings to display strings for the template:
# 0 marks an out-of-memory run (set by clean_ooms above), everything
# else is rendered with two decimal places.
# FIX: the scraped diff left both the old `== -1` and new `== 0`
# condition lines stacked here; only the `== 0` check is correct,
# because clean_ooms has already rewritten the -1 sentinel to 0.
for key in results_small:
    if results_small[key] == 0:
        results_small[key] = "OOM"
    else:
        results_small[key] = f"{results_small[key]:.2f}"

# Convert medium-dataset timings to display strings for the template:
# 0 marks an out-of-memory run (set by clean_ooms above), everything
# else is rendered with two decimal places.
# FIX: the scraped diff left both the old `== -1` and new `== 0`
# condition lines stacked here; only the `== 0` check is correct,
# because clean_ooms has already rewritten the -1 sentinel to 0.
for key in results_medium:
    if results_medium[key] == 0:
        results_medium[key] = "OOM"
    else:
        results_medium[key] = f"{results_medium[key]:.2f}"
Expand Down

0 comments on commit d0c7f88

Please sign in to comment.