Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file added tests/benchmark.db
Binary file not shown.
Binary file added tests/benchmarks/benchmark.db
Binary file not shown.
25 changes: 25 additions & 0 deletions tests/benchmarks/test_dataframe.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import numpy as np
from dask.datasets import timeseries
from dask.sizeof import sizeof
from dask.utils import format_bytes

Expand Down Expand Up @@ -58,3 +60,26 @@ def test_shuffle(small_client):
shuf = df.shuffle(0, shuffle="tasks")
result = shuf.size
wait(result, small_client, 20 * 60)


def test_ddf_isin(small_client):
"""
Checks the efficiency of serializing a large list for filtering
a dask dataframe, and filtering the dataframe by column
based on that list.

To get enough unique integers to make this meaningful, we need to create
colum 'A' as a float and then convert it to an int. This is caused by
the fact that random ints in timeseries() are drawn from a Poisson distribution.
"""
N = 10_000_000
rs = np.random.RandomState(42)
ddf = timeseries(end="2000-05-01", dtypes={"A": float, "B": int}, seed=42)
ddf.A = ddf.A.mul(N)
ddf.A = ddf.A.astype(int).persist()
a_column_unique_values = np.arange(1, N // 10)
filter_values_list = sorted(
rs.choice(a_column_unique_values, len(a_column_unique_values) // 2).tolist()
)
tmp_ddf = ddf.loc[ddf["A"].isin(filter_values_list)]
wait(tmp_ddf, small_client, 20 * 60)