diff --git a/tests/benchmark.db b/tests/benchmark.db
new file mode 100644
index 0000000000..fba3fac4d5
Binary files /dev/null and b/tests/benchmark.db differ
diff --git a/tests/benchmarks/benchmark.db b/tests/benchmarks/benchmark.db
new file mode 100644
index 0000000000..fba3fac4d5
Binary files /dev/null and b/tests/benchmarks/benchmark.db differ
diff --git a/tests/benchmarks/test_dataframe.py b/tests/benchmarks/test_dataframe.py
index 4d4f5ce00d..da1fc87239 100644
--- a/tests/benchmarks/test_dataframe.py
+++ b/tests/benchmarks/test_dataframe.py
@@ -1,3 +1,5 @@
+import numpy as np
+from dask.datasets import timeseries
 from dask.sizeof import sizeof
 from dask.utils import format_bytes
 
@@ -58,3 +60,26 @@ def test_shuffle(small_client):
     shuf = df.shuffle(0, shuffle="tasks")
     result = shuf.size
     wait(result, small_client, 20 * 60)
+
+
+def test_ddf_isin(small_client):
+    """
+    Checks the efficiency of serializing a large list for filtering
+    a dask dataframe, and filtering the dataframe by column
+    based on that list.
+
+    To get enough unique integers to make this meaningful, we need to create
+    column 'A' as a float and then convert it to an int. This is necessary
+    because random ints in timeseries() are drawn from a Poisson distribution.
+    """
+    N = 10_000_000
+    rs = np.random.RandomState(42)
+    ddf = timeseries(end="2000-05-01", dtypes={"A": float, "B": int}, seed=42)
+    ddf.A = ddf.A.mul(N)
+    ddf.A = ddf.A.astype(int).persist()
+    a_column_unique_values = np.arange(1, N // 10)
+    filter_values_list = sorted(
+        rs.choice(a_column_unique_values, len(a_column_unique_values) // 2).tolist()
+    )
+    tmp_ddf = ddf.loc[ddf["A"].isin(filter_values_list)]
+    wait(tmp_ddf, small_client, 20 * 60)
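
Note (a sketch, not part of the diff): the docstring's reasoning can be checked
directly. dask's timeseries() draws int columns from a Poisson distribution
(lam of roughly 1000 is an assumption based on dask internals, not stated in
this PR), so a raw int column clusters tightly around its mean and has only a
few hundred distinct values, while scaling the roughly uniform float column and
truncating to int spreads it over roughly (-N, N). The snippet below, using
only public dask API plus the names from the diff, illustrates both that and
the approximate size of the filter list that must be serialized into the graph;
the printed numbers are expectations under those assumptions, not verified
benchmark output:

    import numpy as np
    from dask.datasets import timeseries
    from dask.sizeof import sizeof
    from dask.utils import format_bytes

    N = 10_000_000
    ddf = timeseries(end="2000-01-15", dtypes={"A": float, "B": int}, seed=42)

    # "B" is Poisson-distributed (assumed lam ~ 1000): tightly clustered
    # around its mean, so only a few hundred distinct values in ~1.2M rows.
    print(ddf["B"].nunique().compute())

    # "A" is roughly uniform in [-1, 1); scaling by N and truncating to int
    # spreads it over roughly (-N, N), giving on the order of a million
    # distinct keys for isin() to match against.
    print(ddf["A"].mul(N).astype(int).nunique().compute())

    # The ~500k-element Python list is itself nontrivial to embed in the
    # graph: dask's sizeof estimate puts it well into the megabytes, which
    # is what makes the serialization cost worth benchmarking.
    filter_values_list = np.arange(1, N // 10)[: N // 20].tolist()
    print(format_bytes(sizeof(filter_values_list)))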