Skip to content

Commit 72c6fa0

Browse files
committed
Some benchmarks with arrow strings.
1 parent 01e9544 commit 72c6fa0

File tree

2 files changed

+48
-0
lines changed

2 files changed

+48
-0
lines changed

tests/benchmarks/test_arrow.py

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
from ..utils_test import cluster_memory, timeseries_of_size, wait
2+
3+
4+
def test_unique(small_client, convert_string):
    """Benchmark computing the distinct values of the ``name`` column.

    The frame is sized from the value ``cluster_memory`` reports for the
    client; ``convert_string`` is requested only for its config side effect.
    """
    frame = timeseries_of_size(cluster_memory(small_client))
    distinct_names = frame.name.unique()
    # Third argument is presumably a timeout in seconds (10 minutes) -- confirm in utils_test.wait.
    wait(distinct_names, small_client, 10 * 60)
10+
11+
12+
def test_contains(small_client, convert_string):
    """Benchmark a substring test (``"a"``) on the ``name`` string column.

    The frame is sized from the value ``cluster_memory`` reports for the
    client; ``convert_string`` is requested only for its config side effect.
    """
    memory = cluster_memory(small_client)
    df = timeseries_of_size(memory)
    # String predicates live on the ``.str`` accessor; a Series has no
    # ``contains`` attribute, so calling it directly raises AttributeError.
    result = df.name.str.contains("a")
    # Third argument is presumably a timeout in seconds (10 minutes) -- confirm in utils_test.wait.
    wait(result, small_client, 10 * 60)
18+
19+
20+
def test_startswith(small_client, convert_string):
    """Benchmark a prefix test (``"B"``) on the ``name`` string column.

    The frame is sized from the value ``cluster_memory`` reports for the
    client; ``convert_string`` is requested only for its config side effect.
    """
    memory = cluster_memory(small_client)
    df = timeseries_of_size(memory)
    # String predicates live on the ``.str`` accessor; a Series has no
    # ``startswith`` attribute, so calling it directly raises AttributeError.
    result = df.name.str.startswith("B")
    # Third argument is presumably a timeout in seconds (10 minutes) -- confirm in utils_test.wait.
    wait(result, small_client, 10 * 60)
26+
27+
28+
def test_filter(small_client, convert_string):
    """Benchmark filtering the frame down to rows matching a single name.

    The name used for the filter is taken from the first row of the frame,
    so it is guaranteed to occur at least once.
    """
    frame = timeseries_of_size(cluster_memory(small_client))
    # Use the name found in the very first row as the filter value.
    first_row = frame.head(1)
    target = first_row.name.iloc[0]
    matches = frame[frame.name == target]
    # Third argument is presumably a timeout in seconds (10 minutes) -- confirm in utils_test.wait.
    wait(matches, small_client, 10 * 60)
35+
36+
37+
def test_value_counts(small_client, convert_string):
    """Benchmark counting occurrences of each value in the ``name`` column.

    The frame is sized from the value ``cluster_memory`` reports for the
    client; ``convert_string`` is requested only for its config side effect.
    """
    frame = timeseries_of_size(cluster_memory(small_client))
    counts = frame.name.value_counts()
    # Third argument is presumably a timeout in seconds (10 minutes) -- confirm in utils_test.wait.
    wait(counts, small_client, 10 * 60)

tests/conftest.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -634,6 +634,12 @@ def configure_shuffling(shuffle_method):
634634
yield
635635

636636

637+
@pytest.fixture
def convert_string():
    """Run the requesting test with ``dataframe.convert_string`` set to True.

    The setting is applied through the ``dask.config.set`` context manager,
    which stays open while the test body executes.
    """
    overrides = {"dataframe.convert_string": True}
    with dask.config.set(overrides):
        yield
641+
642+
637643
# Include https://github.com/dask/distributed/pull/7534
638644
# True when the installed ``distributed`` version is at least 2023.2.1.
P2P_RECHUNK_AVAILABLE = Version(distributed.__version__) >= Version("2023.2.1")
639645

0 commit comments

Comments
 (0)