Some benchmarks with arrow strings.

j-bennet · j-bennet · commit 72c6fa0cfdba · 2023-03-17T12:54:00.000-07:00
diff --git a/tests/benchmarks/test_arrow.py b/tests/benchmarks/test_arrow.py
@@ -0,0 +1,42 @@
+from ..utils_test import cluster_memory, timeseries_of_size, wait
+
+
+def test_unique(small_client, convert_string):
+    """Benchmark ``unique()`` on the ``name`` column of the generated timeseries."""
+    # The ``convert_string`` fixture turns on ``dataframe.convert_string`` for this test.
+    ddf = timeseries_of_size(cluster_memory(small_client))
+    distinct_names = ddf.name.unique()
+    wait(distinct_names, small_client, 10 * 60)
+
+
+def test_contains(small_client, convert_string):
+    """Benchmark ``str.contains("a")`` on the ``name`` column of the timeseries."""
+    # The ``convert_string`` fixture turns on ``dataframe.convert_string`` for this test.
+    df = timeseries_of_size(cluster_memory(small_client))
+    result = df.name.str.contains("a")  # string methods live on the ``.str`` accessor
+    wait(result, small_client, 10 * 60)
+
+
+def test_startswith(small_client, convert_string):
+    """Benchmark ``str.startswith("B")`` on the ``name`` column of the timeseries."""
+    # The ``convert_string`` fixture turns on ``dataframe.convert_string`` for this test.
+    df = timeseries_of_size(cluster_memory(small_client))
+    result = df.name.str.startswith("B")  # string methods live on the ``.str`` accessor
+    wait(result, small_client, 10 * 60)
+
+
+def test_filter(small_client, convert_string):
+    """Benchmark selecting the rows whose ``name`` equals the first row's name."""
+    ddf = timeseries_of_size(cluster_memory(small_client))
+    # Take the name found in the first row as the value to match against.
+    first_name = ddf.head(1).name.iloc[0]
+    matching_rows = ddf[ddf.name == first_name]
+    wait(matching_rows, small_client, 10 * 60)
+
+
+def test_value_counts(small_client, convert_string):
+    """Benchmark ``value_counts()`` on the ``name`` column of the generated timeseries."""
+    # The ``convert_string`` fixture turns on ``dataframe.convert_string`` for this test.
+    ddf = timeseries_of_size(cluster_memory(small_client))
+    name_counts = ddf.name.value_counts()
+    wait(name_counts, small_client, 10 * 60)
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -634,6 +634,12 @@ def configure_shuffling(shuffle_method):
         yield
 
 
+@pytest.fixture
+def convert_string():
+    with dask.config.set({"dataframe.convert_string": True}):  # on while the test body runs
+        yield
+
+
 # Include https://github.com/dask/distributed/pull/7534
 P2P_RECHUNK_AVAILABLE = Version(distributed.__version__) >= Version("2023.2.1")