From 654ae6ccd409c0dcf1fedd62baea2f82f3594c3c Mon Sep 17 00:00:00 2001
From: James Bourbeau <jrbourbeau@gmail.com>
Date: Wed, 14 Dec 2022 16:30:56 -0600
Subject: [PATCH 1/5] Try pyarrow-backed dtypes

---
 .github/workflows/tests.yml                 | 51 +++++++++++----------
 ci/create_runtime_meta.py                   |  2 +-
 tests/benchmarks/h2o/test_h2o_benchmarks.py |  5 +-
 tests/benchmarks/test_parquet.py            |  6 ++-
 tests/conftest.py                           |  4 +-
 tests/runtime/test_xgboost.py               |  1 +
 tests/stability/test_shuffle.py             |  4 +-
 7 files changed, 43 insertions(+), 30 deletions(-)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index f0f8bfb513..33c42edbef 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -32,43 +32,44 @@ jobs:
         os: [ubuntu-latest]
         python-version: ["3.9"]
         pytest_args: [tests]
-        runtime-version: [upstream, latest, "0.2.1"]
+        runtime-version: [upstream]
+        # runtime-version: [upstream, latest, "0.2.1"]
         include:
           # Run stability tests on Python 3.8
           - pytest_args: tests/stability
             python-version: "3.8"
             runtime-version: upstream
             os: ubuntu-latest
-          - pytest_args: tests/stability
-            python-version: "3.8"
-            runtime-version: latest
-            os: ubuntu-latest
-          - pytest_args: tests/stability
-            python-version: "3.8"
-            runtime-version: "0.2.1"
-            os: ubuntu-latest
+          # - pytest_args: tests/stability
+          #   python-version: "3.8"
+          #   runtime-version: latest
+          #   os: ubuntu-latest
+          # - pytest_args: tests/stability
+          #   python-version: "3.8"
+          #   runtime-version: "0.2.1"
+          #   os: ubuntu-latest
           # Run stability tests on Python 3.10
           - pytest_args: tests/stability
             python-version: "3.10"
             runtime-version: upstream
             os: ubuntu-latest
-          - pytest_args: tests/stability
-            python-version: "3.10"
-            runtime-version: latest
-            os: ubuntu-latest
-          - pytest_args: tests/stability
-            python-version: "3.10"
-            runtime-version: "0.2.1"
-            os: ubuntu-latest
+          # - pytest_args: tests/stability
+          #   python-version: "3.10"
+          #   runtime-version: latest
+          #   os: ubuntu-latest
+          # - pytest_args: tests/stability
+          #   python-version: "3.10"
+          #   runtime-version: "0.2.1"
+          #   os: ubuntu-latest
           # Run stability tests on Python Windows and MacOS (latest py39 only)
-          - pytest_args: tests/stability
-            python-version: "3.9"
-            runtime-version: latest
-            os: windows-latest
-          - pytest_args: tests/stability
-            python-version: "3.9"
-            runtime-version: latest
-            os: macos-latest
+          # - pytest_args: tests/stability
+          #   python-version: "3.9"
+          #   runtime-version: latest
+          #   os: windows-latest
+          # - pytest_args: tests/stability
+          #   python-version: "3.9"
+          #   runtime-version: latest
+          #   os: macos-latest
 
     steps:
       - name: Checkout
diff --git a/ci/create_runtime_meta.py b/ci/create_runtime_meta.py
index 532ae715cb..b44f427cdf 100644
--- a/ci/create_runtime_meta.py
+++ b/ci/create_runtime_meta.py
@@ -44,7 +44,7 @@ def main():
         requirements.append(
             {
                 "pip": [
-                    "git+https://github.com/dask/dask@main",
+                    "git+https://github.com/jrbourbeau/dask@pyarrow-use-nullable-dtypes",
                     "git+https://github.com/dask/distributed@main",
                 ]
             }
diff --git a/tests/benchmarks/h2o/test_h2o_benchmarks.py b/tests/benchmarks/h2o/test_h2o_benchmarks.py
index a97c88d97d..aa8be8b916 100644
--- a/tests/benchmarks/h2o/test_h2o_benchmarks.py
+++ b/tests/benchmarks/h2o/test_h2o_benchmarks.py
@@ -50,7 +50,10 @@ def ddf(request):
         )
     else:
         yield dd.read_parquet(
-            request.param, engine="pyarrow", storage_options={"anon": True}
+            request.param,
+            engine="pyarrow",
+            storage_options={"anon": True},
+            use_nullable_dtypes=True,
         )
 
 
diff --git a/tests/benchmarks/test_parquet.py b/tests/benchmarks/test_parquet.py
index 1ad6d6428b..825cd97267 100644
--- a/tests/benchmarks/test_parquet.py
+++ b/tests/benchmarks/test_parquet.py
@@ -50,6 +50,7 @@ def test_read_spark_generated_data(parquet_client):
         "s3://coiled-runtime-ci/thousandgenomes_dagen/NA21**.parquet",
         engine="pyarrow",
         index="sample_id",
+        use_nullable_dtypes=True,
     )
     ddf.groupby(ddf.index).first().compute()
 
@@ -65,6 +66,7 @@ def test_read_hive_partitioned_data(parquet_client):
     ddf = dd.read_parquet(
         "s3://coiled-runtime-ci/ookla-open-data/type=fixed/**.parquet",
         engine="pyarrow",
+        use_nullable_dtypes=True,
     )
 
     ddf.groupby(["year", "quarter"]).first().compute()
@@ -108,4 +110,6 @@ def load(path):
             parquet_client.submit(pandas.read_parquet, path, engine="pyarrow")
         )
     elif kind == "dask":
-        distributed.wait(dd.read_parquet(path, engine="pyarrow").persist())
+        distributed.wait(
+            dd.read_parquet(path, engine="pyarrow", use_nullable_dtypes=True).persist()
+        )
diff --git a/tests/conftest.py b/tests/conftest.py
index 24146805af..0fdf766ed4 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -82,7 +82,9 @@ def get_coiled_runtime_version():
             return "unknown"
 
 
-dask.config.set({"coiled.account": "dask-engineering"})
+dask.config.set(
+    {"coiled.account": "dask-engineering", "dataframe.nullable_backend": "pyarrow"}
+)
 
 COILED_RUNTIME_VERSION = get_coiled_runtime_version()
 COILED_SOFTWARE_NAME = "package_sync"
diff --git a/tests/runtime/test_xgboost.py b/tests/runtime/test_xgboost.py
index 2daadd78bb..3d5e829f84 100644
--- a/tests/runtime/test_xgboost.py
+++ b/tests/runtime/test_xgboost.py
@@ -12,6 +12,7 @@ def test_xgboost_distributed_training(small_client):
     ddf = dd.read_parquet(
         "s3://coiled-datasets/synthetic-data/synth-reg-104GB.parquet",
         storage_options={"anon": True},
+        use_nullable_dtypes=True,
     )
     ddf = ddf.partitions[0:30]
     ddf = ddf.persist()
diff --git a/tests/stability/test_shuffle.py b/tests/stability/test_shuffle.py
index 38b817fa7b..941386d760 100644
--- a/tests/stability/test_shuffle.py
+++ b/tests/stability/test_shuffle.py
@@ -30,7 +30,9 @@ def test_shuffle_parquet(small_client, s3_url, s3_storage_options):
 
     # Test `read_parquet` + `shuffle` + `to_parquet` works
     shuffled_url = s3_url + "/shuffled.parquet"
-    df_shuffled = dd.read_parquet(dataset_url, storage_options=s3_storage_options)
+    df_shuffled = dd.read_parquet(
+        dataset_url, storage_options=s3_storage_options, use_nullable_dtypes=True
+    )
     df_shuffled = df_shuffled.shuffle(on="x")
     # Workaround for performance issue. See https://github.com/coiled/coiled-runtime/issues/79
     # for more details. Replace with the line below once using `dask>=2022.04.2`.

From 2605146812fa4fb12d145e0d119b0bb226464d7f Mon Sep 17 00:00:00 2001
From: James Bourbeau <jrbourbeau@gmail.com>
Date: Wed, 14 Dec 2022 16:36:49 -0600
Subject: [PATCH 2/5] Switch

---
 ci/create_runtime_meta.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ci/create_runtime_meta.py b/ci/create_runtime_meta.py
index b44f427cdf..e46320b12d 100644
--- a/ci/create_runtime_meta.py
+++ b/ci/create_runtime_meta.py
@@ -44,8 +44,8 @@ def main():
         requirements.append(
             {
                 "pip": [
-                    "git+https://github.com/jrbourbeau/dask@pyarrow-use-nullable-dtypes",
                     "git+https://github.com/dask/distributed@main",
+                    "git+https://github.com/jrbourbeau/dask@pyarrow-use-nullable-dtypes",
                 ]
             }
         )

From 89488b9514a702df6dffcb407c690d302ffe7942 Mon Sep 17 00:00:00 2001
From: James Bourbeau <jrbourbeau@gmail.com>
Date: Wed, 14 Dec 2022 20:22:22 -0600
Subject: [PATCH 3/5] Dataset

---
 tests/benchmarks/h2o/test_h2o_benchmarks.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/tests/benchmarks/h2o/test_h2o_benchmarks.py b/tests/benchmarks/h2o/test_h2o_benchmarks.py
index aa8be8b916..c27c2a497f 100644
--- a/tests/benchmarks/h2o/test_h2o_benchmarks.py
+++ b/tests/benchmarks/h2o/test_h2o_benchmarks.py
@@ -14,12 +14,13 @@
 @pytest.fixture(
     scope="module",
     params=[
-        "s3://coiled-datasets/h2o-benchmark/N_1e7_K_1e2_single.csv",
-        # "s3://coiled-datasets/h2o-benchmark/N_1e8_K_1e2_single.csv",
-        # "s3://coiled-datasets/h2o-benchmark/N_1e9_K_1e2_single.csv",
-        "s3://coiled-datasets/h2o-benchmark/N_1e7_K_1e2_parquet/*.parquet",
-        "s3://coiled-datasets/h2o-benchmark/N_1e8_K_1e2_parquet/*.parquet",
-        # "s3://coiled-datasets/h2o-benchmark/N_1e9_K_1e2_parquet/*.parquet",
+        # "s3://coiled-datasets/h2o-benchmark/N_1e7_K_1e2_single.csv",
+        # # "s3://coiled-datasets/h2o-benchmark/N_1e8_K_1e2_single.csv",
+        # # "s3://coiled-datasets/h2o-benchmark/N_1e9_K_1e2_single.csv",
+        # "s3://coiled-datasets/h2o-benchmark/N_1e7_K_1e2_parquet/*.parquet",
+        # "s3://coiled-datasets/h2o-benchmark/N_1e8_K_1e2_parquet/*.parquet",
+        # # "s3://coiled-datasets/h2o-benchmark/N_1e9_K_1e2_parquet/*.parquet",
+        "s3://coiled-datasets/h2o-benchmark/pyarrow_strings/N_1e9_K_1e2/*.parquet",
     ],
     ids=[
         "0.5 GB (csv)",

From 57ddc7354404f0a4d58c3c0ec2c9e473fdd2cbb5 Mon Sep 17 00:00:00 2001
From: James Bourbeau <jrbourbeau@gmail.com>
Date: Wed, 14 Dec 2022 21:36:09 -0600
Subject: [PATCH 4/5] Fixup

---
 tests/benchmarks/h2o/test_h2o_benchmarks.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/tests/benchmarks/h2o/test_h2o_benchmarks.py b/tests/benchmarks/h2o/test_h2o_benchmarks.py
index c27c2a497f..8ce7f99554 100644
--- a/tests/benchmarks/h2o/test_h2o_benchmarks.py
+++ b/tests/benchmarks/h2o/test_h2o_benchmarks.py
@@ -23,12 +23,12 @@
         "s3://coiled-datasets/h2o-benchmark/pyarrow_strings/N_1e9_K_1e2/*.parquet",
     ],
     ids=[
-        "0.5 GB (csv)",
-        # "5 GB (csv)",
-        # "50 GB (csv)",
-        "0.5 GB (parquet)",
-        "5 GB (parquet)",
-        # "50 GB (parquet)",
+        # "0.5 GB (csv)",
+        # # "5 GB (csv)",
+        # # "50 GB (csv)",
+        # "0.5 GB (parquet)",
+        # "5 GB (parquet)",
+        "50 GB (parquet)",
     ],
 )
 def ddf(request):

From 2fef82a248b278e32dc93314024186f93d79005c Mon Sep 17 00:00:00 2001
From: James Bourbeau <jrbourbeau@gmail.com>
Date: Wed, 25 Jan 2023 14:56:19 -0600
Subject: [PATCH 5/5] Update

---
 ci/create_runtime_meta.py        | 2 +-
 tests/benchmarks/test_parquet.py | 6 +-----
 tests/conftest.py                | 6 +++++-
 tests/runtime/test_xgboost.py    | 1 -
 4 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/ci/create_runtime_meta.py b/ci/create_runtime_meta.py
index e46320b12d..8bb59a67fa 100644
--- a/ci/create_runtime_meta.py
+++ b/ci/create_runtime_meta.py
@@ -45,7 +45,7 @@ def main():
             {
                 "pip": [
                     "git+https://github.com/dask/distributed@main",
-                    "git+https://github.com/jrbourbeau/dask@pyarrow-use-nullable-dtypes",
+                    "git+https://github.com/jrbourbeau/dask@nullable-config",
                 ]
             }
         )
diff --git a/tests/benchmarks/test_parquet.py b/tests/benchmarks/test_parquet.py
index 825cd97267..1ad6d6428b 100644
--- a/tests/benchmarks/test_parquet.py
+++ b/tests/benchmarks/test_parquet.py
@@ -50,7 +50,6 @@ def test_read_spark_generated_data(parquet_client):
         "s3://coiled-runtime-ci/thousandgenomes_dagen/NA21**.parquet",
         engine="pyarrow",
         index="sample_id",
-        use_nullable_dtypes=True,
     )
     ddf.groupby(ddf.index).first().compute()
 
@@ -66,7 +65,6 @@ def test_read_hive_partitioned_data(parquet_client):
     ddf = dd.read_parquet(
         "s3://coiled-runtime-ci/ookla-open-data/type=fixed/**.parquet",
         engine="pyarrow",
-        use_nullable_dtypes=True,
     )
 
     ddf.groupby(["year", "quarter"]).first().compute()
@@ -110,6 +108,4 @@ def load(path):
             parquet_client.submit(pandas.read_parquet, path, engine="pyarrow")
         )
     elif kind == "dask":
-        distributed.wait(
-            dd.read_parquet(path, engine="pyarrow", use_nullable_dtypes=True).persist()
-        )
+        distributed.wait(dd.read_parquet(path, engine="pyarrow").persist())
diff --git a/tests/conftest.py b/tests/conftest.py
index 20bdc2214c..a6124993fd 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -84,7 +84,11 @@ def get_coiled_runtime_version():
 
 
 dask.config.set(
-    {"coiled.account": "dask-engineering", "dataframe.nullable_backend": "pyarrow"}
+    {
+        "coiled.account": "dask-engineering",
+        "dataframe.nullable_dtypes": True,
+        "dataframe.dtype_backend": "pyarrow",
+    }
 )
 
 COILED_RUNTIME_VERSION = get_coiled_runtime_version()
diff --git a/tests/runtime/test_xgboost.py b/tests/runtime/test_xgboost.py
index 3d5e829f84..2daadd78bb 100644
--- a/tests/runtime/test_xgboost.py
+++ b/tests/runtime/test_xgboost.py
@@ -12,7 +12,6 @@ def test_xgboost_distributed_training(small_client):
     ddf = dd.read_parquet(
         "s3://coiled-datasets/synthetic-data/synth-reg-104GB.parquet",
         storage_options={"anon": True},
-        use_nullable_dtypes=True,
     )
     ddf = ddf.partitions[0:30]
     ddf = ddf.persist()