From 654ae6ccd409c0dcf1fedd62baea2f82f3594c3c Mon Sep 17 00:00:00 2001 From: James Bourbeau Date: Wed, 14 Dec 2022 16:30:56 -0600 Subject: [PATCH 1/5] Try pyarrow-backed dtypes --- .github/workflows/tests.yml | 51 +++++++++++---------- ci/create_runtime_meta.py | 2 +- tests/benchmarks/h2o/test_h2o_benchmarks.py | 5 +- tests/benchmarks/test_parquet.py | 6 ++- tests/conftest.py | 4 +- tests/runtime/test_xgboost.py | 1 + tests/stability/test_shuffle.py | 4 +- 7 files changed, 43 insertions(+), 30 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index f0f8bfb513..33c42edbef 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -32,43 +32,44 @@ jobs: os: [ubuntu-latest] python-version: ["3.9"] pytest_args: [tests] - runtime-version: [upstream, latest, "0.2.1"] + runtime-version: [upstream] + # runtime-version: [upstream, latest, "0.2.1"] include: # Run stability tests on Python 3.8 - pytest_args: tests/stability python-version: "3.8" runtime-version: upstream os: ubuntu-latest - - pytest_args: tests/stability - python-version: "3.8" - runtime-version: latest - os: ubuntu-latest - - pytest_args: tests/stability - python-version: "3.8" - runtime-version: "0.2.1" - os: ubuntu-latest + # - pytest_args: tests/stability + # python-version: "3.8" + # runtime-version: latest + # os: ubuntu-latest + # - pytest_args: tests/stability + # python-version: "3.8" + # runtime-version: "0.2.1" + # os: ubuntu-latest # Run stability tests on Python 3.10 - pytest_args: tests/stability python-version: "3.10" runtime-version: upstream os: ubuntu-latest - - pytest_args: tests/stability - python-version: "3.10" - runtime-version: latest - os: ubuntu-latest - - pytest_args: tests/stability - python-version: "3.10" - runtime-version: "0.2.1" - os: ubuntu-latest + # - pytest_args: tests/stability + # python-version: "3.10" + # runtime-version: latest + # os: ubuntu-latest + # - pytest_args: tests/stability + # python-version: "3.10" + # runtime-version: "0.2.1" + # os: ubuntu-latest # Run stability tests on Python Windows and MacOS (latest py39 only) - - pytest_args: tests/stability - python-version: "3.9" - runtime-version: latest - os: windows-latest - - pytest_args: tests/stability - python-version: "3.9" - runtime-version: latest - os: macos-latest + # - pytest_args: tests/stability + # python-version: "3.9" + # runtime-version: latest + # os: windows-latest + # - pytest_args: tests/stability + # python-version: "3.9" + # runtime-version: latest + # os: macos-latest steps: - name: Checkout diff --git a/ci/create_runtime_meta.py b/ci/create_runtime_meta.py index 532ae715cb..b44f427cdf 100644 --- a/ci/create_runtime_meta.py +++ b/ci/create_runtime_meta.py @@ -44,7 +44,7 @@ def main(): requirements.append( { "pip": [ - "git+https://github.com/dask/dask@main", + "git+https://github.com/jrbourbeau/dask@pyarrow-use-nullable-dtypes", "git+https://github.com/dask/distributed@main", ] } diff --git a/tests/benchmarks/h2o/test_h2o_benchmarks.py b/tests/benchmarks/h2o/test_h2o_benchmarks.py index a97c88d97d..aa8be8b916 100644 --- a/tests/benchmarks/h2o/test_h2o_benchmarks.py +++ b/tests/benchmarks/h2o/test_h2o_benchmarks.py @@ -50,7 +50,10 @@ def ddf(request): ) else: yield dd.read_parquet( - request.param, engine="pyarrow", storage_options={"anon": True} + request.param, + engine="pyarrow", + storage_options={"anon": True}, + use_nullable_dtypes=True, ) diff --git a/tests/benchmarks/test_parquet.py b/tests/benchmarks/test_parquet.py index 1ad6d6428b..825cd97267 100644 --- a/tests/benchmarks/test_parquet.py +++ b/tests/benchmarks/test_parquet.py @@ -50,6 +50,7 @@ def test_read_spark_generated_data(parquet_client): "s3://coiled-runtime-ci/thousandgenomes_dagen/NA21**.parquet", engine="pyarrow", index="sample_id", + use_nullable_dtypes=True, ) ddf.groupby(ddf.index).first().compute() @@ -65,6 +66,7 @@ def test_read_hive_partitioned_data(parquet_client): ddf = dd.read_parquet( "s3://coiled-runtime-ci/ookla-open-data/type=fixed/**.parquet", engine="pyarrow", + use_nullable_dtypes=True, ) ddf.groupby(["year", "quarter"]).first().compute() @@ -108,4 +110,6 @@ def load(path): parquet_client.submit(pandas.read_parquet, path, engine="pyarrow") ) elif kind == "dask": - distributed.wait(dd.read_parquet(path, engine="pyarrow").persist()) + distributed.wait( + dd.read_parquet(path, engine="pyarrow", use_nullable_dtypes=True).persist() + ) diff --git a/tests/conftest.py b/tests/conftest.py index 24146805af..0fdf766ed4 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -82,7 +82,9 @@ def get_coiled_runtime_version(): return "unknown" -dask.config.set({"coiled.account": "dask-engineering"}) +dask.config.set( + {"coiled.account": "dask-engineering", "dataframe.nullable_backend": "pyarrow"} +) COILED_RUNTIME_VERSION = get_coiled_runtime_version() COILED_SOFTWARE_NAME = "package_sync" diff --git a/tests/runtime/test_xgboost.py b/tests/runtime/test_xgboost.py index 2daadd78bb..3d5e829f84 100644 --- a/tests/runtime/test_xgboost.py +++ b/tests/runtime/test_xgboost.py @@ -12,6 +12,7 @@ def test_xgboost_distributed_training(small_client): ddf = dd.read_parquet( "s3://coiled-datasets/synthetic-data/synth-reg-104GB.parquet", storage_options={"anon": True}, + use_nullable_dtypes=True, ) ddf = ddf.partitions[0:30] ddf = ddf.persist() diff --git a/tests/stability/test_shuffle.py b/tests/stability/test_shuffle.py index 38b817fa7b..941386d760 100644 --- a/tests/stability/test_shuffle.py +++ b/tests/stability/test_shuffle.py @@ -30,7 +30,9 @@ def test_shuffle_parquet(small_client, s3_url, s3_storage_options): # Test `read_parquet` + `shuffle` + `to_parquet` works shuffled_url = s3_url + "/shuffled.parquet" - df_shuffled = dd.read_parquet(dataset_url, storage_options=s3_storage_options) + df_shuffled = dd.read_parquet( + dataset_url, storage_options=s3_storage_options, use_nullable_dtypes=True + ) df_shuffled = df_shuffled.shuffle(on="x") # Workaround for performance issue. See https://github.com/coiled/coiled-runtime/issues/79 # for more details. Replace with the line below once using `dask>=2022.04.2`. From 2605146812fa4fb12d145e0d119b0bb226464d7f Mon Sep 17 00:00:00 2001 From: James Bourbeau Date: Wed, 14 Dec 2022 16:36:49 -0600 Subject: [PATCH 2/5] Switch --- ci/create_runtime_meta.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/create_runtime_meta.py b/ci/create_runtime_meta.py index b44f427cdf..e46320b12d 100644 --- a/ci/create_runtime_meta.py +++ b/ci/create_runtime_meta.py @@ -44,8 +44,8 @@ def main(): requirements.append( { "pip": [ - "git+https://github.com/jrbourbeau/dask@pyarrow-use-nullable-dtypes", "git+https://github.com/dask/distributed@main", + "git+https://github.com/jrbourbeau/dask@pyarrow-use-nullable-dtypes", ] } ) From 89488b9514a702df6dffcb407c690d302ffe7942 Mon Sep 17 00:00:00 2001 From: James Bourbeau Date: Wed, 14 Dec 2022 20:22:22 -0600 Subject: [PATCH 3/5] Dataset --- tests/benchmarks/h2o/test_h2o_benchmarks.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/tests/benchmarks/h2o/test_h2o_benchmarks.py b/tests/benchmarks/h2o/test_h2o_benchmarks.py index aa8be8b916..c27c2a497f 100644 --- a/tests/benchmarks/h2o/test_h2o_benchmarks.py +++ b/tests/benchmarks/h2o/test_h2o_benchmarks.py @@ -14,12 +14,13 @@ @pytest.fixture( scope="module", params=[ - "s3://coiled-datasets/h2o-benchmark/N_1e7_K_1e2_single.csv", - # "s3://coiled-datasets/h2o-benchmark/N_1e8_K_1e2_single.csv", - # "s3://coiled-datasets/h2o-benchmark/N_1e9_K_1e2_single.csv", - "s3://coiled-datasets/h2o-benchmark/N_1e7_K_1e2_parquet/*.parquet", - "s3://coiled-datasets/h2o-benchmark/N_1e8_K_1e2_parquet/*.parquet", - # "s3://coiled-datasets/h2o-benchmark/N_1e9_K_1e2_parquet/*.parquet", + # "s3://coiled-datasets/h2o-benchmark/N_1e7_K_1e2_single.csv", + # # "s3://coiled-datasets/h2o-benchmark/N_1e8_K_1e2_single.csv", + # # "s3://coiled-datasets/h2o-benchmark/N_1e9_K_1e2_single.csv", + # "s3://coiled-datasets/h2o-benchmark/N_1e7_K_1e2_parquet/*.parquet", + # "s3://coiled-datasets/h2o-benchmark/N_1e8_K_1e2_parquet/*.parquet", + # # "s3://coiled-datasets/h2o-benchmark/N_1e9_K_1e2_parquet/*.parquet", + "s3://coiled-datasets/h2o-benchmark/pyarrow_strings/N_1e9_K_1e2/*.parquet", ], ids=[ "0.5 GB (csv)", From 57ddc7354404f0a4d58c3c0ec2c9e473fdd2cbb5 Mon Sep 17 00:00:00 2001 From: James Bourbeau Date: Wed, 14 Dec 2022 21:36:09 -0600 Subject: [PATCH 4/5] Fixup --- tests/benchmarks/h2o/test_h2o_benchmarks.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/benchmarks/h2o/test_h2o_benchmarks.py b/tests/benchmarks/h2o/test_h2o_benchmarks.py index c27c2a497f..8ce7f99554 100644 --- a/tests/benchmarks/h2o/test_h2o_benchmarks.py +++ b/tests/benchmarks/h2o/test_h2o_benchmarks.py @@ -23,12 +23,12 @@ "s3://coiled-datasets/h2o-benchmark/pyarrow_strings/N_1e9_K_1e2/*.parquet", ], ids=[ - "0.5 GB (csv)", - # "5 GB (csv)", - # "50 GB (csv)", - "0.5 GB (parquet)", - "5 GB (parquet)", - # "50 GB (parquet)", + # "0.5 GB (csv)", + # # "5 GB (csv)", + # # "50 GB (csv)", + # "0.5 GB (parquet)", + # "5 GB (parquet)", + "50 GB (parquet)", ], ) def ddf(request): From 2fef82a248b278e32dc93314024186f93d79005c Mon Sep 17 00:00:00 2001 From: James Bourbeau Date: Wed, 25 Jan 2023 14:56:19 -0600 Subject: [PATCH 5/5] Update --- ci/create_runtime_meta.py | 2 +- tests/benchmarks/test_parquet.py | 6 +----- tests/conftest.py | 6 +++++- tests/runtime/test_xgboost.py | 1 - 4 files changed, 7 insertions(+), 8 deletions(-) diff --git a/ci/create_runtime_meta.py b/ci/create_runtime_meta.py index e46320b12d..8bb59a67fa 100644 --- a/ci/create_runtime_meta.py +++ b/ci/create_runtime_meta.py @@ -45,7 +45,7 @@ def main(): { "pip": [ "git+https://github.com/dask/distributed@main", - "git+https://github.com/jrbourbeau/dask@pyarrow-use-nullable-dtypes", + "git+https://github.com/jrbourbeau/dask@nullable-config", ] } ) diff --git a/tests/benchmarks/test_parquet.py b/tests/benchmarks/test_parquet.py index 825cd97267..1ad6d6428b 100644 --- a/tests/benchmarks/test_parquet.py +++ b/tests/benchmarks/test_parquet.py @@ -50,7 +50,6 @@ def test_read_spark_generated_data(parquet_client): "s3://coiled-runtime-ci/thousandgenomes_dagen/NA21**.parquet", engine="pyarrow", index="sample_id", - use_nullable_dtypes=True, ) ddf.groupby(ddf.index).first().compute() @@ -66,7 +65,6 @@ def test_read_hive_partitioned_data(parquet_client): ddf = dd.read_parquet( "s3://coiled-runtime-ci/ookla-open-data/type=fixed/**.parquet", engine="pyarrow", - use_nullable_dtypes=True, ) ddf.groupby(["year", "quarter"]).first().compute() @@ -110,6 +108,4 @@ def load(path): parquet_client.submit(pandas.read_parquet, path, engine="pyarrow") ) elif kind == "dask": - distributed.wait( - dd.read_parquet(path, engine="pyarrow", use_nullable_dtypes=True).persist() - ) + distributed.wait(dd.read_parquet(path, engine="pyarrow").persist()) diff --git a/tests/conftest.py b/tests/conftest.py index 20bdc2214c..a6124993fd 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -84,7 +84,11 @@ def get_coiled_runtime_version(): dask.config.set( - {"coiled.account": "dask-engineering", "dataframe.nullable_backend": "pyarrow"} + { + "coiled.account": "dask-engineering", + "dataframe.nullable_dtypes": True, + "dataframe.dtype_backend": "pyarrow", + } ) COILED_RUNTIME_VERSION = get_coiled_runtime_version() diff --git a/tests/runtime/test_xgboost.py b/tests/runtime/test_xgboost.py index 3d5e829f84..2daadd78bb 100644 --- a/tests/runtime/test_xgboost.py +++ b/tests/runtime/test_xgboost.py @@ -12,7 +12,6 @@ def test_xgboost_distributed_training(small_client): ddf = dd.read_parquet( "s3://coiled-datasets/synthetic-data/synth-reg-104GB.parquet", storage_options={"anon": True}, - use_nullable_dtypes=True, ) ddf = ddf.partitions[0:30] ddf = ddf.persist()