From 6b39e43ae538606e59e84797ba022da515b55218 Mon Sep 17 00:00:00 2001 From: Jim Pivarski Date: Mon, 9 Sep 2024 18:30:20 -0500 Subject: [PATCH 01/10] fix: empty arrays in ak.to_parquet with extensionarray=True (#3234) --- src/awkward/_connect/pyarrow/table_conv.py | 2 +- tests/test_2772_parquet_extn_array_metadata.py | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/src/awkward/_connect/pyarrow/table_conv.py b/src/awkward/_connect/pyarrow/table_conv.py index 53f286c730..4434c94009 100644 --- a/src/awkward/_connect/pyarrow/table_conv.py +++ b/src/awkward/_connect/pyarrow/table_conv.py @@ -221,7 +221,7 @@ def replace_schema(table: pyarrow.Table, new_schema: pyarrow.Schema) -> pyarrow. new_batches.append( pyarrow.RecordBatch.from_arrays(arrays=columns, schema=new_schema) ) - return pyarrow.Table.from_batches(new_batches) + return pyarrow.Table.from_batches(new_batches, schema=new_schema) def array_with_replacement_type( diff --git a/tests/test_2772_parquet_extn_array_metadata.py b/tests/test_2772_parquet_extn_array_metadata.py index fd4c9fede6..aafdb84338 100644 --- a/tests/test_2772_parquet_extn_array_metadata.py +++ b/tests/test_2772_parquet_extn_array_metadata.py @@ -206,3 +206,12 @@ def test_selective_parquet(tmp_path): ak.to_parquet(ak_tbl, filename) tbl_tr = ak.from_parquet(filename, columns=["struct_array", "indexed"]) assert to_list(tbl_tr["struct_array"]) == to_list(ak_tbl["struct_array"]) + + +@pytest.mark.parametrize("doit", [False, True]) +def test_empty(tmp_path, doit): + filename = os.path.join(tmp_path, "whatever.parquet") + + ak.to_parquet(ak.Array([{"x": 1, "y": 1.1}])[0:0], filename, extensionarray=doit) + + assert str(ak.from_parquet(filename).type) == "0 * {x: int64, y: float64}" From a0b3858a3b6ebd9e6ef7851a64cdb8116d8ec85b Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 12 Sep 2024 15:57:49 +0000 Subject: [PATCH 02/10] chore: update pre-commit hooks (#3204) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * chore: update pre-commit hooks updates: - [github.com/astral-sh/ruff-pre-commit: v0.5.5 → v0.6.4](https://github.com/astral-sh/ruff-pre-commit/compare/v0.5.5...v0.6.4) - [github.com/python-jsonschema/check-jsonschema: 0.29.1 → 0.29.2](https://github.com/python-jsonschema/check-jsonschema/compare/0.29.1...0.29.2) - [github.com/pre-commit/mirrors-mypy: v1.11.0 → v1.11.2](https://github.com/pre-commit/mirrors-mypy/compare/v1.11.0...v1.11.2) - [github.com/abravalheri/validate-pyproject: v0.18 → v0.19](https://github.com/abravalheri/validate-pyproject/compare/v0.18...v0.19) * chore: drop some unneeded config --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Jim Pivarski Co-authored-by: Henry Schreiner --- .pre-commit-config.yaml | 8 ++++---- pyproject.toml | 7 +------ 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index acd55d8d29..d8dac77444 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -27,7 +27,7 @@ repos: additional_dependencies: [pyyaml] - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.5.5 + rev: v0.6.4 hooks: - id: ruff args: ["--fix", "--show-fixes"] @@ -62,13 +62,13 @@ repos: files: ^tests/ - repo: https://github.com/python-jsonschema/check-jsonschema - rev: 0.29.1 + rev: 0.29.2 hooks: - id: check-github-workflows args: ["--verbose"] - repo: https://github.com/pre-commit/mirrors-mypy - rev: v1.11.0 + rev: v1.11.2 hooks: - id: mypy files: src @@ -76,6 +76,6 @@ repos: - numpy>=1.24 - repo: https://github.com/abravalheri/validate-pyproject - rev: v0.18 + rev: v0.19 hooks: - id: validate-pyproject diff --git a/pyproject.toml b/pyproject.toml index 678c07a279..20e701f5de 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -136,7 +136,7 @@ filterwarnings = [ ] log_cli_level = "info" -[tool.pylint.master] +[tool.pylint] py-version = "3.8" jobs = "0" ignore-paths = [ @@ -251,7 +251,6 @@ ignore_errors = true ignore_missing_imports = true [tool.ruff] -src = ["src"] extend-exclude = [ "studies", "pybind11", @@ -289,10 +288,6 @@ ignore = [ "PLC1901", # x == "" can be simplified to not x (empty string is falsey) "ISC001", # Conflicts with the formatter in 0.1.2 ] -unfixable = [ - "T20", # Removes print statements - "F841", # Removes unused variables -] typing-modules = ["awkward._typing"] external = [] mccabe.max-complexity = 100 From 7a825bfc2e8a05010d2cfd79a52b4ac25347de4e Mon Sep 17 00:00:00 2001 From: Ianna Osborne Date: Thu, 12 Sep 2024 18:53:49 +0200 Subject: [PATCH 03/10] fix: GPU complex reducer prod for empty lists (#3235) * fix: make sure that both CPU and GPU produce identical results * fix: one more --- .../cuda/cuda_kernels/awkward_reduce_prod_complex.cu | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_prod_complex.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_prod_complex.cu index 9a0c66846f..2e84a0cf9c 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_prod_complex.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_prod_complex.cu @@ -30,8 +30,8 @@ awkward_reduce_prod_complex_a( int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < outlength) { - toptr[thread_id * 2] = (T)1.0f; - toptr[thread_id * 2 + 1] = (T)0.0f; + toptr[thread_id * 2] = (T)1; + toptr[thread_id * 2 + 1] = (T)0; } } } @@ -59,8 +59,8 @@ awkward_reduce_prod_complex_b( if (thread_id < lenparents) { for (int64_t stride = 1; stride < blockDim.x; stride *= 2) { - T real = (T)1.0f; - T imag = (T)0.0f; + T real = (T)1; + T imag = (T)0; if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) { real = temp[(idx - stride) * 2]; imag = temp[(idx - stride) * 2 + 1]; From af0cceab9988314d01ffd9719027c6f988e1d43d Mon Sep 17 00:00:00 2001 From: Henry Schreiner Date: Thu, 12 Sep 2024 14:32:57 -0400 Subject: [PATCH 04/10] ci: add 3.13 wheels (#3217) * ci: add 3.13 wheels Signed-off-by: Henry Schreiner * style: pre-commit fixes * Update requirements-test-full.txt * Update cibuildwheel.toml * ci: build pyo3 in forward compat mode * Update test.yml --------- Signed-off-by: Henry Schreiner Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Jim Pivarski --- .github/workflows/build-wheels.yml | 6 +++--- .github/workflows/packaging-test.yml | 4 ++-- .github/workflows/test.yml | 6 ++++++ awkward-cpp/pyproject.toml | 4 ++-- cibuildwheel.toml | 4 ++-- pyproject.toml | 1 + requirements-test-full.txt | 10 +++++----- 7 files changed, 21 insertions(+), 14 deletions(-) diff --git a/.github/workflows/build-wheels.yml b/.github/workflows/build-wheels.yml index d6f992588e..361d292285 100644 --- a/.github/workflows/build-wheels.yml +++ b/.github/workflows/build-wheels.yml @@ -105,7 +105,7 @@ jobs: - name: Prepare build files run: pipx run nox -s prepare - - uses: pypa/cibuildwheel@v2.19 + - uses: pypa/cibuildwheel@v2.20 env: CIBW_BUILD: "${{ matrix.build }}*" CIBW_ARCHS: ${{ matrix.arch }} @@ -136,7 +136,7 @@ jobs: SOURCE_DATE_EPOCH: ${{ needs.determine-source-date-epoch.outputs.source-date-epoch }} strategy: matrix: - python: [38, 39, 310, 311, 312] + python: [38, 39, 310, 311, 312, 313] arch: [aarch64] steps: @@ -157,7 +157,7 @@ jobs: - uses: docker/setup-qemu-action@v3.2.0 - - uses: pypa/cibuildwheel@v2.19 + - uses: pypa/cibuildwheel@v2.20 env: CIBW_BUILD: cp${{ matrix.python }}-* CIBW_ARCHS: ${{ matrix.arch }} diff --git a/.github/workflows/packaging-test.yml b/.github/workflows/packaging-test.yml index 8317cc58a2..20ae0fe714 100644 --- a/.github/workflows/packaging-test.yml +++ b/.github/workflows/packaging-test.yml @@ -68,7 +68,7 @@ jobs: - name: Prepare build files run: pipx run nox -s prepare - - uses: pypa/cibuildwheel@v2.19 + - uses: pypa/cibuildwheel@v2.20 env: CIBW_ARCHS_MACOS: universal2 CIBW_BUILD: cp39-win_amd64 cp310-manylinux_x86_64 cp38-macosx_universal2 @@ -76,7 +76,7 @@ jobs: config-file: cibuildwheel.toml package-dir: awkward-cpp - - uses: pypa/cibuildwheel@v2.19 + - uses: pypa/cibuildwheel@v2.20 if: matrix.os == 'ubuntu-latest' env: CIBW_BUILD: cp312-manylinux_x86_64 diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 5ef43543af..2c58f66746 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -39,6 +39,7 @@ jobs: - ubuntu-latest - macos-13 python-version: + - '3.13' - '3.12' - '3.11' - '3.10' @@ -110,6 +111,11 @@ jobs: files: | awkward-cpp/dist/*.whl + - name: Add workaround for 3.13 + cramjam + if: matrix.python-version == '3.13' + run: echo 'PYO3_USE_ABI3_FORWARD_COMPATIBILITY=1' >> $GITHUB_ENV + shell: bash + - name: Install awkward, awkward-cpp, and dependencies run: >- python -m pip install -v . ${{ steps.find-wheel.outputs.paths }} pytest-github-actions-annotate-failures diff --git a/awkward-cpp/pyproject.toml b/awkward-cpp/pyproject.toml index 9d64da31ab..9c5637b5d3 100644 --- a/awkward-cpp/pyproject.toml +++ b/awkward-cpp/pyproject.toml @@ -1,6 +1,6 @@ [build-system] requires = [ - "scikit-build-core>=0.9", + "scikit-build-core>=0.10", "pybind11", ] build-backend = "scikit_build_core.build" @@ -59,7 +59,7 @@ Releases = "https://github.com/scikit-hep/awkward-1.0/releases" [tool.scikit-build] -minimum-version = "0.9" +minimum-version = "build-system.requires" build-dir = "build/{cache_tag}" sdist.reproducible = true sdist.include = [ diff --git a/cibuildwheel.toml b/cibuildwheel.toml index 96f94455fe..91827c69a0 100644 --- a/cibuildwheel.toml +++ b/cibuildwheel.toml @@ -23,5 +23,5 @@ build-verbosity = 1 PIP_ONLY_BINARY = "cmake,numpy" [[tool.cibuildwheel.overrides]] -select = "cp312-*" -environment.PIP_PRE = "1" +select = "cp313*" +environment.PYO3_USE_ABI3_FORWARD_COMPATIBILITY = "1" diff --git a/pyproject.toml b/pyproject.toml index 20e701f5de..b985ae05cd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,6 +32,7 @@ classifiers = [ "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", "Topic :: Scientific/Engineering", "Topic :: Scientific/Engineering :: Information Analysis", "Topic :: Scientific/Engineering :: Mathematics", diff --git a/requirements-test-full.txt b/requirements-test-full.txt index 51e47e53f2..ded83569b7 100644 --- a/requirements-test-full.txt +++ b/requirements-test-full.txt @@ -1,9 +1,9 @@ fsspec>=2022.11.0;sys_platform != "win32" -jax[cpu]>=0.2.15;sys_platform != "win32" and python_version < "3.12" -numba>=0.50.0;sys_platform != "win32" and python_version < "3.12" -numexpr>=2.7; python_version < "3.12" -pandas>=0.24.0;sys_platform != "win32" and python_version < "3.12" -pyarrow==16.0.0;sys_platform != "win32" and python_version < "3.12" +jax[cpu]>=0.2.15;sys_platform != "win32" and python_version < "3.13" +numba>=0.50.0;sys_platform != "win32" and python_version < "3.13" +numexpr>=2.7; python_version < "3.13" +pandas>=0.24.0;sys_platform != "win32" and python_version < "3.13" +pyarrow==16.0.0;sys_platform != "win32" and python_version < "3.13" pytest>=6 pytest-cov pytest-xdist From 181fa9844d85b73fa24931a44368173c86bf9e0a Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 12 Sep 2024 14:43:07 -0500 Subject: [PATCH 05/10] chore(deps): bump the actions group across 1 directory with 3 updates (#3233) Bumps the actions group with 3 updates in the / directory: [pypa/cibuildwheel](https://github.com/pypa/cibuildwheel), [actions/attest-build-provenance](https://github.com/actions/attest-build-provenance) and [pypa/gh-action-pypi-publish](https://github.com/pypa/gh-action-pypi-publish). Updates `pypa/cibuildwheel` from 2.19 to 2.20 - [Release notes](https://github.com/pypa/cibuildwheel/releases) - [Changelog](https://github.com/pypa/cibuildwheel/blob/main/docs/changelog.md) - [Commits](https://github.com/pypa/cibuildwheel/compare/v2.19...v2.20) Updates `actions/attest-build-provenance` from 1.3.3 to 1.4.3 - [Release notes](https://github.com/actions/attest-build-provenance/releases) - [Changelog](https://github.com/actions/attest-build-provenance/blob/main/RELEASE.md) - [Commits](https://github.com/actions/attest-build-provenance/compare/5e9cb68e95676991667494a6a4e59b8a2f13e1d0...1c608d11d69870c2092266b3f9a6f3abbf17002c) Updates `pypa/gh-action-pypi-publish` from 1.9.0 to 1.10.1 - [Release notes](https://github.com/pypa/gh-action-pypi-publish/releases) - [Commits](https://github.com/pypa/gh-action-pypi-publish/compare/v1.9.0...v1.10.1) --- updated-dependencies: - dependency-name: pypa/cibuildwheel dependency-type: direct:production update-type: version-update:semver-minor dependency-group: actions - dependency-name: actions/attest-build-provenance dependency-type: direct:production update-type: version-update:semver-minor dependency-group: actions - dependency-name: pypa/gh-action-pypi-publish dependency-type: direct:production update-type: version-update:semver-minor dependency-group: actions ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Jim Pivarski --- .github/workflows/deploy-cpp.yml | 4 ++-- .github/workflows/deploy.yml | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/deploy-cpp.yml b/.github/workflows/deploy-cpp.yml index cfa540fbc2..67fa3624bf 100644 --- a/.github/workflows/deploy-cpp.yml +++ b/.github/workflows/deploy-cpp.yml @@ -35,8 +35,8 @@ jobs: run: ls -l dist/ - name: Generate artifact attestation for sdist and wheel - uses: actions/attest-build-provenance@5e9cb68e95676991667494a6a4e59b8a2f13e1d0 # v1.3.3 + uses: actions/attest-build-provenance@1c608d11d69870c2092266b3f9a6f3abbf17002c # v1.4.3 with: subject-path: "dist/awkward*cpp-*" - - uses: pypa/gh-action-pypi-publish@v1.9.0 + - uses: pypa/gh-action-pypi-publish@v1.10.1 diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index 4e6c2aa746..e62bca6c2b 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -88,7 +88,7 @@ jobs: run: pipx run twine check dist/* - name: Generate artifact attestation for sdist and wheel - uses: actions/attest-build-provenance@5e9cb68e95676991667494a6a4e59b8a2f13e1d0 # v1.3.3 + uses: actions/attest-build-provenance@1c608d11d69870c2092266b3f9a6f3abbf17002c # v1.4.3 with: subject-path: "dist/awkward-*" @@ -135,7 +135,7 @@ jobs: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} run: gh attestation verify dist/awkward-*.whl --repo ${{ github.repository }} - - uses: pypa/gh-action-pypi-publish@v1.9.0 + - uses: pypa/gh-action-pypi-publish@v1.10.1 publish-headers: name: "Publish header-only libraries alongside release" From af113fd6da98290c2d95dec982cbfed30d66c930 Mon Sep 17 00:00:00 2001 From: Jim Pivarski Date: Thu, 12 Sep 2024 14:52:33 -0500 Subject: [PATCH 06/10] The next awkward-cpp release is 38. --- awkward-cpp/pyproject.toml | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/awkward-cpp/pyproject.toml b/awkward-cpp/pyproject.toml index 9c5637b5d3..43411b9299 100644 --- a/awkward-cpp/pyproject.toml +++ b/awkward-cpp/pyproject.toml @@ -7,7 +7,7 @@ build-backend = "scikit_build_core.build" [project] name = "awkward_cpp" -version = "37" +version = "38" dependencies = [ "numpy>=1.18.0", "importlib_resources;python_version < \"3.9\"" diff --git a/pyproject.toml b/pyproject.toml index b985ae05cd..c90871835e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -41,7 +41,7 @@ classifiers = [ "Topic :: Utilities", ] dependencies = [ - "awkward_cpp==37", + "awkward_cpp==38", "importlib_metadata>=4.13.0;python_version < \"3.12\"", "numpy>=1.18.0", "packaging", From dea122a71a2be7cd1b534814c0284b64993a4d10 Mon Sep 17 00:00:00 2001 From: Jim Pivarski Date: Thu, 12 Sep 2024 15:59:19 -0500 Subject: [PATCH 07/10] The next release will be 2.6.8. --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index c90871835e..b2876972bd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,7 +7,7 @@ build-backend = "hatchling.build" [project] name = "awkward" -version = "2.6.7" +version = "2.6.8" description = "Manipulate JSON-like data with NumPy-like idioms." license = { text = "BSD-3-Clause" } requires-python = ">=3.8" From 8619299f32d7abcc2cd13aeb61421f39a3999e34 Mon Sep 17 00:00:00 2001 From: "allcontributors[bot]" <46447321+allcontributors[bot]@users.noreply.github.com> Date: Thu, 12 Sep 2024 16:03:27 -0500 Subject: [PATCH 08/10] docs: add ariostas as a contributor for code (#3240) * docs: update README.md [skip ci] * docs: update .all-contributorsrc [skip ci] --------- Co-authored-by: allcontributors[bot] <46447321+allcontributors[bot]@users.noreply.github.com> --- .all-contributorsrc | 9 +++++++++ README.md | 1 + 2 files changed, 10 insertions(+) diff --git a/.all-contributorsrc b/.all-contributorsrc index 68800f50f0..b629839afd 100644 --- a/.all-contributorsrc +++ b/.all-contributorsrc @@ -465,6 +465,15 @@ "contributions": [ "code" ] + }, + { + "login": "ariostas", + "name": "Andres Rios Tascon", + "avatar_url": "https://avatars.githubusercontent.com/u/7596837?v=4", + "profile": "http://www.ariostas.com", + "contributions": [ + "code" + ] } ], "contributorsPerLine": 7, diff --git a/README.md b/README.md index dd5d38399f..53faad33df 100644 --- a/README.md +++ b/README.md @@ -228,6 +228,7 @@ Thanks especially to the gracious help of Awkward Array contributors (including Peter Fackeldey
Peter Fackeldey

💻 + Andres Rios Tascon
Andres Rios Tascon

💻 From 14201213315d2c15465e999471ff4c3b4e5af59a Mon Sep 17 00:00:00 2001 From: "allcontributors[bot]" <46447321+allcontributors[bot]@users.noreply.github.com> Date: Thu, 12 Sep 2024 16:04:07 -0500 Subject: [PATCH 09/10] docs: add maxymnaumchyk as a contributor for code (#3241) * docs: update README.md [skip ci] * docs: update .all-contributorsrc [skip ci] --------- Co-authored-by: allcontributors[bot] <46447321+allcontributors[bot]@users.noreply.github.com> --- .all-contributorsrc | 9 +++++++++ README.md | 1 + 2 files changed, 10 insertions(+) diff --git a/.all-contributorsrc b/.all-contributorsrc index b629839afd..ccaf3440b8 100644 --- a/.all-contributorsrc +++ b/.all-contributorsrc @@ -474,6 +474,15 @@ "contributions": [ "code" ] + }, + { + "login": "maxymnaumchyk", + "name": "maxymnaumchyk", + "avatar_url": "https://avatars.githubusercontent.com/u/70752300?v=4", + "profile": "https://github.com/maxymnaumchyk", + "contributions": [ + "code" + ] } ], "contributorsPerLine": 7, diff --git a/README.md b/README.md index 53faad33df..b71f4e7e9b 100644 --- a/README.md +++ b/README.md @@ -229,6 +229,7 @@ Thanks especially to the gracious help of Awkward Array contributors (including Peter Fackeldey
Peter Fackeldey

💻 Andres Rios Tascon
Andres Rios Tascon

💻 + maxymnaumchyk
maxymnaumchyk

💻 From 1c368f16f27e5f18e9c1d55ff0993e40a67dbd88 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Fri, 13 Sep 2024 14:00:01 -0400 Subject: [PATCH 10/10] feat: Add to_cudf (#3051) * Start * style: pre-commit fixes * results of chatting * style: pre-commit fixes * Add toplevel func * fix * fix in indexedoptionarray * Add tests and fixes * style: pre-commit fixes * use direct np module (for now) * style: pre-commit fixes * Don't accidentally iterate cupy with CPU * style: pre-commit fixes * Update src/awkward/_errors.py * add docstring * Add string * style: pre-commit fixes * Simpler for numerics This works also for time types, without going via cupy --------- Co-authored-by: Martin Durant Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- awkward-cpp/rapidjson | 2 +- src/awkward/contents/bitmaskedarray.py | 19 ++++++++ src/awkward/contents/bytemaskedarray.py | 13 +++++ src/awkward/contents/content.py | 4 ++ src/awkward/contents/emptyarray.py | 9 ++++ src/awkward/contents/indexedarray.py | 10 ++++ src/awkward/contents/indexedoptionarray.py | 3 ++ src/awkward/contents/listarray.py | 3 ++ src/awkward/contents/listoffsetarray.py | 34 +++++++++++++ src/awkward/contents/numpyarray.py | 15 ++++++ src/awkward/contents/recordarray.py | 17 +++++++ src/awkward/contents/unmaskedarray.py | 3 ++ src/awkward/operations/__init__.py | 1 + src/awkward/operations/ak_to_cudf.py | 21 ++++++++ tests-cuda/test_3051_to_cuda.py | 57 ++++++++++++++++++++++ 15 files changed, 210 insertions(+), 1 deletion(-) create mode 100644 src/awkward/operations/ak_to_cudf.py create mode 100644 tests-cuda/test_3051_to_cuda.py diff --git a/awkward-cpp/rapidjson b/awkward-cpp/rapidjson index 3b2441b87f..f54b0e47a0 160000 --- a/awkward-cpp/rapidjson +++ b/awkward-cpp/rapidjson @@ -1 +1 @@ -Subproject commit 3b2441b87f99ab65f37b141a7b548ebadb607b96 +Subproject commit f54b0e47a08782a6131cc3d60f94d038fa6e0a51 diff --git a/src/awkward/contents/bitmaskedarray.py b/src/awkward/contents/bitmaskedarray.py index 0e12133d6e..9c70bfc4b5 100644 --- a/src/awkward/contents/bitmaskedarray.py +++ b/src/awkward/contents/bitmaskedarray.py @@ -11,6 +11,7 @@ from awkward._backends.backend import Backend from awkward._meta.bitmaskedmeta import BitMaskedMeta from awkward._nplikes.array_like import ArrayLike +from awkward._nplikes.cupy import Cupy from awkward._nplikes.numpy import Numpy from awkward._nplikes.numpy_like import IndexType, NumpyMetadata from awkward._nplikes.placeholder import PlaceholderArray @@ -687,6 +688,24 @@ def _to_arrow( pyarrow, mask_node, validbytes, length, options ) + def _to_cudf(self, cudf: Any, mask: Content | None, length: int): + cp = Cupy.instance()._module + + assert mask is None # this class has its own mask + if not self.lsb_order: + m = cp.flip( + cp.packbits(cp.flip(cp.unpackbits(cp.asarray(self._mask.data)))) + ) + else: + m = self._mask.data + + if m.nbytes % 64: + m = cp.resize(m, ((m.nbytes // 64) + 1) * 64) + m = cudf.core.buffer.as_buffer(m) + inner = self._content._to_cudf(cudf, mask=None, length=length) + inner.set_base_mask(m) + return inner + def _to_backend_array(self, allow_missing, backend): return self.to_ByteMaskedArray()._to_backend_array(allow_missing, backend) diff --git a/src/awkward/contents/bytemaskedarray.py b/src/awkward/contents/bytemaskedarray.py index 65ad948a16..87beb5f59f 100644 --- a/src/awkward/contents/bytemaskedarray.py +++ b/src/awkward/contents/bytemaskedarray.py @@ -12,6 +12,7 @@ from awkward._layout import maybe_posaxis from awkward._meta.bytemaskedmeta import ByteMaskedMeta from awkward._nplikes.array_like import ArrayLike +from awkward._nplikes.cupy import Cupy from awkward._nplikes.numpy import Numpy from awkward._nplikes.numpy_like import IndexType, NumpyMetadata from awkward._nplikes.placeholder import PlaceholderArray @@ -1051,6 +1052,18 @@ def _to_arrow( options, ) + def _to_cudf(self, cudf: Any, mask: Content | None, length: int): + cp = Cupy.instance()._module + + assert mask is None # this class has its own mask + m = cp.packbits(cp.asarray(self._mask), bitorder="little") + if m.nbytes % 64: + m = cp.resize(m, ((m.nbytes // 64) + 1) * 64) + m = cudf.core.buffer.as_buffer(m) + inner = self._content._to_cudf(cudf, mask=None, length=length) + inner.set_base_mask(m) + return inner + def _to_backend_array(self, allow_missing, backend): return self.to_IndexedOptionArray64()._to_backend_array(allow_missing, backend) diff --git a/src/awkward/contents/content.py b/src/awkward/contents/content.py index 1a0fe080a9..d0169ee2eb 100644 --- a/src/awkward/contents/content.py +++ b/src/awkward/contents/content.py @@ -1010,6 +1010,10 @@ def _to_arrow( ): raise NotImplementedError + def _to_cudf(self, cudf: Any, mask: Content | None, length: int): + # prototype abstract signature + raise NotImplementedError + def to_backend_array( self, allow_missing: bool = True, *, backend: Backend | str | None = None ): diff --git a/src/awkward/contents/emptyarray.py b/src/awkward/contents/emptyarray.py index 112effddf0..06447f2d8b 100644 --- a/src/awkward/contents/emptyarray.py +++ b/src/awkward/contents/emptyarray.py @@ -387,6 +387,15 @@ def _to_arrow( ) return next._to_arrow(pyarrow, mask_node, validbytes, length, options) + def _to_cudf(self, cudf: Any, mask: Content | None, length: int): + dtype = np.dtype("float64") + next = ak.contents.NumpyArray( + numpy.empty(length, dtype=dtype), + parameters=self._parameters, + backend=self._backend, + ) + return next._to_cudf(cudf, None, 0) + @classmethod def _arrow_needs_option_type(cls): return True # This overrides Content._arrow_needs_option_type diff --git a/src/awkward/contents/indexedarray.py b/src/awkward/contents/indexedarray.py index 6fb4ea3c69..6421f51742 100644 --- a/src/awkward/contents/indexedarray.py +++ b/src/awkward/contents/indexedarray.py @@ -1049,6 +1049,16 @@ def _to_arrow( ) return next2._to_arrow(pyarrow, mask_node, validbytes, length, options) + def _to_cudf(self, cudf: Any, mask: Content | None, length: int): + if self._content.length == 0: + # IndexedOptionArray._to_arrow replaces -1 in the index with 0. So behind + # every masked value is self._content[0], unless self._content.length == 0. + # In that case, don't call self._content[index]; it's empty anyway. + next = self._content + else: + next = self._content._carry(self._index, False) + return next._to_cudf(cudf, None, len(next)) + def _to_backend_array(self, allow_missing, backend): return self.project()._to_backend_array(allow_missing, backend) diff --git a/src/awkward/contents/indexedoptionarray.py b/src/awkward/contents/indexedoptionarray.py index 0e68461dc5..2162fb72c4 100644 --- a/src/awkward/contents/indexedoptionarray.py +++ b/src/awkward/contents/indexedoptionarray.py @@ -1576,6 +1576,9 @@ def _to_arrow( options, ) + def _to_cudf(self, cudf: Any, mask: Content | None, length: int): + return self.to_ByteMaskedArray(True)._to_cudf(cudf, mask, length) + def _to_backend_array(self, allow_missing, backend): nplike = backend.nplike index_nplike = backend.index_nplike diff --git a/src/awkward/contents/listarray.py b/src/awkward/contents/listarray.py index 722b9044dd..a05eeaea55 100644 --- a/src/awkward/contents/listarray.py +++ b/src/awkward/contents/listarray.py @@ -1498,6 +1498,9 @@ def _to_arrow( pyarrow, mask_node, validbytes, length, options ) + def _to_cudf(self, cudf: Any, mask: Content | None, length: int): + return self.to_ListOffsetArray64(False)._to_cudf(cudf, mask, length) + def _to_backend_array(self, allow_missing, backend): array_param = self.parameter("__array__") if array_param in {"bytestring", "string"}: diff --git a/src/awkward/contents/listoffsetarray.py b/src/awkward/contents/listoffsetarray.py index 4aa149b69d..003467c24b 100644 --- a/src/awkward/contents/listoffsetarray.py +++ b/src/awkward/contents/listoffsetarray.py @@ -10,6 +10,7 @@ from awkward._layout import maybe_posaxis from awkward._meta.listoffsetmeta import ListOffsetMeta from awkward._nplikes.array_like import ArrayLike +from awkward._nplikes.cupy import Cupy from awkward._nplikes.numpy import Numpy from awkward._nplikes.numpy_like import IndexType, NumpyMetadata from awkward._nplikes.placeholder import PlaceholderArray @@ -1999,6 +2000,39 @@ def _to_arrow( ), ) + def _to_cudf(self, cudf: Any, mask: Content | None, length: int): + cupy = Cupy.instance() + index = self._offsets.raw(cupy).astype("int32") + buf = cudf.core.buffer.as_buffer(index) + ind_buf = cudf.core.column.numerical.NumericalColumn( + buf, index.dtype, None, size=len(index) + ) + cont = self._content._to_cudf(cudf, None, len(self._content)) + if mask is not None: + m = np._module.packbits(mask, bitorder="little") + if m.nbytes % 64: + m = cupy.resize(m, ((m.nbytes // 64) + 1) * 64) + m = cudf.core.buffer.as_buffer(cupy.asarray(m)) + else: + m = None + if self.parameters.get("__array__") == "string": + from cudf.core.column.string import StringColumn + + data = cudf.core.buffer.as_buffer(cupy.asarray(self._content.data)) + # docs for StringColumn says there should be two children instead of a data= + return StringColumn( + data=data, + children=(ind_buf,), + mask=m, + ) + + return cudf.core.column.lists.ListColumn( + length, + mask=m, + children=(ind_buf, cont), + dtype=cudf.core.dtypes.ListDtype(cont.dtype), + ) + def _to_backend_array(self, allow_missing, backend): array_param = self.parameter("__array__") if array_param == "string": diff --git a/src/awkward/contents/numpyarray.py b/src/awkward/contents/numpyarray.py index 11a73bb124..315d9383b7 100644 --- a/src/awkward/contents/numpyarray.py +++ b/src/awkward/contents/numpyarray.py @@ -14,6 +14,7 @@ from awkward._meta.numpymeta import NumpyMeta from awkward._nplikes import to_nplike from awkward._nplikes.array_like import ArrayLike +from awkward._nplikes.cupy import Cupy from awkward._nplikes.jax import Jax from awkward._nplikes.numpy import Numpy from awkward._nplikes.numpy_like import IndexType, NumpyMetadata @@ -1220,6 +1221,20 @@ def _to_arrow( ), ) + def _to_cudf(self, cudf: Any, mask: Content | None, length: int): + cupy = Cupy.instance() + from cudf.core.column.column import as_column + + assert self._backend.nplike.known_data + data = as_column(self._data) + if mask is not None: + m = cupy.packbits(cupy.asarray(mask), bitorder="little") + if m.nbytes % 64: + m = cupy.resize(m, ((m.nbytes // 64) + 1) * 64) + m = cudf.core.buffer.as_buffer(m) + data.set_base_data(m) + return data + def _to_backend_array(self, allow_missing, backend): return to_nplike(self.data, backend.nplike, from_nplike=self._backend.nplike) diff --git a/src/awkward/contents/recordarray.py b/src/awkward/contents/recordarray.py index c091d45365..4aafcfd6b2 100644 --- a/src/awkward/contents/recordarray.py +++ b/src/awkward/contents/recordarray.py @@ -1101,6 +1101,23 @@ def _to_arrow( children=values, ) + def _to_cudf(self, cudf: Any, mask: Content | None, length: int): + children = tuple( + c._to_cudf(cudf, mask=None, length=length) for c in self.contents + ) + dt = cudf.core.dtypes.StructDtype( + {field: c.dtype for field, c in zip(self.fields, children)} + ) + m = mask._to_cudf(cudf, None, length) if mask else None + return cudf.core.column.struct.StructColumn( + data=None, + children=children, + dtype=dt, + mask=m, + size=length, + offset=0, + ) + def _to_backend_array(self, allow_missing, backend): if self.fields is None: return backend.nplike.empty(self.length, dtype=[]) diff --git a/src/awkward/contents/unmaskedarray.py b/src/awkward/contents/unmaskedarray.py index cbc726b310..0dd500ebc1 100644 --- a/src/awkward/contents/unmaskedarray.py +++ b/src/awkward/contents/unmaskedarray.py @@ -498,6 +498,9 @@ def _to_arrow( ): return self._content._to_arrow(pyarrow, self, None, length, options) + def _to_cudf(self, cudf: Any, mask: Content | None, length: int): + return self._content._to_cudf(cudf, mask, length) + def _to_backend_array(self, allow_missing, backend): content = self.content._to_backend_array(allow_missing, backend) if allow_missing: diff --git a/src/awkward/operations/__init__.py b/src/awkward/operations/__init__.py index 6d4a84c565..e9b1a3818b 100644 --- a/src/awkward/operations/__init__.py +++ b/src/awkward/operations/__init__.py @@ -86,6 +86,7 @@ from awkward.operations.ak_to_arrow_table import * from awkward.operations.ak_to_backend import * from awkward.operations.ak_to_buffers import * +from awkward.operations.ak_to_cudf import * from awkward.operations.ak_to_cupy import * from awkward.operations.ak_to_dataframe import * from awkward.operations.ak_to_feather import * diff --git a/src/awkward/operations/ak_to_cudf.py b/src/awkward/operations/ak_to_cudf.py new file mode 100644 index 0000000000..e45fe041a2 --- /dev/null +++ b/src/awkward/operations/ak_to_cudf.py @@ -0,0 +1,21 @@ +# BSD 3-Clause License; see https://github.com/scikit-hep/awkward/blob/main/LICENSE +from __future__ import annotations + +import awkward as ak +from awkward._dispatch import high_level_function + +__all__ = ("to_cudf",) + + +@high_level_function() +def to_cudf( + array: ak.Array, +): + """Create a cuDF.Series out of the given ak array + + Buffers that are not already in GPU memory will be transferred, and some + structural reformatting may happen to account for differences in architecture. + """ + import cudf + + return cudf.Series(array.layout._to_cudf(cudf, None, len(array))) diff --git a/tests-cuda/test_3051_to_cuda.py b/tests-cuda/test_3051_to_cuda.py new file mode 100644 index 0000000000..af02ed798f --- /dev/null +++ b/tests-cuda/test_3051_to_cuda.py @@ -0,0 +1,57 @@ +from __future__ import annotations + +import pytest + +import awkward as ak + +cudf = pytest.importorskip("cudf") +cupy = pytest.importorskip("cupy") + + +def test_jagged(): + arr = ak.Array([[[1, 2, 3], [], [3, 4]], []]) + out = ak.to_cudf(arr) + assert isinstance(out, cudf.Series) + assert out.to_arrow().tolist() == [[[1, 2, 3], [], [3, 4]], []] + + +def test_nested(): + arr = ak.Array( + [{"a": 0, "b": 1.0, "c": {"d": 0}}, {"a": 1, "b": 0.0, "c": {"d": 1}}] + ) + out = ak.to_cudf(arr) + assert isinstance(out, cudf.Series) + assert out.to_arrow().tolist() == [ + {"a": 0, "b": 1.0, "c": {"d": 0}}, + {"a": 1, "b": 0.0, "c": {"d": 1}}, + ] + + +def test_null(): + arr = ak.Array([12, None, 21, 12]) + # calls ByteMaskedArray._to_cudf not NumpyArray + out = ak.to_cudf(arr) + assert isinstance(out, cudf.Series) + assert out.to_arrow().tolist() == [12, None, 21, 12] + + # True is valid, LSB order + arr2 = ak.Array(arr.layout.to_BitMaskedArray(True, True)) + out = ak.to_cudf(arr2) + assert isinstance(out, cudf.Series) + assert out.to_arrow().tolist() == [12, None, 21, 12] + + # reversed LSB (should be rare, involves extra work!) + arr3 = ak.Array(arr.layout.to_BitMaskedArray(True, False)) + out = ak.to_cudf(arr3) + assert isinstance(out, cudf.Series) + assert out.to_arrow().tolist() == [12, None, 21, 12] + + +def test_strings(): + arr = ak.Array(["hey", "hi", "hum"]) + out = ak.to_cudf(arr) + assert out.to_arrow().tolist() == ["hey", "hi", "hum"] + + arr = ak.Array(["hey", "hi", None, "hum"]) + out = ak.to_cudf(arr) + assert out.to_arrow().tolist() == ["hey", "hi", None, "hum"]